Merged LinkedIn-extracted staff sections from PENDING files into their corresponding proper GHCID custodian files. This consolidates data from two extraction sources: - Existing enriched files: Google Maps, Museum Register, YouTube, etc. - PENDING files: LinkedIn staff data extraction Files modified: - 28 custodian files enriched with staff data - 35 PENDING files deleted (merged into proper locations) - Originals archived to archive/pending_duplicates_20250109/ Key institutions enriched: - Rijksmuseum (NL-NH-AMS-M-RM) - Stedelijk Museum Amsterdam (NL-NH-AMS-M-SMA) - Amsterdam Museum (NL-NH-AMS-M-AM) - Regionaal Archief Alkmaar (NL-NH-ALK-A-RAA) - Maritiem Museum Rotterdam (NL-ZH-ROT-M-MMR) - And 23 more museums/archives across NL New scripts: - scripts/merge_staff_data.py: Automated staff data merger - scripts/categorize_pending_files.py: PENDING file analysis utility
272 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Merge staff data from PENDING/archived files into existing custodian files.
|
|
|
|
This script takes staff sections from LinkedIn-extracted PENDING files and merges
|
|
them into existing custodian YAML files that have enrichment data (Google Maps,
|
|
Museum Register, etc.) but lack staff information.
|
|
|
|
Usage:
|
|
python scripts/merge_staff_data.py --dry-run # Preview changes
|
|
python scripts/merge_staff_data.py # Apply changes
|
|
python scripts/merge_staff_data.py --source-file <file> --target-file <file> # Single merge
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
import argparse
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional, Dict, Any, List, Tuple
|
|
|
|
# Preserve YAML formatting
|
|
class PreservingDumper(yaml.SafeDumper):
|
|
pass
|
|
|
|
def str_representer(dumper, data):
|
|
if '\n' in data:
|
|
return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
|
|
return dumper.represent_scalar('tag:yaml.org,2002:str', data)
|
|
|
|
PreservingDumper.add_representer(str, str_representer)
|
|
|
|
# Known mappings for archived duplicates to target files
|
|
DUPLICATE_MAPPINGS = {
|
|
'NL-XX-XXX-PENDING-AMSTERDAM-CHEESE-MUSEUM': 'NL-NH-AMS-M-ACM',
|
|
'NL-XX-XXX-PENDING-AMSTERDAM-MUSEUM': 'NL-NH-AMS-M-AM',
|
|
'NL-XX-XXX-PENDING-AMSTERDAM-PIPE-MUSEUM': 'NL-NH-AMS-M-APM',
|
|
'NL-XX-XXX-PENDING-AMSTERDAM-TATTOO-MUSEUM': 'NL-NH-AMS-M-ATM',
|
|
'NL-XX-XXX-PENDING-AMSTERDAM-TULIP-MUSEUM': 'NL-NH-AMS-M-ATM', # Collision with Tattoo
|
|
'NL-XX-XXX-PENDING-CANNABIS-MUSEUM-AMSTERDAM': 'NL-NH-AMS-M-CMA',
|
|
'NL-XX-XXX-PENDING-DEVENTER-VERHAAL': 'NL-OV-DEV-M-DV',
|
|
'NL-XX-XXX-PENDING-DIAMOND-MUSEUM-AMSTERDAM': 'NL-NH-AMS-M-DMA-diamond_museum_amsterdam',
|
|
'NL-XX-XXX-PENDING-EINDHOVEN-MUSEUM': 'NL-NB-EIN-M-EM',
|
|
'NL-XX-XXX-PENDING-FIETSERS-MUSEUM-UTRECHT': 'NL-UT-UTR-M-FMU',
|
|
'NL-XX-XXX-PENDING-GRID-GRAFISCH-MUSEUM-GRONINGEN': 'NL-GR-GRO-M-GGMG',
|
|
'NL-XX-XXX-PENDING-HISTORISCH-MUSEUM-EDE': 'NL-GE-EDE-M-HME',
|
|
'NL-XX-XXX-PENDING-KRUIDENIERS-MUSEUM-UTRECHT': 'NL-UT-UTR-M-KMU',
|
|
'NL-XX-XXX-PENDING-KUNSTPUNT-GRONINGEN': 'NL-GR-GRO-M-KG', # Mapped to existing M type
|
|
'NL-XX-XXX-PENDING-LANDSCHAPSBEHEER-GRONINGEN': 'NL-GR-GRO-M-LG', # Mapped to existing M type
|
|
'NL-XX-XXX-PENDING-MAASTRICHT-MUSEUM': 'NL-LI-MAA-M-MM',
|
|
'NL-XX-XXX-PENDING-MARITIEM-MUSEUM-ROTTERDAM': 'NL-ZH-ROT-M-MMR',
|
|
'NL-XX-XXX-PENDING-MUSEUM-AMSTERDAM-NOORD': 'NL-NH-AMS-M-MAN',
|
|
'NL-XX-XXX-PENDING-MUSEUM-GOUDA': 'NL-ZH-GOU-M-MG',
|
|
'NL-XX-XXX-PENDING-MUSEUM-HELMOND': 'NL-NB-HEL-M-MH',
|
|
'NL-XX-XXX-PENDING-MUSEUM-PRINSENHOF-DELFT': 'NL-ZH-DEL-M-MPD',
|
|
'NL-XX-XXX-PENDING-MUSEUM-ROTTERDAM': 'NL-ZH-ROT-M-MR',
|
|
'NL-XX-XXX-PENDING-MUSEUMSTOOMTRAM-HOORN-MEDEMBLIK': 'NL-NH-HOO-M-MHM',
|
|
'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-ALKMAAR': 'NL-NH-ALK-A-RAA',
|
|
'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-DORDRECHT': 'NL-ZH-DOR-A-RAD',
|
|
'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-NIJMEGEN': 'NL-GE-NIJ-A-RAN',
|
|
'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-TILBURG': 'NL-NB-TIL-A-RAT',
|
|
'NL-XX-XXX-PENDING-ROYAL-DELFT-MUSEUM': 'NL-ZH-DEL-M-RDM',
|
|
'NL-XX-XXX-PENDING-STEDELIJK-MUSEUM-ALKMAAR': 'NL-NH-ALK-M-SMA',
|
|
'NL-XX-XXX-PENDING-STEDELIJK-MUSEUM-AMSTERDAM': 'NL-NH-AMS-M-SMA',
|
|
'NL-XX-XXX-PENDING-STEDELIJK-MUSEUM-BREDA': 'NL-NB-BRE-M-SMB',
|
|
'NL-XX-XXX-PENDING-THE-LIVING-MUSEUM-LEEUWARDEN': 'NL-FR-LEE-M-LML',
|
|
'NL-XX-XXX-PENDING-VERWEY-MUSEUM-HAARLEM': 'NL-NH-HAA-M-VMH',
|
|
'NL-XX-XXX-PENDING-WESTFRIES-MUSEUM-HOORN': 'NL-NH-HOO-M-WMH',
|
|
# Main PENDING file
|
|
'NL-XX-XXX-PENDING-RIJKSMUSEUM': 'NL-NH-AMS-M-RM',
|
|
}
|
|
|
|
|
|
def load_yaml(filepath: Path) -> Dict[str, Any]:
|
|
"""Load YAML file safely."""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
return yaml.safe_load(f)
|
|
|
|
|
|
def save_yaml(filepath: Path, data: Dict[str, Any]):
|
|
"""Save YAML file with proper formatting."""
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f, Dumper=PreservingDumper, allow_unicode=True,
|
|
default_flow_style=False, sort_keys=False, width=120)
|
|
|
|
|
|
def merge_staff_data(source_data: Dict[str, Any], target_data: Dict[str, Any],
|
|
source_name: str) -> Tuple[Dict[str, Any], int]:
|
|
"""
|
|
Merge staff section from source into target.
|
|
|
|
Returns:
|
|
Tuple of (merged_data, staff_count_added)
|
|
"""
|
|
if 'staff' not in source_data:
|
|
return target_data, 0
|
|
|
|
source_staff = source_data['staff']
|
|
staff_list = source_staff.get('staff_list', [])
|
|
|
|
if not staff_list:
|
|
return target_data, 0
|
|
|
|
# Check if target already has staff
|
|
if 'staff' in target_data and target_data['staff'].get('staff_list'):
|
|
existing_count = len(target_data['staff']['staff_list'])
|
|
print(f" WARNING: Target already has {existing_count} staff members")
|
|
# Could implement merge logic here, for now skip
|
|
return target_data, 0
|
|
|
|
# Add staff section to target
|
|
target_data['staff'] = {
|
|
'provenance': source_staff.get('provenance', {}),
|
|
'staff_list': staff_list
|
|
}
|
|
|
|
# Add merge note to provenance
|
|
if 'provenance' not in target_data:
|
|
target_data['provenance'] = {}
|
|
|
|
if 'notes' not in target_data['provenance']:
|
|
target_data['provenance']['notes'] = []
|
|
elif isinstance(target_data['provenance']['notes'], str):
|
|
target_data['provenance']['notes'] = [target_data['provenance']['notes']]
|
|
|
|
merge_note = f"Staff data merged from {source_name} on {datetime.now(timezone.utc).isoformat()}"
|
|
target_data['provenance']['notes'].append(merge_note)
|
|
|
|
return target_data, len(staff_list)
|
|
|
|
|
|
def find_target_file(source_ghcid: str, custodian_dir: Path) -> Optional[Path]:
|
|
"""Find the target file for a source GHCID."""
|
|
if source_ghcid in DUPLICATE_MAPPINGS:
|
|
target_ghcid = DUPLICATE_MAPPINGS[source_ghcid]
|
|
target_file = custodian_dir / f"{target_ghcid}.yaml"
|
|
if target_file.exists():
|
|
return target_file
|
|
|
|
# Try with name suffix
|
|
for f in custodian_dir.glob(f"{target_ghcid}*.yaml"):
|
|
return f
|
|
|
|
return None
|
|
|
|
|
|
def process_archived_duplicates(custodian_dir: Path, archive_dir: Path, dry_run: bool = True):
|
|
"""Process all archived duplicate files and merge their staff data."""
|
|
print("=" * 80)
|
|
print("MERGING STAFF DATA FROM ARCHIVED DUPLICATES")
|
|
print("=" * 80)
|
|
print(f"Archive directory: {archive_dir}")
|
|
print(f"Custodian directory: {custodian_dir}")
|
|
print(f"Mode: {'DRY RUN' if dry_run else 'LIVE'}")
|
|
print()
|
|
|
|
total_merged = 0
|
|
total_staff = 0
|
|
errors = []
|
|
|
|
for source_file in sorted(archive_dir.glob("NL-XX-XXX-PENDING-*.yaml")):
|
|
source_ghcid = source_file.stem
|
|
|
|
target_file = find_target_file(source_ghcid, custodian_dir)
|
|
|
|
if not target_file:
|
|
errors.append(f"{source_ghcid}: No target file found")
|
|
continue
|
|
|
|
# Load both files
|
|
try:
|
|
source_data = load_yaml(source_file)
|
|
target_data = load_yaml(target_file)
|
|
except Exception as e:
|
|
errors.append(f"{source_ghcid}: Error loading files: {e}")
|
|
continue
|
|
|
|
# Check if source has staff
|
|
source_staff_count = len(source_data.get('staff', {}).get('staff_list', []))
|
|
if source_staff_count == 0:
|
|
continue
|
|
|
|
# Check if target already has staff
|
|
target_staff_count = len(target_data.get('staff', {}).get('staff_list', []))
|
|
|
|
print(f"[{'DRY RUN' if dry_run else 'MERGING'}] {source_ghcid}")
|
|
print(f" Source: {source_staff_count} staff")
|
|
print(f" Target: {target_file.name} ({target_staff_count} existing staff)")
|
|
|
|
if target_staff_count > 0:
|
|
print(f" SKIPPED: Target already has staff data")
|
|
continue
|
|
|
|
# Merge
|
|
merged_data, staff_added = merge_staff_data(source_data, target_data, source_file.name)
|
|
|
|
if staff_added > 0:
|
|
print(f" -> Would add {staff_added} staff members")
|
|
|
|
if not dry_run:
|
|
save_yaml(target_file, merged_data)
|
|
print(f" -> SAVED")
|
|
|
|
total_merged += 1
|
|
total_staff += staff_added
|
|
|
|
print()
|
|
|
|
print("=" * 80)
|
|
print("SUMMARY")
|
|
print("=" * 80)
|
|
print(f"Files merged: {total_merged}")
|
|
print(f"Total staff added: {total_staff}")
|
|
if errors:
|
|
print(f"Errors: {len(errors)}")
|
|
for e in errors:
|
|
print(f" - {e}")
|
|
|
|
|
|
def process_single_merge(source_file: Path, target_file: Path, dry_run: bool = True):
|
|
"""Merge a single source file into a target file."""
|
|
print(f"Source: {source_file}")
|
|
print(f"Target: {target_file}")
|
|
print(f"Mode: {'DRY RUN' if dry_run else 'LIVE'}")
|
|
print()
|
|
|
|
source_data = load_yaml(source_file)
|
|
target_data = load_yaml(target_file)
|
|
|
|
source_staff_count = len(source_data.get('staff', {}).get('staff_list', []))
|
|
target_staff_count = len(target_data.get('staff', {}).get('staff_list', []))
|
|
|
|
print(f"Source staff: {source_staff_count}")
|
|
print(f"Target staff: {target_staff_count}")
|
|
|
|
if target_staff_count > 0:
|
|
print("WARNING: Target already has staff data. Merge would overwrite.")
|
|
return
|
|
|
|
merged_data, staff_added = merge_staff_data(source_data, target_data, source_file.name)
|
|
|
|
if staff_added > 0:
|
|
print(f"Would add {staff_added} staff members")
|
|
|
|
if not dry_run:
|
|
save_yaml(target_file, merged_data)
|
|
print("SAVED")
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description='Merge staff data from PENDING files')
|
|
parser.add_argument('--dry-run', action='store_true', help='Preview changes without saving')
|
|
parser.add_argument('--source-file', type=Path, help='Single source file to merge')
|
|
parser.add_argument('--target-file', type=Path, help='Single target file to merge into')
|
|
parser.add_argument('--custodian-dir', type=Path,
|
|
default=Path('/Users/kempersc/apps/glam/data/custodian'),
|
|
help='Custodian directory')
|
|
parser.add_argument('--archive-dir', type=Path,
|
|
default=Path('/Users/kempersc/apps/glam/data/custodian/archive/pending_duplicates_20250109'),
|
|
help='Archive directory with duplicates')
|
|
|
|
args = parser.parse_args()
|
|
|
|
if args.source_file and args.target_file:
|
|
process_single_merge(args.source_file, args.target_file, args.dry_run)
|
|
else:
|
|
process_archived_duplicates(args.custodian_dir, args.archive_dir, args.dry_run)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|