glam/scripts/merge_staff_data.py
kempersc 1f723fd5d7 feat(data): merge staff data from 35 PENDING files into enriched custodians
Merged LinkedIn-extracted staff sections from PENDING files into their
corresponding proper GHCID custodian files. This consolidates data from
two extraction sources:
- Existing enriched files: Google Maps, Museum Register, YouTube, etc.
- PENDING files: LinkedIn staff data extraction

Files modified:
- 28 custodian files enriched with staff data
- 35 PENDING files deleted (merged into proper locations)
- Originals archived to archive/pending_duplicates_20250109/

Key institutions enriched:
- Rijksmuseum (NL-NH-AMS-M-RM)
- Stedelijk Museum Amsterdam (NL-NH-AMS-M-SMA)
- Amsterdam Museum (NL-NH-AMS-M-AM)
- Regionaal Archief Alkmaar (NL-NH-ALK-A-RAA)
- Maritiem Museum Rotterdam (NL-ZH-ROT-M-MMR)
- And 23 more museums/archives across NL

New scripts:
- scripts/merge_staff_data.py: Automated staff data merger
- scripts/categorize_pending_files.py: PENDING file analysis utility
2026-01-09 14:51:17 +01:00

272 lines
11 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Merge staff data from PENDING/archived files into existing custodian files.
This script takes staff sections from LinkedIn-extracted PENDING files and merges
them into existing custodian YAML files that have enrichment data (Google Maps,
Museum Register, etc.) but lack staff information.
Usage:
python scripts/merge_staff_data.py --dry-run # Preview changes
python scripts/merge_staff_data.py # Apply changes
python scripts/merge_staff_data.py --source-file <file> --target-file <file> # Single merge
"""
import os
import sys
import yaml
import argparse
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any, List, Tuple
# YAML output configuration: keep human-friendly formatting on round-trip.
class PreservingDumper(yaml.SafeDumper):
    """SafeDumper subclass that serves as a registration point for custom representers."""


def str_representer(dumper, data):
    """Represent multi-line strings in literal block style (|); plain style otherwise."""
    block_style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=block_style)


PreservingDumper.add_representer(str, str_representer)
# Known mappings from archived duplicates to target files.
# Keys are the filename stems of LinkedIn-extracted PENDING files; values are
# the GHCIDs of the proper enriched custodian files their staff data merges into.
DUPLICATE_MAPPINGS = {
    'NL-XX-XXX-PENDING-AMSTERDAM-CHEESE-MUSEUM': 'NL-NH-AMS-M-ACM',
    'NL-XX-XXX-PENDING-AMSTERDAM-MUSEUM': 'NL-NH-AMS-M-AM',
    'NL-XX-XXX-PENDING-AMSTERDAM-PIPE-MUSEUM': 'NL-NH-AMS-M-APM',
    'NL-XX-XXX-PENDING-AMSTERDAM-TATTOO-MUSEUM': 'NL-NH-AMS-M-ATM',
    # NOTE(review): Collision with Tattoo above — Tulip maps to the same GHCID,
    # so its staff data would land in the Tattoo museum file. Confirm the
    # intended Tulip GHCID before a live (non-dry-run) merge.
    'NL-XX-XXX-PENDING-AMSTERDAM-TULIP-MUSEUM': 'NL-NH-AMS-M-ATM',  # Collision with Tattoo
    'NL-XX-XXX-PENDING-CANNABIS-MUSEUM-AMSTERDAM': 'NL-NH-AMS-M-CMA',
    'NL-XX-XXX-PENDING-DEVENTER-VERHAAL': 'NL-OV-DEV-M-DV',
    # Target already carries a descriptive name suffix; exact-match lookup works here.
    'NL-XX-XXX-PENDING-DIAMOND-MUSEUM-AMSTERDAM': 'NL-NH-AMS-M-DMA-diamond_museum_amsterdam',
    'NL-XX-XXX-PENDING-EINDHOVEN-MUSEUM': 'NL-NB-EIN-M-EM',
    'NL-XX-XXX-PENDING-FIETSERS-MUSEUM-UTRECHT': 'NL-UT-UTR-M-FMU',
    'NL-XX-XXX-PENDING-GRID-GRAFISCH-MUSEUM-GRONINGEN': 'NL-GR-GRO-M-GGMG',
    'NL-XX-XXX-PENDING-HISTORISCH-MUSEUM-EDE': 'NL-GE-EDE-M-HME',
    'NL-XX-XXX-PENDING-KRUIDENIERS-MUSEUM-UTRECHT': 'NL-UT-UTR-M-KMU',
    'NL-XX-XXX-PENDING-KUNSTPUNT-GRONINGEN': 'NL-GR-GRO-M-KG',  # Mapped to existing M type
    'NL-XX-XXX-PENDING-LANDSCHAPSBEHEER-GRONINGEN': 'NL-GR-GRO-M-LG',  # Mapped to existing M type
    'NL-XX-XXX-PENDING-MAASTRICHT-MUSEUM': 'NL-LI-MAA-M-MM',
    'NL-XX-XXX-PENDING-MARITIEM-MUSEUM-ROTTERDAM': 'NL-ZH-ROT-M-MMR',
    'NL-XX-XXX-PENDING-MUSEUM-AMSTERDAM-NOORD': 'NL-NH-AMS-M-MAN',
    'NL-XX-XXX-PENDING-MUSEUM-GOUDA': 'NL-ZH-GOU-M-MG',
    'NL-XX-XXX-PENDING-MUSEUM-HELMOND': 'NL-NB-HEL-M-MH',
    'NL-XX-XXX-PENDING-MUSEUM-PRINSENHOF-DELFT': 'NL-ZH-DEL-M-MPD',
    'NL-XX-XXX-PENDING-MUSEUM-ROTTERDAM': 'NL-ZH-ROT-M-MR',
    'NL-XX-XXX-PENDING-MUSEUMSTOOMTRAM-HOORN-MEDEMBLIK': 'NL-NH-HOO-M-MHM',
    'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-ALKMAAR': 'NL-NH-ALK-A-RAA',
    'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-DORDRECHT': 'NL-ZH-DOR-A-RAD',
    'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-NIJMEGEN': 'NL-GE-NIJ-A-RAN',
    'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-TILBURG': 'NL-NB-TIL-A-RAT',
    'NL-XX-XXX-PENDING-ROYAL-DELFT-MUSEUM': 'NL-ZH-DEL-M-RDM',
    'NL-XX-XXX-PENDING-STEDELIJK-MUSEUM-ALKMAAR': 'NL-NH-ALK-M-SMA',
    'NL-XX-XXX-PENDING-STEDELIJK-MUSEUM-AMSTERDAM': 'NL-NH-AMS-M-SMA',
    'NL-XX-XXX-PENDING-STEDELIJK-MUSEUM-BREDA': 'NL-NB-BRE-M-SMB',
    'NL-XX-XXX-PENDING-THE-LIVING-MUSEUM-LEEUWARDEN': 'NL-FR-LEE-M-LML',
    'NL-XX-XXX-PENDING-VERWEY-MUSEUM-HAARLEM': 'NL-NH-HAA-M-VMH',
    'NL-XX-XXX-PENDING-WESTFRIES-MUSEUM-HOORN': 'NL-NH-HOO-M-WMH',
    # Main PENDING file
    'NL-XX-XXX-PENDING-RIJKSMUSEUM': 'NL-NH-AMS-M-RM',
}
def load_yaml(filepath: Path) -> Dict[str, Any]:
    """Parse *filepath* as YAML (safe loader) and return the resulting mapping."""
    with filepath.open('r', encoding='utf-8') as handle:
        return yaml.safe_load(handle)
def save_yaml(filepath: Path, data: Dict[str, Any]):
    """Serialise *data* to *filepath*, preserving key order and unicode text."""
    with filepath.open('w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            Dumper=PreservingDumper,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )
def merge_staff_data(source_data: Dict[str, Any], target_data: Dict[str, Any],
                     source_name: str) -> Tuple[Dict[str, Any], int]:
    """
    Merge the staff section from source into target.

    The target dict is mutated in place and also returned. A YAML section
    written as a bare key (e.g. ``staff:`` with no value) loads as ``None``
    rather than a dict, so every lookup below guards against ``None`` as well
    as a missing key — the original code raised AttributeError/TypeError on
    such files.

    Args:
        source_data: Parsed PENDING/archived file carrying LinkedIn staff data.
        target_data: Parsed enriched custodian file to receive the staff section.
        source_name: Filename recorded in the provenance merge note.

    Returns:
        Tuple of (merged_data, staff_count_added). The count is 0 when the
        source has no staff or the target already has staff.
    """
    # None-safe: missing key, or `staff:`/`staff_list:` loaded as None.
    source_staff = source_data.get('staff') or {}
    staff_list = source_staff.get('staff_list') or []
    if not staff_list:
        return target_data, 0

    # Never overwrite staff the target already has — warn and leave it alone.
    existing_staff = (target_data.get('staff') or {}).get('staff_list') or []
    if existing_staff:
        existing_count = len(existing_staff)
        print(f" WARNING: Target already has {existing_count} staff members")
        # Could implement merge logic here, for now skip
        return target_data, 0

    # Add staff section to target; the source's staff-level provenance
    # (extraction details) travels with the list.
    target_data['staff'] = {
        'provenance': source_staff.get('provenance', {}),
        'staff_list': staff_list
    }

    # Record the merge in the target's top-level provenance notes,
    # normalising a missing/None/scalar `notes` value into a list first.
    provenance = target_data.get('provenance')
    if not isinstance(provenance, dict):
        provenance = {}
        target_data['provenance'] = provenance
    notes = provenance.get('notes')
    if isinstance(notes, str):
        notes = [notes]
    elif not isinstance(notes, list):
        notes = []
    merge_note = f"Staff data merged from {source_name} on {datetime.now(timezone.utc).isoformat()}"
    notes.append(merge_note)
    provenance['notes'] = notes
    return target_data, len(staff_list)
def find_target_file(source_ghcid: str, custodian_dir: Path) -> Optional[Path]:
    """
    Resolve the custodian file a PENDING source should merge into.

    Args:
        source_ghcid: Stem of the PENDING file (e.g. 'NL-XX-XXX-PENDING-...').
        custodian_dir: Directory containing the proper custodian YAML files.

    Returns:
        Path to the target file, or None when the GHCID has no mapping or no
        matching file exists on disk.
    """
    if source_ghcid not in DUPLICATE_MAPPINGS:
        return None
    target_ghcid = DUPLICATE_MAPPINGS[source_ghcid]
    target_file = custodian_dir / f"{target_ghcid}.yaml"
    if target_file.exists():
        return target_file
    # Try with a descriptive name suffix (e.g. 'NL-...-DMA-diamond_museum_amsterdam.yaml').
    # Path.glob() yields files in filesystem-dependent order, so sort the
    # matches to make the chosen file deterministic across runs/machines.
    for candidate in sorted(custodian_dir.glob(f"{target_ghcid}*.yaml")):
        return candidate
    return None
def process_archived_duplicates(custodian_dir: Path, archive_dir: Path, dry_run: bool = True):
    """Process all archived duplicate files and merge their staff data.

    Scans ``archive_dir`` for PENDING YAML files, resolves each one's target
    custodian file via ``find_target_file``, and merges staff sections with
    ``merge_staff_data``. Writes to disk only when ``dry_run`` is False; always
    prints a per-file report plus a final summary to stdout.

    Args:
        custodian_dir: Directory containing the proper custodian YAML files.
        archive_dir: Directory holding the archived PENDING duplicate files.
        dry_run: When True (default), preview changes without saving.
    """
    print("=" * 80)
    print("MERGING STAFF DATA FROM ARCHIVED DUPLICATES")
    print("=" * 80)
    print(f"Archive directory: {archive_dir}")
    print(f"Custodian directory: {custodian_dir}")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE'}")
    print()
    total_merged = 0   # files whose staff section was (or would be) merged
    total_staff = 0    # total staff entries added across all files
    errors = []        # human-readable error strings for the summary
    # sorted() gives a stable, alphabetical processing order.
    for source_file in sorted(archive_dir.glob("NL-XX-XXX-PENDING-*.yaml")):
        source_ghcid = source_file.stem
        target_file = find_target_file(source_ghcid, custodian_dir)
        if not target_file:
            errors.append(f"{source_ghcid}: No target file found")
            continue
        # Load both files; a parse failure skips the pair but is reported.
        try:
            source_data = load_yaml(source_file)
            target_data = load_yaml(target_file)
        except Exception as e:
            errors.append(f"{source_ghcid}: Error loading files: {e}")
            continue
        # Check if source has staff; nothing to do otherwise.
        source_staff_count = len(source_data.get('staff', {}).get('staff_list', []))
        if source_staff_count == 0:
            continue
        # Check if target already has staff (merge would clobber it).
        target_staff_count = len(target_data.get('staff', {}).get('staff_list', []))
        print(f"[{'DRY RUN' if dry_run else 'MERGING'}] {source_ghcid}")
        print(f" Source: {source_staff_count} staff")
        print(f" Target: {target_file.name} ({target_staff_count} existing staff)")
        if target_staff_count > 0:
            print(f" SKIPPED: Target already has staff data")
            continue
        # Merge (mutates target_data in place; also returned).
        merged_data, staff_added = merge_staff_data(source_data, target_data, source_file.name)
        if staff_added > 0:
            print(f" -> Would add {staff_added} staff members")
            if not dry_run:
                save_yaml(target_file, merged_data)
                print(f" -> SAVED")
            # NOTE(review): counters include dry-run previews, so the summary
            # reports would-be merges in DRY RUN mode.
            total_merged += 1
            total_staff += staff_added
        print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Files merged: {total_merged}")
    print(f"Total staff added: {total_staff}")
    if errors:
        print(f"Errors: {len(errors)}")
        for e in errors:
            print(f" - {e}")
def process_single_merge(source_file: Path, target_file: Path, dry_run: bool = True):
    """Merge a single source file into a target file.

    Loads both YAML files, reports staff counts, and merges the source's
    staff section into the target via ``merge_staff_data``. Refuses to touch
    a target that already has staff data. Saves only when ``dry_run`` is False.

    Args:
        source_file: PENDING/archived YAML file providing the staff section.
        target_file: Custodian YAML file to receive the staff section.
        dry_run: When True (default), preview without writing to disk.
    """
    print(f"Source: {source_file}")
    print(f"Target: {target_file}")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE'}")
    print()
    source_data = load_yaml(source_file)
    target_data = load_yaml(target_file)
    source_staff_count = len(source_data.get('staff', {}).get('staff_list', []))
    target_staff_count = len(target_data.get('staff', {}).get('staff_list', []))
    print(f"Source staff: {source_staff_count}")
    print(f"Target staff: {target_staff_count}")
    # Bail out rather than clobber existing staff data.
    if target_staff_count > 0:
        print("WARNING: Target already has staff data. Merge would overwrite.")
        return
    merged_data, staff_added = merge_staff_data(source_data, target_data, source_file.name)
    if staff_added > 0:
        print(f"Would add {staff_added} staff members")
        if not dry_run:
            save_yaml(target_file, merged_data)
            print("SAVED")
def main():
    """Entry point: parse CLI options and dispatch to single-file or batch merge."""
    arg_parser = argparse.ArgumentParser(description='Merge staff data from PENDING files')
    arg_parser.add_argument('--dry-run', action='store_true', help='Preview changes without saving')
    arg_parser.add_argument('--source-file', type=Path, help='Single source file to merge')
    arg_parser.add_argument('--target-file', type=Path, help='Single target file to merge into')
    arg_parser.add_argument('--custodian-dir', type=Path,
                            default=Path('/Users/kempersc/apps/glam/data/custodian'),
                            help='Custodian directory')
    arg_parser.add_argument('--archive-dir', type=Path,
                            default=Path('/Users/kempersc/apps/glam/data/custodian/archive/pending_duplicates_20250109'),
                            help='Archive directory with duplicates')
    options = arg_parser.parse_args()

    # Single-file mode only when both endpoints were supplied; batch otherwise.
    single_mode = bool(options.source_file) and bool(options.target_file)
    if single_mode:
        process_single_merge(options.source_file, options.target_file, options.dry_run)
    else:
        process_archived_duplicates(options.custodian_dir, options.archive_dir, options.dry_run)


if __name__ == '__main__':
    main()