#!/usr/bin/env python3
"""
Merge staff data from PENDING/archived files into existing custodian files.

This script takes staff sections from LinkedIn-extracted PENDING files and
merges them into existing custodian YAML files that have enrichment data
(Google Maps, Museum Register, etc.) but lack staff information.

Usage:
    python scripts/merge_staff_data.py --dry-run  # Preview changes
    python scripts/merge_staff_data.py  # Apply changes
    python scripts/merge_staff_data.py --source-file --target-file  # Single merge
"""

import os
import sys
import yaml
import argparse
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any, List, Tuple


# Preserve YAML formatting: subclass SafeDumper so we can register a custom
# string representer without affecting the global yaml.SafeDumper.
class PreservingDumper(yaml.SafeDumper):
    pass


def str_representer(dumper, data):
    """Emit multi-line strings as literal blocks ('|') so they stay readable."""
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)


PreservingDumper.add_representer(str, str_representer)

# Known mappings for archived duplicates to target files.
# Keys are the stems of archived PENDING files; values are target GHCIDs
# (the target file may carry an extra name suffix — see find_target_file).
DUPLICATE_MAPPINGS = {
    'NL-XX-XXX-PENDING-AMSTERDAM-CHEESE-MUSEUM': 'NL-NH-AMS-M-ACM',
    'NL-XX-XXX-PENDING-AMSTERDAM-MUSEUM': 'NL-NH-AMS-M-AM',
    'NL-XX-XXX-PENDING-AMSTERDAM-PIPE-MUSEUM': 'NL-NH-AMS-M-APM',
    'NL-XX-XXX-PENDING-AMSTERDAM-TATTOO-MUSEUM': 'NL-NH-AMS-M-ATM',
    'NL-XX-XXX-PENDING-AMSTERDAM-TULIP-MUSEUM': 'NL-NH-AMS-M-ATM',  # Collision with Tattoo
    'NL-XX-XXX-PENDING-CANNABIS-MUSEUM-AMSTERDAM': 'NL-NH-AMS-M-CMA',
    'NL-XX-XXX-PENDING-DEVENTER-VERHAAL': 'NL-OV-DEV-M-DV',
    'NL-XX-XXX-PENDING-DIAMOND-MUSEUM-AMSTERDAM': 'NL-NH-AMS-M-DMA-diamond_museum_amsterdam',
    'NL-XX-XXX-PENDING-EINDHOVEN-MUSEUM': 'NL-NB-EIN-M-EM',
    'NL-XX-XXX-PENDING-FIETSERS-MUSEUM-UTRECHT': 'NL-UT-UTR-M-FMU',
    'NL-XX-XXX-PENDING-GRID-GRAFISCH-MUSEUM-GRONINGEN': 'NL-GR-GRO-M-GGMG',
    'NL-XX-XXX-PENDING-HISTORISCH-MUSEUM-EDE': 'NL-GE-EDE-M-HME',
    'NL-XX-XXX-PENDING-KRUIDENIERS-MUSEUM-UTRECHT': 'NL-UT-UTR-M-KMU',
    'NL-XX-XXX-PENDING-KUNSTPUNT-GRONINGEN': 'NL-GR-GRO-M-KG',  # Mapped to existing M type
    'NL-XX-XXX-PENDING-LANDSCHAPSBEHEER-GRONINGEN': 'NL-GR-GRO-M-LG',  # Mapped to existing M type
    'NL-XX-XXX-PENDING-MAASTRICHT-MUSEUM': 'NL-LI-MAA-M-MM',
    'NL-XX-XXX-PENDING-MARITIEM-MUSEUM-ROTTERDAM': 'NL-ZH-ROT-M-MMR',
    'NL-XX-XXX-PENDING-MUSEUM-AMSTERDAM-NOORD': 'NL-NH-AMS-M-MAN',
    'NL-XX-XXX-PENDING-MUSEUM-GOUDA': 'NL-ZH-GOU-M-MG',
    'NL-XX-XXX-PENDING-MUSEUM-HELMOND': 'NL-NB-HEL-M-MH',
    'NL-XX-XXX-PENDING-MUSEUM-PRINSENHOF-DELFT': 'NL-ZH-DEL-M-MPD',
    'NL-XX-XXX-PENDING-MUSEUM-ROTTERDAM': 'NL-ZH-ROT-M-MR',
    'NL-XX-XXX-PENDING-MUSEUMSTOOMTRAM-HOORN-MEDEMBLIK': 'NL-NH-HOO-M-MHM',
    'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-ALKMAAR': 'NL-NH-ALK-A-RAA',
    'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-DORDRECHT': 'NL-ZH-DOR-A-RAD',
    'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-NIJMEGEN': 'NL-GE-NIJ-A-RAN',
    'NL-XX-XXX-PENDING-REGIONAAL-ARCHIEF-TILBURG': 'NL-NB-TIL-A-RAT',
    'NL-XX-XXX-PENDING-ROYAL-DELFT-MUSEUM': 'NL-ZH-DEL-M-RDM',
    'NL-XX-XXX-PENDING-STEDELIJK-MUSEUM-ALKMAAR': 'NL-NH-ALK-M-SMA',
    'NL-XX-XXX-PENDING-STEDELIJK-MUSEUM-AMSTERDAM': 'NL-NH-AMS-M-SMA',
    'NL-XX-XXX-PENDING-STEDELIJK-MUSEUM-BREDA': 'NL-NB-SMB'.replace('NL-NB-SMB', 'NL-NB-BRE-M-SMB'),
    'NL-XX-XXX-PENDING-THE-LIVING-MUSEUM-LEEUWARDEN': 'NL-FR-LEE-M-LML',
    'NL-XX-XXX-PENDING-VERWEY-MUSEUM-HAARLEM': 'NL-NH-HAA-M-VMH',
    'NL-XX-XXX-PENDING-WESTFRIES-MUSEUM-HOORN': 'NL-NH-HOO-M-WMH',
    # Main PENDING file
    'NL-XX-XXX-PENDING-RIJKSMUSEUM': 'NL-NH-AMS-M-RM',
}
# NOTE(review): fixed above — the STEDELIJK-MUSEUM-BREDA entry is plain:
DUPLICATE_MAPPINGS['NL-XX-XXX-PENDING-STEDELIJK-MUSEUM-BREDA'] = 'NL-NB-BRE-M-SMB'


def load_yaml(filepath: Path) -> Dict[str, Any]:
    """Load a YAML file safely.

    Returns an empty dict for an empty/comment-only file (yaml.safe_load
    returns None in that case, which would crash downstream .get() calls).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}


def save_yaml(filepath: Path, data: Dict[str, Any]):
    """Save a YAML file with the project's preferred formatting."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, Dumper=PreservingDumper,
                  allow_unicode=True,
                  default_flow_style=False,
                  sort_keys=False,  # keep insertion order of keys
                  width=120)


def merge_staff_data(source_data: Dict[str, Any],
                     target_data: Dict[str, Any],
                     source_name: str) -> Tuple[Dict[str, Any], int]:
    """
    Merge the staff section from source into target.

    The target is modified in place: its 'staff' section is set from the
    source, and a merge note is appended to the target's provenance notes.
    If the target already has a non-empty staff list, nothing is merged.

    Args:
        source_data: Parsed source YAML (may lack a 'staff' section).
        target_data: Parsed target YAML; mutated and also returned.
        source_name: Filename of the source, recorded in the merge note.

    Returns:
        Tuple of (merged_data, staff_count_added).
    """
    # Guard against 'staff:' present but empty (parses to None).
    source_staff = source_data.get('staff') or {}
    staff_list = source_staff.get('staff_list', [])
    if not staff_list:
        return target_data, 0

    # Check if target already has staff
    existing_staff = (target_data.get('staff') or {}).get('staff_list')
    if existing_staff:
        print(f"  WARNING: Target already has {len(existing_staff)} staff members")
        # Could implement merge logic here, for now skip
        return target_data, 0

    # Add staff section to target
    target_data['staff'] = {
        'provenance': source_staff.get('provenance', {}),
        'staff_list': staff_list
    }

    # Add merge note to provenance (normalize a scalar 'notes' into a list)
    if 'provenance' not in target_data:
        target_data['provenance'] = {}
    if 'notes' not in target_data['provenance']:
        target_data['provenance']['notes'] = []
    elif isinstance(target_data['provenance']['notes'], str):
        target_data['provenance']['notes'] = [target_data['provenance']['notes']]

    merge_note = f"Staff data merged from {source_name} on {datetime.now(timezone.utc).isoformat()}"
    target_data['provenance']['notes'].append(merge_note)

    return target_data, len(staff_list)


def find_target_file(source_ghcid: str, custodian_dir: Path) -> Optional[Path]:
    """Find the target custodian file for a source GHCID.

    Tries the exact '<ghcid>.yaml' name first, then falls back to the first
    file whose name starts with the GHCID (some files carry a name suffix).
    Returns None when the GHCID has no mapping or no file matches.
    """
    if source_ghcid in DUPLICATE_MAPPINGS:
        target_ghcid = DUPLICATE_MAPPINGS[source_ghcid]
        target_file = custodian_dir / f"{target_ghcid}.yaml"
        if target_file.exists():
            return target_file
        # Try with name suffix — take the first glob match, if any.
        return next(custodian_dir.glob(f"{target_ghcid}*.yaml"), None)
    return None


def process_archived_duplicates(custodian_dir: Path, archive_dir: Path, dry_run: bool = True):
    """Process all archived duplicate files and merge their staff data."""
    print("=" * 80)
    print("MERGING STAFF DATA FROM ARCHIVED DUPLICATES")
    print("=" * 80)
    print(f"Archive directory: {archive_dir}")
    print(f"Custodian directory: {custodian_dir}")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE'}")
    print()

    total_merged = 0
    total_staff = 0
    errors = []

    for source_file in sorted(archive_dir.glob("NL-XX-XXX-PENDING-*.yaml")):
        source_ghcid = source_file.stem
        target_file = find_target_file(source_ghcid, custodian_dir)

        if not target_file:
            errors.append(f"{source_ghcid}: No target file found")
            continue

        # Load both files; collect (don't abort on) per-file load errors.
        try:
            source_data = load_yaml(source_file)
            target_data = load_yaml(target_file)
        except Exception as e:
            errors.append(f"{source_ghcid}: Error loading files: {e}")
            continue

        # Check if source has staff ('staff:' may be present but empty)
        source_staff_count = len((source_data.get('staff') or {}).get('staff_list') or [])
        if source_staff_count == 0:
            continue

        # Check if target already has staff
        target_staff_count = len((target_data.get('staff') or {}).get('staff_list') or [])

        print(f"[{'DRY RUN' if dry_run else 'MERGING'}] {source_ghcid}")
        print(f"  Source: {source_staff_count} staff")
        print(f"  Target: {target_file.name} ({target_staff_count} existing staff)")

        if target_staff_count > 0:
            print(f"  SKIPPED: Target already has staff data")
            continue

        # Merge
        merged_data, staff_added = merge_staff_data(source_data, target_data, source_file.name)

        if staff_added > 0:
            # Phrase the message to match the mode (was "Would add" even in LIVE).
            print(f"  -> {'Would add' if dry_run else 'Adding'} {staff_added} staff members")
            if not dry_run:
                save_yaml(target_file, merged_data)
                print(f"  -> SAVED")
            total_merged += 1
            total_staff += staff_added
        print()

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Files merged: {total_merged}")
    print(f"Total staff added: {total_staff}")
    if errors:
        print(f"Errors: {len(errors)}")
        for e in errors:
            print(f"  - {e}")


def process_single_merge(source_file: Path, target_file: Path, dry_run: bool = True):
    """Merge a single source file into a target file."""
    print(f"Source: {source_file}")
    print(f"Target: {target_file}")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE'}")
    print()

    source_data = load_yaml(source_file)
    target_data = load_yaml(target_file)

    source_staff_count = len((source_data.get('staff') or {}).get('staff_list') or [])
    target_staff_count = len((target_data.get('staff') or {}).get('staff_list') or [])

    print(f"Source staff: {source_staff_count}")
    print(f"Target staff: {target_staff_count}")

    if target_staff_count > 0:
        print("WARNING: Target already has staff data. Merge would overwrite.")
        return

    merged_data, staff_added = merge_staff_data(source_data, target_data, source_file.name)

    if staff_added > 0:
        # Phrase the message to match the mode (was "Would add" even in LIVE).
        print(f"{'Would add' if dry_run else 'Adding'} {staff_added} staff members")
        if not dry_run:
            save_yaml(target_file, merged_data)
            print("SAVED")


def main():
    parser = argparse.ArgumentParser(description='Merge staff data from PENDING files')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without saving')
    parser.add_argument('--source-file', type=Path,
                        help='Single source file to merge')
    parser.add_argument('--target-file', type=Path,
                        help='Single target file to merge into')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Custodian directory')
    parser.add_argument('--archive-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian/archive/pending_duplicates_20250109'),
                        help='Archive directory with duplicates')

    args = parser.parse_args()

    # Single-file mode requires BOTH --source-file and --target-file;
    # otherwise fall through to the bulk archive sweep.
    if args.source_file and args.target_file:
        process_single_merge(args.source_file, args.target_file, args.dry_run)
    else:
        process_archived_duplicates(args.custodian_dir, args.archive_dir, args.dry_run)


if __name__ == '__main__':
    main()