#!/usr/bin/env python3
"""
Merge staff data from collision PENDING files into existing files.
Then archive the PENDING files.

Usage:
    python scripts/merge_collision_files.py --dry-run
    python scripts/merge_collision_files.py
"""
import yaml
from pathlib import Path
import re
import shutil
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List

# City patterns (same as resolver): regex -> (province code, city code).
CITY_PATTERNS = {
    r'\bamsterdam\b': ('NH', 'AMS'),
    r'\brotterdam\b': ('ZH', 'ROT'),
    r'\bden haag\b': ('ZH', 'DHA'),
    r'\butrecht\b': ('UT', 'UTR'),
    r'\bgroningen\b': ('GR', 'GRO'),
    r'\bdelft\b': ('ZH', 'DEL'),
    r'\bhaarlem\b': ('NH', 'HAA'),
    r'\bmaastricht\b': ('LI', 'MAA'),
    r'\btilburg\b': ('NB', 'TIL'),
    r'\barnhem\b': ('GE', 'ARN'),
    r'\bnijmegen\b': ('GE', 'NIJ'),
    r'\bleiden\b': ('ZH', 'LEI'),
    r'\beindhoven\b': ('NB', 'EIN'),
    r'\bbreda\b': ('NB', 'BRE'),
    r'\bapeldoorn\b': ('GE', 'APE'),
    r'\bdeventer\b': ('OV', 'DEV'),
    r'\bzwolle\b': ('OV', 'ZWO'),
}

# Known organisations: lowercase name substring -> (province, city, type, abbrev).
KNOWN_ORGANIZATIONS = {
    'ministerie van buitenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van justitie en veiligheid': ('ZH', 'DHA', 'O', 'MJV'),
    'dienst uitvoering onderwijs': ('GR', 'GRO', 'O', 'DUO'),
    'nationaal archief': ('ZH', 'DHA', 'A', 'NA'),
    'van gogh museum': ('NH', 'AMS', 'M', 'VGM'),
    'joods historisch museum': ('NH', 'AMS', 'M', 'JHM'),
    'airborne museum': ('GE', 'ARN', 'M', 'ABM'),
    'niod': ('NH', 'AMS', 'R', 'NIOD'),
    'allard pierson': ('NH', 'AMS', 'M', 'AP'),
    'anne frank stichting': ('NH', 'AMS', 'M', 'AFS'),
}

# Keyword -> single-letter institution type code (M=museum, A=archive,
# L=library, E=education, O=government — inferred from the mapping below).
TYPE_KEYWORDS = {
    'museum': 'M', 'musea': 'M',
    'archief': 'A', 'archive': 'A',
    'bibliotheek': 'L', 'library': 'L',
    'universiteit': 'E', 'university': 'E',
    'hogeschool': 'E', 'academie': 'E',
    'ministerie': 'O', 'ministry': 'O',
    'gemeente': 'O', 'politie': 'O',
}


def gen_abbrev(name: str) -> str:
    """Generate an uppercase abbreviation (max 8 chars) from *name*.

    Takes the first letter of each word, skipping Dutch stop/filler words;
    falls back to 'UNK' when nothing usable remains.
    """
    skip = {'de', 'het', 'van', 'voor', 'museum', 'stichting', 'archief',
            'bibliotheek', 'en', 'of'}
    words = re.split(r'[\s\-]+', name)
    # Guard `w` before indexing w[0]; only alphabetic initials contribute.
    initials = ''.join(
        w[0].upper() for w in words
        if w and w.lower() not in skip and w[0].isalpha()
    )
    return initials[:8] or 'UNK'


def infer_type(name: str) -> str:
    """Infer the institution type code from *name* via TYPE_KEYWORDS.

    Returns 'M' (Museum) when no keyword matches.
    """
    name_lower = name.lower()
    for keyword, type_code in TYPE_KEYWORDS.items():
        if keyword in name_lower:
            return type_code
    return 'M'  # Default to Museum


def get_target_ghcid(name: str) -> Optional[str]:
    """Get the target GHCID for a given name.

    Resolution order: exact known-organisation substring match first, then
    city-pattern extraction with inferred type and generated abbreviation.
    Returns None when neither strategy matches.
    """
    name_lower = name.lower()
    # Known org lookup first
    for pattern, (prov, city, t, abbrev) in KNOWN_ORGANIZATIONS.items():
        if pattern in name_lower:
            return f'NL-{prov}-{city}-{t}-{abbrev}'
    # City extraction
    for pattern, (prov, city) in CITY_PATTERNS.items():
        if re.search(pattern, name_lower):
            t = infer_type(name)
            abbrev = gen_abbrev(name)
            return f'NL-{prov}-{city}-{t}-{abbrev}'
    return None


def merge_staff_section(existing: Dict, pending: Dict) -> int:
    """Merge the staff section from *pending* into *existing* (in place).

    Staff entries are de-duplicated by ``staff_id``; a ``merged_from``
    provenance record is appended describing the merge. Returns the number
    of staff entries added.
    """
    if 'staff' not in pending:
        return 0
    pending_staff = pending.get('staff', {}).get('staff_list', [])
    if not pending_staff:
        return 0

    # Initialize staff section if needed.
    # Copy the provenance dict: the original code aliased pending's dict
    # here, so the merged_from append below corrupted the pending data.
    if 'staff' not in existing:
        existing['staff'] = {
            'provenance': dict(pending['staff'].get('provenance', {})),
            'staff_list': []
        }

    # Get existing staff IDs (entries without a truthy staff_id are ignored).
    existing_ids = {
        s['staff_id']
        for s in existing.get('staff', {}).get('staff_list', [])
        if s.get('staff_id')
    }

    # Add new staff
    added = 0
    for s in pending_staff:
        if s.get('staff_id') not in existing_ids:
            existing['staff']['staff_list'].append(s)
            existing_ids.add(s.get('staff_id'))
            added += 1

    # Update provenance. setdefault guards against an existing staff
    # section that has no 'provenance' key (previously a KeyError).
    if 'provenance' in pending['staff']:
        prov = existing['staff'].setdefault('provenance', {})
        prov.setdefault('merged_from', []).append({
            'source': pending.get('ghcid_current', 'unknown'),
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'staff_added': added
        })
    return added


def load_yaml(filepath: Path) -> Optional[Dict]:
    """Load a YAML file; return None (and print the error) on failure."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception as e:
        # Best-effort loader: callers skip files that fail to parse.
        print(f"Error loading {filepath}: {e}")
        return None


def save_yaml(filepath: Path, data: Dict):
    """Save *data* to *filepath* as block-style YAML, preserving key order."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)


def main():
    """Find collision PENDING files, merge their staff into the matching
    target file, and archive the PENDING file (unless --dry-run)."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    archive_dir = custodian_dir / 'archive' / 'pending_collisions_20250109'

    print("=" * 80)
    print("COLLISION FILE MERGER")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    # Find all NL PENDING files and detect collisions: a PENDING file whose
    # resolved GHCID already has a file on disk.
    pending_files = list(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    collisions = []
    for f in pending_files:
        data = load_yaml(f)
        if not data:
            continue
        name = data.get('custodian_name', {}).get('emic_name', '')
        if not name:
            continue
        target_ghcid = get_target_ghcid(name)
        if target_ghcid:
            target_path = custodian_dir / f'{target_ghcid}.yaml'
            if target_path.exists():
                collisions.append((f, target_path, name, data))

    print(f"Found {len(collisions)} collision files to merge")
    print()

    total_staff_merged = 0
    files_merged = 0
    for pending_path, target_path, name, pending_data in collisions:
        target_data = load_yaml(target_path)
        if not target_data:
            print(f"[SKIP] Cannot load target: {target_path.name}")
            continue

        # Count staff in pending
        pending_staff_count = len(
            pending_data.get('staff', {}).get('staff_list', []))
        if pending_staff_count == 0:
            print(f"[SKIP] No staff in: {name[:50]}")
            continue

        # Merge
        staff_added = merge_staff_section(target_data, pending_data)
        if staff_added > 0:
            print(f"[{'DRY RUN' if args.dry_run else 'MERGE'}] {name[:50]}")
            print(f" Staff: +{staff_added} (from {pending_staff_count} total)")
            print(f" {pending_path.name} -> {target_path.name}")
            print()
            if not args.dry_run:
                # Save updated target
                save_yaml(target_path, target_data)
                # Archive pending file
                archive_dir.mkdir(parents=True, exist_ok=True)
                shutil.move(str(pending_path), str(archive_dir / pending_path.name))
            total_staff_merged += staff_added
            files_merged += 1
        else:
            print(f"[SKIP] All staff already exist: {name[:50]}")

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f" Files merged: {files_merged}")
    print(f" Staff added: {total_staff_merged}")
    if not args.dry_run:
        print(f" Archived to: {archive_dir}")


if __name__ == '__main__':
    main()