glam/scripts/merge_collision_files.py
kempersc eaf80ec756 data(custodian): merge PENDING collision files into existing custodians
Merge staff data from 7 PENDING files into their matching custodian records:
- NL-XX-XXX-PENDING-SPOT-GRONINGEN → NL-GR-GRO-M-SG (SPOT Groningen, 120 staff)
- NL-XX-XXX-PENDING-DIENST-UITVOERING-ONDERWIJS → NL-GR-GRO-O-DUO
- NL-XX-XXX-PENDING-ANNE-FRANK-STICHTING → NL-NH-AMS-M-AFS
- NL-XX-XXX-PENDING-ALLARD-PIERSON → NL-NH-AMS-M-AP
- NL-XX-XXX-PENDING-STICHTING-JOODS-HISTORISCH-MUSEUM → NL-NH-AMS-M-JHM
- NL-XX-XXX-PENDING-MINISTERIE-VAN-BUITENLANDSE-ZAKEN → NL-ZH-DHA-O-MBZ
- NL-XX-XXX-PENDING-MINISTERIE-VAN-JUSTITIE-EN-VEILIGHEID → NL-ZH-DHA-O-MJV

Originals archived in data/custodian/archive/pending_collisions_20250109/
Add scripts/merge_collision_files.py for reproducible merging
2026-01-09 18:33:00 +01:00

247 lines
7.8 KiB
Python

#!/usr/bin/env python3
"""
Merge staff data from collision PENDING files into existing files.
Then archive the PENDING files.
Usage:
python scripts/merge_collision_files.py --dry-run
python scripts/merge_collision_files.py
"""
import yaml
from pathlib import Path
import re
import shutil
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List
# City patterns (same as the resolver): word-boundary regex on the lowercased
# name -> (province code, city code) used to build the GHCID.
CITY_PATTERNS = {
    r'\bamsterdam\b': ('NH', 'AMS'),
    r'\brotterdam\b': ('ZH', 'ROT'),
    r'\bden haag\b': ('ZH', 'DHA'),
    r'\butrecht\b': ('UT', 'UTR'),
    r'\bgroningen\b': ('GR', 'GRO'),
    r'\bdelft\b': ('ZH', 'DEL'),
    r'\bhaarlem\b': ('NH', 'HAA'),
    r'\bmaastricht\b': ('LI', 'MAA'),
    r'\btilburg\b': ('NB', 'TIL'),
    r'\barnhem\b': ('GE', 'ARN'),
    r'\bnijmegen\b': ('GE', 'NIJ'),
    r'\bleiden\b': ('ZH', 'LEI'),
    r'\beindhoven\b': ('NB', 'EIN'),
    r'\bbreda\b': ('NB', 'BRE'),
    r'\bapeldoorn\b': ('GE', 'APE'),
    r'\bdeventer\b': ('OV', 'DEV'),
    r'\bzwolle\b': ('OV', 'ZWO'),
}
# Curated lookup: lowercase name substring -> (province, city, type code, abbrev).
# Checked before the generic city-pattern fallback so these well-known
# institutions always get their canonical GHCID.
KNOWN_ORGANIZATIONS = {
    'ministerie van buitenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van justitie en veiligheid': ('ZH', 'DHA', 'O', 'MJV'),
    'dienst uitvoering onderwijs': ('GR', 'GRO', 'O', 'DUO'),
    'nationaal archief': ('ZH', 'DHA', 'A', 'NA'),
    'van gogh museum': ('NH', 'AMS', 'M', 'VGM'),
    'joods historisch museum': ('NH', 'AMS', 'M', 'JHM'),
    'airborne museum': ('GE', 'ARN', 'M', 'ABM'),
    'niod': ('NH', 'AMS', 'R', 'NIOD'),
    'allard pierson': ('NH', 'AMS', 'M', 'AP'),
    'anne frank stichting': ('NH', 'AMS', 'M', 'AFS'),
}
# Name keyword (Dutch/English) -> institution type code:
# M=museum, A=archive, L=library, E=education, O=government/official body.
TYPE_KEYWORDS = {
    'museum': 'M',
    'musea': 'M',
    'archief': 'A',
    'archive': 'A',
    'bibliotheek': 'L',
    'library': 'L',
    'universiteit': 'E',
    'university': 'E',
    'hogeschool': 'E',
    'academie': 'E',
    'ministerie': 'O',
    'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
}
def gen_abbrev(name: str) -> str:
    """Build an uppercase initialism (max 8 chars) from *name*.

    Filler words (articles, prepositions, generic institution words) are
    skipped; falls back to 'UNK' when nothing usable remains.
    """
    stopwords = {'de', 'het', 'van', 'voor', 'museum', 'stichting', 'archief', 'bibliotheek', 'en', 'of'}
    initials = []
    for token in re.split(r'[\s\-]+', name):
        # Keep only tokens that start with a letter and are not filler words.
        if token and token.lower() not in stopwords and token[0].isalpha():
            initials.append(token[0].upper())
    abbrev = ''.join(initials)[:8]
    return abbrev if abbrev else 'UNK'
def infer_type(name: str) -> str:
    """Infer the one-letter institution type code from *name*.

    The first TYPE_KEYWORDS entry (in insertion order) found as a substring
    of the lowercased name wins; defaults to 'M' (museum).
    """
    lowered = name.lower()
    return next(
        (code for keyword, code in TYPE_KEYWORDS.items() if keyword in lowered),
        'M',
    )
def get_target_ghcid(name: str) -> Optional[str]:
    """Resolve an institution *name* to its target GHCID string.

    Resolution order: curated KNOWN_ORGANIZATIONS substring match first,
    then city extraction via CITY_PATTERNS with inferred type/abbreviation.
    Returns None when neither strategy matches.
    """
    lowered = name.lower()

    # Curated organization table takes priority over heuristics.
    for needle, (prov, city, type_code, abbrev) in KNOWN_ORGANIZATIONS.items():
        if needle in lowered:
            return f'NL-{prov}-{city}-{type_code}-{abbrev}'

    # Fall back to a city-name regex, deriving type and abbreviation from the name.
    for city_re, (prov, city) in CITY_PATTERNS.items():
        if re.search(city_re, lowered):
            return f'NL-{prov}-{city}-{infer_type(name)}-{gen_abbrev(name)}'

    return None
def merge_staff_section(existing: Dict, pending: Dict) -> int:
    """Merge the staff section of *pending* into *existing* in place.

    Staff entries are deduplicated by ``staff_id`` so the merge is safe to
    re-run; a provenance record describing the merge is appended to
    ``existing['staff']['provenance']['merged_from']``.

    Args:
        existing: target custodian record (mutated in place).
        pending: PENDING custodian record providing the staff to merge.

    Returns:
        Number of staff entries actually added to *existing*.
    """
    # Guard against a missing staff section or one that YAML parsed as None
    # (the original `pending.get('staff', {})` crashed on an explicit null).
    pending_section = pending.get('staff') or {}
    pending_staff = pending_section.get('staff_list') or []
    if not pending_staff:
        return 0

    # Initialize the target staff section if the record lacks one, seeding
    # provenance from the pending file.
    if 'staff' not in existing:
        existing['staff'] = {
            'provenance': pending_section.get('provenance', {}),
            'staff_list': []
        }

    # Pre-existing sections may lack these keys or hold YAML nulls; normalize
    # before use (the original raised KeyError when 'provenance' was absent).
    target = existing['staff']
    if not isinstance(target.get('staff_list'), list):
        target['staff_list'] = []
    if not isinstance(target.get('provenance'), dict):
        target['provenance'] = {}

    # Collect IDs already present so repeated runs stay idempotent.
    existing_ids = set()
    for s in target['staff_list']:
        if s.get('staff_id'):
            existing_ids.add(s['staff_id'])

    # Append only staff whose ID is not already present.
    # NOTE(review): entries without a staff_id share the key None, so only the
    # first ID-less entry is merged — preserved from the original; confirm intent.
    added = 0
    for s in pending_staff:
        if s.get('staff_id') not in existing_ids:
            target['staff_list'].append(s)
            existing_ids.add(s.get('staff_id'))
            added += 1

    # Record where the merged entries came from.
    if 'provenance' in pending_section:
        merged_from = target['provenance'].setdefault('merged_from', [])
        merged_from.append({
            'source': pending.get('ghcid_current', 'unknown'),
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'staff_added': added
        })
    return added
def load_yaml(filepath: Path) -> Optional[Dict]:
    """Parse a YAML file; return None (after printing the error) on any failure."""
    try:
        with filepath.open('r', encoding='utf-8') as fh:
            return yaml.safe_load(fh)
    except Exception as e:
        # Best-effort loader: report and let the caller skip this file.
        print(f"Error loading {filepath}: {e}")
        return None
def save_yaml(filepath: Path, data: Dict):
    """Write *data* to *filepath* as readable, insertion-ordered YAML."""
    with filepath.open('w', encoding='utf-8') as fh:
        yaml.dump(
            data,
            fh,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )
def main():
    """CLI entry point: merge NL PENDING collision files into existing custodians.

    Scans the custodian directory for NL-XX-XXX-PENDING-*.yaml files, resolves
    each file's target GHCID from its emic name, and — when the target file
    already exists on disk — merges the pending staff into it and archives the
    pending file. Use --dry-run to report changes without writing anything.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    # NOTE(review): default is a developer-specific absolute path — confirm it
    # should not instead be resolved relative to the repository root.
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()
    custodian_dir = args.custodian_dir
    # Merged pending originals are moved here (only in live mode).
    archive_dir = custodian_dir / 'archive' / 'pending_collisions_20250109'
    print("=" * 80)
    print("COLLISION FILE MERGER")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()
    # Find all NL PENDING files and detect collisions: a collision is a pending
    # file whose resolved GHCID already exists as a yaml file on disk.
    pending_files = list(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    collisions = []
    for f in pending_files:
        data = load_yaml(f)
        if not data:
            continue
        name = data.get('custodian_name', {}).get('emic_name', '')
        if not name:
            continue
        target_ghcid = get_target_ghcid(name)
        if target_ghcid:
            target_path = custodian_dir / f'{target_ghcid}.yaml'
            if target_path.exists():
                collisions.append((f, target_path, name, data))
    print(f"Found {len(collisions)} collision files to merge")
    print()
    total_staff_merged = 0
    files_merged = 0
    for pending_path, target_path, name, pending_data in collisions:
        target_data = load_yaml(target_path)
        if not target_data:
            print(f"[SKIP] Cannot load target: {target_path.name}")
            continue
        # Count staff in pending; nothing to do if the section is empty.
        pending_staff_count = len(pending_data.get('staff', {}).get('staff_list', []))
        if pending_staff_count == 0:
            print(f"[SKIP] No staff in: {name[:50]}")
            continue
        # Merge (mutates target_data in place; returns number of entries added).
        staff_added = merge_staff_section(target_data, pending_data)
        if staff_added > 0:
            print(f"[{'DRY RUN' if args.dry_run else 'MERGE'}] {name[:50]}")
            print(f" Staff: +{staff_added} (from {pending_staff_count} total)")
            print(f" {pending_path.name} -> {target_path.name}")
            print()
            if not args.dry_run:
                # Save updated target
                save_yaml(target_path, target_data)
                # Archive pending file
                archive_dir.mkdir(parents=True, exist_ok=True)
                shutil.move(str(pending_path), str(archive_dir / pending_path.name))
            # Counters update in dry-run too, so the summary reflects the plan.
            total_staff_merged += staff_added
            files_merged += 1
        else:
            print(f"[SKIP] All staff already exist: {name[:50]}")
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f" Files merged: {files_merged}")
    print(f" Staff added: {total_staff_merged}")
    if not args.dry_run:
        print(f" Archived to: {archive_dir}")
# Allow importing this module (e.g. for reuse of the merge helpers) without
# triggering the CLI.
if __name__ == '__main__':
    main()