Merge staff data from 7 PENDING files into their matching custodian records:

- NL-XX-XXX-PENDING-SPOT-GRONINGEN → NL-GR-GRO-M-SG (SPOT Groningen, 120 staff)
- NL-XX-XXX-PENDING-DIENST-UITVOERING-ONDERWIJS → NL-GR-GRO-O-DUO
- NL-XX-XXX-PENDING-ANNE-FRANK-STICHTING → NL-NH-AMS-M-AFS
- NL-XX-XXX-PENDING-ALLARD-PIERSON → NL-NH-AMS-M-AP
- NL-XX-XXX-PENDING-STICHTING-JOODS-HISTORISCH-MUSEUM → NL-NH-AMS-M-JHM
- NL-XX-XXX-PENDING-MINISTERIE-VAN-BUITENLANDSE-ZAKEN → NL-ZH-DHA-O-MBZ
- NL-XX-XXX-PENDING-MINISTERIE-VAN-JUSTITIE-EN-VEILIGHEID → NL-ZH-DHA-O-MJV

Originals archived in data/custodian/archive/pending_collisions_20250109/.
Add scripts/merge_collision_files.py for reproducible merging.
247 lines
7.8 KiB
Python
247 lines
7.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Merge staff data from collision PENDING files into existing files.
|
|
Then archive the PENDING files.
|
|
|
|
Usage:
|
|
python scripts/merge_collision_files.py --dry-run
|
|
python scripts/merge_collision_files.py
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
import re
|
|
import shutil
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, Optional, Tuple, List
|
|
|
|
# City patterns (same as resolver)
# Maps a word-boundary regex for a Dutch city name (searched against a
# lowercased institution name) to its (province code, city code) pair.
# NOTE: insertion order is significant — get_target_ghcid returns the
# first pattern that matches.
CITY_PATTERNS = {
    r'\bamsterdam\b': ('NH', 'AMS'),
    r'\brotterdam\b': ('ZH', 'ROT'),
    r'\bden haag\b': ('ZH', 'DHA'),
    r'\butrecht\b': ('UT', 'UTR'),
    r'\bgroningen\b': ('GR', 'GRO'),
    r'\bdelft\b': ('ZH', 'DEL'),
    r'\bhaarlem\b': ('NH', 'HAA'),
    r'\bmaastricht\b': ('LI', 'MAA'),
    r'\btilburg\b': ('NB', 'TIL'),
    r'\barnhem\b': ('GE', 'ARN'),
    r'\bnijmegen\b': ('GE', 'NIJ'),
    r'\bleiden\b': ('ZH', 'LEI'),
    r'\beindhoven\b': ('NB', 'EIN'),
    r'\bbreda\b': ('NB', 'BRE'),
    r'\bapeldoorn\b': ('GE', 'APE'),
    r'\bdeventer\b': ('OV', 'DEV'),
    r'\bzwolle\b': ('OV', 'ZWO'),
}
|
|
|
|
# Exact-match registry for organizations whose GHCID cannot be derived
# from a city name alone. Keys are lowercase substrings checked against
# the lowercased institution name; values are
# (province, city, type code, abbreviation).
# NOTE: checked before CITY_PATTERNS, and insertion order is significant
# (first matching substring wins).
KNOWN_ORGANIZATIONS = {
    'ministerie van buitenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van justitie en veiligheid': ('ZH', 'DHA', 'O', 'MJV'),
    'dienst uitvoering onderwijs': ('GR', 'GRO', 'O', 'DUO'),
    'nationaal archief': ('ZH', 'DHA', 'A', 'NA'),
    'van gogh museum': ('NH', 'AMS', 'M', 'VGM'),
    'joods historisch museum': ('NH', 'AMS', 'M', 'JHM'),
    'airborne museum': ('GE', 'ARN', 'M', 'ABM'),
    'niod': ('NH', 'AMS', 'R', 'NIOD'),
    'allard pierson': ('NH', 'AMS', 'M', 'AP'),
    'anne frank stichting': ('NH', 'AMS', 'M', 'AFS'),
}
|
|
|
|
# Keyword → institution-type-code table used by infer_type().
# Codes seen in this file: M = museum, A = archive, L = library,
# E = education, O = government organization.
# NOTE: insertion order is significant — infer_type returns the code of
# the first keyword found in the name.
TYPE_KEYWORDS = {
    'museum': 'M',
    'musea': 'M',
    'archief': 'A',
    'archive': 'A',
    'bibliotheek': 'L',
    'library': 'L',
    'universiteit': 'E',
    'university': 'E',
    'hogeschool': 'E',
    'academie': 'E',
    'ministerie': 'O',
    'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
}
|
|
|
|
|
|
def gen_abbrev(name: str) -> str:
    """Build an initials-based abbreviation (max 8 chars) from *name*.

    Words are split on whitespace and hyphens; Dutch stop-words and
    words common in institution names are skipped, as are words that do
    not start with a letter. Falls back to 'UNK' when nothing remains.
    """
    stopwords = {'de', 'het', 'van', 'voor', 'museum', 'stichting',
                 'archief', 'bibliotheek', 'en', 'of'}
    initials = []
    for word in re.split(r'[\s\-]+', name):
        if word and word.lower() not in stopwords and word[0].isalpha():
            initials.append(word[0].upper())
    abbrev = ''.join(initials)[:8]
    return abbrev if abbrev else 'UNK'
|
|
|
|
|
|
def infer_type(name: str) -> str:
    """Return the institution type code inferred from *name*.

    The first TYPE_KEYWORDS entry (in insertion order) whose keyword
    occurs in the lowercased name wins; names without any keyword
    default to 'M' (Museum).
    """
    lowered = name.lower()
    return next(
        (code for keyword, code in TYPE_KEYWORDS.items() if keyword in lowered),
        'M',
    )
|
|
|
|
|
|
def get_target_ghcid(name: str) -> Optional[str]:
    """Resolve an institution *name* to a GHCID such as 'NL-NH-AMS-M-AFS'.

    Resolution order: first a substring match against the curated
    KNOWN_ORGANIZATIONS registry, then a city-pattern match combined
    with an inferred type code and generated abbreviation. Returns None
    when neither strategy matches.
    """
    lowered = name.lower()

    # Curated registry takes precedence over heuristic city matching.
    for needle, (prov, city, type_code, abbrev) in KNOWN_ORGANIZATIONS.items():
        if needle in lowered:
            return f'NL-{prov}-{city}-{type_code}-{abbrev}'

    # Heuristic fallback: locate a known city name, then derive the
    # remaining GHCID components from the name itself.
    for city_regex, (prov, city) in CITY_PATTERNS.items():
        if re.search(city_regex, lowered):
            return f'NL-{prov}-{city}-{infer_type(name)}-{gen_abbrev(name)}'

    return None
|
|
|
|
|
|
def merge_staff_section(existing: Dict, pending: Dict) -> int:
    """Merge staff from *pending* into *existing* in place.

    Staff entries are deduplicated by ``staff_id``; an entry whose ID is
    already present in the existing list is skipped. A ``merged_from``
    record (source GHCID, UTC timestamp, count added) is appended to the
    existing staff provenance whenever the pending section carries
    provenance, even if nothing new was added (so repeat runs are
    visible in the audit trail).

    Args:
        existing: Target custodian record (mutated in place).
        pending: Pending custodian record supplying staff data (never
            mutated).

    Returns:
        Number of staff entries appended to *existing*.
    """
    if 'staff' not in pending:
        return 0

    pending_staff = pending.get('staff', {}).get('staff_list', [])
    if not pending_staff:
        return 0

    # Initialize the staff section if needed. Copy the provenance dict:
    # storing pending's dict by reference would let the merged_from
    # append below leak back into the pending data.
    if 'staff' not in existing:
        existing['staff'] = {
            'provenance': dict(pending['staff'].get('provenance', {})),
            'staff_list': []
        }

    # Collect IDs already present so only genuinely new staff are added.
    existing_ids = set()
    for s in existing.get('staff', {}).get('staff_list', []):
        if s.get('staff_id'):
            existing_ids.add(s['staff_id'])

    # Add new staff. An entry without a staff_id yields None, which is
    # added to the set after the first such entry — so at most one
    # id-less entry is appended per merge (keeps re-runs idempotent).
    added = 0
    for s in pending_staff:
        if s.get('staff_id') not in existing_ids:
            existing['staff']['staff_list'].append(s)
            existing_ids.add(s.get('staff_id'))
            added += 1

    # Record merge provenance. setdefault guards against an existing
    # staff section that has a staff_list but no provenance key, which
    # previously raised KeyError.
    if 'provenance' in pending['staff']:
        prov = existing['staff'].setdefault('provenance', {})
        prov.setdefault('merged_from', [])
        prov['merged_from'].append({
            'source': pending.get('ghcid_current', 'unknown'),
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'staff_added': added
        })

    return added
|
|
|
|
|
|
def load_yaml(filepath: Path) -> Optional[Dict]:
    """Parse *filepath* as YAML and return the result.

    On any read or parse error, prints a message and returns None so
    callers can skip unreadable files instead of aborting the run.
    """
    try:
        return yaml.safe_load(filepath.read_text(encoding='utf-8'))
    except Exception as exc:
        print(f"Error loading {filepath}: {exc}")
        return None
|
|
|
|
|
|
def save_yaml(filepath: Path, data: Dict):
    """Write *data* to *filepath* as human-readable YAML.

    Output is UTF-8, block style, key order preserved, wrapped at 120
    columns.
    """
    with open(filepath, 'w', encoding='utf-8') as fh:
        yaml.dump(
            data,
            fh,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )
|
|
|
|
|
|
def main():
    """CLI entry point: merge collision PENDING files, then archive them.

    Scans --custodian-dir for NL-XX-XXX-PENDING-*.yaml files, resolves
    each one's emic name to a target GHCID, and — when the target file
    already exists (a collision) — merges the pending staff into it and
    moves the pending file to the archive directory. With --dry-run,
    only reports what would happen; nothing is written or moved.
    """
    import argparse
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--dry-run', action='store_true',
                        help='report merges without writing or archiving')
    # Default is relative to the working directory so the script is
    # reproducible on any machine (previously a hard-coded absolute path
    # under one user's home directory).
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('data/custodian'),
                        help='directory containing custodian YAML files')
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    archive_dir = custodian_dir / 'archive' / 'pending_collisions_20250109'

    print("=" * 80)
    print("COLLISION FILE MERGER")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    # Find all NL PENDING files and detect collisions (pending files
    # whose resolved GHCID already exists as a custodian file).
    pending_files = list(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    collisions = []

    for f in pending_files:
        data = load_yaml(f)
        if not data:
            continue
        name = data.get('custodian_name', {}).get('emic_name', '')
        if not name:
            continue
        target_ghcid = get_target_ghcid(name)
        if target_ghcid:
            target_path = custodian_dir / f'{target_ghcid}.yaml'
            if target_path.exists():
                collisions.append((f, target_path, name, data))

    print(f"Found {len(collisions)} collision files to merge")
    print()

    total_staff_merged = 0
    files_merged = 0

    for pending_path, target_path, name, pending_data in collisions:
        target_data = load_yaml(target_path)
        if not target_data:
            print(f"[SKIP] Cannot load target: {target_path.name}")
            continue

        # Count staff in pending; nothing to do if there are none.
        pending_staff_count = len(pending_data.get('staff', {}).get('staff_list', []))
        if pending_staff_count == 0:
            print(f"[SKIP] No staff in: {name[:50]}")
            continue

        # Merge (mutates target_data in place; returns count added).
        staff_added = merge_staff_section(target_data, pending_data)

        if staff_added > 0:
            print(f"[{'DRY RUN' if args.dry_run else 'MERGE'}] {name[:50]}")
            print(f"  Staff: +{staff_added} (from {pending_staff_count} total)")
            print(f"  {pending_path.name} -> {target_path.name}")
            print()

            if not args.dry_run:
                # Save updated target, then archive the pending file so
                # it is not picked up again on the next run.
                save_yaml(target_path, target_data)
                archive_dir.mkdir(parents=True, exist_ok=True)
                shutil.move(str(pending_path), str(archive_dir / pending_path.name))

            total_staff_merged += staff_added
            files_merged += 1
        else:
            print(f"[SKIP] All staff already exist: {name[:50]}")

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"  Files merged: {files_merged}")
    print(f"  Staff added: {total_staff_merged}")
    if not args.dry_run:
        print(f"  Archived to: {archive_dir}")


if __name__ == '__main__':
    main()
|