data(custodian): merge PENDING collision files into existing custodians

Merge staff data from 7 PENDING files into their matching custodian records:
- NL-XX-XXX-PENDING-SPOT-GRONINGEN → NL-GR-GRO-M-SG (SPOT Groningen, 120 staff)
- NL-XX-XXX-PENDING-DIENST-UITVOERING-ONDERWIJS → NL-GR-GRO-O-DUO
- NL-XX-XXX-PENDING-ANNE-FRANK-STICHTING → NL-NH-AMS-M-AFS
- NL-XX-XXX-PENDING-ALLARD-PIERSON → NL-NH-AMS-M-AP
- NL-XX-XXX-PENDING-STICHTING-JOODS-HISTORISCH-MUSEUM → NL-NH-AMS-M-JHM
- NL-XX-XXX-PENDING-MINISTERIE-VAN-BUITENLANDSE-ZAKEN → NL-ZH-DHA-O-MBZ
- NL-XX-XXX-PENDING-MINISTERIE-VAN-JUSTITIE-EN-VEILIGHEID → NL-ZH-DHA-O-MJV

Originals archived in data/custodian/archive/pending_collisions_20250109/
Add scripts/merge_collision_files.py for reproducible merging
This commit is contained in:
kempersc 2026-01-09 18:33:00 +01:00
parent e9c9aefc37
commit eaf80ec756
15 changed files with 23041 additions and 645 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -11,6 +11,10 @@ staff:
registered_timestamp: '2025-12-30T09:59:36Z'
registration_method: html_parsing_with_full_staff_data
total_staff_extracted: 3
merged_from:
- source: NL-XX-XXX-PENDING-STICHTING-JOODS-HISTORISCH-MUSEUM
timestamp: '2026-01-09T17:32:05.360611+00:00'
staff_added: 2
staff_list:
- staff_id: joods-historisch-museum-jewish-historical-museum_staff_0000_victor_brilleman
person_name: Victor Brilleman
@ -36,6 +40,22 @@ staff:
heritage_type: M
linkedin_profile_url: https://www.linkedin.com/in/renata-klasson-305526324
linkedin_slug: renata-klasson-305526324
- staff_id: stichting-joods-historisch-museum_staff_0000_conchita_peral_van_oversteeg
person_name: Conchita Peral van Oversteeg
person_profile_path: data/custodian/person/entity/conchita-peral-van-oversteeg-309174a_*.json
role_title: Financieel Administratief medewerker bij Stichting Joods Historisch Museum
heritage_relevant: false
heritage_type: null
linkedin_profile_url: https://www.linkedin.com/in/conchita-peral-van-oversteeg-309174a
linkedin_slug: conchita-peral-van-oversteeg-309174a
- staff_id: stichting-joods-historisch-museum_staff_0001_billha_zussman
person_name: Billha Zussman
person_profile_path: data/custodian/person/entity/billha-zussman-4b76672a_*.json
role_title: Government Relations Services
heritage_relevant: false
heritage_type: null
linkedin_profile_url: https://www.linkedin.com/in/billha-zussman-4b76672a
linkedin_slug: billha-zussman-4b76672a
linkedin_enrichment:
source_file: (21) Joods Historisch Museum _ Jewish Historical Museum_ People _ LinkedIn.html
extraction_date: '2025-12-30T09:59:36Z'

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,247 @@
#!/usr/bin/env python3
"""
Merge staff data from collision PENDING files into existing files.
Then archive the PENDING files.
Usage:
python scripts/merge_collision_files.py --dry-run
python scripts/merge_collision_files.py
"""
import yaml
from pathlib import Path
import re
import shutil
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List
# City patterns (same as resolver)
# Maps a word-boundary regex for a Dutch city name to its
# (province code, city code) pair as used in GHCID strings.
# Matched with re.search against a lowercased institution name
# in get_target_ghcid.
CITY_PATTERNS = {
    r'\bamsterdam\b': ('NH', 'AMS'),
    r'\brotterdam\b': ('ZH', 'ROT'),
    r'\bden haag\b': ('ZH', 'DHA'),
    r'\butrecht\b': ('UT', 'UTR'),
    r'\bgroningen\b': ('GR', 'GRO'),
    r'\bdelft\b': ('ZH', 'DEL'),
    r'\bhaarlem\b': ('NH', 'HAA'),
    r'\bmaastricht\b': ('LI', 'MAA'),
    r'\btilburg\b': ('NB', 'TIL'),
    r'\barnhem\b': ('GE', 'ARN'),
    r'\bnijmegen\b': ('GE', 'NIJ'),
    r'\bleiden\b': ('ZH', 'LEI'),
    r'\beindhoven\b': ('NB', 'EIN'),
    r'\bbreda\b': ('NB', 'BRE'),
    r'\bapeldoorn\b': ('GE', 'APE'),
    r'\bdeventer\b': ('OV', 'DEV'),
    r'\bzwolle\b': ('OV', 'ZWO'),
}
# Curated lowercase name fragment -> (province, city, type, abbreviation).
# Checked BEFORE the city-pattern heuristic in get_target_ghcid so that
# well-known institutions resolve to stable, hand-picked GHCIDs.
KNOWN_ORGANIZATIONS = {
    'ministerie van buitenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van justitie en veiligheid': ('ZH', 'DHA', 'O', 'MJV'),
    'dienst uitvoering onderwijs': ('GR', 'GRO', 'O', 'DUO'),
    'nationaal archief': ('ZH', 'DHA', 'A', 'NA'),
    'van gogh museum': ('NH', 'AMS', 'M', 'VGM'),
    'joods historisch museum': ('NH', 'AMS', 'M', 'JHM'),
    'airborne museum': ('GE', 'ARN', 'M', 'ABM'),
    'niod': ('NH', 'AMS', 'R', 'NIOD'),
    'allard pierson': ('NH', 'AMS', 'M', 'AP'),
    'anne frank stichting': ('NH', 'AMS', 'M', 'AFS'),
}
# Lowercase substring -> single-letter institution type code.
# Judging from the keys: M=museum, A=archive, L=library, E=education,
# O=government body. infer_type falls back to 'M' when nothing matches.
TYPE_KEYWORDS = {
    'museum': 'M',
    'musea': 'M',
    'archief': 'A',
    'archive': 'A',
    'bibliotheek': 'L',
    'library': 'L',
    'universiteit': 'E',
    'university': 'E',
    'hogeschool': 'E',
    'academie': 'E',
    'ministerie': 'O',
    'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
}
def gen_abbrev(name: str) -> str:
    """Build an uppercase initialism (at most 8 chars) from *name*.

    Splits on whitespace and hyphens, drops common Dutch filler and
    institution words, and keeps the first letter of each remaining
    alphabetic word. Returns 'UNK' when nothing usable remains.
    """
    stop_words = {'de', 'het', 'van', 'voor', 'museum', 'stichting',
                  'archief', 'bibliotheek', 'en', 'of'}
    initials = []
    for word in re.split(r'[\s\-]+', name):
        if not word or word.lower() in stop_words:
            continue
        if word[0].isalpha():
            initials.append(word[0].upper())
    abbrev = ''.join(initials)[:8]
    return abbrev if abbrev else 'UNK'
def infer_type(name: str) -> str:
    """Infer the single-letter institution type code for *name*.

    Scans TYPE_KEYWORDS in declaration order for a substring match
    against the lowercased name; defaults to 'M' (Museum).
    """
    lowered = name.lower()
    return next(
        (code for keyword, code in TYPE_KEYWORDS.items() if keyword in lowered),
        'M',
    )
def get_target_ghcid(name: str) -> Optional[str]:
    """Resolve an institution *name* to its canonical GHCID, or None.

    Resolution order:
      1. Curated KNOWN_ORGANIZATIONS substring lookup (stable IDs).
      2. CITY_PATTERNS regex match, combining the city's codes with an
         inferred type letter and a generated abbreviation.
    """
    lowered = name.lower()
    # Hand-curated organizations take precedence over heuristics.
    for needle, (province, city, type_code, abbrev) in KNOWN_ORGANIZATIONS.items():
        if needle in lowered:
            return f'NL-{province}-{city}-{type_code}-{abbrev}'
    # Otherwise try to locate a known city name anywhere in the name.
    for city_regex, (province, city) in CITY_PATTERNS.items():
        if re.search(city_regex, lowered):
            return f'NL-{province}-{city}-{infer_type(name)}-{gen_abbrev(name)}'
    return None
def merge_staff_section(existing: Dict, pending: Dict) -> int:
    """Merge the staff section of *pending* into *existing* (in place).

    Staff entries are deduplicated by ``staff_id``: entries whose id is
    already present in *existing* are skipped. When the pending staff
    section carries provenance, a ``merged_from`` record (source GHCID,
    UTC timestamp, count added) is appended to the target's provenance.

    Args:
        existing: Target custodian document; mutated in place.
        pending: Source PENDING custodian document; not modified.

    Returns:
        Number of staff entries actually added to *existing*.
    """
    pending_staff = pending.get('staff', {}).get('staff_list', [])
    if not pending_staff:
        # Covers both a missing 'staff' section and an empty staff_list.
        return 0
    # Ensure the target has a staff section to merge into.
    if 'staff' not in existing:
        existing['staff'] = {
            'provenance': pending['staff'].get('provenance', {}),
            'staff_list': [],
        }
    # Bug fix: a pre-existing staff section may lack 'provenance' or
    # 'staff_list'; the original code KeyError'd on the provenance update
    # below in that case. Guard both keys before use.
    existing['staff'].setdefault('staff_list', [])
    existing['staff'].setdefault('provenance', {})

    existing_ids = {
        s['staff_id']
        for s in existing['staff']['staff_list']
        if s.get('staff_id')
    }
    added = 0
    for entry in pending_staff:
        if entry.get('staff_id') not in existing_ids:
            existing['staff']['staff_list'].append(entry)
            existing_ids.add(entry.get('staff_id'))
            added += 1
    # Record merge provenance for auditability (mirrors the YAML diff's
    # merged_from entries).
    if 'provenance' in pending['staff']:
        merged_from = existing['staff']['provenance'].setdefault('merged_from', [])
        merged_from.append({
            'source': pending.get('ghcid_current', 'unknown'),
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'staff_added': added,
        })
    return added
def load_yaml(filepath: Path) -> Optional[Dict]:
    """Read a YAML document from *filepath*.

    Best-effort loader: on any failure the error is printed and None is
    returned so batch processing can continue with the remaining files.
    """
    try:
        with filepath.open('r', encoding='utf-8') as handle:
            return yaml.safe_load(handle)
    except Exception as exc:  # deliberate: report and skip this file
        print(f"Error loading {filepath}: {exc}")
        return None
def save_yaml(filepath: Path, data: Dict) -> None:
    """Write *data* to *filepath* as readable, key-order-preserving YAML."""
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )
def main():
    """Merge NL PENDING collision files into their canonical custodian files.

    Scans the custodian directory for NL-XX-XXX-PENDING-*.yaml files whose
    resolved GHCID already has a canonical file, merges their staff into it,
    and (unless --dry-run) saves the target and archives the PENDING file.
    """
    import argparse
    cli = argparse.ArgumentParser()
    cli.add_argument('--dry-run', action='store_true')
    cli.add_argument('--custodian-dir', type=Path,
                     default=Path('/Users/kempersc/apps/glam/data/custodian'))
    opts = cli.parse_args()

    base_dir = opts.custodian_dir
    archive_dir = base_dir / 'archive' / 'pending_collisions_20250109'

    banner = "=" * 80
    print(banner)
    print("COLLISION FILE MERGER")
    print(banner)
    print(f"Mode: {'DRY RUN' if opts.dry_run else 'LIVE'}")
    print()

    # Pair each loadable PENDING file with an already-existing canonical
    # target file (that coexistence is what makes it a "collision").
    collisions = []
    for pending_path in base_dir.glob('NL-XX-XXX-PENDING-*.yaml'):
        doc = load_yaml(pending_path)
        if not doc:
            continue
        emic_name = doc.get('custodian_name', {}).get('emic_name', '')
        if not emic_name:
            continue
        ghcid = get_target_ghcid(emic_name)
        if not ghcid:
            continue
        canonical_path = base_dir / f'{ghcid}.yaml'
        if canonical_path.exists():
            collisions.append((pending_path, canonical_path, emic_name, doc))

    print(f"Found {len(collisions)} collision files to merge")
    print()

    total_staff_merged = 0
    files_merged = 0
    for pending_path, canonical_path, emic_name, pending_doc in collisions:
        canonical_doc = load_yaml(canonical_path)
        if not canonical_doc:
            print(f"[SKIP] Cannot load target: {canonical_path.name}")
            continue
        pending_staff_count = len(pending_doc.get('staff', {}).get('staff_list', []))
        if pending_staff_count == 0:
            print(f"[SKIP] No staff in: {emic_name[:50]}")
            continue
        staff_added = merge_staff_section(canonical_doc, pending_doc)
        if staff_added <= 0:
            print(f"[SKIP] All staff already exist: {emic_name[:50]}")
            continue
        print(f"[{'DRY RUN' if opts.dry_run else 'MERGE'}] {emic_name[:50]}")
        print(f" Staff: +{staff_added} (from {pending_staff_count} total)")
        print(f" {pending_path.name} -> {canonical_path.name}")
        print()
        if not opts.dry_run:
            # Persist the merged target, then move the PENDING original aside.
            save_yaml(canonical_path, canonical_doc)
            archive_dir.mkdir(parents=True, exist_ok=True)
            shutil.move(str(pending_path), str(archive_dir / pending_path.name))
        total_staff_merged += staff_added
        files_merged += 1

    print(banner)
    print("SUMMARY")
    print(banner)
    print(f" Files merged: {files_merged}")
    print(f" Staff added: {total_staff_merged}")
    if not opts.dry_run:
        print(f" Archived to: {archive_dir}")
if __name__ == '__main__':
    main()