data(custodian): merge PENDING collision files into existing custodians
Merge staff data from 7 PENDING files into their matching custodian records: - NL-XX-XXX-PENDING-SPOT-GRONINGEN → NL-GR-GRO-M-SG (SPOT Groningen, 120 staff) - NL-XX-XXX-PENDING-DIENST-UITVOERING-ONDERWIJS → NL-GR-GRO-O-DUO - NL-XX-XXX-PENDING-ANNE-FRANK-STICHTING → NL-NH-AMS-M-AFS - NL-XX-XXX-PENDING-ALLARD-PIERSON → NL-NH-AMS-M-AP - NL-XX-XXX-PENDING-STICHTING-JOODS-HISTORISCH-MUSEUM → NL-NH-AMS-M-JHM - NL-XX-XXX-PENDING-MINISTERIE-VAN-BUITENLANDSE-ZAKEN → NL-ZH-DHA-O-MBZ - NL-XX-XXX-PENDING-MINISTERIE-VAN-JUSTITIE-EN-VEILIGHEID → NL-ZH-DHA-O-MJV Originals archived in data/custodian/archive/pending_collisions_20250109/ Add scripts/merge_collision_files.py for reproducible merging
This commit is contained in:
parent
e9c9aefc37
commit
eaf80ec756
15 changed files with 23041 additions and 645 deletions
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
|
@ -11,6 +11,10 @@ staff:
|
|||
registered_timestamp: '2025-12-30T09:59:36Z'
|
||||
registration_method: html_parsing_with_full_staff_data
|
||||
total_staff_extracted: 3
|
||||
merged_from:
|
||||
- source: NL-XX-XXX-PENDING-STICHTING-JOODS-HISTORISCH-MUSEUM
|
||||
timestamp: '2026-01-09T17:32:05.360611+00:00'
|
||||
staff_added: 2
|
||||
staff_list:
|
||||
- staff_id: joods-historisch-museum-jewish-historical-museum_staff_0000_victor_brilleman
|
||||
person_name: Victor Brilleman
|
||||
|
|
@ -36,6 +40,22 @@ staff:
|
|||
heritage_type: M
|
||||
linkedin_profile_url: https://www.linkedin.com/in/renata-klasson-305526324
|
||||
linkedin_slug: renata-klasson-305526324
|
||||
- staff_id: stichting-joods-historisch-museum_staff_0000_conchita_peral_van_oversteeg
|
||||
person_name: Conchita Peral van Oversteeg
|
||||
person_profile_path: data/custodian/person/entity/conchita-peral-van-oversteeg-309174a_*.json
|
||||
role_title: Financieel Administratief medewerker bij Stichting Joods Historisch Museum
|
||||
heritage_relevant: false
|
||||
heritage_type: null
|
||||
linkedin_profile_url: https://www.linkedin.com/in/conchita-peral-van-oversteeg-309174a
|
||||
linkedin_slug: conchita-peral-van-oversteeg-309174a
|
||||
- staff_id: stichting-joods-historisch-museum_staff_0001_billha_zussman
|
||||
person_name: Billha Zussman
|
||||
person_profile_path: data/custodian/person/entity/billha-zussman-4b76672a_*.json
|
||||
role_title: Government Relations Services
|
||||
heritage_relevant: false
|
||||
heritage_type: null
|
||||
linkedin_profile_url: https://www.linkedin.com/in/billha-zussman-4b76672a
|
||||
linkedin_slug: billha-zussman-4b76672a
|
||||
linkedin_enrichment:
|
||||
source_file: (21) Joods Historisch Museum _ Jewish Historical Museum_ People _ LinkedIn.html
|
||||
extraction_date: '2025-12-30T09:59:36Z'
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
247
scripts/merge_collision_files.py
Normal file
247
scripts/merge_collision_files.py
Normal file
|
|
@ -0,0 +1,247 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Merge staff data from collision PENDING files into existing files.
|
||||
Then archive the PENDING files.
|
||||
|
||||
Usage:
|
||||
python scripts/merge_collision_files.py --dry-run
|
||||
python scripts/merge_collision_files.py
|
||||
"""
|
||||
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
import re
|
||||
import shutil
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, Optional, Tuple, List
|
||||
|
||||
# City patterns (same as resolver): regex applied to the lowercased
# institution name -> (province code, city code) for building the GHCID.
CITY_PATTERNS = {
    r'\bamsterdam\b': ('NH', 'AMS'),
    r'\brotterdam\b': ('ZH', 'ROT'),
    r'\bden haag\b': ('ZH', 'DHA'),
    r'\butrecht\b': ('UT', 'UTR'),
    r'\bgroningen\b': ('GR', 'GRO'),
    r'\bdelft\b': ('ZH', 'DEL'),
    r'\bhaarlem\b': ('NH', 'HAA'),
    r'\bmaastricht\b': ('LI', 'MAA'),
    r'\btilburg\b': ('NB', 'TIL'),
    r'\barnhem\b': ('GE', 'ARN'),
    r'\bnijmegen\b': ('GE', 'NIJ'),
    r'\bleiden\b': ('ZH', 'LEI'),
    r'\beindhoven\b': ('NB', 'EIN'),
    r'\bbreda\b': ('NB', 'BRE'),
    r'\bapeldoorn\b': ('GE', 'APE'),
    r'\bdeventer\b': ('OV', 'DEV'),
    r'\bzwolle\b': ('OV', 'ZWO'),
}

# Curated lookup for organisations whose GHCID parts are already known:
# lowercase name fragment -> (province, city, type code, abbreviation).
# Checked before the heuristic city/type/abbreviation path.
KNOWN_ORGANIZATIONS = {
    'ministerie van buitenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van justitie en veiligheid': ('ZH', 'DHA', 'O', 'MJV'),
    'dienst uitvoering onderwijs': ('GR', 'GRO', 'O', 'DUO'),
    'nationaal archief': ('ZH', 'DHA', 'A', 'NA'),
    'van gogh museum': ('NH', 'AMS', 'M', 'VGM'),
    'joods historisch museum': ('NH', 'AMS', 'M', 'JHM'),
    'airborne museum': ('GE', 'ARN', 'M', 'ABM'),
    # 'R' type code appears only here — presumably "research institute";
    # confirm against the GHCID scheme.
    'niod': ('NH', 'AMS', 'R', 'NIOD'),
    'allard pierson': ('NH', 'AMS', 'M', 'AP'),
    'anne frank stichting': ('NH', 'AMS', 'M', 'AFS'),
}

# Keyword (substring of the lowercased name) -> institution type code.
# From the groupings below: M=museum, A=archive, L=library, E=education,
# O=government/other.  Declaration order matters: infer_type() returns the
# first keyword that matches.
TYPE_KEYWORDS = {
    'museum': 'M',
    'musea': 'M',
    'archief': 'A',
    'archive': 'A',
    'bibliotheek': 'L',
    'library': 'L',
    'universiteit': 'E',
    'university': 'E',
    'hogeschool': 'E',
    'academie': 'E',
    'ministerie': 'O',
    'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
}
|
||||
|
||||
|
||||
def gen_abbrev(name: str) -> str:
    """Build an uppercase initialism from *name*.

    Splits on whitespace and hyphens, skips common Dutch stopwords and
    generic institution words, keeps the first letter of each remaining
    word that starts with a letter, and caps the result at 8 characters.
    Returns 'UNK' when nothing usable remains.
    """
    stopwords = {'de', 'het', 'van', 'voor', 'museum', 'stichting', 'archief', 'bibliotheek', 'en', 'of'}
    initials = []
    for token in re.split(r'[\s\-]+', name):
        if not token or token.lower() in stopwords:
            continue
        if token[0].isalpha():
            initials.append(token[0].upper())
    abbrev = ''.join(initials)[:8]
    return abbrev if abbrev else 'UNK'
|
||||
|
||||
|
||||
def infer_type(name: str) -> str:
    """Map an institution name to a one-letter type code.

    Scans TYPE_KEYWORDS in declaration order and returns the code of the
    first keyword contained in the lowercased name.  Defaults to 'M'
    (Museum) when no keyword matches.
    """
    lowered = name.lower()
    hits = (code for keyword, code in TYPE_KEYWORDS.items() if keyword in lowered)
    return next(hits, 'M')
|
||||
|
||||
|
||||
def get_target_ghcid(name: str) -> Optional[str]:
    """Resolve *name* to its target GHCID, or None when no rule applies.

    Resolution order:
      1. exact-substring hit in KNOWN_ORGANIZATIONS (curated table);
      2. city regex from CITY_PATTERNS combined with an inferred type
         code and a generated abbreviation.
    """
    lowered = name.lower()

    # 1) The curated lookup table wins over the heuristics.
    for fragment, (prov, city, type_code, abbrev) in KNOWN_ORGANIZATIONS.items():
        if fragment in lowered:
            return f'NL-{prov}-{city}-{type_code}-{abbrev}'

    # 2) Heuristic path: derive the remaining parts from the city name.
    for city_re, (prov, city) in CITY_PATTERNS.items():
        if re.search(city_re, lowered):
            return f'NL-{prov}-{city}-{infer_type(name)}-{gen_abbrev(name)}'

    return None
|
||||
|
||||
|
||||
def merge_staff_section(existing: Dict, pending: Dict) -> int:
    """Merge the staff section from *pending* into *existing* in place.

    Staff entries are deduplicated by ``staff_id``: entries whose id is
    already present in *existing* are skipped.  When the pending file
    carries staff provenance, a ``merged_from`` record (source GHCID,
    UTC timestamp, number of staff added) is appended to the target's
    provenance.

    Args:
        existing: Parsed YAML dict of the target custodian file (mutated).
        pending: Parsed YAML dict of the PENDING collision file.

    Returns:
        Number of staff entries actually added to *existing*.
    """
    if 'staff' not in pending:
        return 0

    pending_staff = pending.get('staff', {}).get('staff_list', [])
    if not pending_staff:
        return 0

    # Initialize staff section if needed
    if 'staff' not in existing:
        existing['staff'] = {
            'provenance': pending['staff'].get('provenance', {}),
            'staff_list': []
        }
    # Fix: tolerate targets that have a staff section but are missing one
    # of the sub-keys (the original raised KeyError on 'provenance' or on
    # appending to a missing 'staff_list').
    existing['staff'].setdefault('provenance', {})
    existing['staff'].setdefault('staff_list', [])

    # Collect ids already present so only genuinely new staff are appended.
    existing_ids = set()
    for s in existing['staff']['staff_list']:
        if s.get('staff_id'):
            existing_ids.add(s['staff_id'])

    # Add new staff.  NOTE(review): entries without a staff_id all share
    # the key None, so at most one id-less entry is merged per target —
    # preserved as-is; confirm this is intended.
    added = 0
    for s in pending_staff:
        if s.get('staff_id') not in existing_ids:
            existing['staff']['staff_list'].append(s)
            existing_ids.add(s.get('staff_id'))
            added += 1

    # Update provenance with a record of this merge.
    if 'provenance' in pending['staff']:
        merged_from = existing['staff']['provenance'].setdefault('merged_from', [])
        merged_from.append({
            'source': pending.get('ghcid_current', 'unknown'),
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'staff_added': added
        })

    return added
|
||||
|
||||
|
||||
def load_yaml(filepath: Path) -> Optional[Dict]:
    """Parse *filepath* as YAML; report and return None on any failure."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            parsed = yaml.safe_load(f)
    except Exception as e:
        # Deliberate best-effort: a bad file is reported and skipped so a
        # single corrupt record does not abort the whole merge run.
        print(f"Error loading {filepath}: {e}")
        return None
    return parsed
|
||||
|
||||
|
||||
def save_yaml(filepath: Path, data: Dict):
    """Serialize *data* to *filepath* as human-readable YAML.

    Keeps insertion order (sort_keys=False) and raw unicode so the merged
    custodian files stay diff-friendly.
    """
    dump_options = dict(allow_unicode=True, default_flow_style=False,
                        sort_keys=False, width=120)
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, **dump_options)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: merge PENDING collision files into their targets.

    Scans the custodian directory for NL-XX-XXX-PENDING-*.yaml files,
    resolves each to a target GHCID, and — when the target file exists —
    merges the pending staff into it.  Unless --dry-run is given, merged
    targets are rewritten and the pending originals are moved to the
    archive directory.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    # NOTE(review): machine-specific absolute default path; consider a
    # repo-relative default (e.g. Path('data/custodian')) for portability.
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    archive_dir = custodian_dir / 'archive' / 'pending_collisions_20250109'

    print("=" * 80)
    print("COLLISION FILE MERGER")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    # Find all NL PENDING files and detect collisions: a pending file
    # "collides" when its resolved GHCID already exists as a target file.
    pending_files = list(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    collisions = []

    for f in pending_files:
        data = load_yaml(f)
        if not data:
            continue
        name = data.get('custodian_name', {}).get('emic_name', '')
        if not name:
            continue
        target_ghcid = get_target_ghcid(name)
        if target_ghcid:
            target_path = custodian_dir / f'{target_ghcid}.yaml'
            if target_path.exists():
                collisions.append((f, target_path, name, data))

    print(f"Found {len(collisions)} collision files to merge")
    print()

    total_staff_merged = 0
    files_merged = 0

    for pending_path, target_path, name, pending_data in collisions:
        target_data = load_yaml(target_path)
        if not target_data:
            print(f"[SKIP] Cannot load target: {target_path.name}")
            continue

        # Count staff in pending; nothing to merge without a staff list.
        pending_staff_count = len(pending_data.get('staff', {}).get('staff_list', []))
        if pending_staff_count == 0:
            print(f"[SKIP] No staff in: {name[:50]}")
            continue

        # Merge (mutates target_data in place; returns count added).
        staff_added = merge_staff_section(target_data, pending_data)

        if staff_added > 0:
            print(f"[{'DRY RUN' if args.dry_run else 'MERGE'}] {name[:50]}")
            print(f" Staff: +{staff_added} (from {pending_staff_count} total)")
            print(f" {pending_path.name} -> {target_path.name}")
            print()

            if not args.dry_run:
                # Save updated target
                save_yaml(target_path, target_data)

                # Archive pending file (moved, not deleted, so the merge
                # is reversible)
                archive_dir.mkdir(parents=True, exist_ok=True)
                shutil.move(str(pending_path), str(archive_dir / pending_path.name))

            total_staff_merged += staff_added
            files_merged += 1
        else:
            print(f"[SKIP] All staff already exist: {name[:50]}")

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f" Files merged: {files_merged}")
    print(f" Staff added: {total_staff_merged}")
    if not args.dry_run:
        print(f" Archived to: {archive_dir}")


if __name__ == '__main__':
    main()
|
||||
Loading…
Reference in a new issue