glam/scripts/resolve_pending_by_region.py
2026-01-09 20:35:19 +01:00

220 lines
7.9 KiB
Python

#!/usr/bin/env python3
"""
Resolve NL PENDING files based on regional/provincial names.
Organizations like "Erfgoedhuis Zuid-Holland", "Noord-Hollands Archief", etc.
can be resolved using their regional headquarters.
Usage:
python scripts/resolve_pending_by_region.py --dry-run
python scripts/resolve_pending_by_region.py
"""
import re
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Tuple, Dict
# Directory that holds the custodian YAML files this script reads and renames.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Lower-case name fragment -> (province code, city code, city name) of the
# assumed headquarters.  Fragments are looked up as substrings of the
# lower-cased organisation name and the longest matching fragment wins, so a
# full organisation name takes precedence over a bare province or region name.
# Insertion order matters only as a tie-breaker between equally long fragments.
REGIONAL_ORGS = {
    # Full organisation names
    "archief gooi en vechtstreek": ("NH", "HIL", "Hilversum"),
    "waterlands archief": ("NH", "PUR", "Purmerend"),
    "noord-hollands archief": ("NH", "HAA", "Haarlem"),
    "west-brabants archief": ("NB", "BER", "Bergen op Zoom"),
    "streekarchief midden-holland": ("ZH", "GOU", "Gouda"),
    "erfgoedhuis zuid-holland": ("ZH", "DEL", "Delft"),
    "steunpunt cultureel erfgoed noord-holland": ("NH", "AMS", "Amsterdam"),
    "stichting landschap noord-holland": ("NH", "AMS", "Amsterdam"),
    "collectie overijssel": ("OV", "ZWO", "Zwolle"),
    "huis voor de kunsten limburg": ("LI", "ROE", "Roermond"),
    "landschapsbeheer drenthe": ("DR", "ASS", "Assen"),
    "natuurmuseum brabant": ("NB", "TIL", "Tilburg"),
    "groene hotspot zeeland": ("ZE", "GOE", "Goes"),
    "utrechts landschap": ("UT", "UTR", "Utrecht"),
    "tracé - limburgs samenlevingsarchief": ("LI", "MAA", "Maastricht"),

    # Province names (headquarters assumed to be in the provincial capital)
    "noord-holland": ("NH", "HAA", "Haarlem"),
    "zuid-holland": ("ZH", "DHA", "Den Haag"),
    "noord-brabant": ("NB", "DBO", "'s-Hertogenbosch"),
    "brabant": ("NB", "DBO", "'s-Hertogenbosch"),
    "limburg": ("LI", "MAA", "Maastricht"),
    "zeeland": ("ZE", "MID", "Middelburg"),
    "drenthe": ("DR", "ASS", "Assen"),
    "overijssel": ("OV", "ZWO", "Zwolle"),
    "gelderland": ("GE", "ARN", "Arnhem"),
    "friesland": ("FR", "LEE", "Leeuwarden"),
    "groningen": ("GR", "GRO", "Groningen"),
    "flevoland": ("FL", "LEL", "Lelystad"),

    # Sub-provincial regions
    "gooi": ("NH", "HIL", "Hilversum"),
    "vechtstreek": ("NH", "WEE", "Weesp"),
    "kennemerland": ("NH", "HAA", "Haarlem"),
    "west-friesland": ("NH", "HOO", "Hoorn"),
    "waterland": ("NH", "PUR", "Purmerend"),
    "zaanstreek": ("NH", "ZAA", "Zaandam"),
    "alblasserwaard": ("ZH", "GOR", "Gorinchem"),
    "vijfheerenlanden": ("UT", "VIA", "Vianen"),
    "achterhoek": ("GE", "DOE", "Doetinchem"),
    "veluwe": ("GE", "APE", "Apeldoorn"),
    "rivierenland": ("GE", "TIE", "Tiel"),
    "twente": ("OV", "ENS", "Enschede"),
    "salland": ("OV", "DEV", "Deventer"),
    "de peel": ("NB", "HEL", "Helmond"),
    "maasvallei": ("LI", "VEN", "Venlo"),
    "heuvelland": ("LI", "MAA", "Maastricht"),
    "walcheren": ("ZE", "MID", "Middelburg"),

    # Museums with regional scope
    "philzuid": ("NB", "EIN", "Eindhoven"),  # Philips museum
}
def match_regional_org(emic_name: str) -> Optional[Tuple[str, str, str]]:
    """Look up the regional headquarters implied by an organisation name.

    Every key of ``REGIONAL_ORGS`` is tested as a case-insensitive substring
    of ``emic_name``.  The longest key that occurs in the name wins; between
    keys of equal length the one defined first in the table wins.

    Returns:
        The ``(province code, city code, city name)`` tuple stored for the
        winning key, or ``None`` when no key occurs in the name.
    """
    lowered = emic_name.lower()
    hits = [fragment for fragment in REGIONAL_ORGS if fragment in lowered]
    if not hits:
        return None
    # max() keeps the first of several equally long candidates, i.e. the one
    # that appears earliest in the table.
    return REGIONAL_ORGS[max(hits, key=len)]
def extract_abbreviation(name: str) -> str:
    """Build a short upper-case abbreviation from an organisation name.

    Punctuation is treated as a word separator.  Articles, prepositions,
    conjunctions, the words "museum" and "stichting", and single-character
    words are ignored; if that leaves nothing, the first three raw words are
    used instead.  A single remaining word contributes its first four
    characters, otherwise the initials of up to five words are joined.

    Returns:
        The abbreviation, or ``'XXX'`` when the name yields no characters.
    """
    ignored = frozenset((
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting',
    ))
    tokens = re.sub(r'[^\w\s]', ' ', name).split()
    significant = [t for t in tokens if len(t) > 1 and t.lower() not in ignored]
    # Names made up solely of ignored words fall back to their leading words.
    chosen = significant or tokens[:3]

    if len(chosen) == 1:
        code = chosen[0][:4]
    else:
        code = ''.join(token[0] for token in chosen[:5])
    return code.upper() or 'XXX'
def get_institution_type(data: Dict) -> str:
    """Return the single-letter institution type code for a custodian record.

    Reads ``data['institution_type']`` and maps the type name to its one-letter
    code.  The lookup ignores case and surrounding whitespace.  When the field
    holds a list or tuple, its first entry is used.

    Args:
        data: Parsed custodian record (a mapping).

    Returns:
        The one-letter code, or ``'M'`` when the field is missing, empty,
        not a string, or not a known type name.
    """
    type_map = {
        'MUSEUM': 'M', 'ARCHIVE': 'A', 'LIBRARY': 'L', 'GALLERY': 'G',
        'RESEARCH_CENTER': 'R', 'EDUCATION_PROVIDER': 'E', 'OFFICIAL_INSTITUTION': 'O',
        'COLLECTING_SOCIETY': 'S', 'HOLY_SITES': 'H', 'DIGITAL_PLATFORM': 'D',
        'BOTANICAL_ZOO': 'B', 'CORPORATION': 'C', 'FEATURES': 'F',
        'INTANGIBLE_HERITAGE_GROUP': 'I', 'MIXED': 'X', 'PERSONAL_COLLECTION': 'P',
        'NGO': 'N', 'TASTE_SMELL': 'T', 'UNKNOWN': 'U',
    }
    inst_type = data.get('institution_type', 'MUSEUM')
    # A list is unhashable and would make the dict lookup raise TypeError,
    # so reduce a sequence value to its first entry before looking it up.
    if isinstance(inst_type, (list, tuple)):
        inst_type = inst_type[0] if inst_type else None
    if not isinstance(inst_type, str):
        return 'M'
    return type_map.get(inst_type.strip().upper(), 'M')
def process_pending_file(filepath: Path, dry_run: bool = True) -> Optional[str]:
    """Resolve one PENDING custodian file to a region-based GHCID.

    The file's ``custodian_name.emic_name`` is matched against the regional
    table.  On a match a GHCID of the form
    ``NL-<province>-<city code>-<type>-<abbreviation>`` is built.  If a file
    with that name already exists, a slug of the name is appended.  If that
    name is taken as well, the file is reported as an error and left untouched
    rather than overwriting another record.

    Unless ``dry_run`` is set, the record is updated (``ghcid_current``,
    ``location.city``, ``location.country``, ``ghcid_resolution``), written
    back, and the file is renamed to ``<GHCID>.yaml`` inside ``CUSTODIAN_DIR``.

    Args:
        filepath: Path of the PENDING YAML file.
        dry_run: When true, only print what would happen.

    Returns:
        ``'dry_run'`` in dry-run mode, the new GHCID after a real resolution,
        or ``None`` when the file has no usable name, no region matches, or an
        error occurred (errors are printed, not raised).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        # An empty or non-mapping document cannot be a custodian record.
        if not isinstance(data, dict):
            raise ValueError("file does not contain a YAML mapping")

        name_block = data.get('custodian_name')
        emic_name = name_block.get('emic_name', '') if isinstance(name_block, dict) else ''
        if not emic_name or not isinstance(emic_name, str):
            return None

        # Try to match regional org
        result = match_regional_org(emic_name)
        if not result:
            return None
        prov, city_code, city_name = result

        inst_type = get_institution_type(data)
        abbrev = extract_abbreviation(emic_name)

        # Generate new GHCID
        new_ghcid = f"NL-{prov}-{city_code.upper()}-{inst_type}-{abbrev}"
        new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

        # Check for collision
        if new_path.exists() and new_path != filepath:
            # Add name suffix for collision
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
            # Two records with the same name produce the same slug; moving onto
            # an existing file would silently replace it, so stop here instead.
            if new_path.exists() and new_path != filepath:
                raise FileExistsError(
                    f"target {new_path.name} already exists; refusing to overwrite"
                )

        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f" Region/Province: {prov} ({city_name})")
            print(f" -> {new_ghcid}.yaml")
            return 'dry_run'

        # Update data
        data['ghcid_current'] = new_ghcid
        # Treat an explicit null the same as a missing section.
        if data.get('location') is None:
            data['location'] = {}
        data['location']['city'] = city_name
        data['location']['country'] = 'NL'

        # Add resolution provenance
        if data.get('ghcid_resolution') is None:
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'regional_name_extraction'
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()
        data['ghcid_resolution']['matched_region'] = prov

        # Serialise before opening the file for writing, so that a failure in
        # yaml.dump cannot leave a truncated record on disk.
        serialized = yaml.dump(
            data, default_flow_style=False, allow_unicode=True, sort_keys=False
        )
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)
        shutil.move(filepath, new_path)

        print(f"[RESOLVED] {emic_name}")
        print(f" Region: {prov} ({city_name}) -> {new_ghcid}.yaml")
        return new_ghcid
    except Exception as e:
        # Per-file boundary of a batch run: report and carry on with the rest.
        print(f"[ERROR] {filepath.name}: {e}")
        return None
def main():
    """Command-line entry point: resolve all NL PENDING files in CUSTODIAN_DIR.

    Options:
        --dry-run   Only print what would be resolved; change nothing.
        --limit N   Process at most N files (0 or a negative value = no limit).

    Prints one report per processed file followed by a summary.  Files that
    raised an error are counted together with those that matched no region.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true',
                        help='show what would be resolved without changing files')
    parser.add_argument('--limit', type=int, default=0,
                        help='process at most this many files (0 = all)')
    args = parser.parse_args()

    # Find NL PENDING files only.  Glob order depends on the file system;
    # sorting makes --limit select the same files on every run and keeps
    # collision handling reproducible.
    pending_files = sorted(CUSTODIAN_DIR.glob("NL-XX-XXX-PENDING-*.yaml"))
    print(f"Found {len(pending_files)} NL PENDING files")

    # A negative limit would otherwise slice files off the END of the list.
    if args.limit > 0:
        pending_files = pending_files[:args.limit]

    resolved = 0
    failed = 0
    for filepath in pending_files:
        if process_pending_file(filepath, dry_run=args.dry_run):
            resolved += 1
        else:
            failed += 1

    print(f"\n{'Would resolve' if args.dry_run else 'Resolved'}: {resolved}")
    print(f"No region found: {failed}")


if __name__ == '__main__':
    main()