#!/usr/bin/env python3
"""
Resolve NL PENDING files based on regional/provincial names.

Organizations like "Erfgoedhuis Zuid-Holland", "Noord-Hollands Archief", etc.
can be resolved using their regional headquarters.

Usage:
    python scripts/resolve_pending_by_region.py --dry-run
    python scripts/resolve_pending_by_region.py
    python scripts/resolve_pending_by_region.py --custodian-dir path/to/custodian
"""

import argparse
import re
import shutil
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional, Sequence, Tuple

try:
    import yaml
except ImportError:  # The name-matching helpers below do not need PyYAML.
    yaml = None

CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Regional organizations with their typical headquarters.
# Key: lowercase substring looked for in the organization name.
# Value: (province code, city code, city name).
REGIONAL_ORGS = {
    # Exact matches
    'archief gooi en vechtstreek': ('NH', 'HIL', 'Hilversum'),
    'waterlands archief': ('NH', 'PUR', 'Purmerend'),
    'noord-hollands archief': ('NH', 'HAA', 'Haarlem'),
    'west-brabants archief': ('NB', 'BER', 'Bergen op Zoom'),
    'streekarchief midden-holland': ('ZH', 'GOU', 'Gouda'),
    'erfgoedhuis zuid-holland': ('ZH', 'DEL', 'Delft'),
    'steunpunt cultureel erfgoed noord-holland': ('NH', 'AMS', 'Amsterdam'),
    'stichting landschap noord-holland': ('NH', 'AMS', 'Amsterdam'),
    'collectie overijssel': ('OV', 'ZWO', 'Zwolle'),
    'huis voor de kunsten limburg': ('LI', 'ROE', 'Roermond'),
    'landschapsbeheer drenthe': ('DR', 'ASS', 'Assen'),
    'natuurmuseum brabant': ('NB', 'TIL', 'Tilburg'),
    'groene hotspot zeeland': ('ZE', 'GOE', 'Goes'),
    'utrechts landschap': ('UT', 'UTR', 'Utrecht'),
    'tracé - limburgs samenlevingsarchief': ('LI', 'MAA', 'Maastricht'),

    # Pattern-based regional orgs (headquarters in provincial capitals)
    'noord-holland': ('NH', 'HAA', 'Haarlem'),
    'zuid-holland': ('ZH', 'DHA', 'Den Haag'),
    'noord-brabant': ('NB', 'DBO', "'s-Hertogenbosch"),
    'brabant': ('NB', 'DBO', "'s-Hertogenbosch"),
    'limburg': ('LI', 'MAA', 'Maastricht'),
    'zeeland': ('ZE', 'MID', 'Middelburg'),
    'drenthe': ('DR', 'ASS', 'Assen'),
    'overijssel': ('OV', 'ZWO', 'Zwolle'),
    'gelderland': ('GE', 'ARN', 'Arnhem'),
    'friesland': ('FR', 'LEE', 'Leeuwarden'),
    'groningen': ('GR', 'GRO', 'Groningen'),
    'flevoland': ('FL', 'LEL', 'Lelystad'),

    # Regional areas
    'gooi': ('NH', 'HIL', 'Hilversum'),
    'vechtstreek': ('NH', 'WEE', 'Weesp'),
    'kennemerland': ('NH', 'HAA', 'Haarlem'),
    'west-friesland': ('NH', 'HOO', 'Hoorn'),
    'waterland': ('NH', 'PUR', 'Purmerend'),
    'zaanstreek': ('NH', 'ZAA', 'Zaandam'),
    'alblasserwaard': ('ZH', 'GOR', 'Gorinchem'),
    'vijfheerenlanden': ('UT', 'VIA', 'Vianen'),
    'achterhoek': ('GE', 'DOE', 'Doetinchem'),
    'veluwe': ('GE', 'APE', 'Apeldoorn'),
    'rivierenland': ('GE', 'TIE', 'Tiel'),
    'twente': ('OV', 'ENS', 'Enschede'),
    'salland': ('OV', 'DEV', 'Deventer'),
    'de peel': ('NB', 'HEL', 'Helmond'),
    'maasvallei': ('LI', 'VEN', 'Venlo'),
    'heuvelland': ('LI', 'MAA', 'Maastricht'),
    'walcheren': ('ZE', 'MID', 'Middelburg'),

    # Museums with regional scope
    'philzuid': ('NB', 'EIN', 'Eindhoven'),  # Philips museum
}

# Words ignored when building an abbreviation from an organization name.
_SKIP_WORDS = frozenset({
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
    'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting',
})

# Institution type name -> single-letter code used in the GHCID.
_TYPE_CODES = {
    'MUSEUM': 'M',
    'ARCHIVE': 'A',
    'LIBRARY': 'L',
    'GALLERY': 'G',
    'RESEARCH_CENTER': 'R',
    'EDUCATION_PROVIDER': 'E',
    'OFFICIAL_INSTITUTION': 'O',
    'COLLECTING_SOCIETY': 'S',
    'HOLY_SITES': 'H',
    'DIGITAL_PLATFORM': 'D',
    'BOTANICAL_ZOO': 'B',
    'CORPORATION': 'C',
    'FEATURES': 'F',
    'INTANGIBLE_HERITAGE_GROUP': 'I',
    'MIXED': 'X',
    'PERSONAL_COLLECTION': 'P',
    'NGO': 'N',
    'TASTE_SMELL': 'T',
    'UNKNOWN': 'U',
}

_MISSING_YAML_MESSAGE = "PyYAML is required to read and write custodian files (pip install pyyaml)"


def match_regional_org(emic_name: str) -> Optional[Tuple[str, str, str]]:
    """Match an organization name to its regional headquarters.

    The name is lowercased and every key of ``REGIONAL_ORGS`` is tried as a
    substring, longest key first, so a specific organization entry wins over
    the generic province/region entry it contains.

    Args:
        emic_name: Organization name as written in the record.

    Returns:
        ``(province code, city code, city name)`` of the first matching entry,
        or ``None`` when the name is empty, not a string, or matches nothing.
    """
    if not isinstance(emic_name, str) or not emic_name:
        return None

    name_lower = emic_name.lower()
    # sorted() is stable (also with reverse=True), so equally long patterns
    # keep the order in which they are declared in REGIONAL_ORGS.
    for pattern in sorted(REGIONAL_ORGS, key=len, reverse=True):
        if pattern in name_lower:
            return REGIONAL_ORGS[pattern]
    return None


def extract_abbreviation(name: str) -> str:
    """Extract an uppercase abbreviation from an organization name.

    Punctuation is replaced by spaces, then skip words (articles,
    prepositions, 'museum', 'stichting') and one-character words are dropped.
    If nothing is left, the first three raw words are used instead.

    Args:
        name: Organization name.

    Returns:
        The first four characters of the word when a single word remains,
        otherwise the initials of at most five words; ``'XXX'`` when no
        abbreviation could be built.
    """
    name_clean = re.sub(r'[^\w\s]', ' ', name)
    words = [
        w for w in name_clean.split()
        if w.lower() not in _SKIP_WORDS and len(w) > 1
    ]
    if not words:
        # Everything was filtered out: fall back to the unfiltered words.
        words = name_clean.split()[:3]

    if len(words) == 1:
        abbrev = words[0][:4].upper()
    else:
        abbrev = ''.join(w[0] for w in words[:5]).upper()

    return abbrev if abbrev else 'XXX'


def get_institution_type(data: Dict) -> str:
    """Get the single-letter institution type code for a record.

    Args:
        data: Parsed record; its ``institution_type`` key is looked up.

    Returns:
        The code for the record's type; ``'M'`` when the key is absent or the
        value is not a known type name.
    """
    inst_type = data.get('institution_type', 'MUSEUM')
    return _TYPE_CODES.get(inst_type, 'M')


def resolve_target_path(
    directory: Path,
    ghcid: str,
    emic_name: str,
    current: Optional[Path] = None,
) -> Tuple[str, Path]:
    """Pick a GHCID whose ``<ghcid>.yaml`` file in *directory* is free.

    Candidates are tried in order: ``ghcid``, ``ghcid`` plus a slug derived
    from *emic_name*, then that stem with ``-2``, ``-3``, ... appended. A path
    equal to *current* counts as free (the file would be renamed onto itself).

    Args:
        directory: Directory that holds the custodian files.
        ghcid: Preferred identifier.
        emic_name: Organization name used to build the disambiguating slug.
        current: Path of the file being renamed, if any.

    Returns:
        ``(identifier, path)`` for the first candidate that is not taken.
    """
    def taken(path: Path) -> bool:
        return path.exists() and path != current

    candidate = ghcid
    path = directory / f"{candidate}.yaml"
    if not taken(path):
        return candidate, path

    # Add name suffix for collision
    name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
    stem = f"{ghcid}-{name_slug}" if name_slug else ghcid
    candidate = stem
    path = directory / f"{candidate}.yaml"

    # The slugged name can be taken as well; number the candidates rather
    # than letting the rename overwrite another record.
    counter = 2
    while taken(path):
        candidate = f"{stem}-{counter}"
        path = directory / f"{candidate}.yaml"
        counter += 1
    return candidate, path


def process_pending_file(
    filepath: Path,
    dry_run: bool = True,
    custodian_dir: Optional[Path] = None,
) -> Optional[str]:
    """Process a single PENDING file.

    Reads the record, matches its ``custodian_name.emic_name`` against
    ``REGIONAL_ORGS`` and, on a match, builds the identifier
    ``NL-<province>-<city code>-<type code>-<abbreviation>``. Unless
    *dry_run* is set, the record is updated (``ghcid_current``, ``location``,
    ``ghcid_resolution``), written back and renamed to ``<identifier>.yaml``.

    Args:
        filepath: Path of the PENDING YAML file.
        dry_run: When true, only print what would be done.
        custodian_dir: Directory for the renamed file; defaults to
            ``CUSTODIAN_DIR``.

    Returns:
        ``'dry_run'`` for a dry-run match, the new identifier after a real
        resolution, or ``None`` when the record has no name, matches no
        region, or processing failed (the failure is printed).

    Raises:
        RuntimeError: If PyYAML is not installed.
    """
    if yaml is None:
        raise RuntimeError(_MISSING_YAML_MESSAGE)

    filepath = Path(filepath)
    target_dir = CUSTODIAN_DIR if custodian_dir is None else Path(custodian_dir)

    # Broad catch on purpose: one unreadable record must not abort the batch.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not isinstance(data, dict):
            # Empty file or a non-mapping document.
            print(f"[ERROR] {filepath.name}: expected a YAML mapping")
            return None

        name_block = data.get('custodian_name')
        emic_name = name_block.get('emic_name') if isinstance(name_block, dict) else None
        if not isinstance(emic_name, str) or not emic_name:
            return None

        # Try to match regional org
        result = match_regional_org(emic_name)
        if not result:
            return None

        prov, city_code, city_name = result
        inst_type = get_institution_type(data)
        abbrev = extract_abbreviation(emic_name)

        # Generate new GHCID, disambiguated if the file name is already taken
        new_ghcid, new_path = resolve_target_path(
            target_dir,
            f"NL-{prov}-{city_code.upper()}-{inst_type}-{abbrev}",
            emic_name,
            current=filepath,
        )

        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f"  Region/Province: {prov} ({city_name})")
            print(f"  -> {new_ghcid}.yaml")
            return 'dry_run'

        # Update data
        data['ghcid_current'] = new_ghcid

        # The keys may be present with a null value, hence the type checks.
        location = data.get('location')
        if not isinstance(location, dict):
            location = {}
            data['location'] = location
        location['city'] = city_name
        location['country'] = 'NL'

        # Add resolution provenance
        resolution = data.get('ghcid_resolution')
        if not isinstance(resolution, dict):
            resolution = {}
            data['ghcid_resolution'] = resolution
        resolution['method'] = 'regional_name_extraction'
        resolution['resolved_at'] = datetime.now(timezone.utc).isoformat()
        resolution['matched_region'] = prov

        # Serialize before touching the file so a dump failure cannot leave
        # a truncated record behind.
        text = yaml.dump(
            data,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )

        # Write and rename
        filepath.write_text(text, encoding='utf-8')
        if new_path != filepath:
            shutil.move(str(filepath), str(new_path))

        print(f"[RESOLVED] {emic_name}")
        print(f"  Region: {prov} ({city_name}) -> {new_ghcid}.yaml")

        return new_ghcid

    except Exception as e:
        print(f"[ERROR] {filepath.name}: {e}")
        return None


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Command-line entry point: resolve every NL PENDING file in a directory.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Resolve NL PENDING files based on regional/provincial names.",
    )
    parser.add_argument('--dry-run', action='store_true',
                        help="print what would be resolved without changing files")
    parser.add_argument('--limit', type=int, default=0,
                        help="process at most this many files (0 = no limit)")
    parser.add_argument('--custodian-dir', type=Path, default=CUSTODIAN_DIR,
                        help="directory containing the custodian YAML files")
    args = parser.parse_args(argv)

    if yaml is None:
        parser.error(_MISSING_YAML_MESSAGE)

    # Find NL PENDING files only; sorted so --limit selects a stable subset.
    pending_files = sorted(args.custodian_dir.glob("NL-XX-XXX-PENDING-*.yaml"))
    print(f"Found {len(pending_files)} NL PENDING files")

    if args.limit > 0:
        pending_files = pending_files[:args.limit]

    resolved = 0
    unresolved = 0

    for filepath in pending_files:
        result = process_pending_file(
            filepath,
            dry_run=args.dry_run,
            custodian_dir=args.custodian_dir,
        )
        if result:
            resolved += 1
        else:
            unresolved += 1

    print(f"\n{'Would resolve' if args.dry_run else 'Resolved'}: {resolved}")
    # Covers records without a name, without a region match, and failures.
    print(f"Not resolved (no region match or error): {unresolved}")


if __name__ == '__main__':
    main()