#!/usr/bin/env python3
"""
Resolve PENDING files using a comprehensive known organizations database.

This script contains manually curated locations for Dutch heritage
organizations that couldn't be resolved automatically.

Usage:
    python scripts/resolve_pending_known_orgs.py --dry-run
    python scripts/resolve_pending_known_orgs.py
    python scripts/resolve_pending_known_orgs.py --custodian-dir path/to/custodian
"""

import re
import shutil
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional, Tuple

try:
    import yaml
except ImportError:
    # PyYAML is only needed to read/write custodian files; the pure matching
    # helpers below stay importable without it.
    yaml = None

CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# (region_code, city_code, city_name, inst_type)
Location = Tuple[str, str, str, str]

# Codes in the first tuple slot that denote a foreign country, not a province.
FOREIGN_COUNTRY_CODES = frozenset({'FR', 'DK', 'IT', 'BE', 'DE', 'GB', 'US'})

# 'FR' is ambiguous: it is the code used below for Friesland *and* for France.
# An 'FR' entry is treated as French only when its city code is listed here;
# add the city code of every new French entry to this set.
FRENCH_CITY_CODES = frozenset({'CAE'})

# Words ignored when building an abbreviation from an organization name.
ABBREVIATION_SKIP_WORDS = frozenset({
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
    'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting',
})

# Known organizations with their locations
# Format: 'normalized_name': (province, city_code, city_name, inst_type)
# Keys must be in the form produced by normalize_name().
KNOWN_ORGS: Dict[str, Location] = {
    # Museums
    'amsterdamse school museum het schip': ('NH', 'AMS', 'Amsterdam', 'M'),
    'bonnefanten': ('LI', 'MAA', 'Maastricht', 'M'),
    'bonami spelcomputer museum': ('OV', 'ZWO', 'Zwolle', 'M'),
    'bakkerijmuseum de oude bakkerij': ('NH', 'MED', 'Medemblik', 'M'),
    'chabot museum': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'coda museum': ('GE', 'APE', 'Apeldoorn', 'M'),
    'comm museum voor communicatie': ('ZH', 'DHA', 'Den Haag', 'M'),
    'cruquius museum': ('NH', 'HAA', 'Haarlemmermeer', 'M'),
    'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'),  # Belgium
    'dordrechts museum': ('ZH', 'DOR', 'Dordrecht', 'M'),
    'dutch museum of freemasonry': ('ZH', 'DHA', 'Den Haag', 'M'),
    'eise eisinga planetarium': ('FR', 'FRA', 'Franeker', 'M'),
    'elisabeth weeshuis museum': ('GE', 'CUL', 'Culemborg', 'M'),  # was 'UT'; Culemborg is in Gelderland
    'design museum huis dedel': ('ZH', 'DHA', 'Den Haag', 'M'),
    'fries landbouw museum': ('FR', 'LEE', 'Leeuwarden', 'M'),
    'fries scheepvaart museum': ('FR', 'SNE', 'Sneek', 'M'),
    'gelderse archeologie': ('GE', 'ARN', 'Arnhem', 'R'),
    'gelders archief': ('GE', 'ARN', 'Arnhem', 'A'),
    'gorcums museum': ('ZH', 'GOR', 'Gorinchem', 'M'),
    'hart museum': ('NH', 'AMS', 'Amsterdam', 'M'),
    'h art museum': ('NH', 'AMS', 'Amsterdam', 'M'),
    'het drentse landschap': ('DR', 'ASS', 'Assen', 'N'),
    'het museum voor onbedoelde kunst': ('NH', 'AMS', 'Amsterdam', 'M'),
    'het schip': ('NH', 'AMS', 'Amsterdam', 'M'),
    'huygens instituut': ('NH', 'AMS', 'Amsterdam', 'R'),
    'katwijks museum': ('ZH', 'KAT', 'Katwijk', 'M'),
    'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'),
    'kunsthal': ('ZH', 'ROT', 'Rotterdam', 'G'),
    'literatuurmuseum': ('ZH', 'DHA', 'Den Haag', 'M'),
    'museum aan de ijssel': ('GE', 'DOE', 'Doesburg', 'M'),
    'museum de buitenplaats': ('DR', 'EEL', 'Eelde', 'M'),
    'museum de casteelse poort': ('GE', 'WAG', 'Wageningen', 'M'),
    'museum de koperen knop': ('ZH', 'HAR', 'Hardinxveld', 'M'),  # was 'ZE'; Hardinxveld is in Zuid-Holland
    'museum de lakenhal': ('ZH', 'LEI', 'Leiden', 'M'),
    'museum geert groote huis': ('OV', 'DEV', 'Deventer', 'M'),
    'museum het oude raadhuis': ('FL', 'URK', 'Urk', 'M'),  # was 'UT'; Urk is in Flevoland
    'museum het valkhof': ('GE', 'NIJ', 'Nijmegen', 'M'),
    'museum hoeksche waard': ('ZH', 'OIB', 'Oud-Beijerland', 'M'),
    'museum huys der historie': ('NB', 'HEL', 'Helmond', 'M'),
    'museum ijsselstein': ('UT', 'IJS', 'IJsselstein', 'M'),
    'museum kaap skil': ('NH', 'TEX', 'Texel', 'M'),
    'museum kasteel wijchen': ('GE', 'WIJ', 'Wijchen', 'M'),
    'museum maelwael van lymborch': ('GE', 'NIJ', 'Nijmegen', 'M'),
    'museum ons lieve heer op solder': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum plus bus': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum romeinse katakomben': ('LI', 'VAL', 'Valkenburg', 'M'),
    'museum stedhus': ('FR', 'WOR', 'Workum', 'M'),
    'museum t oude slot': ('GE', 'VEL', 'Velp', 'M'),
    'museum tot zover': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum valse kunst': ('GE', 'VIE', 'Vierhouten', 'M'),
    'museum van de twintigste eeuw': ('NH', 'HOO', 'Hoorn', 'M'),
    'museum van lien': ('GE', 'WAG', 'Wageningen', 'M'),
    'museum vd 20e eeuw': ('NH', 'HOO', 'Hoorn', 'M'),
    'museum voormeer': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum zaanse tijd': ('NH', 'ZAA', 'Zaandam', 'M'),
    'museumboerderij west frisia': ('NH', 'HOO', 'Hoogkarspel', 'M'),
    'museumpark': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'nationaal militair museum': ('UT', 'SOE', 'Soesterberg', 'M'),
    'nationaal monument oranjehotel': ('ZH', 'DHA', 'Den Haag', 'M'),
    'nationaal muziekinstrumenten fonds': ('NH', 'AMS', 'Amsterdam', 'M'),
    'nationaal orgelmuseum': ('GE', 'ELB', 'Elburg', 'M'),
    'nationaal tinnen figuren museum': ('OV', 'OMM', 'Ommen', 'M'),  # was 'GE'; Ommen is in Overijssel
    'nationaal vlechtmuseum': ('FR', 'NOR', 'Noordwolde', 'M'),  # was 'DR'; Noordwolde is in Friesland
    'nederlands dans theater': ('ZH', 'DHA', 'Den Haag', 'E'),
    'nederlands fotomuseum': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'nederlands instituut voor beeld en geluid': ('NH', 'HIL', 'Hilversum', 'A'),
    'nederlands mijnmuseum': ('LI', 'HEE', 'Heerlen', 'M'),
    'nederlands transport museum': ('NH', 'NIE', 'Nieuw-Vennep', 'M'),  # was 'ZH'; Nieuw-Vennep is in Noord-Holland
    'nieuwe kerk amsterdam': ('NH', 'AMS', 'Amsterdam', 'H'),
    'nieuwe kerk delft': ('ZH', 'DEL', 'Delft', 'H'),
    'nijntje museum': ('UT', 'UTR', 'Utrecht', 'M'),
    'nh museum': ('NH', 'HAA', 'Haarlem', 'M'),
    'oorlogsmuseum overloon': ('NB', 'OVL', 'Overloon', 'M'),
    'openluchtmuseum het hoogeland': ('GR', 'WAR', 'Warffum', 'M'),
    'paleis het loo': ('GE', 'APE', 'Apeldoorn', 'M'),
    'purmerends museum': ('NH', 'PUR', 'Purmerend', 'M'),
    'rijksmuseum boerhaave': ('ZH', 'LEI', 'Leiden', 'M'),
    'rijksmuseum twenthe': ('OV', 'ENS', 'Enschede', 'M'),
    'singer laren': ('NH', 'LAR', 'Laren', 'M'),
    'sonnenborgh museum': ('UT', 'UTR', 'Utrecht', 'M'),
    'zeeuws museum': ('ZE', 'MID', 'Middelburg', 'M'),

    # Libraries
    'de bblthk': ('GE', 'WAG', 'Wageningen', 'L'),
    'kb nationale bibliotheek': ('ZH', 'DHA', 'Den Haag', 'L'),

    # Archives
    'digitar het online archief': ('UT', 'UTR', 'Utrecht', 'D'),

    # Organizations (stichtingen, etc.)
    '3 october vereeniging': ('ZH', 'LEI', 'Leiden', 'S'),
    'abdij o l v koningshoeven': ('NB', 'TIL', 'Tilburg', 'H'),
    'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'),
    'bijenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
    'bomenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
    'boerennatuur': ('UT', 'UTR', 'Utrecht', 'N'),
    'cbg': ('ZH', 'DHA', 'Den Haag', 'R'),  # Central Bureau for Genealogy
    'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'),
    'de commandostichting': ('NH', 'HAA', 'Haarlem', 'N'),
    'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'),
    'delamar': ('NH', 'AMS', 'Amsterdam', 'E'),
    'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'),
    'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'),
    'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'),
    'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
    'fonds 21': ('UT', 'UTR', 'Utrecht', 'N'),
    'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'),
    'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'),
    'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'),
    'kenniscentrum immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'),
    'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'),
    'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'),

    # Research centers
    'adc archeoprojecten': ('UT', 'AME', 'Amersfoort', 'R'),  # was 'GE'; Amersfoort is in Utrecht
    'archol': ('ZH', 'LEI', 'Leiden', 'R'),
    'kitlv': ('ZH', 'LEI', 'Leiden', 'R'),

    # Theaters/Venues
    'theater de veste': ('ZH', 'DEL', 'Delft', 'E'),
    'theater a d schie': ('ZH', 'SCH', 'Schiedam', 'E'),

    # Foreign organizations that should be reclassified
    'caen memorial': ('FR', 'CAE', 'Caen', 'M'),  # France (see FRENCH_CITY_CODES)
    'den gamle by': ('DK', 'AAR', 'Aarhus', 'M'),  # Denmark
    'den kongelige samling': ('DK', 'CPH', 'Copenhagen', 'M'),  # Denmark
    'castello di rivoli': ('IT', 'TOR', 'Torino', 'M'),  # Italy
    'consorzio delle residenze reali sabaude': ('IT', 'TOR', 'Torino', 'M'),  # Italy
}


def normalize_name(name: str) -> str:
    """Normalize an organization name for matching against KNOWN_ORGS.

    The name is NFKD-decomposed, stripped of combining marks (so that
    "Kröller-Müller" becomes "kroller muller"), lower-cased, has punctuation
    and underscores replaced by spaces, and has whitespace runs collapsed.

    Args:
        name: Raw organization name.

    Returns:
        The normalized name; may be empty if *name* holds no word characters.
    """
    decomposed = unicodedata.normalize('NFKD', name)
    # Drop the combining marks NFKD splits off; otherwise the punctuation
    # substitution below would turn each accent into a space mid-word.
    base_chars = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    spaced = re.sub(r'[^\w\s]|_', ' ', base_chars.lower())
    return ' '.join(spaced.split())


def extract_abbreviation(name: str) -> str:
    """Build an upper-case abbreviation from an organization name.

    Words in ABBREVIATION_SKIP_WORDS and single-character words are ignored.
    A single remaining word contributes its first four characters; otherwise
    the initials of up to five words are used. If every word is filtered out,
    the first three raw words are used instead.

    Args:
        name: Raw organization name.

    Returns:
        The abbreviation, or 'XXX' when nothing usable is left.
    """
    name_clean = re.sub(r'[^\w\s]', ' ', name)
    words = [
        w for w in name_clean.split()
        if w.lower() not in ABBREVIATION_SKIP_WORDS and len(w) > 1
    ]
    if not words:
        words = name_clean.split()[:3]

    if len(words) == 1:
        abbrev = words[0][:4].upper()
    else:
        abbrev = ''.join(w[0] for w in words[:5]).upper()
    return abbrev if abbrev else 'XXX'


def match_known_org(emic_name: str) -> Optional[Tuple[str, str, str, str]]:
    """Look up an organization in KNOWN_ORGS.

    Matching is attempted in this order:

    1. Exact match on the normalized name.
    2. A known name occurring as whole words inside the normalized name;
       the longest known name wins.
    3. The normalized name occurring as whole words inside a known name,
       accepted only when every such known name maps to the same location
       (a generic name such as "Museum" is therefore rejected).

    Args:
        emic_name: Raw organization name.

    Returns:
        The (province, city_code, city_name, inst_type) tuple, or None when
        the name is empty after normalization or no unambiguous match exists.
    """
    name_norm = normalize_name(emic_name)
    if not name_norm:
        # An empty string is a substring of everything; never match on it.
        return None

    exact = KNOWN_ORGS.get(name_norm)
    if exact is not None:
        return exact

    # Padding both sides with spaces turns substring tests into whole-word
    # tests, so 'kunsthal' no longer matches inside 'kunsthalle'.
    padded_name = f" {name_norm} "

    for known_name, location in sorted(KNOWN_ORGS.items(), key=lambda item: -len(item[0])):
        if f" {known_name} " in padded_name:
            return location

    shortened_form_hits = {
        location
        for known_name, location in KNOWN_ORGS.items()
        if padded_name in f" {known_name} "
    }
    if len(shortened_form_hits) == 1:
        return shortened_form_hits.pop()
    return None


def split_country_and_region(province: str, city_code: str) -> Tuple[str, str]:
    """Turn the first two KNOWN_ORGS tuple slots into (country, province).

    Foreign entries store their country code in the province slot. Such
    entries yield ``(country_code, 'XX')``; everything else yields
    ``('NL', province)``. The ambiguous code 'FR' is treated as France only
    for city codes listed in FRENCH_CITY_CODES, and as Friesland otherwise.

    Args:
        province: First slot of a KNOWN_ORGS value.
        city_code: Second slot of a KNOWN_ORGS value.

    Returns:
        A ``(country, province)`` pair for use in a GHCID.
    """
    if province == 'FR' and city_code.upper() not in FRENCH_CITY_CODES:
        return 'NL', province
    if province in FOREIGN_COUNTRY_CODES:
        return province, 'XX'
    return 'NL', province


def pick_available_ghcid(
    base_ghcid: str,
    emic_name: str,
    custodian_dir: Path,
    current_path: Optional[Path] = None,
) -> str:
    """Return a GHCID whose ``<ghcid>.yaml`` file does not clash in *custodian_dir*.

    Tries *base_ghcid*, then *base_ghcid* plus a slug of the name, then that
    with an increasing numeric suffix, so an existing file is never chosen
    (and therefore never overwritten by the caller).

    Args:
        base_ghcid: Preferred identifier.
        emic_name: Organization name used to build the disambiguating slug.
        custodian_dir: Directory holding the ``.yaml`` files.
        current_path: Path of the file being renamed; a candidate equal to it
            is considered free.

    Returns:
        An identifier that is free at the time of the call.
    """
    def is_free(ghcid: str) -> bool:
        candidate = custodian_dir / f"{ghcid}.yaml"
        return candidate == current_path or not candidate.exists()

    if is_free(base_ghcid):
        return base_ghcid

    name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30].rstrip('-')
    slugged = f"{base_ghcid}-{name_slug}" if name_slug else base_ghcid
    if is_free(slugged):
        return slugged

    counter = 2
    while not is_free(f"{slugged}-{counter}"):
        counter += 1
    return f"{slugged}-{counter}"


def process_pending_file(
    filepath: Path,
    dry_run: bool = True,
    custodian_dir: Optional[Path] = None,
) -> Optional[str]:
    """Process a single PENDING file.

    Reads ``custodian_name.emic_name`` from the YAML file, looks it up with
    match_known_org(), and derives a GHCID of the form
    ``{country}-{province}-{CITY}-{type}-{abbrev}``. Unless *dry_run* is set,
    the file is updated (``ghcid_current``, ``location.city``,
    ``location.country``, ``ghcid_resolution.method`` and ``.resolved_at``)
    and moved to ``<ghcid>.yaml`` in the target directory.

    Args:
        filepath: Path of the PENDING YAML file.
        dry_run: When true, only print what would happen.
        custodian_dir: Directory for the renamed file; defaults to
            CUSTODIAN_DIR.

    Returns:
        The new GHCID, ``'dry_run'`` in dry-run mode, or None when the file
        has no usable name, no match was found, or an error occurred (errors
        are printed, not raised).

    Raises:
        RuntimeError: If PyYAML is not installed.
    """
    if yaml is None:
        raise RuntimeError("PyYAML is required to process custodian files")
    target_dir = CUSTODIAN_DIR if custodian_dir is None else custodian_dir

    # Deliberately broad: one malformed file must not abort the whole batch.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # Empty files load as None; other non-mapping documents are unusable.
        if not isinstance(data, dict):
            return None
        name_section = data.get('custodian_name')
        emic_name = name_section.get('emic_name') if isinstance(name_section, dict) else None
        if not emic_name or not isinstance(emic_name, str):
            return None

        result = match_known_org(emic_name)
        if not result:
            return None

        province, city_code, city_name, inst_type = result
        abbrev = extract_abbreviation(emic_name)

        # Handle non-Dutch organizations
        country, province = split_country_and_region(province, city_code)

        base_ghcid = f"{country}-{province}-{city_code.upper()}-{inst_type}-{abbrev}"
        # Handle collision without ever picking an existing file.
        new_ghcid = pick_available_ghcid(base_ghcid, emic_name, target_dir, filepath)
        new_path = target_dir / f"{new_ghcid}.yaml"

        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f" Location: {city_name} ({country if country != 'NL' else province})")
            print(f" -> {new_ghcid}.yaml")
            return 'dry_run'

        # Update data
        data['ghcid_current'] = new_ghcid
        if data.get('location') is None:
            data['location'] = {}
        data['location']['city'] = city_name
        data['location']['country'] = country

        if data.get('ghcid_resolution') is None:
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'known_organization_database'
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        if new_path != filepath:
            shutil.move(str(filepath), str(new_path))
        print(f"[RESOLVED] {emic_name}")
        print(f" -> {new_ghcid}.yaml")
        return new_ghcid

    except Exception as e:
        print(f"[ERROR] {filepath.name}: {e}")
        return None


def main():
    """Command-line entry point: resolve every ``*PENDING*.yaml`` file."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=CUSTODIAN_DIR,
        help='directory containing the custodian YAML files (default: %(default)s)',
    )
    args = parser.parse_args()

    if yaml is None:
        parser.error("PyYAML is required (pip install pyyaml)")
    custodian_dir = args.custodian_dir
    if not custodian_dir.is_dir():
        parser.error(f"custodian directory not found: {custodian_dir}")

    # Process all PENDING files (not just NL). The glob is materialized up
    # front because files are renamed while iterating; sorting keeps runs
    # reproducible.
    pending_files = sorted(custodian_dir.glob("*PENDING*.yaml"))
    print(f"Processing {len(pending_files)} PENDING files against {len(KNOWN_ORGS)} known organizations...")
    print()

    resolved = 0
    not_found = 0
    for filepath in pending_files:
        result = process_pending_file(filepath, dry_run=args.dry_run, custodian_dir=custodian_dir)
        if result:
            resolved += 1
        else:
            not_found += 1

    print()
    print(f"{'Would resolve' if args.dry_run else 'Resolved'}: {resolved}")
    print(f"Not in database: {not_found}")


if __name__ == '__main__':
    main()