glam/scripts/resolve_pending_known_orgs.py
2026-01-09 20:35:19 +01:00

292 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Resolve PENDING files using a comprehensive known organizations database.
This script contains manually curated locations for Dutch heritage organizations
that couldn't be resolved automatically.
Usage:
python scripts/resolve_pending_known_orgs.py --dry-run
python scripts/resolve_pending_known_orgs.py
"""
import re
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple
# Directory that holds the custodian YAML files (including the *PENDING* ones).
# NOTE(review): absolute path on the author's machine; adjust when running elsewhere.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Known organizations with their locations.
# Format: 'normalized_name': (province, city_code, city_name, inst_type)
#   - normalized_name: lower-case, punctuation-free, ASCII name used as lookup key
#   - province: two-letter Dutch province code; foreign organizations carry their
#     two-letter country code in this slot instead
#   - city_code: three-letter city abbreviation
#   - city_name: human-readable city name
#   - inst_type: single-letter institution type code (e.g. 'M' on museum entries)
# NOTE: 'FR' is used both for the Dutch province Friesland and, in the foreign
# section below, for France. Consumers of this table must disambiguate the two.
KNOWN_ORGS: Dict[str, Tuple[str, str, str, str]] = {
    # Museums
    'amsterdamse school museum het schip': ('NH', 'AMS', 'Amsterdam', 'M'),
    'bonnefanten': ('LI', 'MAA', 'Maastricht', 'M'),
    'bonami spelcomputer museum': ('OV', 'ZWO', 'Zwolle', 'M'),
    'bakkerijmuseum de oude bakkerij': ('NH', 'MED', 'Medemblik', 'M'),
    'chabot museum': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'coda museum': ('GE', 'APE', 'Apeldoorn', 'M'),
    'comm museum voor communicatie': ('ZH', 'DHA', 'Den Haag', 'M'),
    'cruquius museum': ('NH', 'HAA', 'Haarlemmermeer', 'M'),
    'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'),  # Belgium
    'dordrechts museum': ('ZH', 'DOR', 'Dordrecht', 'M'),
    'dutch museum of freemasonry': ('ZH', 'DHA', 'Den Haag', 'M'),
    'eise eisinga planetarium': ('FR', 'FRA', 'Franeker', 'M'),
    'elisabeth weeshuis museum': ('GE', 'CUL', 'Culemborg', 'M'),  # Culemborg is in Gelderland
    'design museum huis dedel': ('ZH', 'DHA', 'Den Haag', 'M'),
    'fries landbouw museum': ('FR', 'LEE', 'Leeuwarden', 'M'),
    'fries scheepvaart museum': ('FR', 'SNE', 'Sneek', 'M'),
    'gelderse archeologie': ('GE', 'ARN', 'Arnhem', 'R'),
    'gelders archief': ('GE', 'ARN', 'Arnhem', 'A'),
    'gorcums museum': ('ZH', 'GOR', 'Gorinchem', 'M'),
    'hart museum': ('NH', 'AMS', 'Amsterdam', 'M'),
    'h art museum': ('NH', 'AMS', 'Amsterdam', 'M'),
    'het drentse landschap': ('DR', 'ASS', 'Assen', 'N'),
    'het museum voor onbedoelde kunst': ('NH', 'AMS', 'Amsterdam', 'M'),
    'het schip': ('NH', 'AMS', 'Amsterdam', 'M'),
    'huygens instituut': ('NH', 'AMS', 'Amsterdam', 'R'),
    'katwijks museum': ('ZH', 'KAT', 'Katwijk', 'M'),
    'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'),
    'kunsthal': ('ZH', 'ROT', 'Rotterdam', 'G'),
    'literatuurmuseum': ('ZH', 'DHA', 'Den Haag', 'M'),
    'museum aan de ijssel': ('GE', 'DOE', 'Doesburg', 'M'),
    'museum de buitenplaats': ('DR', 'EEL', 'Eelde', 'M'),
    'museum de casteelse poort': ('GE', 'WAG', 'Wageningen', 'M'),
    'museum de koperen knop': ('ZH', 'HAR', 'Hardinxveld', 'M'),  # Hardinxveld is in Zuid-Holland
    'museum de lakenhal': ('ZH', 'LEI', 'Leiden', 'M'),
    'museum geert groote huis': ('OV', 'DEV', 'Deventer', 'M'),
    'museum het oude raadhuis': ('FL', 'URK', 'Urk', 'M'),  # Urk is in Flevoland
    'museum het valkhof': ('GE', 'NIJ', 'Nijmegen', 'M'),
    'museum hoeksche waard': ('ZH', 'OIB', 'Oud-Beijerland', 'M'),
    'museum huys der historie': ('NB', 'HEL', 'Helmond', 'M'),
    'museum ijsselstein': ('UT', 'IJS', 'IJsselstein', 'M'),
    'museum kaap skil': ('NH', 'TEX', 'Texel', 'M'),
    'museum kasteel wijchen': ('GE', 'WIJ', 'Wijchen', 'M'),
    'museum maelwael van lymborch': ('GE', 'NIJ', 'Nijmegen', 'M'),
    'museum ons lieve heer op solder': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum plus bus': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum romeinse katakomben': ('LI', 'VAL', 'Valkenburg', 'M'),
    'museum stedhus': ('FR', 'WOR', 'Workum', 'M'),
    # NOTE(review): this museum is believed to be in Veldhoven (NB), not Velp - verify.
    'museum t oude slot': ('GE', 'VEL', 'Velp', 'M'),
    'museum tot zover': ('NH', 'AMS', 'Amsterdam', 'M'),
    # NOTE(review): this museum is believed to be in Vledder (DR), not Vierhouten - verify.
    'museum valse kunst': ('GE', 'VIE', 'Vierhouten', 'M'),
    'museum van de twintigste eeuw': ('NH', 'HOO', 'Hoorn', 'M'),
    'museum van lien': ('GE', 'WAG', 'Wageningen', 'M'),
    'museum vd 20e eeuw': ('NH', 'HOO', 'Hoorn', 'M'),
    'museum voormeer': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum zaanse tijd': ('NH', 'ZAA', 'Zaandam', 'M'),
    'museumboerderij west frisia': ('NH', 'HOO', 'Hoogkarspel', 'M'),
    'museumpark': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'nationaal militair museum': ('UT', 'SOE', 'Soesterberg', 'M'),
    'nationaal monument oranjehotel': ('ZH', 'DHA', 'Den Haag', 'M'),
    'nationaal muziekinstrumenten fonds': ('NH', 'AMS', 'Amsterdam', 'M'),
    'nationaal orgelmuseum': ('GE', 'ELB', 'Elburg', 'M'),
    'nationaal tinnen figuren museum': ('OV', 'OMM', 'Ommen', 'M'),  # Ommen is in Overijssel
    'nationaal vlechtmuseum': ('FR', 'NOR', 'Noordwolde', 'M'),  # Noordwolde is in Friesland
    'nederlands dans theater': ('ZH', 'DHA', 'Den Haag', 'E'),
    'nederlands fotomuseum': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'nederlands instituut voor beeld en geluid': ('NH', 'HIL', 'Hilversum', 'A'),
    'nederlands mijnmuseum': ('LI', 'HEE', 'Heerlen', 'M'),
    'nederlands transport museum': ('NH', 'NIE', 'Nieuw-Vennep', 'M'),  # Nieuw-Vennep is in Noord-Holland
    'nieuwe kerk amsterdam': ('NH', 'AMS', 'Amsterdam', 'H'),
    'nieuwe kerk delft': ('ZH', 'DEL', 'Delft', 'H'),
    'nijntje museum': ('UT', 'UTR', 'Utrecht', 'M'),
    'nh museum': ('NH', 'HAA', 'Haarlem', 'M'),
    'oorlogsmuseum overloon': ('NB', 'OVL', 'Overloon', 'M'),
    'openluchtmuseum het hoogeland': ('GR', 'WAR', 'Warffum', 'M'),
    'paleis het loo': ('GE', 'APE', 'Apeldoorn', 'M'),
    'purmerends museum': ('NH', 'PUR', 'Purmerend', 'M'),
    'rijksmuseum boerhaave': ('ZH', 'LEI', 'Leiden', 'M'),
    'rijksmuseum twenthe': ('OV', 'ENS', 'Enschede', 'M'),
    'singer laren': ('NH', 'LAR', 'Laren', 'M'),
    'sonnenborgh museum': ('UT', 'UTR', 'Utrecht', 'M'),
    'zeeuws museum': ('ZE', 'MID', 'Middelburg', 'M'),
    # Libraries
    'de bblthk': ('GE', 'WAG', 'Wageningen', 'L'),
    'kb nationale bibliotheek': ('ZH', 'DHA', 'Den Haag', 'L'),
    # Archives
    'digitar het online archief': ('UT', 'UTR', 'Utrecht', 'D'),
    # Organizations (stichtingen, etc.)
    '3 october vereeniging': ('ZH', 'LEI', 'Leiden', 'S'),
    'abdij o l v koningshoeven': ('NB', 'TIL', 'Tilburg', 'H'),
    'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'),
    'bijenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
    'bomenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
    'boerennatuur': ('UT', 'UTR', 'Utrecht', 'N'),
    'cbg': ('ZH', 'DHA', 'Den Haag', 'R'),  # Central Bureau for Genealogy
    'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'),
    'de commandostichting': ('NH', 'HAA', 'Haarlem', 'N'),
    'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'),
    'delamar': ('NH', 'AMS', 'Amsterdam', 'E'),
    'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'),
    'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'),
    'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'),
    'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
    'fonds 21': ('UT', 'UTR', 'Utrecht', 'N'),
    'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'),
    'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'),
    'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'),
    'kenniscentrum immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'),
    'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'),
    'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'),
    # Research centers
    'adc archeoprojecten': ('UT', 'AME', 'Amersfoort', 'R'),  # Amersfoort is in Utrecht province
    'archol': ('ZH', 'LEI', 'Leiden', 'R'),
    'kitlv': ('ZH', 'LEI', 'Leiden', 'R'),
    # Theaters/Venues
    'theater de veste': ('ZH', 'DEL', 'Delft', 'E'),
    'theater a d schie': ('ZH', 'SCH', 'Schiedam', 'E'),
    # Foreign organizations that should be reclassified
    # (the first tuple element is a country code here, not a Dutch province)
    'caen memorial': ('FR', 'CAE', 'Caen', 'M'),  # France
    'den gamle by': ('DK', 'AAR', 'Aarhus', 'M'),  # Denmark
    'den kongelige samling': ('DK', 'CPH', 'Copenhagen', 'M'),  # Denmark
    'castello di rivoli': ('IT', 'TOR', 'Torino', 'M'),  # Italy
    'consorzio delle residenze reali sabaude': ('IT', 'TOR', 'Torino', 'M'),  # Italy
}
def normalize_name(name: str) -> str:
    """Normalize an organization name for matching against KNOWN_ORGS keys.

    The name is Unicode-decomposed, stripped of diacritics, lower-cased,
    has punctuation replaced by spaces, and has whitespace runs collapsed.

    Args:
        name: Raw organization name.

    Returns:
        The normalized name, e.g. "Kröller-Müller Museum" becomes
        "kroller muller museum". An empty or punctuation-only name
        yields an empty string.
    """
    import unicodedata

    # NFKD splits an accented letter into its base letter plus combining marks.
    decomposed = unicodedata.normalize('NFKD', name)
    # Drop the combining marks. They are not word characters, so the
    # punctuation regex below would otherwise turn each one into a space and
    # split the word in two ("kro ller mu ller").
    without_marks = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    lowered = without_marks.lower().strip()
    # Replace punctuation with spaces so "a-b" and "a b" normalize alike.
    spaced = re.sub(r'[^\w\s]', ' ', lowered)
    # Collapse any run of whitespace into a single space.
    return ' '.join(spaced.split())
def extract_abbreviation(name: str) -> str:
    """Build a short upper-case abbreviation from an organization name.

    Punctuation is treated as a word separator. Articles, prepositions,
    conjunctions, the generic words "museum" and "stichting", and
    single-character words are ignored. If nothing remains, the first three
    raw words are used instead.

    Args:
        name: Organization name as written by the organization.

    Returns:
        The first four characters of the only remaining word, or the initials
        of up to five remaining words, upper-cased. 'XXX' when the name
        contains no usable characters.
    """
    stopwords = frozenset((
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting',
    ))
    tokens = re.sub(r'[^\w\s]', ' ', name).split()
    significant = [
        tok for tok in tokens
        if len(tok) > 1 and tok.lower() not in stopwords
    ]
    # Fall back to the leading raw words when every word was filtered out.
    chosen = significant or tokens[:3]

    if len(chosen) == 1:
        # A lone word is abbreviated by its prefix rather than one initial.
        code = chosen[0][:4]
    else:
        code = ''.join(tok[0] for tok in chosen[:5])

    return code.upper() or 'XXX'
def match_known_org(emic_name: str) -> Optional[Tuple[str, str, str, str]]:
    """Look up an organization in KNOWN_ORGS by its (normalized) name.

    An exact match on the normalized name is tried first. Otherwise the known
    names are scanned from longest to shortest and the first one that contains
    the normalized name, or is contained in it, wins.

    Args:
        emic_name: Organization name as found in the custodian file.

    Returns:
        The (province, city_code, city_name, inst_type) tuple of the matching
        entry, or None when nothing matches or the name normalizes to an
        empty string.
    """
    name_lower = normalize_name(emic_name)

    # An empty string is a substring of every key, so without this guard a
    # punctuation-only name would "match" the longest known organization.
    if not name_lower:
        return None

    # Exact match first
    exact = KNOWN_ORGS.get(name_lower)
    if exact is not None:
        return exact

    # Partial match in either direction; longest known names are tried first
    # so that the most specific entry wins.
    # NOTE(review): this is a plain substring test without word boundaries, so
    # very short names or keys (e.g. 'cbg') can match inside unrelated words.
    for known_name in sorted(KNOWN_ORGS, key=lambda key: -len(key)):
        if known_name in name_lower or name_lower in known_name:
            return KNOWN_ORGS[known_name]
    return None
# KNOWN_ORGS stores a country code in the province slot for organizations
# outside the Netherlands; these are the codes recognized as foreign.
_FOREIGN_COUNTRY_CODES = frozenset({'FR', 'DK', 'IT', 'BE', 'DE', 'GB', 'US'})

# 'FR' is ambiguous: it is both the Dutch province Friesland and the country
# code for France. Only the city codes listed here are treated as French;
# every other 'FR' entry is Friesland. Extend this set when adding French
# organizations to KNOWN_ORGS.
_FRENCH_CITY_CODES = frozenset({'CAE'})


def _split_country(province: str, city_code: str) -> Tuple[str, str]:
    """Return the (country, province) pair for a KNOWN_ORGS region code."""
    if province == 'FR' and city_code.upper() not in _FRENCH_CITY_CODES:
        return 'NL', province  # Friesland, not France
    if province in _FOREIGN_COUNTRY_CODES:
        return province, 'XX'  # province is unknown for foreign entries
    return 'NL', province


def process_pending_file(filepath: Path, dry_run: bool = True) -> Optional[str]:
    """Resolve a single PENDING custodian file against KNOWN_ORGS.

    The file's ``custodian_name.emic_name`` is matched against the known
    organizations. On a match a new GHCID of the form
    ``{country}-{province}-{city_code}-{inst_type}-{abbrev}`` is built, the
    YAML is updated with the GHCID, location and resolution metadata, and the
    file is renamed to ``{ghcid}.yaml`` inside CUSTODIAN_DIR.

    Args:
        filepath: Path of the PENDING YAML file.
        dry_run: When True, only print what would happen; nothing is written.

    Returns:
        The new GHCID when the file was resolved, the string 'dry_run' when a
        match was found in dry-run mode, or None when the file has no usable
        name, no known organization matches, the target file name is already
        taken, or an error occurred (errors are printed, not raised).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # An empty file loads as None; anything but a mapping cannot be resolved.
        if not isinstance(data, dict):
            print(f"[ERROR] {filepath.name}: not a YAML mapping")
            return None

        custodian = data.get('custodian_name')
        emic_name = custodian.get('emic_name', '') if isinstance(custodian, dict) else ''
        if not emic_name:
            return None

        result = match_known_org(emic_name)
        if not result:
            return None

        province, city_code, city_name, inst_type = result
        abbrev = extract_abbreviation(emic_name)

        # Handle non-Dutch organizations (and keep Friesland in the Netherlands)
        country, province = _split_country(province, city_code)

        new_ghcid = f"{country}-{province}-{city_code.upper()}-{inst_type}-{abbrev}"
        new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

        # Handle collision: disambiguate with a slug of the organization name
        if new_path.exists() and new_path != filepath:
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
            # shutil.move would silently replace an existing file; refuse
            # rather than destroy another custodian record.
            if new_path.exists() and new_path != filepath:
                print(f"[ERROR] {filepath.name}: target {new_path.name} already exists")
                return None

        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f" Location: {city_name} ({country if country != 'NL' else province})")
            print(f" -> {new_ghcid}.yaml")
            return 'dry_run'

        # Update data
        data['ghcid_current'] = new_ghcid
        # A key that is present but empty ("location:") loads as None.
        if data.get('location') is None:
            data['location'] = {}
        data['location']['city'] = city_name
        data['location']['country'] = country
        if data.get('ghcid_resolution') is None:
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'known_organization_database'
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()

        # Rewrite in place first, then rename to the resolved file name.
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        shutil.move(filepath, new_path)

        print(f"[RESOLVED] {emic_name}")
        print(f" -> {new_ghcid}.yaml")
        return new_ghcid
    except Exception as e:
        # Deliberate per-file boundary: one bad file must not abort the batch.
        print(f"[ERROR] {filepath.name}: {e}")
        return None
def main():
    """Command-line entry point: resolve every PENDING file in CUSTODIAN_DIR.

    Accepts a single optional flag, ``--dry-run``, which reports the matches
    without modifying any file. Prints a summary when done. Files for which
    process_pending_file returns nothing (no match, no name, or an error) are
    all counted under "Not in database".
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('--dry-run', action='store_true')
    dry_run = cli.parse_args().dry_run

    # Process all PENDING files (not just NL)
    candidates = list(CUSTODIAN_DIR.glob("*PENDING*.yaml"))
    print(f"Processing {len(candidates)} PENDING files against {len(KNOWN_ORGS)} known organizations...")
    print()

    hits = 0
    for path in candidates:
        if process_pending_file(path, dry_run=dry_run):
            hits += 1
    misses = len(candidates) - hits

    print()
    label = 'Would resolve' if dry_run else 'Resolved'
    print(f"{label}: {hits}")
    print(f"Not in database: {misses}")


if __name__ == '__main__':
    main()