#!/usr/bin/env python3
"""
Resolve PENDING files using a comprehensive known organizations database.

This script contains manually curated locations for Dutch heritage organizations
that couldn't be resolved automatically.

Usage:
    python scripts/resolve_pending_known_orgs.py --dry-run   # preview only, no files touched
    python scripts/resolve_pending_known_orgs.py             # rewrite and rename matched files
"""
|
|
|
|
import re
|
|
import yaml
|
|
import shutil
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, Optional, Tuple
|
|
|
|
# Directory holding one YAML file per custodian; PENDING files are looked up here.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Known organizations with their locations
# Format: 'normalized_name': (province, city_code, city_name, inst_type)
#
# Keys are lower-case, punctuation-free and diacritics-free names.
# For Dutch entries `province` is the two-letter province code; for foreign
# entries the same slot carries the country code instead.
# NOTE: 'FR' is ambiguous in this table - it is the province code on the
# Frisian entries (Franeker, Leeuwarden, Sneek, Workum) and the country code
# on 'caen memorial'. Consumers have to disambiguate the two.
KNOWN_ORGS = {
    # Museums
    'amsterdamse school museum het schip': ('NH', 'AMS', 'Amsterdam', 'M'),
    'bonnefanten': ('LI', 'MAA', 'Maastricht', 'M'),
    'bonami spelcomputer museum': ('OV', 'ZWO', 'Zwolle', 'M'),
    'bakkerijmuseum de oude bakkerij': ('NH', 'MED', 'Medemblik', 'M'),
    'chabot museum': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'coda museum': ('GE', 'APE', 'Apeldoorn', 'M'),
    'comm museum voor communicatie': ('ZH', 'DHA', 'Den Haag', 'M'),
    'cruquius museum': ('NH', 'HAA', 'Haarlemmermeer', 'M'),
    'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'),  # Belgium
    'dordrechts museum': ('ZH', 'DOR', 'Dordrecht', 'M'),
    'dutch museum of freemasonry': ('ZH', 'DHA', 'Den Haag', 'M'),
    'eise eisinga planetarium': ('FR', 'FRA', 'Franeker', 'M'),
    'elisabeth weeshuis museum': ('GE', 'CUL', 'Culemborg', 'M'),  # Culemborg lies in Gelderland
    'design museum huis dedel': ('ZH', 'DHA', 'Den Haag', 'M'),
    'fries landbouw museum': ('FR', 'LEE', 'Leeuwarden', 'M'),
    'fries scheepvaart museum': ('FR', 'SNE', 'Sneek', 'M'),
    'gelderse archeologie': ('GE', 'ARN', 'Arnhem', 'R'),
    'gelders archief': ('GE', 'ARN', 'Arnhem', 'A'),
    'gorcums museum': ('ZH', 'GOR', 'Gorinchem', 'M'),
    'hart museum': ('NH', 'AMS', 'Amsterdam', 'M'),
    'h art museum': ('NH', 'AMS', 'Amsterdam', 'M'),
    'het drentse landschap': ('DR', 'ASS', 'Assen', 'N'),
    'het museum voor onbedoelde kunst': ('NH', 'AMS', 'Amsterdam', 'M'),
    'het schip': ('NH', 'AMS', 'Amsterdam', 'M'),
    'huygens instituut': ('NH', 'AMS', 'Amsterdam', 'R'),
    'katwijks museum': ('ZH', 'KAT', 'Katwijk', 'M'),
    'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'),
    'kunsthal': ('ZH', 'ROT', 'Rotterdam', 'G'),
    'literatuurmuseum': ('ZH', 'DHA', 'Den Haag', 'M'),
    'museum aan de ijssel': ('GE', 'DOE', 'Doesburg', 'M'),
    'museum de buitenplaats': ('DR', 'EEL', 'Eelde', 'M'),
    'museum de casteelse poort': ('GE', 'WAG', 'Wageningen', 'M'),
    'museum de koperen knop': ('ZH', 'HAR', 'Hardinxveld', 'M'),  # Hardinxveld lies in Zuid-Holland
    'museum de lakenhal': ('ZH', 'LEI', 'Leiden', 'M'),
    'museum geert groote huis': ('OV', 'DEV', 'Deventer', 'M'),
    'museum het oude raadhuis': ('FL', 'URK', 'Urk', 'M'),  # Urk lies in Flevoland
    'museum het valkhof': ('GE', 'NIJ', 'Nijmegen', 'M'),
    'museum hoeksche waard': ('ZH', 'OIB', 'Oud-Beijerland', 'M'),
    'museum huys der historie': ('NB', 'HEL', 'Helmond', 'M'),
    'museum ijsselstein': ('UT', 'IJS', 'IJsselstein', 'M'),
    'museum kaap skil': ('NH', 'TEX', 'Texel', 'M'),
    'museum kasteel wijchen': ('GE', 'WIJ', 'Wijchen', 'M'),
    'museum maelwael van lymborch': ('GE', 'NIJ', 'Nijmegen', 'M'),
    'museum ons lieve heer op solder': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum plus bus': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum romeinse katakomben': ('LI', 'VAL', 'Valkenburg', 'M'),
    'museum stedhus': ('FR', 'WOR', 'Workum', 'M'),
    # NOTE(review): this museum is presumably the one in Veldhoven (NB) - confirm city/province.
    'museum t oude slot': ('GE', 'VEL', 'Velp', 'M'),
    'museum tot zover': ('NH', 'AMS', 'Amsterdam', 'M'),
    # NOTE(review): possibly located in Vledder (DR) rather than Vierhouten - confirm.
    'museum valse kunst': ('GE', 'VIE', 'Vierhouten', 'M'),
    'museum van de twintigste eeuw': ('NH', 'HOO', 'Hoorn', 'M'),
    'museum van lien': ('GE', 'WAG', 'Wageningen', 'M'),
    'museum vd 20e eeuw': ('NH', 'HOO', 'Hoorn', 'M'),
    'museum voormeer': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum zaanse tijd': ('NH', 'ZAA', 'Zaandam', 'M'),
    'museumboerderij west frisia': ('NH', 'HOO', 'Hoogkarspel', 'M'),
    'museumpark': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'nationaal militair museum': ('UT', 'SOE', 'Soesterberg', 'M'),
    'nationaal monument oranjehotel': ('ZH', 'DHA', 'Den Haag', 'M'),
    'nationaal muziekinstrumenten fonds': ('NH', 'AMS', 'Amsterdam', 'M'),
    'nationaal orgelmuseum': ('GE', 'ELB', 'Elburg', 'M'),
    'nationaal tinnen figuren museum': ('OV', 'OMM', 'Ommen', 'M'),  # Ommen lies in Overijssel
    # NOTE(review): Noordwolde is presumably the Frisian village - confirm the province code.
    'nationaal vlechtmuseum': ('DR', 'NOR', 'Noordwolde', 'M'),
    'nederlands dans theater': ('ZH', 'DHA', 'Den Haag', 'E'),
    'nederlands fotomuseum': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'nederlands instituut voor beeld en geluid': ('NH', 'HIL', 'Hilversum', 'A'),
    'nederlands mijnmuseum': ('LI', 'HEE', 'Heerlen', 'M'),
    'nederlands transport museum': ('NH', 'NIE', 'Nieuw-Vennep', 'M'),  # Nieuw-Vennep lies in Noord-Holland
    'nieuwe kerk amsterdam': ('NH', 'AMS', 'Amsterdam', 'H'),
    'nieuwe kerk delft': ('ZH', 'DEL', 'Delft', 'H'),
    'nijntje museum': ('UT', 'UTR', 'Utrecht', 'M'),
    'nh museum': ('NH', 'HAA', 'Haarlem', 'M'),
    'oorlogsmuseum overloon': ('NB', 'OVL', 'Overloon', 'M'),
    'openluchtmuseum het hoogeland': ('GR', 'WAR', 'Warffum', 'M'),
    'paleis het loo': ('GE', 'APE', 'Apeldoorn', 'M'),
    'purmerends museum': ('NH', 'PUR', 'Purmerend', 'M'),
    'rijksmuseum boerhaave': ('ZH', 'LEI', 'Leiden', 'M'),
    'rijksmuseum twenthe': ('OV', 'ENS', 'Enschede', 'M'),
    'singer laren': ('NH', 'LAR', 'Laren', 'M'),
    'sonnenborgh museum': ('UT', 'UTR', 'Utrecht', 'M'),
    'zeeuws museum': ('ZE', 'MID', 'Middelburg', 'M'),

    # Libraries
    'de bblthk': ('GE', 'WAG', 'Wageningen', 'L'),
    'kb nationale bibliotheek': ('ZH', 'DHA', 'Den Haag', 'L'),

    # Archives
    'digitar het online archief': ('UT', 'UTR', 'Utrecht', 'D'),

    # Organizations (stichtingen, etc.)
    '3 october vereeniging': ('ZH', 'LEI', 'Leiden', 'S'),
    'abdij o l v koningshoeven': ('NB', 'TIL', 'Tilburg', 'H'),
    'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'),
    'bijenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
    'bomenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
    'boerennatuur': ('UT', 'UTR', 'Utrecht', 'N'),
    'cbg': ('ZH', 'DHA', 'Den Haag', 'R'),  # Central Bureau for Genealogy
    'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'),
    'de commandostichting': ('NH', 'HAA', 'Haarlem', 'N'),
    'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'),
    'delamar': ('NH', 'AMS', 'Amsterdam', 'E'),
    'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'),
    'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'),
    'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'),
    'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
    'fonds 21': ('UT', 'UTR', 'Utrecht', 'N'),
    'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'),
    'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'),
    'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'),
    'kenniscentrum immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'),
    'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'),
    'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'),

    # Research centers
    'adc archeoprojecten': ('UT', 'AME', 'Amersfoort', 'R'),  # Amersfoort lies in Utrecht province
    'archol': ('ZH', 'LEI', 'Leiden', 'R'),
    'kitlv': ('ZH', 'LEI', 'Leiden', 'R'),

    # Theaters/Venues
    'theater de veste': ('ZH', 'DEL', 'Delft', 'E'),
    'theater a d schie': ('ZH', 'SCH', 'Schiedam', 'E'),

    # Foreign organizations that should be reclassified
    'caen memorial': ('FR', 'CAE', 'Caen', 'M'),  # France
    'den gamle by': ('DK', 'AAR', 'Aarhus', 'M'),  # Denmark
    'den kongelige samling': ('DK', 'CPH', 'Copenhagen', 'M'),  # Denmark
    'castello di rivoli': ('IT', 'TOR', 'Torino', 'M'),  # Italy
    'consorzio delle residenze reali sabaude': ('IT', 'TOR', 'Torino', 'M'),  # Italy
}
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize an organization name for matching against KNOWN_ORGS keys.

    The name is NFKD-decomposed, stripped of combining marks (so accented
    letters collapse to their base letter), lower-cased, has every remaining
    non-word/non-space character replaced by a space, and finally has runs of
    whitespace collapsed to single spaces.

    Args:
        name: Raw organization name.

    Returns:
        The normalized name; an empty string when nothing matchable remains.
    """
    import unicodedata

    decomposed = unicodedata.normalize('NFKD', name)
    # Combining marks are not matched by \w, so the punctuation substitution
    # below would turn them into spaces and split words in two
    # ("Kröller" -> "kro ller"). Drop them before anything else.
    without_marks = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    normalized = without_marks.lower().strip()
    # Remove punctuation
    normalized = re.sub(r'[^\w\s]', ' ', normalized)
    return ' '.join(normalized.split())
|
|
|
|
|
|
def extract_abbreviation(name: str) -> str:
    """Build the upper-case abbreviation used as the last GHCID component.

    Punctuation is replaced by spaces, then stop words and one-character
    tokens are discarded. A single remaining token contributes its first four
    characters; otherwise the initials of up to five tokens are joined. When
    every token was discarded, the first three raw tokens are used instead,
    and 'XXX' is returned if there are no tokens at all.
    """
    ignorable = frozenset((
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting',
    ))

    tokens = re.sub(r'[^\w\s]', ' ', name).split()

    significant = []
    for token in tokens:
        if len(token) > 1 and token.lower() not in ignorable:
            significant.append(token)

    # Nothing meaningful survived the filter: fall back to the leading raw tokens.
    if not significant:
        significant = tokens[:3]

    if len(significant) == 1:
        code = significant[0][:4]
    else:
        code = ''.join(token[0] for token in significant[:5])

    return code.upper() or 'XXX'
|
|
|
|
|
|
def match_known_org(emic_name: str) -> Optional[Tuple[str, str, str, str]]:
    """Look an organization name up in KNOWN_ORGS.

    Matching is attempted in this order:

    1. the normalized name equals a key;
    2. a key occurs inside the normalized name, or the normalized name occurs
       as a run of whole words inside a key. Keys are tried longest first so
       the most specific entry wins.

    Args:
        emic_name: Organization name as found in the custodian file.

    Returns:
        The ``(province, city_code, city_name, inst_type)`` tuple of the
        matching entry, or ``None`` when nothing matches.
    """
    if not emic_name:
        return None

    name_lower = normalize_name(emic_name)
    # An empty string is a substring of every key; without this guard a name
    # made only of punctuation would resolve to the longest known entry.
    if not name_lower:
        return None

    # Exact match first
    if name_lower in KNOWN_ORGS:
        return KNOWN_ORGS[name_lower]

    # Space padding turns the reverse containment test into a whole-word test,
    # so a fragment such as "art" no longer matches inside "hart museum".
    padded_name = f" {name_lower} "

    # Partial match - longest known names first
    for known_name, location in sorted(KNOWN_ORGS.items(), key=lambda x: -len(x[0])):
        if known_name in name_lower or padded_name in f" {known_name} ":
            return location

    return None
|
|
|
|
|
|
# Country codes that KNOWN_ORGS stores in the province slot for non-Dutch entries.
_FOREIGN_COUNTRY_CODES = frozenset({'FR', 'DK', 'IT', 'BE', 'DE', 'GB', 'US'})

# 'FR' is ambiguous in KNOWN_ORGS: it is the province code of Friesland and
# the country code of France. Only entries whose city code is listed here are
# treated as French; every other 'FR' entry is the Dutch province.
_FRENCH_CITY_CODES = frozenset({'CAE'})


def _split_country_and_province(province: str, city_code: str) -> Tuple[str, str]:
    """Return the (country, province) pair for a KNOWN_ORGS location."""
    if province == 'FR' and city_code.upper() not in _FRENCH_CITY_CODES:
        return 'NL', 'FR'
    if province in _FOREIGN_COUNTRY_CODES:
        return province, 'XX'
    return 'NL', province


def _unique_target(base_ghcid: str, emic_name: str, source: Path) -> Tuple[str, Path]:
    """Return a (ghcid, path) pair in CUSTODIAN_DIR that overwrites no other file."""
    ghcid = base_ghcid
    target = CUSTODIAN_DIR / f"{ghcid}.yaml"
    if not target.exists() or target == source:
        return ghcid, target

    # First fallback: append a slug of the name. Trailing dashes left behind
    # by the 30-character cut are removed.
    name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30].rstrip('-')
    stem = f"{base_ghcid}-{name_slug}" if name_slug else base_ghcid
    ghcid = stem
    target = CUSTODIAN_DIR / f"{ghcid}.yaml"

    # Second fallback: the slugged name may be taken as well; number it rather
    # than letting the move below replace an existing custodian file.
    counter = 2
    while target.exists() and target != source:
        ghcid = f"{stem}-{counter}"
        target = CUSTODIAN_DIR / f"{ghcid}.yaml"
        counter += 1
    return ghcid, target


def process_pending_file(filepath: Path, dry_run: bool = True) -> Optional[str]:
    """Process a single PENDING file.

    The file is parsed as YAML, its ``custodian_name.emic_name`` is matched
    against the known-organizations table and, on a match, a GHCID of the form
    ``{country}-{province}-{CITY}-{type}-{abbrev}`` is derived. Unless
    ``dry_run`` is set, the file is rewritten with the new identifier,
    location and resolution metadata and then moved to ``<ghcid>.yaml`` in
    ``CUSTODIAN_DIR``.

    Args:
        filepath: Path of the PENDING YAML file.
        dry_run: When true, only print what would happen.

    Returns:
        ``'dry_run'`` for a match in dry-run mode, the new GHCID after a real
        resolution, or ``None`` when the file has no usable name, matches no
        known organization, or could not be processed (an ``[ERROR]`` line is
        printed in the latter case).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # An empty document loads as None and a scalar document as str/int.
        if not isinstance(data, dict):
            print(f"[ERROR] {filepath.name}: expected a YAML mapping at top level")
            return None

        # `custodian_name:` may be present but null or a plain string.
        custodian_name = data.get('custodian_name')
        emic_name = custodian_name.get('emic_name', '') if isinstance(custodian_name, dict) else ''
        if not emic_name or not isinstance(emic_name, str):
            return None

        result = match_known_org(emic_name)
        if not result:
            return None

        province, city_code, city_name, inst_type = result
        abbrev = extract_abbreviation(emic_name)

        # Handle non-Dutch organizations (and keep Friesland Dutch)
        country, province = _split_country_and_province(province, city_code)

        new_ghcid = f"{country}-{province}-{city_code.upper()}-{inst_type}-{abbrev}"

        # Handle collision
        new_ghcid, new_path = _unique_target(new_ghcid, emic_name, filepath)

        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f" Location: {city_name} ({country if country != 'NL' else province})")
            print(f" -> {new_ghcid}.yaml")
            return 'dry_run'

        # Update data
        data['ghcid_current'] = new_ghcid
        # A key that exists with a null value must be replaced as well.
        if not isinstance(data.get('location'), dict):
            data['location'] = {}
        data['location']['city'] = city_name
        data['location']['country'] = country

        if not isinstance(data.get('ghcid_resolution'), dict):
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'known_organization_database'
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        if new_path != filepath:
            shutil.move(filepath, new_path)
        print(f"[RESOLVED] {emic_name}")
        print(f" -> {new_ghcid}.yaml")

        return new_ghcid

    except Exception as e:
        # Deliberately broad: one unreadable or malformed file must not abort
        # the whole batch; the failure is reported and the file is skipped.
        print(f"[ERROR] {filepath.name}: {e}")
        return None
|
|
|
|
|
|
def main():
    """Command-line entry point: resolve every PENDING file in CUSTODIAN_DIR.

    Accepts a single ``--dry-run`` flag. Each file whose name matches
    ``*PENDING*.yaml`` is handed to ``process_pending_file``; a truthy result
    counts as resolved, anything else is reported under "Not in database"
    (that figure therefore also covers files that were skipped or failed).
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('--dry-run', action='store_true')
    options = cli.parse_args()

    # Process all PENDING files (not just NL)
    candidates = list(CUSTODIAN_DIR.glob("*PENDING*.yaml"))
    print(f"Processing {len(candidates)} PENDING files against {len(KNOWN_ORGS)} known organizations...")
    print()

    outcomes = [process_pending_file(path, dry_run=options.dry_run) for path in candidates]
    resolved = sum(1 for outcome in outcomes if outcome)
    not_found = len(outcomes) - resolved

    print()
    print(f"{'Would resolve' if options.dry_run else 'Resolved'}: {resolved}")
    print(f"Not in database: {not_found}")


if __name__ == '__main__':
    main()
|