# glam/scripts/resolve_pending_known_orgs.py

#!/usr/bin/env python3
"""
Resolve PENDING files using a comprehensive known organizations database.

This script contains manually curated locations for Dutch heritage organizations
that couldn't be resolved automatically.

Usage:
    python scripts/resolve_pending_known_orgs.py --dry-run
    python scripts/resolve_pending_known_orgs.py
"""

import re
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple

# Directory holding the custodian YAML files: PENDING files are globbed from
# here and resolved files are renamed to their new GHCID inside it.
# NOTE(review): hard-coded absolute path into one user's home directory; the
# script only runs unmodified on a machine with this exact layout.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Known organizations with their locations
# Format: 'normalized_name': (province, city_code, city_name, inst_type)
#
# - Keys must already be in the form produced by normalize_name() (lowercase,
#   no diacritics, punctuation replaced by single spaces), because lookups
#   compare the normalized emic name against these keys.
# - `province` is the two-letter Dutch province code. For organizations outside
#   the Netherlands the same slot holds the country code instead (see the
#   "Foreign organizations" section and the DIVA entry).
#   NOTE(review): 'FR' is therefore ambiguous - it is used both for Friesland
#   and for France.
# - `inst_type` is the single-letter institution type used in the GHCID.
KNOWN_ORGS = {
    # Museums
    'amsterdamse school museum het schip': ('NH', 'AMS', 'Amsterdam', 'M'),
    'bonnefanten': ('LI', 'MAA', 'Maastricht', 'M'),
    'bonami spelcomputer museum': ('OV', 'ZWO', 'Zwolle', 'M'),
    'bakkerijmuseum de oude bakkerij': ('NH', 'MED', 'Medemblik', 'M'),
    'chabot museum': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'coda museum': ('GE', 'APE', 'Apeldoorn', 'M'),
    'comm museum voor communicatie': ('ZH', 'DHA', 'Den Haag', 'M'),
    'cruquius museum': ('NH', 'HAA', 'Haarlemmermeer', 'M'),
    'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'),  # Belgium
    'dordrechts museum': ('ZH', 'DOR', 'Dordrecht', 'M'),
    'dutch museum of freemasonry': ('ZH', 'DHA', 'Den Haag', 'M'),
    'eise eisinga planetarium': ('FR', 'FRA', 'Franeker', 'M'),
    'elisabeth weeshuis museum': ('GE', 'CUL', 'Culemborg', 'M'),  # Culemborg is in Gelderland
    'design museum huis dedel': ('ZH', 'DHA', 'Den Haag', 'M'),
    'fries landbouw museum': ('FR', 'LEE', 'Leeuwarden', 'M'),
    'fries scheepvaart museum': ('FR', 'SNE', 'Sneek', 'M'),
    'gelderse archeologie': ('GE', 'ARN', 'Arnhem', 'R'),
    'gelders archief': ('GE', 'ARN', 'Arnhem', 'A'),
    'gorcums museum': ('ZH', 'GOR', 'Gorinchem', 'M'),
    'hart museum': ('NH', 'AMS', 'Amsterdam', 'M'),
    'h art museum': ('NH', 'AMS', 'Amsterdam', 'M'),
    'het drentse landschap': ('DR', 'ASS', 'Assen', 'N'),
    'het museum voor onbedoelde kunst': ('NH', 'AMS', 'Amsterdam', 'M'),
    'het schip': ('NH', 'AMS', 'Amsterdam', 'M'),
    'huygens instituut': ('NH', 'AMS', 'Amsterdam', 'R'),
    'katwijks museum': ('ZH', 'KAT', 'Katwijk', 'M'),
    'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'),
    'kunsthal': ('ZH', 'ROT', 'Rotterdam', 'G'),
    'literatuurmuseum': ('ZH', 'DHA', 'Den Haag', 'M'),
    'museum aan de ijssel': ('GE', 'DOE', 'Doesburg', 'M'),
    'museum de buitenplaats': ('DR', 'EEL', 'Eelde', 'M'),
    'museum de casteelse poort': ('GE', 'WAG', 'Wageningen', 'M'),
    'museum de koperen knop': ('ZH', 'HAR', 'Hardinxveld', 'M'),  # Hardinxveld is in Zuid-Holland
    'museum de lakenhal': ('ZH', 'LEI', 'Leiden', 'M'),
    'museum geert groote huis': ('OV', 'DEV', 'Deventer', 'M'),
    'museum het oude raadhuis': ('FL', 'URK', 'Urk', 'M'),  # Urk is in Flevoland
    'museum het valkhof': ('GE', 'NIJ', 'Nijmegen', 'M'),
    'museum hoeksche waard': ('ZH', 'OIB', 'Oud-Beijerland', 'M'),
    'museum huys der historie': ('NB', 'HEL', 'Helmond', 'M'),
    'museum ijsselstein': ('UT', 'IJS', 'IJsselstein', 'M'),
    'museum kaap skil': ('NH', 'TEX', 'Texel', 'M'),
    'museum kasteel wijchen': ('GE', 'WIJ', 'Wijchen', 'M'),
    'museum maelwael van lymborch': ('GE', 'NIJ', 'Nijmegen', 'M'),
    'museum ons lieve heer op solder': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum plus bus': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum romeinse katakomben': ('LI', 'VAL', 'Valkenburg', 'M'),
    'museum stedhus': ('FR', 'WOR', 'Workum', 'M'),
    'museum t oude slot': ('GE', 'VEL', 'Velp', 'M'),
    'museum tot zover': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum valse kunst': ('GE', 'VIE', 'Vierhouten', 'M'),
    'museum van de twintigste eeuw': ('NH', 'HOO', 'Hoorn', 'M'),
    'museum van lien': ('GE', 'WAG', 'Wageningen', 'M'),
    'museum vd 20e eeuw': ('NH', 'HOO', 'Hoorn', 'M'),
    'museum voormeer': ('NH', 'AMS', 'Amsterdam', 'M'),
    'museum zaanse tijd': ('NH', 'ZAA', 'Zaandam', 'M'),
    'museumboerderij west frisia': ('NH', 'HOO', 'Hoogkarspel', 'M'),
    'museumpark': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'nationaal militair museum': ('UT', 'SOE', 'Soesterberg', 'M'),
    'nationaal monument oranjehotel': ('ZH', 'DHA', 'Den Haag', 'M'),
    'nationaal muziekinstrumenten fonds': ('NH', 'AMS', 'Amsterdam', 'M'),
    'nationaal orgelmuseum': ('GE', 'ELB', 'Elburg', 'M'),
    'nationaal tinnen figuren museum': ('OV', 'OMM', 'Ommen', 'M'),  # Ommen is in Overijssel
    # NOTE(review): Noordwolde lies in Friesland, not Drenthe. Left as 'DR'
    # because 'FR' in the province slot also serves as the France marker;
    # correct this together with that ambiguity.
    'nationaal vlechtmuseum': ('DR', 'NOR', 'Noordwolde', 'M'),
    'nederlands dans theater': ('ZH', 'DHA', 'Den Haag', 'E'),
    'nederlands fotomuseum': ('ZH', 'ROT', 'Rotterdam', 'M'),
    'nederlands instituut voor beeld en geluid': ('NH', 'HIL', 'Hilversum', 'A'),
    'nederlands mijnmuseum': ('LI', 'HEE', 'Heerlen', 'M'),
    'nederlands transport museum': ('NH', 'NIE', 'Nieuw-Vennep', 'M'),  # Nieuw-Vennep is in Noord-Holland
    'nieuwe kerk amsterdam': ('NH', 'AMS', 'Amsterdam', 'H'),
    'nieuwe kerk delft': ('ZH', 'DEL', 'Delft', 'H'),
    'nijntje museum': ('UT', 'UTR', 'Utrecht', 'M'),
    'nh museum': ('NH', 'HAA', 'Haarlem', 'M'),
    'oorlogsmuseum overloon': ('NB', 'OVL', 'Overloon', 'M'),
    'openluchtmuseum het hoogeland': ('GR', 'WAR', 'Warffum', 'M'),
    'paleis het loo': ('GE', 'APE', 'Apeldoorn', 'M'),
    'purmerends museum': ('NH', 'PUR', 'Purmerend', 'M'),
    'rijksmuseum boerhaave': ('ZH', 'LEI', 'Leiden', 'M'),
    'rijksmuseum twenthe': ('OV', 'ENS', 'Enschede', 'M'),
    'singer laren': ('NH', 'LAR', 'Laren', 'M'),
    'sonnenborgh museum': ('UT', 'UTR', 'Utrecht', 'M'),
    'zeeuws museum': ('ZE', 'MID', 'Middelburg', 'M'),

    # Libraries
    'de bblthk': ('GE', 'WAG', 'Wageningen', 'L'),
    'kb nationale bibliotheek': ('ZH', 'DHA', 'Den Haag', 'L'),

    # Archives
    'digitar het online archief': ('UT', 'UTR', 'Utrecht', 'D'),

    # Organizations (stichtingen, etc.)
    '3 october vereeniging': ('ZH', 'LEI', 'Leiden', 'S'),
    'abdij o l v koningshoeven': ('NB', 'TIL', 'Tilburg', 'H'),
    'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'),
    'bijenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
    'bomenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
    'boerennatuur': ('UT', 'UTR', 'Utrecht', 'N'),
    'cbg': ('ZH', 'DHA', 'Den Haag', 'R'),  # Central Bureau for Genealogy
    'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'),
    'de commandostichting': ('NH', 'HAA', 'Haarlem', 'N'),
    'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'),
    'delamar': ('NH', 'AMS', 'Amsterdam', 'E'),
    'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'),
    'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'),
    'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'),
    'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
    'fonds 21': ('UT', 'UTR', 'Utrecht', 'N'),
    'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'),
    'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'),
    'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'),
    'kenniscentrum immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'),
    'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'),
    'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'),

    # Research centers
    'adc archeoprojecten': ('UT', 'AME', 'Amersfoort', 'R'),  # Amersfoort is in Utrecht
    'archol': ('ZH', 'LEI', 'Leiden', 'R'),
    'kitlv': ('ZH', 'LEI', 'Leiden', 'R'),

    # Theaters/Venues
    'theater de veste': ('ZH', 'DEL', 'Delft', 'E'),
    'theater a d schie': ('ZH', 'SCH', 'Schiedam', 'E'),

    # Foreign organizations that should be reclassified
    # (first slot is the country code, not a Dutch province)
    'caen memorial': ('FR', 'CAE', 'Caen', 'M'),  # France
    'den gamle by': ('DK', 'AAR', 'Aarhus', 'M'),  # Denmark
    'den kongelige samling': ('DK', 'CPH', 'Copenhagen', 'M'),  # Denmark
    'castello di rivoli': ('IT', 'TOR', 'Torino', 'M'),  # Italy
    'consorzio delle residenze reali sabaude': ('IT', 'TOR', 'Torino', 'M'),  # Italy
}


def normalize_name(name: str) -> str:
    """Normalize an organization name for matching against KNOWN_ORGS keys.

    The name is Unicode-decomposed (NFKD), stripped of combining marks so an
    accented letter collapses to its base letter (o-umlaut becomes "o"),
    lowercased, and every remaining character that is neither a word
    character nor whitespace is replaced by a space. Runs of whitespace are
    then collapsed to single spaces and the ends are trimmed.

    Args:
        name: Raw organization name.

    Returns:
        The normalized name. This is the empty string when the input holds
        nothing but punctuation and/or whitespace.
    """
    import unicodedata

    decomposed = unicodedata.normalize('NFKD', name)
    # Combining marks are not matched by \w, so the punctuation regex below
    # would turn each one into a space and split the word in two
    # ("kro ller" instead of "kroller"). Remove them first.
    without_marks = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    lowered = without_marks.lower()
    # Replace punctuation with spaces (rather than deleting it) so that
    # "O.L.V." becomes "o l v" and hyphenated names split into words.
    spaced = re.sub(r'[^\w\s]', ' ', lowered)
    return ' '.join(spaced.split())


def extract_abbreviation(name: str) -> str:
    """Build the upper-case abbreviation used as the last GHCID component.

    Punctuation is replaced by spaces, then Dutch/English stop words, the
    generic words "museum" and "stichting", and one-character tokens are
    ignored. If that leaves nothing, the first three raw tokens are used
    instead. A single remaining token yields its first four characters;
    several tokens yield the initials of at most the first five.

    Args:
        name: Organization name (un-normalized).

    Returns:
        The abbreviation in upper case, or 'XXX' when the name contains no
        usable token at all.
    """
    stop_words = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting',
    }

    tokens = re.sub(r'[^\w\s]', ' ', name).split()
    significant = [
        tok for tok in tokens
        if len(tok) > 1 and tok.lower() not in stop_words
    ]
    if not significant:
        # Nothing meaningful left: fall back to whatever the name starts with.
        significant = tokens[:3]

    if len(significant) == 1:
        code = significant[0][:4]
    else:
        code = ''.join(tok[0] for tok in significant[:5])

    return code.upper() or 'XXX'


def match_known_org(emic_name: str) -> Optional[Tuple[str, str, str, str]]:
    """Look up an organization in KNOWN_ORGS by its emic (local) name.

    The name is normalized with normalize_name() and compared to the table
    keys: first as an exact key, then by substring containment in either
    direction, trying the longest keys first so the most specific entry wins.

    NOTE(review): the containment test is plain substring matching with no
    word boundaries, so short or generic names (e.g. a bare "museum") can
    still match an unrelated entry. Dry-run output should be reviewed.

    Args:
        emic_name: Organization name as found in the PENDING file.

    Returns:
        The (province, city_code, city_name, inst_type) tuple of the matched
        entry, or None when nothing matches or the name normalizes to an
        empty string.
    """
    name_lower = normalize_name(emic_name)

    # The empty string is a substring of every key, so without this guard a
    # punctuation-only name would "match" the longest key in the table.
    if not name_lower:
        return None

    # Exact match first
    if name_lower in KNOWN_ORGS:
        return KNOWN_ORGS[name_lower]

    # Partial match - the known name is contained in the emic name or the
    # other way round. sorted() is stable, so keys of equal length keep their
    # order of definition.
    for known_name in sorted(KNOWN_ORGS, key=lambda key: -len(key)):
        if known_name in name_lower or name_lower in known_name:
            return KNOWN_ORGS[known_name]

    return None


# Country codes that KNOWN_ORGS stores in the province slot of foreign entries.
# 'FR' is deliberately absent: it is also the province code for Friesland.
_FOREIGN_COUNTRY_CODES = frozenset({'DK', 'IT', 'BE', 'DE', 'GB', 'US'})

# City codes that mark an 'FR' entry as France rather than Friesland.
# NOTE(review): stop-gap keyed on the one French entry in KNOWN_ORGS
# ('caen memorial'). Extend this set when adding French organizations, or
# give foreign entries an unambiguous marker in the table.
_FRENCH_CITY_CODES = frozenset({'CAE'})


def _split_country_and_province(province: str, city_code: str) -> Tuple[str, str]:
    """Return the (country, province) pair for a KNOWN_ORGS province/city code."""
    is_france = province == 'FR' and city_code.upper() in _FRENCH_CITY_CODES
    if is_france or province in _FOREIGN_COUNTRY_CODES:
        # Foreign entry: the slot held a country code; province is unknown.
        return province, 'XX'
    return 'NL', province


def process_pending_file(filepath: Path, dry_run: bool = True) -> Optional[str]:
    """Resolve one PENDING custodian file against the known organizations.

    The file's ``custodian_name.emic_name`` is matched with match_known_org().
    On a match a GHCID of the form
    ``{country}-{province}-{CITY}-{type}-{abbrev}`` is built. If a file with
    that name already exists, a slug of the emic name is appended. Unless
    ``dry_run`` is set, the YAML is updated in place (``ghcid_current``,
    ``location.city``, ``location.country``, ``ghcid_resolution``) and the
    file is renamed to ``<ghcid>.yaml`` inside CUSTODIAN_DIR.

    Args:
        filepath: Path of the PENDING YAML file.
        dry_run: When true, only print what would happen; nothing is written.

    Returns:
        ``'dry_run'`` when a match was found in dry-run mode, the new GHCID
        after a real resolve, or None when the file has no emic name, no
        entry matches, the target name is already taken, or an error
        occurred (errors are printed, not raised).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # An empty file loads as None and a scalar/list document has no keys.
        if not isinstance(data, dict):
            print(f"[ERROR] {filepath.name}: YAML document is not a mapping")
            return None

        # `or {}` also covers a key that is present but null.
        emic_name = (data.get('custodian_name') or {}).get('emic_name', '')
        if not emic_name:
            return None

        result = match_known_org(emic_name)
        if not result:
            return None

        province, city_code, city_name, inst_type = result
        abbrev = extract_abbreviation(emic_name)

        # Handle non-Dutch organizations. 'FR' alone is not enough to decide:
        # it denotes Friesland for Dutch entries and France for foreign ones.
        country, province = _split_country_and_province(province, city_code)

        new_ghcid = f"{country}-{province}-{city_code.upper()}-{inst_type}-{abbrev}"
        new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

        # Handle collision
        if new_path.exists() and new_path != filepath:
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

            # The slugged name may be taken as well (e.g. a duplicate PENDING
            # record). shutil.move() can silently replace an existing file,
            # so refuse rather than overwrite another custodian.
            if new_path.exists() and new_path != filepath:
                print(f"[ERROR] {filepath.name}: target {new_path.name} already exists")
                return None

        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f"  Location: {city_name} ({country if country != 'NL' else province})")
            print(f"  -> {new_ghcid}.yaml")
            return 'dry_run'

        # Update data
        data['ghcid_current'] = new_ghcid
        if data.get('location') is None:
            data['location'] = {}
        data['location']['city'] = city_name
        data['location']['country'] = country

        if data.get('ghcid_resolution') is None:
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'known_organization_database'
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        shutil.move(filepath, new_path)
        print(f"[RESOLVED] {emic_name}")
        print(f"  -> {new_ghcid}.yaml")

        return new_ghcid

    except Exception as e:
        # Per-file boundary of a batch job: report and continue with the
        # next file instead of aborting the whole run.
        print(f"[ERROR] {filepath.name}: {e}")
        return None


def main():
    """Command-line entry point: resolve every PENDING file in CUSTODIAN_DIR.

    Parses ``--dry-run``, runs process_pending_file() on each
    ``*PENDING*.yaml`` file and prints a summary of the counts.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Resolve PENDING custodian files using the known organizations table.",
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="only report what would be resolved; do not modify or rename files",
    )
    args = parser.parse_args()

    # Process all PENDING files (not just NL). glob() yields files in
    # arbitrary order; sort so that repeated runs over the same directory
    # treat name collisions identically and produce comparable output.
    pending_files = sorted(CUSTODIAN_DIR.glob("*PENDING*.yaml"))
    print(f"Processing {len(pending_files)} PENDING files against {len(KNOWN_ORGS)} known organizations...")
    print()

    resolved = 0
    # NOTE: also counts files that failed with an error or lack an emic name,
    # because process_pending_file() returns None in all of those cases.
    not_found = 0

    for filepath in pending_files:
        if process_pending_file(filepath, dry_run=args.dry_run):
            resolved += 1
        else:
            not_found += 1

    print()
    print(f"{'Would resolve' if args.dry_run else 'Resolved'}: {resolved}")
    print(f"Not in database: {not_found}")


if __name__ == '__main__':
    main()