# glam/scripts/resolve_pending_by_region.py

#!/usr/bin/env python3
"""
Resolve NL PENDING files based on regional/provincial names.

Organizations like "Erfgoedhuis Zuid-Holland", "Noord-Hollands Archief", etc.
can be resolved using their regional headquarters.

Usage:
    python scripts/resolve_pending_by_region.py --dry-run
    python scripts/resolve_pending_by_region.py
"""

import re
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Tuple, Dict

# Directory that holds the custodian YAML files, both the PENDING inputs and
# the renamed outputs. NOTE(review): this is a hard-coded absolute path, so
# the script only works where this exact directory exists.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Regional organizations with their typical headquarters.
#
# Keys are lower-case text fragments; match_regional_org() tests each key as a
# substring of the lower-cased organization name, trying the longest keys
# first. Values are (province code, city code, city name) tuples.
REGIONAL_ORGS = {
    # Full organization names. These are still matched as substrings, but being
    # long they are tried before the generic province/region keys below.
    'archief gooi en vechtstreek': ('NH', 'HIL', 'Hilversum'),
    'waterlands archief': ('NH', 'PUR', 'Purmerend'),
    'noord-hollands archief': ('NH', 'HAA', 'Haarlem'),
    'west-brabants archief': ('NB', 'BER', 'Bergen op Zoom'),
    'streekarchief midden-holland': ('ZH', 'GOU', 'Gouda'),
    'erfgoedhuis zuid-holland': ('ZH', 'DEL', 'Delft'),
    'steunpunt cultureel erfgoed noord-holland': ('NH', 'AMS', 'Amsterdam'),
    'stichting landschap noord-holland': ('NH', 'AMS', 'Amsterdam'),
    'collectie overijssel': ('OV', 'ZWO', 'Zwolle'),
    'huis voor de kunsten limburg': ('LI', 'ROE', 'Roermond'),
    'landschapsbeheer drenthe': ('DR', 'ASS', 'Assen'),
    'natuurmuseum brabant': ('NB', 'TIL', 'Tilburg'),
    'groene hotspot zeeland': ('ZE', 'GOE', 'Goes'),
    'utrechts landschap': ('UT', 'UTR', 'Utrecht'),
    'tracé - limburgs samenlevingsarchief': ('LI', 'MAA', 'Maastricht'),

    # Province names: any organization whose name contains one of these is
    # assumed to be headquartered in the listed city (the provincial capital
    # according to the original author).
    'noord-holland': ('NH', 'HAA', 'Haarlem'),
    'zuid-holland': ('ZH', 'DHA', 'Den Haag'),
    'noord-brabant': ('NB', 'DBO', "'s-Hertogenbosch"),
    'brabant': ('NB', 'DBO', "'s-Hertogenbosch"),
    'limburg': ('LI', 'MAA', 'Maastricht'),
    'zeeland': ('ZE', 'MID', 'Middelburg'),
    'drenthe': ('DR', 'ASS', 'Assen'),
    'overijssel': ('OV', 'ZWO', 'Zwolle'),
    'gelderland': ('GE', 'ARN', 'Arnhem'),
    'friesland': ('FR', 'LEE', 'Leeuwarden'),
    'groningen': ('GR', 'GRO', 'Groningen'),
    'flevoland': ('FL', 'LEL', 'Lelystad'),

    # Sub-provincial regions, each mapped to one representative city.
    'gooi': ('NH', 'HIL', 'Hilversum'),
    'vechtstreek': ('NH', 'WEE', 'Weesp'),
    'kennemerland': ('NH', 'HAA', 'Haarlem'),
    'west-friesland': ('NH', 'HOO', 'Hoorn'),
    'waterland': ('NH', 'PUR', 'Purmerend'),
    'zaanstreek': ('NH', 'ZAA', 'Zaandam'),
    'alblasserwaard': ('ZH', 'GOR', 'Gorinchem'),
    'vijfheerenlanden': ('UT', 'VIA', 'Vianen'),
    'achterhoek': ('GE', 'DOE', 'Doetinchem'),
    'veluwe': ('GE', 'APE', 'Apeldoorn'),
    'rivierenland': ('GE', 'TIE', 'Tiel'),
    'twente': ('OV', 'ENS', 'Enschede'),
    'salland': ('OV', 'DEV', 'Deventer'),
    'de peel': ('NB', 'HEL', 'Helmond'),
    'maasvallei': ('LI', 'VEN', 'Venlo'),
    'heuvelland': ('LI', 'MAA', 'Maastricht'),
    'walcheren': ('ZE', 'MID', 'Middelburg'),

    # Museums with regional scope
    'philzuid': ('NB', 'EIN', 'Eindhoven'),  # Philips museum
}


def match_regional_org(emic_name: str) -> Optional[Tuple[str, str, str]]:
    """Look up the headquarters for a regionally named organization.

    The name is lower-cased and every REGIONAL_ORGS key is tested as a
    substring of it. Keys are tried longest first, so a specific key such as
    'noord-brabant' takes precedence over a shorter one it contains
    ('brabant'). Keys of equal length keep their dictionary order.

    Returns the (province code, city code, city name) tuple of the first key
    that matches, or None when no key occurs in the name.
    """
    lowered = emic_name.lower()

    # sorted() is stable even with reverse=True, so equally long keys stay in
    # the order in which they were declared.
    longest_first = sorted(REGIONAL_ORGS, key=len, reverse=True)

    return next(
        (REGIONAL_ORGS[key] for key in longest_first if key in lowered),
        None,
    )


def extract_abbreviation(name: str) -> str:
    """Derive a short upper-case code from an organization name.

    Punctuation is treated as whitespace. Dutch/English articles and
    prepositions, the generic words "museum" and "stichting", and
    one-character tokens are then discarded. If exactly one word is left, its
    first four characters are used; otherwise the initials of at most five
    words are concatenated. When filtering discards every word, the first
    three raw tokens are used instead. Returns 'XXX' if nothing can be
    derived at all (e.g. an empty name).
    """
    ignored = frozenset((
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting',
    ))

    tokens = re.sub(r'[^\w\s]', ' ', name).split()
    significant = [t for t in tokens if len(t) > 1 and t.lower() not in ignored]

    # Names made up solely of ignored words still need some abbreviation.
    if not significant:
        significant = tokens[:3]

    if len(significant) == 1:
        code = significant[0][:4].upper()
    else:
        code = ''.join(t[0] for t in significant[:5]).upper()

    return code or 'XXX'


def get_institution_type(data: Dict) -> str:
    """Return the one-letter type code for a custodian record.

    Reads ``data['institution_type']``. A missing key is treated as 'MUSEUM',
    and any value that is not a known type name also yields 'M'.
    """
    codes = {
        'MUSEUM': 'M',
        'ARCHIVE': 'A',
        'LIBRARY': 'L',
        'GALLERY': 'G',
        'RESEARCH_CENTER': 'R',
        'EDUCATION_PROVIDER': 'E',
        'OFFICIAL_INSTITUTION': 'O',
        'COLLECTING_SOCIETY': 'S',
        'HOLY_SITES': 'H',
        'DIGITAL_PLATFORM': 'D',
        'BOTANICAL_ZOO': 'B',
        'CORPORATION': 'C',
        'FEATURES': 'F',
        'INTANGIBLE_HERITAGE_GROUP': 'I',
        'MIXED': 'X',
        'PERSONAL_COLLECTION': 'P',
        'NGO': 'N',
        'TASTE_SMELL': 'T',
        'UNKNOWN': 'U',
    }
    declared = data.get('institution_type', 'MUSEUM')
    if declared in codes:
        return codes[declared]
    return 'M'


def process_pending_file(filepath: Path, dry_run: bool = True) -> Optional[str]:
    """Resolve one PENDING custodian file to a region-based GHCID.

    The file's ``custodian_name.emic_name`` is matched against the regional
    table. On a match a GHCID of the form
    ``NL-<province>-<city code>-<type>-<abbreviation>`` is built. If a file
    with that name already exists, a slug of the organization name is
    appended. If that name is taken as well, the file is left untouched and
    an error is reported, so an existing record is never overwritten.

    Args:
        filepath: Path of the PENDING YAML file.
        dry_run: When true, only print what would happen; nothing is written
            or renamed.

    Returns:
        The new GHCID after a successful rename, the string ``'dry_run'``
        when a match was found in dry-run mode, or ``None`` when the file has
        no usable name, no region matches, or an error occurred (errors are
        printed, not raised).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # An empty or non-mapping YAML document cannot carry a custodian name.
        if not isinstance(data, dict):
            return None

        custodian_name = data.get('custodian_name')
        if not isinstance(custodian_name, dict):
            return None

        emic_name = custodian_name.get('emic_name', '')
        if not emic_name or not isinstance(emic_name, str):
            return None

        # Try to match regional org
        result = match_regional_org(emic_name)
        if not result:
            return None

        prov, city_code, city_name = result
        inst_type = get_institution_type(data)
        abbrev = extract_abbreviation(emic_name)

        # Generate new GHCID
        new_ghcid = f"NL-{prov}-{city_code.upper()}-{inst_type}-{abbrev}"

        # Check for collision
        new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
        if new_path.exists() and new_path != filepath:
            # Add name suffix for collision
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

            # shutil.move() would silently replace an existing file, so a
            # second collision must abort instead of destroying that record.
            if new_path.exists() and new_path != filepath:
                raise FileExistsError(
                    f"target already exists: {new_path.name}"
                )

        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f"  Region/Province: {prov} ({city_name})")
            print(f"  -> {new_ghcid}.yaml")
            return 'dry_run'

        # Update data
        data['ghcid_current'] = new_ghcid
        # A key that is present but null is treated like a missing one.
        if data.get('location') is None:
            data['location'] = {}
        data['location']['city'] = city_name
        data['location']['country'] = 'NL'

        # Add resolution provenance
        if data.get('ghcid_resolution') is None:
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'regional_name_extraction'
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()
        data['ghcid_resolution']['matched_region'] = prov

        # Serialize before opening the file for writing: if dumping fails,
        # the original PENDING file has not been truncated yet.
        serialized = yaml.dump(
            data, default_flow_style=False, allow_unicode=True, sort_keys=False
        )

        # Write and rename
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)

        shutil.move(filepath, new_path)
        print(f"[RESOLVED] {emic_name}")
        print(f"  Region: {prov} ({city_name}) -> {new_ghcid}.yaml")

        return new_ghcid

    except Exception as e:
        # Per-file boundary: report and continue with the next file.
        print(f"[ERROR] {filepath.name}: {e}")
        return None


def main():
    """Command-line entry point: process every NL PENDING custodian file.

    Options:
        --dry-run   report the intended renames without touching any file.
        --limit N   restrict processing to a slice of the files found
                    (0, the default, means no limit).

    Prints a summary at the end. Every file for which processing returned no
    result (no name, no matching region, or an error) is counted under
    "No region found".
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('--dry-run', action='store_true')
    cli.add_argument('--limit', type=int, default=0)
    args = cli.parse_args()

    # Only Dutch files that still carry the placeholder PENDING identifier.
    pending_files = list(CUSTODIAN_DIR.glob("NL-XX-XXX-PENDING-*.yaml"))
    print(f"Found {len(pending_files)} NL PENDING files")

    if args.limit:
        pending_files = pending_files[:args.limit]

    # Files are processed strictly in list order; each outcome is either a
    # truthy identifier/marker or None.
    outcomes = [
        process_pending_file(path, dry_run=args.dry_run)
        for path in pending_files
    ]
    resolved = sum(1 for outcome in outcomes if outcome)
    failed = len(outcomes) - resolved

    label = 'Would resolve' if args.dry_run else 'Resolved'
    print(f"\n{label}: {resolved}")
    print(f"No region found: {failed}")


if __name__ == '__main__':
    main()