# glam/scripts/resolve_pending_wikidata_v2.py

#!/usr/bin/env python3
"""
Resolve PENDING files using Wikidata location lookup (improved version).

This script:
1. Searches Wikidata using multiple query variations
2. Gets location (P131) from Wikidata via SPARQL
3. Maps location to Dutch province/city code (expanded mapping)
4. Assigns proper GHCID and renames file

Usage:
    python scripts/resolve_pending_wikidata_v2.py --dry-run    # Preview
    python scripts/resolve_pending_wikidata_v2.py --limit 50   # Process 50 files
    python scripts/resolve_pending_wikidata_v2.py              # Process all
"""

import os
import re
import yaml
import time
import requests
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List

# Directory that holds the custodian YAML records. Defaults to the original
# developer checkout; set the GLAM_CUSTODIAN_DIR environment variable to run
# the script against a different location.
CUSTODIAN_DIR = Path(
    os.environ.get('GLAM_CUSTODIAN_DIR') or '/Users/kempersc/apps/glam/data/custodian'
)

# Expanded Dutch city to province/code mapping.
#
# Keys are lower-cased place names (including a few aliases such as
# 'den haag' / 'the hague' / "'s-gravenhage"); values are
# (province code, city code) tuples. City codes are only unique within a
# province, e.g. 'VEL' is used for both Velsen (NH) and Velp (GE).
CITY_MAPPING = {
    # Noord-Holland
    'amsterdam': ('NH', 'AMS'), 'haarlem': ('NH', 'HAA'), 'alkmaar': ('NH', 'ALK'),
    'hilversum': ('NH', 'HIL'), 'zaandam': ('NH', 'ZAA'), 'hoorn': ('NH', 'HOO'),
    'enkhuizen': ('NH', 'ENK'), 'purmerend': ('NH', 'PUR'), 'amstelveen': ('NH', 'AMV'),
    'heemstede': ('NH', 'HEE'), 'beverwijk': ('NH', 'BEV'), 'velsen': ('NH', 'VEL'),
    'castricum': ('NH', 'CAS'), 'huizen': ('NH', 'HUI'), 'bussum': ('NH', 'BUS'),
    'naarden': ('NH', 'NAA'), 'weesp': ('NH', 'WEE'), 'edam': ('NH', 'EDA'),
    'volendam': ('NH', 'VOL'), 'texel': ('NH', 'TEX'), 'den helder': ('NH', 'DHE'),
    'schagen': ('NH', 'SGA'), 'heerhugowaard': ('NH', 'HHW'), 'laren': ('NH', 'LAR'),
    'blaricum': ('NH', 'BLA'), 'muiden': ('NH', 'MUI'), 'monnickendam': ('NH', 'MON'),
    'uithoorn': ('NH', 'UIT'),

    # Zuid-Holland
    'rotterdam': ('ZH', 'ROT'), 'den haag': ('ZH', 'DHA'), 'the hague': ('ZH', 'DHA'),
    "'s-gravenhage": ('ZH', 'DHA'), 'leiden': ('ZH', 'LEI'), 'delft': ('ZH', 'DEL'),
    'dordrecht': ('ZH', 'DOR'), 'gouda': ('ZH', 'GOU'), 'schiedam': ('ZH', 'SCH'),
    'zoetermeer': ('ZH', 'ZOE'), 'vlaardingen': ('ZH', 'VLA'), 'spijkenisse': ('ZH', 'SPI'),
    'alphen aan den rijn': ('ZH', 'AAR'), 'katwijk': ('ZH', 'KAT'),
    'rijswijk': ('ZH', 'RIJ'), 'wassenaar': ('ZH', 'WAS'), 'lisse': ('ZH', 'LIS'),
    'noordwijk': ('ZH', 'NWK'), 'gorinchem': ('ZH', 'GOR'), 'schoonhoven': ('ZH', 'SOH'),
    'hoek van holland': ('ZH', 'HVH'), 'maassluis': ('ZH', 'MSL'),

    # Utrecht
    'utrecht': ('UT', 'UTR'), 'amersfoort': ('UT', 'AME'), 'zeist': ('UT', 'ZEI'),
    'nieuwegein': ('UT', 'NIE'), 'veenendaal': ('UT', 'VEE'), 'houten': ('UT', 'HOU'),
    'soest': ('UT', 'SOE'), 'baarn': ('UT', 'BAA'), 'bunnik': ('UT', 'BUN'),
    'doorn': ('UT', 'DOO'), 'driebergen': ('UT', 'DRI'), 'wijk bij duurstede': ('UT', 'WBD'),
    'breukelen': ('UT', 'BRE'), 'vianen': ('UT', 'VIA'),

    # Gelderland
    'arnhem': ('GE', 'ARN'), 'nijmegen': ('GE', 'NIJ'), 'apeldoorn': ('GE', 'APE'),
    'ede': ('GE', 'EDE'), 'wageningen': ('GE', 'WAG'), 'doetinchem': ('GE', 'DOE'),
    'zutphen': ('GE', 'ZUT'), 'harderwijk': ('GE', 'HAR'), 'zevenaar': ('GE', 'ZEV'),
    'tiel': ('GE', 'TIE'), 'barneveld': ('GE', 'BNV'), 'elburg': ('GE', 'ELB'),
    'nunspeet': ('GE', 'NUN'), 'putten': ('GE', 'PUT'), 'ermelo': ('GE', 'ERM'),
    'epe': ('GE', 'EPE'), 'hattem': ('GE', 'HAT'), 'zaltbommel': ('GE', 'ZAL'),
    'winterswijk': ('GE', 'WIN'), 'oosterbeek': ('GE', 'OOS'), 'velp': ('GE', 'VEL'),
    'borculo': ('GE', 'BOR'), 'lochem': ('GE', 'LOC'), 'aalten': ('GE', 'AAL'),
    # Culemborg lies in Gelderland (Betuwe), not in the province of Utrecht.
    'culemborg': ('GE', 'CUL'),

    # Noord-Brabant
    'eindhoven': ('NB', 'EIN'), 'tilburg': ('NB', 'TIL'), 'breda': ('NB', 'BRE'),
    "'s-hertogenbosch": ('NB', 'DBO'), 'den bosch': ('NB', 'DBO'), 'helmond': ('NB', 'HEL'),
    'oss': ('NB', 'OSS'), 'roosendaal': ('NB', 'ROS'), 'bergen op zoom': ('NB', 'BOZ'),
    'waalwijk': ('NB', 'WAA'), 'uden': ('NB', 'UDE'), 'veghel': ('NB', 'VEG'),
    'boxtel': ('NB', 'BOX'), 'oisterwijk': ('NB', 'OIS'), 'vught': ('NB', 'VUG'),
    'cuijk': ('NB', 'CUI'), 'deurne': ('NB', 'DEU'), 'valkenswaard': ('NB', 'VAL'),
    'nuenen': ('NB', 'NUE'), 'best': ('NB', 'BES'), 'etten-leur': ('NB', 'ETL'),
    'oosterhout': ('NB', 'OOH'),

    # Limburg
    'maastricht': ('LI', 'MAA'), 'venlo': ('LI', 'VEN'), 'heerlen': ('LI', 'HEE'),
    'roermond': ('LI', 'ROE'), 'sittard': ('LI', 'SIT'), 'geleen': ('LI', 'GEL'),
    'weert': ('LI', 'WEE'), 'kerkrade': ('LI', 'KER'), 'valkenburg': ('LI', 'VAL'),
    'vaals': ('LI', 'VAA'), 'venray': ('LI', 'VRY'), 'thorn': ('LI', 'THO'),

    # Overijssel
    'zwolle': ('OV', 'ZWO'), 'deventer': ('OV', 'DEV'), 'enschede': ('OV', 'ENS'),
    'hengelo': ('OV', 'HEN'), 'almelo': ('OV', 'ALM'), 'kampen': ('OV', 'KAM'),
    'oldenzaal': ('OV', 'OLD'), 'hardenberg': ('OV', 'HDB'), 'rijssen': ('OV', 'RIJ'),
    'holten': ('OV', 'HOL'), 'raalte': ('OV', 'RAA'), 'ommen': ('OV', 'OMM'),
    'staphorst': ('OV', 'STA'), 'giethoorn': ('OV', 'GIE'), 'steenwijk': ('OV', 'STW'),

    # Friesland
    'leeuwarden': ('FR', 'LEE'), 'drachten': ('FR', 'DRA'), 'sneek': ('FR', 'SNE'),
    'heerenveen': ('FR', 'HEE'), 'harlingen': ('FR', 'HAR'), 'dokkum': ('FR', 'DOK'),
    'bolsward': ('FR', 'BOL'), 'franeker': ('FR', 'FRA'), 'joure': ('FR', 'JOU'),
    'workum': ('FR', 'WOR'), 'makkum': ('FR', 'MAK'), 'hindeloopen': ('FR', 'HIN'),

    # Drenthe
    'assen': ('DR', 'ASS'), 'emmen': ('DR', 'EMM'), 'hoogeveen': ('DR', 'HOO'),
    'meppel': ('DR', 'MEP'), 'coevorden': ('DR', 'COE'), 'borger': ('DR', 'BOR'),
    'veenhuizen': ('DR', 'VHU'),

    # Groningen
    'groningen': ('GR', 'GRO'), 'veendam': ('GR', 'VEE'), 'winschoten': ('GR', 'WIN'),
    'hoogezand': ('GR', 'HZD'), 'stadskanaal': ('GR', 'STK'), 'delfzijl': ('GR', 'DEL'),
    'appingedam': ('GR', 'APP'), 'ter apel': ('GR', 'TAP'),

    # Zeeland
    'middelburg': ('ZE', 'MID'), 'vlissingen': ('ZE', 'VLI'), 'goes': ('ZE', 'GOE'),
    'terneuzen': ('ZE', 'TER'), 'hulst': ('ZE', 'HUL'), 'zierikzee': ('ZE', 'ZIE'),
    'veere': ('ZE', 'VEE'), 'domburg': ('ZE', 'DOM'),

    # Flevoland
    'almere': ('FL', 'ALM'), 'lelystad': ('FL', 'LEL'), 'dronten': ('FL', 'DRO'),
    'emmeloord': ('FL', 'EMM'), 'urk': ('FL', 'URK'), 'zeewolde': ('FL', 'ZEE'),
}

# Institution type name -> single-letter code used in the GHCID.
TYPE_MAP = {
    'MUSEUM': 'M',
    'ARCHIVE': 'A',
    'LIBRARY': 'L',
    'GALLERY': 'G',
    'RESEARCH_CENTER': 'R',
    'EDUCATION_PROVIDER': 'E',
    'OFFICIAL_INSTITUTION': 'O',
    'COLLECTING_SOCIETY': 'S',
    'HOLY_SITES': 'H',
    'DIGITAL_PLATFORM': 'D',
    'BOTANICAL_ZOO': 'B',
    'CORPORATION': 'C',
    'FEATURES': 'F',
    'INTANGIBLE_HERITAGE_GROUP': 'I',
    'MIXED': 'X',
    'PERSONAL_COLLECTION': 'P',
    'NGO': 'N',
    'TASTE_SMELL': 'T',
    'UNKNOWN': 'U',
}

# Lower-cased words that contribute no letter to a name abbreviation.
SKIP_WORDS = {
    # Dutch function words
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'en', 'of',
    # English function words
    'the', 'a', 'an', 'for', 'and', 'or', 'at',
    # Generic organisation words
    'museum', 'stichting',
}


def _query_variations(name: str) -> List[str]:
    """Return the distinct, non-empty search strings to try for ``name``, in order."""
    candidates = [name]

    # Leading legal-form / article words are tried both with and without.
    lowered = name.lower()
    for prefix in ('stichting ', 'het ', 'de '):
        if lowered.startswith(prefix):
            candidates.append(name[len(prefix):])

    # Simplified version: drop parenthesised content, e.g. "Foo (Bar)" -> "Foo".
    candidates.append(re.sub(r'\([^)]*\)', '', name))
    # Version with every special character replaced by a space.
    candidates.append(re.sub(r'[^\w\s]', ' ', name))

    variations: List[str] = []
    for candidate in candidates:
        candidate = candidate.strip()
        # Skip blanks and repeats so no request is wasted on them.
        if candidate and candidate not in variations:
            variations.append(candidate)
    return variations


def search_wikidata_variations(name: str) -> Optional[str]:
    """Search Wikidata for ``name`` using several query variations.

    Each variation is searched in Dutch and then English via the
    ``wbsearchentities`` API. Within the first non-empty result list, a hit
    whose description mentions the Netherlands is preferred; otherwise the
    first hit is used.

    Args:
        name: Institution name as found in the pending record.

    Returns:
        The Wikidata entity ID of the chosen hit, or ``None`` when no
        variation produced a result (including when every request failed).
    """
    url = "https://www.wikidata.org/w/api.php"
    # Same identifying User-Agent as the SPARQL lookup in this script.
    headers = {'User-Agent': 'GLAM-Resolver/1.0'}
    dutch_markers = ('netherlands', 'dutch', 'nederland')

    for query in _query_variations(name):
        for lang in ('nl', 'en'):
            params = {
                'action': 'wbsearchentities',
                'search': query,
                'language': lang,
                'format': 'json',
                'limit': 5
            }
            try:
                resp = requests.get(url, params=params, headers=headers, timeout=10)
                resp.raise_for_status()
                results = resp.json().get('search') or []
            except (requests.RequestException, ValueError) as err:
                # Best effort: report the failure and move on to the next attempt.
                print(f"  [WARN] Wikidata search failed for {query!r} ({lang}): {err}")
                results = []

            hits = [hit for hit in results if isinstance(hit, dict) and hit.get('id')]
            if hits:
                # Prefer a hit that is described as Dutch.
                for hit in hits:
                    desc = (hit.get('description') or '').lower()
                    if any(marker in desc for marker in dutch_markers):
                        return hit['id']
                # Otherwise fall back to the top-ranked hit.
                return hits[0]['id']
            time.sleep(0.1)  # Rate limit

    return None


def get_location_via_sparql(entity_id: str) -> Optional[str]:
    """Get the label of an entity's P131 location from Wikidata via SPARQL.

    Args:
        entity_id: Wikidata item ID such as ``"Q42"``.

    Returns:
        The Dutch (falling back to English) label of one P131 value, or
        ``None`` when the ID is malformed, the entity has no P131 statement,
        or the request fails.
    """
    # The ID is interpolated into the query text, so only accept a well-formed
    # item ID; anything else cannot match and must not reach the endpoint.
    if not isinstance(entity_id, str) or not re.fullmatch(r'Q[0-9]+', entity_id):
        return None

    sparql = f"""
    SELECT ?locationLabel WHERE {{
      wd:{entity_id} wdt:P131 ?location.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en". }}
    }}
    LIMIT 1
    """
    url = "https://query.wikidata.org/sparql"
    headers = {'User-Agent': 'GLAM-Resolver/1.0'}
    try:
        resp = requests.get(url, params={'query': sparql, 'format': 'json'},
                            headers=headers, timeout=15)
        resp.raise_for_status()
        bindings = resp.json().get('results', {}).get('bindings', [])
    except (requests.RequestException, ValueError) as err:
        # Best effort: a failed lookup is reported and treated as "no location".
        print(f"  [WARN] SPARQL lookup failed for {entity_id}: {err}")
        return None

    if bindings:
        return bindings[0].get('locationLabel', {}).get('value', '')
    return None


def get_province_city(location: str) -> Tuple[Optional[str], Optional[str]]:
    """Look up the (province code, city code) pair for a place name.

    The name is lower-cased and stripped, then matched exactly against
    ``CITY_MAPPING``. ``(None, None)`` is returned for an empty or missing
    name and for a name that is not in the mapping.
    """
    unknown = (None, None)
    if not location:
        return unknown
    key = location.lower().strip()
    return CITY_MAPPING.get(key, unknown)


def generate_abbreviation(name: str) -> str:
    """Generate an abbreviation from the initials of the words in ``name``.

    Hyphens and apostrophes act as word separators. Each word is reduced to
    its letters and digits before use, so punctuation never ends up in the
    abbreviation (and therefore in the file name): "(Kunst)" contributes "K"
    and a bare "&" contributes nothing. Words listed in ``SKIP_WORDS`` are
    ignored.

    Args:
        name: Institution name.

    Returns:
        Up to eight upper-cased initials, or ``'UNK'`` when no word
        contributes a character.
    """
    initials = []
    for word in name.replace('-', ' ').replace("'", ' ').split():
        core = ''.join(ch for ch in word if ch.isalnum())
        # Check emptiness first: punctuation-only tokens have no initial.
        if core and core.lower() not in SKIP_WORDS:
            initials.append(core[0].upper())
    abbrev = ''.join(initials)
    return abbrev[:8] if abbrev else 'UNK'


def get_institution_type_code(data: Dict) -> str:
    """Return the one-letter institution type code for a record.

    Reads ``institution_type`` from ``data`` (treated as ``'MUSEUM'`` when the
    key is absent) and translates it through ``TYPE_MAP``. A value that is
    not in the map yields ``'M'``.
    """
    type_name = data.get('institution_type', 'MUSEUM')
    if type_name in TYPE_MAP:
        return TYPE_MAP[type_name]
    return 'M'


def load_yaml(filepath: Path) -> Optional[Dict]:
    """Load a YAML record from ``filepath``.

    Args:
        filepath: Path of the YAML file to read (UTF-8).

    Returns:
        The parsed mapping, or ``None`` when the file cannot be read, is not
        valid YAML, is empty, or does not contain a mapping at the top level.
        Failures are reported on stdout instead of being silently dropped.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except OSError as err:
        print(f"  [WARN] Could not read {filepath}: {err}")
        return None
    except (UnicodeDecodeError, yaml.YAMLError) as err:
        print(f"  [WARN] Could not parse {filepath}: {err}")
        return None

    if data is None:
        # Empty document.
        return None
    if not isinstance(data, dict):
        # Callers treat the result as a mapping; anything else is unusable.
        print(f"  [WARN] {filepath} does not contain a mapping; skipping")
        return None
    return data


def save_yaml(filepath: Path, data: Dict):
    """Serialise ``data`` as block-style YAML and write it to ``filepath``.

    The file is opened for writing as UTF-8, so existing content is
    replaced. Key order is kept as-is and non-ASCII characters are written
    unescaped.
    """
    dump_options = {
        'allow_unicode': True,
        'default_flow_style': False,
        'sort_keys': False,
        'width': 120,
    }
    with open(filepath, 'w', encoding='utf-8') as out:
        yaml.dump(data, out, **dump_options)


def main():
    """Resolve pending NL custodian files to proper GHCIDs via Wikidata.

    Command-line options:
        --dry-run   Only report what would be renamed; no file is changed.
        --limit N   Process at most N pending files (0 = all).

    For every ``NL-XX-XXX-PENDING-*.yaml`` file in ``CUSTODIAN_DIR`` the
    institution name is searched on Wikidata, its P131 location is mapped to
    a province/city code, and the record is written under its new GHCID file
    name before the pending file is removed. A summary of the outcomes is
    printed at the end.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0)
    args = parser.parse_args()

    # Only a positive limit is meaningful; a negative one would otherwise
    # silently slice files off the end of the list.
    limit = args.limit if args.limit > 0 else 0

    print("=" * 80)
    print("RESOLVING PENDING FILES VIA WIKIDATA (v2)")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    if limit:
        print(f"Limit: {limit} files")
    print()

    # Find NL PENDING files
    pending_files = sorted(CUSTODIAN_DIR.glob('NL-XX-XXX-PENDING-*.yaml'))
    if limit:
        pending_files = pending_files[:limit]

    print(f"Processing {len(pending_files)} files...")
    print()

    resolved = 0
    no_match = 0
    no_location = 0
    collision = 0
    skipped = 0

    # Targets already handed out in this run. In a dry run nothing is written,
    # so the file system alone cannot reveal that two pending files map to
    # the same new name.
    claimed = set()

    def is_taken(path):
        return path in claimed or path.exists()

    for i, filepath in enumerate(pending_files):
        data = load_yaml(filepath)
        if not isinstance(data, dict):
            skipped += 1
            continue

        # 'custodian_name' may be present but null, so do not chain .get().
        name_info = data.get('custodian_name')
        name = name_info.get('emic_name') if isinstance(name_info, dict) else None
        if not name or not isinstance(name, str):
            skipped += 1
            continue

        # Rate limiting
        if i > 0 and i % 5 == 0:
            time.sleep(1)

        # Search Wikidata with variations
        entity_id = search_wikidata_variations(name)
        if not entity_id:
            no_match += 1
            continue

        # Get location via SPARQL
        location = get_location_via_sparql(entity_id)
        province, city_code = get_province_city(location)

        if not province or not city_code:
            no_location += 1
            # Show only the first few misses to keep the dry-run output short.
            if args.dry_run and no_location <= 5:
                print(f"[NO LOC] {name[:50]}")
                print(f"  Wikidata: {entity_id}, Location: {location}")
            continue

        # Generate new GHCID
        inst_type = get_institution_type_code(data)
        abbrev = generate_abbreviation(name)
        new_ghcid = f"NL-{province}-{city_code}-{inst_type}-{abbrev}"
        new_filepath = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

        # Handle collision: retry once with a slug of the name appended.
        if is_taken(new_filepath):
            name_slug = re.sub(r'[^\w]+', '-', name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_filepath = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

        if is_taken(new_filepath):
            collision += 1
            continue

        claimed.add(new_filepath)

        print(f"[{'DRY' if args.dry_run else 'RESOLVE'}] {name[:50]}")
        print(f"  Wikidata: {entity_id}, Location: {location}")
        print(f"  -> {new_ghcid}.yaml")

        if not args.dry_run:
            # Update data
            data['ghcid_current'] = new_ghcid

            # Treat a null section like a missing one.
            if data.get('location') is None:
                data['location'] = {}
            data['location']['city'] = location
            data['location']['country'] = 'NL'

            if data.get('ghcid_resolution') is None:
                data['ghcid_resolution'] = {}
            data['ghcid_resolution']['method'] = 'wikidata_lookup'
            data['ghcid_resolution']['wikidata_id'] = entity_id
            data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()

            # Save under the new name first; the pending file is only removed
            # once the new file has been written.
            save_yaml(new_filepath, data)
            filepath.unlink()

        resolved += 1
        print()

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Resolved: {resolved}")
    print(f"No Wikidata match: {no_match}")
    print(f"No location in Wikidata: {no_location}")
    print(f"Collision: {collision}")
    print(f"Skipped (unreadable or unnamed): {skipped}")

# Run the resolver only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()