#!/usr/bin/env python3
"""
Extract city names from organization names in PENDING files.

This script looks for Dutch city names embedded in organization names
and resolves them to proper GHCIDs.

Usage:
    python scripts/resolve_pending_by_city_name.py --dry-run
    python scripts/resolve_pending_by_city_name.py
"""

import re
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Tuple, Dict

# Directory that holds the custodian YAML files; PENDING files are looked up
# here and resolved files are renamed within this same directory.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Dutch place names (lower case) mapped to (province code, city code).
# Keys must be unique: a repeated key in a dict literal silently keeps only
# the last value.
# NOTE(review): several short keys ('best', 'made', 'son', 'huizen', 'buren',
# ...) are also ordinary words — confirm the false-positive risk is acceptable.
DUTCH_CITIES = {
    # Noord-Holland
    'amsterdam': ('NH', 'AMS'), 'haarlem': ('NH', 'HAA'), 'alkmaar': ('NH', 'ALK'),
    'hilversum': ('NH', 'HIL'), 'zaandam': ('NH', 'ZAA'), 'hoorn': ('NH', 'HOO'),
    'enkhuizen': ('NH', 'ENK'), 'purmerend': ('NH', 'PUR'), 'amstelveen': ('NH', 'AMV'),
    'heemstede': ('NH', 'HEE'), 'beverwijk': ('NH', 'BEV'), 'velsen': ('NH', 'VEL'),
    'castricum': ('NH', 'CAS'), 'huizen': ('NH', 'HUI'), 'bussum': ('NH', 'BUS'),
    'naarden': ('NH', 'NAA'), 'weesp': ('NH', 'WEE'), 'edam': ('NH', 'EDA'),
    'volendam': ('NH', 'VOL'), 'texel': ('NH', 'TEX'), 'den helder': ('NH', 'DHE'),
    'schagen': ('NH', 'SGA'), 'heerhugowaard': ('NH', 'HHW'), 'laren': ('NH', 'LAR'),
    'blaricum': ('NH', 'BLA'), 'muiden': ('NH', 'MUI'), 'monnickendam': ('NH', 'MON'),
    'bergen': ('NH', 'BER'), 'heiloo': ('NH', 'HEI'), 'uitgeest': ('NH', 'UIT'),
    'krommenie': ('NH', 'KRO'), 'wormer': ('NH', 'WOR'), 'westzaan': ('NH', 'WZA'),
    'landsmeer': ('NH', 'LAN'), 'broek in waterland': ('NH', 'BIW'),

    # Zuid-Holland
    'rotterdam': ('ZH', 'ROT'), 'den haag': ('ZH', 'DHA'), 'the hague': ('ZH', 'DHA'),
    "'s-gravenhage": ('ZH', 'DHA'), 'leiden': ('ZH', 'LEI'), 'delft': ('ZH', 'DEL'),
    'dordrecht': ('ZH', 'DOR'), 'gouda': ('ZH', 'GOU'), 'schiedam': ('ZH', 'SCH'),
    'zoetermeer': ('ZH', 'ZOE'), 'vlaardingen': ('ZH', 'VLA'), 'spijkenisse': ('ZH', 'SPI'),
    'alphen aan den rijn': ('ZH', 'AAR'), 'katwijk': ('ZH', 'KAT'), 'leidschendam': ('ZH', 'LSD'),
    'voorburg': ('ZH', 'VOO'), 'rijswijk': ('ZH', 'RIJ'), 'wassenaar': ('ZH', 'WAS'),
    'oegstgeest': ('ZH', 'OEG'), 'voorschoten': ('ZH', 'VOR'), 'lisse': ('ZH', 'LIS'),
    'noordwijk': ('ZH', 'NWK'), 'sassenheim': ('ZH', 'SAS'), 'gorinchem': ('ZH', 'GOR'),
    # NOTE(review): Woerden appears to belong to the province of Utrecht —
    # confirm before changing, since the province code is part of the GHCID.
    'schoonhoven': ('ZH', 'SOH'), 'woerden': ('ZH', 'WOE'), 'sliedrecht': ('ZH', 'SLI'),
    'hoek van holland': ('ZH', 'HVH'), 'maassluis': ('ZH', 'MSL'), 'hellevoetsluis': ('ZH', 'HEV'),
    'brielle': ('ZH', 'BRI'), 'middelharnis': ('ZH', 'MID'), 'goedereede': ('ZH', 'GOE'),
    'papendrecht': ('ZH', 'PAP'), 'hendrik-ido-ambacht': ('ZH', 'HIA'), 'zwijndrecht': ('ZH', 'ZWI'),
    'barendrecht': ('ZH', 'BAR'), 'ridderkerk': ('ZH', 'RID'), 'capelle aan den ijssel': ('ZH', 'CAP'),

    # Utrecht
    'utrecht': ('UT', 'UTR'), 'amersfoort': ('UT', 'AME'), 'zeist': ('UT', 'ZEI'),
    'nieuwegein': ('UT', 'NIE'), 'veenendaal': ('UT', 'VEE'), 'houten': ('UT', 'HOU'),
    'soest': ('UT', 'SOE'), 'baarn': ('UT', 'BAA'), 'bunnik': ('UT', 'BUN'),
    'maarssen': ('UT', 'MAA'), 'de bilt': ('UT', 'BIL'), 'bilthoven': ('UT', 'BIT'),
    'doorn': ('UT', 'DOO'), 'driebergen': ('UT', 'DRI'), 'wijk bij duurstede': ('UT', 'WBD'),
    'breukelen': ('UT', 'BRE'), 'loenen': ('UT', 'LOE'), 'oudewater': ('UT', 'OUD'),
    # NOTE(review): Culemborg appears to lie in Gelderland rather than
    # Utrecht — confirm before changing.
    'rhenen': ('UT', 'RHE'), 'vianen': ('UT', 'VIA'), 'culemborg': ('UT', 'CUL'),

    # Gelderland
    'arnhem': ('GE', 'ARN'), 'nijmegen': ('GE', 'NIJ'), 'apeldoorn': ('GE', 'APE'),
    'ede': ('GE', 'EDE'), 'wageningen': ('GE', 'WAG'), 'doetinchem': ('GE', 'DOE'),
    'zutphen': ('GE', 'ZUT'), 'harderwijk': ('GE', 'HAR'), 'zevenaar': ('GE', 'ZEV'),
    'tiel': ('GE', 'TIE'), 'barneveld': ('GE', 'BNV'), 'elburg': ('GE', 'ELB'),
    'nunspeet': ('GE', 'NUN'), 'putten': ('GE', 'PUT'), 'ermelo': ('GE', 'ERM'),
    'epe': ('GE', 'EPE'), 'hattem': ('GE', 'HAT'), 'zaltbommel': ('GE', 'ZAL'),
    'winterswijk': ('GE', 'WIN'), 'oosterbeek': ('GE', 'OOS'), 'velp': ('GE', 'VEL'),
    'buren': ('GE', 'BUR'), 'groenlo': ('GE', 'GRO'), 'borculo': ('GE', 'BOR'),
    'lochem': ('GE', 'LOC'), 'bronkhorst': ('GE', 'BRO'),
    'elst': ('GE', 'ELS'), 'bemmel': ('GE', 'BEM'), 'duiven': ('GE', 'DUI'),
    'westervoort': ('GE', 'WES'), 'renkum': ('GE', 'REN'), 'rozendaal': ('GE', 'ROZ'),
    'berkelland': ('GE', 'BKL'), 'aalten': ('GE', 'AAL'), 'lichtenvoorde': ('GE', 'LIC'),

    # Noord-Brabant
    'eindhoven': ('NB', 'EIN'), 'tilburg': ('NB', 'TIL'), 'breda': ('NB', 'BRE'),
    "'s-hertogenbosch": ('NB', 'DBO'), 'den bosch': ('NB', 'DBO'), 'helmond': ('NB', 'HEL'),
    'oss': ('NB', 'OSS'), 'roosendaal': ('NB', 'ROS'), 'bergen op zoom': ('NB', 'BOZ'),
    'waalwijk': ('NB', 'WAA'), 'uden': ('NB', 'UDE'), 'veghel': ('NB', 'VEG'),
    'boxtel': ('NB', 'BOX'), 'oisterwijk': ('NB', 'OIS'), 'vught': ('NB', 'VUG'),
    'cuijk': ('NB', 'CUI'), 'deurne': ('NB', 'DEU'), 'geldrop': ('NB', 'GEL'),
    'mierlo': ('NB', 'MIE'), 'nuenen': ('NB', 'NUE'), 'valkenswaard': ('NB', 'VAL'),
    'heeze': ('NB', 'HEE'), 'best': ('NB', 'BES'), 'son': ('NB', 'SON'),
    'etten-leur': ('NB', 'ETL'), 'oosterhout': ('NB', 'OOH'), 'dongen': ('NB', 'DON'),
    'gilze': ('NB', 'GIL'), 'rijen': ('NB', 'RIJ'), 'made': ('NB', 'MAD'),
    'raamsdonk': ('NB', 'RAA'), 'geertruidenberg': ('NB', 'GEE'), 'heusden': ('NB', 'HEU'),
    'drunen': ('NB', 'DRU'), 'vlijmen': ('NB', 'VLI'),

    # Limburg
    'maastricht': ('LI', 'MAA'), 'venlo': ('LI', 'VEN'), 'heerlen': ('LI', 'HEE'),
    'roermond': ('LI', 'ROE'), 'sittard': ('LI', 'SIT'), 'geleen': ('LI', 'GEL'),
    'weert': ('LI', 'WEE'), 'kerkrade': ('LI', 'KER'), 'valkenburg': ('LI', 'VAL'),
    'vaals': ('LI', 'VAA'), 'meerssen': ('LI', 'MEE'), 'brunssum': ('LI', 'BRU'),
    'landgraaf': ('LI', 'LAN'), 'stein': ('LI', 'STE'), 'beek': ('LI', 'BEE'),
    'eijsden': ('LI', 'EIJ'), 'gulpen': ('LI', 'GUL'), 'margraten': ('LI', 'MAR'),
    'simpelveld': ('LI', 'SIM'), 'thorn': ('LI', 'THO'), 'horst': ('LI', 'HOR'),
    'venray': ('LI', 'VRY'), 'tegelen': ('LI', 'TEG'), 'blerick': ('LI', 'BLE'),
    'arcen': ('LI', 'ARC'), 'neeritter': ('LI', 'NEE'),

    # Overijssel
    'zwolle': ('OV', 'ZWO'), 'deventer': ('OV', 'DEV'), 'enschede': ('OV', 'ENS'),
    # 'hengelo' used to be listed under Gelderland as well; that entry was
    # always overwritten by this one, so only the effective mapping is kept.
    # NOTE(review): add a distinct key if Hengelo (Gelderland) must resolve.
    'hengelo': ('OV', 'HEN'), 'almelo': ('OV', 'ALM'), 'kampen': ('OV', 'KAM'),
    'oldenzaal': ('OV', 'OLD'), 'hardenberg': ('OV', 'HDB'), 'rijssen': ('OV', 'RIJ'),
    'holten': ('OV', 'HOL'), 'raalte': ('OV', 'RAA'), 'ommen': ('OV', 'OMM'),
    'dalfsen': ('OV', 'DAL'), 'staphorst': ('OV', 'STA'), 'giethoorn': ('OV', 'GIE'),
    'hasselt': ('OV', 'HAS'), 'steenwijk': ('OV', 'STW'), 'vollenhove': ('OV', 'VOL'),
    'blokzijl': ('OV', 'BLO'), 'tubbergen': ('OV', 'TUB'), 'losser': ('OV', 'LOS'),
    'denekamp': ('OV', 'DEN'), 'haaksbergen': ('OV', 'HAA'), 'markelo': ('OV', 'MRK'),
    'diepenheim': ('OV', 'DIE'), 'goor': ('OV', 'GOO'), 'vriezenveen': ('OV', 'VRI'),
    'wierden': ('OV', 'WIE'), 'nijverdal': ('OV', 'NIJ'), 'hellendoorn': ('OV', 'HEL'),

    # Friesland
    'leeuwarden': ('FR', 'LEE'), 'drachten': ('FR', 'DRA'), 'sneek': ('FR', 'SNE'),
    'heerenveen': ('FR', 'HEE'), 'harlingen': ('FR', 'HAR'), 'dokkum': ('FR', 'DOK'),
    'bolsward': ('FR', 'BOL'), 'franeker': ('FR', 'FRA'), 'joure': ('FR', 'JOU'),
    'lemmer': ('FR', 'LEM'), 'workum': ('FR', 'WOR'), 'makkum': ('FR', 'MAK'),
    'hindeloopen': ('FR', 'HIN'), 'stavoren': ('FR', 'STA'), 'sloten': ('FR', 'SLO'),
    'koudum': ('FR', 'KOU'), 'balk': ('FR', 'BAL'), 'gorredijk': ('FR', 'GOR'),
    'wolvega': ('FR', 'WOL'), 'akkrum': ('FR', 'AKK'), 'burgum': ('FR', 'BUR'),
    'kollum': ('FR', 'KOL'), 'damwoude': ('FR', 'DAM'), 'ferwerd': ('FR', 'FER'),

    # Drenthe
    'assen': ('DR', 'ASS'), 'emmen': ('DR', 'EMM'), 'hoogeveen': ('DR', 'HOO'),
    'meppel': ('DR', 'MEP'), 'coevorden': ('DR', 'COE'), 'borger': ('DR', 'BOR'),
    'dwingeloo': ('DR', 'DWI'), 'westerbork': ('DR', 'WES'), 'beilen': ('DR', 'BEI'),
    'roden': ('DR', 'ROD'), 'norg': ('DR', 'NOR'), 'zuidlaren': ('DR', 'ZUI'),
    'gieten': ('DR', 'GIE'), 'exloo': ('DR', 'EXL'), 'orvelte': ('DR', 'ORV'),
    'veenhuizen': ('DR', 'VHU'),

    # Groningen
    'groningen': ('GR', 'GRO'), 'veendam': ('GR', 'VEE'), 'winschoten': ('GR', 'WIN'),
    'hoogezand': ('GR', 'HZD'), 'stadskanaal': ('GR', 'STK'), 'delfzijl': ('GR', 'DEL'),
    'appingedam': ('GR', 'APP'), 'ter apel': ('GR', 'TAP'), 'leek': ('GR', 'LEE'),
    'marum': ('GR', 'MAR'), 'zuidhorn': ('GR', 'ZDH'), 'uithuizen': ('GR', 'UIT'),
    'loppersum': ('GR', 'LOP'), 'middelstum': ('GR', 'MID'), 'baflo': ('GR', 'BAF'),
    'usquert': ('GR', 'USQ'), 'warffum': ('GR', 'WAR'),

    # Zeeland
    'middelburg': ('ZE', 'MID'), 'vlissingen': ('ZE', 'VLI'), 'goes': ('ZE', 'GOE'),
    'terneuzen': ('ZE', 'TER'), 'hulst': ('ZE', 'HUL'), 'zierikzee': ('ZE', 'ZIE'),
    'veere': ('ZE', 'VEE'), 'domburg': ('ZE', 'DOM'), 'sluis': ('ZE', 'SLU'),
    'yerseke': ('ZE', 'YER'), 'tholen': ('ZE', 'THO'), 'sint-maartensdijk': ('ZE', 'SMD'),
    'bruinisse': ('ZE', 'BRU'), 'brouwershaven': ('ZE', 'BRO'),

    # Flevoland
    'almere': ('FL', 'ALM'), 'lelystad': ('FL', 'LEL'), 'dronten': ('FL', 'DRO'),
    'emmeloord': ('FL', 'EMM'), 'urk': ('FL', 'URK'), 'zeewolde': ('FL', 'ZEE'),
    'biddinghuizen': ('FL', 'BID'),
}

# Special place names that aren't cities but indicate locations.
# Maps a lower-case keyword to (province code, city code). In
# extract_city_from_name these keywords are checked before DUTCH_CITIES and
# are matched as plain substrings of the lower-cased name (no word boundary).
# NOTE(review): short keywords such as 'hilde' will therefore also match
# inside longer words — confirm this is intended.
SPECIAL_LOCATIONS = {
    'muiderslot': ('NH', 'MUI'),  # Muiden castle
    'hilde': ('NH', 'CAS'),  # Huis van Hilde in Castricum
    'veenhuizen': ('DR', 'VHU'),  # same mapping as the DUTCH_CITIES entry
    'kinderdijk': ('ZH', 'KIN'),
    'zaanstreek': ('NH', 'ZAA'),
    'tweestromenland': ('GE', 'MAA'),  # Maas-Waal region
}


def extract_city_from_name(org_name: str) -> Optional[Tuple[str, str, str]]:
    """
    Extract a known place name from an organization name.

    Matching is case-insensitive. Keywords in SPECIAL_LOCATIONS are tried
    first, as plain substrings and in dict order. After that the keys of
    DUTCH_CITIES are tried longest-first as whole words, so a multi-word
    name such as "bergen op zoom" wins over the shorter "bergen".

    Args:
        org_name: Organization name to inspect.

    Returns:
        (province_code, city_code, matched_name) where matched_name is the
        lower-case dictionary key that matched, or None if nothing matched.
    """
    # Typographic apostrophes are folded to ASCII so that names written as
    # "’s-Hertogenbosch" can match the "'s-..." keys.
    name_lower = org_name.lower().replace('\u2019', "'").replace('\u2018', "'")

    # Special locations take precedence over regular city names.
    # NOTE(review): substring matching means a short keyword also matches
    # inside longer words — kept as in the original behaviour.
    for location, (prov, code) in SPECIAL_LOCATIONS.items():
        if location in name_lower:
            return (prov, code, location)

    # Longest names first so that a contained shorter name cannot win.
    for city in sorted(DUTCH_CITIES, key=len, reverse=True):
        # Lookarounds instead of \b: \b requires a word character on one
        # side, so it can never match directly before a key that starts with
        # an apostrophe ("'s-gravenhage") when that key follows a space or
        # starts the string. For keys that begin and end with a word
        # character the two forms are equivalent.
        pattern = r'(?<!\w)' + re.escape(city) + r'(?!\w)'
        if re.search(pattern, name_lower):
            prov, code = DUTCH_CITIES[city]
            return (prov, code, city)

    return None


def extract_abbreviation(name: str) -> str:
    """Derive a short upper-case abbreviation from an organization name.

    Punctuation is treated as whitespace. Common Dutch/English function
    words, the generic words 'museum' and 'stichting', and single-character
    tokens are ignored; if that leaves nothing, the first three raw tokens
    are used instead. A single remaining word yields its first four
    characters, otherwise the initials of at most five words are joined.
    Returns 'XXX' when no characters are available at all.
    """
    stop_words = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting',
    }

    # Replace everything that is neither a word character nor whitespace.
    tokens = re.sub(r'[^\w\s]', ' ', name).split()

    significant = []
    for token in tokens:
        if len(token) > 1 and token.lower() not in stop_words:
            significant.append(token)

    # Nothing meaningful left: fall back to the leading raw tokens.
    if not significant:
        significant = tokens[:3]

    if len(significant) == 1:
        letters = significant[0][:4]
    else:
        letters = ''.join(word[0] for word in significant[:5])

    return letters.upper() or 'XXX'


def get_institution_type(data: Dict) -> str:
    """Return the one-letter type code for a custodian record.

    Reads ``data['institution_type']``; a missing key is treated as
    'MUSEUM', and any value that is not a known type name yields 'M'.
    """
    codes_by_type = {
        'MUSEUM': 'M',
        'ARCHIVE': 'A',
        'LIBRARY': 'L',
        'GALLERY': 'G',
        'RESEARCH_CENTER': 'R',
        'EDUCATION_PROVIDER': 'E',
        'OFFICIAL_INSTITUTION': 'O',
        'COLLECTING_SOCIETY': 'S',
        'HOLY_SITES': 'H',
        'DIGITAL_PLATFORM': 'D',
        'BOTANICAL_ZOO': 'B',
        'CORPORATION': 'C',
        'FEATURES': 'F',
        'INTANGIBLE_HERITAGE_GROUP': 'I',
        'MIXED': 'X',
        'PERSONAL_COLLECTION': 'P',
        'NGO': 'N',
        'TASTE_SMELL': 'T',
        'UNKNOWN': 'U',
    }

    declared = data.get('institution_type', 'MUSEUM')
    if declared in codes_by_type:
        return codes_by_type[declared]
    # Unrecognised type names fall back to the museum code.
    return 'M'


def process_pending_file(filepath: Path, dry_run: bool = True) -> Optional[str]:
    """
    Resolve one PENDING custodian file to a city-based GHCID.

    The organization name is read from ``custodian_name.emic_name``. If a
    known place name is found in it, a GHCID of the form
    ``NL-<province>-<city>-<type>-<abbreviation>`` is built. When a file
    with that name already exists in CUSTODIAN_DIR, a slug of the
    organization name is appended. If that name is taken as well, the file
    is left untouched and an error is reported, because moving onto an
    existing file could silently replace it.

    Args:
        filepath: Path of the PENDING YAML file.
        dry_run: When true, only print what would happen; nothing is written
            or renamed.

    Returns:
        The new GHCID after a successful rename, the string 'dry_run' in
        dry-run mode, or None when the record has no name, no place name is
        found, or an error occurred (errors are printed, not raised).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # An empty file loads as None; report that clearly instead of
        # failing later with an attribute error.
        if not isinstance(data, dict):
            raise ValueError("top-level YAML content is not a mapping")

        # 'custodian_name' may be present but null.
        emic_name = (data.get('custodian_name') or {}).get('emic_name', '')
        if not emic_name:
            return None

        # Try to extract city from name
        result = extract_city_from_name(emic_name)
        if not result:
            return None

        prov, city_code, city_name = result
        inst_type = get_institution_type(data)
        abbrev = extract_abbreviation(emic_name)

        # Generate new GHCID
        new_ghcid = f"NL-{prov}-{city_code.upper()}-{inst_type}-{abbrev}"

        # Check for collision
        new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
        if new_path.exists() and new_path != filepath:
            # Add name suffix for collision
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
            # The suffixed name can be taken too (e.g. several PENDING
            # records for the same organization). Refuse rather than let the
            # move replace an existing custodian file.
            if new_path.exists() and new_path != filepath:
                raise FileExistsError(f"target already exists: {new_path.name}")

        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f"  City found: {city_name}")
            print(f"  -> {new_ghcid}.yaml")
            return 'dry_run'

        # Update data
        data['ghcid_current'] = new_ghcid
        if 'location' not in data:
            data['location'] = {}
        data['location']['city'] = city_name.title()
        data['location']['country'] = 'NL'

        # Add resolution provenance
        if 'ghcid_resolution' not in data:
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'city_name_extraction'
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()
        data['ghcid_resolution']['extracted_city'] = city_name

        # Write the updated record in place, then rename it to the new GHCID.
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        shutil.move(filepath, new_path)
        print(f"[RESOLVED] {emic_name}")
        print(f"  City: {city_name} -> {new_ghcid}.yaml")

        return new_ghcid

    except Exception as e:
        # Batch boundary: report the failure and let the caller continue
        # with the next file.
        print(f"[ERROR] {filepath.name}: {e}")
        return None


def main():
    """Command-line entry point: resolve the NL PENDING files in CUSTODIAN_DIR.

    Options:
        --dry-run   print the planned renames without modifying any file
        --limit N   process at most N files (0, the default, means no limit)
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true',
                        help='show what would be resolved without changing files')
    parser.add_argument('--limit', type=int, default=0,
                        help='process at most this many files (0 = no limit)')
    args = parser.parse_args()

    # Find NL PENDING files only. glob() yields entries in filesystem order,
    # so sort to make --limit and collision handling reproducible.
    pending_files = sorted(CUSTODIAN_DIR.glob("NL-XX-XXX-PENDING-*.yaml"))
    print(f"Found {len(pending_files)} NL PENDING files")

    # Only a positive limit truncates; a negative value would otherwise
    # slice entries off the end of the list.
    if args.limit > 0:
        pending_files = pending_files[:args.limit]

    resolved = 0
    failed = 0  # every file that returned None: no name, no city, or an error

    for filepath in pending_files:
        if process_pending_file(filepath, dry_run=args.dry_run):
            resolved += 1
        else:
            failed += 1

    print(f"\n{'Would resolve' if args.dry_run else 'Resolved'}: {resolved}")
    print(f"No city found: {failed}")


# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()