glam/scripts/enrich_austrian_cities.py

#!/usr/bin/env python3
"""
Enrich Austrian custodian files with city data.

Strategy:
1. Use coordinates for reverse geocoding when available
2. Extract city names from institution names (Wien, Salzburg, Graz, etc.)
3. Validate against GeoNames database

Usage:
    python scripts/enrich_austrian_cities.py [--dry-run]
"""

import re
import sqlite3
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path

# GeoNames admin1 codes for Austria -> region abbreviation used as the second
# component of a GHCID (see update_custodian_file, which falls back to the raw
# admin1 code when a key is missing here).
# NOTE(review): an earlier header described the values as ISO 3166-2:AT codes;
# the official ISO 3166-2:AT subdivision codes are believed to be the digits
# 1-9, not these letters -- confirm which scheme downstream consumers expect.
AUSTRIAN_ADMIN1_MAP = {
    '01': 'B',   # Burgenland
    '02': 'K',   # Carinthia (Kärnten)
    '03': 'NO',  # Lower Austria (Niederösterreich)
    '04': 'OO',  # Upper Austria (Oberösterreich)
    '05': 'S',   # Salzburg
    '06': 'ST',  # Styria (Steiermark)
    '07': 'T',   # Tyrol (Tirol)
    '08': 'V',   # Vorarlberg
    '09': 'W',   # Vienna (Wien)
}

# Known Austrian cities in institution names.
#
# Each entry is (regex, city name to look up in GeoNames). The list is scanned
# top to bottom with re.IGNORECASE and the FIRST matching pattern wins, so
# entries nearer the top take precedence over later ones.
AUSTRIAN_CITY_PATTERNS = [
    # Major cities
    (r'\bWien\b', 'Wien'),
    (r'\bVienna\b', 'Wien'),
    (r'\bGraz\b', 'Graz'),
    (r'\bLinz\b', 'Linz'),
    (r'\bSalzburg\b', 'Salzburg'),
    (r'\bInnsbruck\b', 'Innsbruck'),
    (r'\bKlagenfurt\b', 'Klagenfurt'),
    (r'\bVillach\b', 'Villach'),
    (r'\bWels\b', 'Wels'),
    (r'\bSt\.?\s*Pölten\b', 'Sankt Pölten'),
    (r'\bSankt\s+Pölten\b', 'Sankt Pölten'),
    (r'\bDornbirn\b', 'Dornbirn'),
    (r'\bWiener\s+Neustadt\b', 'Wiener Neustadt'),
    (r'\bSteyr\b', 'Steyr'),
    (r'\bFeldkirch\b', 'Feldkirch'),
    (r'\bBregenz\b', 'Bregenz'),
    (r'\bLeonding\b', 'Leonding'),
    (r'\bKlosterneuburg\b', 'Klosterneuburg'),
    (r'\bBaden\b', 'Baden'),
    (r'\bLeoben\b', 'Leoben'),
    (r'\bKrems\b', 'Krems an der Donau'),
    (r'\bAmstetten\b', 'Amstetten'),
    (r'\bMödling\b', 'Mödling'),
    (r'\bKapfenberg\b', 'Kapfenberg'),
    (r'\bLustenau\b', 'Lustenau'),
    (r'\bHallein\b', 'Hallein'),
    (r'\bKufstein\b', 'Kufstein'),
    (r'\bTraun\b', 'Traun'),
    (r'\bAnsfelden\b', 'Ansfelden'),
    (r'\bHohenems\b', 'Hohenems'),
    (r'\bSchwechat\b', 'Schwechat'),
    (r'\bBraunau\b', 'Braunau am Inn'),
    (r'\bStockerau\b', 'Stockerau'),
    (r'\bSaalfelden\b', 'Saalfelden am Steinernen Meer'),
    (r'\bTernitz\b', 'Ternitz'),
    (r'\bPerchtoldsdorf\b', 'Perchtoldsdorf'),
    (r'\bEisenstädter?\b', 'Eisenstadt'),
    (r'\bEisenstadt\b', 'Eisenstadt'),
    (r'\bTelfs\b', 'Telfs'),
    (r'\bWolfsberg\b', 'Wolfsberg'),
    (r'\bHard\b', 'Hard'),
    (r'\bKorneuburg\b', 'Korneuburg'),
    (r'\bNeunkirchen\b', 'Neunkirchen'),
    (r'\bRied\b', 'Ried im Innkreis'),
    (r'\bBad\s+Ischl\b', 'Bad Ischl'),
    (r'\bGmunden\b', 'Gmunden'),
    (r'\bWörgl\b', 'Wörgl'),
    (r'\bMelk\b', 'Melk'),
    (r'\bZell\s+am\s+See\b', 'Zell am See'),
    (r'\bMistelbach\b', 'Mistelbach'),
    (r'\bVöcklabruck\b', 'Vöcklabruck'),
    (r'\bMarchtrenk\b', 'Marchtrenk'),
    (r'\bEnns\b', 'Enns'),
    (r'\bBruck\s+an\s+der\s+Mur\b', 'Bruck an der Mur'),
    (r'\bSpittal\b', 'Spittal an der Drau'),
    (r'\bSchwaz\b', 'Schwaz'),
    (r'\bVoitsberg\b', 'Voitsberg'),
    (r'\bRankweil\b', 'Rankweil'),
    (r'\bBad\s+Vöslau\b', 'Bad Vöslau'),
    (r'\bTulln\b', 'Tulln an der Donau'),
    (r'\bGänserndorf\b', 'Gänserndorf'),
    (r'\bHollabrunn\b', 'Hollabrunn'),
    (r'\bLienz\b', 'Lienz'),
    (r'\bHall\s+in\s+Tirol\b', 'Hall in Tirol'),
    (r'\bFeldkirchen\b', 'Feldkirchen in Kärnten'),
    (r'\bZwettl\b', 'Zwettl'),
    (r'\bWaidhofen\b', 'Waidhofen an der Ybbs'),
    (r'\bMattersburg\b', 'Mattersburg'),
    (r'\bOberwart\b', 'Oberwart'),
    (r'\bJudenburg\b', 'Judenburg'),
    (r'\bPöchlarn\b', 'Pöchlarn'),
    (r'\bFranziskanerplatz\b', 'Wien'),  # Common Vienna address
    (r'\bJosefsplatz\b', 'Wien'),  # Hofburg, Vienna

    # Regional references → capital cities
    (r'\bTiroler\b', 'Innsbruck'),  # Amt der Tiroler Landesregierung
    (r'\bBurgenländische\b', 'Eisenstadt'),  # Burgenländische Landesbibliothek
    (r'\bKärnt(?:en|ner)\b', 'Klagenfurt'),  # Kärnten/Kärntner → Klagenfurt
    # NOTE(review): every other entry in this group maps to the state capital;
    # Vorarlberg's capital is believed to be Bregenz -- confirm Feldkirch is intended.
    (r'\bVorarlberg(?:er)?\b', 'Feldkirch'),  # Vorarlberg
    (r'\bSteiermark\b', 'Graz'),  # Steiermark
    # No trailing \b here: it would sit between "k" and "i" in "Steiermärkische"
    # and prevent the intended match.
    (r'\bSteiermärk\w*', 'Graz'),  # Steiermärkisch(e/er/es)
    (r'\bOÖ\b', 'Linz'),  # OÖ = Oberösterreich
    (r'\bOberösterreich\b', 'Linz'),  # Oberösterreich
    (r'\bNiederösterreich\b', 'Sankt Pölten'),  # Niederösterreich
    (r'\bNÖ\b', 'Sankt Pölten'),  # NÖ = Niederösterreich
    (r'\bSalzburg(?:er)?\b', 'Salzburg'),  # Salzburger Festspiele

    # Small towns mentioned in institution names
    (r'\bKaltenleutgeben\b', 'Kaltenleutgeben'),
    (r'\bLambach\b', 'Lambach'),
    (r'\bSeitenstetten\b', 'Seitenstetten'),
    (r'\bMattsee\b', 'Mattsee'),
    (r'\bPöggstall\b', 'Pöggstall'),
    (r'\bLaxenburg\b', 'Laxenburg'),
    (r'\bEggenburg\b', 'Eggenburg'),
    (r'\bPressbaum\b', 'Pressbaum'),
    (r'\bSeeburg\b', 'Seekirchen am Wallersee'),  # Schloss Seeburg
    (r'\bSchotten(?:stift)?\b', 'Wien'),  # Schottenstift is in Vienna
    (r'\bAlbertina\b', 'Wien'),  # Albertina is in Vienna
    (r'\bMozarteum\b', 'Salzburg'),  # Mozarteum is in Salzburg
    (r'\bParacelsus\b', 'Salzburg'),  # Paracelsus Medizinische Privatuniversität
    (r'\bJoanneum\b', 'Graz'),  # FH Joanneum is in Graz
    (r'\bParlament\b', 'Wien'),  # Parlamentsbibliothek
    (r'\bBundeskanzleramt\b', 'Wien'),  # Federal Chancellery
    (r'\bBundesministerium\b', 'Wien'),  # Federal Ministries
    (r'\bBundesdenkmalamt\b', 'Wien'),  # Federal Monument Office
    (r'\bÖsterreich(?:ische[rn]?)?\b', 'Wien'),  # Austrian national institutions
    (r'\bIST\s*Austria\b', 'Klosterneuburg'),  # Institute of Science and Technology Austria
    (r'\bInstitute\s+of\s+Science\s+and\s+Technology\s+Austria\b', 'Klosterneuburg'),  # Full name
    (r'\bRapid(?:eum)?\b', 'Wien'),  # SK Rapid Vienna
    (r'\bMetalab\b', 'Wien'),  # Metalab hackerspace Vienna
    (r'\bSigmund\s+Freud\b', 'Wien'),  # Sigmund Freud museum Vienna
    (r'\bMax\s+Perutz\b', 'Wien'),  # Max Perutz Library (Vienna Biocenter)

    # Additional specific institutions
    (r'\bAnton\s+Bruckner\b', 'Linz'),  # Anton Bruckner Private University
    (r'\bbifeb\b', 'Strobl'),  # Bundesinstitut für Erwachsenenbildung
    (r'\bBundesinstitut\s+für\s+Erwachsenenbildung\b', 'Strobl'),
    (r'\bZeitgenossen\b', 'Krems an der Donau'),  # Archiv der Zeitgenossen
    (r'\bCompass[-\s]Verlag\b', 'Wien'),  # Compass-Verlag
    (r'\bErnst\s+Krenek\b', 'Krems an der Donau'),  # Ernst Krenek Institut
    (r'\bFrauensolidarität\b', 'Wien'),  # Frauensolidarität
    (r'\bGeoSphere\b', 'Wien'),  # GeoSphere Austria
    (r'\bHochschule\s+Burgenland\b', 'Eisenstadt'),  # FH Burgenland
    # "+" so the usual spelling "Agrar- und" (hyphen followed by a space) matches.
    (r'\bAgrar[-\s]+und\s+Umweltpädagogik\b', 'Wien'),  # Hochschule für Agrar
    (r'\bHochschule\s+für\s+Agrar\b', 'Wien'),  # Hochschule für Agrar (full)
    (r'\bHöhere\s+Studien\b', 'Wien'),  # IHS
    (r'\bInterdisciplinary\s+Transformation\b', 'Wien'),  # ITU
    (r'\bJAM\s+Music\s+Lab\b', 'Wien'),  # JAM Music Lab
    (r'\bKDZ\b', 'Wien'),  # KDZ Zentrum
    (r'\bNew\s+Design\s+University\b', 'Sankt Pölten'),  # NDU
    (r'\bPädagogische\s+Hochschule\s+Tirol\b', 'Innsbruck'),  # PH Tirol
    (r'\bPädagogische\s+Hochschule\s+Burgenland\b', 'Eisenstadt'),  # PPH Burgenland
    (r'\bShared\s+Archiving\b', 'Wien'),  # SAA
    (r'\bVerbund\s+für\s+Bildung\b', 'Wien'),  # VBKV
    (r'\bVilla\s+North\b', 'Wien'),  # Villa North
    (r'\bInformationswissenschaft\b', 'Graz'),  # VFI
    (r'\bErinnerungskultur\b', 'Villach'),  # ZEG is in Villach, not Graz
    (r'\bParlament(?:s)?(?:direktion|bibliothek)?\b', 'Wien'),  # Parlamentsbibliothek
]


def load_source_data(source_file: str) -> dict:
    """Build an ISIL-keyed lookup from the Austrian source YAML file.

    Args:
        source_file: Path to a YAML document with a top-level
            ``institutions`` list.

    Returns:
        A dict mapping each ISIL identifier value to
        ``{'name': str, 'coords': (latitude, longitude) | None}``.
        Institutions without an ISIL identifier are skipped. When the same
        ISIL appears more than once, the last occurrence wins.
    """
    import yaml

    with open(source_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; treat that as "no data".
        data = yaml.safe_load(f) or {}

    lookup = {}
    # ``or []`` also covers keys that are present but explicitly null.
    for inst in data.get('institutions') or []:
        # Value of the first identifier whose scheme is ISIL, if any.
        isil = next(
            (
                ident.get('identifier_value')
                for ident in inst.get('identifiers') or []
                if ident.get('identifier_scheme') == 'ISIL'
            ),
            None,
        )
        if not isil:
            continue

        # Only the first location is considered, and only if both
        # coordinates are present and truthy.
        locs = inst.get('locations') or []
        coords = None
        if locs and locs[0].get('latitude') and locs[0].get('longitude'):
            coords = (locs[0]['latitude'], locs[0]['longitude'])

        lookup[isil] = {
            'name': inst.get('name', ''),
            'coords': coords,
        }

    return lookup


def extract_city_from_name(name: str) -> str | None:
    """Return the city of the first AUSTRIAN_CITY_PATTERNS entry found in *name*.

    Patterns are tried in list order, case-insensitively; ``None`` is returned
    when no pattern matches.
    """
    candidates = (
        mapped_city
        for regex, mapped_city in AUSTRIAN_CITY_PATTERNS
        if re.search(regex, name, re.IGNORECASE)
    )
    return next(candidates, None)


def generate_city_code(city_name: str) -> str:
    """Derive an upper-case city code (at most three letters) from *city_name*.

    Diacritics are stripped via NFD decomposition and everything except ASCII
    letters, whitespace and hyphens is dropped. The code is then built from
    the whitespace-separated words:

    * one word    -> its first three letters
    * two words   -> first letter of the first + first two of the second
    * otherwise   -> initials of the first three words (empty for no words)
    """
    decomposed = unicodedata.normalize('NFD', city_name)
    without_marks = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    tokens = re.sub(r'[^a-zA-Z\s-]', '', without_marks).split()

    if len(tokens) == 1:
        code = tokens[0][:3]
    elif len(tokens) == 2:
        first, second = tokens
        code = first[0] + second[:2]
    else:
        code = ''.join(token[0] for token in tokens[:3])

    return code.upper()


def reverse_geocode(lat: float, lon: float, conn: sqlite3.Connection) -> dict | None:
    """Reverse geocode coordinates to find nearest Austrian city."""
    cursor = conn.cursor()

    cursor.execute('''
        SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code,
               ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq
        FROM cities
        WHERE country_code = 'AT'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY distance_sq
        LIMIT 1
    ''', (lat, lat, lon, lon))

    row = cursor.fetchone()
    if row:
        return {
            'name': row[0],
            'ascii_name': row[1],
            'admin1_code': row[2],
            'admin1_name': row[3],
            'latitude': row[4],
            'longitude': row[5],
            'geonames_id': row[6],
            'population': row[7],
            'feature_code': row[8],
        }
    return None


def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
    """Look up city in GeoNames database."""
    cursor = conn.cursor()

    # Try exact match
    cursor.execute('''
        SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
        FROM cities
        WHERE country_code = 'AT'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
          AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
        ORDER BY population DESC
        LIMIT 1
    ''', (city_name, city_name))

    row = cursor.fetchone()
    if row:
        return {
            'name': row[0],
            'ascii_name': row[1],
            'admin1_code': row[2],
            'admin1_name': row[3],
            'latitude': row[4],
            'longitude': row[5],
            'geonames_id': row[6],
            'population': row[7],
            'feature_code': row[8],
        }

    # Try fuzzy match
    cursor.execute('''
        SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
        FROM cities
        WHERE country_code = 'AT'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
          AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?))
        ORDER BY population DESC
        LIMIT 1
    ''', (f'{city_name}%', f'{city_name}%'))

    row = cursor.fetchone()
    if row:
        return {
            'name': row[0],
            'ascii_name': row[1],
            'admin1_code': row[2],
            'admin1_name': row[3],
            'latitude': row[4],
            'longitude': row[5],
            'geonames_id': row[6],
            'population': row[7],
            'feature_code': row[8],
        }

    return None


def update_custodian_file(file_path: Path, city_name: str, geo_data: dict, method: str, dry_run: bool = False) -> bool:
    """Rewrite a custodian file (and rename it) with a resolved city.

    The function reads ``ghcid_current`` from the file, rebuilds the GHCID as
    ``AT-<region>-<city>-<type>-<rest>`` from *geo_data*, replaces the old
    GHCID throughout the text, rewrites the ``location_resolution`` mapping,
    prepends an entry to ``ghcid_history`` and finally renames the file so its
    name carries the new GHCID.

    Args:
        file_path: Custodian YAML file to update.
        city_name: City name as resolved by the caller (used in the history
            ``reason`` text only).
        geo_data: GeoNames record with at least ``admin1_code``,
            ``admin1_name``, ``ascii_name``, ``name``, ``geonames_id``,
            ``feature_code``, ``latitude`` and ``longitude``.
        method: Label of the resolution method, written into the file.
        dry_run: When true, report the planned rename and change nothing.

    Returns:
        ``True`` if the file was updated (or would be, in a dry run);
        ``False`` if there is no usable ``ghcid_current``, the GHCID would not
        change, a code could not be derived, or the target filename is
        already taken by another file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    ghcid_match = re.search(r'ghcid_current:\s*(\S+)', content)
    if not ghcid_match:
        return False

    old_ghcid = ghcid_match.group(1)

    region_code = AUSTRIAN_ADMIN1_MAP.get(geo_data['admin1_code'], geo_data['admin1_code'])
    city_code = generate_city_code(geo_data['ascii_name'] or geo_data['name'])
    if not region_code or not city_code:
        # Refuse to build identifiers such as "AT-None-..." or "AT-W--...".
        return False

    # Expected layout: <country>-<region>-<city>-<type>-<abbreviation[-suffix...]>
    parts = old_ghcid.split('-')
    if len(parts) < 5:
        return False
    type_code = parts[3]
    abbrev_and_suffix = '-'.join(parts[4:])
    new_ghcid = f"AT-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}"

    if old_ghcid == new_ghcid:
        return False

    old_filename = file_path.name
    new_filename = old_filename.replace(old_ghcid, new_ghcid)
    new_file_path = file_path.parent / new_filename

    # Path.rename can silently replace an existing file, which would destroy
    # another custodian record that already resolved to the same GHCID.
    if new_file_path != file_path and new_file_path.exists():
        print(f"  SKIP: {old_filename} -> {new_filename} already exists")
        return False

    # One timestamp so resolution_date and the history entry agree.
    timestamp = datetime.now(timezone.utc).isoformat()

    # NOTE(review): this replaces EVERY occurrence of the old GHCID, including
    # any that appear in existing ghcid_history entries -- confirm that
    # rewriting historical entries is intended.
    new_content = content.replace(old_ghcid, new_ghcid)

    # Match the key plus only those following lines that are indented deeper
    # than the key itself, so sibling keys and anything after a blank line
    # are left untouched.
    old_resolution = re.search(
        r'^([ \t]*)location_resolution:[ \t]*\n((?:\1[ \t]+\S.*\n)*)',
        new_content,
        re.MULTILINE,
    )

    if old_resolution:
        key_indent = old_resolution.group(1)
        child_indent = key_indent + '  '
        fields = [
            ('country_code', 'AT'),
            ('region_code', region_code),
            ('region_name', geo_data['admin1_name']),
            ('city_code', city_code),
            ('city_name', geo_data['name']),
            ('geonames_id', geo_data['geonames_id']),
            ('feature_code', geo_data['feature_code']),
            ('latitude', geo_data['latitude']),
            ('longitude', geo_data['longitude']),
            ('method', method),
            ('resolution_date', f"'{timestamp}'"),
        ]
        new_resolution = f"{key_indent}location_resolution:\n" + ''.join(
            f"{child_indent}{key}: {value}\n" for key, value in fields
        )
        new_content = new_content[:old_resolution.start()] + new_resolution + new_content[old_resolution.end():]

    history_entry = f"""  - ghcid: {new_ghcid}
    valid_from: '{timestamp}'
    reason: City enrichment from {method} - {city_name} resolved to {geo_data['name']} ({region_code})
"""

    # The new entry is inserted directly below the key, i.e. as the first item.
    history_match = re.search(r'ghcid_history:\s*\n', new_content)
    if history_match:
        insert_pos = history_match.end()
        new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:]

    if dry_run:
        print(f"  DRY RUN: {old_filename} -> {new_filename}")
        return True

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(new_content)

    if new_file_path != file_path:
        file_path.rename(new_file_path)

    return True


def main():
    """Enrich every ``AT-*-XXX-*.yaml`` custodian file with a resolved city.

    For each file the city is resolved by reverse geocoding the source
    coordinates of its ISIL (when available), otherwise by matching a known
    city pattern in the institution name and validating it against GeoNames.
    Pass ``--dry-run`` on the command line to report planned renames without
    modifying any custodian file. A Markdown summary report is always written
    to the ``reports`` directory.
    """
    dry_run = '--dry-run' in sys.argv

    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    source_file = base_dir / 'data' / 'instances' / 'austria_complete_ch_annotator.yaml'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Austrian City Enrichment Script")
    print("=" * 50)

    if dry_run:
        print("DRY RUN MODE")

    # sqlite3.connect would silently create an empty database at a missing
    # path and every query would then fail; stop early instead.
    if not geonames_db.is_file():
        sys.exit(f"GeoNames database not found: {geonames_db}")

    # Load source data
    print(f"\nLoading source data from {source_file.name}...")
    source_lookup = load_source_data(str(source_file))
    print(f"  Found {len(source_lookup)} ISIL entries")

    coords_count = sum(1 for v in source_lookup.values() if v['coords'])
    print(f"  {coords_count} entries have coordinates")

    print("\nFinding Austrian XXX files...")
    # Sorted so runs process files in a reproducible order.
    xxx_files = sorted(custodian_dir.glob('AT-*-XXX-*.yaml'))
    print(f"  Found {len(xxx_files)} files")

    updated = 0
    by_coords = 0
    by_name = 0
    no_city = 0
    errors = 0

    conn = sqlite3.connect(str(geonames_db))
    try:
        for file_path in xxx_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Find ISIL code. ISILs may contain hyphens, colons and
                # slashes after the country prefix (e.g. AT-UBW-097), and the
                # YAML value may be quoted.
                isil_match = re.search(r'identifier_value:\s*[\'"]?(AT-[\w:/-]+)', content)
                isil_code = isil_match.group(1) if isil_match else None

                # Get institution name
                name_match = re.search(r'claim_value:\s*(.+)', content)
                inst_name = name_match.group(1).strip() if name_match else ''

                geo_data = None
                method = None
                city_name = None

                # Strategy 1: Use coordinates for reverse geocoding
                if isil_code and isil_code in source_lookup:
                    source_data = source_lookup[isil_code]
                    if source_data['coords']:
                        lat, lon = source_data['coords']
                        geo_data = reverse_geocode(lat, lon, conn)
                        if geo_data:
                            method = 'REVERSE_GEOCODE'
                            city_name = geo_data['name']

                # Strategy 2: Extract city from institution name
                if not geo_data:
                    city_name = extract_city_from_name(inst_name)
                    if city_name:
                        geo_data = lookup_city_in_geonames(city_name, conn)
                        if geo_data:
                            method = 'NAME_EXTRACTION'

                if not geo_data:
                    no_city += 1
                    continue

                if update_custodian_file(file_path, city_name, geo_data, method, dry_run):
                    updated += 1
                    # Per-method counts are reported as a breakdown of
                    # "Updated", so count them only for successful updates.
                    if method == 'REVERSE_GEOCODE':
                        by_coords += 1
                    else:
                        by_name += 1
                    if not dry_run:
                        print(f"  Updated: {file_path.name} -> {city_name} ({method})")

            except Exception as e:
                # Top-level boundary: one bad file must not abort the batch.
                errors += 1
                print(f"  ERROR: {file_path.name}: {e}")
    finally:
        conn.close()

    # (label, value, console indent) -- shared by the console summary and the report.
    summary = [
        ('Total XXX files', len(xxx_files), ''),
        ('Updated', updated, ''),
        ('By coordinates', by_coords, '  '),
        ('By name extraction', by_name, '  '),
        ('No city found', no_city, ''),
        ('Errors', errors, ''),
        ('Remaining XXX', len(xxx_files) - updated, ''),
    ]

    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    for label, value, indent in summary:
        print(f"{indent + label + ':':<23}{value}")

    # Generate report
    now = datetime.now()
    timestamp = now.strftime('%Y%m%d_%H%M%S')
    report_path = base_dir / 'reports' / f'AUSTRIAN_CITY_ENRICHMENT_{timestamp}.md'
    # The reports directory may not exist on a fresh checkout.
    report_path.parent.mkdir(parents=True, exist_ok=True)

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Austrian City Enrichment Report\n\n")
        f.write(f"**Date**: {now.isoformat()}\n")
        f.write(f"**Dry Run**: {dry_run}\n\n")
        f.write("## Summary\n\n")
        f.write("| Metric | Count |\n")
        f.write("|--------|-------|\n")
        for label, value, _indent in summary:
            f.write(f"| {label} | {value} |\n")

    print(f"\nReport: {report_path}")


# Run the enrichment only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()