glam/scripts/enrich_cities_google.py
2025-12-30 23:07:03 +01:00

608 lines
20 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich custodian files with city/region data using Google Places API.
This is a generic script that works for any country's XXX files.
Usage:
python scripts/enrich_cities_google.py --country KR [--dry-run] [--limit N]
python scripts/enrich_cities_google.py --country AR [--dry-run] [--limit N]
python scripts/enrich_cities_google.py --all [--dry-run] [--limit N]
Environment Variables:
GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API enabled
"""
import argparse
import math
import os
import re
import sqlite3
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import httpx
import yaml
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
# Paths keep their original hard-coded defaults but can now be overridden via
# environment variables, so the script is portable across machines/CI.
GEONAMES_DB = Path(os.getenv(
    "GLAM_GEONAMES_DB", "/Users/kempersc/apps/glam/data/reference/geonames.db"))
CUSTODIAN_DIR = Path(os.getenv(
    "GLAM_CUSTODIAN_DIR", "/Users/kempersc/apps/glam/data/custodian"))
# Google Places API (New) text-search endpoint
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
REQUEST_DELAY = 0.3  # seconds to sleep between API calls (client-side rate limit)
# ISO 3166-1 alpha-2 code -> English country name, appended to Places queries
# to bias the text search toward the right country. Sorted by code.
COUNTRY_NAMES = {
    'AR': 'Argentina',
    'BR': 'Brazil',
    'CH': 'Switzerland',
    'CL': 'Chile',
    'CO': 'Colombia',
    'CU': 'Cuba',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'DK': 'Denmark',
    'EC': 'Ecuador',
    'EE': 'Estonia',
    'FR': 'France',
    'GB': 'United Kingdom',
    'IN': 'India',
    'IR': 'Iran',
    'IT': 'Italy',
    'JM': 'Jamaica',
    'JP': 'Japan',
    'KR': 'South Korea',
    'LB': 'Lebanon',
    'LK': 'Sri Lanka',
    'ML': 'Mali',
    'MX': 'Mexico',
    'NA': 'Namibia',
    'NL': 'Netherlands',
    'OM': 'Oman',
    'PH': 'Philippines',
    'TJ': 'Tajikistan',
    'UA': 'Ukraine',
    'US': 'United States',
    'UZ': 'Uzbekistan',
}
def get_city_code(city_name: str) -> str:
    """Derive a short (up to 3-letter) uppercase city code from a city name.

    One word -> first three letters; two words -> initial of the first plus
    two letters of the second; three or more -> initials of the first three.
    """
    base = city_name.strip()
    # Drop well-known administrative suffixes in a single ordered pass.
    for tail in (' City', ' Town', '-shi', '-ku', '-gun', '-cho', ' District'):
        if base.endswith(tail):
            base = base[:-len(tail)]
    parts = base.split()
    if len(parts) == 1:
        return base[:3].upper()
    if len(parts) == 2:
        return (parts[0][0] + parts[1][:2]).upper()
    return ''.join(p[0] for p in parts[:3]).upper()
def search_google_places(query: str, api_key: str) -> Optional[dict]:
    """Run a Places API text search and return the top result, or None.

    Any transport/HTTP/parse failure is reported to stdout and swallowed so
    a single bad lookup never aborts a long batch run.
    """
    request_headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.location,places.addressComponents,places.types,places.id,places.websiteUri"
    }
    body = {
        "textQuery": query,
        "languageCode": "en"
    }
    try:
        resp = httpx.post(TEXT_SEARCH_URL, json=body, headers=request_headers, timeout=30)
        resp.raise_for_status()
        places = resp.json().get("places") or []
        return places[0] if places else None
    except Exception as exc:
        print(f" Error searching Google Places: {exc}")
        return None
def extract_location_from_google(place: dict) -> dict:
    """Flatten a Google Places result into a simple location dict.

    Missing fields are left as None. 'city' prefers the locality component,
    falling back to sublocality_level_1; 'country_code' carries the short
    country code used by callers to validate the result's country.
    """
    info = dict.fromkeys(
        ('city', 'region', 'latitude', 'longitude',
         'formatted_address', 'place_id', 'website', 'country_code'))
    if not place:
        return info
    info['place_id'] = place.get('id')
    info['formatted_address'] = place.get('formattedAddress')
    info['website'] = place.get('websiteUri')
    coords = place.get('location', {})
    info['latitude'] = coords.get('latitude')
    info['longitude'] = coords.get('longitude')
    for component in place.get('addressComponents', []):
        kinds = component.get('types', [])
        long_text = component.get('longText', '')
        if 'locality' in kinds:
            info['city'] = long_text
        elif 'administrative_area_level_1' in kinds:
            info['region'] = long_text
        elif 'sublocality_level_1' in kinds and not info['city']:
            info['city'] = long_text
        elif 'country' in kinds:
            # Short code (e.g. 'EC') used for country validation downstream.
            info['country_code'] = component.get('shortText', '')
    return info
def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float, country_code: str) -> Optional[dict]:
    """Reverse geocode coordinates to find the nearest city in GeoNames.

    FIX: the previous metric summed raw squared degree deltas, which
    overweights east-west distance away from the equator (a degree of
    longitude shrinks with cos(latitude)). The longitude term is now scaled
    by cos(lat)^2 — an equirectangular approximation that is accurate enough
    for nearest-city selection.

    Returns a dict of city attributes, or None when the country has no
    matching populated places in the database.
    """
    # Squared scale factor applied to the longitude term of the metric.
    lon_scale_sq = math.cos(math.radians(lat)) ** 2
    cursor = conn.cursor()
    cursor.execute("""
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?) +
                (longitude - ?) * (longitude - ?) * ?) as dist_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY dist_sq
        LIMIT 1
    """, (lat, lat, lon, lon, lon_scale_sq, country_code))
    row = cursor.fetchone()
    if row is None:
        return None
    keys = ('name', 'ascii_name', 'admin1_code', 'admin1_name', 'geonames_id',
            'latitude', 'longitude', 'population', 'feature_code')
    return dict(zip(keys, row))
# Brazil: GeoNames admin1_code → ISO 3166-2:BR state code
BRAZIL_STATE_CODES = {
    '01': 'AC',  # Acre
    '02': 'AL',  # Alagoas
    '03': 'AP',  # Amapá
    '04': 'AM',  # Amazonas
    '05': 'BA',  # Bahia
    '06': 'CE',  # Ceará
    '07': 'DF',  # Federal District (Distrito Federal)
    '08': 'ES',  # Espírito Santo
    '11': 'MS',  # Mato Grosso do Sul
    '13': 'MA',  # Maranhão
    '14': 'MT',  # Mato Grosso
    '15': 'MG',  # Minas Gerais
    '16': 'PA',  # Pará
    '17': 'PB',  # Paraíba
    '18': 'PR',  # Paraná
    '20': 'PI',  # Piauí
    '21': 'RJ',  # Rio de Janeiro
    '22': 'RN',  # Rio Grande do Norte
    '23': 'RS',  # Rio Grande do Sul
    '24': 'RO',  # Rondônia
    '25': 'RR',  # Roraima
    '26': 'SC',  # Santa Catarina
    '27': 'SP',  # São Paulo
    '28': 'SE',  # Sergipe
    '29': 'GO',  # Goiás
    '30': 'PE',  # Pernambuco
    '31': 'TO',  # Tocantins
}
# Switzerland: GeoNames admin1_code → ISO 3166-2:CH canton code
# (GeoNames already uses the canton abbreviation, so this is an identity map.)
SWITZERLAND_CANTON_CODES = {
    'AG': 'AG',  # Aargau
    'AI': 'AI',  # Appenzell Innerrhoden
    'AR': 'AR',  # Appenzell Ausserrhoden
    'BE': 'BE',  # Bern
    'BL': 'BL',  # Basel-Landschaft
    'BS': 'BS',  # Basel-Stadt
    'FR': 'FR',  # Fribourg
    'GE': 'GE',  # Geneva
    'GL': 'GL',  # Glarus
    'GR': 'GR',  # Graubünden
    'JU': 'JU',  # Jura
    'LU': 'LU',  # Lucerne
    'NE': 'NE',  # Neuchâtel
    'NW': 'NW',  # Nidwalden
    'OW': 'OW',  # Obwalden
    'SG': 'SG',  # St. Gallen
    'SH': 'SH',  # Schaffhausen
    'SO': 'SO',  # Solothurn
    'SZ': 'SZ',  # Schwyz
    'TG': 'TG',  # Thurgau
    'TI': 'TI',  # Ticino
    'UR': 'UR',  # Uri
    'VD': 'VD',  # Vaud
    'VS': 'VS',  # Valais
    'ZG': 'ZG',  # Zug
    'ZH': 'ZH',  # Zürich
}
# Mexico: GeoNames admin1_code → ISO 3166-2:MX state code
MEXICO_STATE_CODES = {
    '01': 'AGU',  # Aguascalientes
    '02': 'BCN',  # Baja California
    '03': 'BCS',  # Baja California Sur
    '04': 'CAM',  # Campeche
    '05': 'COA',  # Coahuila
    '06': 'COL',  # Colima
    '07': 'CHP',  # Chiapas
    '08': 'CHH',  # Chihuahua
    '09': 'CMX',  # Ciudad de México (CDMX)
    '10': 'DUR',  # Durango
    '11': 'GUA',  # Guanajuato
    '12': 'GRO',  # Guerrero
    '13': 'HID',  # Hidalgo
    '14': 'JAL',  # Jalisco
    '15': 'MEX',  # México (State of Mexico)
    '16': 'MIC',  # Michoacán
    '17': 'MOR',  # Morelos
    '18': 'NAY',  # Nayarit
    '19': 'NLE',  # Nuevo León
    '20': 'OAX',  # Oaxaca
    '21': 'PUE',  # Puebla
    '22': 'QUE',  # Querétaro
    '23': 'ROO',  # Quintana Roo
    '24': 'SLP',  # San Luis Potosí
    '25': 'SIN',  # Sinaloa
    '26': 'SON',  # Sonora
    '27': 'TAB',  # Tabasco
    '28': 'TAM',  # Tamaulipas
    '29': 'TLA',  # Tlaxcala
    '30': 'VER',  # Veracruz
    '31': 'YUC',  # Yucatán
    '32': 'ZAC',  # Zacatecas
}
# Chile: GeoNames admin1_code → ISO 3166-2:CL region code
CHILE_REGION_CODES = {
    '01': 'TA',  # Tarapacá
    '02': 'AN',  # Antofagasta
    '03': 'AT',  # Atacama
    '04': 'CO',  # Coquimbo
    '05': 'VS',  # Valparaíso
    '06': 'LI',  # Libertador General Bernardo O'Higgins
    '07': 'ML',  # Maule
    '08': 'BI',  # Biobío
    '09': 'AR',  # La Araucanía
    '10': 'LL',  # Los Lagos
    '11': 'AI',  # Aisén del General Carlos Ibáñez del Campo
    '12': 'MA',  # Magallanes y de la Antártica Chilena
    '13': 'RM',  # Región Metropolitana de Santiago
    '14': 'LR',  # Los Ríos
    '15': 'AP',  # Arica y Parinacota
    '16': 'NB',  # Ñuble
}
def get_region_code(admin1_code: str, country_code: str, admin1_name: str) -> str:
    """Map a GeoNames admin1_code to an ISO-style region code.

    Resolution order: country-specific lookup table (BR/CH/MX/CL); the
    admin1_code itself when it is already short (<= 3 chars); otherwise an
    abbreviation built from the region name; 'XX' when no code is available.
    """
    if not admin1_code:
        return 'XX'
    # Countries with explicit GeoNames -> ISO translation tables.
    per_country = {
        'BR': BRAZIL_STATE_CODES,
        'CH': SWITZERLAND_CANTON_CODES,
        'MX': MEXICO_STATE_CODES,
        'CL': CHILE_REGION_CODES,
    }.get(country_code)
    if per_country and admin1_code in per_country:
        return per_country[admin1_code]
    # Short GeoNames codes are usable directly for most countries.
    if len(admin1_code) <= 3:
        return admin1_code.upper()
    # Fall back to abbreviating the region name.
    if admin1_name:
        name_words = admin1_name.split()
        if len(name_words) == 1:
            return admin1_name[:2].upper()
        return ''.join(w[0] for w in name_words[:2]).upper()
    return admin1_code[:2].upper()
def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str,
                 country_code: str, country_name: str, dry_run: bool = False) -> dict:
    """Process a single custodian file: resolve its XXX city placeholder.

    Pipeline: load YAML -> confirm the GHCID still carries the XXX city
    code -> Google Places text search on the institution name -> validate
    the result's country -> reverse geocode via GeoNames -> build the new
    GHCID, update history/enrichment sections, write the file and rename
    it to match the new GHCID.

    Returns a result dict whose 'status' is one of 'updated',
    'would_update' (dry run), 'skipped', 'error', or 'collision'.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'region': None,
        'error': None,
    }
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f'Failed to load YAML: {e}'
        return result
    if not data:
        result['status'] = 'error'
        result['error'] = 'Empty YAML file'
        return result
    # FIX: `or {}` also guards against an explicit `ghcid: null` in the YAML.
    ghcid_data = data.get('ghcid') or {}
    old_ghcid = ghcid_data.get('ghcid_current', '')
    result['old_ghcid'] = old_ghcid
    # Match patterns with XXX city code:
    # - {country}-XX-XXX-... (2-letter region like XX, BE, GE)
    # - {country}-10-XXX-... (2-digit region like 10, 52, 37)
    # - {country}-UKM-XXX-... (3-letter region like UKM, IDF, CMX)
    xxx_pattern = re.compile(rf'^{country_code}-[A-Z0-9]{{2,3}}-XXX-')
    if not xxx_pattern.match(old_ghcid):
        result['status'] = 'skipped'
        result['error'] = f'Not a {country_code}-*-XXX file'
        return result
    # Get institution name (claimed name preferred, original entry as fallback)
    name = data.get('custodian_name', {}).get('claim_value', '')
    if not name:
        name = data.get('original_entry', {}).get('name', '')
    if not name:
        result['status'] = 'error'
        result['error'] = 'No institution name found'
        return result
    # Search Google Places (rate-limited)
    search_query = f"{name} {country_name}"
    print(f" Searching: {name[:50]}...")
    place = search_google_places(search_query, api_key)
    time.sleep(REQUEST_DELAY)
    if not place:
        result['status'] = 'error'
        result['error'] = 'Not found in Google Places'
        return result
    location_info = extract_location_from_google(place)
    # FIX: compare against None explicitly -- 0.0 is a valid latitude
    # (equator) / longitude (Greenwich) but is falsy, so the old truthiness
    # test wrongly rejected such coordinates (Ecuador is in scope here).
    if location_info['latitude'] is None or location_info['longitude'] is None:
        result['status'] = 'error'
        result['error'] = 'No coordinates from Google'
        return result
    # CRITICAL: Validate that Google returned a result in the expected country
    # This prevents data contamination from similarly-named institutions in other countries
    google_country = location_info.get('country_code')
    if google_country and google_country != country_code:
        result['status'] = 'error'
        result['error'] = f'COUNTRY MISMATCH: Google returned {google_country}, expected {country_code}. Address: {location_info.get("formatted_address", "unknown")}'
        return result
    # Lookup in GeoNames
    city_info = lookup_city_geonames(conn, location_info['latitude'],
                                     location_info['longitude'], country_code)
    if not city_info:
        result['status'] = 'error'
        result['error'] = 'City not found in GeoNames'
        return result
    region_code = get_region_code(city_info['admin1_code'], country_code, city_info['admin1_name'])
    city_code = get_city_code(city_info['ascii_name'])
    result['city'] = city_info['ascii_name']
    result['region'] = city_info['admin1_name']
    # Build new GHCID, keeping institution type and abbreviation from the old one
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        inst_type = parts[3]
        abbreviation = '-'.join(parts[4:])
    else:
        result['status'] = 'error'
        result['error'] = f'Invalid GHCID format: {old_ghcid}'
        return result
    new_ghcid = f'{country_code}-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid
    if dry_run:
        result['status'] = 'would_update'
        return result
    # FIX: detect a filename collision BEFORE mutating the file. Previously
    # the YAML was rewritten with the new GHCID first, and only the rename
    # was skipped -- leaving the colliding file's content inconsistent with
    # its filename. Now a colliding file is left completely untouched.
    new_filename = f'{new_ghcid}.yaml'
    new_filepath = filepath.parent / new_filename
    if filepath != new_filepath and new_filepath.exists():
        result['status'] = 'collision'
        result['error'] = f'Target file exists: {new_filepath.name}'
        return result
    # Update the data
    timestamp = datetime.now(timezone.utc).isoformat()
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': country_code,
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'resolution_date': timestamp,
    }
    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }
    # Update GHCID history: close the still-open old entry, append the new one
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []
    for entry in data['ghcid']['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp
    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': data['ghcid'].get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })
    # Keep any GHCID identifier entry in sync with the new value
    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid
    # Write the updated YAML, then rename to match the new GHCID
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    if filepath != new_filepath:
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)
    result['status'] = 'updated'
    return result
def main():
    """CLI entry point: enrich XXX-city custodian files per country.

    Requires GOOGLE_PLACES_TOKEN and an existing GeoNames SQLite database;
    exits with status 1 when either precondition is missing.
    """
    parser = argparse.ArgumentParser(description='Enrich custodian files with Google Places data')
    parser.add_argument('--country', type=str, help='Country code (e.g., KR, AR, US)')
    parser.add_argument('--all', action='store_true', help='Process all countries with XXX files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, help='Limit number of files per country')
    args = parser.parse_args()
    if not GOOGLE_PLACES_TOKEN:
        print("ERROR: GOOGLE_PLACES_TOKEN environment variable is required")
        sys.exit(1)
    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)
    # Determine which countries to process
    if args.all:
        # Find all countries with XXX files (either XX-XXX or {region}-XXX)
        countries = sorted({
            f.name[:2] for f in CUSTODIAN_DIR.glob('*-*-XXX-*.yaml')
            if f.name[:2] in COUNTRY_NAMES
        })
    elif args.country:
        countries = [args.country.upper()]
    else:
        print("ERROR: Specify --country CODE or --all")
        sys.exit(1)
    conn = sqlite3.connect(str(GEONAMES_DB))
    total_stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
    # FIX: close the SQLite connection even if processing raises.
    try:
        for country_code in countries:
            country_name = COUNTRY_NAMES.get(country_code) or country_code
            files = sorted(CUSTODIAN_DIR.glob(f'{country_code}-*-XXX-*.yaml'))
            if args.limit:
                files = files[:args.limit]
            if not files:
                continue
            print(f"\n{'='*60}")
            print(f"Processing {country_code} ({country_name}): {len(files)} files")
            print('='*60)
            stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
            for filepath in files:
                print(f"Processing: {filepath.name}")
                result = process_file(filepath, conn, GOOGLE_PLACES_TOKEN,
                                      country_code, country_name, dry_run=args.dry_run)
                # .get() keeps this robust to any future status values.
                stats[result['status']] = stats.get(result['status'], 0) + 1
                if result['status'] in ('updated', 'would_update'):
                    print(f"{result['city']} ({result['region']}): {result['old_ghcid']}{result['new_ghcid']}")
                elif result['status'] == 'error':
                    print(f"{result['error']}")
                elif result['status'] == 'collision':
                    print(f"{result['error']}")
            print(f"\n{country_code} Summary: Updated={stats.get('updated', 0)}, "
                  f"Would update={stats.get('would_update', 0)}, "
                  f"Errors={stats.get('error', 0)}")
            for k, v in stats.items():
                total_stats[k] = total_stats.get(k, 0) + v
    finally:
        conn.close()
    print()
    print('='*60)
    print('TOTAL Summary:')
    print(f" Updated: {total_stats.get('updated', 0)}")
    print(f" Would update: {total_stats.get('would_update', 0)}")
    print(f" Errors: {total_stats.get('error', 0)}")
    print(f" Collisions: {total_stats.get('collision', 0)}")
    print(f" Skipped: {total_stats.get('skipped', 0)}")
if __name__ == '__main__':
main()