Enrichment scripts for country-specific city data: - enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py - enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py - enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py Location resolution utilities: - resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames - resolve_cities_wikidata.py - Use Wikidata P131 for city resolution - resolve_country_codes.py - Standardize country codes - resolve_cz_xx_regions.py - Fix Czech XX region codes - resolve_locations_by_name.py - Name-based location lookup - resolve_regions_from_city.py - Derive regions from city data - update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data CH-Annotator integration: - create_custodian_from_ch_annotator.py - Create custodians from annotations - add_ch_annotator_location_claims.py - Add location claims - extract_locations_ch_annotator.py - Extract locations from annotations Migration and fixes: - migrate_egyptian_from_ch.py - Migrate Egyptian data - migrate_web_archives.py - Migrate web archive data - fix_belgian_cities.py - Fix Belgian city data
480 lines
16 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Japanese custodian files with city/region data using Google Places API.
|
|
|
|
This script:
|
|
1. Finds Japanese XXX files (no city/region resolved)
|
|
2. Uses Google Places API to search for each institution
|
|
3. Extracts location data (city, prefecture, coordinates)
|
|
4. Updates GHCID with proper region/city codes
|
|
5. Adds Google Maps enrichment data
|
|
|
|
Usage:
|
|
python scripts/enrich_japanese_cities.py [--dry-run] [--limit N]
|
|
|
|
Environment Variables:
|
|
GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API enabled
|
|
"""
|
|
|
|
import argparse
import math
import os
import re
import sqlite3
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import httpx
import yaml
from dotenv import load_dotenv
|
|
|
|
# Load environment variables
|
|
load_dotenv()
|
|
|
|
# Configuration
|
|
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
|
|
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
|
|
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
|
|
|
|
# Google Places API
|
|
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
|
|
REQUEST_DELAY = 0.3 # Rate limiting
|
|
|
|
# Japanese prefecture GeoNames admin1_code to ISO 3166-2:JP mapping
|
|
ADMIN1_TO_ISO = {
|
|
'01': 'AI', # Aichi
|
|
'02': 'AK', # Akita
|
|
'03': 'AO', # Aomori
|
|
'04': 'CH', # Chiba
|
|
'05': 'EH', # Ehime
|
|
'06': 'FI', # Fukui
|
|
'07': 'FO', # Fukuoka
|
|
'08': 'FS', # Fukushima
|
|
'09': 'GI', # Gifu
|
|
'10': 'GU', # Gunma
|
|
'11': 'HS', # Hiroshima
|
|
'12': 'HO', # Hokkaido
|
|
'13': 'HG', # Hyogo
|
|
'14': 'IB', # Ibaraki
|
|
'15': 'IS', # Ishikawa
|
|
'16': 'IW', # Iwate
|
|
'17': 'KA', # Kagawa
|
|
'18': 'KS', # Kagoshima
|
|
'19': 'KN', # Kanagawa
|
|
'20': 'KC', # Kochi
|
|
'21': 'KM', # Kumamoto
|
|
'22': 'KY', # Kyoto
|
|
'23': 'ME', # Mie
|
|
'24': 'MG', # Miyagi
|
|
'25': 'MZ', # Miyazaki
|
|
'26': 'NN', # Nagano
|
|
'27': 'NS', # Nagasaki
|
|
'28': 'NR', # Nara
|
|
'29': 'NI', # Niigata
|
|
'30': 'OT', # Oita
|
|
'31': 'OK', # Okayama
|
|
'32': 'OS', # Osaka
|
|
'33': 'SG', # Saga
|
|
'34': 'ST', # Saitama
|
|
'35': 'SI', # Shiga
|
|
'36': 'SM', # Shimane
|
|
'37': 'SZ', # Shizuoka
|
|
'38': 'TC', # Tochigi
|
|
'39': 'TS', # Tokushima
|
|
'40': 'TK', # Tokyo
|
|
'41': 'TT', # Tottori
|
|
'42': 'TY', # Toyama
|
|
'43': 'WK', # Wakayama
|
|
'44': 'YG', # Yamagata
|
|
'45': 'YM', # Yamaguchi
|
|
'46': 'YN', # Yamanashi
|
|
'47': 'ON', # Okinawa
|
|
}
|
|
|
|
# Reverse mapping for lookup by prefecture name
|
|
PREFECTURE_TO_ISO = {
|
|
'Aichi': 'AI', 'Akita': 'AK', 'Aomori': 'AO', 'Chiba': 'CH', 'Ehime': 'EH',
|
|
'Fukui': 'FI', 'Fukuoka': 'FO', 'Fukushima': 'FS', 'Gifu': 'GI', 'Gunma': 'GU',
|
|
'Hiroshima': 'HS', 'Hokkaido': 'HO', 'Hyogo': 'HG', 'Hyōgo': 'HG',
|
|
'Ibaraki': 'IB', 'Ishikawa': 'IS', 'Iwate': 'IW', 'Kagawa': 'KA',
|
|
'Kagoshima': 'KS', 'Kanagawa': 'KN', 'Kochi': 'KC', 'Kumamoto': 'KM',
|
|
'Kyoto': 'KY', 'Mie': 'ME', 'Miyagi': 'MG', 'Miyazaki': 'MZ',
|
|
'Nagano': 'NN', 'Nagasaki': 'NS', 'Nara': 'NR', 'Niigata': 'NI',
|
|
'Oita': 'OT', 'Okayama': 'OK', 'Osaka': 'OS', 'Saga': 'SG',
|
|
'Saitama': 'ST', 'Shiga': 'SI', 'Shimane': 'SM', 'Shizuoka': 'SZ',
|
|
'Tochigi': 'TC', 'Tokushima': 'TS', 'Tokyo': 'TK', 'Tottori': 'TT',
|
|
'Toyama': 'TY', 'Wakayama': 'WK', 'Yamagata': 'YG', 'Yamaguchi': 'YM',
|
|
'Yamanashi': 'YN', 'Okinawa': 'ON',
|
|
# Alternative spellings from address strings
|
|
'Tokyo To': 'TK', 'Osaka Fu': 'OS', 'Kyoto Fu': 'KY', 'Hokkaido': 'HO',
|
|
'Aichi Ken': 'AI', 'Hyogo Ken': 'HG', 'Kanagawa Ken': 'KN',
|
|
}
|
|
|
|
|
|
def get_city_code(city_name: str) -> str:
|
|
"""Generate 3-letter city code from city name."""
|
|
# Clean suffixes common in Japanese city names
|
|
name = city_name.strip()
|
|
for suffix in [' Shi', ' Ku', ' Cho', ' Machi', ' Mura', ' Gun', ' City', '-shi', '-ku']:
|
|
if name.endswith(suffix):
|
|
name = name[:-len(suffix)]
|
|
|
|
words = name.split()
|
|
|
|
if len(words) == 1:
|
|
return name[:3].upper()
|
|
elif len(words) == 2:
|
|
return (words[0][0] + words[1][:2]).upper()
|
|
else:
|
|
return ''.join(w[0] for w in words[:3]).upper()
|
|
|
|
|
|
def search_google_places(query: str, api_key: str, country_bias: str = "JP") -> Optional[dict]:
|
|
"""Search Google Places API for a location."""
|
|
headers = {
|
|
"Content-Type": "application/json",
|
|
"X-Goog-Api-Key": api_key,
|
|
"X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.location,places.addressComponents,places.types,places.id,places.websiteUri"
|
|
}
|
|
|
|
payload = {
|
|
"textQuery": query,
|
|
"languageCode": "en"
|
|
}
|
|
|
|
try:
|
|
response = httpx.post(TEXT_SEARCH_URL, json=payload, headers=headers, timeout=30)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
if "places" in data and len(data["places"]) > 0:
|
|
return data["places"][0]
|
|
return None
|
|
except Exception as e:
|
|
print(f" Error searching Google Places: {e}")
|
|
return None
|
|
|
|
|
|
def extract_location_from_google(place: dict) -> dict:
|
|
"""Extract location information from Google Places result."""
|
|
result = {
|
|
'city': None,
|
|
'prefecture': None,
|
|
'prefecture_code': None,
|
|
'latitude': None,
|
|
'longitude': None,
|
|
'formatted_address': None,
|
|
'place_id': None,
|
|
'website': None,
|
|
}
|
|
|
|
if not place:
|
|
return result
|
|
|
|
result['place_id'] = place.get('id')
|
|
result['formatted_address'] = place.get('formattedAddress')
|
|
result['website'] = place.get('websiteUri')
|
|
|
|
# Get coordinates
|
|
location = place.get('location', {})
|
|
result['latitude'] = location.get('latitude')
|
|
result['longitude'] = location.get('longitude')
|
|
|
|
# Parse address components
|
|
components = place.get('addressComponents', [])
|
|
for comp in components:
|
|
types = comp.get('types', [])
|
|
long_name = comp.get('longText', '')
|
|
|
|
if 'locality' in types:
|
|
result['city'] = long_name
|
|
elif 'administrative_area_level_1' in types:
|
|
result['prefecture'] = long_name
|
|
# Try to get ISO code
|
|
result['prefecture_code'] = PREFECTURE_TO_ISO.get(long_name)
|
|
elif 'sublocality_level_1' in types and not result['city']:
|
|
# Use ward/sublocality as city if no locality
|
|
result['city'] = long_name
|
|
|
|
return result
|
|
|
|
|
|
def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float) -> Optional[dict]:
    """Reverse geocode coordinates to the nearest Japanese populated place.

    Nearest-neighbour scan over the GeoNames ``cities`` table using an
    equirectangular approximation: the squared longitude difference is
    weighted by cos(lat)^2 so east-west degrees are not over-counted at
    Japanese latitudes (the previous unweighted form could pick the wrong
    neighbour near prefecture borders).

    Args:
        conn: Open connection to the GeoNames SQLite database.
        lat, lon: WGS84 coordinates to resolve.

    Returns:
        Dict with name, ascii_name, admin1 code/name, geonames_id,
        coordinates, population and feature_code of the closest match,
        or None when the table holds no Japanese populated places.
    """
    cursor = conn.cursor()

    # Longitude degrees shrink by cos(latitude); squared because the query
    # compares squared distances.
    lon_weight = math.cos(math.radians(lat)) ** 2

    cursor.execute("""
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?) + ? * (longitude - ?) * (longitude - ?)) as dist_sq
        FROM cities
        WHERE country_code = 'JP'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY dist_sq
        LIMIT 1
    """, (lat, lat, lon_weight, lon, lon))

    row = cursor.fetchone()
    if row:
        return {
            'name': row[0],
            'ascii_name': row[1],
            'admin1_code': row[2],
            'admin1_name': row[3],
            'geonames_id': row[4],
            'latitude': row[5],
            'longitude': row[6],
            'population': row[7],
            'feature_code': row[8],
        }
    return None
|
|
|
|
|
|
def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str, dry_run: bool = False) -> dict:
    """Resolve the location of a single Japanese custodian YAML file.

    Pipeline: load the YAML, confirm the GHCID still carries the JP-XX-XXX
    placeholder, text-search the institution on Google Places, reverse
    geocode the returned coordinates against GeoNames, then rewrite the
    GHCID with real region/city codes, record enrichment metadata, and
    rename the file to match the new GHCID.

    Args:
        filepath: Path to the custodian YAML file.
        conn: Open connection to the GeoNames SQLite database.
        api_key: Google Places API key.
        dry_run: When True, compute the new GHCID but write nothing.

    Returns:
        Result dict with 'status' in {'updated', 'would_update', 'skipped',
        'error', 'collision'}, plus old/new GHCID, city, prefecture and an
        'error' message where applicable ('new_file' is added after a
        successful rename).
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'prefecture': None,
        'error': None,
    }

    # Load the YAML; any parse/IO failure is reported, never raised.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f'Failed to load YAML: {e}'
        return result

    if not data:
        result['status'] = 'error'
        result['error'] = 'Empty YAML file'
        return result

    # Get current GHCID
    ghcid_data = data.get('ghcid', {})
    old_ghcid = ghcid_data.get('ghcid_current', '')
    result['old_ghcid'] = old_ghcid

    # Only files still carrying the unresolved region/city placeholder
    # (JP-XX-XXX-...) are candidates; everything else is skipped.
    if not old_ghcid.startswith('JP-XX-XXX-'):
        result['status'] = 'skipped'
        result['error'] = 'Not a JP-XX-XXX file'
        return result

    # Get institution name for search: prefer the custodian_name claim,
    # fall back to the original catalog entry.
    name = data.get('custodian_name', {}).get('claim_value', '')
    if not name:
        name = data.get('original_entry', {}).get('name', '')

    if not name:
        result['status'] = 'error'
        result['error'] = 'No institution name found'
        return result

    # Search Google Places; " Japan" is appended to bias the text search.
    print(f" Searching: {name[:50]}...")
    place = search_google_places(f"{name} Japan", api_key)
    time.sleep(REQUEST_DELAY)  # rate-limit between API calls

    if not place:
        result['status'] = 'error'
        result['error'] = 'Not found in Google Places'
        return result

    # Extract location
    location_info = extract_location_from_google(place)

    if not location_info['latitude'] or not location_info['longitude']:
        result['status'] = 'error'
        result['error'] = 'No coordinates from Google'
        return result

    # Reverse geocode Google's coordinates against GeoNames to obtain the
    # canonical city record (name + admin1 code).
    city_info = lookup_city_geonames(conn, location_info['latitude'], location_info['longitude'])

    if not city_info:
        result['status'] = 'error'
        result['error'] = 'City not found in GeoNames'
        return result

    # Determine region code from the GeoNames admin1 code; fall back to the
    # prefecture parsed out of Google's address components.
    admin1_code = city_info['admin1_code']
    region_code = ADMIN1_TO_ISO.get(admin1_code, 'XX')

    if region_code == 'XX':
        # Try from Google address
        region_code = location_info.get('prefecture_code', 'XX')

    # Generate city code
    city_code = get_city_code(city_info['ascii_name'])

    result['city'] = city_info['ascii_name']
    result['prefecture'] = city_info['admin1_name']

    # Build new GHCID: JP-<region>-<city>-<inst_type>-<abbreviation>.
    # The abbreviation may itself contain hyphens, hence the re-join.
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        inst_type = parts[3]
        abbreviation = '-'.join(parts[4:])
    else:
        result['status'] = 'error'
        result['error'] = f'Invalid GHCID format: {old_ghcid}'
        return result

    new_ghcid = f'JP-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    if dry_run:
        result['status'] = 'would_update'
        return result

    # Update the data (single timestamp reused for all provenance fields)
    timestamp = datetime.now(timezone.utc).isoformat()

    # Update ghcid section with the resolved ID and full provenance.
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': 'JP',
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'resolution_date': timestamp,
    }

    # Add Google Maps enrichment
    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }

    # Update location in original_entry (first location entry only).
    if 'original_entry' in data and 'locations' in data['original_entry']:
        if data['original_entry']['locations']:
            data['original_entry']['locations'][0]['city'] = city_info['ascii_name']
            data['original_entry']['locations'][0]['region'] = city_info['admin1_name']
            if location_info['latitude']:
                data['original_entry']['locations'][0]['latitude'] = location_info['latitude']
                data['original_entry']['locations'][0]['longitude'] = location_info['longitude']

    # Add to GHCID history
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []

    # Close the still-open history entry for the old GHCID before
    # appending the new one.
    for entry in data['ghcid']['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp

    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': data['ghcid'].get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })

    # Update identifiers: keep the GHCID identifier entry in sync.
    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid

    # Write updated data back in place (still under the old filename).
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file to match the new GHCID.
    # NOTE(review): the YAML above is rewritten with the new GHCID *before*
    # the collision check below, so a collision leaves an updated file under
    # the old name — confirm this partial-update state is intended.
    new_filename = f'{new_ghcid}.yaml'
    new_filepath = filepath.parent / new_filename

    if filepath != new_filepath and not new_filepath.exists():
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)
    elif new_filepath.exists() and filepath != new_filepath:
        result['status'] = 'collision'
        result['error'] = f'Target file exists: {new_filepath.name}'
        return result

    result['status'] = 'updated'
    return result
|
|
|
|
|
|
def main():
    """CLI entry point: resolve all JP-XX-XXX custodian files in one batch."""
    parser = argparse.ArgumentParser(description='Enrich Japanese custodian files with Google Places data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    args = parser.parse_args()

    # Fail fast on missing prerequisites (API key and GeoNames database).
    if not GOOGLE_PLACES_TOKEN:
        print("ERROR: GOOGLE_PLACES_TOKEN environment variable is required")
        print("Set it in .env file or export GOOGLE_PLACES_TOKEN=...")
        sys.exit(1)

    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    # Candidates are files whose GHCID still carries the unresolved
    # region/city placeholders.
    targets = sorted(CUSTODIAN_DIR.glob('JP-XX-XXX-*.yaml'))
    if args.limit:
        targets = targets[:args.limit]

    print(f"Found {len(targets)} Japanese XXX files")
    print(f"Dry run: {args.dry_run}")
    print()

    db = sqlite3.connect(str(GEONAMES_DB))

    tally = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
    failures = []

    for target in targets:
        print(f"Processing: {target.name}")
        outcome = process_file(target, db, GOOGLE_PLACES_TOKEN, dry_run=args.dry_run)
        status = outcome['status']
        tally[status] = tally.get(status, 0) + 1

        if status in ('updated', 'would_update'):
            print(f" ✓ {outcome['city']} ({outcome['prefecture']}): {outcome['old_ghcid']} → {outcome['new_ghcid']}")
        elif status == 'error':
            print(f" ✗ {outcome['error']}")
            failures.append(outcome)
        elif status == 'collision':
            print(f" ⚠ {outcome['error']}")

    db.close()

    print()
    print('=' * 60)
    print('Summary:')
    print(f" Updated: {tally.get('updated', 0)}")
    print(f" Would update: {tally.get('would_update', 0)}")
    print(f" Errors: {tally.get('error', 0)}")
    print(f" Collisions: {tally.get('collision', 0)}")
    print(f" Skipped: {tally.get('skipped', 0)}")

    # Only the first ten failures are echoed to keep the report short.
    if failures:
        print()
        print('Files with errors (may need manual research):')
        for failed in failures[:10]:
            print(f" - {Path(failed['file']).name}: {failed['error']}")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|