glam/scripts/geocode_from_city_name.py

#!/usr/bin/env python3
"""
Geocode custodian files by looking up city names in GeoNames database.
Handles various city name formats and country-specific patterns.

Usage:
    python scripts/geocode_from_city_name.py --country CH
    python scripts/geocode_from_city_name.py --country CZ
    python scripts/geocode_from_city_name.py  # All countries
"""

import sqlite3
import re
from pathlib import Path
from datetime import datetime, timezone
from ruamel.yaml import YAML

# Path of the SQLite database that main() opens and that is queried for
# city coordinates (relative to the current working directory).
GEONAMES_DB = Path("data/reference/geonames.db")
# Directory that main() globs for custodian ``*.yaml`` files.
CUSTODIAN_DIR = Path("data/custodian")

# Shared ruamel.yaml instance used both to load and to rewrite custodian files.
yaml = YAML()
# Keep the original quoting style of scalars when a file is written back.
yaml.preserve_quotes = True
# Very large line width so long values are not re-wrapped on dump.
yaml.width = 4096


def normalize_city_name(city: str, country: str) -> list[str]:
    """Build the ordered list of spellings to try when looking up *city*.

    The raw value always comes first, followed by progressively simplified
    variants: clean-ups that apply to every country, then country-specific
    rewrites for ``CH``, ``CZ`` and ``JP``.  Empty strings and repeats are
    dropped; the first occurrence of each spelling keeps its position.
    """
    variants = [city]

    def keep_if_changed(variant: str) -> None:
        # A rewrite that left the input untouched adds nothing new to try.
        if variant != city:
            variants.append(variant)

    # --- Rules applied regardless of country ---
    # Trailing country label such as " - Suisse" or " - France".
    keep_if_changed(re.sub(r'\s*-\s*(Suisse|Schweiz|Switzerland|France|Deutschland|Germany)$', '', city, flags=re.I))
    # Trailing house number (common in CZ addresses).
    keep_if_changed(re.sub(r'\s+\d+$', '', city))

    # --- Switzerland ---
    if country == 'CH':
        # "St. Gallen" / "St.Gallen" / "St-..." -> "Sankt ..." and "Saint ..."
        if 'St.' in city or 'St-' in city:
            for expanded in ('Sankt', 'Saint'):
                variants.append(city.replace('St.', expanded + ' ').replace('St-', expanded + '-'))
            # The expansion can leave a double space; collapse whitespace in
            # everything collected so far (including the raw value).
            variants[:] = [re.sub(r'\s+', ' ', v).strip() for v in variants]

        # Parenthetical suffix: "... (Sauverny)"
        without_parens = re.sub(r'\s*\([^)]*\)', '', city)
        if without_parens != city:
            variants.append(without_parens.strip())

        # "Bernex - Genève" -> "Bernex", "Genève"
        if ' - ' in city:
            variants.extend(piece.strip() for piece in city.split(' - '))

        # "Lausanne-Dorigny" -> "Lausanne", "Dorigny"
        if '-' in city:
            variants.extend(piece.strip() for piece in city.split('-'))

        # Canton abbreviation: "Buchs SG", "Brugg AG" -> "Buchs", "Brugg"
        canton = re.match(r'^(.+)\s+(AG|AI|AR|BE|BL|BS|FR|GE|GL|GR|JU|LU|NE|NW|OW|SG|SH|SO|SZ|TG|TI|UR|VD|VS|ZG|ZH)$', city)
        if canton is not None:
            variants.append(canton.group(1).strip())

        # Any 2-3 capital letter suffix: "Villigen PSI" -> "Villigen"
        keep_if_changed(re.sub(r'\s+[A-Z]{2,3}$', '', city))

        # "Hausen b. Brugg" -> "Hausen"
        keep_if_changed(re.sub(r'\s+b\.\s+.*$', '', city))

        # "Ecublens/VD" -> "Ecublens"
        if '/' in city:
            variants.append(city.partition('/')[0].strip())

        # "Sils / Segl Maria" -> "Sils", "Segl Maria"
        if ' / ' in city:
            variants.extend(piece.strip() for piece in city.split(' / '))

        # "Glion sur Montreux" -> "Glion"
        keep_if_changed(re.sub(r'\s+sur\s+.*$', '', city, flags=re.I))

    # --- Czechia ---
    if country == 'CZ':
        # "Břasy-Stupno" -> "Břasy", "Stupno"
        if '-' in city:
            variants.extend(piece.strip() for piece in city.split('-'))
        # District number: "Praha 1" -> "Praha"
        keep_if_changed(re.sub(r'\s+\d+$', '', city))

    # --- Japan ---
    if country == 'JP':
        # "Waga Gun Nishiwaga Machi" -> "Nishiwaga", "Nishiwaga Machi":
        # take the word preceding each municipality-type suffix.
        words = city.split()
        for town, unit in zip(words, words[1:]):
            if unit in ('Machi', 'Cho', 'Mura', 'Shi', 'Ku'):
                variants.append(town)
                variants.append(f"{town} {unit}")
        # Drop the "<name> Gun " district prefix entirely.
        keep_if_changed(re.sub(r'\w+\s+Gun\s+', '', city))

    # dict.fromkeys keeps first-seen order while removing repeats.
    return [v for v in dict.fromkeys(variants) if v]


def get_coords_for_city(conn: sqlite3.Connection, city_name: str, country_code: str) -> tuple[float, float, int, str] | None:
    """Get lat/lon, geonames_id, and matched name for a city."""
    # First try exact match on name/ascii_name (fast)
    cursor = conn.execute(
        """SELECT latitude, longitude, geonames_id, name
           FROM cities
           WHERE country_code = ?
             AND (name = ? OR ascii_name = ?)
             AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
           ORDER BY population DESC
           LIMIT 1""",
        (country_code, city_name, city_name)
    )
    row = cursor.fetchone()
    if row:
        return row[0], row[1], row[2], row[3]
    return None


def process_file(filepath: Path, conn: sqlite3.Connection) -> bool:
    """Geocode a single custodian YAML file in place.

    Loads the file and, if its ``location`` block names a city and country
    but does not yet carry both coordinates, tries each candidate spelling
    from ``normalize_city_name`` against the database via
    ``get_coords_for_city``.  On the first hit the location block is enriched
    with coordinates, the GeoNames id, a UTC timestamp and the lookup method,
    and the file is rewritten.

    Args:
        filepath: Custodian YAML file to read and possibly rewrite.
        conn: Open SQLite connection handed to ``get_coords_for_city``.

    Returns:
        True if coordinates were found and the file was rewritten.  False if
        the file is empty, already has latitude and longitude, has no city or
        country, has a country mapping with an unknown label, or no candidate
        spelling matched.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.load(f)

    if not data:
        return False

    # ``data.get('location', {})`` would hand back None for an explicit but
    # empty ``location:`` key and then crash on ``.get``; treat that case
    # like a missing block instead.
    location = data.get('location') or {}

    # Check if already has coordinates
    if location.get('latitude') and location.get('longitude'):
        return False

    # Get city and country from location block
    city = location.get('city')
    country = location.get('country')

    if not city or not country:
        return False

    # Handle country as dict (from Wikidata enrichment): translate its
    # human-readable label into the two-letter code used for the lookup.
    if isinstance(country, dict):
        country_label = country.get('label', '')
        country_map = {
            'Switzerland': 'CH', 'Czech Republic': 'CZ', 'Czechia': 'CZ',
            'Japan': 'JP', 'Germany': 'DE', 'Austria': 'AT', 'Belgium': 'BE',
            'France': 'FR', 'Italy': 'IT', 'Netherlands': 'NL', 'Poland': 'PL'
        }
        country = country_map.get(country_label)
        if not country:
            return False

    # Try each candidate spelling in priority order; the first hit wins.
    for candidate in normalize_city_name(city, country):
        result = get_coords_for_city(conn, candidate, country)
        if not result:
            continue

        lat, lon, geonames_id, matched_name = result

        # ``location`` is the same mapping object as data['location'] here:
        # it had a truthy ``city``, so it cannot be the ``{}`` fallback.
        location['latitude'] = lat
        location['longitude'] = lon
        location['geonames_id'] = geonames_id
        location['geocoding_timestamp'] = datetime.now(timezone.utc).isoformat()
        location['geocoding_method'] = 'CITY_NAME_LOOKUP'
        # Record the database spelling only when it differs from the input.
        if matched_name != city:
            location['geonames_matched_name'] = matched_name

        # Write back
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f)

        return True

    return False


# Country labels (as found in Wikidata-enriched ``country`` mappings) and the
# two-letter codes used for the database lookup.  Covers every label that
# process_file() understands so both code paths accept the same files.
_COUNTRY_LABEL_TO_CODE = {
    'Switzerland': 'CH', 'Czech Republic': 'CZ', 'Czechia': 'CZ',
    'Japan': 'JP', 'Germany': 'DE', 'Austria': 'AT', 'Belgium': 'BE',
    'France': 'FR', 'Italy': 'IT', 'Netherlands': 'NL', 'Poland': 'PL',
}


def _resolve_country_code(country):
    """Return the lookup code for a location's ``country`` value, or None.

    A mapping is translated through its ``label``; any other value is
    returned unchanged.
    """
    if isinstance(country, dict):
        return _COUNTRY_LABEL_TO_CODE.get(country.get('label', ''))
    return country


def main():
    """Command-line entry point: geocode custodian files by city name.

    Options:
        --country CODE  Only process files named ``CODE-*.yaml``.
        --dry-run       Report what would be updated without writing files.

    Prints one line per updated (or, in dry-run mode, matched/unmatched)
    file, one line per file that raised an error, and a final summary.
    Exits through ``parser.error`` if the GeoNames database file is missing.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Geocode by city name lookup')
    parser.add_argument('--country', type=str, help='Country code to process')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    args = parser.parse_args()

    # sqlite3.connect() creates an empty database when the path does not
    # exist, which would only surface later as one "no such table" error per
    # file.  Fail early with a clear message instead.
    if not GEONAMES_DB.is_file():
        parser.error(f"GeoNames database not found: {GEONAMES_DB}")

    conn = sqlite3.connect(GEONAMES_DB)
    updated = 0
    skipped = 0
    no_match = 0

    try:
        # Find files to process
        if args.country:
            pattern = f"{args.country}-*.yaml"
        else:
            pattern = "*.yaml"

        files = list(CUSTODIAN_DIR.glob(pattern))
        print(f"Found {len(files)} files matching {pattern}")

        for filepath in files:
            if not filepath.is_file():
                continue

            # Per-file boundary: one unreadable or malformed file must not
            # abort the whole batch, so report it and move on.
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = yaml.load(f)

                if not data:
                    continue

                # An explicit empty ``location:`` key loads as None.
                location = data.get('location') or {}
                if location.get('latitude'):
                    skipped += 1
                    continue

                city = location.get('city')
                country = location.get('country')

                if not city or not country:
                    no_match += 1
                    continue

                # Handle country dict
                country = _resolve_country_code(country)
                if not country:
                    no_match += 1
                    continue

                candidates = normalize_city_name(city, country)

                if args.dry_run:
                    found = False
                    for candidate in candidates:
                        result = get_coords_for_city(conn, candidate, country)
                        if result:
                            print(f"Would update: {filepath.name} ({city}) -> {result[3]} ({result[0]:.4f}, {result[1]:.4f})")
                            updated += 1
                            found = True
                            break
                    if not found:
                        print(f"  No match: {filepath.name} ({city}, {country}) tried: {candidates[:3]}")
                        no_match += 1
                else:
                    if process_file(filepath, conn):
                        print(f"Updated: {filepath.name}")
                        updated += 1
                    else:
                        no_match += 1

            except Exception as e:
                print(f"Error: {filepath.name}: {e}")
    finally:
        # Release the database handle even on Ctrl-C or an unexpected error.
        conn.close()

    print("\nSummary:")
    print(f"  Updated: {updated}")
    print(f"  Skipped (already has coords): {skipped}")
    print(f"  No match found: {no_match}")


# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()