glam/scripts/geocode_jp_compound_cities.py
kempersc 2137c522db geocode: add coordinates to JP compound cities and CZ files from GeoNames
- JP: Handle Gun/Cho/Machi/Mura compound city names (2615 files)
- CZ: Map city codes to GeoNames entries (667 files)
- Overall coverage: 84.5% → 96.4%
2025-12-09 21:49:40 +01:00

505 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Geocode Japanese Compound City Names from GeoNames Database
This script handles Japanese compound city names that weren't matched by the
standard geocoding approach. Japanese locations often use compound names like:
- "Aichi Gun Togo Cho" = Aichi District, Togo Town → search for "Togo"
- "Nagoya Shi Chikusa Ku" = Nagoya City, Chikusa Ward → search for "Nagoya"
- "Kamikita Gun Rokkasho Mura" = Kamikita District, Rokkasho Village → search for "Rokkasho"
Japanese Administrative Divisions:
- 県 (Ken) = Prefecture
- 市 (Shi) = City
- 区 (Ku) = Ward (within cities)
- 郡 (Gun) = County/District
- 町 (Cho/Machi) = Town
- 村 (Mura/Son) = Village
Strategy:
1. Parse compound city name to extract settlement name
2. For "X Gun Y Cho/Mura" → search for Y
3. For "X Shi Y Ku" → search for X (the main city)
4. Use GeoNames local database for fast lookups
Usage:
python scripts/geocode_jp_compound_cities.py --dry-run
python scripts/geocode_jp_compound_cities.py --limit 100
python scripts/geocode_jp_compound_cities.py --all
"""
import argparse
import re
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from ruamel.yaml import YAML
# Setup ruamel.yaml for round-trip preservation
# (keeps quoting style and layout of the existing YAML files intact on rewrite)
yaml = YAML()
yaml.preserve_quotes = True
yaml.width = 120  # wrap long scalar lines at 120 columns
# Configuration
# NOTE(review): absolute, machine-specific paths — this assumes one exact
# checkout location; consider making these configurable via CLI or env vars.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
def parse_japanese_compound_city(city_name: str) -> list[str]:
    """
    Parse a Japanese compound city name into candidate search terms.

    Examples:
        "Aichi Gun Togo Cho" → ["Togo", "Togo-cho", "Togocho"]
        "Nagoya Shi Chikusa Ku" → ["Nagoya", "Chikusa", ...]
        "Kamikita Gun Rokkasho Mura" → ["Rokkasho", "Rokkasho-mura", ...]
        "Kitanagoyashi" → ["Kitanagoya", ...]

    Returns candidate names ordered from most to least preferred,
    de-duplicated case-insensitively.
    """
    if not city_name:
        return []

    name = city_name.strip()
    tokens = name.split()
    found: list[str] = []

    def segment(start_marker: str, end_marker: str) -> str:
        """Tokens strictly between the two markers, joined — '' if absent/out of order."""
        if start_marker in tokens and end_marker in tokens:
            a = tokens.index(start_marker)
            b = tokens.index(end_marker)
            if a < b:
                return ' '.join(tokens[a + 1:b])
        return ''

    # District + town: "X Gun Y Cho" → settlement is Y
    town = segment('Gun', 'Cho')
    if town:
        found += [town, f"{town}-cho", f"{town}cho", town.lower()]

    # District + village: "X Gun Y Mura"
    village = segment('Gun', 'Mura')
    if village:
        found += [village, f"{village}-mura", f"{village}mura", village.lower()]

    # District + town, alternate romanization: "X Gun Y Machi"
    machi = segment('Gun', 'Machi')
    if machi:
        found += [
            machi, f"{machi}-machi", f"{machi}machi",
            f"{machi}-cho", f"{machi}cho", machi.lower(),
        ]

    # City + ward: "X Shi Y Ku" → prefer the main city X, ward Y as fallback
    if 'Shi' in tokens and 'Ku' in tokens:
        shi_at = tokens.index('Shi')
        main_city = ' '.join(tokens[:shi_at])
        if main_city:
            found += [main_city, main_city.lower()]
        ward = ' '.join(tokens[shi_at + 1:tokens.index('Ku')])
        if ward:
            found += [ward, f"{ward}-ku"]

    # Concatenated "...shi" city name (e.g. "Kitanagoyashi")
    if ' ' not in name and name.lower().endswith('shi'):
        stem = name[:-3]  # drop the trailing "shi"
        if stem:
            found += [stem, stem.lower(), name]  # also keep the full name

    # Bare "X Shi" with no ward
    if 'Shi' in tokens and 'Ku' not in tokens:
        shi_at = tokens.index('Shi')
        main_city = ' '.join(tokens[:shi_at])
        if main_city:
            found += [main_city, main_city.lower()]

    # Last resort: the name exactly as given
    if not found:
        found.append(name)

    # Case-insensitive de-dup, keeping first-seen order
    seen: set[str] = set()
    ordered: list[str] = []
    for cand in found:
        key = cand.lower()
        if key not in seen and cand:
            seen.add(key)
            ordered.append(cand)
    return ordered
class GeoNamesLookup:
    """Fast city coordinate lookup from a local GeoNames SQLite database."""

    # Proper settlement feature codes, tried first.
    _PREFERRED_FEATURES = "('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')"
    # Fallback additionally allows PPLX (sections/neighborhoods) — useful for
    # Japanese administrative units that GeoNames classifies as city sections.
    _FALLBACK_FEATURES = "('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG', 'PPLX')"

    def __init__(self, db_path: Path):
        self.conn = sqlite3.connect(db_path)
        self.conn.row_factory = sqlite3.Row

    def _query_one(self, country_code: str, pattern: str, features: str,
                   exact: bool) -> Optional[sqlite3.Row]:
        """Run one lookup; exact uses `=`, otherwise LIKE (prefix match).

        The feature-code list is interpolated from a class constant, never
        from user input, so the f-string SQL is safe; name values go through
        bound parameters. Picks the most populous match.
        """
        op = "=" if exact else "LIKE"
        cursor = self.conn.execute(f"""
            SELECT geonames_id, name, ascii_name, latitude, longitude,
                   admin1_code, admin1_name, feature_code, population
            FROM cities
            WHERE country_code = ?
              AND (LOWER(name) {op} LOWER(?) OR LOWER(ascii_name) {op} LOWER(?))
              AND feature_code IN {features}
            ORDER BY population DESC
            LIMIT 1
        """, (country_code, pattern, pattern))
        return cursor.fetchone()

    def lookup_city(self, candidates: list[str], country_code: str = "JP") -> Optional[dict]:
        """
        Look up city coordinates trying multiple candidate names.

        Tries each candidate with proper-settlement feature codes first
        (exact match, then prefix match); only if every candidate fails does
        it retry exact matches with PPLX included.

        Args:
            candidates: Potential city names, in order of preference.
            country_code: ISO country code filter (default "JP").

        Returns:
            Dict with coordinates and match metadata, or None if not found.
        """
        for candidate in candidates:
            if not candidate:
                continue
            row = self._query_one(country_code, candidate,
                                  self._PREFERRED_FEATURES, exact=True)
            if row:
                return self._row_to_dict(row, candidate)
            row = self._query_one(country_code, f"{candidate}%",
                                  self._PREFERRED_FEATURES, exact=False)
            if row:
                return self._row_to_dict(row, candidate)
        # Fallback pass: exact matches only, PPLX allowed.
        for candidate in candidates:
            if not candidate:
                continue
            row = self._query_one(country_code, candidate,
                                  self._FALLBACK_FEATURES, exact=True)
            if row:
                return self._row_to_dict(row, candidate)
        return None

    def _row_to_dict(self, row, matched_candidate: str) -> dict:
        """Convert a database row plus the candidate that matched it to a dict."""
        return {
            'geonames_id': row['geonames_id'],
            'geonames_name': row['name'],
            'ascii_name': row['ascii_name'],
            'latitude': row['latitude'],
            'longitude': row['longitude'],
            'admin1_code': row['admin1_code'],
            'admin1_name': row['admin1_name'],
            'feature_code': row['feature_code'],
            'population': row['population'],
            'matched_candidate': matched_candidate
        }

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()
def extract_city(data: dict) -> Optional[str]:
    """Extract the city name from custodian data.

    Prefers the normalized ``location.city`` field; falls back to the first
    entry of ``original_entry.locations``. Returns None if no city is found.
    """
    # YAML keys may be present but null (``location:``), which would make the
    # original chained ``.get`` raise AttributeError — guard with ``or {}``.
    city = (data.get('location') or {}).get('city')
    if city:
        return city
    # Fall back to the first original-entry location, if any.
    orig_locs = (data.get('original_entry') or {}).get('locations') or []
    if orig_locs:
        first = orig_locs[0]
        if isinstance(first, dict):
            return first.get('city')
    return None
def has_coordinates(data: dict) -> bool:
    """Return True if the record already has both latitude and longitude."""
    # ``location:`` may exist but be null in YAML — ``or {}`` avoids an
    # AttributeError on the subsequent ``.get`` calls.
    loc = data.get('location') or {}
    return loc.get('latitude') is not None and loc.get('longitude') is not None
def geocode_file(filepath: Path, geonames: GeoNamesLookup, dry_run: bool = False, verbose: bool = False) -> dict:
    """
    Geocode a single Japanese custodian file.

    Parses the record's city name into candidate search terms, looks them up
    in GeoNames, and — unless dry_run — writes coordinates plus provenance
    back to the YAML file (round-tripped via ruamel to preserve formatting).

    Returns:
        Dict with keys: success, geocoded, already_has_coords, city,
        candidates, matched_candidate, error. Note that "no city" / "no
        match" outcomes set success=True with an explanatory error string,
        since they are expected conditions, not failures.
    """
    result = {
        'success': False,
        'geocoded': False,
        'already_has_coords': False,
        'city': None,
        'candidates': [],
        'matched_candidate': None,
        'error': None
    }
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.load(f)
        if not isinstance(data, dict):
            result['error'] = "Invalid YAML structure"
            return result
        # Nothing to do if coordinates are already present.
        if has_coordinates(data):
            result['success'] = True
            result['already_has_coords'] = True
            return result
        # Extract city name
        city = extract_city(data)
        result['city'] = city
        if not city:
            result['error'] = "No city found"
            result['success'] = True
            return result
        # Parse compound city name into ordered search candidates.
        candidates = parse_japanese_compound_city(city)
        result['candidates'] = candidates
        if verbose:
            print(f" City: {city}")
            print(f" Candidates: {candidates}")
        if not candidates:
            result['error'] = f"Could not parse city name: {city}"
            result['success'] = True
            return result
        # Look up in GeoNames
        geo_result = geonames.lookup_city(candidates, "JP")
        if not geo_result:
            result['error'] = f"No match for: {candidates}"
            result['success'] = True
            return result
        result['matched_candidate'] = geo_result['matched_candidate']
        # Update location block with coordinates.
        if 'location' not in data:
            data['location'] = {}
        # One timestamp for the whole update, so provenance and
        # normalization_timestamp agree exactly (previously two separate
        # datetime.now() calls could differ by microseconds).
        now_iso = datetime.now(timezone.utc).isoformat()
        data['location']['latitude'] = geo_result['latitude']
        data['location']['longitude'] = geo_result['longitude']
        data['location']['coordinate_provenance'] = {
            'source_type': 'GEONAMES_JP_COMPOUND',
            'source_path': 'data/reference/geonames.db',
            'entity_id': geo_result['geonames_id'],
            'original_query': city,
            'matched_candidate': geo_result['matched_candidate'],
            'original_timestamp': now_iso
        }
        # Fill in GeoNames references only where not already present.
        if not data['location'].get('geonames_id'):
            data['location']['geonames_id'] = geo_result['geonames_id']
        if not data['location'].get('geonames_name'):
            data['location']['geonames_name'] = geo_result['geonames_name']
        if not data['location'].get('feature_code'):
            data['location']['feature_code'] = geo_result['feature_code']
        data['location']['normalization_timestamp'] = now_iso
        if not dry_run:
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f)
        result['success'] = True
        result['geocoded'] = True
        return result
    except Exception as e:
        # Best-effort batch job: record the failure and let the caller move on.
        result['error'] = str(e)
        return result
def main():
    """CLI entry point. Returns a process exit code (0 on success, 1 on setup error)."""
    parser = argparse.ArgumentParser(
        description="Geocode Japanese compound city names using GeoNames database"
    )
    parser.add_argument('--dry-run', action='store_true', help="Preview without writing")
    parser.add_argument('--limit', type=int, default=0, help="Limit number of files to process")
    parser.add_argument('--all', action='store_true', help="Process all files (no limit)")
    parser.add_argument('--verbose', action='store_true', help="Show detailed output")
    parser.add_argument('--file-list', type=str, help="File containing list of files to process (one per line)")
    args = parser.parse_args()
    if args.dry_run:
        print("DRY RUN - No files will be modified\n")
    # Initialize GeoNames lookup
    if not GEONAMES_DB.exists():
        print(f"Error: GeoNames database not found at {GEONAMES_DB}")
        return 1
    geonames = GeoNamesLookup(GEONAMES_DB)
    # Get list of files to process
    if args.file_list:
        # Read from provided file list (paths relative to CUSTODIAN_DIR).
        with open(args.file_list, 'r') as f:
            files_to_process = [CUSTODIAN_DIR / line.strip() for line in f if line.strip()]
        print(f"Loaded {len(files_to_process)} files from {args.file_list}")
    else:
        # Scan directory (slow: every JP file must be parsed to check coords).
        print("Scanning for JP files missing coordinates...")
        all_jp_files = sorted(CUSTODIAN_DIR.glob("JP-*.yaml"))
        print(f"Total JP files: {len(all_jp_files)}")
        # Filter to only files missing coordinates
        files_to_process = []
        for fp in all_jp_files:
            try:
                with open(fp, 'r', encoding='utf-8') as f:
                    data = yaml.load(f)
                if isinstance(data, dict) and not has_coordinates(data):
                    files_to_process.append(fp)
            except Exception:
                pass  # Skip files that can't be read
        print(f"Files to process: {len(files_to_process)}")
    if args.limit and not args.all:
        files_to_process = files_to_process[:args.limit]
        print(f"Limited to first {args.limit} files")
    # Statistics
    stats = {
        'total': len(files_to_process),
        'geocoded': 0,
        'not_found': 0,
        'no_city': 0,
        'errors': 0,
        'by_pattern': {}
    }
    not_found_samples = []
    for i, filepath in enumerate(files_to_process):
        result = geocode_file(filepath, geonames, dry_run=args.dry_run, verbose=args.verbose)
        if result['geocoded']:
            stats['geocoded'] += 1
            # Track which candidate matched
            matched = result.get('matched_candidate', 'unknown')
            stats['by_pattern'][matched] = stats['by_pattern'].get(matched, 0) + 1
            if args.verbose:
                # FIX: add the arrow separator — city and matched candidate
                # were previously concatenated with nothing between them.
                print(f"{filepath.name}: {result['city']} → {matched}")
        elif result.get('error') and 'No match' in result['error']:
            stats['not_found'] += 1
            if len(not_found_samples) < 50:
                not_found_samples.append((filepath.name, result['city'], result['candidates']))
        elif result.get('error') and 'No city' in result['error']:
            stats['no_city'] += 1
        elif result.get('error'):
            stats['errors'] += 1
            if args.verbose:
                print(f"{filepath.name}: {result['error']}")
        if not args.verbose and (i + 1) % 500 == 0:
            print(f"Processed {i+1}/{len(files_to_process)} files... (geocoded: {stats['geocoded']})")
    # Print summary
    print("\n" + "=" * 70)
    print("JAPANESE COMPOUND CITY GEOCODING SUMMARY")
    print("=" * 70)
    print(f"Total files processed: {stats['total']}")
    print(f"Successfully geocoded: {stats['geocoded']}")
    print(f"City not found: {stats['not_found']}")
    print(f"No city in file: {stats['no_city']}")
    print(f"Errors: {stats['errors']}")
    if stats['geocoded'] > 0:
        print(f"\nSuccess rate: {stats['geocoded']/stats['total']*100:.1f}%")
    if not_found_samples:
        print(f"\nSample cities not found ({len(not_found_samples)} shown):")
        for filename, city, candidates in not_found_samples[:20]:
            # FIX: print the actual filename instead of the literal
            # "(unknown)" placeholder — `filename` was unpacked but unused.
            print(f" {filename}: {city} → tried {candidates[:3]}")
    if args.dry_run:
        print("\n(DRY RUN - No files were modified)")
    geonames.close()
    return 0
if __name__ == "__main__":
exit(main())