glam/scripts/retry_japanese_geocoding.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

446 lines
16 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Retry Failed Japanese Geocoding with Enhanced Strategies
This script specifically targets the 1,481 failed Japanese institution geocoding
attempts with improved query strategies:
1. Hierarchical fallback: Try progressively broader queries
- Full address → City + Prefecture → Prefecture only
2. Alternative formats: Try different romanization/formatting
3. Prefecture-level geocoding: For rural/small towns not in database
4. Postal code lookup: Use postal codes as additional signal
Japanese Administrative Divisions:
- 都 (To) = Metropolis (Tokyo)
- 道 (Do) = Circuit (Hokkaido)
- 府 (Fu) = Urban prefecture (Osaka, Kyoto)
- 県 (Ken) = Prefecture
- 市 (Shi) = City
- 区 (Ku) = Ward (within cities)
- 郡 (Gun) = County/District
- 町 (Cho/Machi) = Town
- 村 (Mura/Son) = Village
Address Format Issues:
- ISIL registry uses all-caps romanization
- Nominatim works better with proper case
- "GUN" + "CHO" indicates county-level town (often not in Nominatim)
- Prefecture-level fallback is more reliable for rural areas
Usage:
python scripts/retry_japanese_geocoding.py [--dry-run] [--limit N]
"""
import argparse
import sqlite3
import time
import yaml
import requests
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple
import re
class JapaneseGeocodingRetry:
    """Enhanced geocoding for failed Japanese institutions.

    Wraps a SQLite query cache and a Nominatim HTTP session, and retries
    previously failed Japanese locations with a sequence of progressively
    broader query strategies (see build_query_strategies).
    """

    # Trailing administrative suffixes: 県 KEN, 都 TO, 府 FU, 道 DO.
    # Anchored at end-of-string so only a final suffix word is stripped
    # (a plain str.replace would also clobber matching substrings in the
    # middle of multi-word region names).
    _PREFECTURE_SUFFIX_RE = re.compile(r'\s+(?:KEN|TO|FU|DO)$')

    def __init__(self, cache_file: Path, data_file: Path, dry_run: bool = False):
        """
        Args:
            cache_file: SQLite database with a `geocoding_cache` table.
            data_file: YAML dataset of institutions to update in place.
            dry_run: When True, no API calls are made and the dataset is
                not written back.
        """
        self.cache_file = cache_file
        self.data_file = data_file
        self.dry_run = dry_run
        self.cache_conn = sqlite3.connect(cache_file)
        self.session = requests.Session()
        # Nominatim's usage policy requires an identifying User-Agent.
        self.session.headers.update({'User-Agent': 'GLAM-Data-Extractor/1.0'})
        # Statistics
        self.stats = {
            'total_failed': 0,
            'retry_attempted': 0,
            'newly_geocoded': 0,
            'still_failed': 0,
            'cache_hits': 0,
            'api_calls': 0,
            # Per-strategy success counters; keys match the strategy names
            # produced by build_query_strategies. Only non-zero counters
            # are printed, so unused entries (e.g. 'full_address') are
            # harmless placeholders.
            'by_strategy': {
                'full_address': 0,
                'city_prefecture': 0,
                'prefecture_only': 0,
                'postal_code': 0,
                'proper_case': 0
            }
        }

    def normalize_japanese_city(self, city: str) -> str:
        """
        Normalize Japanese city names for better Nominatim matching.

        Examples:
            - "SAPPORO SHI KITA KU" → "Sapporo"
            - "SHIRAOI GUN SHIRAOI CHO" → "Shiraoi"
            - "KAMIKITA GUN ROKKASHO MURA" → "Rokkasho"
        """
        # Extract main city/town name (before SHI, GUN, KU)
        parts = city.split()
        # Pattern 1: "CITY SHI WARD KU" → "City"
        if 'SHI' in parts and 'KU' in parts:
            shi_idx = parts.index('SHI')
            return ' '.join(parts[:shi_idx]).title()
        # Pattern 2: "COUNTY GUN TOWN CHO" → "Town"
        if 'GUN' in parts and 'CHO' in parts:
            gun_idx = parts.index('GUN')
            cho_idx = parts.index('CHO')
            # Town name is between GUN and CHO
            return ' '.join(parts[gun_idx+1:cho_idx]).title()
        # Pattern 3: "COUNTY GUN VILLAGE MURA" → "Village"
        if 'GUN' in parts and 'MURA' in parts:
            gun_idx = parts.index('GUN')
            mura_idx = parts.index('MURA')
            return ' '.join(parts[gun_idx+1:mura_idx]).title()
        # Pattern 4: Just city name
        if 'SHI' in parts:
            shi_idx = parts.index('SHI')
            return ' '.join(parts[:shi_idx]).title()
        # Default: return as-is in title case
        return city.title()

    def normalize_japanese_prefecture(self, region: str) -> str:
        """
        Normalize Japanese prefecture names.

        Examples:
            - "HOKKAIDO" → "Hokkaido"
            - "TOKYO TO" → "Tokyo"
            - "AOMORI KEN" → "Aomori"
        """
        # Strip the administrative suffix only when it is the trailing word.
        region = self._PREFECTURE_SUFFIX_RE.sub('', region)
        return region.title()

    def build_query_strategies(self, location: Dict) -> List[Tuple[str, str]]:
        """
        Build multiple query strategies for a failed location.

        Returns list of (query_string, strategy_name) tuples in order of
        preference; retry_institution tries them until one succeeds.
        """
        strategies = []
        city = location.get('city', '')
        region = location.get('region', '')
        postal = location.get('postal_code', '')
        # Normalize names
        city_normalized = self.normalize_japanese_city(city)
        prefecture_normalized = self.normalize_japanese_prefecture(region)
        # Strategy 1: City + Prefecture (proper case)
        if city_normalized and prefecture_normalized:
            query = f"{city_normalized}, {prefecture_normalized}, Japan"
            strategies.append((query, 'proper_case'))
        # Strategy 2: Prefecture only (most reliable for rural areas)
        if prefecture_normalized:
            query = f"{prefecture_normalized}, Japan"
            strategies.append((query, 'prefecture_only'))
        # Strategy 3: Postal code + Prefecture (if available)
        if postal and prefecture_normalized:
            query = f"{postal}, {prefecture_normalized}, Japan"
            strategies.append((query, 'postal_code'))
        # Strategy 4: Original city + prefecture (all caps, last resort)
        if city and region:
            query = f"{city}, {region}, Japan"
            strategies.append((query, 'city_prefecture'))
        return strategies

    def geocode_with_nominatim(self, query: str) -> Optional[Dict]:
        """Query Nominatim API with rate limiting.

        Returns a geo dict on success, {} for a cached failure, or None
        when nothing was found (or in dry-run mode).
        """
        # Check cache first
        cached = self.get_from_cache(query)
        if cached is not None:
            self.stats['cache_hits'] += 1
            return cached
        if self.dry_run:
            print(f" [DRY RUN] Would query: {query}")
            return None
        # Rate limiting: 1 request per second (Nominatim usage policy)
        time.sleep(1.0)
        try:
            response = self.session.get(
                'https://nominatim.openstreetmap.org/search',
                params={
                    'q': query,
                    'format': 'json',
                    'limit': 1,
                    'addressdetails': 1,
                    'extratags': 1
                },
                timeout=10
            )
            response.raise_for_status()
            self.stats['api_calls'] += 1
            results = response.json()
            if results:
                result = results[0]
                geo_data = {
                    'latitude': float(result['lat']),
                    'longitude': float(result['lon']),
                    'display_name': result.get('display_name'),
                    'geonames_id': None
                }
                # Try to extract GeoNames ID from extratags
                if 'extratags' in result and isinstance(result['extratags'], dict):
                    geonames_id = result['extratags'].get('geonames_id')
                    if geonames_id:
                        geo_data['geonames_id'] = int(geonames_id)
                # Cache success
                self.cache_result(query, geo_data)
                return geo_data
            else:
                # Cache failure
                self.cache_result(query, None)
                return None
        except Exception as e:
            print(f" ❌ API error: {e}")
            # Cache failure to avoid retrying
            self.cache_result(query, None)
            return None

    def get_from_cache(self, query: str) -> Optional[Dict]:
        """Retrieve from cache.

        Tri-state return: dict with coordinates for a cached success,
        {} for a cached failure ("tried and failed"), None when the
        query has never been cached.
        """
        cursor = self.cache_conn.execute(
            "SELECT latitude, longitude, geonames_id, display_name, success FROM geocoding_cache WHERE query = ?",
            (query,)
        )
        row = cursor.fetchone()
        if row:
            if row[4]:  # success = 1
                return {
                    'latitude': row[0],
                    'longitude': row[1],
                    'geonames_id': row[2],
                    'display_name': row[3]
                }
            else:
                # Cached failure (return empty dict to signal "tried and failed")
                return {}
        return None  # Not in cache at all

    def cache_result(self, query: str, result: Optional[Dict]):
        """Store result in cache (success row when result is truthy,
        failure row otherwise)."""
        if result:
            self.cache_conn.execute("""
                INSERT OR REPLACE INTO geocoding_cache
                (query, latitude, longitude, geonames_id, display_name, timestamp, success)
                VALUES (?, ?, ?, ?, ?, ?, 1)
            """, (
                query,
                result.get('latitude'),
                result.get('longitude'),
                result.get('geonames_id'),
                result.get('display_name'),
                datetime.now(timezone.utc).isoformat()
            ))
        else:
            self.cache_conn.execute("""
                INSERT OR REPLACE INTO geocoding_cache
                (query, latitude, longitude, geonames_id, display_name, timestamp, success)
                VALUES (?, NULL, NULL, NULL, NULL, ?, 0)
            """, (query, datetime.now(timezone.utc).isoformat()))
        self.cache_conn.commit()

    def retry_institution(self, institution: Dict) -> bool:
        """
        Retry geocoding for a single institution.

        Mutates the institution's first location in place on success.
        Returns True if newly geocoded, False otherwise.
        """
        if not institution.get('locations'):
            return False
        location = institution['locations'][0]
        # Skip if already geocoded
        if location.get('latitude') is not None:
            return False
        # Skip non-Japanese
        if location.get('country') != 'JP':
            return False
        self.stats['retry_attempted'] += 1
        name = institution.get('name', 'Unknown')
        print(f"\n[{self.stats['retry_attempted']}/{self.stats['total_failed']}] {name}")
        print(f" Original: {location.get('city')}, {location.get('region')}")
        # Try multiple strategies
        strategies = self.build_query_strategies(location)
        for query, strategy_name in strategies:
            print(f" Trying ({strategy_name}): {query}")
            result = self.geocode_with_nominatim(query)
            if result and result.get('latitude'):
                # Success!
                location['latitude'] = result['latitude']
                location['longitude'] = result['longitude']
                if result.get('geonames_id'):
                    location['geonames_id'] = result['geonames_id']
                self.stats['newly_geocoded'] += 1
                self.stats['by_strategy'][strategy_name] += 1
                print(f" ✅ Geocoded via {strategy_name}: {result['latitude']:.4f}, {result['longitude']:.4f}")
                return True
        # All strategies failed
        print(f" ❌ All strategies failed")
        self.stats['still_failed'] += 1
        return False

    def run(self, limit: Optional[int] = None):
        """Run retry process on all failed Japanese institutions.

        Args:
            limit: If given, only the first `limit` failed institutions
                are retried (useful for testing).
        """
        print("=" * 80)
        print("JAPANESE GEOCODING RETRY")
        print("=" * 80)
        print()
        # Load dataset
        print(f"Loading dataset from {self.data_file}...")
        with open(self.data_file, 'r') as f:
            institutions = yaml.safe_load(f)
        print(f"Loaded {len(institutions)} institutions")
        print()
        # Find failed Japanese geocoding
        failed_japanese = []
        for inst in institutions:
            if inst.get('locations'):
                loc = inst['locations'][0]
                if loc.get('country') == 'JP' and loc.get('latitude') is None:
                    failed_japanese.append(inst)
        self.stats['total_failed'] = len(failed_japanese)
        print(f"Found {self.stats['total_failed']} failed Japanese geocoding attempts")
        print()
        if self.dry_run:
            print("🧪 DRY RUN MODE - No changes will be made")
            print()
        # Apply limit if specified (`is not None` so an explicit 0 limits
        # to zero instead of being silently ignored)
        if limit is not None:
            failed_japanese = failed_japanese[:limit]
            print(f"Limiting to first {limit} institutions for testing")
            print()
        # Retry each failed institution
        start_time = time.time()
        for inst in failed_japanese:
            self.retry_institution(inst)
            # Progress indicator every 50 institutions (non-zero guard
            # avoids a division by zero when nothing was attempted yet)
            if self.stats['retry_attempted'] and self.stats['retry_attempted'] % 50 == 0:
                success_rate = (self.stats['newly_geocoded'] / self.stats['retry_attempted'] * 100)
                print(f"\n📊 Progress: {self.stats['retry_attempted']}/{self.stats['total_failed']} | "
                      f"Newly geocoded: {self.stats['newly_geocoded']} ({success_rate:.1f}%)")
        # Save updated dataset
        if not self.dry_run and self.stats['newly_geocoded'] > 0:
            print(f"\n💾 Saving {self.stats['newly_geocoded']} newly geocoded institutions...")
            with open(self.data_file, 'w') as f:
                yaml.dump(institutions, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            print(f"✅ Saved to {self.data_file}")
        # Print final statistics
        elapsed = time.time() - start_time
        print()
        print("=" * 80)
        print("RETRY STATISTICS")
        print("=" * 80)
        print(f"Total failed institutions: {self.stats['total_failed']}")
        print(f"Retry attempted: {self.stats['retry_attempted']}")
        print(f"Newly geocoded: {self.stats['newly_geocoded']}")
        print(f"Still failed: {self.stats['still_failed']}")
        print()
        print(f"Cache hits: {self.stats['cache_hits']}")
        print(f"API calls: {self.stats['api_calls']}")
        print()
        print("Success by strategy:")
        for strategy, count in self.stats['by_strategy'].items():
            if count > 0:
                pct = (count / self.stats['newly_geocoded'] * 100) if self.stats['newly_geocoded'] > 0 else 0
                print(f" {strategy:20s} {count:4d} ({pct:.1f}%)")
        print()
        print(f"Total execution time: {elapsed / 60:.1f} minutes")
        if self.stats['newly_geocoded'] > 0:
            avg_rate = self.stats['api_calls'] / elapsed if elapsed > 0 else 0
            print(f"Average API call rate: {avg_rate:.2f} requests/second")
        print("=" * 80)
        # Calculate new overall coverage
        if not self.dry_run and self.stats['newly_geocoded'] > 0:
            total_jp = sum(1 for inst in institutions if inst.get('locations') and inst['locations'][0].get('country') == 'JP')
            geocoded_jp = sum(1 for inst in institutions
                              if inst.get('locations')
                              and inst['locations'][0].get('country') == 'JP'
                              and inst['locations'][0].get('latitude') is not None)
            print()
            print("UPDATED JAPANESE COVERAGE:")
            print(f" Total Japanese institutions: {total_jp}")
            print(f" Successfully geocoded: {geocoded_jp} ({geocoded_jp/total_jp*100:.1f}%)")
            print(f" Still failed: {total_jp - geocoded_jp} ({(total_jp - geocoded_jp)/total_jp*100:.1f}%)")
            print("=" * 80)
def main():
    """CLI entry point: parse arguments, resolve data paths, run retry."""
    arg_parser = argparse.ArgumentParser(
        description='Retry failed Japanese geocoding with enhanced strategies'
    )
    arg_parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    arg_parser.add_argument(
        '--limit',
        type=int,
        help='Limit retry to first N failed institutions (for testing)'
    )
    options = arg_parser.parse_args()

    # Resolve dataset and cache locations relative to the repository root
    # (this script lives in scripts/, so the root is two levels up).
    repo_root = Path(__file__).parent.parent
    dataset_path = repo_root / 'data' / 'instances' / 'global' / 'global_heritage_institutions.yaml'
    cache_path = repo_root / 'data' / 'cache' / 'geocoding_cache.db'

    # Kick off the retry process with the parsed CLI options.
    retrier = JapaneseGeocodingRetry(cache_path, dataset_path, dry_run=options.dry_run)
    retrier.run(limit=options.limit)
# Entry-point guard: run only when executed as a script, not on import.
if __name__ == '__main__':
    main()