#!/usr/bin/env python3
"""
Geocode Missing Coordinates from GeoNames Database

This script geocodes custodian files that are missing coordinates using the
local GeoNames database. It's much faster than API-based geocoding (no rate
limits).

Features:
- Uses local GeoNames SQLite database for instant lookups
- Fuzzy matching for city names
- Updates files in-place preserving YAML structure
- Batch processing with progress tracking
- Safe updates (additive only, preserves existing data)

Usage:
    python scripts/geocode_missing_from_geonames.py --dry-run
    python scripts/geocode_missing_from_geonames.py --country JP --limit 100
    python scripts/geocode_missing_from_geonames.py --all
"""

import argparse
import sqlite3
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

from ruamel.yaml import YAML

# Setup ruamel.yaml for round-trip preservation
yaml = YAML()
yaml.preserve_quotes = True
yaml.width = 120

# Configuration
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")

# Romanized Japanese administrative suffixes: 市 (shi/city), 区 (ku/ward),
# 町 (machi, cho/town), 県 (ken/prefecture), 郡 (gun/district),
# 村 (son, mura/village). Checked in this order; only the first match is removed.
_JP_SUFFIXES = (
    ' shi', '-shi',
    ' ku', '-ku',
    ' machi', '-machi',
    ' cho', '-cho',
    ' ken', '-ken',
    ' gun', '-gun',
    ' son', '-son',
    ' mura', '-mura',
)

# Maximum number of sample rows kept for / shown in the final report.
_MAX_NOT_FOUND_KEPT = 100
_MAX_NOT_FOUND_SHOWN = 20
_MAX_ERRORS_KEPT = 20


def normalize_city_name(name: Optional[str]) -> str:
    """Normalize a city name for matching.

    Strips combining marks (accents), lower-cases, trims whitespace and
    removes at most one trailing romanized Japanese administrative suffix.

    Args:
        name: Raw city name; ``None`` or an empty string is allowed.

    Returns:
        The normalized name, or ``""`` when nothing usable remains.
    """
    if not name:
        return ""

    # NFD decomposition splits accented letters into base + combining mark
    # (category 'Mn'), which we then drop.
    decomposed = unicodedata.normalize('NFD', name)
    unaccented = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    result = unaccented.lower().strip()

    for suffix in _JP_SUFFIXES:
        if result.endswith(suffix):
            # Strip again so "foo  shi" does not leave a trailing space behind.
            result = result[:-len(suffix)].strip()
            break

    return result


def _escape_like(text: str) -> str:
    """Escape LIKE wildcards in *text* for use with ``ESCAPE '\\'``."""
    return text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')


def _as_mapping(value) -> dict:
    """Return *value* if it is a mapping, otherwise an empty dict.

    YAML keys with no value load as ``None``; treating those (and any other
    non-mapping value) as "absent" keeps the extraction code simple.
    """
    return value if isinstance(value, dict) else {}


class GeoNamesLookup:
    """Fast city coordinate lookup from GeoNames database."""

    # Shared query skeleton; ``{condition}`` is filled from the fixed
    # _MATCH_* constants below (never from user input).
    _SELECT_CITY = """
        SELECT geonames_id, name, ascii_name, latitude, longitude,
               admin1_code, admin1_name, feature_code, population
        FROM cities
        WHERE country_code = ?
          AND ({condition})
        ORDER BY population DESC
        LIMIT 1
    """
    _MATCH_NORMALIZED = "LOWER(name) = ? OR LOWER(ascii_name) = ?"
    _MATCH_ORIGINAL = "name = ? OR ascii_name = ?"
    _MATCH_PARTIAL = (
        "LOWER(name) LIKE ? ESCAPE '\\' OR LOWER(ascii_name) LIKE ? ESCAPE '\\'"
    )

    def __init__(self, db_path: Path):
        """Open the GeoNames SQLite database at *db_path*."""
        self.conn = sqlite3.connect(db_path)
        self.conn.row_factory = sqlite3.Row

    def _fetch_one(self, condition: str, country_code: str, value: str) -> Optional[dict]:
        """Run the city query with *condition*; return the top row as a dict.

        *value* is bound to both placeholders of the condition. Among several
        matches the most populous city wins (``ORDER BY population DESC``).
        """
        cursor = self.conn.execute(
            self._SELECT_CITY.format(condition=condition),
            (country_code, value, value),
        )
        row = cursor.fetchone()
        return self._row_to_dict(row) if row is not None else None

    def lookup_city(self, city: str, country_code: str,
                    region: Optional[str] = None) -> Optional[dict]:
        """
        Look up city coordinates in GeoNames database.

        Matching is attempted in three passes, first hit wins:
        1. exact match on the normalized name (case-insensitive),
        2. exact match on the original, un-normalized name,
        3. partial match: a database name that *contains* the normalized name.

        Args:
            city: City name as found in the custodian file.
            country_code: Country code; upper-cased before querying.
            region: Currently unused; accepted for backward compatibility.

        Returns:
            Dict with latitude, longitude, geonames_id, etc. or None if not
            found.
        """
        if not city or not country_code:
            return None

        # Normalize inputs
        city_norm = normalize_city_name(city)
        country_code = country_code.upper()

        # Pass 1: exact match on the normalized name. Skipped when the name
        # normalizes to "" so that rows with an empty name cannot match.
        if city_norm:
            match = self._fetch_one(self._MATCH_NORMALIZED, country_code, city_norm)
            if match is not None:
                return match

        # Pass 2: original city name (for non-ASCII)
        match = self._fetch_one(self._MATCH_ORIGINAL, country_code, city)
        if match is not None:
            return match

        # Pass 3: partial match. An empty normalized name would produce the
        # pattern '%%', which matches every city in the country, so it is
        # skipped; LIKE wildcards inside the name are escaped for the same
        # reason.
        if city_norm:
            pattern = f"%{_escape_like(city_norm)}%"
            return self._fetch_one(self._MATCH_PARTIAL, country_code, pattern)

        return None

    def _row_to_dict(self, row) -> dict:
        """Convert database row to dictionary."""
        return {
            'geonames_id': row['geonames_id'],
            'geonames_name': row['name'],
            'latitude': row['latitude'],
            'longitude': row['longitude'],
            'admin1_code': row['admin1_code'],
            'admin1_name': row['admin1_name'],
            'feature_code': row['feature_code'],
            'population': row['population']
        }

    def close(self):
        """Close the underlying database connection."""
        self.conn.close()


def extract_city_country(data: dict) -> tuple[Optional[str], Optional[str]]:
    """Extract city and country from custodian data.

    Sources are tried in order: ``location``, ``ghcid.location_resolution``,
    ``original_entry.locations[0]``. As a last resort the country is taken
    from the first two characters of ``ghcid.ghcid_current``.

    Args:
        data: Parsed custodian document.

    Returns:
        ``(city, country)``; either element may be ``None`` if not found.
    """
    city = None
    country = None

    # Try location block first
    loc = _as_mapping(data.get('location'))
    if loc:
        city = loc.get('city')
        country = loc.get('country')

    ghcid_block = _as_mapping(data.get('ghcid'))

    # Try ghcid.location_resolution
    if not city:
        ghcid_loc = _as_mapping(ghcid_block.get('location_resolution'))
        if ghcid_loc:
            city = (ghcid_loc.get('city_name') or
                    ghcid_loc.get('city_label') or
                    ghcid_loc.get('geonames_name') or
                    ghcid_loc.get('google_maps_locality'))
            if not country:
                country = ghcid_loc.get('country_code')

    # Try original_entry.locations
    if not city:
        orig_locs = _as_mapping(data.get('original_entry')).get('locations')
        if isinstance(orig_locs, list) and orig_locs and isinstance(orig_locs[0], dict):
            first = orig_locs[0]
            city = first.get('city')
            # Prefer this entry's country, but never replace a country we
            # already know with an empty value.
            country = first.get('country') or country

    # Try to infer country from GHCID
    if not country:
        ghcid = ghcid_block.get('ghcid_current', '')
        if isinstance(ghcid, str) and len(ghcid) >= 2:
            country = ghcid[:2]

    return city, country


def geocode_file(filepath: Path, geonames: GeoNamesLookup, dry_run: bool = False) -> dict:
    """
    Geocode a single custodian file using GeoNames.

    Args:
        filepath: YAML file to read and (unless *dry_run*) rewrite in place.
        geonames: Open lookup used to resolve the city.
        dry_run: When True, nothing is written; the result still reports
            ``geocoded`` for files that would have been updated.

    Returns:
        Dictionary with results:
        - success: bool
        - geocoded: bool (True if coordinates were added)
        - already_has_coords: bool
        - city: extracted city or None
        - country: extracted country or None
        - error: str or None
    """
    result = {
        'success': False,
        'geocoded': False,
        'already_has_coords': False,
        'city': None,
        'country': None,
        'error': None
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.load(f)

        if not isinstance(data, dict):
            result['error'] = "Invalid YAML structure"
            return result

        # A null ``location`` is treated as missing; any other non-mapping
        # value is reported instead of being overwritten.
        loc = data.get('location')
        if loc is not None and not isinstance(loc, dict):
            result['error'] = "Invalid 'location' block (expected a mapping)"
            return result

        # Check if already has coordinates
        if loc and loc.get('latitude') is not None and loc.get('longitude') is not None:
            result['success'] = True
            result['already_has_coords'] = True
            return result

        # Extract city and country
        city, country = extract_city_country(data)
        result['city'] = city
        result['country'] = country

        if not city or not country:
            result['error'] = f"Missing city ({city}) or country ({country})"
            result['success'] = True  # Not an error, just no data to geocode
            return result

        # Look up in GeoNames
        geo_result = geonames.lookup_city(city, country)

        if not geo_result:
            result['error'] = f"City not found in GeoNames: {city}, {country}"
            result['success'] = True  # Not a fatal error
            return result

        # Update location block with coordinates
        if loc is None:
            loc = {}
            data['location'] = loc

        # One timestamp for the whole update keeps the two fields consistent.
        now = datetime.now(timezone.utc).isoformat()

        loc['latitude'] = geo_result['latitude']
        loc['longitude'] = geo_result['longitude']
        loc['coordinate_provenance'] = {
            'source_type': 'GEONAMES_LOCAL',
            'source_path': 'data/reference/geonames.db',
            'entity_id': geo_result['geonames_id'],
            'original_timestamp': now
        }

        # Add geonames reference if not present
        for key in ('geonames_id', 'geonames_name', 'feature_code'):
            if not loc.get(key):
                loc[key] = geo_result[key]

        # Update normalization timestamp
        loc['normalization_timestamp'] = now

        if not dry_run:
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f)

        result['success'] = True
        result['geocoded'] = True
        return result

    except Exception as e:
        # Per-file boundary: one unreadable file must not abort the batch.
        result['error'] = str(e)
        return result


def _process_files(files, geonames: GeoNamesLookup, dry_run: bool, verbose: bool):
    """Geocode every file and return ``(stats, errors, not_found)``."""
    stats = {
        'total': len(files),
        'geocoded': 0,
        'already_has_coords': 0,
        'no_city_data': 0,
        'not_found': 0,
        'errors': 0,
        'by_country': {}
    }
    errors = []
    not_found = []

    for i, filepath in enumerate(files):
        result = geocode_file(filepath, geonames, dry_run=dry_run)

        # Extract country from filename
        country = filepath.name[:2]
        by_country = stats['by_country'].setdefault(
            country, {'geocoded': 0, 'not_found': 0})

        # Outcomes are classified by the message prefixes set in geocode_file.
        error = result['error']
        if result['geocoded']:
            stats['geocoded'] += 1
            by_country['geocoded'] += 1
        elif result['already_has_coords']:
            stats['already_has_coords'] += 1
        elif error and 'Missing city' in error:
            stats['no_city_data'] += 1
        elif error and 'not found in GeoNames' in error:
            stats['not_found'] += 1
            by_country['not_found'] += 1
            if len(not_found) < _MAX_NOT_FOUND_KEPT:
                not_found.append((filepath.name, result['city'], result['country']))
        elif error:
            stats['errors'] += 1
            if len(errors) < _MAX_ERRORS_KEPT:
                errors.append((filepath.name, error))

        if verbose:
            if result['geocoded']:
                status = "GEOCODED"
            elif result['already_has_coords']:
                status = "SKIP"
            else:
                status = "FAIL"
            print(f"[{i+1}/{len(files)}] {filepath.name}: {status}")
        elif (i + 1) % 1000 == 0:
            print(f"Processed {i+1}/{len(files)} files... (geocoded: {stats['geocoded']})")

    return stats, errors, not_found


def _print_summary(stats: dict, errors: list, not_found: list, dry_run: bool) -> None:
    """Print the end-of-run report."""
    print("\n" + "=" * 60)
    print("GEOCODING SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {stats['total']}")
    print(f"Already had coordinates: {stats['already_has_coords']}")
    print(f"Successfully geocoded: {stats['geocoded']}")
    print(f"No city data available: {stats['no_city_data']}")
    print(f"City not found in GeoNames: {stats['not_found']}")
    print(f"Errors: {stats['errors']}")

    if stats['by_country']:
        print("\nResults by country:")
        for country, data in sorted(stats['by_country'].items(),
                                    key=lambda x: -x[1]['geocoded']):
            if data['geocoded'] > 0 or data['not_found'] > 0:
                print(f" {country}: geocoded={data['geocoded']}, not_found={data['not_found']}")

    if not_found:
        # Report the number of rows actually listed, not the number kept.
        shown = not_found[:_MAX_NOT_FOUND_SHOWN]
        print(f"\nFirst {len(shown)} cities not found:")
        for filename, city, country in shown:
            print(f" {filename}: {city}, {country}")

    if errors:
        print(f"\nFirst {len(errors)} errors:")
        for filename, error in errors:
            print(f" {filename}: {error}")

    if dry_run:
        print("\n(DRY RUN - No files were modified)")


def main() -> int:
    """Command-line entry point; returns the process exit code."""
    parser = argparse.ArgumentParser(
        description="Geocode missing coordinates using GeoNames database"
    )
    parser.add_argument('--dry-run', action='store_true', help="Preview without writing")
    parser.add_argument('--country', type=str,
                        help="Only process specific country code (e.g., JP)")
    parser.add_argument('--limit', type=int, default=0,
                        help="Limit number of files to process")
    parser.add_argument('--all', action='store_true', help="Process all files (no limit)")
    parser.add_argument('--verbose', action='store_true', help="Show detailed output")

    args = parser.parse_args()

    if args.dry_run:
        print("DRY RUN - No files will be modified\n")

    # Initialize GeoNames lookup
    if not GEONAMES_DB.exists():
        print(f"Error: GeoNames database not found at {GEONAMES_DB}")
        return 1

    geonames = GeoNamesLookup(GEONAMES_DB)
    try:
        # Get list of files to process
        if args.country:
            pattern = f"{args.country.upper()}-*.yaml"
            files = sorted(CUSTODIAN_DIR.glob(pattern))
            print(f"Processing {args.country.upper()} files: {len(files)} found")
        else:
            files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
            print(f"Processing all files: {len(files)} found")

        if args.limit and not args.all:
            files = files[:args.limit]
            print(f"Limited to first {args.limit} files")

        stats, errors, not_found = _process_files(
            files, geonames, dry_run=args.dry_run, verbose=args.verbose)
    finally:
        # Release the database even if processing is interrupted.
        geonames.close()

    _print_summary(stats, errors, not_found, dry_run=args.dry_run)
    return 0


if __name__ == "__main__":
    sys.exit(main())