#!/usr/bin/env python3
"""
Add lat/lon coordinates to custodian files that have geonames_id but missing coordinates.

Usage:
    python scripts/geocode_from_geonames_id.py --country BG
    python scripts/geocode_from_geonames_id.py --country EG
    python scripts/geocode_from_geonames_id.py  # Process all countries
    python scripts/geocode_from_geonames_id.py --dry-run  # Report only, write nothing
"""

import sqlite3
import sys
from pathlib import Path
from datetime import datetime, timezone

from ruamel.yaml import YAML

GEONAMES_DB = Path("data/reference/geonames.db")
CUSTODIAN_DIR = Path("data/custodian")

yaml = YAML()
yaml.preserve_quotes = True
yaml.width = 4096

# Per-file outcome labels shared by process_file() and main() so that the
# dry-run report and the real run classify every file identically.
_UPDATED = "updated"
_HAS_COORDS = "has_coords"
_NO_GEONAMES = "no_geonames"
_NO_DATA = "no_data"


def get_coords_from_geonames(conn: sqlite3.Connection, geonames_id: int) -> tuple[float, float] | None:
    """Get lat/lon for a GeoNames ID.

    Args:
        conn: Open connection to the GeoNames SQLite database.
        geonames_id: Value matched against ``cities.geonames_id``.

    Returns:
        ``(latitude, longitude)`` from the first matching row, or ``None``
        when there is no such row or either coordinate is NULL.
    """
    row = conn.execute(
        "SELECT latitude, longitude FROM cities WHERE geonames_id = ?",
        (geonames_id,),
    ).fetchone()
    # A row with a NULL coordinate is as useless as no row at all; reporting
    # it as "not found" prevents null coordinates being written to a file.
    if row is None or row[0] is None or row[1] is None:
        return None
    return row[0], row[1]


def _as_mapping(value):
    """Return *value* if it is a mapping, otherwise an empty dict.

    A YAML key with no value (``location:``) loads as ``None``, which
    ``dict.get(key, {})`` does not replace with the default.
    """
    return value if isinstance(value, dict) else {}


def _is_present(value) -> bool:
    """Return True unless *value* is ``None`` or an empty string.

    Deliberately not a truthiness test: ``0`` / ``0.0`` are valid coordinates.
    """
    return value is not None and value != ""


def _has_coords(location) -> bool:
    """Return True when *location* carries both a latitude and a longitude."""
    return _is_present(location.get('latitude')) and _is_present(location.get('longitude'))


def _find_geonames_id(data):
    """Return the GeoNames ID recorded in *data*, or ``None``.

    ``location.geonames_id`` takes precedence; the fallback is
    ``ghcid.location_resolution.geonames_id``.
    """
    location = _as_mapping(data.get('location'))
    if location.get('geonames_id'):
        return location['geonames_id']

    ghcid = _as_mapping(data.get('ghcid'))
    loc_res = _as_mapping(ghcid.get('location_resolution'))
    return loc_res.get('geonames_id') or None


def _process(filepath: Path, conn: sqlite3.Connection, *, dry_run: bool) -> tuple[str, tuple[float, float] | None]:
    """Classify one custodian file and, unless *dry_run*, write coordinates.

    Returns:
        ``(status, coords)`` where *status* is one of the module-level
        outcome labels and *coords* is the resolved ``(lat, lon)`` when the
        status is ``_UPDATED`` (otherwise ``None``).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.load(f)

    # Empty documents and non-mapping documents (lists, scalars) have
    # nothing to geocode.
    if not isinstance(data, dict) or not data:
        return _NO_DATA, None

    if _has_coords(_as_mapping(data.get('location'))):
        return _HAS_COORDS, None

    geonames_id = _find_geonames_id(data)
    if not geonames_id:
        return _NO_GEONAMES, None

    coords = get_coords_from_geonames(conn, geonames_id)
    if coords is None:
        print(f" Warning: GeoNames ID {geonames_id} not found in DB for {filepath.name}")
        return _NO_GEONAMES, None

    if dry_run:
        return _UPDATED, coords

    lat, lon = coords

    # Create the location block when it is absent *or* present but null.
    if not isinstance(data.get('location'), dict):
        data['location'] = {}
    location = data['location']
    location['latitude'] = lat
    location['longitude'] = lon
    location['geocoding_timestamp'] = datetime.now(timezone.utc).isoformat()
    location['geocoding_method'] = 'GEONAMES_ID_LOOKUP'

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f)

    return _UPDATED, coords


def process_file(filepath: Path, conn: sqlite3.Connection) -> bool:
    """Process a single custodian file. Returns True if updated.

    The file is rewritten in place only when it lacks coordinates, records a
    GeoNames ID, and that ID resolves to coordinates in the database.

    Args:
        filepath: Path to the custodian YAML file.
        conn: Open connection to the GeoNames SQLite database.

    Returns:
        True if coordinates were written, False if the file was left untouched.
    """
    status, _ = _process(filepath, conn, dry_run=False)
    return status == _UPDATED


def main():
    """Command-line entry point: geocode every matching custodian file.

    Exits with status 1 when the GeoNames database is missing. Errors on
    individual files are reported and counted without aborting the run.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Add coordinates from GeoNames IDs')
    parser.add_argument('--country', type=str, help='Country code to process (e.g., BG, EG)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    args = parser.parse_args()

    if not GEONAMES_DB.exists():
        print(f"Error: GeoNames DB not found at {GEONAMES_DB}")
        sys.exit(1)

    # Find files to process
    pattern = f"{args.country}-*.yaml" if args.country else "*.yaml"
    # Sorted so that output order is reproducible between runs.
    files = sorted(CUSTODIAN_DIR.glob(pattern))
    print(f"Found {len(files)} files matching {pattern}")

    counts = {_UPDATED: 0, _HAS_COORDS: 0, _NO_GEONAMES: 0, _NO_DATA: 0}
    errors = 0

    conn = sqlite3.connect(GEONAMES_DB)
    try:
        for filepath in files:
            # Skip subdirectories
            if not filepath.is_file():
                continue

            try:
                status, coords = _process(filepath, conn, dry_run=args.dry_run)
            except Exception as e:
                # Top-level boundary: one bad file must not stop the batch.
                print(f"Error processing {filepath.name}: {e}")
                errors += 1
                continue

            counts[status] += 1
            if status == _UPDATED:
                if args.dry_run:
                    print(f"Would update: {filepath.name} -> ({coords[0]}, {coords[1]})")
                else:
                    print(f"Updated: {filepath.name}")
    finally:
        conn.close()

    print("\nSummary:")
    print(f" Updated: {counts[_UPDATED]}")
    print(f" Skipped (already has coords): {counts[_HAS_COORDS]}")
    if counts[_NO_GEONAMES]:
        print(f" No GeoNames ID: {counts[_NO_GEONAMES]}")
    if counts[_NO_DATA]:
        print(f" Empty or non-mapping file: {counts[_NO_DATA]}")
    if errors:
        print(f" Errors: {errors}")


if __name__ == "__main__":
    main()