- BG: Add lat/lon from existing GeoNames IDs (28 files) - EG: Map city codes to GeoNames (CAI→Cairo, ALX→Alexandria, etc.) (28 files) - Fix malformed EG-IS-\`A\`-O-SCA.yaml → EG-IS-ISM-O-SCA.yaml - Overall coverage: 96.4% → 96.6%
160 lines
5 KiB
Python
Executable file
160 lines
5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Add lat/lon coordinates to custodian files that have geonames_id but missing coordinates.
|
|
|
|
Usage:
|
|
python scripts/geocode_from_geonames_id.py --country BG
|
|
python scripts/geocode_from_geonames_id.py --country EG
|
|
python scripts/geocode_from_geonames_id.py # Process all countries
|
|
"""
|
|
|
|
import sqlite3
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from ruamel.yaml import YAML
|
|
|
|
# Input locations. Both paths are relative, so they resolve against the
# directory the script is launched from.
GEONAMES_DB = Path("data/reference/geonames.db")
CUSTODIAN_DIR = Path("data/custodian")

# One shared ruamel.yaml instance is used for every load and dump in this
# module. Quote preservation and a very large line width are set so that
# rewriting a file changes as little of its existing formatting as possible.
yaml = YAML()
yaml.preserve_quotes = True
yaml.width = 4096
|
|
|
|
def get_coords_from_geonames(conn: sqlite3.Connection, geonames_id: int) -> tuple[float, float] | None:
|
|
"""Get lat/lon for a GeoNames ID."""
|
|
cursor = conn.execute(
|
|
"SELECT latitude, longitude FROM cities WHERE geonames_id = ?",
|
|
(geonames_id,)
|
|
)
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return row[0], row[1]
|
|
return None
|
|
|
|
|
|
def process_file(filepath: Path, conn: sqlite3.Connection) -> bool:
    """Add coordinates to one custodian YAML file using its GeoNames ID.

    The GeoNames ID is taken from ``location.geonames_id`` and, failing that,
    from ``ghcid.location_resolution.geonames_id``. When coordinates are found
    the ``location`` block gains ``latitude``, ``longitude``,
    ``geocoding_timestamp`` (current UTC time, ISO 8601) and
    ``geocoding_method``, and the file is rewritten in place.

    Args:
        filepath: Path of the YAML file to read and possibly rewrite.
        conn: Open SQLite connection passed to ``get_coords_from_geonames``.

    Returns:
        True if the file was rewritten; False if it is empty, already has
        both coordinates, has no GeoNames ID, or the ID is not in the DB.
    """
    import io

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.load(f)

    if not data:
        return False

    # A key that is present but empty (`location:`) loads as None, so
    # `.get(key, {})` would hand back None; `or {}` covers both cases.
    location = data.get('location') or {}

    # NOTE(review): this is a truthiness test, so a stored 0 / 0.0 coordinate
    # counts as missing and the file is geocoded again - confirm intended.
    if location.get('latitude') and location.get('longitude'):
        return False  # Already has coordinates

    # Prefer the ID in the location block, then ghcid.location_resolution.
    geonames_id = location.get('geonames_id')
    if not geonames_id:
        ghcid = data.get('ghcid') or {}
        loc_res = ghcid.get('location_resolution') or {}
        geonames_id = loc_res.get('geonames_id')

    if not geonames_id:
        return False

    coords = get_coords_from_geonames(conn, geonames_id)
    if not coords:
        print(f" Warning: GeoNames ID {geonames_id} not found in DB for {filepath.name}")
        return False

    lat, lon = coords

    # Create the block when it is absent *or* null; a bare membership test
    # would leave a null block in place and the assignments below would fail.
    if data.get('location') is None:
        data['location'] = {}

    target = data['location']
    target['latitude'] = lat
    target['longitude'] = lon
    target['geocoding_timestamp'] = datetime.now(timezone.utc).isoformat()
    target['geocoding_method'] = 'GEONAMES_ID_LOOKUP'

    # Serialise into memory first: opening the file with 'w' truncates it
    # immediately, so a failure inside dump() would otherwise destroy the
    # original content.
    buffer = io.StringIO()
    yaml.dump(data, buffer)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(buffer.getvalue())

    return True
|
|
|
|
|
|
def main():
    """Command-line entry point: geocode custodian files from GeoNames IDs.

    Options:
        --country CODE  Only process files named ``CODE-*.yaml``; without it
                        every ``*.yaml`` file in CUSTODIAN_DIR is processed.
        --dry-run       Report which files would be updated without writing.

    Exits with status 1 if the GeoNames database file does not exist.
    Per-file failures are printed and counted; they do not stop the run.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Add coordinates from GeoNames IDs')
    parser.add_argument('--country', type=str, help='Country code to process (e.g., BG, EG)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    args = parser.parse_args()

    if not GEONAMES_DB.exists():
        print(f"Error: GeoNames DB not found at {GEONAMES_DB}")
        sys.exit(1)

    updated = 0
    skipped = 0
    no_geonames = 0
    errors = 0

    conn = sqlite3.connect(GEONAMES_DB)
    try:
        pattern = f"{args.country}-*.yaml" if args.country else "*.yaml"

        # glob() yields entries in arbitrary order; sort for reproducible output.
        files = sorted(CUSTODIAN_DIR.glob(pattern))
        print(f"Found {len(files)} files matching {pattern}")

        for filepath in files:
            # Skip subdirectories
            if not filepath.is_file():
                continue

            try:
                if not args.dry_run:
                    if process_file(filepath, conn):
                        print(f"Updated: {filepath.name}")
                        updated += 1
                    else:
                        skipped += 1
                    continue

                # Dry run: apply the same checks process_file() makes, but
                # never write anything.
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = yaml.load(f)
                if not data:
                    continue

                # Keys that are present but null load as None; `or {}` keeps
                # the lookups below from failing on them.
                location = data.get('location') or {}

                # Same test as process_file(): a file counts as geocoded only
                # when both coordinates are set.
                if location.get('latitude') and location.get('longitude'):
                    skipped += 1
                    continue

                ghcid = data.get('ghcid') or {}
                loc_res = ghcid.get('location_resolution') or {}
                geonames_id = location.get('geonames_id') or loc_res.get('geonames_id')

                coords = get_coords_from_geonames(conn, geonames_id) if geonames_id else None
                if coords:
                    print(f"Would update: {filepath.name} -> ({coords[0]}, {coords[1]})")
                    updated += 1
                else:
                    # No ID in the file, or the ID is not in the database.
                    no_geonames += 1
            except Exception as e:
                # Keep going so that one bad file does not abort the batch.
                print(f"Error processing {filepath.name}: {e}")
                errors += 1
    finally:
        # Release the database handle even if something above raised.
        conn.close()

    print("\nSummary:")
    print(f" Updated: {updated}")
    # NOTE: outside --dry-run this count is every file process_file() declined
    # to change (empty, already geocoded, no GeoNames ID, or ID not in the DB).
    print(f" Skipped (already has coords): {skipped}")
    if no_geonames:
        print(f" No GeoNames ID: {no_geonames}")
    if errors:
        print(f" Errors: {errors}")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|