glam/scripts/geocode_missing_coordinates.py

191 lines
6 KiB
Python

#!/usr/bin/env python3
"""
Geocode custodian files that have city/address but no coordinates.
Uses Nominatim (free, no API key) to geocode based on:
1. street_address + city + country
2. city + country (fallback)
Rate limit: 1 request per second (Nominatim terms of service)
"""
import argparse
import logging
import time
from pathlib import Path
from datetime import datetime, timezone
import httpx
import yaml
# Nominatim search endpoint queried for every geocoding request.
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
# Value sent in the User-Agent header of each request to identify this script.
USER_AGENT = "GLAM-Heritage-Geocoder/1.0 (https://bronhouder.nl)"

# Script-wide logging: timestamped, level-tagged lines at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)
def geocode_address(address: str, city: str, country: str = "NL") -> dict | None:
"""Geocode an address using Nominatim."""
# Build search query
query_parts = []
if address:
query_parts.append(address)
if city:
query_parts.append(city)
query = ", ".join(query_parts)
params = {
"q": query,
"countrycodes": country.lower(),
"format": "json",
"limit": 1,
"addressdetails": 1,
}
headers = {"User-Agent": USER_AGENT}
try:
with httpx.Client(timeout=10.0) as client:
resp = client.get(NOMINATIM_URL, params=params, headers=headers)
resp.raise_for_status()
results = resp.json()
if results:
result = results[0]
return {
"latitude": float(result["lat"]),
"longitude": float(result["lon"]),
"display_name": result.get("display_name"),
"osm_id": result.get("osm_id"),
"osm_type": result.get("osm_type"),
}
except Exception as e:
logger.warning(f"Geocoding failed for '{query}': {e}")
return None
def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Process a single custodian file, adding coordinates if missing.

    The file is parsed as YAML. If its ``location`` mapping has a ``city``
    but lacks ``latitude``/``longitude``, the location is geocoded via
    ``geocode_address`` and the coordinates plus a ``coordinate_provenance``
    record are stored under ``location``.

    Args:
        filepath: Path of the custodian YAML file.
        dry_run: When true, geocode and log the result but do not modify
            the file.

    Returns:
        ``True`` if the location was geocoded (also in dry-run mode),
        ``False`` if the file was skipped or geocoding produced no result.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # Empty documents and non-mapping documents (list, scalar) have no location.
    if not isinstance(data, dict):
        return False
    location = data.get("location")
    if not isinstance(location, dict):
        return False

    # Skip if already has coordinates. Compare against None rather than
    # truthiness so a stored 0 / 0.0 is not mistaken for "missing".
    if location.get("latitude") is not None and location.get("longitude") is not None:
        return False

    # Need city to geocode
    city = location.get("city")
    if not city:
        return False

    # Get address and country; an explicit "country: null" also falls back to NL.
    street = location.get("street_address") or location.get("street") or ""
    country = location.get("country") or "NL"

    # Geocode
    result = geocode_address(street, city, country)
    if not result:
        return False

    logger.info(f"Geocoded {filepath.name}: {city} -> ({result['latitude']}, {result['longitude']})")
    if dry_run:
        return True

    # Update the data; `location` is the same object as data["location"].
    location["latitude"] = result["latitude"]
    location["longitude"] = result["longitude"]
    location["coordinate_provenance"] = {
        "source_type": "NOMINATIM_GEOCODE",
        "geocode_query": f"{street}, {city}" if street else city,
        "geocode_timestamp": datetime.now(timezone.utc).isoformat(),
        "osm_id": result.get("osm_id"),
        "osm_type": result.get("osm_type"),
        "display_name": result.get("display_name"),
    }

    # Serialize before opening the file for writing, so a serialization error
    # cannot leave a truncated file behind.
    serialized = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(serialized)
    return True
def _needs_geocoding(path: Path) -> bool:
    """Return True if the YAML file at *path* has a city but no coordinates."""
    try:
        with open(path, encoding="utf-8") as fp:
            data = yaml.safe_load(fp)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
        # Unreadable or malformed files are skipped, but no longer silently.
        logger.warning(f"Skipping unreadable file {path.name}: {exc}")
        return False

    if not isinstance(data, dict):
        return False
    location = data.get("location")
    if not isinstance(location, dict):
        return False

    # Compare against None so a stored 0 / 0.0 coordinate counts as present.
    has_coordinates = (
        location.get("latitude") is not None
        and location.get("longitude") is not None
    )
    return bool(location.get("city")) and not has_coordinates


def main():
    """Command-line entry point: geocode custodian files missing coordinates.

    Finds files under ``--data-dir`` matching ``--pattern``, keeps those that
    have a city but no coordinates, optionally truncates the list to
    ``--limit`` files, and runs ``process_file`` on each one, sleeping
    ``--rate-limit`` seconds between files. A summary is logged at the end.
    """
    parser = argparse.ArgumentParser(description="Geocode custodian files missing coordinates")
    parser.add_argument("--data-dir", type=Path, default=Path("data/custodian"),
                        help="Directory containing custodian YAML files")
    parser.add_argument("--pattern", type=str, default="NL-*.yaml",
                        help="Glob pattern for files to process")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of files to process")
    parser.add_argument("--dry-run", action="store_true",
                        help="Don't write changes, just show what would be done")
    parser.add_argument("--rate-limit", type=float, default=1.0,
                        help="Seconds between Nominatim requests (default: 1.0)")
    args = parser.parse_args()

    # Reject values that would otherwise misbehave later: a negative limit
    # slices from the end of the list, a negative sleep raises ValueError.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be non-negative")
    if args.rate_limit < 0:
        parser.error("--rate-limit must be non-negative")

    # Find files to process; sorted so --limit selects a deterministic subset.
    files = sorted(args.data_dir.glob(args.pattern))
    logger.info(f"Found {len(files)} files matching {args.pattern}")

    # Filter to files needing geocoding
    to_geocode = [path for path in files if _needs_geocoding(path)]
    logger.info(f"Files needing geocoding: {len(to_geocode)}")

    # "is not None" so that an explicit --limit 0 means "process nothing".
    if args.limit is not None:
        to_geocode = to_geocode[:args.limit]
        logger.info(f"Limited to {len(to_geocode)} files")

    # Process files
    geocoded = 0
    failed = 0
    for i, filepath in enumerate(to_geocode):
        if process_file(filepath, args.dry_run):
            geocoded += 1
        else:
            failed += 1

        # Rate limiting (no sleep needed after the last file)
        if i < len(to_geocode) - 1:
            time.sleep(args.rate_limit)

        # Progress every 50 files
        if (i + 1) % 50 == 0:
            logger.info(f"Progress: {i + 1}/{len(to_geocode)} (geocoded: {geocoded})")

    logger.info("=" * 60)
    logger.info("Geocoding complete:")
    logger.info(f" Files processed: {len(to_geocode)}")
    logger.info(f" Successfully geocoded: {geocoded}")
    logger.info(f" Failed/skipped: {failed}")
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()