#!/usr/bin/env python3
"""
Geocode custodian files that have city/address but no coordinates.

Uses Nominatim (free, no API key) to geocode based on:
1. street_address + city + country
2. city + country (fallback when no street address is recorded)

Rate limit: 1 request per second (Nominatim terms of service)
"""

import argparse
import logging
import time
from pathlib import Path
from datetime import datetime, timezone

import httpx
import yaml

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)

NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
USER_AGENT = "GLAM-Heritage-Geocoder/1.0 (https://bronhouder.nl)"


def _is_present(value: object) -> bool:
    """Return True unless *value* is None or an empty string.

    A plain truthiness test would treat a coordinate of 0.0 as missing.
    """
    return value is not None and value != ""


def _needs_geocoding(data: object) -> bool:
    """Return True when *data* has a location with a city but lacks coordinates.

    Anything that is not a mapping with a ``location`` mapping (empty file,
    top-level list, null location) is reported as not needing geocoding.
    """
    if not isinstance(data, dict):
        return False
    location = data.get("location")
    if not isinstance(location, dict) or not location.get("city"):
        return False
    has_coordinates = (
        _is_present(location.get("latitude"))
        and _is_present(location.get("longitude"))
    )
    return not has_coordinates


def _cities_match(expected: str, returned: str) -> bool:
    """Return True when one city name contains the other (case-insensitive).

    An empty name on either side never matches: the empty string is a
    substring of every string, so without this guard a result that carries
    no city information would be reported as a confirmed match.
    """
    expected_cf = expected.strip().casefold()
    returned_cf = returned.strip().casefold()
    if not expected_cf or not returned_cf:
        return False
    return expected_cf in returned_cf or returned_cf in expected_cf


def geocode_address(address: str, city: str, country: str = "NL") -> dict | None:
    """Geocode an address using Nominatim.

    Entity Resolution Safety:
    - We geocode the LOCATION (city/address), NOT the institution name
    - This is safe because we're finding the city's coordinates, not trying
      to identify which business is at that location
    - Institutions in the same city will get the same (or nearby) coordinates
    - Street address provides more precision when available

    Args:
        address: Street address; may be empty, in which case only the city
            is searched.
        city: City name to search for and to validate the result against.
        country: Country code passed to Nominatim as ``countrycodes``
            (lower-cased). A falsy value falls back to "NL".

    Returns:
        A dict with ``latitude``, ``longitude``, ``display_name``, ``osm_id``,
        ``osm_type``, ``confidence``, ``result_city`` and ``city_match``, or
        None when there is nothing to search for, no result was found, or
        the request/response handling failed (a warning is logged then).
    """
    # Build search query from whichever parts are available
    query = ", ".join(part for part in (address, city) if part)
    if not query:
        # Nothing to search for; do not spend a rate-limited request on it.
        return None

    params = {
        "q": query,
        "countrycodes": (country or "NL").lower(),
        "format": "json",
        "limit": 1,
        "addressdetails": 1,
    }
    headers = {"User-Agent": USER_AGENT}

    try:
        with httpx.Client(timeout=10.0) as client:
            resp = client.get(NOMINATIM_URL, params=params, headers=headers)
            resp.raise_for_status()
            results = resp.json()

        if not results:
            return None

        result = results[0]
        address_details = result.get("address", {})

        # Validate: check if result is in the expected city.
        # Nominatim may return a different city if the query is ambiguous.
        result_city = (
            address_details.get("city")
            or address_details.get("town")
            or address_details.get("village")
            or address_details.get("municipality")
            or ""
        )

        # Confidence: 0.5 base for city-level geocoding, +0.3 when a street
        # address was supplied, +/-0.2 depending on the city check.
        confidence = 0.5
        if address:
            confidence += 0.3

        city_match = _cities_match(city, result_city)
        if city_match:
            confidence += 0.2
        else:
            # Log potential mismatch for review (also covers results that
            # carry no city information at all).
            logger.warning(
                f"City mismatch: expected '{city}', got '{result_city}' "
                f"for query '{query}' - verify manually"
            )
            confidence -= 0.2

        return {
            "latitude": float(result["lat"]),
            "longitude": float(result["lon"]),
            "display_name": result.get("display_name"),
            "osm_id": result.get("osm_id"),
            "osm_type": result.get("osm_type"),
            "confidence": round(confidence, 2),
            "result_city": result_city,
            "city_match": city_match,
        }
    except Exception as e:
        # Deliberately broad: this is the per-lookup boundary of a long,
        # rate-limited batch, so a network error or malformed response for
        # one query is logged and reported as "no result" instead of
        # aborting the whole run.
        logger.warning(f"Geocoding failed for '{query}': {e}")

    return None


def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Process a single custodian file, adding coordinates if missing.

    Args:
        filepath: Path to the custodian YAML file.
        dry_run: When True, perform the lookup and log the outcome but do
            not modify the file.

    Returns:
        True when the file was geocoded and rewritten; False when it was
        skipped (no city, coordinates already present), when geocoding
        returned nothing, or when running in dry-run mode.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # Skip empty/malformed documents, files without a city, and files that
    # already have both coordinates.
    if not _needs_geocoding(data):
        return False

    location = data["location"]
    city = location["city"]

    # Get address and country; `or` also covers keys present with a null value
    street = location.get("street_address") or location.get("street") or ""
    country = location.get("country") or "NL"

    # Geocode
    result = geocode_address(street, city, country)
    if not result:
        return False

    confidence_str = f" (confidence: {result['confidence']:.2f})"
    city_match_str = "" if result['city_match'] else " [CITY MISMATCH]"
    logger.info(f"Geocoded {filepath.name}: {city} -> ({result['latitude']}, {result['longitude']}){confidence_str}{city_match_str}")

    if dry_run:
        return False

    # Update the data (location is the same mapping object held by `data`)
    location["latitude"] = result["latitude"]
    location["longitude"] = result["longitude"]
    location["coordinate_provenance"] = {
        "source_type": "NOMINATIM_GEOCODE",
        "geocode_query": f"{street}, {city}" if street else city,
        "geocode_timestamp": datetime.now(timezone.utc).isoformat(),
        "osm_id": result.get("osm_id"),
        "osm_type": result.get("osm_type"),
        "display_name": result.get("display_name"),
        "geocode_confidence": result.get("confidence"),
        "city_match": result.get("city_match"),
        "result_city": result.get("result_city"),
    }

    # Write back
    with open(filepath, "w", encoding="utf-8") as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return True


def main():
    """Parse command-line options and geocode every matching custodian file."""
    parser = argparse.ArgumentParser(description="Geocode custodian files missing coordinates")
    parser.add_argument("--data-dir", type=Path, default=Path("data/custodian"),
                        help="Directory containing custodian YAML files")
    parser.add_argument("--pattern", type=str, default="NL-*.yaml",
                        help="Glob pattern for files to process")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of files to process")
    parser.add_argument("--dry-run", action="store_true",
                        help="Don't write changes, just show what would be done")
    parser.add_argument("--rate-limit", type=float, default=1.0,
                        help="Seconds between Nominatim requests (default: 1.0)")
    args = parser.parse_args()

    # Find files to process; sorted so that --limit selects a stable subset
    files = sorted(args.data_dir.glob(args.pattern))
    logger.info(f"Found {len(files)} files matching {args.pattern}")

    # Filter to files needing geocoding
    to_geocode = []
    for f in files:
        try:
            with open(f, encoding="utf-8") as fp:
                data = yaml.safe_load(fp)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
            # Unreadable or unparsable files are skipped, but visibly so.
            logger.warning(f"Skipping unreadable file {f.name}: {e}")
            continue
        if _needs_geocoding(data):
            to_geocode.append(f)

    logger.info(f"Files needing geocoding: {len(to_geocode)}")

    # `is not None` so that an explicit --limit 0 is honoured
    if args.limit is not None:
        to_geocode = to_geocode[:args.limit]
        logger.info(f"Limited to {len(to_geocode)} files")

    # Process files
    geocoded = 0
    failed = 0
    total = len(to_geocode)

    for i, filepath in enumerate(to_geocode):
        if process_file(filepath, args.dry_run):
            geocoded += 1
        else:
            failed += 1

        # Rate limiting (no need to wait after the last file)
        if i < total - 1:
            time.sleep(args.rate_limit)

        # Progress every 50 files
        if (i + 1) % 50 == 0:
            logger.info(f"Progress: {i + 1}/{total} (geocoded: {geocoded})")

    logger.info("=" * 60)
    logger.info("Geocoding complete:")
    logger.info(f"  Files processed: {total}")
    logger.info(f"  Successfully geocoded: {geocoded}")
    logger.info(f"  Failed/skipped: {failed}")


if __name__ == "__main__":
    main()