260 lines
9.4 KiB
Python
260 lines
9.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
Geocode custodian files that have city/address but no coordinates.

Uses Nominatim (free, no API key) to geocode based on:
1. street_address + city, restricted to the record's country
2. city only, restricted to the record's country (fallback when the record
   has no street address)

Rate limit: 1 request per second (Nominatim terms of service)
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import time
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
import httpx
|
|
import yaml
|
|
|
|
# Configure logging at import time: INFO and above, each line prefixed with a
# second-resolution timestamp and the level name in brackets.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Nominatim search endpoint that geocoding requests are sent to.
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"

# Value sent in the User-Agent header of every geocoding request.
USER_AGENT = "GLAM-Heritage-Geocoder/1.0 (https://bronhouder.nl)"
|
|
|
|
|
|
def geocode_address(address: str, city: str, country: str = "NL") -> dict | None:
|
|
"""Geocode an address using Nominatim.
|
|
|
|
Entity Resolution Safety:
|
|
- We geocode the LOCATION (city/address), NOT the institution name
|
|
- This is safe because we're finding the city's coordinates, not trying to
|
|
identify which business is at that location
|
|
- Institutions in the same city will get the same (or nearby) coordinates
|
|
- Street address provides more precision when available
|
|
"""
|
|
# Build search query
|
|
query_parts = []
|
|
if address:
|
|
query_parts.append(address)
|
|
if city:
|
|
query_parts.append(city)
|
|
|
|
query = ", ".join(query_parts)
|
|
|
|
params = {
|
|
"q": query,
|
|
"countrycodes": country.lower(),
|
|
"format": "json",
|
|
"limit": 1,
|
|
"addressdetails": 1,
|
|
}
|
|
|
|
headers = {"User-Agent": USER_AGENT}
|
|
|
|
try:
|
|
with httpx.Client(timeout=10.0) as client:
|
|
resp = client.get(NOMINATIM_URL, params=params, headers=headers)
|
|
resp.raise_for_status()
|
|
results = resp.json()
|
|
|
|
if results:
|
|
result = results[0]
|
|
address_details = result.get("address", {})
|
|
|
|
# Validate: check if result is in the expected city
|
|
# Nominatim may return a different city if the query is ambiguous
|
|
result_city = (
|
|
address_details.get("city") or
|
|
address_details.get("town") or
|
|
address_details.get("village") or
|
|
address_details.get("municipality") or
|
|
""
|
|
)
|
|
|
|
# Calculate confidence based on address specificity and city match
|
|
confidence = 0.5 # Base confidence for city-level geocoding
|
|
|
|
if address: # Has street address
|
|
confidence += 0.3
|
|
|
|
# Check if returned city matches expected city (case-insensitive)
|
|
city_match = city.lower() in result_city.lower() or result_city.lower() in city.lower()
|
|
if city_match:
|
|
confidence += 0.2
|
|
else:
|
|
# Log potential mismatch for review
|
|
logger.warning(
|
|
f"City mismatch: expected '{city}', got '{result_city}' "
|
|
f"for query '{query}' - verify manually"
|
|
)
|
|
confidence -= 0.2
|
|
|
|
return {
|
|
"latitude": float(result["lat"]),
|
|
"longitude": float(result["lon"]),
|
|
"display_name": result.get("display_name"),
|
|
"osm_id": result.get("osm_id"),
|
|
"osm_type": result.get("osm_type"),
|
|
"confidence": round(confidence, 2),
|
|
"result_city": result_city,
|
|
"city_match": city_match,
|
|
"geocode_query": query, # Store query for provenance
|
|
}
|
|
except Exception as e:
|
|
logger.warning(f"Geocoding failed for '{query}': {e}")
|
|
|
|
return None
|
|
|
|
|
|
def process_file(filepath: Path, dry_run: bool = False) -> bool:
    """Process a single custodian file, adding coordinates if missing.

    The file is loaded as YAML. It is skipped when it is not a mapping, has no
    ``location`` mapping, already has both ``latitude`` and ``longitude``, or
    has no ``city``. Otherwise the location is geocoded via
    ``geocode_address`` and, unless ``dry_run`` is set, the coordinates plus a
    ``coordinate_provenance`` record are written back to the same file.

    Args:
        filepath: Path of the custodian YAML file.
        dry_run: When true, geocode and log the result but do not modify the
            file.

    Returns:
        ``True`` when coordinates were found (and written, unless
        ``dry_run``); ``False`` when the file was skipped or geocoding
        returned nothing.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urlencode

    with open(filepath, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # Empty documents and non-mapping documents carry no location to geocode.
    if not isinstance(data, dict):
        return False

    location = data.get("location")
    if not isinstance(location, dict):
        return False

    # Skip if already has coordinates. Compare against "missing" values
    # explicitly so that a legitimate 0.0 is not mistaken for absent.
    has_coordinates = all(
        location.get(key) not in (None, "") for key in ("latitude", "longitude")
    )
    if has_coordinates:
        return False

    # Need city to geocode
    city = location.get("city")
    if not city:
        return False

    # Get address and country; `or` also covers keys present with a null value.
    street = location.get("street_address") or location.get("street") or ""
    country = location.get("country") or "NL"

    result = geocode_address(street, city, country)
    if not result:
        return False

    confidence_str = f" (confidence: {result['confidence']:.2f})"
    city_match_str = "" if result['city_match'] else " [CITY MISMATCH]"
    logger.info(f"Geocoded {filepath.name}: {city} -> ({result['latitude']}, {result['longitude']}){confidence_str}{city_match_str}")

    if dry_run:
        return True

    # `location` is the very dict stored under data["location"], so updating
    # it updates the document that is written back below.
    location["latitude"] = result["latitude"]
    location["longitude"] = result["longitude"]

    # Build provenance per Rule 35 (Dual Timestamps) and Rule 6 (XPath Provenance)
    # For geocoding, the "source" is the Nominatim API, not a webpage
    now = datetime.now(timezone.utc).isoformat()
    geocode_query = result.get("geocode_query") or (
        f"{street}, {city}" if street else city
    )
    # Percent-encode the query so commas, '&', '#' and non-ASCII characters
    # yield a valid URL.
    encoded_params = urlencode({"q": geocode_query, "format": "json"})

    location["coordinate_provenance"] = {
        # Core identification
        "source_type": "NOMINATIM_GEOCODE",
        "source_url": f"https://nominatim.openstreetmap.org/search?{encoded_params}",

        # Dual timestamps (Rule 35)
        "statement_created_at": now,  # When we extracted/created this claim
        "source_archived_at": now,  # API response is ephemeral, same as creation

        # Retrieval agent (Rule 35)
        "retrieval_agent": "geocode_missing_coordinates.py",

        # Geocoding specifics
        "geocode_query": geocode_query,
        "osm_id": result.get("osm_id"),
        "osm_type": result.get("osm_type"),
        "display_name": result.get("display_name"),

        # Confidence and validation
        "geocode_confidence": result.get("confidence"),
        "city_match": result.get("city_match"),
        "result_city": result.get("result_city"),

        # Note: No xpath for API responses - this is not web scraping
        # XPath provenance (Rule 6) applies to WebClaim/WebObservation only
    }

    # Write back
    with open(filepath, "w", encoding="utf-8") as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return True
|
|
|
|
|
|
def main():
    """Command-line entry point: geocode custodian files missing coordinates.

    Parses the CLI options, collects the files under ``--data-dir`` matching
    ``--pattern`` that have a city but lack coordinates, optionally truncates
    that list to ``--limit`` files, then calls ``process_file`` on each one,
    sleeping ``--rate-limit`` seconds between files. A summary is logged at
    the end.
    """
    parser = argparse.ArgumentParser(description="Geocode custodian files missing coordinates")
    parser.add_argument("--data-dir", type=Path, default=Path("data/custodian"),
                        help="Directory containing custodian YAML files")
    parser.add_argument("--pattern", type=str, default="NL-*.yaml",
                        help="Glob pattern for files to process")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of files to process")
    parser.add_argument("--dry-run", action="store_true",
                        help="Don't write changes, just show what would be done")
    parser.add_argument("--rate-limit", type=float, default=1.0,
                        help="Seconds between Nominatim requests (default: 1.0)")

    args = parser.parse_args()

    # Find files to process. Sorted so that --limit selects the same files on
    # every run (glob order is not guaranteed).
    files = sorted(args.data_dir.glob(args.pattern))
    logger.info(f"Found {len(files)} files matching {args.pattern}")

    # Filter to files needing geocoding
    to_geocode = []
    for f in files:
        try:
            with open(f, encoding="utf-8") as fp:
                data = yaml.safe_load(fp)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            # Report unreadable files instead of dropping them silently.
            logger.warning(f"Skipping unreadable file {f.name}: {exc}")
            continue

        location = data.get("location") if isinstance(data, dict) else None
        if not isinstance(location, dict):
            continue

        # Explicit "missing" test so that a coordinate of 0.0 counts as present.
        has_coordinates = all(
            location.get(key) not in (None, "") for key in ("latitude", "longitude")
        )
        if location.get("city") and not has_coordinates:
            to_geocode.append(f)

    logger.info(f"Files needing geocoding: {len(to_geocode)}")

    # `is not None` so that an explicit --limit 0 is honoured; negative values
    # are treated as 0 rather than slicing from the end of the list.
    if args.limit is not None:
        to_geocode = to_geocode[:max(args.limit, 0)]
        logger.info(f"Limited to {len(to_geocode)} files")

    # Process files
    geocoded = 0
    failed = 0

    for i, filepath in enumerate(to_geocode):
        try:
            succeeded = process_file(filepath, args.dry_run)
        except Exception:
            # Top-level boundary: one bad file must not abort the whole batch.
            logger.exception(f"Error processing {filepath.name}")
            succeeded = False

        if succeeded:
            geocoded += 1
        else:
            failed += 1

        # Rate limiting (no need to wait after the last file)
        if i < len(to_geocode) - 1:
            time.sleep(args.rate_limit)

        # Progress every 50 files
        if (i + 1) % 50 == 0:
            logger.info(f"Progress: {i + 1}/{len(to_geocode)} (geocoded: {geocoded})")

    logger.info("=" * 60)
    logger.info("Geocoding complete:")
    logger.info(f"  Files processed: {len(to_geocode)}")
    logger.info(f"  Successfully geocoded: {geocoded}")
    logger.info(f"  Failed/skipped: {failed}")
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|