# glam/enrich_geocoding.py
# Last modified: 2025-11-19 23:25:22 +01:00
# 351 lines, 12 KiB, Python
#!/usr/bin/env python3
"""
Brazilian GLAM Geocoding Enrichment
====================================
Uses Nominatim API to geocode institutions by name + state to add city-level
location data and geographic coordinates.
Features:
- Rate limiting (1 req/sec for Nominatim)
- Caching to avoid duplicate lookups
- Fallback strategies for failed lookups
- Progress tracking and detailed reporting
"""
import copy
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote

import requests
import yaml
# File paths (absolute; adjust when running outside the original workstation)
INPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_curated_v2.yaml")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_geocoded_v3.yaml")
CACHE_FILE = Path("/Users/kempersc/apps/glam/data/cache/geocoding_cache.yaml")

# Nominatim API settings.
# NOTE: Nominatim's usage policy requires a descriptive User-Agent and
# at most ~1 request per second, hence the delay slightly above 1s.
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
USER_AGENT = "GLAM-Data-Extraction/0.2.0 (heritage research project)"
REQUEST_DELAY = 1.1 # Seconds between requests (Nominatim requires 1/sec max)

# Brazilian state code mapping: upper-case full state name -> 2-letter code
# (used to normalize the 'region' field before building search queries).
STATE_CODES: Dict[str, str] = {
    'ACRE': 'AC', 'ALAGOAS': 'AL', 'AMAPÁ': 'AP', 'AMAZONAS': 'AM',
    'BAHIA': 'BA', 'CEARÁ': 'CE', 'DISTRITO FEDERAL': 'DF', 'ESPÍRITO SANTO': 'ES',
    'GOIÁS': 'GO', 'MARANHÃO': 'MA', 'MATO GROSSO': 'MT', 'MATO GROSSO DO SUL': 'MS',
    'MINAS GERAIS': 'MG', 'PARÁ': 'PA', 'PARAÍBA': 'PB', 'PARANÁ': 'PR',
    'PERNAMBUCO': 'PE', 'PIAUÍ': 'PI', 'RIO DE JANEIRO': 'RJ', 'RIO GRANDE DO NORTE': 'RN',
    'RIO GRANDE DO SUL': 'RS', 'RONDÔNIA': 'RO', 'RORAIMA': 'RR',
    'SANTA CATARINA': 'SC', 'SÃO PAULO': 'SP', 'SERGIPE': 'SE', 'TOCANTINS': 'TO'
}
class GeocodingCache:
    """YAML-backed lookup cache for geocoding results.

    Values may be ``None`` to record a failed lookup. Every write is
    persisted to disk immediately so interrupted runs lose no work.
    """

    def __init__(self, cache_file: Path):
        self.cache_file = cache_file
        self.cache = self._load_cache()

    def _load_cache(self) -> Dict:
        """Read the cache file; return an empty dict when absent or empty."""
        if not self.cache_file.exists():
            return {}
        with open(self.cache_file, 'r', encoding='utf-8') as fh:
            return yaml.safe_load(fh) or {}

    def _save_cache(self):
        """Write the entire cache to disk, creating parent dirs as needed."""
        self.cache_file.parent.mkdir(parents=True, exist_ok=True)
        with open(self.cache_file, 'w', encoding='utf-8') as fh:
            yaml.dump(self.cache, fh, default_flow_style=False, allow_unicode=True)

    def get(self, key: str) -> Optional[Dict]:
        """Return the cached result for *key*, or None when not present."""
        return self.cache.get(key)

    def set(self, key: str, value: Optional[Dict]):
        """Store a result (None marks a failed lookup) and persist at once."""
        self.cache[key] = value
        self._save_cache()
def geocode_location(query: str, state: str, cache: GeocodingCache) -> Optional[Dict]:
    """
    Geocode a location using Nominatim API.

    Args:
        query: Institution name or location description
        state: Brazilian state name or code
        cache: Geocoding cache

    Returns:
        Dict with 'city', 'latitude', 'longitude', 'display_name' or None
    """
    # Generate cache key
    cache_key = f"{query}|{state}|BR"

    # Check cache first. BUGFIX: use a membership test rather than
    # truthiness -- failed lookups are cached as None, and `if cached:`
    # treated them as cache misses, re-querying the API on every run.
    if cache_key in cache.cache:
        print(f" [CACHE] {query}, {state}")
        return cache.get(cache_key)

    # Build search query: "<name>, <state code>, Brazil"
    state_code = STATE_CODES.get(state.upper(), state)
    search_query = f"{query}, {state_code}, Brazil"
    params = {
        'q': search_query,
        'format': 'json',
        'limit': 1,
        'countrycodes': 'br',
        'addressdetails': 1
    }
    headers = {
        'User-Agent': USER_AGENT
    }

    try:
        print(f" [API] Geocoding: {search_query}")
        time.sleep(REQUEST_DELAY)  # Rate limiting (Nominatim: max 1 req/sec)
        response = requests.get(NOMINATIM_URL, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        results = response.json()

        if not results:
            print(f" [FAIL] No results for: {search_query}")
            # Cache null result to avoid re-querying
            cache.set(cache_key, None)
            return None

        result = results[0]
        address = result.get('address', {})

        # Extract city from address components -- Nominatim uses different
        # keys depending on the settlement type of the match.
        city = (
            address.get('city') or
            address.get('town') or
            address.get('municipality') or
            address.get('village') or
            address.get('suburb')
        )
        if not city:
            print(f" [WARN] No city found in result for: {search_query}")
            cache.set(cache_key, None)
            return None

        geo_data = {
            'city': city,
            'latitude': float(result['lat']),
            'longitude': float(result['lon']),
            'display_name': result.get('display_name', ''),
            'osm_type': result.get('osm_type'),
            'osm_id': result.get('osm_id')
        }
        print(f" [SUCCESS] Found: {city} ({geo_data['latitude']:.4f}, {geo_data['longitude']:.4f})")

        # Cache successful result
        cache.set(cache_key, geo_data)
        return geo_data

    except requests.RequestException as e:
        # Deliberately NOT cached: transient network/HTTP failures should
        # be retried on the next run.
        print(f" [ERROR] API request failed: {e}")
        return None
    except (KeyError, ValueError) as e:
        print(f" [ERROR] Failed to parse result: {e}")
        return None
def enrich_record_with_geocoding(record: Dict, cache: GeocodingCache) -> Dict:
"""
Enrich a record with geocoding data.
Strategy:
1. Skip if already has city
2. Try geocoding institution name + state
3. Update location with city + coordinates
"""
enriched = record.copy()
# Check if already has city
if enriched.get('locations'):
for loc in enriched['locations']:
if loc.get('city'):
return enriched # Already has city, skip
# Get state from location
state = None
if enriched.get('locations') and enriched['locations']:
state = enriched['locations'][0].get('region')
if not state:
return enriched # No state info, can't geocode
# Try geocoding with institution name
inst_name = enriched.get('name', '')
geo_data = geocode_location(inst_name, state, cache)
if geo_data:
# Update location with geocoding data
if 'locations' not in enriched or not enriched['locations']:
enriched['locations'] = [{'country': 'BR', 'region': state}]
# Update first location
enriched['locations'][0].update({
'city': geo_data['city'],
'latitude': geo_data['latitude'],
'longitude': geo_data['longitude']
})
# Add OSM identifiers if not already present
osm_id = f"{geo_data['osm_type']}/{geo_data['osm_id']}"
osm_url = f"https://www.openstreetmap.org/{geo_data['osm_type']}/{geo_data['osm_id']}"
if 'identifiers' not in enriched:
enriched['identifiers'] = []
# Check if OSM ID already exists
existing_osm = any(
id.get('identifier_scheme') == 'OpenStreetMap'
for id in enriched.get('identifiers', [])
)
if not existing_osm:
enriched['identifiers'].append({
'identifier_scheme': 'OpenStreetMap',
'identifier_value': osm_id,
'identifier_url': osm_url
})
# Update provenance
if 'provenance' in enriched:
enriched['provenance']['extraction_date'] = datetime.now(timezone.utc).isoformat()
enriched['provenance']['extraction_method'] = (
enriched['provenance'].get('extraction_method', '') +
' + Nominatim geocoding'
)
# Increase confidence slightly for geocoded records
current_confidence = enriched['provenance'].get('confidence_score', 0.7)
enriched['provenance']['confidence_score'] = min(0.85, current_confidence + 0.05)
return enriched
def main():
    """Main geocoding enrichment workflow.

    Loads curated records, geocodes those without city-level data via
    Nominatim (rate-limited, cached), writes the enriched YAML, and
    prints summary statistics against a 60% city-coverage target.
    """
    print("=" * 70)
    print("Brazilian GLAM Geocoding Enrichment - v3.0")
    print("=" * 70)
    print()

    # Load input data
    print(f"Loading records from: {INPUT_FILE}")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        records = yaml.safe_load(f)
    print(f"Loaded {len(records)} records")

    # Initialize cache
    cache = GeocodingCache(CACHE_FILE)
    print(f"Geocoding cache: {len(cache.cache)} entries")
    print()

    # Count records needing geocoding (no location with a city yet)
    needs_geocoding = sum(
        1 for r in records
        if not (r.get('locations') and any(loc.get('city') for loc in r['locations']))
    )
    print(f"Records needing geocoding: {needs_geocoding}/{len(records)}")
    print(f"Estimated time: ~{needs_geocoding * REQUEST_DELAY / 60:.1f} minutes")
    print()

    # Enrich records
    print("=" * 70)
    print("Geocoding Records")
    print("=" * 70)
    enriched_records = []
    success_count = 0
    failed_count = 0
    skipped_count = 0
    for i, record in enumerate(records, 1):
        print(f"\n[{i}/{len(records)}] {record.get('name', 'Unknown')}")
        # Check if already has city
        has_city = record.get('locations') and any(loc.get('city') for loc in record['locations'])
        if has_city:
            print(" [SKIP] Already has city")
            enriched_records.append(record)
            skipped_count += 1
            continue
        # Enrich with geocoding
        enriched = enrich_record_with_geocoding(record, cache)
        # Check if geocoding succeeded
        geocoded = enriched.get('locations') and any(loc.get('city') for loc in enriched['locations'])
        if geocoded:
            success_count += 1
        else:
            failed_count += 1
        enriched_records.append(enriched)

    # Save enriched output
    print("\n" + "=" * 70)
    print("Saving Enriched Output")
    print("=" * 70)
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(enriched_records, f,
                  default_flow_style=False,
                  allow_unicode=True,
                  sort_keys=False,
                  indent=2)
    print(f"✓ Saved {len(enriched_records)} enriched records to:")
    print(f" {OUTPUT_FILE}")

    # Final statistics
    print("\n" + "=" * 70)
    print("Geocoding Complete")
    print("=" * 70)
    total = len(enriched_records)
    total_with_cities = sum(
        1 for r in enriched_records
        if r.get('locations') and any(loc.get('city') for loc in r['locations'])
    )
    total_with_coords = sum(
        1 for r in enriched_records
        if r.get('locations') and any(loc.get('latitude') for loc in r['locations'])
    )
    print(f"Records processed: {total}")
    print(f"Already had cities: {skipped_count}")
    print(f"Successfully geocoded: {success_count}")
    print(f"Failed geocoding: {failed_count}")
    print()
    # BUGFIX: guard the percentage lines against an empty input file
    # (ZeroDivisionError), and derive the 60% target from the actual
    # record count instead of the stale hard-coded 58. The original also
    # printed an empty string in BOTH branches of the target check, so
    # make the pass/fail mark visible.
    if total:
        print(f"Total with cities: {total_with_cities} ({total_with_cities/total*100:.1f}%)")
        print(f"Total with coordinates: {total_with_coords} ({total_with_coords/total*100:.1f}%)")
        print()
        target = total * 0.6
        mark = '✓' if total_with_cities >= target else '✗'
        print(f"Target achieved: {mark} (60% = {target:.0f} records)")
    print()


if __name__ == "__main__":
    main()