#!/usr/bin/env python3
"""
Brazilian GLAM Geocoding Enrichment
====================================

Uses Nominatim API to geocode institutions by name + state
to add city-level location data and geographic coordinates.

Features:
- Rate limiting (1 req/sec for Nominatim)
- Caching to avoid duplicate lookups (including cached failures)
- Fallback strategies for failed lookups
- Progress tracking and detailed reporting
"""

import copy
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote

import requests
import yaml

# File paths
INPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_curated_v2.yaml")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_geocoded_v3.yaml")
CACHE_FILE = Path("/Users/kempersc/apps/glam/data/cache/geocoding_cache.yaml")

# Nominatim API settings
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
USER_AGENT = "GLAM-Data-Extraction/0.2.0 (heritage research project)"
REQUEST_DELAY = 1.1  # Seconds between requests (Nominatim requires 1/sec max)

# Brazilian state name -> ISO 3166-2 state code
STATE_CODES = {
    'ACRE': 'AC', 'ALAGOAS': 'AL', 'AMAPÁ': 'AP', 'AMAZONAS': 'AM',
    'BAHIA': 'BA', 'CEARÁ': 'CE', 'DISTRITO FEDERAL': 'DF',
    'ESPÍRITO SANTO': 'ES', 'GOIÁS': 'GO', 'MARANHÃO': 'MA',
    'MATO GROSSO': 'MT', 'MATO GROSSO DO SUL': 'MS', 'MINAS GERAIS': 'MG',
    'PARÁ': 'PA', 'PARAÍBA': 'PB', 'PARANÁ': 'PR', 'PERNAMBUCO': 'PE',
    'PIAUÍ': 'PI', 'RIO DE JANEIRO': 'RJ', 'RIO GRANDE DO NORTE': 'RN',
    'RIO GRANDE DO SUL': 'RS', 'RONDÔNIA': 'RO', 'RORAIMA': 'RR',
    'SANTA CATARINA': 'SC', 'SÃO PAULO': 'SP', 'SERGIPE': 'SE',
    'TOCANTINS': 'TO'
}


class GeocodingCache:
    """Simple YAML-based cache for geocoding results.

    Failed lookups are stored as ``None`` so they can be recognized on
    later runs; use ``key in cache`` to distinguish a cached failure
    from a true cache miss.
    """

    def __init__(self, cache_file: Path):
        self.cache_file = cache_file
        self.cache = self._load_cache()

    def _load_cache(self) -> Dict:
        """Load cache from file (empty dict if the file is missing or empty)."""
        if self.cache_file.exists():
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f) or {}
        return {}

    def _save_cache(self):
        """Persist the cache to disk (called after every write)."""
        self.cache_file.parent.mkdir(parents=True, exist_ok=True)
        with open(self.cache_file, 'w', encoding='utf-8') as f:
            yaml.dump(self.cache, f, default_flow_style=False, allow_unicode=True)

    def __contains__(self, key: str) -> bool:
        """True if *key* has an entry — including a cached failure (None)."""
        return key in self.cache

    def get(self, key: str) -> Optional[Dict]:
        """Get cached geocoding result (None for misses AND cached failures)."""
        return self.cache.get(key)

    def set(self, key: str, value: Optional[Dict]):
        """Cache a geocoding result (including None for failed lookups)."""
        self.cache[key] = value
        self._save_cache()


def geocode_location(query: str, state: str, cache: GeocodingCache) -> Optional[Dict]:
    """
    Geocode a location using Nominatim API.

    Args:
        query: Institution name or location description
        state: Brazilian state name or code
        cache: Geocoding cache

    Returns:
        Dict with 'city', 'latitude', 'longitude', 'display_name' or None
    """
    # Generate cache key
    cache_key = f"{query}|{state}|BR"

    # Check cache first. BUG FIX: the original tested `if cached:`, which
    # treated a cached failure (None) as a cache miss and re-queried the
    # API for it on every run — defeating the "cache null result" intent.
    # A membership test distinguishes "never tried" from "tried and failed".
    if cache_key in cache:
        print(f"  [CACHE] {query}, {state}")
        return cache.get(cache_key)

    # Build search query; fall back to the raw state string if it is not
    # a known full state name (it may already be a two-letter code).
    state_code = STATE_CODES.get(state.upper(), state)
    search_query = f"{query}, {state_code}, Brazil"

    params = {
        'q': search_query,
        'format': 'json',
        'limit': 1,
        'countrycodes': 'br',
        'addressdetails': 1
    }
    headers = {
        'User-Agent': USER_AGENT
    }

    try:
        print(f"  [API] Geocoding: {search_query}")
        time.sleep(REQUEST_DELAY)  # Rate limiting (Nominatim usage policy)

        response = requests.get(NOMINATIM_URL, params=params, headers=headers, timeout=10)
        response.raise_for_status()

        results = response.json()

        if not results:
            print(f"  [FAIL] No results for: {search_query}")
            # Cache null result to avoid re-querying
            cache.set(cache_key, None)
            return None

        result = results[0]
        address = result.get('address', {})

        # Extract city from address components, most-specific first
        city = (
            address.get('city') or
            address.get('town') or
            address.get('municipality') or
            address.get('village') or
            address.get('suburb')
        )

        if not city:
            print(f"  [WARN] No city found in result for: {search_query}")
            cache.set(cache_key, None)
            return None

        geo_data = {
            'city': city,
            'latitude': float(result['lat']),
            'longitude': float(result['lon']),
            'display_name': result.get('display_name', ''),
            'osm_type': result.get('osm_type'),
            'osm_id': result.get('osm_id')
        }

        print(f"  [SUCCESS] Found: {city} ({geo_data['latitude']:.4f}, {geo_data['longitude']:.4f})")

        # Cache successful result
        cache.set(cache_key, geo_data)
        return geo_data

    except requests.RequestException as e:
        # Transient network/API failure: deliberately NOT cached, so the
        # lookup is retried on the next run.
        print(f"  [ERROR] API request failed: {e}")
        return None
    except (KeyError, ValueError) as e:
        print(f"  [ERROR] Failed to parse result: {e}")
        return None


def enrich_record_with_geocoding(record: Dict, cache: GeocodingCache) -> Dict:
    """
    Enrich a record with geocoding data.

    Strategy:
    1. Skip if already has city
    2. Try geocoding institution name + state
    3. Update location with city + coordinates

    Returns a new record; the input *record* is never mutated.
    """
    # BUG FIX: the original used a shallow record.copy() and then mutated
    # enriched['locations'][0] in place, silently modifying the caller's
    # record. A deep copy keeps the input pristine.
    enriched = copy.deepcopy(record)

    # Check if already has city
    if enriched.get('locations'):
        for loc in enriched['locations']:
            if loc.get('city'):
                return enriched  # Already has city, skip

    # Get state from location
    state = None
    if enriched.get('locations') and enriched['locations']:
        state = enriched['locations'][0].get('region')

    if not state:
        return enriched  # No state info, can't geocode

    # Try geocoding with institution name
    inst_name = enriched.get('name', '')
    geo_data = geocode_location(inst_name, state, cache)

    if geo_data:
        # Update location with geocoding data
        if 'locations' not in enriched or not enriched['locations']:
            enriched['locations'] = [{'country': 'BR', 'region': state}]

        # Update first location
        enriched['locations'][0].update({
            'city': geo_data['city'],
            'latitude': geo_data['latitude'],
            'longitude': geo_data['longitude']
        })

        # Add OSM identifiers if not already present
        osm_id = f"{geo_data['osm_type']}/{geo_data['osm_id']}"
        osm_url = f"https://www.openstreetmap.org/{geo_data['osm_type']}/{geo_data['osm_id']}"

        if 'identifiers' not in enriched:
            enriched['identifiers'] = []

        # Check if OSM ID already exists (renamed loop var: `id` shadowed builtin)
        existing_osm = any(
            ident.get('identifier_scheme') == 'OpenStreetMap'
            for ident in enriched.get('identifiers', [])
        )

        if not existing_osm:
            enriched['identifiers'].append({
                'identifier_scheme': 'OpenStreetMap',
                'identifier_value': osm_id,
                'identifier_url': osm_url
            })

        # Update provenance
        if 'provenance' in enriched:
            enriched['provenance']['extraction_date'] = datetime.now(timezone.utc).isoformat()
            enriched['provenance']['extraction_method'] = (
                enriched['provenance'].get('extraction_method', '') +
                ' + Nominatim geocoding'
            )
            # Increase confidence slightly for geocoded records (capped)
            current_confidence = enriched['provenance'].get('confidence_score', 0.7)
            enriched['provenance']['confidence_score'] = min(0.85, current_confidence + 0.05)

    return enriched


def main():
    """Main geocoding enrichment workflow."""
    print("=" * 70)
    print("Brazilian GLAM Geocoding Enrichment - v3.0")
    print("=" * 70)
    print()

    # Load input data
    print(f"Loading records from: {INPUT_FILE}")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        records = yaml.safe_load(f)
    print(f"Loaded {len(records)} records")

    # Initialize cache
    cache = GeocodingCache(CACHE_FILE)
    print(f"Geocoding cache: {len(cache.cache)} entries")
    print()

    # Count records needing geocoding
    needs_geocoding = sum(
        1 for r in records
        if not (r.get('locations') and any(loc.get('city') for loc in r['locations']))
    )
    print(f"Records needing geocoding: {needs_geocoding}/{len(records)}")
    print(f"Estimated time: ~{needs_geocoding * REQUEST_DELAY / 60:.1f} minutes")
    print()

    # Enrich records
    print("=" * 70)
    print("Geocoding Records")
    print("=" * 70)

    enriched_records = []
    success_count = 0
    failed_count = 0
    skipped_count = 0

    for i, record in enumerate(records, 1):
        print(f"\n[{i}/{len(records)}] {record.get('name', 'Unknown')}")

        # Check if already has city
        has_city = record.get('locations') and any(loc.get('city') for loc in record['locations'])

        if has_city:
            print("  [SKIP] Already has city")
            enriched_records.append(record)
            skipped_count += 1
            continue

        # Enrich with geocoding
        enriched = enrich_record_with_geocoding(record, cache)

        # Check if geocoding succeeded
        geocoded = enriched.get('locations') and any(loc.get('city') for loc in enriched['locations'])

        if geocoded and not has_city:
            success_count += 1
        elif not geocoded:
            failed_count += 1

        enriched_records.append(enriched)

    # Save enriched output
    print("\n" + "=" * 70)
    print("Saving Enriched Output")
    print("=" * 70)

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(enriched_records, f, default_flow_style=False,
                  allow_unicode=True, sort_keys=False, indent=2)

    print(f"✓ Saved {len(enriched_records)} enriched records to:")
    print(f"  {OUTPUT_FILE}")

    # Final statistics
    print("\n" + "=" * 70)
    print("Geocoding Complete")
    print("=" * 70)

    total = len(enriched_records)
    total_with_cities = sum(
        1 for r in enriched_records
        if r.get('locations') and any(loc.get('city') for loc in r['locations'])
    )
    total_with_coords = sum(
        1 for r in enriched_records
        if r.get('locations') and any(loc.get('latitude') for loc in r['locations'])
    )

    print(f"Records processed: {total}")
    print(f"Already had cities: {skipped_count}")
    print(f"Successfully geocoded: {success_count}")
    print(f"Failed geocoding: {failed_count}")
    print()
    # Guard against an empty input file (BUG FIX: original divided by
    # len(enriched_records) unconditionally -> ZeroDivisionError on empty input).
    if total:
        print(f"Total with cities: {total_with_cities} ({total_with_cities/total*100:.1f}%)")
        print(f"Total with coordinates: {total_with_coords} ({total_with_coords/total*100:.1f}%)")
        print()
        # BUG FIX: the original compared against a hard-coded 58 while
        # printing a computed 60% target; compute the target once and use
        # it for both the check and the message.
        target = total * 0.6
        print(f"Target achieved: {'✓' if total_with_cities >= target else '✗'} (60% = {target:.0f} records)")
    print()


if __name__ == "__main__":
    main()