351 lines
12 KiB
Python
351 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Brazilian GLAM Geocoding Enrichment
|
|
====================================
|
|
|
|
Uses Nominatim API to geocode institutions by name + state to add city-level
|
|
location data and geographic coordinates.
|
|
|
|
Features:
|
|
- Rate limiting (1 req/sec for Nominatim)
|
|
- Caching to avoid duplicate lookups
|
|
- Fallback strategies for failed lookups
|
|
- Progress tracking and detailed reporting
|
|
"""
|
|
|
|
import copy
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote

import requests
import yaml
|
|
|
|
# ---------------------------------------------------------------------------
# File paths
# ---------------------------------------------------------------------------
INPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_curated_v2.yaml")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_geocoded_v3.yaml")
CACHE_FILE = Path("/Users/kempersc/apps/glam/data/cache/geocoding_cache.yaml")

# ---------------------------------------------------------------------------
# Nominatim API settings
# ---------------------------------------------------------------------------
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
USER_AGENT = "GLAM-Data-Extraction/0.2.0 (heritage research project)"
# Nominatim's usage policy allows at most one request per second; stay under it.
REQUEST_DELAY = 1.1

# Upper-case Brazilian state name -> official two-letter state code.
STATE_CODES = {
    'ACRE': 'AC',
    'ALAGOAS': 'AL',
    'AMAPÁ': 'AP',
    'AMAZONAS': 'AM',
    'BAHIA': 'BA',
    'CEARÁ': 'CE',
    'DISTRITO FEDERAL': 'DF',
    'ESPÍRITO SANTO': 'ES',
    'GOIÁS': 'GO',
    'MARANHÃO': 'MA',
    'MATO GROSSO': 'MT',
    'MATO GROSSO DO SUL': 'MS',
    'MINAS GERAIS': 'MG',
    'PARÁ': 'PA',
    'PARAÍBA': 'PB',
    'PARANÁ': 'PR',
    'PERNAMBUCO': 'PE',
    'PIAUÍ': 'PI',
    'RIO DE JANEIRO': 'RJ',
    'RIO GRANDE DO NORTE': 'RN',
    'RIO GRANDE DO SUL': 'RS',
    'RONDÔNIA': 'RO',
    'RORAIMA': 'RR',
    'SANTA CATARINA': 'SC',
    'SÃO PAULO': 'SP',
    'SERGIPE': 'SE',
    'TOCANTINS': 'TO',
}
|
|
|
|
|
|
class GeocodingCache:
    """Simple YAML-based cache for geocoding results.

    Failed lookups are cached as None so they can be skipped on later runs.
    Because get() returns None for both a miss and a cached failure, callers
    that need to tell the two apart must use contains().
    """

    def __init__(self, cache_file: Path):
        self.cache_file = cache_file
        self.cache = self._load_cache()

    def _load_cache(self) -> Dict:
        """Load cache from file; an absent or empty file yields an empty dict."""
        if self.cache_file.exists():
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f) or {}
        return {}

    def _save_cache(self):
        """Persist the entire cache to disk (creates parent dirs as needed).

        NOTE: called after every set(); rewrites the whole file each time,
        which is fine at this dataset's scale (hundreds of entries).
        """
        self.cache_file.parent.mkdir(parents=True, exist_ok=True)
        with open(self.cache_file, 'w', encoding='utf-8') as f:
            yaml.dump(self.cache, f, default_flow_style=False, allow_unicode=True)

    def contains(self, key: str) -> bool:
        """Return True if key has any cached result, including a failed (None) one."""
        return key in self.cache

    def get(self, key: str) -> Optional[Dict]:
        """Get a cached geocoding result (None for both misses and cached failures)."""
        return self.cache.get(key)

    def set(self, key: str, value: Optional[Dict]):
        """Cache a geocoding result (including None for failed lookups) and persist."""
        self.cache[key] = value
        self._save_cache()
|
|
|
|
|
|
def geocode_location(query: str, state: str, cache: GeocodingCache) -> Optional[Dict]:
    """
    Geocode a location using Nominatim API.

    Args:
        query: Institution name or location description
        state: Brazilian state name or code
        cache: Geocoding cache

    Returns:
        Dict with 'city', 'latitude', 'longitude', 'display_name' (plus
        'osm_type'/'osm_id') or None when the lookup failed.
    """
    # Generate cache key
    cache_key = f"{query}|{state}|BR"

    # BUGFIX: check cache by key membership, not truthiness. Failed lookups
    # are deliberately cached as None; the old `if cached:` test skipped
    # them and re-queried the API on every run, defeating the negative cache.
    if cache_key in cache.cache:
        print(f" [CACHE] {query}, {state}")
        return cache.cache[cache_key]

    # Build search query: "<name>, <state code>, Brazil"
    state_code = STATE_CODES.get(state.upper(), state)
    search_query = f"{query}, {state_code}, Brazil"

    params = {
        'q': search_query,
        'format': 'json',
        'limit': 1,
        'countrycodes': 'br',
        'addressdetails': 1
    }

    headers = {
        'User-Agent': USER_AGENT
    }

    try:
        print(f" [API] Geocoding: {search_query}")
        time.sleep(REQUEST_DELAY)  # Rate limiting (Nominatim: max 1 req/sec)

        response = requests.get(NOMINATIM_URL, params=params, headers=headers, timeout=10)
        response.raise_for_status()

        results = response.json()

        if not results:
            print(f" [FAIL] No results for: {search_query}")
            # Cache null result to avoid re-querying
            cache.set(cache_key, None)
            return None

        result = results[0]
        address = result.get('address', {})

        # Nominatim reports the settlement under different keys depending on
        # place type; probe from most to least specific.
        city = (
            address.get('city') or
            address.get('town') or
            address.get('municipality') or
            address.get('village') or
            address.get('suburb')
        )

        if not city:
            print(f" [WARN] No city found in result for: {search_query}")
            cache.set(cache_key, None)
            return None

        geo_data = {
            'city': city,
            'latitude': float(result['lat']),
            'longitude': float(result['lon']),
            'display_name': result.get('display_name', ''),
            'osm_type': result.get('osm_type'),
            'osm_id': result.get('osm_id')
        }

        print(f" [SUCCESS] Found: {city} ({geo_data['latitude']:.4f}, {geo_data['longitude']:.4f})")

        # Cache successful result
        cache.set(cache_key, geo_data)
        return geo_data

    except requests.RequestException as e:
        # Transient network/HTTP errors are NOT cached so a later run can retry.
        print(f" [ERROR] API request failed: {e}")
        return None
    except (KeyError, ValueError) as e:
        print(f" [ERROR] Failed to parse result: {e}")
        return None
|
|
|
|
|
|
def enrich_record_with_geocoding(record: Dict, cache: GeocodingCache) -> Dict:
    """
    Enrich a record with geocoding data.

    Strategy:
    1. Skip if already has city
    2. Try geocoding institution name + state
    3. Update location with city + coordinates

    Args:
        record: Institution record (may carry 'locations', 'identifiers',
            'provenance' keys)
        cache: Shared geocoding cache

    Returns:
        An enriched deep copy of the record; the input record is never mutated.
    """
    # BUGFIX: record.copy() is shallow, so the nested location / identifier /
    # provenance dicts were shared with the caller and the update() calls
    # below mutated the original record in place. Deep-copy instead.
    enriched = copy.deepcopy(record)

    # Check if already has city
    if enriched.get('locations'):
        for loc in enriched['locations']:
            if loc.get('city'):
                return enriched  # Already has city, skip

    # Get state from the first location; without it we cannot geocode.
    state = None
    if enriched.get('locations') and enriched['locations']:
        state = enriched['locations'][0].get('region')

    if not state:
        return enriched  # No state info, can't geocode

    # Try geocoding with institution name
    inst_name = enriched.get('name', '')
    geo_data = geocode_location(inst_name, state, cache)

    if geo_data:
        # Update location with geocoding data
        if 'locations' not in enriched or not enriched['locations']:
            enriched['locations'] = [{'country': 'BR', 'region': state}]

        # Update first location
        enriched['locations'][0].update({
            'city': geo_data['city'],
            'latitude': geo_data['latitude'],
            'longitude': geo_data['longitude']
        })

        # Add OSM identifiers if not already present
        osm_id = f"{geo_data['osm_type']}/{geo_data['osm_id']}"
        osm_url = f"https://www.openstreetmap.org/{geo_data['osm_type']}/{geo_data['osm_id']}"

        if 'identifiers' not in enriched:
            enriched['identifiers'] = []

        # Check if an OSM identifier already exists ('ident' avoids shadowing
        # the builtin id()).
        existing_osm = any(
            ident.get('identifier_scheme') == 'OpenStreetMap'
            for ident in enriched.get('identifiers', [])
        )

        if not existing_osm:
            enriched['identifiers'].append({
                'identifier_scheme': 'OpenStreetMap',
                'identifier_value': osm_id,
                'identifier_url': osm_url
            })

        # Update provenance
        if 'provenance' in enriched:
            enriched['provenance']['extraction_date'] = datetime.now(timezone.utc).isoformat()
            enriched['provenance']['extraction_method'] = (
                enriched['provenance'].get('extraction_method', '') +
                ' + Nominatim geocoding'
            )
            # Increase confidence slightly for geocoded records (capped at 0.85)
            current_confidence = enriched['provenance'].get('confidence_score', 0.7)
            enriched['provenance']['confidence_score'] = min(0.85, current_confidence + 0.05)

    return enriched
|
|
|
|
|
|
def main():
|
|
"""Main geocoding enrichment workflow."""
|
|
print("=" * 70)
|
|
print("Brazilian GLAM Geocoding Enrichment - v3.0")
|
|
print("=" * 70)
|
|
print()
|
|
|
|
# Load input data
|
|
print(f"Loading records from: {INPUT_FILE}")
|
|
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
|
|
records = yaml.safe_load(f)
|
|
|
|
print(f"Loaded {len(records)} records")
|
|
|
|
# Initialize cache
|
|
cache = GeocodingCache(CACHE_FILE)
|
|
print(f"Geocoding cache: {len(cache.cache)} entries")
|
|
print()
|
|
|
|
# Count records needing geocoding
|
|
needs_geocoding = sum(
|
|
1 for r in records
|
|
if not (r.get('locations') and any(loc.get('city') for loc in r['locations']))
|
|
)
|
|
|
|
print(f"Records needing geocoding: {needs_geocoding}/{len(records)}")
|
|
print(f"Estimated time: ~{needs_geocoding * REQUEST_DELAY / 60:.1f} minutes")
|
|
print()
|
|
|
|
# Enrich records
|
|
print("=" * 70)
|
|
print("Geocoding Records")
|
|
print("=" * 70)
|
|
|
|
enriched_records = []
|
|
success_count = 0
|
|
failed_count = 0
|
|
skipped_count = 0
|
|
|
|
for i, record in enumerate(records, 1):
|
|
print(f"\n[{i}/{len(records)}] {record.get('name', 'Unknown')}")
|
|
|
|
# Check if already has city
|
|
has_city = record.get('locations') and any(loc.get('city') for loc in record['locations'])
|
|
|
|
if has_city:
|
|
print(" [SKIP] Already has city")
|
|
enriched_records.append(record)
|
|
skipped_count += 1
|
|
continue
|
|
|
|
# Enrich with geocoding
|
|
enriched = enrich_record_with_geocoding(record, cache)
|
|
|
|
# Check if geocoding succeeded
|
|
geocoded = enriched.get('locations') and any(loc.get('city') for loc in enriched['locations'])
|
|
|
|
if geocoded and not has_city:
|
|
success_count += 1
|
|
elif not geocoded:
|
|
failed_count += 1
|
|
|
|
enriched_records.append(enriched)
|
|
|
|
# Save enriched output
|
|
print("\n" + "=" * 70)
|
|
print("Saving Enriched Output")
|
|
print("=" * 70)
|
|
|
|
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
|
|
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
|
|
yaml.dump(enriched_records, f,
|
|
default_flow_style=False,
|
|
allow_unicode=True,
|
|
sort_keys=False,
|
|
indent=2)
|
|
|
|
print(f"✓ Saved {len(enriched_records)} enriched records to:")
|
|
print(f" {OUTPUT_FILE}")
|
|
|
|
# Final statistics
|
|
print("\n" + "=" * 70)
|
|
print("Geocoding Complete")
|
|
print("=" * 70)
|
|
|
|
total_with_cities = sum(
|
|
1 for r in enriched_records
|
|
if r.get('locations') and any(loc.get('city') for loc in r['locations'])
|
|
)
|
|
|
|
total_with_coords = sum(
|
|
1 for r in enriched_records
|
|
if r.get('locations') and any(loc.get('latitude') for loc in r['locations'])
|
|
)
|
|
|
|
print(f"Records processed: {len(enriched_records)}")
|
|
print(f"Already had cities: {skipped_count}")
|
|
print(f"Successfully geocoded: {success_count}")
|
|
print(f"Failed geocoding: {failed_count}")
|
|
print()
|
|
print(f"Total with cities: {total_with_cities} ({total_with_cities/len(enriched_records)*100:.1f}%)")
|
|
print(f"Total with coordinates: {total_with_coords} ({total_with_coords/len(enriched_records)*100:.1f}%)")
|
|
print()
|
|
print(f"Target achieved: {'✓' if total_with_cities >= 58 else '✗'} (60% = {len(enriched_records)*0.6:.0f} records)")
|
|
print()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|