glam/scripts/reverse_geocode_mexican_cities.py
2025-11-19 23:25:22 +01:00

224 lines
7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Reverse geocode Mexican institutions with missing city fields.
Uses Nominatim API to extract city names from latitude/longitude coordinates
for institutions that have coordinates but missing city field.
Respects Nominatim rate limit: 1 request per second
"""
import sys
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any
import requests
import yaml
# Put the directory two levels above this file at the front of sys.path so
# that modules located there take precedence when the script is run directly.
# NOTE(review): no import visible in this file depends on this entry -
# confirm it is still needed.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
def reverse_geocode(lat: float, lon: float) -> Optional[Dict[str, Any]]:
    """
    Reverse geocode coordinates using the Nominatim API.

    Sends a single GET request to the public Nominatim ``/reverse`` endpoint.
    Every failure mode is reported on stdout and signalled by returning
    None; network, HTTP and decoding errors are not propagated to the caller.
    The caller is responsible for spacing out successive calls.

    Args:
        lat: Latitude sent as the ``lat`` query parameter.
        lon: Longitude sent as the ``lon`` query parameter.

    Returns:
        The decoded JSON object, or None if the request fails, the body is
        not a JSON object, or the response carries an ``error`` entry.
    """
    url = "https://nominatim.openstreetmap.org/reverse"
    params = {
        'format': 'json',
        'lat': lat,
        'lon': lon,
        # Zoom 18 is building-level detail in Nominatim; the address
        # breakdown it returns still includes the enclosing settlement.
        'zoom': 18,
        'addressdetails': 1,
        'accept-language': 'es,en',  # Spanish first, then English
    }
    headers = {
        'User-Agent': 'GLAM-Heritage-Custodian-Project/1.0 (heritage data enrichment)'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f" ❌ Request failed: {e}")
        return None
    except ValueError as e:
        # Older requests releases raise a plain ValueError subclass (not a
        # RequestException) when the body is not valid JSON.
        print(f" ❌ Request failed: {e}")
        return None

    # A body such as ``null`` or a list is valid JSON but not a usable result.
    if not isinstance(data, dict):
        print(f" ❌ Geocoding error: unexpected response type {type(data).__name__}")
        return None

    if 'error' in data:
        print(f" ❌ Geocoding error: {data['error']}")
        return None

    return data
def extract_city_from_geocode(geocode_data: Dict[str, Any]) -> Optional[str]:
    """
    Extract a city name from a Nominatim response.

    The ``address`` mapping of the response is searched in priority order:
        1. city
        2. town
        3. municipality
        4. village
        5. hamlet
        6. suburb (only if nothing better is available)

    A key counts only when its value is a non-blank string, so an empty or
    null entry falls through to the next candidate instead of being returned.

    Args:
        geocode_data: Decoded reverse-geocode response.

    Returns:
        The first usable name with surrounding whitespace removed, or None
        when the response has no address mapping or no usable component.
    """
    # ``or {}`` also covers an explicit ``"address": null`` in the response.
    address = geocode_data.get('address') or {}
    if not isinstance(address, dict):
        return None

    # Priority order for city-level components; suburb is the last resort.
    city_keys = ('city', 'town', 'municipality', 'village', 'hamlet', 'suburb')
    for key in city_keys:
        value = address.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()

    return None
def _find_reverse_geocoding_candidates(institutions):
    """Return institutions whose primary location is in MX, has coordinates, and lacks a city."""
    candidates = []
    for inst in institutions:
        if not isinstance(inst, dict):
            continue
        locations = inst.get('locations', [])
        if not locations:
            continue

        loc = locations[0]  # Primary location
        country = loc.get('country', '')
        city = loc.get('city')
        lat = loc.get('latitude')
        lon = loc.get('longitude')

        # Truthiness check: a coordinate that is missing, None or 0 is
        # treated as absent (neither 0 latitude nor 0 longitude lies in Mexico).
        if country == 'MX' and not city and lat and lon:
            candidates.append(inst)
    return candidates


def _record_city_enrichment(inst, lat, lon):
    """Append a provenance entry describing the reverse-geocoded city update."""
    provenance = inst.setdefault('provenance', {})
    history = provenance.setdefault('enrichment_history', [])
    history.append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Nominatim reverse geocoding',
        'fields_updated': ['locations.city'],
        'source': f'Nominatim API reverse geocode ({lat}, {lon})',
        'notes': 'Extracted city name from coordinates using OpenStreetMap Nominatim',
    })


def process_mexican_institutions(input_path: Path, dry_run: bool = False) -> None:
    """
    Process Mexican institutions and reverse geocode missing cities.

    Loads the YAML dataset, selects every institution whose primary (first)
    location has country ``MX`` and coordinates but no city, reverse geocodes
    each one, and stores the resulting city on that location together with an
    ``enrichment_history`` provenance entry. Progress and a final summary are
    printed to stdout.

    Unless ``dry_run`` is set, and provided at least one city was found, the
    untouched input is first copied to
    ``<stem>_backup_before_reverse_geocoding.yaml`` next to it and the input
    file is then overwritten. The top-level shape of the document (list or
    single mapping) is preserved on write.

    Args:
        input_path: YAML file holding a list of institutions or one institution.
        dry_run: When true, query and report but modify neither data nor files.
            Successful lookups are still counted in the summary.
    """
    print("🗺️ Reverse geocoding Mexican institutions...\n")

    # Load dataset
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty file loads as None; a single mapping is wrapped so the loop
    # below can treat both shapes alike. The wrapped mapping is the same
    # object as ``data``, so in-place updates remain visible through ``data``.
    if data is None:
        institutions = []
    elif isinstance(data, list):
        institutions = data
    else:
        institutions = [data]

    candidates = _find_reverse_geocoding_candidates(institutions)
    print(f"Found {len(candidates)} Mexican institutions needing reverse geocoding\n")

    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    # Process each candidate
    geocoded_count = 0
    failed_count = 0
    total = len(candidates)

    for position, inst in enumerate(candidates, start=1):
        name = inst.get('name', 'Unknown')
        loc = inst['locations'][0]
        region = loc.get('region', 'Unknown')
        lat = loc['latitude']
        lon = loc['longitude']

        print(f"📍 {name}")
        print(f" Region: {region}")
        print(f" Coords: {lat}, {lon}")

        # Reverse geocode
        geocode_data = reverse_geocode(lat, lon)
        if geocode_data:
            city = extract_city_from_geocode(geocode_data)
            if city:
                print(f" ✅ Found city: {city}")
                if not dry_run:
                    loc['city'] = city
                    _record_city_enrichment(inst, lat, lon)
                geocoded_count += 1
            else:
                print(" ⚠️ No city found in geocode response")
                failed_count += 1
        else:
            print(" ❌ Geocoding failed")
            failed_count += 1

        print()

        # Rate limit: 1 request per second. No pause is needed once the
        # final request has been made.
        if position < total:
            time.sleep(1.1)

    # Save results
    if not dry_run and geocoded_count > 0:
        # Create backup from the file on disk, which is still unmodified here.
        backup_path = input_path.parent / f"{input_path.stem}_backup_before_reverse_geocoding.yaml"
        with open(input_path, 'r', encoding='utf-8') as f:
            backup_data = f.read()
        with open(backup_path, 'w', encoding='utf-8') as f:
            f.write(backup_data)
        print(f"✅ Backup created: {backup_path}")

        # Write updated data. Dumping ``data`` rather than the working list
        # keeps a single-mapping document a mapping instead of turning it
        # into a one-element list.
        with open(input_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False)
        print(f"✅ Updated data written to: {input_path}")

    # Print summary
    print("\n" + "="*80)
    print("REVERSE GEOCODING SUMMARY")
    print("="*80)
    print(f"Institutions processed: {len(candidates)}")
    print(f"Successfully geocoded: {geocoded_count}")
    print(f"Failed/No city found: {failed_count}")
    print("="*80)

    if dry_run:
        print("\n💡 Run without --dry-run flag to apply changes")
def main():
    """Command-line entry point: read the flags, validate the input path, run the pass."""
    import argparse

    cli = argparse.ArgumentParser(description='Reverse geocode Mexican institutions')
    cli.add_argument(
        '--dry-run',
        action='store_true',
        help='Test mode - do not modify files',
    )
    cli.add_argument(
        '--input',
        type=Path,
        default=Path('data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'),
        help='Input YAML file path',
    )
    options = cli.parse_args()
    source_file = options.input

    # Bail out with a non-zero status before doing any work on a bad path.
    if not source_file.exists():
        print(f"❌ Error: Input file not found: {source_file}")
        sys.exit(1)

    process_mexican_institutions(source_file, dry_run=options.dry_run)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()