#!/usr/bin/env python3
"""
Reverse geocode Mexican institutions with missing city fields.

Uses the Nominatim API to extract city names from latitude/longitude
coordinates for institutions that have coordinates but no city value.

Respects the Nominatim rate limit: 1 request per second.
"""

import argparse
import sys
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any, List

import requests
import yaml

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

NOMINATIM_REVERSE_URL = "https://nominatim.openstreetmap.org/reverse"
USER_AGENT = 'GLAM-Heritage-Custodian-Project/1.0 (heritage data enrichment)'
REQUEST_TIMEOUT_SECONDS = 10

# Slightly above one second so consecutive requests never exceed 1 req/s.
RATE_LIMIT_DELAY_SECONDS = 1.1

# Address components that may carry the city name, in priority order.
# 'suburb' is last: it is only a fallback when nothing better exists.
CITY_ADDRESS_KEYS = ('city', 'town', 'municipality', 'village', 'hamlet', 'suburb')

DEFAULT_INPUT_PATH = Path(
    'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'
)


def reverse_geocode(lat: float, lon: float) -> Optional[Dict[str, Any]]:
    """
    Reverse geocode coordinates using the Nominatim API.

    Args:
        lat: Latitude sent as the ``lat`` query parameter.
        lon: Longitude sent as the ``lon`` query parameter.

    Returns:
        The decoded JSON response as a dictionary, or None if the request
        fails, the body is not a JSON object, or the response contains an
        ``error`` key. Failures are reported on stdout, never raised.
    """
    params = {
        'format': 'json',
        'lat': lat,
        'lon': lon,
        # Most detailed zoom level; with addressdetails=1 the address
        # breakdown still includes the enclosing city/town components.
        'zoom': 18,
        'addressdetails': 1,
        'accept-language': 'es,en',  # Spanish first, then English
    }
    headers = {'User-Agent': USER_AGENT}

    try:
        response = requests.get(
            NOMINATIM_REVERSE_URL,
            params=params,
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f" ❌ Request failed: {e}")
        return None
    except ValueError as e:
        # Older requests versions raise a plain ValueError on invalid JSON.
        print(f" ❌ Request failed: {e}")
        return None

    if not isinstance(data, dict):
        print(f" ❌ Geocoding error: unexpected response type {type(data).__name__}")
        return None

    if 'error' in data:
        print(f" ❌ Geocoding error: {data['error']}")
        return None

    return data


def extract_city_from_geocode(geocode_data: Dict[str, Any]) -> Optional[str]:
    """
    Extract a city name from a Nominatim response.

    Tries the ``address`` components in priority order:
    1. city
    2. town
    3. municipality
    4. village
    5. hamlet
    6. suburb (if no better option)

    Components that are missing or empty are skipped so a lower-priority
    component can still supply the name.

    Returns:
        The first non-empty component value, or None if there is none.
    """
    address = geocode_data.get('address')
    if not isinstance(address, dict):
        return None

    for key in CITY_ADDRESS_KEYS:
        value = address.get(key)
        if value:
            return value

    return None


def _primary_location(inst: Any) -> Optional[Dict[str, Any]]:
    """Return the first entry of ``inst['locations']`` if it is a mapping, else None."""
    if not isinstance(inst, dict):
        return None
    locations = inst.get('locations')
    if not isinstance(locations, list) or not locations:
        return None
    loc = locations[0]
    return loc if isinstance(loc, dict) else None


def _find_candidates(institutions: List[Any]) -> List[Dict[str, Any]]:
    """Select institutions whose primary location is in MX with coordinates but no city."""
    candidates = []
    for inst in institutions:
        loc = _primary_location(inst)
        if loc is None:
            continue
        # Compare against None so a legitimate 0 / 0.0 coordinate is kept.
        has_coords = (
            loc.get('latitude') is not None
            and loc.get('longitude') is not None
        )
        if loc.get('country', '') == 'MX' and not loc.get('city') and has_coords:
            candidates.append(inst)
    return candidates


def _record_enrichment(inst: Dict[str, Any], lat: Any, lon: Any) -> None:
    """Append a provenance ``enrichment_history`` entry describing the city update."""
    provenance = inst.get('provenance')
    if not isinstance(provenance, dict):
        # Covers both a missing key and an empty `provenance:` YAML entry.
        provenance = {}
        inst['provenance'] = provenance

    history = provenance.get('enrichment_history')
    if not isinstance(history, list):
        history = []
        provenance['enrichment_history'] = history

    history.append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Nominatim reverse geocoding',
        'fields_updated': ['locations.city'],
        'source': f'Nominatim API reverse geocode ({lat}, {lon})',
        'notes': 'Extracted city name from coordinates using OpenStreetMap Nominatim',
    })


def process_mexican_institutions(input_path: Path, dry_run: bool = False) -> None:
    """
    Process Mexican institutions and reverse geocode missing cities.

    Loads the YAML dataset at ``input_path``, reverse geocodes every
    institution whose primary (first) location has country ``MX``,
    coordinates, and no city, and stores the city found on that location
    together with a provenance enrichment-history entry.

    Args:
        input_path: YAML file holding one institution mapping or a list of them.
        dry_run: When True, the API is still queried and results are
            printed, but no data is modified and no file is written.

    Side effects:
        Prints progress and a summary to stdout. When not in dry-run mode
        and at least one city was found, writes a backup copy next to the
        input file and then overwrites the input file in place.
    """
    print("🗺️ Reverse geocoding Mexican institutions...\n")

    # Load dataset
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty YAML file loads as None; treat it as "no institutions".
    if data is None:
        institutions = []
    elif isinstance(data, list):
        institutions = data
    else:
        institutions = [data]

    candidates = _find_candidates(institutions)

    print(f"Found {len(candidates)} Mexican institutions needing reverse geocoding\n")

    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    # Process each candidate
    geocoded_count = 0
    failed_count = 0

    for index, inst in enumerate(candidates):
        # Rate limit: 1 request per second (only needed between requests)
        if index:
            time.sleep(RATE_LIMIT_DELAY_SECONDS)

        name = inst.get('name', 'Unknown')
        loc = inst['locations'][0]
        region = loc.get('region', 'Unknown')
        lat = loc['latitude']
        lon = loc['longitude']

        print(f"📍 {name}")
        print(f" Region: {region}")
        print(f" Coords: {lat}, {lon}")

        geocode_data = reverse_geocode(lat, lon)
        if not geocode_data:
            print(" ❌ Geocoding failed")
            failed_count += 1
            print()
            continue

        city = extract_city_from_geocode(geocode_data)
        if city:
            print(f" ✅ Found city: {city}")
            if not dry_run:
                loc['city'] = city
                _record_enrichment(inst, lat, lon)
            geocoded_count += 1
        else:
            print(" ⚠️ No city found in geocode response")
            failed_count += 1

        print()

    # Save results
    if not dry_run and geocoded_count > 0:
        # Byte-for-byte backup of the file as it is on disk before rewriting.
        backup_path = input_path.parent / f"{input_path.stem}_backup_before_reverse_geocoding.yaml"
        backup_path.write_bytes(input_path.read_bytes())
        print(f"✅ Backup created: {backup_path}")

        # Dump the originally loaded object so a single-mapping document is
        # not silently turned into a one-element list.
        with open(input_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False)
        print(f"✅ Updated data written to: {input_path}")

    # Print summary
    print("\n" + "=" * 80)
    print("REVERSE GEOCODING SUMMARY")
    print("=" * 80)
    print(f"Institutions processed: {len(candidates)}")
    print(f"Successfully geocoded: {geocoded_count}")
    print(f"Failed/No city found: {failed_count}")
    print("=" * 80)

    if dry_run:
        print("\n💡 Run without --dry-run flag to apply changes")


def main():
    """Parse command-line arguments and run the reverse geocoding pass."""
    parser = argparse.ArgumentParser(description='Reverse geocode Mexican institutions')
    parser.add_argument('--dry-run', action='store_true',
                        help='Test mode - do not modify files')
    parser.add_argument('--input', type=Path, default=DEFAULT_INPUT_PATH,
                        help='Input YAML file path')

    args = parser.parse_args()

    if not args.input.exists():
        print(f"❌ Error: Input file not found: {args.input}")
        sys.exit(1)

    process_mexican_institutions(args.input, dry_run=args.dry_run)


if __name__ == '__main__':
    main()