224 lines
7 KiB
Python
Executable file
224 lines
7 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Reverse geocode Mexican institutions with missing city fields.
|
|
|
|
Uses Nominatim API to extract city names from latitude/longitude coordinates
|
|
for institutions that have coordinates but missing city field.
|
|
|
|
Respects Nominatim rate limit: 1 request per second
|
|
"""
|
|
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional, Dict, Any
|
|
import requests
|
|
import yaml
|
|
|
|
# Make the directory two levels above this file (taken to be the project
# root) the first entry on the import search path.
project_root = Path(__file__).parent.parent
sys.path[:0] = [str(project_root)]
|
|
|
|
|
|
def reverse_geocode(lat: float, lon: float) -> Optional[Dict[str, Any]]:
    """
    Reverse geocode coordinates using the Nominatim API.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.

    Returns:
        The decoded JSON object returned by Nominatim, or None if the
        request fails, the body is not a JSON object, or the response
        carries an ``error`` entry.
    """
    url = "https://nominatim.openstreetmap.org/reverse"
    params = {
        'format': 'json',
        'lat': lat,
        'lon': lon,
        # 18 is Nominatim's most detailed (building-level) zoom; the
        # city-level components are still returned because
        # addressdetails is enabled.
        'zoom': 18,
        'addressdetails': 1,
        'accept-language': 'es,en',  # Spanish first, then English
    }
    headers = {
        'User-Agent': 'GLAM-Heritage-Custodian-Project/1.0 (heritage data enrichment)'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f" ❌ Request failed: {e}")
        return None
    except ValueError as e:
        # Older requests releases raise a plain ValueError (not a
        # RequestException) when the body is not valid JSON.
        print(f" ❌ Request failed: {e}")
        return None

    # A syntactically valid body that is not an object (e.g. null or a
    # list) cannot carry address components.
    if not isinstance(data, dict):
        print(f" ❌ Geocoding error: unexpected response type {type(data).__name__}")
        return None

    if 'error' in data:
        print(f" ❌ Geocoding error: {data['error']}")
        return None

    return data
|
|
|
|
|
|
def extract_city_from_geocode(geocode_data: Dict[str, Any]) -> Optional[str]:
    """
    Extract a city name from a Nominatim response.

    Address components are tried in priority order and the first one that
    holds a non-blank string wins:
    1. city
    2. town
    3. municipality
    4. village
    5. hamlet
    6. suburb (if no better option)

    Args:
        geocode_data: Decoded Nominatim response; its ``address`` entry is
            read as a mapping of component name to value.

    Returns:
        The matched name with surrounding whitespace removed, or None when
        ``address`` is missing, is not a mapping, or has no usable component.
    """
    address = geocode_data.get('address')
    if not isinstance(address, dict):
        return None

    # Priority order for city-level components; suburb is only a fallback.
    city_keys = ('city', 'town', 'municipality', 'village', 'hamlet', 'suburb')

    for key in city_keys:
        value = address.get(key)
        # Skip blank or non-string values so that a lower-priority
        # component can still supply the name.
        if isinstance(value, str) and value.strip():
            return value.strip()

    return None
|
|
|
|
|
|
def _primary_location_needing_city(inst: Any) -> Optional[Dict[str, Any]]:
    """Return the primary location of an MX record that has coordinates but no city, else None."""
    if not isinstance(inst, dict):
        return None

    locations = inst.get('locations')
    if not isinstance(locations, list) or not locations:
        return None

    loc = locations[0]  # Primary location
    if not isinstance(loc, dict):
        return None

    if loc.get('country', '') != 'MX' or loc.get('city'):
        return None

    # Truthiness is deliberate: besides None it also rejects 0 and empty
    # values, none of which is a real coordinate inside Mexico.
    if not (loc.get('latitude') and loc.get('longitude')):
        return None

    return loc


def _record_city_enrichment(inst: Dict[str, Any], loc: Dict[str, Any], city: str) -> None:
    """Store the city on the location and append a provenance enrichment entry to the record."""
    loc['city'] = city

    # A key that is present but null (e.g. "provenance:" with no value in
    # the YAML) is treated the same as a missing key.
    provenance = inst.get('provenance')
    if provenance is None:
        provenance = inst['provenance'] = {}

    history = provenance.get('enrichment_history')
    if history is None:
        history = provenance['enrichment_history'] = []

    history.append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Nominatim reverse geocoding',
        'fields_updated': ['locations.city'],
        'source': f"Nominatim API reverse geocode ({loc['latitude']}, {loc['longitude']})",
        'notes': 'Extracted city name from coordinates using OpenStreetMap Nominatim',
    })


def process_mexican_institutions(input_path: Path, dry_run: bool = False):
    """
    Process Mexican institutions and reverse geocode missing cities.

    Loads the YAML dataset, selects records whose primary location has
    country 'MX', coordinates and no city, reverse geocodes each one, and
    (unless in dry-run mode) writes the enriched dataset back to
    ``input_path`` after saving a backup copy next to it.

    Args:
        input_path: Path of the YAML dataset; the file may hold a list of
            institution records or a single record.
        dry_run: When True, lookups are still performed and reported but
            neither the records nor any file is modified.

    Returns:
        None. Progress and a summary are printed to stdout.
    """
    print("🗺️ Reverse geocoding Mexican institutions...\n")

    # Load dataset
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty YAML file loads as None; a single mapping is handled as a
    # one-element dataset.
    if data is None:
        institutions = []
    elif isinstance(data, list):
        institutions = data
    else:
        institutions = [data]

    # Find Mexican institutions with coordinates but no city
    candidates = []
    for inst in institutions:
        loc = _primary_location_needing_city(inst)
        if loc is not None:
            candidates.append((inst, loc))

    print(f"Found {len(candidates)} Mexican institutions needing reverse geocoding\n")

    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    # Process each candidate
    geocoded_count = 0
    failed_count = 0

    for inst, loc in candidates:
        name = inst.get('name', 'Unknown')
        region = loc.get('region', 'Unknown')
        lat = loc['latitude']
        lon = loc['longitude']

        print(f"📍 {name}")
        print(f" Region: {region}")
        print(f" Coords: {lat}, {lon}")

        # Reverse geocode
        geocode_data = reverse_geocode(lat, lon)
        city = extract_city_from_geocode(geocode_data) if geocode_data else None

        if city:
            print(f" ✅ Found city: {city}")
            if not dry_run:
                _record_city_enrichment(inst, loc, city)
            geocoded_count += 1
        elif geocode_data:
            print(" ⚠️ No city found in geocode response")
            failed_count += 1
        else:
            print(" ❌ Geocoding failed")
            failed_count += 1

        print()

        # Rate limit: 1 request per second
        time.sleep(1.1)

    # Save results
    if not dry_run and geocoded_count > 0:
        # Create a byte-for-byte backup before the input file is overwritten
        backup_path = input_path.parent / f"{input_path.stem}_backup_before_reverse_geocoding.yaml"
        backup_path.write_bytes(input_path.read_bytes())
        print(f"✅ Backup created: {backup_path}")

        # Dump the object that was loaded (records were updated in place)
        # so a single-record file is not rewritten as a list.
        with open(input_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False)
        print(f"✅ Updated data written to: {input_path}")

    # Print summary
    print("\n" + "=" * 80)
    print("REVERSE GEOCODING SUMMARY")
    print("=" * 80)
    print(f"Institutions processed: {len(candidates)}")
    print(f"Successfully geocoded: {geocoded_count}")
    print(f"Failed/No city found: {failed_count}")
    print("=" * 80)

    if dry_run:
        print("\n💡 Run without --dry-run flag to apply changes")
|
|
|
|
|
|
def main():
    """
    Command-line entry point.

    Parses ``--dry-run`` and ``--input``, exits with status 1 when the input
    file does not exist, and otherwise hands the path to
    ``process_mexican_institutions``.
    """
    import argparse

    default_input = Path('data/instances/all/globalglam-20251113-mexico-deduplicated.yaml')

    cli = argparse.ArgumentParser(description='Reverse geocode Mexican institutions')
    cli.add_argument(
        '--dry-run',
        action='store_true',
        help='Test mode - do not modify files',
    )
    cli.add_argument(
        '--input',
        type=Path,
        default=default_input,
        help='Input YAML file path',
    )
    options = cli.parse_args()

    input_file = options.input
    if input_file.exists():
        process_mexican_institutions(input_file, dry_run=options.dry_run)
        return

    # Nothing to process: report the missing file and signal failure.
    print(f"❌ Error: Input file not found: {input_file}")
    sys.exit(1)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|