#!/usr/bin/env python3
"""
Add city names to Mexican institutions with missing city field.

Combines manual corrections (for known institutions) with reverse geocoding
(for uncertain cases). Flags questionable automated results for manual review.
"""

import argparse
import sys
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any, List, Tuple

import requests
import yaml

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# Pause after every Nominatim request (rate limit: 1 request per second).
NOMINATIM_DELAY_SECONDS = 1.1

# Nominatim address keys to try, in priority order, paired with the
# confidence assigned to a city name taken from that key.
_ADDRESS_KEY_CONFIDENCE = (
    ('city', 'high'),
    ('town', 'high'),
    ('municipality', 'medium'),
    ('village', 'low'),
    ('hamlet', 'low'),
    ('suburb', 'low'),
)

# Known city corrections (based on institutional context and research)
KNOWN_CITY_CORRECTIONS = {
    # State cultural institutes - typically in state capitals
    'Instituto Sudcaliforniano de Cultura': {
        'city': 'La Paz',
        'source': 'State cultural institute, headquarters in capital city La Paz',
        'confidence': 'high'
    },
    'Instituto Tamaulipeco para la Cultura y las Artes': {
        'city': 'Ciudad Victoria',
        'source': 'State cultural institute (ITCA), headquarters in capital Ciudad Victoria',
        'confidence': 'high'
    },
    # Specific sites with known locations
    'Palenque Site Museum': {
        'city': 'Palenque',
        'source': 'Archaeological site museum located in Palenque, Chiapas',
        'confidence': 'high'
    },
    'Calakmul Digital Project': {
        'city': 'Calakmul',
        'source': 'Archaeological site digital project, Calakmul Biosphere Reserve',
        'confidence': 'high'
    },
    'Colonial Campeche Archive': {
        'city': 'Campeche',
        'source': 'Colonial archive for state of Campeche, located in capital city',
        'confidence': 'high'
    },
    'Chetumal City Museum': {
        'city': 'Chetumal',
        'source': 'Museum explicitly named for Chetumal city',
        'confidence': 'high'
    },
    'General Archive of Quintana Roo': {
        'city': 'Chetumal',
        'source': 'State archive, headquarters in capital city Chetumal',
        'confidence': 'high'
    },
    'La Casa Redonda': {
        'city': 'Chihuahua',
        'source': 'Cultural center in Chihuahua city (state capital)',
        'confidence': 'medium'
    },
    'UAS Repository': {
        'city': 'Culiacán',
        'source': 'Universidad Autónoma de Sinaloa repository, main campus in Culiacán',
        'confidence': 'high'
    },
    'Instituto Regional del Patrimonio Mundial': {
        'city': 'Zacatecas',
        'source': 'Regional heritage institute, headquarters in Zacatecas city',
        'confidence': 'medium'
    }
}


def reverse_geocode(lat: float, lon: float) -> Optional[Dict[str, Any]]:
    """
    Reverse geocode coordinates using Nominatim API.

    Args:
        lat: Latitude sent as the ``lat`` query parameter.
        lon: Longitude sent as the ``lon`` query parameter.

    Returns:
        Dictionary with address components, or None if the request fails,
        the body is not a JSON object, or the payload contains an 'error' key.
    """
    url = "https://nominatim.openstreetmap.org/reverse"
    params = {
        'format': 'json',
        'lat': lat,
        'lon': lon,
        'zoom': 10,  # City-level zoom (not too specific)
        'addressdetails': 1,
        'accept-language': 'es,en'  # Spanish first, then English
    }
    headers = {
        'User-Agent': 'GLAM-Heritage-Custodian-Project/1.0 (heritage data enrichment)'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        return None

    # Callers use dict methods on the result, so anything else is a failure.
    if not isinstance(data, dict) or 'error' in data:
        return None
    return data


def extract_city_from_geocode(geocode_data: Dict[str, Any]) -> Tuple[Optional[str], str]:
    """
    Extract city name from Nominatim response.

    Address keys are tried in priority order: 'city' and 'town' (high
    confidence), 'municipality' (medium), then 'village', 'hamlet' and
    'suburb' (low). A key whose value is empty is skipped so that a
    lower-priority key can still supply a name.

    Returns:
        Tuple of (city_name, confidence_level); (None, 'none') when no
        usable key is present.
    """
    address = geocode_data.get('address') or {}
    for key, confidence in _ADDRESS_KEY_CONFIDENCE:
        value = address.get(key)
        if value:
            return value, confidence
    return None, 'none'


def _find_candidates(institutions: List[Any]) -> List[Dict[str, Any]]:
    """Return the MX institutions whose primary location has coordinates but no city."""
    candidates = []
    for inst in institutions:
        if not isinstance(inst, dict):
            continue  # Malformed entry; nothing to enrich

        locations = inst.get('locations')
        if not isinstance(locations, list) or not locations:
            continue

        loc = locations[0]  # Primary location
        if not isinstance(loc, dict):
            continue

        # Truthiness is intentional: a missing or zero coordinate is treated
        # as "no coordinates".
        if (loc.get('country', '') == 'MX'
                and not loc.get('city')
                and loc.get('latitude')
                and loc.get('longitude')):
            candidates.append(inst)
    return candidates


def _record_enrichment(inst: Dict[str, Any], *, method: str, source: str,
                       confidence: str, notes: str) -> None:
    """Append one entry to inst['provenance']['enrichment_history'], creating either level if absent or null."""
    if inst.get('provenance') is None:
        inst['provenance'] = {}
    provenance = inst['provenance']
    if provenance.get('enrichment_history') is None:
        provenance['enrichment_history'] = []

    provenance['enrichment_history'].append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': method,
        'fields_updated': ['locations.city'],
        'source': source,
        'confidence': confidence,
        'notes': notes
    })


def apply_city_corrections(input_path: Path, dry_run: bool = False, use_geocoding: bool = True) -> None:
    """
    Apply city corrections to Mexican institutions.

    Loads the YAML file, selects institutions whose primary location is in
    'MX' with coordinates but no city, and fills the city from
    KNOWN_CITY_CORRECTIONS (matched by institution name) or, failing that,
    from Nominatim reverse geocoding. Each applied change is recorded in the
    institution's provenance enrichment history. When anything changed, a
    byte-exact backup is written next to the input file and the input file
    is rewritten in place. A summary is printed at the end.

    Args:
        input_path: YAML file holding a list of institutions (or a single
            institution mapping).
        dry_run: When True, report what would change without modifying the
            data or writing files. Reverse-geocoding requests are still made.
        use_geocoding: When False, only manual corrections are applied and
            the remaining candidates are counted as skipped.
    """
    print("🗺️ Adding city names to Mexican institutions...\n")

    # Load dataset
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty file loads as None; a single mapping is wrapped so the loop
    # below can treat every input shape uniformly.
    if data is None:
        institutions = []
    elif isinstance(data, list):
        institutions = data
    else:
        institutions = [data]

    # Find Mexican institutions with coordinates but no city
    candidates = _find_candidates(institutions)

    print(f"Found {len(candidates)} Mexican institutions needing city names\n")

    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    # Process each candidate
    manual_count = 0
    geocoded_count = 0
    failed_count = 0
    low_confidence_count = 0

    for inst in candidates:
        name = inst.get('name', 'Unknown')
        loc = inst['locations'][0]
        region = loc.get('region', 'Unknown')
        lat = loc['latitude']
        lon = loc['longitude']

        print(f"📍 {name}")
        print(f" Region: {region}")
        print(f" Coords: {lat}, {lon}")

        # Check if we have a manual correction
        correction = KNOWN_CITY_CORRECTIONS.get(name)
        if correction is not None:
            city = correction['city']
            source = correction['source']
            confidence = correction['confidence']

            print(f" ✅ Manual correction: {city} (confidence: {confidence})")
            print(f" 📝 Source: {source}")

            if not dry_run:
                loc['city'] = city
                _record_enrichment(
                    inst,
                    method='Manual city correction',
                    source=source,
                    confidence=confidence,
                    notes='City name verified through institutional research',
                )

            manual_count += 1

        elif use_geocoding:
            # Try reverse geocoding
            geocode_data = reverse_geocode(lat, lon)

            if geocode_data:
                city, confidence = extract_city_from_geocode(geocode_data)

                if city:
                    if confidence == 'low':
                        print(f" ⚠️ Reverse geocoded: {city} (LOW CONFIDENCE - needs verification)")
                        low_confidence_count += 1
                    else:
                        print(f" ✅ Reverse geocoded: {city} (confidence: {confidence})")

                    if not dry_run:
                        loc['city'] = city

                        notes = 'Extracted city name from coordinates using OpenStreetMap Nominatim'
                        if confidence == 'low':
                            notes += ' - LOW CONFIDENCE, needs manual verification'

                        _record_enrichment(
                            inst,
                            method='Nominatim reverse geocoding',
                            source=f'Nominatim API reverse geocode ({lat}, {lon})',
                            confidence=confidence,
                            notes=notes,
                        )

                    geocoded_count += 1
                else:
                    print(" ⚠️ No city found in geocode response")
                    failed_count += 1
            else:
                print(" ❌ Geocoding failed")
                failed_count += 1

            # Rate limit: 1 request per second
            time.sleep(NOMINATIM_DELAY_SECONDS)

        else:
            print(" ⏭️ Skipped (geocoding disabled)")
            failed_count += 1

        print()

    # Save results
    if not dry_run and (manual_count + geocoded_count) > 0:
        # Create backup as a byte copy so it is identical to the input file
        backup_path = input_path.parent / f"{input_path.stem}_backup_before_city_enrichment.yaml"
        backup_path.write_bytes(input_path.read_bytes())
        print(f"✅ Backup created: {backup_path}")

        # Write updated data. Dump the object that was loaded (not the
        # wrapper list) so a single-mapping file keeps its top-level shape.
        with open(input_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False)
        print(f"✅ Updated data written to: {input_path}")

    # Print summary
    print("\n" + "=" * 80)
    print("CITY ENRICHMENT SUMMARY")
    print("=" * 80)
    print(f"Institutions processed: {len(candidates)}")
    print(f"Manual corrections: {manual_count}")
    print(f"Reverse geocoded: {geocoded_count}")
    print(f" └─ Low confidence (needs review): {low_confidence_count}")
    print(f"Failed/Skipped: {failed_count}")
    print("=" * 80)

    if dry_run:
        print("\n💡 Run without --dry-run flag to apply changes")
    elif low_confidence_count > 0:
        print(f"\n⚠️ {low_confidence_count} institutions have low-confidence city names")
        print(" Review these entries and verify cities are correct")


def main():
    """Parse command-line arguments and run the city enrichment on the chosen file."""
    parser = argparse.ArgumentParser(description='Add city names to Mexican institutions')
    parser.add_argument('--dry-run', action='store_true',
                        help='Test mode - do not modify files')
    parser.add_argument('--no-geocoding', action='store_true',
                        help='Skip reverse geocoding, only apply manual corrections')
    parser.add_argument('--input', type=Path,
                        default=Path('data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'),
                        help='Input YAML file path')

    args = parser.parse_args()

    if not args.input.exists():
        print(f"❌ Error: Input file not found: {args.input}")
        sys.exit(1)

    apply_city_corrections(args.input, dry_run=args.dry_run, use_geocoding=not args.no_geocoding)


if __name__ == '__main__':
    main()