# File: glam/scripts/enrich_mexican_cities.py
# Snapshot: 2025-11-19 23:25:22 +01:00 (333 lines, 12 KiB, Python)
#!/usr/bin/env python3
"""
Add city names to Mexican institutions with missing city field.
Combines manual corrections (for known institutions) with reverse geocoding
(for uncertain cases). Flags questionable automated results for manual review.
"""
import sys
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any, Tuple
import requests
import yaml
# Add project root to sys.path so project-local imports resolve when this
# file is executed directly as a script (rather than as an installed package).
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
# Known city corrections (based on institutional context and research).
# Maps an institution's exact 'name' field to its verified city plus the
# research rationale; consulted before any reverse geocoding is attempted.
KNOWN_CITY_CORRECTIONS = {
    # State cultural institutes - typically in state capitals
    'Instituto Sudcaliforniano de Cultura': {
        'city': 'La Paz',
        'source': 'State cultural institute, headquarters in capital city La Paz',
        'confidence': 'high'
    },
    'Instituto Tamaulipeco para la Cultura y las Artes': {
        'city': 'Ciudad Victoria',
        'source': 'State cultural institute (ITCA), headquarters in capital Ciudad Victoria',
        'confidence': 'high'
    },
    # Specific sites with known locations
    'Palenque Site Museum': {
        'city': 'Palenque',
        'source': 'Archaeological site museum located in Palenque, Chiapas',
        'confidence': 'high'
    },
    'Calakmul Digital Project': {
        'city': 'Calakmul',
        'source': 'Archaeological site digital project, Calakmul Biosphere Reserve',
        'confidence': 'high'
    },
    'Colonial Campeche Archive': {
        'city': 'Campeche',
        'source': 'Colonial archive for state of Campeche, located in capital city',
        'confidence': 'high'
    },
    'Chetumal City Museum': {
        'city': 'Chetumal',
        'source': 'Museum explicitly named for Chetumal city',
        'confidence': 'high'
    },
    'General Archive of Quintana Roo': {
        'city': 'Chetumal',
        'source': 'State archive, headquarters in capital city Chetumal',
        'confidence': 'high'
    },
    'La Casa Redonda': {
        'city': 'Chihuahua',
        'source': 'Cultural center in Chihuahua city (state capital)',
        'confidence': 'medium'
    },
    'UAS Repository': {
        'city': 'Culiacán',
        'source': 'Universidad Autónoma de Sinaloa repository, main campus in Culiacán',
        'confidence': 'high'
    },
    'Instituto Regional del Patrimonio Mundial': {
        'city': 'Zacatecas',
        'source': 'Regional heritage institute, headquarters in Zacatecas city',
        'confidence': 'medium'
    }
}
def reverse_geocode(lat: float, lon: float) -> Optional[Dict[str, Any]]:
    """Look up address components for a coordinate pair via Nominatim.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.

    Returns:
        The parsed JSON response from Nominatim, or None when the HTTP
        request fails or Nominatim reports an error in its payload.
    """
    query = {
        'format': 'json',
        'lat': lat,
        'lon': lon,
        'zoom': 10,  # City-level zoom (not too specific)
        'addressdetails': 1,
        'accept-language': 'es,en'  # Spanish first, then English
    }
    try:
        response = requests.get(
            "https://nominatim.openstreetmap.org/reverse",
            params=query,
            headers={'User-Agent': 'GLAM-Heritage-Custodian-Project/1.0 (heritage data enrichment)'},
            timeout=10,
        )
        response.raise_for_status()
        payload = response.json()
    except requests.exceptions.RequestException:
        return None
    return None if 'error' in payload else payload
def extract_city_from_geocode(geocode_data: Dict[str, Any]) -> Tuple[Optional[str], str]:
    """Pick the best available city name out of a Nominatim response.

    Args:
        geocode_data: Parsed JSON from a Nominatim reverse-geocode call.

    Returns:
        A (city_name, confidence) pair; confidence is 'high', 'medium' or
        'low' depending on which address key supplied the name, or
        (None, 'none') when no usable key is present.
    """
    # Address keys probed in priority order, each paired with the confidence
    # level assigned when that key is the one that supplies the name.
    priority = (
        ('city', 'high'),
        ('town', 'high'),
        ('municipality', 'medium'),
        ('village', 'low'),
        ('hamlet', 'low'),
        ('suburb', 'low'),
    )
    address = geocode_data.get('address', {})
    for key, confidence in priority:
        if key in address:
            return address[key], confidence
    return None, 'none'
def _record_enrichment(inst: Dict[str, Any], method: str, source: str,
                       confidence: str, notes: str) -> None:
    """Append one entry to inst['provenance']['enrichment_history'].

    Creates the 'provenance' dict and 'enrichment_history' list if missing.
    """
    provenance = inst.setdefault('provenance', {})
    history = provenance.setdefault('enrichment_history', [])
    history.append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': method,
        'fields_updated': ['locations.city'],
        'source': source,
        'confidence': confidence,
        'notes': notes,
    })


def apply_city_corrections(input_path: Path, dry_run: bool = False, use_geocoding: bool = True):
    """Apply city corrections to Mexican institutions.

    Candidates are records whose primary (first) location has country 'MX',
    no city, and both latitude and longitude present. Known institutions get
    a manual correction from KNOWN_CITY_CORRECTIONS; the rest are reverse
    geocoded via Nominatim (unless use_geocoding is False). Every applied
    change is logged in the record's provenance.enrichment_history. Unless
    dry_run is set, the file is backed up and rewritten in place.

    Args:
        input_path: YAML file containing a list of institution records.
        dry_run: When True, report what would change but modify nothing.
        use_geocoding: When False, apply only manual corrections.
    """
    print("🗺️ Adding city names to Mexican institutions...\n")

    # Load dataset (a single-record file is wrapped into a list)
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    institutions = data if isinstance(data, list) else [data]

    # Find Mexican institutions with coordinates but no city.
    # Explicit None checks: a legitimate 0.0 coordinate is falsy, so a
    # truthiness test would wrongly discard it.
    candidates = []
    for inst in institutions:
        locations = inst.get('locations', [])
        if not locations:
            continue
        loc = locations[0]  # Primary location
        if (loc.get('country', '') == 'MX'
                and not loc.get('city')
                and loc.get('latitude') is not None
                and loc.get('longitude') is not None):
            candidates.append(inst)

    print(f"Found {len(candidates)} Mexican institutions needing city names\n")
    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    manual_count = 0
    geocoded_count = 0
    failed_count = 0
    low_confidence_count = 0

    for inst in candidates:
        name = inst.get('name', 'Unknown')
        loc = inst['locations'][0]
        region = loc.get('region', 'Unknown')
        lat = loc['latitude']
        lon = loc['longitude']
        print(f"📍 {name}")
        print(f" Region: {region}")
        print(f" Coords: {lat}, {lon}")

        if name in KNOWN_CITY_CORRECTIONS:
            # Manual correction takes precedence over geocoding.
            correction = KNOWN_CITY_CORRECTIONS[name]
            city = correction['city']
            source = correction['source']
            confidence = correction['confidence']
            print(f" ✅ Manual correction: {city} (confidence: {confidence})")
            print(f" 📝 Source: {source}")
            if not dry_run:
                loc['city'] = city
                _record_enrichment(
                    inst,
                    method='Manual city correction',
                    source=source,
                    confidence=confidence,
                    notes='City name verified through institutional research',
                )
            manual_count += 1
        elif use_geocoding:
            geocode_data = reverse_geocode(lat, lon)
            if geocode_data:
                city, confidence = extract_city_from_geocode(geocode_data)
                if city:
                    if confidence == 'low':
                        print(f" ⚠️ Reverse geocoded: {city} (LOW CONFIDENCE - needs verification)")
                        low_confidence_count += 1
                    else:
                        print(f" ✅ Reverse geocoded: {city} (confidence: {confidence})")
                    if not dry_run:
                        loc['city'] = city
                        notes = 'Extracted city name from coordinates using OpenStreetMap Nominatim'
                        if confidence == 'low':
                            notes += ' - LOW CONFIDENCE, needs manual verification'
                        _record_enrichment(
                            inst,
                            method='Nominatim reverse geocoding',
                            source=f'Nominatim API reverse geocode ({lat}, {lon})',
                            confidence=confidence,
                            notes=notes,
                        )
                    geocoded_count += 1
                else:
                    print(" ⚠️ No city found in geocode response")
                    failed_count += 1
            else:
                print(" ❌ Geocoding failed")
                failed_count += 1
            # Nominatim usage policy: at most 1 request per second.
            time.sleep(1.1)
        else:
            print(" ⏭️ Skipped (geocoding disabled)")
            failed_count += 1
        print()

    # Save results (only when something actually changed)
    if not dry_run and (manual_count + geocoded_count) > 0:
        # Keep a backup of the original file alongside it before overwriting.
        backup_path = input_path.parent / f"{input_path.stem}_backup_before_city_enrichment.yaml"
        backup_path.write_text(input_path.read_text(encoding='utf-8'), encoding='utf-8')
        print(f"✅ Backup created: {backup_path}")
        with open(input_path, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
        print(f"✅ Updated data written to: {input_path}")

    # Print summary
    print("\n" + "=" * 80)
    print("CITY ENRICHMENT SUMMARY")
    print("=" * 80)
    print(f"Institutions processed: {len(candidates)}")
    print(f"Manual corrections: {manual_count}")
    print(f"Reverse geocoded: {geocoded_count}")
    print(f" └─ Low confidence (needs review): {low_confidence_count}")
    print(f"Failed/Skipped: {failed_count}")
    print("=" * 80)

    if dry_run:
        print("\n💡 Run without --dry-run flag to apply changes")
    elif low_confidence_count > 0:
        print(f"\n⚠️ {low_confidence_count} institutions have low-confidence city names")
        print(" Review these entries and verify cities are correct")
def main():
    """Command-line entry point: parse flags and run the enrichment pass."""
    import argparse

    arg_parser = argparse.ArgumentParser(
        description='Add city names to Mexican institutions')
    arg_parser.add_argument(
        '--dry-run', action='store_true',
        help='Test mode - do not modify files')
    arg_parser.add_argument(
        '--no-geocoding', action='store_true',
        help='Skip reverse geocoding, only apply manual corrections')
    arg_parser.add_argument(
        '--input', type=Path,
        default=Path('data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'),
        help='Input YAML file path')
    opts = arg_parser.parse_args()

    # Fail fast with a clear message when the dataset is missing.
    if not opts.input.exists():
        print(f"❌ Error: Input file not found: {opts.input}")
        sys.exit(1)

    apply_city_corrections(opts.input, dry_run=opts.dry_run,
                           use_geocoding=not opts.no_geocoding)


if __name__ == '__main__':
    main()