#!/usr/bin/env python3
"""
Add city names to Mexican institutions with missing city field.

Combines manual corrections (for known institutions) with reverse geocoding
(for uncertain cases). Flags questionable automated results for manual review.
"""

import sys
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any, Tuple

import requests
import yaml

# Add project root to path so project-local packages resolve when this
# file is run directly as a script.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
# Known city corrections (based on institutional context and research).
# Maps an institution's exact 'name' field to the correction to apply:
#   'city'       -> city name to write into locations[0].city
#   'source'     -> human-readable justification, recorded in provenance
#   'confidence' -> 'high' or 'medium'; copied into the enrichment history
# Entries here take precedence over reverse geocoding.
KNOWN_CITY_CORRECTIONS = {
    # State cultural institutes - typically in state capitals
    'Instituto Sudcaliforniano de Cultura': {
        'city': 'La Paz',
        'source': 'State cultural institute, headquarters in capital city La Paz',
        'confidence': 'high'
    },
    'Instituto Tamaulipeco para la Cultura y las Artes': {
        'city': 'Ciudad Victoria',
        'source': 'State cultural institute (ITCA), headquarters in capital Ciudad Victoria',
        'confidence': 'high'
    },

    # Specific sites with known locations
    'Palenque Site Museum': {
        'city': 'Palenque',
        'source': 'Archaeological site museum located in Palenque, Chiapas',
        'confidence': 'high'
    },
    'Calakmul Digital Project': {
        'city': 'Calakmul',
        'source': 'Archaeological site digital project, Calakmul Biosphere Reserve',
        'confidence': 'high'
    },
    'Colonial Campeche Archive': {
        'city': 'Campeche',
        'source': 'Colonial archive for state of Campeche, located in capital city',
        'confidence': 'high'
    },
    'Chetumal City Museum': {
        'city': 'Chetumal',
        'source': 'Museum explicitly named for Chetumal city',
        'confidence': 'high'
    },
    'General Archive of Quintana Roo': {
        'city': 'Chetumal',
        'source': 'State archive, headquarters in capital city Chetumal',
        'confidence': 'high'
    },
    'La Casa Redonda': {
        'city': 'Chihuahua',
        'source': 'Cultural center in Chihuahua city (state capital)',
        'confidence': 'medium'
    },
    'UAS Repository': {
        'city': 'Culiacán',
        'source': 'Universidad Autónoma de Sinaloa repository, main campus in Culiacán',
        'confidence': 'high'
    },
    'Instituto Regional del Patrimonio Mundial': {
        'city': 'Zacatecas',
        'source': 'Regional heritage institute, headquarters in Zacatecas city',
        'confidence': 'medium'
    }
}
def reverse_geocode(lat: float, lon: float) -> Optional[Dict[str, Any]]:
    """
    Reverse geocode coordinates using the OpenStreetMap Nominatim API.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.

    Returns:
        Dictionary with address components, or None if the request fails,
        times out, returns a malformed body, or reports a lookup error.
    """
    url = "https://nominatim.openstreetmap.org/reverse"
    params = {
        'format': 'json',
        'lat': lat,
        'lon': lon,
        'zoom': 10,  # City-level zoom (not too specific)
        'addressdetails': 1,
        'accept-language': 'es,en'  # Spanish first, then English
    }
    # Nominatim's usage policy requires an identifying User-Agent.
    headers = {
        'User-Agent': 'GLAM-Heritage-Custodian-Project/1.0 (heritage data enrichment)'
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError: requests raises it from .json() when the body is not
        # valid JSON; previously this escaped the handler and crashed the run.
        return None

    # Nominatim returns HTTP 200 with an 'error' key for unresolvable points.
    if 'error' in data:
        return None

    return data
def extract_city_from_geocode(geocode_data: Dict[str, Any]) -> Tuple[Optional[str], str]:
    """
    Extract a city name from a Nominatim reverse-geocode response.

    Address keys are checked in decreasing order of reliability:
    city/town (high), municipality (medium), village/hamlet/suburb (low).

    Args:
        geocode_data: Parsed Nominatim JSON response.

    Returns:
        Tuple of (city_name, confidence_level); (None, 'none') when no
        usable address key is present.
    """
    address = geocode_data.get('address', {})

    # Priority tiers: keys tried in order, each paired with the confidence
    # assigned to a match at that tier.
    tiers = (
        (('city', 'town'), 'high'),
        (('municipality',), 'medium'),
        (('village', 'hamlet', 'suburb'), 'low'),
    )
    for keys, confidence in tiers:
        for key in keys:
            if key in address:
                return address[key], confidence

    return None, 'none'
def _append_enrichment(inst: Dict[str, Any], method: str, source: str,
                       confidence: str, notes: str) -> None:
    """Record a locations.city update in the institution's provenance history."""
    provenance = inst.setdefault('provenance', {})
    history = provenance.setdefault('enrichment_history', [])
    history.append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': method,
        'fields_updated': ['locations.city'],
        'source': source,
        'confidence': confidence,
        'notes': notes
    })


def apply_city_corrections(input_path: Path, dry_run: bool = False, use_geocoding: bool = True):
    """
    Apply city corrections to Mexican institutions.

    Loads the YAML dataset at input_path, finds MX institutions whose
    primary location has coordinates but no city, fills in the city from
    KNOWN_CITY_CORRECTIONS or (optionally) Nominatim reverse geocoding,
    records provenance for each change, then writes the file back (with a
    backup copy) unless dry_run is set.

    Args:
        input_path: Path to the YAML dataset (list of institution dicts).
        dry_run: When True, report what would change but modify nothing.
        use_geocoding: When False, only manual corrections are applied.
    """
    print("🗺️ Adding city names to Mexican institutions...\n")

    # Load dataset
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    institutions = data if isinstance(data, list) else [data]

    # Find Mexican institutions with coordinates but no city
    candidates = []
    for inst in institutions:
        locations = inst.get('locations', [])
        if not locations:
            continue

        loc = locations[0]  # Primary location
        country = loc.get('country', '')
        city = loc.get('city')
        lat = loc.get('latitude')
        lon = loc.get('longitude')

        # Explicit None checks: truthiness tests would wrongly skip
        # legitimate 0.0 coordinates.
        if country == 'MX' and not city and lat is not None and lon is not None:
            candidates.append(inst)

    print(f"Found {len(candidates)} Mexican institutions needing city names\n")

    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    # Process each candidate
    manual_count = 0
    geocoded_count = 0
    failed_count = 0
    low_confidence_count = 0

    for inst in candidates:
        name = inst.get('name', 'Unknown')
        loc = inst['locations'][0]
        region = loc.get('region', 'Unknown')
        lat = loc['latitude']
        lon = loc['longitude']

        print(f"📍 {name}")
        print(f" Region: {region}")
        print(f" Coords: {lat}, {lon}")

        # Check if we have a manual correction
        if name in KNOWN_CITY_CORRECTIONS:
            correction = KNOWN_CITY_CORRECTIONS[name]
            city = correction['city']
            source = correction['source']
            confidence = correction['confidence']

            print(f" ✅ Manual correction: {city} (confidence: {confidence})")
            print(f" 📝 Source: {source}")

            if not dry_run:
                loc['city'] = city
                _append_enrichment(
                    inst, 'Manual city correction', source, confidence,
                    'City name verified through institutional research')

            manual_count += 1

        elif use_geocoding:
            # Try reverse geocoding
            geocode_data = reverse_geocode(lat, lon)

            if geocode_data:
                city, confidence = extract_city_from_geocode(geocode_data)

                if city:
                    if confidence == 'low':
                        print(f" ⚠️ Reverse geocoded: {city} (LOW CONFIDENCE - needs verification)")
                        low_confidence_count += 1
                    else:
                        print(f" ✅ Reverse geocoded: {city} (confidence: {confidence})")

                    if not dry_run:
                        loc['city'] = city
                        notes = 'Extracted city name from coordinates using OpenStreetMap Nominatim'
                        if confidence == 'low':
                            notes += ' - LOW CONFIDENCE, needs manual verification'
                        _append_enrichment(
                            inst, 'Nominatim reverse geocoding',
                            f'Nominatim API reverse geocode ({lat}, {lon})',
                            confidence, notes)

                    geocoded_count += 1
                else:
                    print(" ⚠️ No city found in geocode response")
                    failed_count += 1
            else:
                print(" ❌ Geocoding failed")
                failed_count += 1

            # Rate limit: Nominatim allows at most 1 request per second.
            # (use_geocoding is always true in this branch, so sleep directly.)
            time.sleep(1.1)
        else:
            print(" ⏭️ Skipped (geocoding disabled)")
            failed_count += 1

        print()

    # Save results
    if not dry_run and (manual_count + geocoded_count) > 0:
        # Create backup before overwriting the dataset in place
        backup_path = input_path.parent / f"{input_path.stem}_backup_before_city_enrichment.yaml"
        with open(input_path, 'r', encoding='utf-8') as f:
            backup_data = f.read()
        with open(backup_path, 'w', encoding='utf-8') as f:
            f.write(backup_data)
        print(f"✅ Backup created: {backup_path}")

        # Write updated data
        with open(input_path, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
        print(f"✅ Updated data written to: {input_path}")

    # Print summary
    print("\n" + "="*80)
    print("CITY ENRICHMENT SUMMARY")
    print("="*80)
    print(f"Institutions processed: {len(candidates)}")
    print(f"Manual corrections: {manual_count}")
    print(f"Reverse geocoded: {geocoded_count}")
    print(f" └─ Low confidence (needs review): {low_confidence_count}")
    print(f"Failed/Skipped: {failed_count}")
    print("="*80)

    if dry_run:
        print("\n💡 Run without --dry-run flag to apply changes")
    elif low_confidence_count > 0:
        print(f"\n⚠️ {low_confidence_count} institutions have low-confidence city names")
        print(" Review these entries and verify cities are correct")
def main():
    """Command-line entry point: parse arguments and run the enrichment."""
    import argparse

    default_input = Path('data/instances/all/globalglam-20251113-mexico-deduplicated.yaml')

    parser = argparse.ArgumentParser(description='Add city names to Mexican institutions')
    parser.add_argument('--dry-run', action='store_true',
                        help='Test mode - do not modify files')
    parser.add_argument('--no-geocoding', action='store_true',
                        help='Skip reverse geocoding, only apply manual corrections')
    parser.add_argument('--input', type=Path, default=default_input,
                        help='Input YAML file path')
    args = parser.parse_args()

    # Fail fast with a clear message rather than a traceback from open().
    if not args.input.exists():
        print(f"❌ Error: Input file not found: {args.input}")
        sys.exit(1)

    apply_city_corrections(args.input,
                           dry_run=args.dry_run,
                           use_geocoding=not args.no_geocoding)
# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()