#!/usr/bin/env python3
"""
Fix geographic errors in Mexican heritage institutions dataset.

This script corrects three types of geographic issues:
1. Non-Mexican institutions incorrectly classified as Mexican
2. Institutions with wrong Zacatecas coordinates (state centroid errors)
3. Institutions missing city field but with valid state-level coordinates
"""

import yaml
from datetime import datetime, timezone
from typing import Dict, List, Optional
import sys


# Known location corrections based on institutional websites and research.
# Keyed by institution name; each value carries the corrected location plus
# provenance fields (source, geocode_method) recorded in enrichment history.
LOCATION_CORRECTIONS = {
    # Mexico City institutions (incorrectly placed in Zacatecas)
    'INALI': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.3406624,
        'longitude': -99.1989974,
        'source': 'https://www.inali.gob.mx - Instituto Nacional de Lenguas Indígenas headquarters',
        'geocode_method': 'Manual correction from institutional website'
    },
    'CONACYT Repository': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.3907336,
        'longitude': -99.1436127,
        'source': 'CONACYT (Consejo Nacional de Ciencia y Tecnología) headquarters',
        'geocode_method': 'Manual correction - national science agency'
    },
    'INPI Archives': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.4326077,
        'longitude': -99.133208,
        'source': 'INPI (Instituto Nacional de los Pueblos Indígenas) headquarters',
        'geocode_method': 'Manual correction from institutional context'
    },
    'Community Museums Network': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.4326077,
        'longitude': -99.133208,
        'source': 'Part of INAH/national museum network coordinated from Mexico City',
        'geocode_method': 'Manual correction - national network'
    },
    'Casasola Archive': {
        'city': 'Pachuca',
        'region': 'Hidalgo',
        'country': 'MX',
        'latitude': 20.1220032,
        'longitude': -98.7380387,
        'source': 'Part of Fototeca Nacional in Pachuca, Hidalgo',
        'geocode_method': 'Manual correction - historical photograph collection'
    },
    'Fototeca Nacional': {
        'city': 'Pachuca',
        'region': 'Hidalgo',
        'country': 'MX',
        'latitude': 20.1220032,
        'longitude': -98.7380387,
        'source': 'Fototeca Nacional del INAH, Pachuca, Hidalgo',
        'geocode_method': 'Manual correction from institutional website'
    },
    'SINAFO Network': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.4326077,
        'longitude': -99.133208,
        'source': 'Sistema Nacional de Fototecas (INAH network headquarters)',
        'geocode_method': 'Manual correction - national network coordination'
    },
    'Fundación Televisa': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.3634032,
        'longitude': -99.2580692,
        'source': 'Fundación Televisa headquarters, Chapultepec area',
        'geocode_method': 'Manual correction from corporate foundation location'
    },
    'Agrasanchez Archive': {
        'city': 'Harlingen',
        'region': 'Texas',
        'country': 'US',
        'latitude': 26.1906306,
        'longitude': -97.6961026,
        'source': 'Agrasánchez Film Archive, Harlingen, Texas (Mexican film archive in USA)',
        'geocode_method': 'Manual correction - US-based archive of Mexican cinema',
        'note': 'US institution collecting Mexican heritage materials - consider removal from MX dataset'
    },
}

# Institutions to remove from Mexican dataset (non-Mexican institutions)
REMOVE_FROM_DATASET = [
    'Library of Congress',  # This is the main LoC entry with wrong MX coordinates
    'Digital Florentine Codex',  # Getty Research Institute project (Los Angeles)
    'FAMSI Database',  # Foundation for Advancement of Mesoamerican Studies (USA)
]

# Institutions that need reverse geocoding from existing coordinates
REVERSE_GEOCODE_NEEDED = [
    'Patrimonio Cultural Inmaterial Catalog',
    'Instituto Sudcaliforniano de Cultura',
    'Calakmul Digital Project',
    'Colonial Campeche Archive',
    'Palenque Site Museum',
    'Indigenous Languages Archive',
    'La Casa Redonda',
    'Francisco de Burgoa Library',
    'Chetumal City Museum',
    'General Archive of Quintana Roo',
    'UAS Repository',
    'Instituto Tamaulipeco para la Cultura y las Artes',
]


def fix_mexican_geography(filepath: str, dry_run: bool = False) -> Dict:
    """
    Fix geographic errors in Mexican institutions dataset.

    Loads the YAML dataset at ``filepath``, removes non-Mexican institutions
    listed in REMOVE_FROM_DATASET, applies coordinate/city corrections from
    LOCATION_CORRECTIONS (recording each change in the institution's
    provenance enrichment history), and logs institutions that still need
    reverse geocoding. Unless ``dry_run`` is set, a backup of the original
    file is written next to it and the corrected data overwrites ``filepath``.

    Args:
        filepath: Path to the YAML file containing the institution records.
        dry_run: When True, report what would change without touching files.

    Returns:
        Dictionary with correction statistics. 'mexican_before' and
        'mexican_after' both count institutions that have at least one
        MX location, so their difference is a meaningful delta.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    stats = {
        'total_institutions': len(data),
        'mexican_before': 0,
        'mexican_after': 0,
        'removed': [],
        'corrected': [],
        'reverse_geocoded': [],
        'errors': []
    }

    timestamp = datetime.now(timezone.utc).isoformat()

    # Use a set: an institution with several MX locations must yield a single
    # index. (Duplicate indices in the deletion loop below would delete
    # unrelated neighboring records after the list shifts.)
    institutions_to_remove = set()

    # Process all institutions
    for idx, inst in enumerate(data):
        if not inst.get('locations'):
            continue

        inst_name = inst['name']
        mx_locations = [loc for loc in inst['locations'] if loc.get('country') == 'MX']
        if not mx_locations:
            continue

        # Count per institution (not per location) so it is comparable with
        # 'mexican_after' computed the same way below.
        stats['mexican_before'] += 1

        # Check if institution should be removed; handle it once and skip any
        # further per-location processing for this record.
        if inst_name in REMOVE_FROM_DATASET:
            institutions_to_remove.add(idx)
            loc = mx_locations[0]
            stats['removed'].append({
                'name': inst_name,
                'reason': 'Non-Mexican institution incorrectly classified',
                'original_location': f"{loc.get('region', 'N/A')}, {loc.get('country')}"
            })
            print(f"[REMOVE] {inst_name} - Non-Mexican institution")
            continue

        for loc in mx_locations:
            # Check for location corrections
            if inst_name in LOCATION_CORRECTIONS:
                correction = LOCATION_CORRECTIONS[inst_name]
                old_location = {
                    'city': loc.get('city', 'N/A'),
                    'region': loc.get('region', 'N/A'),
                    'latitude': loc.get('latitude'),
                    'longitude': loc.get('longitude')
                }

                # Apply correction
                loc['city'] = correction['city']
                loc['region'] = correction['region']
                loc['country'] = correction['country']
                loc['latitude'] = correction['latitude']
                loc['longitude'] = correction['longitude']

                # Add enrichment history so the change is auditable
                if 'provenance' not in inst:
                    inst['provenance'] = {}
                if 'enrichment_history' not in inst['provenance']:
                    inst['provenance']['enrichment_history'] = []
                inst['provenance']['enrichment_history'].append({
                    'enrichment_date': timestamp,
                    'enrichment_type': 'GEOGRAPHIC_CORRECTION',
                    'enrichment_method': correction['geocode_method'],
                    'enrichment_source': correction['source'],
                    'verified': True,
                    'enrichment_notes': f"Corrected from incorrect Zacatecas coordinates. " +
                                        f"Old: {old_location['region']} ({old_location['latitude']}, {old_location['longitude']}). " +
                                        f"New: {correction['city']}, {correction['region']}"
                })
                inst['provenance']['last_updated'] = timestamp

                stats['corrected'].append({
                    'name': inst_name,
                    'old_location': old_location,
                    'new_location': correction
                })
                print(f"[CORRECT] {inst_name}: {old_location['region']} → {correction['city']}, {correction['region']}")

            # Check if needs reverse geocoding (has coords but no city)
            elif inst_name in REVERSE_GEOCODE_NEEDED:
                if not loc.get('city') and loc.get('latitude') and loc.get('longitude'):
                    # For now, just log these - actual geocoding would require API calls
                    stats['reverse_geocoded'].append({
                        'name': inst_name,
                        'region': loc.get('region'),
                        'latitude': loc.get('latitude'),
                        'longitude': loc.get('longitude'),
                        'note': 'Needs Nominatim reverse geocoding'
                    })
                    print(f"[GEOCODE NEEDED] {inst_name}: {loc.get('region')} ({loc.get('latitude')}, {loc.get('longitude')})")

    # Remove non-Mexican institutions (in reverse order to preserve indices)
    for idx in sorted(institutions_to_remove, reverse=True):
        del data[idx]

    stats['mexican_after'] = sum(
        1 for inst in data
        if inst.get('locations') and
        any(loc.get('country') == 'MX' for loc in inst['locations'])
    )

    # Write corrected data
    if not dry_run:
        # Create backup of the untouched on-disk file before overwriting it
        backup_path = filepath.replace('.yaml', '_backup_before_geography_fix.yaml')
        with open(filepath, 'r', encoding='utf-8') as orig:
            original_text = orig.read()
        with open(backup_path, 'w', encoding='utf-8') as f:
            f.write(original_text)
        print(f"\n✅ Backup created: {backup_path}")

        # Write corrected data
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print(f"✅ Corrected data written to: {filepath}")
    else:
        print("\n[DRY RUN] No files modified")

    return stats


def print_report(stats: Dict):
    """Print correction report summarizing removals, corrections and pending geocoding."""
    print("\n" + "=" * 80)
    print("MEXICAN GEOGRAPHY CORRECTION REPORT")
    print("=" * 80)
    print()
    print(f"Total institutions in dataset: {stats['total_institutions']:,}")
    print(f"Mexican institutions before: {stats['mexican_before']}")
    print(f"Mexican institutions after: {stats['mexican_after']}")
    print(f"Change: {stats['mexican_after'] - stats['mexican_before']:+d}")
    print()
    print("-" * 80)
    print(f"REMOVED: {len(stats['removed'])} non-Mexican institutions")
    print("-" * 80)
    for item in stats['removed']:
        print(f" • {item['name']}")
        print(f" Reason: {item['reason']}")
        print(f" Original: {item['original_location']}")
    print()
    print("-" * 80)
    print(f"CORRECTED: {len(stats['corrected'])} location errors")
    print("-" * 80)
    for item in stats['corrected']:
        print(f" • {item['name']}")
        print(f" Old: {item['old_location']['region']} ({item['old_location']['latitude']}, {item['old_location']['longitude']})")
        print(f" New: {item['new_location']['city']}, {item['new_location']['region']} " +
              f"({item['new_location']['latitude']}, {item['new_location']['longitude']})")
    print()
    print("-" * 80)
    print(f"NEEDS REVERSE GEOCODING: {len(stats['reverse_geocoded'])} institutions")
    print("-" * 80)
    for item in stats['reverse_geocoded']:
        print(f" • {item['name']}")
        print(f" Region: {item['region']}, Coords: {item['latitude']}, {item['longitude']}")
    print()
    print("=" * 80)


def main():
    """CLI entry point: apply (or dry-run with --dry-run) the geography fixes."""
    filepath = 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'

    # Check if dry run
    dry_run = '--dry-run' in sys.argv
    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    print("Fixing Mexican geographic data...")
    stats = fix_mexican_geography(filepath, dry_run=dry_run)
    print_report(stats)

    if not dry_run:
        print("\n✅ Geographic corrections completed!")
        print("📋 Next steps:")
        print(" 1. Run scripts/analyze_mexican_geography.py to verify corrections")
        print(" 2. Implement reverse geocoding for remaining 12 institutions")
        print(" 3. Continue with Wikidata enrichment")
    else:
        print("\n💡 Run without --dry-run flag to apply corrections")


if __name__ == '__main__':
    main()