# glam/scripts/fix_mexican_geography.py
# Snapshot: 2025-11-19 23:25:22 +01:00 (326 lines, 13 KiB, Python)
#!/usr/bin/env python3
"""
Fix geographic errors in Mexican heritage institutions dataset.
This script corrects three types of geographic issues:
1. Non-Mexican institutions incorrectly classified as Mexican
2. Institutions with wrong Zacatecas coordinates (state centroid errors)
3. Institutions missing city field but with valid state-level coordinates
"""
import yaml
from datetime import datetime, timezone
from typing import Dict, List, Optional
import sys
# Manually researched location fixes, keyed by institution name.
# Each entry supplies the corrected city/region/country and coordinates,
# plus the evidence ('source') and how it was derived ('geocode_method').
LOCATION_CORRECTIONS = {
    # Mexico City institutions (incorrectly placed in Zacatecas)
    'INALI': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.3406624,
        'longitude': -99.1989974,
        'source': 'https://www.inali.gob.mx - Instituto Nacional de Lenguas Indígenas headquarters',
        'geocode_method': 'Manual correction from institutional website',
    },
    'CONACYT Repository': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.3907336,
        'longitude': -99.1436127,
        'source': 'CONACYT (Consejo Nacional de Ciencia y Tecnología) headquarters',
        'geocode_method': 'Manual correction - national science agency',
    },
    'INPI Archives': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.4326077,
        'longitude': -99.133208,
        'source': 'INPI (Instituto Nacional de los Pueblos Indígenas) headquarters',
        'geocode_method': 'Manual correction from institutional context',
    },
    'Community Museums Network': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.4326077,
        'longitude': -99.133208,
        'source': 'Part of INAH/national museum network coordinated from Mexico City',
        'geocode_method': 'Manual correction - national network',
    },
    'Casasola Archive': {
        'city': 'Pachuca',
        'region': 'Hidalgo',
        'country': 'MX',
        'latitude': 20.1220032,
        'longitude': -98.7380387,
        'source': 'Part of Fototeca Nacional in Pachuca, Hidalgo',
        'geocode_method': 'Manual correction - historical photograph collection',
    },
    'Fototeca Nacional': {
        'city': 'Pachuca',
        'region': 'Hidalgo',
        'country': 'MX',
        'latitude': 20.1220032,
        'longitude': -98.7380387,
        'source': 'Fototeca Nacional del INAH, Pachuca, Hidalgo',
        'geocode_method': 'Manual correction from institutional website',
    },
    'SINAFO Network': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.4326077,
        'longitude': -99.133208,
        'source': 'Sistema Nacional de Fototecas (INAH network headquarters)',
        'geocode_method': 'Manual correction - national network coordination',
    },
    'Fundación Televisa': {
        'city': 'Ciudad de México',
        'region': 'Ciudad de México',
        'country': 'MX',
        'latitude': 19.3634032,
        'longitude': -99.2580692,
        'source': 'Fundación Televisa headquarters, Chapultepec area',
        'geocode_method': 'Manual correction from corporate foundation location',
    },
    'Agrasanchez Archive': {
        'city': 'Harlingen',
        'region': 'Texas',
        'country': 'US',
        'latitude': 26.1906306,
        'longitude': -97.6961026,
        'source': 'Agrasánchez Film Archive, Harlingen, Texas (Mexican film archive in USA)',
        'geocode_method': 'Manual correction - US-based archive of Mexican cinema',
        'note': 'US institution collecting Mexican heritage materials - consider removal from MX dataset',
    },
}

# Institutions that are not Mexican at all and must be dropped from the dataset.
REMOVE_FROM_DATASET = [
    'Library of Congress',  # This is the main LoC entry with wrong MX coordinates
    'Digital Florentine Codex',  # Getty Research Institute project (Los Angeles)
    'FAMSI Database',  # Foundation for Advancement of Mesoamerican Studies (USA)
]

# Institutions with valid coordinates but no city field; they need a
# reverse-geocoding pass (e.g. Nominatim) in a later step.
REVERSE_GEOCODE_NEEDED = [
    'Patrimonio Cultural Inmaterial Catalog',
    'Instituto Sudcaliforniano de Cultura',
    'Calakmul Digital Project',
    'Colonial Campeche Archive',
    'Palenque Site Museum',
    'Indigenous Languages Archive',
    'La Casa Redonda',
    'Francisco de Burgoa Library',
    'Chetumal City Museum',
    'General Archive of Quintana Roo',
    'UAS Repository',
    'Instituto Tamaulipeco para la Cultura y las Artes',
]
def fix_mexican_geography(filepath: str, dry_run: bool = False) -> Dict:
    """Fix geographic errors in the Mexican institutions dataset.

    Applies three kinds of fixes to the YAML dataset at *filepath*:
    removes non-Mexican institutions (``REMOVE_FROM_DATASET``), rewrites
    known-bad locations (``LOCATION_CORRECTIONS``) with provenance entries,
    and logs institutions that still need reverse geocoding
    (``REVERSE_GEOCODE_NEEDED``).

    Args:
        filepath: Path to the YAML dataset; rewritten in place unless
            ``dry_run`` is set. A backup of the original is written first.
        dry_run: When True, report what would change but modify no files.

    Returns:
        Dictionary with correction statistics: totals plus per-item details
        under 'removed', 'corrected', and 'reverse_geocoded'.
    """
    # Read the raw text once: it is parsed here and reused verbatim for the
    # backup copy below, avoiding a second (nested) open of the same file.
    with open(filepath, 'r', encoding='utf-8') as f:
        original_text = f.read()
    data = yaml.safe_load(original_text)

    stats = {
        'total_institutions': len(data),
        # NOTE: counted per MX *location*, so an institution with several
        # MX locations contributes more than once — matches original logic.
        'mexican_before': 0,
        'mexican_after': 0,
        'removed': [],
        'corrected': [],
        'reverse_geocoded': [],
        'errors': []
    }
    timestamp = datetime.now(timezone.utc).isoformat()
    # BUGFIX: use a set, not a list. With a list, an institution having
    # multiple MX locations was appended multiple times, and deleting the
    # same index twice below would remove an unrelated neighboring record.
    indices_to_remove = set()

    # Process all institutions
    for idx, inst in enumerate(data):
        if not inst.get('locations'):
            continue
        inst_name = inst['name']
        for loc in inst['locations']:
            if loc.get('country') == 'MX':
                stats['mexican_before'] += 1

                # Non-Mexican institutions incorrectly classified as MX.
                if inst_name in REMOVE_FROM_DATASET:
                    if idx not in indices_to_remove:
                        indices_to_remove.add(idx)
                        stats['removed'].append({
                            'name': inst_name,
                            'reason': 'Non-Mexican institution incorrectly classified',
                            'original_location': f"{loc.get('region', 'N/A')}, {loc.get('country')}"
                        })
                        print(f"[REMOVE] {inst_name} - Non-Mexican institution")
                    continue

                # Apply a known manual correction.
                if inst_name in LOCATION_CORRECTIONS:
                    correction = LOCATION_CORRECTIONS[inst_name]
                    old_location = {
                        'city': loc.get('city', 'N/A'),
                        'region': loc.get('region', 'N/A'),
                        'latitude': loc.get('latitude'),
                        'longitude': loc.get('longitude')
                    }
                    loc['city'] = correction['city']
                    loc['region'] = correction['region']
                    loc['country'] = correction['country']
                    loc['latitude'] = correction['latitude']
                    loc['longitude'] = correction['longitude']

                    # Record the change in the provenance trail so the edit
                    # is auditable after the file is rewritten.
                    if 'provenance' not in inst:
                        inst['provenance'] = {}
                    if 'enrichment_history' not in inst['provenance']:
                        inst['provenance']['enrichment_history'] = []
                    inst['provenance']['enrichment_history'].append({
                        'enrichment_date': timestamp,
                        'enrichment_type': 'GEOGRAPHIC_CORRECTION',
                        'enrichment_method': correction['geocode_method'],
                        'enrichment_source': correction['source'],
                        'verified': True,
                        'enrichment_notes': (
                            f"Corrected from incorrect Zacatecas coordinates. "
                            f"Old: {old_location['region']} ({old_location['latitude']}, {old_location['longitude']}). "
                            f"New: {correction['city']}, {correction['region']}"
                        )
                    })
                    inst['provenance']['last_updated'] = timestamp

                    stats['corrected'].append({
                        'name': inst_name,
                        'old_location': old_location,
                        'new_location': correction
                    })
                    # BUGFIX: the old→new separator was missing from this message.
                    print(f"[CORRECT] {inst_name}: {old_location['region']} → {correction['city']}, {correction['region']}")

                # Has coordinates but no city: flag for reverse geocoding.
                # Actual geocoding would require API calls, so just log it.
                elif inst_name in REVERSE_GEOCODE_NEEDED:
                    if not loc.get('city') and loc.get('latitude') and loc.get('longitude'):
                        stats['reverse_geocoded'].append({
                            'name': inst_name,
                            'region': loc.get('region'),
                            'latitude': loc.get('latitude'),
                            'longitude': loc.get('longitude'),
                            'note': 'Needs Nominatim reverse geocoding'
                        })
                        print(f"[GEOCODE NEEDED] {inst_name}: {loc.get('region')} ({loc.get('latitude')}, {loc.get('longitude')})")

    # Delete in descending index order so earlier deletions do not shift
    # the positions of indices still pending removal.
    for idx in sorted(indices_to_remove, reverse=True):
        del data[idx]

    stats['mexican_after'] = sum(
        1 for inst in data
        if inst.get('locations') and any(loc.get('country') == 'MX' for loc in inst['locations'])
    )

    if not dry_run:
        # BUGFIX: str.replace('.yaml', ...) on a path not ending in '.yaml'
        # silently left backup_path == filepath, so the backup write would
        # have truncated the very file about to be saved.
        if filepath.endswith('.yaml'):
            backup_path = filepath[:-len('.yaml')] + '_backup_before_geography_fix.yaml'
        else:
            backup_path = filepath + '_backup_before_geography_fix'
        with open(backup_path, 'w', encoding='utf-8') as f:
            f.write(original_text)
        print(f"\n✅ Backup created: {backup_path}")

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print(f"✅ Corrected data written to: {filepath}")
    else:
        print("\n[DRY RUN] No files modified")

    return stats
def print_report(stats: Dict):
    """Print a human-readable summary of the geographic-correction stats."""
    heavy = "=" * 80
    light = "-" * 80

    print("\n" + heavy)
    print("MEXICAN GEOGRAPHY CORRECTION REPORT")
    print(heavy)
    print()

    delta = stats['mexican_after'] - stats['mexican_before']
    print(f"Total institutions in dataset: {stats['total_institutions']:,}")
    print(f"Mexican institutions before: {stats['mexican_before']}")
    print(f"Mexican institutions after: {stats['mexican_after']}")
    print(f"Change: {delta:+d}")
    print()

    # Section 1: institutions dropped entirely from the dataset.
    print(light)
    print(f"REMOVED: {len(stats['removed'])} non-Mexican institutions")
    print(light)
    for entry in stats['removed']:
        print(f"{entry['name']}")
        print(f" Reason: {entry['reason']}")
        print(f" Original: {entry['original_location']}")
    print()

    # Section 2: institutions whose coordinates were rewritten.
    print(light)
    print(f"CORRECTED: {len(stats['corrected'])} location errors")
    print(light)
    for entry in stats['corrected']:
        before, after = entry['old_location'], entry['new_location']
        print(f"{entry['name']}")
        print(f" Old: {before['region']} ({before['latitude']}, {before['longitude']})")
        print(f" New: {after['city']}, {after['region']} "
              f"({after['latitude']}, {after['longitude']})")
    print()

    # Section 3: institutions still awaiting a reverse-geocoding pass.
    print(light)
    print(f"NEEDS REVERSE GEOCODING: {len(stats['reverse_geocoded'])} institutions")
    print(light)
    for entry in stats['reverse_geocoded']:
        print(f"{entry['name']}")
        print(f" Region: {entry['region']}, Coords: {entry['latitude']}, {entry['longitude']}")
    print()
    print(heavy)
def main():
    """CLI entry point: apply (or, with --dry-run, preview) the corrections."""
    dataset_path = 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'

    # --dry-run anywhere on the command line switches to preview mode.
    preview_only = '--dry-run' in sys.argv
    if preview_only:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    print("Fixing Mexican geographic data...")
    report = fix_mexican_geography(dataset_path, dry_run=preview_only)
    print_report(report)

    if preview_only:
        print("\n💡 Run without --dry-run flag to apply corrections")
    else:
        print("\n✅ Geographic corrections completed!")
        print("📋 Next steps:")
        print(" 1. Run scripts/analyze_mexican_geography.py to verify corrections")
        print(" 2. Implement reverse geocoding for remaining 12 institutions")
        print(" 3. Continue with Wikidata enrichment")


if __name__ == '__main__':
    main()