#!/usr/bin/env python3
|
|
"""
|
|
Fix geographic errors in Mexican heritage institutions dataset.
|
|
|
|
This script corrects three types of geographic issues:
|
|
1. Non-Mexican institutions incorrectly classified as Mexican
|
|
2. Institutions with wrong Zacatecas coordinates (state centroid errors)
|
|
3. Institutions missing city field but with valid state-level coordinates
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional
|
|
import sys
|
|
|
|
# Known location corrections based on institutional websites and research.
# Each entry fully replaces the city/region/country/coordinate fields of the
# matching institution's location record.

# Shared place fields for institutions headquartered in Mexico City.
_CDMX_PLACE = {
    'city': 'Ciudad de México',
    'region': 'Ciudad de México',
    'country': 'MX',
}

# Shared place fields for the Fototeca Nacional complex in Pachuca, Hidalgo.
_PACHUCA_PLACE = {
    'city': 'Pachuca',
    'region': 'Hidalgo',
    'country': 'MX',
}

LOCATION_CORRECTIONS = {
    # Mexico City institutions (incorrectly placed in Zacatecas)
    'INALI': {
        **_CDMX_PLACE,
        'latitude': 19.3406624,
        'longitude': -99.1989974,
        'source': 'https://www.inali.gob.mx - Instituto Nacional de Lenguas Indígenas headquarters',
        'geocode_method': 'Manual correction from institutional website',
    },
    'CONACYT Repository': {
        **_CDMX_PLACE,
        'latitude': 19.3907336,
        'longitude': -99.1436127,
        'source': 'CONACYT (Consejo Nacional de Ciencia y Tecnología) headquarters',
        'geocode_method': 'Manual correction - national science agency',
    },
    'INPI Archives': {
        **_CDMX_PLACE,
        'latitude': 19.4326077,
        'longitude': -99.133208,
        'source': 'INPI (Instituto Nacional de los Pueblos Indígenas) headquarters',
        'geocode_method': 'Manual correction from institutional context',
    },
    'Community Museums Network': {
        **_CDMX_PLACE,
        'latitude': 19.4326077,
        'longitude': -99.133208,
        'source': 'Part of INAH/national museum network coordinated from Mexico City',
        'geocode_method': 'Manual correction - national network',
    },
    'Casasola Archive': {
        **_PACHUCA_PLACE,
        'latitude': 20.1220032,
        'longitude': -98.7380387,
        'source': 'Part of Fototeca Nacional in Pachuca, Hidalgo',
        'geocode_method': 'Manual correction - historical photograph collection',
    },
    'Fototeca Nacional': {
        **_PACHUCA_PLACE,
        'latitude': 20.1220032,
        'longitude': -98.7380387,
        'source': 'Fototeca Nacional del INAH, Pachuca, Hidalgo',
        'geocode_method': 'Manual correction from institutional website',
    },
    'SINAFO Network': {
        **_CDMX_PLACE,
        'latitude': 19.4326077,
        'longitude': -99.133208,
        'source': 'Sistema Nacional de Fototecas (INAH network headquarters)',
        'geocode_method': 'Manual correction - national network coordination',
    },
    'Fundación Televisa': {
        **_CDMX_PLACE,
        'latitude': 19.3634032,
        'longitude': -99.2580692,
        'source': 'Fundación Televisa headquarters, Chapultepec area',
        'geocode_method': 'Manual correction from corporate foundation location',
    },
    'Agrasanchez Archive': {
        'city': 'Harlingen',
        'region': 'Texas',
        'country': 'US',
        'latitude': 26.1906306,
        'longitude': -97.6961026,
        'source': 'Agrasánchez Film Archive, Harlingen, Texas (Mexican film archive in USA)',
        'geocode_method': 'Manual correction - US-based archive of Mexican cinema',
        'note': 'US institution collecting Mexican heritage materials - consider removal from MX dataset',
    },
}
|
|
|
|
# Institutions to remove from Mexican dataset (non-Mexican institutions).
# Matched by exact institution name; matching entries are dropped entirely
# from the output file by fix_mexican_geography().
REMOVE_FROM_DATASET = [
    'Library of Congress',  # This is the main LoC entry with wrong MX coordinates
    'Digital Florentine Codex',  # Getty Research Institute project (Los Angeles)
    'FAMSI Database',  # Foundation for Advancement of Mesoamerican Studies (USA)
]
|
|
|
|
# Institutions that need reverse geocoding from existing coordinates.
# These have valid state-level coordinates but no city field; this script
# only logs them — the actual reverse geocoding (e.g. via Nominatim) is a
# follow-up step because it requires network API calls.
REVERSE_GEOCODE_NEEDED = [
    'Patrimonio Cultural Inmaterial Catalog',
    'Instituto Sudcaliforniano de Cultura',
    'Calakmul Digital Project',
    'Colonial Campeche Archive',
    'Palenque Site Museum',
    'Indigenous Languages Archive',
    'La Casa Redonda',
    'Francisco de Burgoa Library',
    'Chetumal City Museum',
    'General Archive of Quintana Roo',
    'UAS Repository',
    'Instituto Tamaulipeco para la Cultura y las Artes',
]
|
|
|
|
def fix_mexican_geography(filepath: str, dry_run: bool = False) -> Dict:
    """
    Fix geographic errors in the Mexican institutions dataset.

    Applies three kinds of fixes to the YAML file at *filepath*:
      1. Removes institutions listed in REMOVE_FROM_DATASET (non-Mexican).
      2. Rewrites city/region/country/coordinates for institutions in
         LOCATION_CORRECTIONS and records the change in provenance.
      3. Logs institutions in REVERSE_GEOCODE_NEEDED that have coordinates
         but no city (actual geocoding is deferred to a later step).

    Args:
        filepath: Path to the deduplicated institutions YAML file.
        dry_run: When True, report what would change but write nothing.

    Returns:
        Dictionary with correction statistics: counts plus per-institution
        detail lists under 'removed', 'corrected' and 'reverse_geocoded'.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    stats = {
        'total_institutions': len(data),
        'mexican_before': 0,
        'mexican_after': 0,
        'removed': [],
        'corrected': [],
        'reverse_geocoded': [],
        'errors': []
    }

    # Count *institutions* (not locations) with at least one MX location, so
    # 'mexican_before' is directly comparable with 'mexican_after' (the old
    # code counted per-location before but per-institution after).
    stats['mexican_before'] = _count_mexican_institutions(data)

    timestamp = datetime.now(timezone.utc).isoformat()
    # A set: an institution with several locations must be removed only once.
    # Duplicate indices would make the reverse-order delete below remove
    # unrelated entries.
    indices_to_remove = set()

    # Process all institutions
    for idx, inst in enumerate(data):
        if not inst.get('locations'):
            continue

        inst_name = inst['name']

        for loc in inst['locations']:
            # Check if institution should be removed
            if inst_name in REMOVE_FROM_DATASET:
                indices_to_remove.add(idx)
                stats['removed'].append({
                    'name': inst_name,
                    'reason': 'Non-Mexican institution incorrectly classified',
                    'original_location': f"{loc.get('region', 'N/A')}, {loc.get('country')}"
                })
                print(f"[REMOVE] {inst_name} - Non-Mexican institution")
                continue

            # Check for location corrections
            if inst_name in LOCATION_CORRECTIONS:
                correction = LOCATION_CORRECTIONS[inst_name]
                old_location = _apply_correction(inst, loc, correction, timestamp)
                stats['corrected'].append({
                    'name': inst_name,
                    'old_location': old_location,
                    'new_location': correction
                })
                print(f"[CORRECT] {inst_name}: {old_location['region']} → {correction['city']}, {correction['region']}")

            # Check if needs reverse geocoding (has coords but no city)
            elif inst_name in REVERSE_GEOCODE_NEEDED:
                if not loc.get('city') and loc.get('latitude') and loc.get('longitude'):
                    # For now, just log these - actual geocoding would require API calls
                    stats['reverse_geocoded'].append({
                        'name': inst_name,
                        'region': loc.get('region'),
                        'latitude': loc.get('latitude'),
                        'longitude': loc.get('longitude'),
                        'note': 'Needs Nominatim reverse geocoding'
                    })
                    print(f"[GEOCODE NEEDED] {inst_name}: {loc.get('region')} ({loc.get('latitude')}, {loc.get('longitude')})")

    # Remove non-Mexican institutions (in reverse order to preserve indices)
    for idx in sorted(indices_to_remove, reverse=True):
        del data[idx]

    stats['mexican_after'] = _count_mexican_institutions(data)

    if not dry_run:
        _write_with_backup(filepath, data)
    else:
        print("\n[DRY RUN] No files modified")

    return stats


def _count_mexican_institutions(data: List[Dict]) -> int:
    """Count institutions with at least one location in Mexico ('MX')."""
    return sum(
        1 for inst in data
        if inst.get('locations') and any(loc.get('country') == 'MX' for loc in inst['locations'])
    )


def _apply_correction(inst: Dict, loc: Dict, correction: Dict, timestamp: str) -> Dict:
    """Overwrite *loc* in place with the fields from *correction* and append
    a GEOGRAPHIC_CORRECTION record to the institution's provenance.

    Returns the location fields as they were before the correction.
    """
    old_location = {
        'city': loc.get('city', 'N/A'),
        'region': loc.get('region', 'N/A'),
        'latitude': loc.get('latitude'),
        'longitude': loc.get('longitude')
    }

    # Apply correction (only the place fields; 'source' etc. stay out of loc)
    for field in ('city', 'region', 'country', 'latitude', 'longitude'):
        loc[field] = correction[field]

    # Add enrichment history so the change is auditable
    provenance = inst.setdefault('provenance', {})
    history = provenance.setdefault('enrichment_history', [])
    history.append({
        'enrichment_date': timestamp,
        'enrichment_type': 'GEOGRAPHIC_CORRECTION',
        'enrichment_method': correction['geocode_method'],
        'enrichment_source': correction['source'],
        'verified': True,
        'enrichment_notes': f"Corrected from incorrect Zacatecas coordinates. " +
                            f"Old: {old_location['region']} ({old_location['latitude']}, {old_location['longitude']}). " +
                            f"New: {correction['city']}, {correction['region']}"
    })
    provenance['last_updated'] = timestamp

    return old_location


def _write_with_backup(filepath: str, data: List[Dict]) -> None:
    """Copy the original file to a backup path, then overwrite it with *data*."""
    backup_path = filepath.replace('.yaml', '_backup_before_geography_fix.yaml')
    if backup_path == filepath:
        # Filepath had no '.yaml' suffix: the old str.replace would have made
        # backup_path identical to filepath and truncated the source on open.
        backup_path = filepath + '.backup_before_geography_fix'

    # Read the original before any write so the backup is the pre-fix content
    with open(filepath, 'r', encoding='utf-8') as orig, \
            open(backup_path, 'w', encoding='utf-8') as f:
        f.write(orig.read())
    print(f"\n✅ Backup created: {backup_path}")

    # Write corrected data
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print(f"✅ Corrected data written to: {filepath}")
|
|
|
|
def print_report(stats: Dict):
    """Print a human-readable summary of the geography corrections."""
    bar = "=" * 80
    rule = "-" * 80

    print("\n" + bar)
    print("MEXICAN GEOGRAPHY CORRECTION REPORT")
    print(bar)
    print()

    before = stats['mexican_before']
    after = stats['mexican_after']
    print(f"Total institutions in dataset: {stats['total_institutions']:,}")
    print(f"Mexican institutions before: {before}")
    print(f"Mexican institutions after: {after}")
    print(f"Change: {after - before:+d}")
    print()

    # Removed (non-Mexican) institutions
    print(rule)
    print(f"REMOVED: {len(stats['removed'])} non-Mexican institutions")
    print(rule)
    for entry in stats['removed']:
        print(f" • {entry['name']}")
        print(f" Reason: {entry['reason']}")
        print(f" Original: {entry['original_location']}")
        print()

    # Corrected location errors
    print(rule)
    print(f"CORRECTED: {len(stats['corrected'])} location errors")
    print(rule)
    for entry in stats['corrected']:
        old = entry['old_location']
        new = entry['new_location']
        print(f" • {entry['name']}")
        print(f" Old: {old['region']} ({old['latitude']}, {old['longitude']})")
        print(f" New: {new['city']}, {new['region']} ({new['latitude']}, {new['longitude']})")
        print()

    # Institutions still awaiting reverse geocoding
    print(rule)
    print(f"NEEDS REVERSE GEOCODING: {len(stats['reverse_geocoded'])} institutions")
    print(rule)
    for entry in stats['reverse_geocoded']:
        print(f" • {entry['name']}")
        print(f" Region: {entry['region']}, Coords: {entry['latitude']}, {entry['longitude']}")
        print()

    print(bar)
|
|
def main():
    """Entry point: fix the Mexican dataset in place and print a report.

    Usage:
        fix_mexican_geography.py [--dry-run] [DATASET_PATH]

    An optional positional argument overrides the default dataset path;
    ``--dry-run`` reports what would change without modifying any file.
    """
    args = sys.argv[1:]

    # Check if dry run
    dry_run = '--dry-run' in args

    # Any non-flag argument is the dataset path. This generalizes the
    # previously hard-coded location while keeping it as the default,
    # so existing invocations behave unchanged.
    positional = [a for a in args if not a.startswith('--')]
    filepath = positional[0] if positional else \
        'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'

    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    print("Fixing Mexican geographic data...")
    stats = fix_mexican_geography(filepath, dry_run=dry_run)

    print_report(stats)

    if not dry_run:
        print("\n✅ Geographic corrections completed!")
        print("📋 Next steps:")
        print(" 1. Run scripts/analyze_mexican_geography.py to verify corrections")
        print(" 2. Implement reverse geocoding for remaining 12 institutions")
        print(" 3. Continue with Wikidata enrichment")
    else:
        print("\n💡 Run without --dry-run flag to apply corrections")
|
|
# Run only when executed as a script, so the module stays importable.
if __name__ == '__main__':
    main()
|