glam/scripts/normalize_mexican_cities.py
2025-11-19 23:25:22 +01:00

107 lines
3.7 KiB
Python

#!/usr/bin/env python3
"""
Normalize Mexican city names in the global dataset.
Changes:
1. "Mexico City" → "Ciudad de México" (24 institutions)
2. Preserve existing "Ciudad de México" entries unchanged (4 institutions)
Rationale:
- Official name since 2016 constitutional reform
- Aligns with heritage institution best practice (use local official names)
- Consistent with geocoded entries that have specific coordinates
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
def _record_city_normalization(inst: dict, timestamp: str) -> None:
    """Append one LOCATION_NORMALIZATION entry to the institution's provenance.

    Creates ``provenance`` and ``provenance['enrichment_history']`` when they
    are missing or null, then sets ``provenance['last_updated']`` to
    ``timestamp``.
    """
    provenance = inst.get('provenance')
    if provenance is None:
        # Covers both an absent key and an explicit YAML null.
        provenance = inst['provenance'] = {}

    history = provenance.get('enrichment_history')
    if history is None:
        history = provenance['enrichment_history'] = []

    history.append({
        'enrichment_date': timestamp,
        'enrichment_type': 'LOCATION_NORMALIZATION',
        'enrichment_method': 'MANUAL_CORRECTION',
        'enrichment_notes': (
            'City name normalized from "Mexico City" to "Ciudad de México" '
            'to align with official name (since 2016 constitutional reform) '
            'and heritage institution best practices.'
        ),
        'verified': True
    })
    provenance['last_updated'] = timestamp


def normalize_mexican_cities(input_file: str, output_file: str | None = None) -> dict[str, int]:
    """Normalize city names for Mexican institutions.

    Loads a YAML list of institution records, rewrites every location with
    ``country == 'MX'`` and ``city == 'Mexico City'`` to ``'Ciudad de México'``,
    records the change in the institution's provenance, and writes the result
    back as YAML.

    Args:
        input_file: Path of the YAML dataset to read.
        output_file: Path to write the result to. Defaults to ``input_file``,
            i.e. the dataset is overwritten in place.

    Returns:
        A dict with two counters, both counted per matching location:
        ``'mexico_city_normalized'`` (locations renamed) and
        ``'already_correct'`` (locations already named "Ciudad de México").
    """
    if output_file is None:
        output_file = input_file

    # Load data
    print(f"Loading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; treat it as "no institutions"
        # instead of failing on len(None) below.
        data = yaml.safe_load(f) or []
    print(f"Total institutions: {len(data)}")

    # Track changes
    changes = {
        'mexico_city_normalized': 0,
        'already_correct': 0
    }

    # One timestamp for the whole run, so enrichment_date and last_updated
    # agree with each other and across institutions.
    run_timestamp = datetime.now(timezone.utc).isoformat()

    # Process each institution
    for inst in data:
        renamed_here = False
        for loc in inst.get('locations') or []:
            if loc.get('country') != 'MX':
                continue
            city = loc.get('city', '')
            if city == 'Mexico City':
                loc['city'] = 'Ciudad de México'
                changes['mexico_city_normalized'] += 1
                renamed_here = True
            elif city == 'Ciudad de México':
                changes['already_correct'] += 1

        # Record the normalization once per institution, even when several of
        # its locations were renamed; the note text is identical for each.
        if renamed_here:
            _record_city_normalization(inst, run_timestamp)

    # Write output
    print(f"\nWriting: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    # Report
    total = changes['mexico_city_normalized'] + changes['already_correct']
    print("\n" + "=" * 60)
    print("NORMALIZATION SUMMARY")
    print("=" * 60)
    print(f"Mexico City → Ciudad de México: {changes['mexico_city_normalized']}")
    print(f"Already correct (Ciudad de México): {changes['already_correct']}")
    print(f"Total Mexican capital institutions: {total}")
    print("\nOutput file:", output_file)
    return changes
if __name__ == '__main__':
    import shutil

    # Dataset that gets normalized in place.
    dataset_path = 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'

    # Copy the untouched dataset next to the original before it is overwritten.
    backup_path = dataset_path.replace('.yaml', '_backup_before_city_normalization.yaml')
    print(f"Creating backup: {backup_path}")
    shutil.copy2(dataset_path, backup_path)

    # No output_file is passed, so the result replaces the dataset itself.
    normalize_mexican_cities(dataset_path)