#!/usr/bin/env python3
"""
|
|
Normalize Mexican city names in the global dataset.
|
|
|
|
Changes:
|
|
1. "Mexico City" → "Ciudad de México" (24 institutions)
|
|
2. Preserve existing "Ciudad de México" entries unchanged (4 institutions)
|
|
|
|
Rationale:
|
|
- Official name since 2016 constitutional reform
|
|
- Aligns with heritage institution best practice (use local official names)
|
|
- Consistent with geocoded entries that have specific coordinates
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
|
|
def _record_city_normalization(inst: dict, timestamp: str) -> None:
    """Append one LOCATION_NORMALIZATION entry to *inst*'s provenance.

    A ``provenance`` or ``enrichment_history`` key that is missing or null is
    created; existing values are extended in place. ``last_updated`` is set to
    the same *timestamp* that is stored in the new history entry.
    """
    provenance = inst.get('provenance')
    if provenance is None:
        # Covers both a missing key and an explicit YAML null.
        provenance = inst['provenance'] = {}

    history = provenance.get('enrichment_history')
    if history is None:
        history = provenance['enrichment_history'] = []

    history.append({
        'enrichment_date': timestamp,
        'enrichment_type': 'LOCATION_NORMALIZATION',
        'enrichment_method': 'MANUAL_CORRECTION',
        'enrichment_notes': (
            'City name normalized from "Mexico City" to "Ciudad de México" '
            'to align with official name (since 2016 constitutional reform) '
            'and heritage institution best practices.'
        ),
        'verified': True,
    })
    provenance['last_updated'] = timestamp


def normalize_mexican_cities(input_file: str, output_file: str | None = None) -> dict[str, int]:
    """Normalize city names for Mexican institutions.

    Loads a YAML list of institution records, rewrites every location whose
    ``country`` is ``'MX'`` and whose ``city`` is exactly ``'Mexico City'`` to
    ``'Ciudad de México'``, records the change in the institution's
    provenance, and writes the whole dataset back out as YAML.

    Args:
        input_file: Path of the YAML file to read.
        output_file: Path to write the result to. When ``None`` the input
            file is overwritten.

    Returns:
        A dict with two per-location counters: ``'mexico_city_normalized'``
        (locations renamed) and ``'already_correct'`` (locations that already
        used ``'Ciudad de México'``).

    Raises:
        TypeError: If the YAML document is neither empty nor a top-level list.
    """
    legacy_name = 'Mexico City'
    official_name = 'Ciudad de México'

    if output_file is None:
        output_file = input_file

    # Load data
    print(f"Loading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if data is None:
        # safe_load returns None for an empty document; treat it as "no
        # institutions" instead of failing on len(None).
        data = []
    elif not isinstance(data, list):
        raise TypeError(
            f"Expected a top-level list of institutions in {input_file}, "
            f"got {type(data).__name__}"
        )

    print(f"Total institutions: {len(data)}")

    # One timestamp for the whole run, so enrichment_date and last_updated
    # agree with each other and across every record touched by this run.
    run_timestamp = datetime.now(timezone.utc).isoformat()

    # Track changes
    changes = {
        'mexico_city_normalized': 0,
        'already_correct': 0,
    }

    # Process each institution
    for inst in data:
        if not isinstance(inst, dict):
            continue  # null or scalar list entry: nothing to normalize

        locations = inst.get('locations')
        if not isinstance(locations, list):
            continue

        renamed_here = 0
        for loc in locations:
            if not isinstance(loc, dict) or loc.get('country') != 'MX':
                continue

            city = loc.get('city', '')
            if city == legacy_name:
                loc['city'] = official_name
                renamed_here += 1
            elif city == official_name:
                changes['already_correct'] += 1

        if renamed_here:
            changes['mexico_city_normalized'] += renamed_here
            # Provenance lives on the institution, so log the normalization
            # once per institution even when several locations were renamed.
            _record_city_normalization(inst, run_timestamp)

    # Write output
    print(f"\nWriting: {output_file}")
    # Serialize to a string before opening the target: opening with 'w'
    # truncates immediately, and the target defaults to the input file, so a
    # serialization error must not be able to wipe the source data.
    serialized = yaml.dump(data, allow_unicode=True, sort_keys=False, default_flow_style=False)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(serialized)

    # Report
    total_capital = changes['mexico_city_normalized'] + changes['already_correct']
    print("\n" + "=" * 60)
    print("NORMALIZATION SUMMARY")
    print("=" * 60)
    print(f"Mexico City → Ciudad de México: {changes['mexico_city_normalized']}")
    print(f"Already correct (Ciudad de México): {changes['already_correct']}")
    print(f"Total Mexican capital institutions: {total_capital}")
    print("\nOutput file:", output_file)

    return changes
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: snapshot the dataset, then normalize it in place.
    import shutil

    backup_suffix = '_backup_before_city_normalization.yaml'
    dataset_path = 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'

    # Keep an untouched copy next to the dataset before it is overwritten.
    backup_path = dataset_path.replace('.yaml', backup_suffix)
    print(f"Creating backup: {backup_path}")
    shutil.copy2(dataset_path, backup_path)

    # No output path is given, so the dataset file itself is rewritten.
    normalize_mexican_cities(dataset_path)
|