#!/usr/bin/env python3
"""Normalize Mexican city names in the global dataset.

Changes:
1. "Mexico City" → "Ciudad de México" (24 institutions)
2. Preserve existing "Ciudad de México" entries unchanged (4 institutions)

Rationale:
- Official name since 2016 constitutional reform
- Aligns with heritage institution best practice (use local official names)
- Consistent with geocoded entries that have specific coordinates
"""
import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml


def _record_normalization(inst: dict, timestamp: str) -> None:
    """Append a provenance note documenting the city-name normalization."""
    provenance = inst.setdefault('provenance', {})
    provenance.setdefault('enrichment_history', []).append({
        'enrichment_date': timestamp,
        'enrichment_type': 'LOCATION_NORMALIZATION',
        'enrichment_method': 'MANUAL_CORRECTION',
        'enrichment_notes': (
            'City name normalized from "Mexico City" to "Ciudad de México" '
            'to align with official name (since 2016 constitutional reform) '
            'and heritage institution best practices.'
        ),
        'verified': True
    })
    # Keep the institution-level timestamp in sync with the enrichment entry.
    provenance['last_updated'] = timestamp


def normalize_mexican_cities(input_file: str, output_file: str | None = None) -> dict:
    """Normalize city names for Mexican institutions.

    Rewrites any MX location whose city is "Mexico City" to the official
    "Ciudad de México", recording a provenance enrichment entry per change.

    Args:
        input_file: Path to the YAML file of institutions to process.
        output_file: Destination path; defaults to overwriting ``input_file``.

    Returns:
        Counters: ``mexico_city_normalized`` and ``already_correct``.
    """
    if output_file is None:
        output_file = input_file

    # Load data (safe_load yields None for an empty file — treat as no records)
    print(f"Loading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or []

    print(f"Total institutions: {len(data)}")

    # Track changes
    changes = {
        'mexico_city_normalized': 0,
        'already_correct': 0
    }

    # Process each institution
    for inst in data:
        if not inst.get('locations'):
            continue

        for loc in inst['locations']:
            if loc.get('country') != 'MX':
                continue

            city = loc.get('city', '')

            # Normalize "Mexico City" → "Ciudad de México"
            if city == 'Mexico City':
                loc['city'] = 'Ciudad de México'
                changes['mexico_city_normalized'] += 1

                # One timestamp per change so the enrichment entry and
                # provenance.last_updated agree exactly.
                timestamp = datetime.now(timezone.utc).isoformat()
                _record_normalization(inst, timestamp)

            elif city == 'Ciudad de México':
                changes['already_correct'] += 1

    # Write output
    print(f"\nWriting: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)

    # Report
    print("\n" + "=" * 60)
    print("NORMALIZATION SUMMARY")
    print("=" * 60)
    print(f"Mexico City → Ciudad de México: {changes['mexico_city_normalized']}")
    print(f"Already correct (Ciudad de México): {changes['already_correct']}")
    print(f"Total Mexican capital institutions: {changes['mexico_city_normalized'] + changes['already_correct']}")
    print("\nOutput file:", output_file)

    return changes


if __name__ == '__main__':
    input_file = 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'

    # Backup first
    backup_file = input_file.replace('.yaml', '_backup_before_city_normalization.yaml')
    print(f"Creating backup: {backup_file}")
    shutil.copy2(input_file, backup_file)

    # Normalize
    normalize_mexican_cities(input_file)