#!/usr/bin/env python3
"""
Fix remaining numeric region codes across multiple countries.

Maps GeoNames admin1 codes to ISO 3166-2 codes. For every
``<CC>-<digits>-*.yaml`` file in the custodian directory the script rewrites
the GHCID, the region codes stored in the record and the filename, and
appends an entry to ``ghcid.ghcid_history``.

Countries whose ISO 3166-2 codes are themselves numeric are listed in
SKIP_COUNTRIES; their files are left alone unless MAPPINGS contains an
explicit entry for the code, e.g.:
- JP (Japan): 01-47 are ISO codes
- CZ (Czech Republic): 10-80 are ISO NUTS codes
- AT (Austria): 1-9 are ISO codes
- TR (Turkey): 01-81 are ISO codes (plate codes = ISO)
- KR (South Korea): 11-50 are ISO codes

Usage:
    script.py [--dry-run] [--country CC] [--dir PATH]
"""

import os
import re
from datetime import datetime, timezone
from pathlib import Path

# NOTE: PyYAML is imported inside update_file() (the same way main() imports
# argparse) so that the mapping tables and the filename checks stay usable
# without the third-party dependency installed.

# Default location of the custodian YAML files (override with --dir).
DEFAULT_CUSTODIAN_DIR = Path('/Users/kempersc/apps/glam/data/custodian')

# Filenames handled by this script: "<2-letter country>-<numeric region>-...".
NUMERIC_REGION_RE = re.compile(r'^([A-Z]{2})-(\d+)-')

# Marker written to ghcid_history by update_file(); also used to recognise
# files that were already corrected by an earlier run.
_HISTORY_REASON_PREFIX = 'Region code corrected from GeoNames admin1'

# Mapping: (country, geonames_admin1) -> ISO 3166-2 code
MAPPINGS = {
    # Brazil - GeoNames admin1 to ISO state codes
    ('BR', '01'): 'AC',  # Acre
    ('BR', '02'): 'AL',  # Alagoas
    ('BR', '03'): 'AP',  # Amapá
    ('BR', '04'): 'AM',  # Amazonas
    ('BR', '05'): 'BA',  # Bahia
    ('BR', '06'): 'CE',  # Ceará
    ('BR', '07'): 'DF',  # Federal District
    ('BR', '08'): 'ES',  # Espírito Santo
    ('BR', '11'): 'MS',  # Mato Grosso do Sul
    ('BR', '13'): 'MA',  # Maranhão
    ('BR', '14'): 'MT',  # Mato Grosso
    ('BR', '15'): 'MG',  # Minas Gerais
    ('BR', '16'): 'PA',  # Pará
    ('BR', '17'): 'PB',  # Paraíba
    ('BR', '18'): 'PR',  # Paraná
    ('BR', '20'): 'PI',  # Piauí
    ('BR', '21'): 'RJ',  # Rio de Janeiro
    ('BR', '22'): 'RN',  # Rio Grande do Norte
    ('BR', '23'): 'RS',  # Rio Grande do Sul
    ('BR', '24'): 'RO',  # Rondônia
    ('BR', '25'): 'RR',  # Roraima
    ('BR', '26'): 'SC',  # Santa Catarina
    ('BR', '27'): 'SP',  # São Paulo
    ('BR', '28'): 'SE',  # Sergipe
    ('BR', '29'): 'GO',  # Goiás
    ('BR', '30'): 'PE',  # Pernambuco
    ('BR', '31'): 'TO',  # Tocantins

    # France - GeoNames admin1 to ISO region codes
    ('FR', '11'): 'IDF',  # Île-de-France
    ('FR', '24'): 'CVL',  # Centre-Val de Loire
    ('FR', '27'): 'BFC',  # Bourgogne-Franche-Comté
    ('FR', '28'): 'NOR',  # Normandie
    ('FR', '32'): 'HDF',  # Hauts-de-France
    ('FR', '44'): 'GES',  # Grand Est
    ('FR', '52'): 'PDL',  # Pays de la Loire
    ('FR', '53'): 'BRE',  # Bretagne
    ('FR', '75'): 'NAQ',  # Nouvelle-Aquitaine
    ('FR', '76'): 'OCC',  # Occitanie
    ('FR', '84'): 'ARA',  # Auvergne-Rhône-Alpes
    ('FR', '93'): 'PAC',  # Provence-Alpes-Côte d'Azur
    ('FR', '94'): 'COR',  # Corse
    # Department codes that should map to region
    ('FR', '92'): 'IDF',  # Hauts-de-Seine -> Île-de-France

    # Poland - GeoNames admin1 to ISO voivodeship codes
    ('PL', '72'): 'SK',  # Świętokrzyskie
    ('PL', '73'): 'KP',  # Kujawsko-Pomorskie
    ('PL', '74'): 'LU',  # Lubelskie
    ('PL', '75'): 'LB',  # Lubuskie
    ('PL', '76'): 'LD',  # Łódzkie
    ('PL', '77'): 'MA',  # Małopolskie
    ('PL', '78'): 'MZ',  # Mazowieckie
    ('PL', '79'): 'OP',  # Opolskie
    ('PL', '80'): 'PK',  # Podkarpackie
    ('PL', '81'): 'PD',  # Podlaskie
    ('PL', '82'): 'PM',  # Pomorskie
    ('PL', '83'): 'SL',  # Śląskie
    ('PL', '84'): 'WN',  # Warmińsko-Mazurskie
    ('PL', '85'): 'WP',  # Wielkopolskie
    ('PL', '86'): 'ZP',  # Zachodniopomorskie
    ('PL', '87'): 'DS',  # Dolnośląskie

    # Sweden - GeoNames admin1 to ISO county codes
    ('SE', '02'): 'AC',  # Västerbotten
    ('SE', '03'): 'Y',  # Västernorrland
    ('SE', '04'): 'Z',  # Jämtland
    ('SE', '05'): 'BD',  # Norrbotten
    ('SE', '06'): 'X',  # Gävleborg
    ('SE', '07'): 'W',  # Dalarna
    ('SE', '08'): 'S',  # Värmland
    ('SE', '09'): 'T',  # Örebro
    ('SE', '10'): 'U',  # Västmanland
    ('SE', '12'): 'C',  # Uppsala
    ('SE', '13'): 'D',  # Södermanland
    ('SE', '14'): 'AB',  # Stockholm
    ('SE', '15'): 'I',  # Gotland
    ('SE', '16'): 'E',  # Östergötland
    ('SE', '17'): 'F',  # Jönköping
    ('SE', '18'): 'H',  # Kalmar
    ('SE', '19'): 'G',  # Kronoberg
    ('SE', '21'): 'K',  # Blekinge
    ('SE', '22'): 'M',  # Skåne
    ('SE', '23'): 'N',  # Halland
    ('SE', '24'): 'O',  # Västra Götaland
    ('SE', '25'): 'AB',  # Stockholm (alt code)
    ('SE', '26'): 'AB',  # Stockholm

    # Italy - GeoNames admin1 to ISO region codes
    ('IT', '01'): '65',  # Abruzzo
    ('IT', '02'): '77',  # Basilicata
    ('IT', '03'): '78',  # Calabria
    ('IT', '04'): '72',  # Campania
    ('IT', '05'): '45',  # Emilia-Romagna
    ('IT', '06'): '36',  # Friuli-Venezia Giulia
    ('IT', '07'): '62',  # Lazio
    ('IT', '08'): '42',  # Liguria
    ('IT', '09'): '25',  # Lombardia
    ('IT', '10'): '57',  # Marche
    ('IT', '11'): '67',  # Molise
    ('IT', '12'): '21',  # Piemonte
    ('IT', '13'): '75',  # Puglia
    ('IT', '14'): '88',  # Sardegna
    ('IT', '15'): '82',  # Sicilia
    ('IT', '16'): '52',  # Toscana
    ('IT', '17'): '32',  # Trentino-Alto Adige
    ('IT', '18'): '55',  # Umbria
    ('IT', '19'): '23',  # Valle d'Aosta
    ('IT', '20'): '34',  # Veneto

    # Canada - GeoNames admin1 to ISO province codes
    ('CA', '01'): 'AB',  # Alberta
    ('CA', '02'): 'BC',  # British Columbia
    ('CA', '03'): 'MB',  # Manitoba
    ('CA', '04'): 'NB',  # New Brunswick
    ('CA', '05'): 'NL',  # Newfoundland and Labrador
    ('CA', '07'): 'NS',  # Nova Scotia
    ('CA', '08'): 'ON',  # Ontario
    ('CA', '09'): 'PE',  # Prince Edward Island
    ('CA', '10'): 'QC',  # Quebec
    ('CA', '11'): 'SK',  # Saskatchewan
    ('CA', '12'): 'YT',  # Yukon
    ('CA', '13'): 'NT',  # Northwest Territories
    ('CA', '14'): 'NU',  # Nunavut

    # Russia - GeoNames admin1 to ISO codes (federal subjects)
    ('RU', '48'): 'MOW',  # Moscow (city)
    ('RU', '47'): 'MOS',  # Moscow Oblast
    ('RU', '66'): 'SPE',  # Saint Petersburg

    # Vietnam - GeoNames admin1 to ISO codes
    ('VN', '44'): 'HN',  # Hanoi
    ('VN', '20'): 'SG',  # Ho Chi Minh City

    # China - GeoNames admin1 to ISO codes
    ('CN', '02'): 'ZJ',  # Zhejiang
    ('CN', '04'): 'JS',  # Jiangsu
    ('CN', '06'): 'AH',  # Anhui
    ('CN', '07'): 'FJ',  # Fujian
    ('CN', '09'): 'HA',  # Henan
    ('CN', '11'): 'HB',  # Hubei
    ('CN', '12'): 'HN',  # Hunan
    ('CN', '13'): 'JX',  # Jiangxi
    ('CN', '19'): 'LN',  # Liaoning
    ('CN', '22'): 'BJ',  # Beijing
    ('CN', '23'): 'SH',  # Shanghai
    ('CN', '25'): 'SD',  # Shandong
    ('CN', '26'): 'SX',  # Shanxi
    ('CN', '28'): 'SN',  # Shaanxi
    ('CN', '30'): 'SH',  # Shanghai (alt)
    ('CN', '32'): 'TJ',  # Tianjin
    ('CN', '33'): 'XZ',  # Tibet

    # Mexico - GeoNames admin1 to ISO codes
    ('MX', '05'): 'COA',  # Coahuila
    ('MX', '07'): 'CHP',  # Chiapas
    ('MX', '08'): 'CHH',  # Chihuahua
    ('MX', '09'): 'CMX',  # Ciudad de México (CDMX)
    ('MX', '10'): 'DUR',  # Durango
    ('MX', '11'): 'GUA',  # Guanajuato
    ('MX', '14'): 'JAL',  # Jalisco
    ('MX', '15'): 'MEX',  # Estado de México
    ('MX', '16'): 'MIC',  # Michoacán
    ('MX', '17'): 'MOR',  # Morelos
    ('MX', '18'): 'NAY',  # Nayarit
    ('MX', '19'): 'NLE',  # Nuevo León
    ('MX', '20'): 'OAX',  # Oaxaca
    ('MX', '21'): 'PUE',  # Puebla
    ('MX', '22'): 'QUE',  # Querétaro
    ('MX', '23'): 'ROO',  # Quintana Roo
    ('MX', '24'): 'SLP',  # San Luis Potosí
    ('MX', '25'): 'SIN',  # Sinaloa
    ('MX', '26'): 'SON',  # Sonora
    ('MX', '27'): 'TAB',  # Tabasco
    ('MX', '28'): 'TAM',  # Tamaulipas
    ('MX', '29'): 'TLA',  # Tlaxcala
    ('MX', '30'): 'VER',  # Veracruz
    ('MX', '31'): 'YUC',  # Yucatán
    ('MX', '32'): 'ZAC',  # Zacatecas

    # Spain - GeoNames admin1 to ISO autonomous community codes
    # NOTE(review): several entries below do not seem to match GeoNames'
    # admin1CodesASCII.txt (which, as far as I recall, lists ES.29 as Madrid
    # and ES.51 as Andalusia) - verify against the dataset before a
    # non-dry run.
    ('ES', '29'): 'CT',  # Cataluña
    ('ES', '31'): 'AN',  # Andalucía
    ('ES', '32'): 'AR',  # Aragón
    ('ES', '33'): 'AS',  # Asturias
    ('ES', '34'): 'CL',  # Castilla y León
    ('ES', '37'): 'CM',  # Castilla-La Mancha
    ('ES', '39'): 'CN',  # Canarias
    ('ES', '51'): 'EX',  # Extremadura
    ('ES', '52'): 'GA',  # Galicia
    ('ES', '53'): 'IB',  # Islas Baleares
    ('ES', '54'): 'RI',  # La Rioja
    ('ES', '55'): 'MD',  # Madrid
    ('ES', '56'): 'CT',  # Cataluña (alt - Girona)
    ('ES', '57'): 'MC',  # Murcia
    ('ES', '58'): 'NC',  # Navarra
    ('ES', '59'): 'PV',  # País Vasco
    ('ES', '60'): 'VC',  # Comunidad Valenciana

    # Bulgaria - GeoNames admin1 (38-65) to ISO 3166-2 (01-28)
    ('BG', '38'): '01',  # Blagoevgrad
    ('BG', '39'): '02',  # Burgas
    ('BG', '40'): '08',  # Dobrich
    ('BG', '41'): '07',  # Gabrovo
    ('BG', '42'): '22',  # Sofia-Capital (Sofia City)
    ('BG', '43'): '26',  # Haskovo
    ('BG', '44'): '09',  # Kardzhali
    ('BG', '45'): '10',  # Kyustendil
    ('BG', '46'): '11',  # Lovech
    ('BG', '47'): '12',  # Montana
    ('BG', '48'): '13',  # Pazardzhik
    ('BG', '49'): '14',  # Pernik
    ('BG', '50'): '15',  # Pleven
    ('BG', '51'): '16',  # Plovdiv
    ('BG', '52'): '17',  # Razgrad
    ('BG', '53'): '18',  # Ruse
    ('BG', '54'): '27',  # Shumen
    ('BG', '55'): '19',  # Silistra
    ('BG', '56'): '20',  # Sliven
    ('BG', '57'): '21',  # Smolyan
    ('BG', '58'): '23',  # Sofia (Province)
    ('BG', '59'): '24',  # Stara Zagora
    ('BG', '60'): '25',  # Targovishte
    ('BG', '61'): '03',  # Varna
    ('BG', '62'): '04',  # Veliko Tarnovo
    ('BG', '63'): '05',  # Vidin
    ('BG', '64'): '06',  # Vratsa
    ('BG', '65'): '28',  # Yambol

    # Iran - GeoNames admin1 to ISO province codes
    ('IR', '04'): 'SB',  # Sistan and Baluchestan
    ('IR', '23'): '15',  # Lorestan
    ('IR', '26'): '23',  # Tehran
    ('IR', '33'): '01',  # East Azerbaijan
    ('IR', '39'): '25',  # Qom

    # Ukraine - GeoNames admin1 to ISO oblast codes
    ('UA', '12'): '30',  # Kyiv City
    ('UA', '30'): '32',  # Kyiv Oblast (alt code)

    # Thailand - GeoNames admin1 to ISO province codes
    ('TH', '10'): '10',  # Bangkok (already ISO)
    ('TH', '40'): '10',  # Bangkok (alt GeoNames)
    ('TH', '73'): '48',  # Nakhon Phanom

    # Portugal - GeoNames admin1 to ISO district codes
    ('PT', '11'): '09',  # Guarda
    ('PT', '14'): '11',  # Lisboa
    ('PT', '17'): '13',  # Porto

    # Norway - GeoNames admin1 to ISO county codes
    ('NO', '12'): '03',  # Oslo
    ('NO', '46'): '46',  # Vestland (already ISO)

    # Finland - GeoNames admin1 to ISO region codes
    ('FI', '01'): '18',  # Uusimaa
    ('FI', '02'): '19',  # Varsinais-Suomi (Southwest Finland)

    # Denmark - GeoNames admin1 to ISO region codes
    ('DK', '17'): '84',  # Capital Region (Hovedstaden)
    ('DK', '84'): '84',  # Already correct

    # Hungary - GeoNames admin1 to ISO codes
    ('HU', '05'): 'BU',  # Budapest

    # Indonesia - GeoNames admin1 to ISO province codes
    ('ID', '02'): 'BA',  # Bali
    ('ID', '04'): 'JK',  # Jakarta
    ('ID', '07'): 'JT',  # Central Java
    ('ID', '38'): 'SN',  # South Sulawesi

    # Israel - GeoNames admin1 to ISO district codes
    ('IL', '04'): 'HA',  # Haifa
    ('IL', '06'): 'JM',  # Jerusalem

    # Malaysia - GeoNames admin1 to ISO codes
    ('MY', '13'): '11',  # Terengganu
    ('MY', '14'): '14',  # Kuala Lumpur (already correct)

    # Sri Lanka - GeoNames admin1 to ISO province codes
    ('LK', '29'): '2',  # Central Province
    ('LK', '36'): '1',  # Western Province

    # Uzbekistan - GeoNames admin1 to ISO codes
    ('UZ', '13'): 'TK',  # Tashkent City
    ('UZ', '14'): 'TO',  # Tashkent Region

    # Liechtenstein - GeoNames admin1 to ISO codes
    ('LI', '03'): '03',  # Gamprin (already correct)
    ('LI', '11'): '11',  # Vaduz (already correct)

    # Armenia - GeoNames admin1 to ISO codes
    ('AM', '11'): 'ER',  # Yerevan

    # Bosnia and Herzegovina - GeoNames admin1 to ISO codes
    ('BA', '01'): 'BIH',  # Federation of B&H
    ('BA', '02'): 'SRP',  # Republika Srpska

    # Cuba - GeoNames admin1 to ISO codes
    ('CU', '02'): '03',  # Ciudad de La Habana (Havana)

    # Kazakhstan - GeoNames admin1 to ISO codes
    ('KZ', '02'): 'ALA',  # Almaty (city)

    # Latvia - GeoNames admin1 to ISO codes
    ('LV', '25'): 'RIX',  # Riga

    # Morocco - GeoNames admin1 to ISO codes
    ('MA', '03'): '03',  # Fès-Meknès
    ('MA', '04'): '04',  # Rabat-Salé-Kénitra

    # Namibia - GeoNames admin1 to ISO codes
    ('NA', '21'): 'KH',  # Khomas

    # Tunisia - GeoNames admin1 to ISO codes
    ('TN', '36'): '11',  # Tunis

    # Uruguay - GeoNames admin1 to ISO codes
    ('UY', '10'): 'MO',  # Montevideo

    # Venezuela - GeoNames admin1 to ISO codes
    ('VE', '25'): 'A',  # Distrito Capital

    # South Africa - GeoNames admin1 to ISO codes
    ('ZA', '06'): 'GP',  # Gauteng
    ('ZA', '11'): 'WC',  # Western Cape

    # Zimbabwe - GeoNames admin1 to ISO codes
    ('ZW', '10'): 'HA',  # Harare

    # Additional countries with one-off mappings
    ('AG', '04'): '04',  # Antigua and Barbuda - Saint John
    ('AI', '00'): 'XX',  # Anguilla - no subdivision (skip)
    ('AL', '50'): 'TR',  # Albania - Tirana
    ('AO', '20'): 'LUA',  # Angola - Luanda
    ('AW', '00'): 'XX',  # Aruba - no subdivision (skip)
    ('AZ', '09'): 'BA',  # Azerbaijan - Baku
    ('BB', '07'): '07',  # Barbados - Saint Michael
    ('BB', '08'): '08',  # Barbados - Christ Church
    ('BD', '81'): 'C',  # Bangladesh - Dhaka Division
    ('BH', '16'): '15',  # Bahrain - Manama (Capital)
    ('BJ', '16'): 'AQ',  # Benin - Atlantique (Porto-Novo)
    ('BO', '01'): 'H',  # Bolivia - Chuquisaca (Sucre)
    ('BS', '23'): 'NP',  # Bahamas - New Providence (Nassau)
    ('BZ', '02'): 'BZ',  # Belize - Belize District
    ('CO', '34'): 'DC',  # Colombia - Bogotá D.C.
    ('CV', '14'): 'PR',  # Cape Verde - Praia
    ('CW', '00'): 'XX',  # Curaçao - no subdivision (skip)
    ('DO', '01'): '01',  # Dominican Republic - Distrito Nacional
    ('DO', '19'): '19',  # Dominican Republic - Hermanas Mirabal
    ('DO', '34'): '01',  # Dominican Republic - Santo Domingo (alt)
    ('DZ', '01'): '16',  # Algeria - Algiers
    ('EC', '18'): 'P',  # Ecuador - Pichincha (Quito)
    ('EE', '37'): '37',  # Estonia - Harju (Tallinn) - numeric is ISO
    ('EE', '79'): '79',  # Estonia - Tartu - numeric is ISO
    ('ET', '44'): 'AA',  # Ethiopia - Addis Ababa
    ('FJ', '01'): 'C',  # Fiji - Central Division (Suva)
    ('GH', '01'): 'AA',  # Ghana - Greater Accra
    ('GT', '07'): 'GU',  # Guatemala - Guatemala Department
    ('GY', '12'): 'DE',  # Guyana - Demerara-Mahaica (Georgetown)
    ('HT', '11'): 'OU',  # Haiti - Ouest (Port-au-Prince)
    ('ID', '08'): 'JI',  # Indonesia - East Java
    ('IR', '09'): '13',  # Iran - Hamadan
    ('JM', '08'): '01',  # Jamaica - Kingston
    ('JM', '10'): '14',  # Jamaica - Saint Andrew (Spanish Town)
    ('JO', '16'): 'AM',  # Jordan - Amman
    ('KG', '01'): 'GB',  # Kyrgyzstan - Bishkek City
    ('KH', '22'): '12',  # Cambodia - Phnom Penh
    ('KP', '12'): '01',  # North Korea - Pyongyang
    ('LB', '05'): 'JL',  # Lebanon - Mount Lebanon (Keserwan-Jbeil)
    ('LB', '11'): 'BA',  # Lebanon - Beirut
    ('LC', '03'): '02',  # Saint Lucia - Castries
    ('LT', '65'): 'VL',  # Lithuania - Vilnius
    ('MG', '11'): 'T',  # Madagascar - Antananarivo
    ('ML', '6'): 'BKO',  # Mali - Bamako (alt code)
    ('ML', '08'): 'BKO',  # Mali - Bamako
    ('MM', '08'): '07',  # Myanmar - Mandalay
    ('MM', '18'): '03',  # Myanmar - Rakhine (Kyaukpyu)
    ('MN', '20'): '1',  # Mongolia - Ulaanbaatar
    ('MS', '03'): 'XX',  # Montserrat - no subdivision
    ('MU', '15'): 'PL',  # Mauritius - Port Louis (city)
    ('MU', '17'): 'PW',  # Mauritius - Plaines Wilhems (Curepipe)
    ('MU', '18'): 'PL',  # Mauritius - Port Louis District
    ('NI', '10'): 'MN',  # Nicaragua - Managua
    ('NO', '09'): '09',  # Norway - Nordland (numeric is valid)
    ('NP', '3'): 'BA',  # Nepal - Bagmati (Kathmandu)
    ('OM', '06'): 'MA',  # Oman - Muscat
    ('PA', '8'): '8',  # Panama - Panamá
    ('PH', '00'): '00',  # Philippines - NCR (National Capital Region)
    ('PH', '06'): 'RIZ',  # Philippines - Calabarzon (Rizal)
    ('PK', '08'): 'SD',  # Pakistan - Sindh (Karachi alt)
    ('PY', '22'): 'ASU',  # Paraguay - Asunción
    ('SN', '01'): 'DK',  # Senegal - Dakar
    ('SR', '16'): 'PM',  # Suriname - Paramaribo
    ('SV', '05'): 'SS',  # El Salvador - San Salvador
    ('SX', '00'): 'XX',  # Sint Maarten - no subdivision
    ('SY', '09'): 'HL',  # Syria - Aleppo
    ('SY', '11'): 'RD',  # Syria - Rif Dimashq (rural Damascus)
    ('TJ', '04'): 'DU',  # Tajikistan - Dushanbe
    ('TT', '05'): 'POS',  # Trinidad and Tobago - Port of Spain
    ('TZ', '23'): 'DS',  # Tanzania - Dar es Salaam
}

# Countries with valid numeric ISO codes - skip these if no mapping exists
# These countries use numeric codes in ISO 3166-2 and their existing codes may be correct
SKIP_COUNTRIES = {
    'JP',  # Japan: 01-47
    'CZ',  # Czech Republic: 10-80 (NUTS codes)
    'AT',  # Austria: 1-9
    'TR',  # Turkey: 01-81
    'KR',  # South Korea: 11-50
    'IT',  # Italy: 21-88
    'HR',  # Croatia: 01-21
    'SK',  # Slovakia: 01-08 (NUTS codes)
    'IS',  # Iceland: 0-8
    'SI',  # Slovenia: 001-213 (municipality codes)
    'LI',  # Liechtenstein: 01-11 (municipality codes)
    'MT',  # Malta: 01-68
    'BG',  # Bulgaria: 01-28 (38-65 have explicit mappings above)
    # Added after fixes - these countries now have valid numeric ISO codes
    'BH',  # Bahrain: 13-17
    'CU',  # Cuba: 01-16 (our mappings produced valid codes)
    'DZ',  # Algeria: 01-48 (wilayas)
    'FI',  # Finland: 01-21 (regions)
    'IR',  # Iran: 01-32 (provinces)
    'JM',  # Jamaica: 01-14 (parishes)
    'KH',  # Cambodia: 01-25 (provinces)
    'KP',  # North Korea: 01-14
    'LC',  # Saint Lucia: 01-11 (districts)
    'LK',  # Sri Lanka: 1-9 (provinces)
    'MM',  # Myanmar: 01-17 (states/regions)
    'MN',  # Mongolia: 1, 035-073 (aimags)
    'MY',  # Malaysia: 01-16 (states)
    'NO',  # Norway: 03, 11, 15, 18, 30, 34, 38, 42, 46, 50, 54 (fylker)
    'PT',  # Portugal: 01-20 (districts)
    'TH',  # Thailand: 10-96 (changwat)
    'TN',  # Tunisia: 11-83 (governorates)
    'UA',  # Ukraine: 05-77 (oblasts)
}


def _swap_prefix(text: str, old_prefix: str, new_prefix: str) -> str:
    """Return *text* with a leading *old_prefix* replaced by *new_prefix*.

    Only the prefix is touched; later occurrences of the same substring are
    left alone. *text* is returned unchanged when it does not start with
    *old_prefix*.
    """
    if not text.startswith(old_prefix):
        return text
    return new_prefix + text[len(old_prefix):]


def _already_corrected(history: list, code: str) -> bool:
    """Return True if *history* records an earlier correction that produced *code*.

    Some mappings chain (e.g. UA 12 -> 30 and UA 30 -> 32), so without this
    check a second run would "correct" an already-corrected file again.
    """
    suffix = f'to ISO 3166-2 ({code})'
    for entry in history:
        if not isinstance(entry, dict):
            continue
        reason = entry.get('reason')
        if (
            isinstance(reason, str)
            and reason.startswith(_HISTORY_REASON_PREFIX)
            and reason.endswith(suffix)
        ):
            return True
    return False


def update_file(filepath: Path, dry_run: bool = False) -> dict:
    """Update a single file with ISO region code.

    Args:
        filepath: Path of a custodian YAML file whose name starts with
            ``<CC>-<numeric region>-``.
        dry_run: When True, report what would change without writing or
            renaming anything (the file is still read).

    Returns:
        A dict with a ``status`` key:
        - ``skip`` / ``error``: plus a ``reason`` string;
        - ``collision``: the target filename already exists
          (``old_file``, ``new_file``, ``old_ghcid``, ``new_ghcid``);
        - ``would_update`` / ``updated``: the collision keys plus
          ``old_code`` and ``new_code``.

    Raises:
        OSError: if the file cannot be read, written or renamed.
    """
    filename = filepath.name

    # Extract country and region from filename
    match = NUMERIC_REGION_RE.match(filename)
    if not match:
        return {'status': 'skip', 'reason': 'no numeric code in filename'}
    country, old_code = match.groups()

    # Get ISO code mapping
    key = (country, old_code)
    new_code = MAPPINGS.get(key)
    if new_code is None:
        # No mapping: the code may already be a valid numeric ISO code.
        if country in SKIP_COUNTRIES:
            return {'status': 'skip', 'reason': f'{country} uses valid numeric ISO codes'}
        return {'status': 'error', 'reason': f'no mapping for {key}'}

    # Imported here so the checks above work without PyYAML installed.
    import yaml

    # Read file; a malformed file is reported instead of aborting the batch.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except yaml.YAMLError as exc:
        return {'status': 'error', 'reason': f"invalid YAML: {' '.join(str(exc).split())}"}

    if not data:
        return {'status': 'error', 'reason': 'empty file'}
    if not isinstance(data, dict):
        return {'status': 'error', 'reason': 'unexpected YAML structure'}

    # Get old GHCID
    ghcid_block = data.get('ghcid')
    old_ghcid = ghcid_block.get('ghcid_current', '') if isinstance(ghcid_block, dict) else ''
    if not old_ghcid or not isinstance(old_ghcid, str):
        return {'status': 'error', 'reason': 'no ghcid_current'}

    # A missing or null history is treated as empty.
    history = ghcid_block.get('ghcid_history') or []
    if not isinstance(history, list):
        return {'status': 'error', 'reason': 'ghcid_history is not a list'}

    # Idempotency guard: never re-map a code this script already produced.
    if _already_corrected(history, old_code):
        return {'status': 'skip', 'reason': f'already corrected to ISO 3166-2 ({old_code})'}

    # Create new GHCID (prefix only)
    old_prefix = f'{country}-{old_code}-'
    new_prefix = f'{country}-{new_code}-'
    new_ghcid = _swap_prefix(old_ghcid, old_prefix, new_prefix)
    if new_ghcid == old_ghcid:
        return {'status': 'skip', 'reason': 'GHCID unchanged'}

    # Check for collision
    new_filename = _swap_prefix(filename, old_prefix, new_prefix)
    new_filepath = filepath.parent / new_filename
    if new_filepath.exists() and new_filepath != filepath:
        return {
            'status': 'collision',
            'old_file': filename,
            'new_file': new_filename,
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
        }

    if dry_run:
        return {
            'status': 'would_update',
            'old_file': filename,
            'new_file': new_filename,
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'old_code': old_code,
            'new_code': new_code,
        }

    # Update YAML content
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    # Update ghcid.ghcid_current
    ghcid_block['ghcid_current'] = new_ghcid

    # Update region_code in location_resolution
    location_resolution = ghcid_block.get('location_resolution')
    if isinstance(location_resolution, dict):
        location_resolution['region_code'] = new_code

    # Update location.region_code if present
    location = data.get('location')
    if isinstance(location, dict) and 'region_code' in location:
        location['region_code'] = new_code

    # Update identifiers
    for ident in data.get('identifiers') or []:
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
            ident['identifier_value'] = new_ghcid

    # Add to ghcid_history; valid_from is carried over from the latest entry.
    previous = history[-1] if history else None
    history.append({
        'ghcid': old_ghcid,
        'valid_from': previous.get('valid_from') if isinstance(previous, dict) else None,
        'valid_to': timestamp,
        'reason': f'{_HISTORY_REASON_PREFIX} ({old_code}) to ISO 3166-2 ({new_code})',
    })
    ghcid_block['ghcid_history'] = history

    # Write updated content
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file
    if new_filepath != filepath:
        filepath.rename(new_filepath)

    return {
        'status': 'updated',
        'old_file': filename,
        'new_file': new_filename,
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_code': old_code,
        'new_code': new_code,
    }


def main(argv=None):
    """Command-line entry point: fix every matching file and print a report.

    Args:
        argv: Optional argument list (defaults to ``sys.argv[1:]``).
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix numeric region codes to ISO 3166-2')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--country', type=str, help='Process only specified country code')
    parser.add_argument('--dir', type=Path, default=DEFAULT_CUSTODIAN_DIR,
                        help='Directory with custodian YAML files (default: %(default)s)')
    args = parser.parse_args(argv)

    custodian_dir = args.dir
    if not custodian_dir.is_dir():
        # A missing directory would otherwise look like "nothing to do".
        parser.error(f'custodian directory not found: {custodian_dir}')

    country_prefix = f'{args.country.upper()}-' if args.country else None

    results = {
        'updated': [],
        'skipped': [],
        'collisions': [],
        'errors': [],
    }

    # sorted() materialises the listing first, so files renamed during the
    # run are not picked up a second time.
    for filepath in sorted(custodian_dir.glob('*.yaml')):
        filename = filepath.name

        # Check if filename matches numeric pattern
        if not NUMERIC_REGION_RE.match(filename):
            continue

        # Filter by country if specified
        if country_prefix and not filename.startswith(country_prefix):
            continue

        try:
            result = update_file(filepath, dry_run=args.dry_run)
        except OSError as exc:
            # Keep going: one unreadable file should not abort the batch.
            result = {'status': 'error', 'reason': str(exc)}

        status = result['status']
        if status in ('updated', 'would_update'):
            results['updated'].append(result)
        elif status == 'collision':
            results['collisions'].append(result)
        elif status == 'error':
            results['errors'].append({'file': filename, **result})
        else:
            results['skipped'].append({'file': filename, **result})

    # Print results
    print(f"\n{'=' * 60}")
    print(f"REGION CODE FIX RESULTS {'(DRY RUN)' if args.dry_run else ''}")
    print(f"{'=' * 60}")

    if results['updated']:
        print(f"\n✅ {'Would update' if args.dry_run else 'Updated'}: {len(results['updated'])} files")
        for r in results['updated']:
            print(f" {r['old_file']}")
            print(f" -> {r['new_file']}")
            print(f" Code: {r['old_code']} -> {r['new_code']}")

    if results['collisions']:
        print(f"\n⚠️  Collisions: {len(results['collisions'])} files")
        for r in results['collisions']:
            print(f" {r['old_file']} -> {r['new_file']} (EXISTS)")

    if results['errors']:
        print(f"\n❌ Errors: {len(results['errors'])} files")
        for r in results['errors']:
            print(f" {r['file']}: {r['reason']}")

    print(f"\n📊 Summary:")
    print(f" {'Would update' if args.dry_run else 'Updated'}: {len(results['updated'])}")
    print(f" Collisions: {len(results['collisions'])}")
    print(f" Errors: {len(results['errors'])}")
    print(f" Skipped: {len(results['skipped'])}")


if __name__ == '__main__':
    main()