glam/scripts/fix_remaining_numeric_codes.py
2025-12-10 13:01:13 +01:00

649 lines
24 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fix remaining numeric region codes across multiple countries.
Maps GeoNames admin1 codes to ISO 3166-2 codes.
Countries whose ISO 3166-2 codes are already numeric are skipped when no
explicit mapping exists. Examples (SKIP_COUNTRIES holds the full list):
- JP (Japan): 01-47 are ISO codes
- CZ (Czech Republic): 10-80 are ISO NUTS codes
- AT (Austria): 1-9 are ISO codes
- TR (Turkey): 01-81 are ISO codes (plate codes = ISO)
- KR (South Korea): 11-50 are ISO codes
"""
import os
import re
import yaml
from datetime import datetime, timezone
from pathlib import Path
# Mapping: (country, geonames_admin1) -> ISO 3166-2 code
# Keys are (ISO 3166-1 alpha-2 country, numeric code exactly as it appears in
# the file name); values are the subdivision part that replaces that numeric
# code in the GHCID and the file name. Entries whose value equals the key's
# code are no-ops: update_file() reports them as 'GHCID unchanged'.
# NOTE(review): this table is hand-maintained and is not validated by the
# script. The PL and SE sections in particular look inconsistent with the
# GeoNames admin1 listing (e.g. PL 72 is presumably Dolnoslaskie, SE 25
# presumably Vastmanland) - verify against admin1CodesASCII.txt before relying
# on them.
MAPPINGS = {
# Brazil - GeoNames admin1 to ISO state codes
('BR', '01'): 'AC', # Acre
('BR', '02'): 'AL', # Alagoas
('BR', '03'): 'AP', # Amapá
('BR', '04'): 'AM', # Amazonas
('BR', '05'): 'BA', # Bahia
('BR', '06'): 'CE', # Ceará
('BR', '07'): 'DF', # Federal District
('BR', '08'): 'ES', # Espírito Santo
('BR', '11'): 'MS', # Mato Grosso do Sul
('BR', '13'): 'MA', # Maranhão
('BR', '14'): 'MT', # Mato Grosso
('BR', '15'): 'MG', # Minas Gerais
('BR', '16'): 'PA', # Pará
('BR', '17'): 'PB', # Paraíba
('BR', '18'): 'PR', # Paraná
('BR', '20'): 'PI', # Piauí
('BR', '21'): 'RJ', # Rio de Janeiro
('BR', '22'): 'RN', # Rio Grande do Norte
('BR', '23'): 'RS', # Rio Grande do Sul
('BR', '24'): 'RO', # Rondônia
('BR', '25'): 'RR', # Roraima
('BR', '26'): 'SC', # Santa Catarina
('BR', '27'): 'SP', # São Paulo
('BR', '28'): 'SE', # Sergipe
('BR', '29'): 'GO', # Goiás
('BR', '30'): 'PE', # Pernambuco
('BR', '31'): 'TO', # Tocantins
# France - GeoNames admin1 to ISO region codes
('FR', '11'): 'IDF', # Île-de-France
('FR', '24'): 'CVL', # Centre-Val de Loire
('FR', '27'): 'BFC', # Bourgogne-Franche-Comté
('FR', '28'): 'NOR', # Normandie
('FR', '32'): 'HDF', # Hauts-de-France
('FR', '44'): 'GES', # Grand Est
('FR', '52'): 'PDL', # Pays de la Loire
('FR', '53'): 'BRE', # Bretagne
('FR', '75'): 'NAQ', # Nouvelle-Aquitaine
('FR', '76'): 'OCC', # Occitanie
('FR', '84'): 'ARA', # Auvergne-Rhône-Alpes
('FR', '93'): 'PAC', # Provence-Alpes-Côte d'Azur
('FR', '94'): 'COR', # Corse
# Department codes that should map to region
('FR', '92'): 'IDF', # Hauts-de-Seine -> Île-de-France
# Poland - GeoNames admin1 to ISO voivodeship codes
('PL', '72'): 'SK', # Świętokrzyskie
('PL', '73'): 'KP', # Kujawsko-Pomorskie
('PL', '74'): 'LU', # Lubelskie
('PL', '75'): 'LB', # Lubuskie
('PL', '76'): 'LD', # Łódzkie
('PL', '77'): 'MA', # Małopolskie
('PL', '78'): 'MZ', # Mazowieckie
('PL', '79'): 'OP', # Opolskie
('PL', '80'): 'PK', # Podkarpackie
('PL', '81'): 'PD', # Podlaskie
('PL', '82'): 'PM', # Pomorskie
('PL', '83'): 'SL', # Śląskie
('PL', '84'): 'WN', # Warmińsko-Mazurskie
('PL', '85'): 'WP', # Wielkopolskie
('PL', '86'): 'ZP', # Zachodniopomorskie
('PL', '87'): 'DS', # Dolnośląskie
# Sweden - GeoNames admin1 to ISO county codes
('SE', '02'): 'AC', # Västerbotten
('SE', '03'): 'Y', # Västernorrland
('SE', '04'): 'Z', # Jämtland
('SE', '05'): 'BD', # Norrbotten
('SE', '06'): 'X', # Gävleborg
('SE', '07'): 'W', # Dalarna
('SE', '08'): 'S', # Värmland
('SE', '09'): 'T', # Örebro
('SE', '10'): 'U', # Västmanland
('SE', '12'): 'C', # Uppsala
('SE', '13'): 'D', # Södermanland
('SE', '14'): 'AB', # Stockholm
('SE', '15'): 'I', # Gotland
('SE', '16'): 'E', # Östergötland
('SE', '17'): 'F', # Jönköping
('SE', '18'): 'H', # Kalmar
('SE', '19'): 'G', # Kronoberg
('SE', '21'): 'K', # Blekinge
('SE', '22'): 'M', # Skåne
('SE', '23'): 'N', # Halland
('SE', '24'): 'O', # Västra Götaland
('SE', '25'): 'AB', # Stockholm (alt code)
('SE', '26'): 'AB', # Stockholm
# Italy - GeoNames admin1 to ISO region codes
('IT', '01'): '65', # Abruzzo
('IT', '02'): '77', # Basilicata
('IT', '03'): '78', # Calabria
('IT', '04'): '72', # Campania
('IT', '05'): '45', # Emilia-Romagna
('IT', '06'): '36', # Friuli-Venezia Giulia
('IT', '07'): '62', # Lazio
('IT', '08'): '42', # Liguria
('IT', '09'): '25', # Lombardia
('IT', '10'): '57', # Marche
('IT', '11'): '67', # Molise
('IT', '12'): '21', # Piemonte
('IT', '13'): '75', # Puglia
('IT', '14'): '88', # Sardegna
('IT', '15'): '82', # Sicilia
('IT', '16'): '52', # Toscana
('IT', '17'): '32', # Trentino-Alto Adige
('IT', '18'): '55', # Umbria
('IT', '19'): '23', # Valle d'Aosta
('IT', '20'): '34', # Veneto
# Canada - GeoNames admin1 to ISO province codes
('CA', '01'): 'AB', # Alberta
('CA', '02'): 'BC', # British Columbia
('CA', '03'): 'MB', # Manitoba
('CA', '04'): 'NB', # New Brunswick
('CA', '05'): 'NL', # Newfoundland and Labrador
('CA', '07'): 'NS', # Nova Scotia
('CA', '08'): 'ON', # Ontario
('CA', '09'): 'PE', # Prince Edward Island
('CA', '10'): 'QC', # Quebec
('CA', '11'): 'SK', # Saskatchewan
('CA', '12'): 'YT', # Yukon
('CA', '13'): 'NT', # Northwest Territories
('CA', '14'): 'NU', # Nunavut
# Russia - GeoNames admin1 to ISO codes (federal subjects)
('RU', '48'): 'MOW', # Moscow (city)
('RU', '47'): 'MOS', # Moscow Oblast
('RU', '66'): 'SPE', # Saint Petersburg
# Vietnam - GeoNames admin1 to ISO codes
('VN', '44'): 'HN', # Hanoi
('VN', '20'): 'SG', # Ho Chi Minh City
# China - GeoNames admin1 to ISO codes
('CN', '02'): 'ZJ', # Zhejiang
('CN', '04'): 'JS', # Jiangsu
('CN', '06'): 'AH', # Anhui
('CN', '07'): 'FJ', # Fujian
('CN', '09'): 'HA', # Henan
('CN', '11'): 'HB', # Hubei
('CN', '12'): 'HN', # Hunan
('CN', '13'): 'JX', # Jiangxi
('CN', '19'): 'LN', # Liaoning
('CN', '22'): 'BJ', # Beijing
('CN', '23'): 'SH', # Shanghai
('CN', '25'): 'SD', # Shandong
('CN', '26'): 'SX', # Shanxi
('CN', '28'): 'SN', # Shaanxi
('CN', '30'): 'SH', # Shanghai (alt)
('CN', '32'): 'TJ', # Tianjin
('CN', '33'): 'XZ', # Tibet
# Mexico - GeoNames admin1 to ISO codes
('MX', '05'): 'COA', # Coahuila
('MX', '07'): 'CHP', # Chiapas
('MX', '08'): 'CHH', # Chihuahua
('MX', '09'): 'CMX', # Ciudad de México (CDMX)
('MX', '10'): 'DUR', # Durango
('MX', '11'): 'GUA', # Guanajuato
('MX', '14'): 'JAL', # Jalisco
('MX', '15'): 'MEX', # Estado de México
('MX', '16'): 'MIC', # Michoacán
('MX', '17'): 'MOR', # Morelos
('MX', '18'): 'NAY', # Nayarit
('MX', '19'): 'NLE', # Nuevo León
('MX', '20'): 'OAX', # Oaxaca
('MX', '21'): 'PUE', # Puebla
('MX', '22'): 'QUE', # Querétaro
('MX', '23'): 'ROO', # Quintana Roo
('MX', '24'): 'SLP', # San Luis Potosí
('MX', '25'): 'SIN', # Sinaloa
('MX', '26'): 'SON', # Sonora
('MX', '27'): 'TAB', # Tabasco
('MX', '28'): 'TAM', # Tamaulipas
('MX', '29'): 'TLA', # Tlaxcala
('MX', '30'): 'VER', # Veracruz
('MX', '31'): 'YUC', # Yucatán
('MX', '32'): 'ZAC', # Zacatecas
# Spain - GeoNames admin1 to ISO autonomous community codes
('ES', '29'): 'CT', # Cataluña
('ES', '31'): 'AN', # Andalucía
('ES', '32'): 'AR', # Aragón
('ES', '33'): 'AS', # Asturias
('ES', '34'): 'CL', # Castilla y León
('ES', '37'): 'CM', # Castilla-La Mancha
('ES', '39'): 'CN', # Canarias
('ES', '51'): 'EX', # Extremadura
('ES', '52'): 'GA', # Galicia
('ES', '53'): 'IB', # Islas Baleares
('ES', '54'): 'RI', # La Rioja
('ES', '55'): 'MD', # Madrid
('ES', '56'): 'CT', # Cataluña (alt - Girona)
('ES', '57'): 'MC', # Murcia
('ES', '58'): 'NC', # Navarra
('ES', '59'): 'PV', # País Vasco
('ES', '60'): 'VC', # Comunidad Valenciana
# Bulgaria - GeoNames admin1 (38-65) to ISO 3166-2 (01-28)
('BG', '38'): '01', # Blagoevgrad
('BG', '39'): '02', # Burgas
('BG', '40'): '08', # Dobrich
('BG', '41'): '07', # Gabrovo
('BG', '42'): '22', # Sofia-Capital (Sofia City)
('BG', '43'): '26', # Haskovo
('BG', '44'): '09', # Kardzhali
('BG', '45'): '10', # Kyustendil
('BG', '46'): '11', # Lovech
('BG', '47'): '12', # Montana
('BG', '48'): '13', # Pazardzhik
('BG', '49'): '14', # Pernik
('BG', '50'): '15', # Pleven
('BG', '51'): '16', # Plovdiv
('BG', '52'): '17', # Razgrad
('BG', '53'): '18', # Ruse
('BG', '54'): '27', # Shumen
('BG', '55'): '19', # Silistra
('BG', '56'): '20', # Sliven
('BG', '57'): '21', # Smolyan
('BG', '58'): '23', # Sofia (Province)
('BG', '59'): '24', # Stara Zagora
('BG', '60'): '25', # Targovishte
('BG', '61'): '03', # Varna
('BG', '62'): '04', # Veliko Tarnovo
('BG', '63'): '05', # Vidin
('BG', '64'): '06', # Vratsa
('BG', '65'): '28', # Yambol
# Iran - GeoNames admin1 to ISO province codes
# NOTE: '23' is both a source code and a target code here, so a corrected
# IR-23 file is indistinguishable by name from an uncorrected one.
('IR', '04'): 'SB', # Sistan and Baluchestan
('IR', '23'): '15', # Lorestan
('IR', '26'): '23', # Tehran
('IR', '33'): '01', # East Azerbaijan
('IR', '39'): '25', # Qom
# Ukraine - GeoNames admin1 to ISO oblast codes
# NOTE: '30' is both a source code and a target code here (same caveat as IR).
('UA', '12'): '30', # Kyiv City
('UA', '30'): '32', # Kyiv Oblast (alt code)
# Thailand - GeoNames admin1 to ISO province codes
('TH', '10'): '10', # Bangkok (already ISO)
('TH', '40'): '10', # Bangkok (alt GeoNames)
('TH', '73'): '48', # Nakhon Phanom
# Portugal - GeoNames admin1 to ISO district codes
# NOTE: '11' is both a source code and a target code here (same caveat as IR).
('PT', '11'): '09', # Guarda
('PT', '14'): '11', # Lisboa
('PT', '17'): '13', # Porto
# Norway - GeoNames admin1 to ISO county codes
('NO', '12'): '03', # Oslo
('NO', '46'): '46', # Vestland (already ISO)
# Finland - GeoNames admin1 to ISO region codes
('FI', '01'): '18', # Uusimaa
('FI', '02'): '19', # Varsinais-Suomi (Southwest Finland)
# Denmark - GeoNames admin1 to ISO region codes
('DK', '17'): '84', # Capital Region (Hovedstaden)
('DK', '84'): '84', # Already correct
# Hungary - GeoNames admin1 to ISO codes
('HU', '05'): 'BU', # Budapest
# Indonesia - GeoNames admin1 to ISO province codes
('ID', '02'): 'BA', # Bali
('ID', '04'): 'JK', # Jakarta
('ID', '07'): 'JT', # Central Java
('ID', '38'): 'SN', # South Sulawesi
# Israel - GeoNames admin1 to ISO district codes
('IL', '04'): 'HA', # Haifa
('IL', '06'): 'JM', # Jerusalem
# Malaysia - GeoNames admin1 to ISO codes
('MY', '13'): '11', # Terengganu
('MY', '14'): '14', # Kuala Lumpur (already correct)
# Sri Lanka - GeoNames admin1 to ISO province codes
('LK', '29'): '2', # Central Province
('LK', '36'): '1', # Western Province
# Uzbekistan - GeoNames admin1 to ISO codes
('UZ', '13'): 'TK', # Tashkent City
('UZ', '14'): 'TO', # Tashkent Region
# Liechtenstein - GeoNames admin1 to ISO codes
('LI', '03'): '03', # Gamprin (already correct)
('LI', '11'): '11', # Vaduz (already correct)
# Armenia - GeoNames admin1 to ISO codes
('AM', '11'): 'ER', # Yerevan
# Bosnia and Herzegovina - GeoNames admin1 to ISO codes
('BA', '01'): 'BIH', # Federation of B&H
('BA', '02'): 'SRP', # Republika Srpska
# Cuba - GeoNames admin1 to ISO codes
('CU', '02'): '03', # Ciudad de La Habana (Havana)
# Kazakhstan - GeoNames admin1 to ISO codes
('KZ', '02'): 'ALA', # Almaty (city)
# Latvia - GeoNames admin1 to ISO codes
('LV', '25'): 'RIX', # Riga
# Morocco - GeoNames admin1 to ISO codes
('MA', '03'): '03', # Fès-Meknès
('MA', '04'): '04', # Rabat-Salé-Kénitra
# Namibia - GeoNames admin1 to ISO codes
('NA', '21'): 'KH', # Khomas
# Tunisia - GeoNames admin1 to ISO codes
('TN', '36'): '11', # Tunis
# Uruguay - GeoNames admin1 to ISO codes
('UY', '10'): 'MO', # Montevideo
# Venezuela - GeoNames admin1 to ISO codes
('VE', '25'): 'A', # Distrito Capital
# South Africa - GeoNames admin1 to ISO codes
('ZA', '06'): 'GP', # Gauteng
('ZA', '11'): 'WC', # Western Cape
# Zimbabwe - GeoNames admin1 to ISO codes
('ZW', '10'): 'HA', # Harare
# Additional countries with one-off mappings
# NOTE: entries commented "(skip)" still map to the placeholder 'XX';
# update_file() applies them like any other mapping.
('AG', '04'): '04', # Antigua and Barbuda - Saint John
('AI', '00'): 'XX', # Anguilla - no subdivision (skip)
('AL', '50'): 'TR', # Albania - Tirana
('AO', '20'): 'LUA', # Angola - Luanda
('AW', '00'): 'XX', # Aruba - no subdivision (skip)
('AZ', '09'): 'BA', # Azerbaijan - Baku
('BB', '07'): '07', # Barbados - Saint Michael
('BB', '08'): '08', # Barbados - Christ Church
('BD', '81'): 'C', # Bangladesh - Dhaka Division
('BH', '16'): '15', # Bahrain - Manama (Capital)
('BJ', '16'): 'AQ', # Benin - Atlantique (Porto-Novo)
('BO', '01'): 'H', # Bolivia - Chuquisaca (Sucre)
('BS', '23'): 'NP', # Bahamas - New Providence (Nassau)
('BZ', '02'): 'BZ', # Belize - Belize District
('CO', '34'): 'DC', # Colombia - Bogotá D.C.
('CV', '14'): 'PR', # Cape Verde - Praia
('CW', '00'): 'XX', # Curaçao - no subdivision (skip)
('DO', '01'): '01', # Dominican Republic - Distrito Nacional
('DO', '19'): '19', # Dominican Republic - Hermanas Mirabal
('DO', '34'): '01', # Dominican Republic - Santo Domingo (alt)
('DZ', '01'): '16', # Algeria - Algiers
('EC', '18'): 'P', # Ecuador - Pichincha (Quito)
('EE', '37'): '37', # Estonia - Harju (Tallinn) - numeric is ISO
('EE', '79'): '79', # Estonia - Tartu - numeric is ISO
('ET', '44'): 'AA', # Ethiopia - Addis Ababa
('FJ', '01'): 'C', # Fiji - Central Division (Suva)
('GH', '01'): 'AA', # Ghana - Greater Accra
('GT', '07'): 'GU', # Guatemala - Guatemala Department
('GY', '12'): 'DE', # Guyana - Demerara-Mahaica (Georgetown)
('HT', '11'): 'OU', # Haiti - Ouest (Port-au-Prince)
('ID', '08'): 'JI', # Indonesia - East Java
('IR', '09'): '13', # Iran - Hamadan
('JM', '08'): '01', # Jamaica - Kingston
('JM', '10'): '14', # Jamaica - Saint Andrew (Spanish Town); NOTE(review): ISO JM-14 is presumably Saint Catherine - confirm
('JO', '16'): 'AM', # Jordan - Amman
('KG', '01'): 'GB', # Kyrgyzstan - Bishkek City
('KH', '22'): '12', # Cambodia - Phnom Penh
('KP', '12'): '01', # North Korea - Pyongyang
('LB', '05'): 'JL', # Lebanon - Mount Lebanon (Keserwan-Jbeil)
('LB', '11'): 'BA', # Lebanon - Beirut
('LC', '03'): '02', # Saint Lucia - Castries
('LT', '65'): 'VL', # Lithuania - Vilnius
('MG', '11'): 'T', # Madagascar - Antananarivo
('ML', '6'): 'BKO', # Mali - Bamako (alt code)
('ML', '08'): 'BKO', # Mali - Bamako
('MM', '08'): '07', # Myanmar - Mandalay
('MM', '18'): '03', # Myanmar - Rakhine (Kyaukpyu)
('MN', '20'): '1', # Mongolia - Ulaanbaatar
('MS', '03'): 'XX', # Montserrat - no subdivision
('MU', '15'): 'PL', # Mauritius - Port Louis (city)
('MU', '17'): 'PW', # Mauritius - Plaines Wilhems (Curepipe)
('MU', '18'): 'PL', # Mauritius - Port Louis District
('NI', '10'): 'MN', # Nicaragua - Managua
('NO', '09'): '09', # Norway - Nordland (numeric is valid)
('NP', '3'): 'BA', # Nepal - Bagmati (Kathmandu)
('OM', '06'): 'MA', # Oman - Muscat
('PA', '8'): '8', # Panama - Panamá
('PH', '00'): '00', # Philippines - NCR (National Capital Region)
('PH', '06'): 'RIZ', # Philippines - Calabarzon (Rizal)
('PK', '08'): 'SD', # Pakistan - Sindh (Karachi alt)
('PY', '22'): 'ASU', # Paraguay - Asunción
('SN', '01'): 'DK', # Senegal - Dakar
('SR', '16'): 'PM', # Suriname - Paramaribo
('SV', '05'): 'SS', # El Salvador - San Salvador
('SX', '00'): 'XX', # Sint Maarten - no subdivision
('SY', '09'): 'HL', # Syria - Aleppo
('SY', '11'): 'RD', # Syria - Rif Dimashq (rural Damascus)
('TJ', '04'): 'DU', # Tajikistan - Dushanbe
('TT', '05'): 'POS', # Trinidad and Tobago - Port of Spain
('TZ', '23'): 'DS', # Tanzania - Dar es Salaam
}
# Countries with valid numeric ISO codes - skip these if no mapping exists
# These countries use numeric codes in ISO 3166-2 and their existing codes may be correct
# update_file() consults this set only after a MAPPINGS lookup misses: a file
# from one of these countries is then reported as 'skip' instead of 'error'.
# An explicit MAPPINGS entry always takes precedence over membership here
# (e.g. IT and BG appear in both).
SKIP_COUNTRIES = {
'JP', # Japan: 01-47
'CZ', # Czech Republic: 10-80 (NUTS codes)
'AT', # Austria: 1-9
'TR', # Turkey: 01-81
'KR', # South Korea: 11-50
'IT', # Italy: 21-88
'HR', # Croatia: 01-21
'SK', # Slovakia: 01-08 (NUTS codes)
'IS', # Iceland: 0-8
'SI', # Slovenia: 001-213 (municipality codes)
'LI', # Liechtenstein: 01-11 (municipality codes)
'MT', # Malta: 01-68
'BG', # Bulgaria: 01-28 (38-65 have explicit mappings above)
# Added after fixes - these countries now have valid numeric ISO codes
'BH', # Bahrain: 13-17
'CU', # Cuba: 01-16 (our mappings produced valid codes)
'DZ', # Algeria: 01-48 (wilayas)
'FI', # Finland: 01-21 (regions)
'IR', # Iran: 01-32 (provinces)
'JM', # Jamaica: 01-14 (parishes)
'KH', # Cambodia: 01-25 (provinces)
'KP', # North Korea: 01-14
'LC', # Saint Lucia: 01-11 (districts)
'LK', # Sri Lanka: 1-9 (provinces)
'MM', # Myanmar: 01-17 (states/regions)
'MN', # Mongolia: 1, 035-073 (aimags)
'MY', # Malaysia: 01-16 (states)
'NO', # Norway: 03, 11, 15, 18, 30, 34, 38, 42, 46, 50, 54 (fylker)
'PT', # Portugal: 01-20 (districts)
'TH', # Thailand: 10-96 (changwat)
'TN', # Tunisia: 11-83 (governorates)
'UA', # Ukraine: 05-77 (oblasts)
}
def update_file(filepath: Path, dry_run: bool = False) -> dict:
    """Rewrite one custodian YAML file so its region code is ISO 3166-2.

    The country and numeric region code are parsed from the file name
    (``CC-<digits>-...``) and looked up in ``MAPPINGS``. When a mapping
    exists, the GHCID, the region-code fields, the GHCID identifier and the
    GHCID history inside the YAML are updated and the file is renamed to
    carry the new code.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When true, nothing is written or renamed; the planned change
            is reported with status ``'would_update'``.

    Returns:
        A dict whose ``'status'`` is one of ``'skip'``, ``'error'``,
        ``'collision'``, ``'would_update'`` or ``'updated'``. ``'skip'`` and
        ``'error'`` results carry a ``'reason'``; the others carry the old and
        new file names and GHCIDs (plus old/new codes, except collisions).
    """
    filename = filepath.name

    # Extract country and region from filename
    match = re.match(r'^([A-Z]{2})-(\d+)-', filename)
    if not match:
        return {'status': 'skip', 'reason': 'no numeric code in filename'}
    country, old_code = match.groups()

    # An explicit mapping wins; without one, countries whose ISO codes are
    # already numeric are skipped rather than reported as errors.
    key = (country, old_code)
    if key not in MAPPINGS:
        if country in SKIP_COUNTRIES:
            return {'status': 'skip', 'reason': f'{country} uses valid numeric ISO codes'}
        return {'status': 'error', 'reason': f'no mapping for {key}'}
    new_code = MAPPINGS[key]

    # Read file
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f.read())
    if not data:
        return {'status': 'error', 'reason': 'empty file'}
    if not isinstance(data, dict):
        return {'status': 'error', 'reason': 'unexpected YAML structure'}

    # Get old GHCID; a missing or null 'ghcid' section is treated as empty
    # instead of raising AttributeError.
    ghcid = data.get('ghcid')
    if not isinstance(ghcid, dict):
        ghcid = {}
    old_ghcid = ghcid.get('ghcid_current', '')
    if not old_ghcid or not isinstance(old_ghcid, str):
        return {'status': 'error', 'reason': 'no ghcid_current'}

    history = ghcid.get('ghcid_history') or []
    if not isinstance(history, list):
        return {'status': 'error', 'reason': 'ghcid_history is not a list'}

    # Idempotency guard: some target codes are also source codes (e.g. IR
    # 26 -> 23 and 23 -> 15). If the history shows this file was already
    # corrected *into* its current code, a second run must not re-map it.
    corrected_into_current = f'to ISO 3166-2 ({old_code})'
    for entry in history:
        reason = entry.get('reason') if isinstance(entry, dict) else None
        if (isinstance(reason, str)
                and reason.startswith('Region code corrected from GeoNames admin1')
                and reason.endswith(corrected_into_current)):
            return {
                'status': 'skip',
                'reason': f'{country}-{old_code} already corrected to ISO 3166-2'
            }

    # Create new GHCID by swapping only the leading "CC-NN-" prefix
    old_prefix = f'{country}-{old_code}-'
    new_prefix = f'{country}-{new_code}-'
    if old_ghcid.startswith(old_prefix):
        new_ghcid = new_prefix + old_ghcid[len(old_prefix):]
    else:
        new_ghcid = old_ghcid
    if new_ghcid == old_ghcid:
        return {'status': 'skip', 'reason': 'GHCID unchanged'}

    # The regex above guarantees the file name starts with old_prefix, so
    # slicing rewrites the prefix only (str.replace would also rewrite any
    # later occurrence of the same text in the name).
    new_filename = new_prefix + filename[len(old_prefix):]
    new_filepath = filepath.parent / new_filename

    # Check for collision
    if new_filepath.exists() and new_filepath != filepath:
        return {
            'status': 'collision',
            'old_file': filename,
            'new_file': new_filename,
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid
        }

    if dry_run:
        return {
            'status': 'would_update',
            'old_file': filename,
            'new_file': new_filename,
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'old_code': old_code,
            'new_code': new_code
        }

    # Update YAML content
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    # Update ghcid.ghcid_current
    ghcid['ghcid_current'] = new_ghcid

    # Update region_code in location_resolution
    location_resolution = ghcid.get('location_resolution')
    if isinstance(location_resolution, dict):
        location_resolution['region_code'] = new_code

    # Update location.region_code if present
    location = data.get('location')
    if isinstance(location, dict) and 'region_code' in location:
        location['region_code'] = new_code

    # Update identifiers
    for ident in data.get('identifiers') or []:
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
            ident['identifier_value'] = new_ghcid

    # Add to ghcid_history; valid_from is carried over from the most recent
    # existing entry (None when there is no usable previous entry).
    previous = history[-1] if history else None
    history.append({
        'ghcid': old_ghcid,
        'valid_from': previous.get('valid_from') if isinstance(previous, dict) else None,
        'valid_to': timestamp,
        'reason': f'Region code corrected from GeoNames admin1 ({old_code}) to ISO 3166-2 ({new_code})'
    })
    ghcid['ghcid_history'] = history

    # Serialize before opening the file for writing so that a dump failure
    # cannot leave a truncated YAML file behind.
    serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    # Rename file
    if new_filepath != filepath:
        filepath.rename(new_filepath)

    return {
        'status': 'updated',
        'old_file': filename,
        'new_file': new_filename,
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_code': old_code,
        'new_code': new_code
    }
def main():
    """Command-line entry point: fix numeric region codes in custodian files.

    Scans the custodian directory for ``*.yaml`` files whose names start with
    ``CC-<digits>-``, runs ``update_file`` on each one, and prints a report of
    updated files, collisions, errors and a summary count.

    Options:
        --dry-run        Report what would change without writing anything.
        --country        Only process files of this country code
                         (case-insensitive).
        --custodian-dir  Directory to scan; defaults to the original
                         hard-coded location.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix numeric region codes to ISO 3166-2')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--country', type=str, help='Process only specified country code')
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian'),
        help='Directory containing custodian YAML files (default: %(default)s)',
    )
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    # Fail loudly instead of printing an all-zero report for a missing path.
    if not custodian_dir.is_dir():
        parser.error(f'custodian directory not found: {custodian_dir}')

    # File names use upper-case country codes, so normalise the filter.
    country_prefix = f'{args.country.upper()}-' if args.country else None

    # Find files with numeric region codes
    pattern = re.compile(r'^[A-Z]{2}-\d+-')

    results = {
        'updated': [],
        'skipped': [],
        'collisions': [],
        'errors': []
    }

    # sorted() materialises the listing first, so renames performed by
    # update_file() cannot feed renamed files back into this loop.
    for filepath in sorted(custodian_dir.glob('*.yaml')):
        filename = filepath.name

        # Check if filename matches numeric pattern
        if not pattern.match(filename):
            continue

        # Filter by country if specified
        if country_prefix and not filename.startswith(country_prefix):
            continue

        result = update_file(filepath, dry_run=args.dry_run)
        status = result['status']
        if status in ('updated', 'would_update'):
            results['updated'].append(result)
        elif status == 'collision':
            results['collisions'].append(result)
        elif status == 'error':
            results['errors'].append({'file': filename, **result})
        else:
            results['skipped'].append({'file': filename, **result})

    # Print results
    print(f"\n{'=' * 60}")
    print(f"REGION CODE FIX RESULTS {'(DRY RUN)' if args.dry_run else ''}")
    print(f"{'=' * 60}")

    if results['updated']:
        print(f"\n{'Would update' if args.dry_run else 'Updated'}: {len(results['updated'])} files")
        for r in results['updated']:
            print(f" {r['old_file']}")
            print(f" -> {r['new_file']}")
            print(f" Code: {r['old_code']} -> {r['new_code']}")

    if results['collisions']:
        print(f"\n⚠️ Collisions: {len(results['collisions'])} files")
        for r in results['collisions']:
            print(f" {r['old_file']} -> {r['new_file']} (EXISTS)")

    if results['errors']:
        print(f"\n❌ Errors: {len(results['errors'])} files")
        for r in results['errors']:
            print(f" {r['file']}: {r['reason']}")

    print(f"\n📊 Summary:")
    print(f" {'Would update' if args.dry_run else 'Updated'}: {len(results['updated'])}")
    print(f" Collisions: {len(results['collisions'])}")
    print(f" Errors: {len(results['errors'])}")
    print(f" Skipped: {len(results['skipped'])}")


if __name__ == '__main__':
    main()