#!/usr/bin/env python3
"""
Fix remaining numeric region codes across multiple countries.

Maps GeoNames admin1 codes to ISO 3166-2 codes.

Countries whose numeric codes are already VALID ISO codes are skipped when no
explicit mapping exists for the code (see SKIP_COUNTRIES for the full list),
for example:
- JP (Japan): 01-47 are ISO codes
- CZ (Czech Republic): 10-80 are ISO NUTS codes
- AT (Austria): 1-9 are ISO codes
- TR (Turkey): 01-81 are ISO codes (plate codes = ISO)
- KR (South Korea): 11-50 are ISO codes
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Per-country lookup tables: country code -> {GeoNames admin1 code -> ISO 3166-2 code}.
# The trailing comment on each entry names the subdivision the entry is meant for.
# NOTE(review): the codes were carried over unchanged; verify doubtful rows
# (e.g. SE, MX) against the GeoNames admin1 code list before relying on them.
_ADMIN1_TO_ISO_BY_COUNTRY = {
    # Brazil - states
    'BR': {
        '01': 'AC',  # Acre
        '02': 'AL',  # Alagoas
        '03': 'AP',  # Amapá
        '04': 'AM',  # Amazonas
        '05': 'BA',  # Bahia
        '06': 'CE',  # Ceará
        '07': 'DF',  # Federal District
        '08': 'ES',  # Espírito Santo
        '11': 'MS',  # Mato Grosso do Sul
        '13': 'MA',  # Maranhão
        '14': 'MT',  # Mato Grosso
        '15': 'MG',  # Minas Gerais
        '16': 'PA',  # Pará
        '17': 'PB',  # Paraíba
        '18': 'PR',  # Paraná
        '20': 'PI',  # Piauí
        '21': 'RJ',  # Rio de Janeiro
        '22': 'RN',  # Rio Grande do Norte
        '23': 'RS',  # Rio Grande do Sul
        '24': 'RO',  # Rondônia
        '25': 'RR',  # Roraima
        '26': 'SC',  # Santa Catarina
        '27': 'SP',  # São Paulo
        '28': 'SE',  # Sergipe
        '29': 'GO',  # Goiás
        '30': 'PE',  # Pernambuco
        '31': 'TO',  # Tocantins
    },
    # France - regions
    'FR': {
        '11': 'IDF',  # Île-de-France
        '24': 'CVL',  # Centre-Val de Loire
        '27': 'BFC',  # Bourgogne-Franche-Comté
        '28': 'NOR',  # Normandie
        '32': 'HDF',  # Hauts-de-France
        '44': 'GES',  # Grand Est
        '52': 'PDL',  # Pays de la Loire
        '53': 'BRE',  # Bretagne
        '75': 'NAQ',  # Nouvelle-Aquitaine
        '76': 'OCC',  # Occitanie
        '84': 'ARA',  # Auvergne-Rhône-Alpes
        '93': 'PAC',  # Provence-Alpes-Côte d'Azur
        '94': 'COR',  # Corse
        '92': 'IDF',  # department code: Hauts-de-Seine -> Île-de-France
    },
    # Poland - voivodeships
    'PL': {
        '72': 'SK',  # Świętokrzyskie
        '73': 'KP',  # Kujawsko-Pomorskie
        '74': 'LU',  # Lubelskie
        '75': 'LB',  # Lubuskie
        '76': 'LD',  # Łódzkie
        '77': 'MA',  # Małopolskie
        '78': 'MZ',  # Mazowieckie
        '79': 'OP',  # Opolskie
        '80': 'PK',  # Podkarpackie
        '81': 'PD',  # Podlaskie
        '82': 'PM',  # Pomorskie
        '83': 'SL',  # Śląskie
        '84': 'WN',  # Warmińsko-Mazurskie
        '85': 'WP',  # Wielkopolskie
        '86': 'ZP',  # Zachodniopomorskie
        '87': 'DS',  # Dolnośląskie
    },
    # Sweden - counties
    'SE': {
        '02': 'AC',  # Västerbotten
        '03': 'Y',   # Västernorrland
        '04': 'Z',   # Jämtland
        '05': 'BD',  # Norrbotten
        '06': 'X',   # Gävleborg
        '07': 'W',   # Dalarna
        '08': 'S',   # Värmland
        '09': 'T',   # Örebro
        '10': 'U',   # Västmanland
        '12': 'C',   # Uppsala
        '13': 'D',   # Södermanland
        '14': 'AB',  # Stockholm
        '15': 'I',   # Gotland
        '16': 'E',   # Östergötland
        '17': 'F',   # Jönköping
        '18': 'H',   # Kalmar
        '19': 'G',   # Kronoberg
        '21': 'K',   # Blekinge
        '22': 'M',   # Skåne
        '23': 'N',   # Halland
        '24': 'O',   # Västra Götaland
        '25': 'AB',  # Stockholm (alt code)
        '26': 'AB',  # Stockholm
    },
    # Italy - regions
    'IT': {
        '01': '65',  # Abruzzo
        '02': '77',  # Basilicata
        '03': '78',  # Calabria
        '04': '72',  # Campania
        '05': '45',  # Emilia-Romagna
        '06': '36',  # Friuli-Venezia Giulia
        '07': '62',  # Lazio
        '08': '42',  # Liguria
        '09': '25',  # Lombardia
        '10': '57',  # Marche
        '11': '67',  # Molise
        '12': '21',  # Piemonte
        '13': '75',  # Puglia
        '14': '88',  # Sardegna
        '15': '82',  # Sicilia
        '16': '52',  # Toscana
        '17': '32',  # Trentino-Alto Adige
        '18': '55',  # Umbria
        '19': '23',  # Valle d'Aosta
        '20': '34',  # Veneto
    },
    # Canada - provinces and territories
    'CA': {
        '01': 'AB',  # Alberta
        '02': 'BC',  # British Columbia
        '03': 'MB',  # Manitoba
        '04': 'NB',  # New Brunswick
        '05': 'NL',  # Newfoundland and Labrador
        '07': 'NS',  # Nova Scotia
        '08': 'ON',  # Ontario
        '09': 'PE',  # Prince Edward Island
        '10': 'QC',  # Quebec
        '11': 'SK',  # Saskatchewan
        '12': 'YT',  # Yukon
        '13': 'NT',  # Northwest Territories
        '14': 'NU',  # Nunavut
    },
    # Russia - federal subjects
    'RU': {
        '48': 'MOW',  # Moscow (city)
        '47': 'MOS',  # Moscow Oblast
        '66': 'SPE',  # Saint Petersburg
    },
    # Vietnam
    'VN': {
        '44': 'HN',  # Hanoi
        '20': 'SG',  # Ho Chi Minh City
    },
    # China
    'CN': {
        '02': 'ZJ',  # Zhejiang
        '04': 'JS',  # Jiangsu
        '06': 'AH',  # Anhui
        '07': 'FJ',  # Fujian
        '09': 'HA',  # Henan
        '11': 'HB',  # Hubei
        '12': 'HN',  # Hunan
        '13': 'JX',  # Jiangxi
        '19': 'LN',  # Liaoning
        '22': 'BJ',  # Beijing
        '23': 'SH',  # Shanghai
        '25': 'SD',  # Shandong
        '26': 'SX',  # Shanxi
        '28': 'SN',  # Shaanxi
        '30': 'SH',  # Shanghai (alt)
        '32': 'TJ',  # Tianjin
        '33': 'XZ',  # Tibet
    },
    # Mexico
    'MX': {
        '05': 'COA',  # Coahuila
        '07': 'CHP',  # Chiapas
        '08': 'CHH',  # Chihuahua
        '09': 'CMX',  # Ciudad de México (CDMX)
        '10': 'DUR',  # Durango
        '11': 'GUA',  # Guanajuato
        '14': 'JAL',  # Jalisco
        '15': 'MEX',  # Estado de México
        '16': 'MIC',  # Michoacán
        '17': 'MOR',  # Morelos
        '18': 'NAY',  # Nayarit
        '19': 'NLE',  # Nuevo León
        '20': 'OAX',  # Oaxaca
        '21': 'PUE',  # Puebla
        '22': 'QUE',  # Querétaro
        '23': 'ROO',  # Quintana Roo
        '24': 'SLP',  # San Luis Potosí
        '25': 'SIN',  # Sinaloa
        '26': 'SON',  # Sonora
        '27': 'TAB',  # Tabasco
        '28': 'TAM',  # Tamaulipas
        '29': 'TLA',  # Tlaxcala
        '30': 'VER',  # Veracruz
        '31': 'YUC',  # Yucatán
        '32': 'ZAC',  # Zacatecas
    },
    # Spain - autonomous communities
    'ES': {
        '29': 'CT',  # Cataluña
        '31': 'AN',  # Andalucía
        '32': 'AR',  # Aragón
        '33': 'AS',  # Asturias
        '34': 'CL',  # Castilla y León
        '37': 'CM',  # Castilla-La Mancha
        '39': 'CN',  # Canarias
        '51': 'EX',  # Extremadura
        '52': 'GA',  # Galicia
        '53': 'IB',  # Islas Baleares
        '54': 'RI',  # La Rioja
        '55': 'MD',  # Madrid
        '56': 'CT',  # Cataluña (alt - Girona)
        '57': 'MC',  # Murcia
        '58': 'NC',  # Navarra
        '59': 'PV',  # País Vasco
        '60': 'VC',  # Comunidad Valenciana
    },
    # Bulgaria - GeoNames admin1 (38-65) to ISO 3166-2 (01-28)
    'BG': {
        '38': '01',  # Blagoevgrad
        '39': '02',  # Burgas
        '40': '08',  # Dobrich
        '41': '07',  # Gabrovo
        '42': '22',  # Sofia-Capital (Sofia City)
        '43': '26',  # Haskovo
        '44': '09',  # Kardzhali
        '45': '10',  # Kyustendil
        '46': '11',  # Lovech
        '47': '12',  # Montana
        '48': '13',  # Pazardzhik
        '49': '14',  # Pernik
        '50': '15',  # Pleven
        '51': '16',  # Plovdiv
        '52': '17',  # Razgrad
        '53': '18',  # Ruse
        '54': '27',  # Shumen
        '55': '19',  # Silistra
        '56': '20',  # Sliven
        '57': '21',  # Smolyan
        '58': '23',  # Sofia (Province)
        '59': '24',  # Stara Zagora
        '60': '25',  # Targovishte
        '61': '03',  # Varna
        '62': '04',  # Veliko Tarnovo
        '63': '05',  # Vidin
        '64': '06',  # Vratsa
        '65': '28',  # Yambol
    },
    # Iran - provinces
    'IR': {
        '04': 'SB',  # Sistan and Baluchestan
        '23': '15',  # Lorestan
        '26': '23',  # Tehran
        '33': '01',  # East Azerbaijan
        '39': '25',  # Qom
        '09': '13',  # Hamadan
    },
    # Ukraine - oblasts
    'UA': {
        '12': '30',  # Kyiv City
        '30': '32',  # Kyiv Oblast (alt code)
    },
    # Thailand - provinces
    'TH': {
        '10': '10',  # Bangkok (already ISO)
        '40': '10',  # Bangkok (alt GeoNames)
        '73': '48',  # Nakhon Phanom
    },
    # Portugal - districts
    'PT': {
        '11': '09',  # Guarda
        '14': '11',  # Lisboa
        '17': '13',  # Porto
    },
    # Norway - counties
    'NO': {
        '12': '03',  # Oslo
        '46': '46',  # Vestland (already ISO)
        '09': '09',  # Nordland (numeric is valid)
    },
    # Finland - regions
    'FI': {
        '01': '18',  # Uusimaa
        '02': '19',  # Varsinais-Suomi (Southwest Finland)
    },
    # Denmark - regions
    'DK': {
        '17': '84',  # Capital Region (Hovedstaden)
        '84': '84',  # Already correct
    },
    # Hungary
    'HU': {'05': 'BU'},  # Budapest
    # Indonesia - provinces
    'ID': {
        '02': 'BA',  # Bali
        '04': 'JK',  # Jakarta
        '07': 'JT',  # Central Java
        '38': 'SN',  # South Sulawesi
        '08': 'JI',  # East Java
    },
    # Israel - districts
    'IL': {
        '04': 'HA',  # Haifa
        '06': 'JM',  # Jerusalem
    },
    # Malaysia
    'MY': {
        '13': '11',  # Terengganu
        '14': '14',  # Kuala Lumpur (already correct)
    },
    # Sri Lanka - provinces
    'LK': {
        '29': '2',  # Central Province
        '36': '1',  # Western Province
    },
    # Uzbekistan
    'UZ': {
        '13': 'TK',  # Tashkent City
        '14': 'TO',  # Tashkent Region
    },
    # Liechtenstein
    'LI': {
        '03': '03',  # Gamprin (already correct)
        '11': '11',  # Vaduz (already correct)
    },
    # Armenia
    'AM': {'11': 'ER'},  # Yerevan
    # Bosnia and Herzegovina
    'BA': {
        '01': 'BIH',  # Federation of B&H
        '02': 'SRP',  # Republika Srpska
    },
    # Cuba
    'CU': {'02': '03'},  # Ciudad de La Habana (Havana)
    # Kazakhstan
    'KZ': {'02': 'ALA'},  # Almaty (city)
    # Latvia
    'LV': {'25': 'RIX'},  # Riga
    # Morocco
    'MA': {
        '03': '03',  # Fès-Meknès
        '04': '04',  # Rabat-Salé-Kénitra
    },
    # Namibia
    'NA': {'21': 'KH'},  # Khomas
    # Tunisia
    'TN': {'36': '11'},  # Tunis
    # Uruguay
    'UY': {'10': 'MO'},  # Montevideo
    # Venezuela
    'VE': {'25': 'A'},  # Distrito Capital
    # South Africa
    'ZA': {
        '06': 'GP',  # Gauteng
        '11': 'WC',  # Western Cape
    },
    # Zimbabwe
    'ZW': {'10': 'HA'},  # Harare

    # Additional countries with one-off mappings
    'AG': {'04': '04'},   # Antigua and Barbuda - Saint John
    'AI': {'00': 'XX'},   # Anguilla - no subdivision; mapped to placeholder 'XX'
    'AL': {'50': 'TR'},   # Albania - Tirana
    'AO': {'20': 'LUA'},  # Angola - Luanda
    'AW': {'00': 'XX'},   # Aruba - no subdivision; mapped to placeholder 'XX'
    'AZ': {'09': 'BA'},   # Azerbaijan - Baku
    'BB': {
        '07': '07',  # Barbados - Saint Michael
        '08': '08',  # Barbados - Christ Church
    },
    'BD': {'81': 'C'},    # Bangladesh - Dhaka Division
    'BH': {'16': '15'},   # Bahrain - Manama (Capital)
    'BJ': {'16': 'AQ'},   # Benin - Atlantique (Porto-Novo)
    'BO': {'01': 'H'},    # Bolivia - Chuquisaca (Sucre)
    'BS': {'23': 'NP'},   # Bahamas - New Providence (Nassau)
    'BZ': {'02': 'BZ'},   # Belize - Belize District
    'CO': {'34': 'DC'},   # Colombia - Bogotá D.C.
    'CV': {'14': 'PR'},   # Cape Verde - Praia
    'CW': {'00': 'XX'},   # Curaçao - no subdivision; mapped to placeholder 'XX'
    'DO': {
        '01': '01',  # Dominican Republic - Distrito Nacional
        '19': '19',  # Dominican Republic - Hermanas Mirabal
        '34': '01',  # Dominican Republic - Santo Domingo (alt)
    },
    'DZ': {'01': '16'},   # Algeria - Algiers
    'EC': {'18': 'P'},    # Ecuador - Pichincha (Quito)
    'EE': {
        '37': '37',  # Estonia - Harju (Tallinn) - numeric is ISO
        '79': '79',  # Estonia - Tartu - numeric is ISO
    },
    'ET': {'44': 'AA'},   # Ethiopia - Addis Ababa
    'FJ': {'01': 'C'},    # Fiji - Central Division (Suva)
    'GH': {'01': 'AA'},   # Ghana - Greater Accra
    'GT': {'07': 'GU'},   # Guatemala - Guatemala Department
    'GY': {'12': 'DE'},   # Guyana - Demerara-Mahaica (Georgetown)
    'HT': {'11': 'OU'},   # Haiti - Ouest (Port-au-Prince)
    'JM': {
        '08': '01',  # Jamaica - Kingston
        '10': '14',  # Jamaica - Saint Andrew (Spanish Town)
    },
    'JO': {'16': 'AM'},   # Jordan - Amman
    'KG': {'01': 'GB'},   # Kyrgyzstan - Bishkek City
    'KH': {'22': '12'},   # Cambodia - Phnom Penh
    'KP': {'12': '01'},   # North Korea - Pyongyang
    'LB': {
        '05': 'JL',  # Lebanon - Mount Lebanon (Keserwan-Jbeil)
        '11': 'BA',  # Lebanon - Beirut
    },
    'LC': {'03': '02'},   # Saint Lucia - Castries
    'LT': {'65': 'VL'},   # Lithuania - Vilnius
    'MG': {'11': 'T'},    # Madagascar - Antananarivo
    'ML': {
        '6': 'BKO',   # Mali - Bamako (alt code)
        '08': 'BKO',  # Mali - Bamako
    },
    'MM': {
        '08': '07',  # Myanmar - Mandalay
        '18': '03',  # Myanmar - Rakhine (Kyaukpyu)
    },
    'MN': {'20': '1'},    # Mongolia - Ulaanbaatar
    'MS': {'03': 'XX'},   # Montserrat - no subdivision
    'MU': {
        '15': 'PL',  # Mauritius - Port Louis (city)
        '17': 'PW',  # Mauritius - Plaines Wilhems (Curepipe)
        '18': 'PL',  # Mauritius - Port Louis District
    },
    'NI': {'10': 'MN'},   # Nicaragua - Managua
    'NP': {'3': 'BA'},    # Nepal - Bagmati (Kathmandu)
    'OM': {'06': 'MA'},   # Oman - Muscat
    'PA': {'8': '8'},     # Panama - Panamá
    'PH': {
        '00': '00',   # Philippines - NCR (National Capital Region)
        '06': 'RIZ',  # Philippines - Calabarzon (Rizal)
    },
    'PK': {'08': 'SD'},   # Pakistan - Sindh (Karachi alt)
    'PY': {'22': 'ASU'},  # Paraguay - Asunción
    'SN': {'01': 'DK'},   # Senegal - Dakar
    'SR': {'16': 'PM'},   # Suriname - Paramaribo
    'SV': {'05': 'SS'},   # El Salvador - San Salvador
    'SX': {'00': 'XX'},   # Sint Maarten - no subdivision
    'SY': {
        '09': 'HL',  # Syria - Aleppo
        '11': 'RD',  # Syria - Rif Dimashq (rural Damascus)
    },
    'TJ': {'04': 'DU'},   # Tajikistan - Dushanbe
    'TT': {'05': 'POS'},  # Trinidad and Tobago - Port of Spain
    'TZ': {'23': 'DS'},   # Tanzania - Dar es Salaam
}

# Mapping: (country, geonames_admin1) -> ISO 3166-2 code
# Flattened from the per-country tables so lookups use a single tuple key.
MAPPINGS = {
    (country_code, admin1_code): iso_code
    for country_code, admin1_table in _ADMIN1_TO_ISO_BY_COUNTRY.items()
    for admin1_code, iso_code in admin1_table.items()
}
|
|
|
|
# Countries whose ISO 3166-2 subdivision codes are themselves numeric.
# A numeric code without an explicit entry in MAPPINGS is assumed to be a
# valid ISO code already and is skipped instead of being reported as an error.
SKIP_COUNTRIES = {
    # --- Original set ---
    'AT',  # Austria: 1-9
    'BG',  # Bulgaria: 01-28 (38-65 have explicit mappings above)
    'CZ',  # Czech Republic: 10-80 (NUTS codes)
    'HR',  # Croatia: 01-21
    'IS',  # Iceland: 0-8
    'IT',  # Italy: 21-88
    'JP',  # Japan: 01-47
    'KR',  # South Korea: 11-50
    'LI',  # Liechtenstein: 01-11 (municipality codes)
    'MT',  # Malta: 01-68
    'SI',  # Slovenia: 001-213 (municipality codes)
    'SK',  # Slovakia: 01-08 (NUTS codes)
    'TR',  # Turkey: 01-81

    # --- Added after fixes: these countries now have valid numeric ISO codes ---
    'BH',  # Bahrain: 13-17
    'CU',  # Cuba: 01-16 (our mappings produced valid codes)
    'DZ',  # Algeria: 01-48 (wilayas)
    'FI',  # Finland: 01-21 (regions)
    'IR',  # Iran: 01-32 (provinces)
    'JM',  # Jamaica: 01-14 (parishes)
    'KH',  # Cambodia: 01-25 (provinces)
    'KP',  # North Korea: 01-14
    'LC',  # Saint Lucia: 01-11 (districts)
    'LK',  # Sri Lanka: 1-9 (provinces)
    'MM',  # Myanmar: 01-17 (states/regions)
    'MN',  # Mongolia: 1, 035-073 (aimags)
    'MY',  # Malaysia: 01-16 (states)
    'NO',  # Norway: 03, 11, 15, 18, 30, 34, 38, 42, 46, 50, 54 (fylker)
    'PT',  # Portugal: 01-20 (districts)
    'TH',  # Thailand: 10-96 (changwat)
    'TN',  # Tunisia: 11-83 (governorates)
    'UA',  # Ukraine: 05-77 (oblasts)
}
|
|
|
|
# Leading text of the ``reason`` recorded in ghcid_history by update_file().
# It is also used to recognise files already corrected by an earlier run.
_CORRECTION_REASON_PREFIX = 'Region code corrected from GeoNames admin1'


def update_file(filepath: Path, dry_run: bool = False) -> dict:
    """Replace a GeoNames admin1 region code with its ISO 3166-2 code in one file.

    The country and numeric region code are taken from the file name
    (``CC-<digits>-...``). When MAPPINGS has an entry for that pair, the GHCID,
    the region-code fields and the GHCID identifier inside the YAML document
    are rewritten, the previous GHCID is appended to ``ghcid.ghcid_history``,
    and the file is renamed to carry the new code.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When true, nothing is written or renamed; the planned change
            is returned with status ``'would_update'``.

    Returns:
        A dict whose ``'status'`` is one of:

        * ``'skip'`` - nothing to do (``'reason'`` explains why),
        * ``'error'`` - the file could not be processed (``'reason'``),
        * ``'collision'`` - the target file name already exists,
        * ``'would_update'`` / ``'updated'`` - the change, including the
          ``old_file``/``new_file``, ``old_ghcid``/``new_ghcid`` and
          ``old_code``/``new_code`` keys.
    """
    filename = filepath.name

    # Extract country and numeric region code from the file name.
    match = re.match(r'^([A-Z]{2})-(\d+)-', filename)
    if not match:
        return {'status': 'skip', 'reason': 'no numeric code in filename'}

    country, old_code = match.groups()
    key = (country, old_code)

    # Without a mapping the code is either a valid numeric ISO code or unknown.
    if key not in MAPPINGS:
        if country in SKIP_COUNTRIES:
            return {'status': 'skip', 'reason': f'{country} uses valid numeric ISO codes'}
        return {'status': 'error', 'reason': f'no mapping for {key}'}

    new_code = MAPPINGS[key]
    old_prefix = match.group(0)
    new_prefix = f'{country}-{new_code}-'

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as exc:
        # Report the broken file instead of aborting the whole batch.
        return {'status': 'error', 'reason': f'invalid YAML: {exc}'}

    if not data:
        return {'status': 'error', 'reason': 'empty file'}
    if not isinstance(data, dict):
        return {'status': 'error', 'reason': 'top-level YAML value is not a mapping'}

    ghcid_section = data.get('ghcid')
    old_ghcid = ghcid_section.get('ghcid_current', '') if isinstance(ghcid_section, dict) else ''
    if not old_ghcid or not isinstance(old_ghcid, str):
        return {'status': 'error', 'reason': 'no ghcid_current'}

    # Only a leading "CC-<old>-" is rewritten; any other GHCID is left as is.
    if old_ghcid.startswith(old_prefix):
        new_ghcid = new_prefix + old_ghcid[len(old_prefix):]
    else:
        new_ghcid = old_ghcid

    if new_ghcid == old_ghcid:
        return {'status': 'skip', 'reason': 'GHCID unchanged'}

    history = ghcid_section.get('ghcid_history')
    if history is None:
        history = []
    elif not isinstance(history, list):
        return {'status': 'error', 'reason': 'ghcid_history is not a list'}

    # Several mappings are numeric -> numeric and chain (e.g. PT 14 -> 11 and
    # PT 11 -> 09). If a previous run already produced the current code, it is
    # an ISO code and must not be mapped a second time.
    already_corrected_suffix = f' to ISO 3166-2 ({old_code})'
    for entry in history:
        reason = entry.get('reason') if isinstance(entry, dict) else None
        if (
            isinstance(reason, str)
            and reason.startswith(_CORRECTION_REASON_PREFIX)
            and reason.endswith(already_corrected_suffix)
        ):
            return {'status': 'skip', 'reason': 'region code already corrected by a previous run'}

    # Replace only the matched leading prefix; str.replace() would also rewrite
    # any later occurrence of the same text inside the file name.
    new_filename = new_prefix + filename[match.end():]
    new_filepath = filepath.parent / new_filename

    change = {
        'old_file': filename,
        'new_file': new_filename,
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
    }

    # Never overwrite an existing record that already uses the target name.
    if new_filepath.exists() and new_filepath != filepath:
        return {'status': 'collision', **change}

    change['old_code'] = old_code
    change['new_code'] = new_code

    if dry_run:
        return {'status': 'would_update', **change}

    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    ghcid_section['ghcid_current'] = new_ghcid

    # Update region_code in ghcid.location_resolution when that section exists.
    location_resolution = ghcid_section.get('location_resolution')
    if isinstance(location_resolution, dict):
        location_resolution['region_code'] = new_code

    # Update location.region_code only if the field is already present.
    location = data.get('location')
    if isinstance(location, dict) and 'region_code' in location:
        location['region_code'] = new_code

    # Keep the GHCID identifier entry in sync with the new GHCID.
    for ident in data.get('identifiers') or []:
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
            ident['identifier_value'] = new_ghcid

    # Record the retired GHCID; valid_from is taken from the latest history
    # entry when there is one.
    latest = history[-1] if history else None
    history.append({
        'ghcid': old_ghcid,
        'valid_from': latest.get('valid_from') if isinstance(latest, dict) else None,
        'valid_to': timestamp,
        'reason': f'{_CORRECTION_REASON_PREFIX} ({old_code}) to ISO 3166-2 ({new_code})',
    })
    ghcid_section['ghcid_history'] = history

    # Serialise before opening the file for writing so that a serialisation
    # failure cannot leave a truncated file behind.
    serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    if new_filepath != filepath:
        filepath.rename(new_filepath)

    return {'status': 'updated', **change}
|
|
|
|
def main():
    """Run the region-code fix over every matching YAML file and print a report.

    Command-line options:
        --dry-run   Report what would change without writing or renaming.
        --country   Restrict processing to one two-letter country code
                    (case-insensitive).
        --dir       Directory holding the custodian YAML files; defaults to
                    the original hard-coded data directory.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix numeric region codes to ISO 3166-2')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--country', type=str, help='Process only specified country code')
    parser.add_argument(
        '--dir',
        dest='custodian_dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian'),
        help='Directory containing the custodian YAML files',
    )
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    if not custodian_dir.is_dir():
        # A missing directory would otherwise look like "nothing to fix".
        parser.error(f'custodian directory not found: {custodian_dir}')

    # File names use upper-case country codes, so normalise the filter.
    country = args.country.strip().upper() if args.country else None

    # Files with numeric region codes: "CC-<digits>-..."
    numeric_name = re.compile(r'^[A-Z]{2}-\d+-')

    results = {
        'updated': [],
        'skipped': [],
        'collisions': [],
        'errors': []
    }

    # Target names already claimed during a dry run. A real run renames the
    # first file, so the second one hits the on-disk collision check; a dry
    # run has to simulate that to give the same report.
    planned_targets = set()

    for filepath in sorted(custodian_dir.glob('*.yaml')):
        filename = filepath.name

        if not numeric_name.match(filename):
            continue

        if country and not filename.startswith(f'{country}-'):
            continue

        try:
            result = update_file(filepath, dry_run=args.dry_run)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            # One unreadable file must not abort a partially applied batch.
            result = {'status': 'error', 'reason': f'{type(exc).__name__}: {exc}'}

        status = result['status']
        if status == 'would_update':
            if result['new_file'] in planned_targets:
                result = {**result, 'status': 'collision'}
                status = 'collision'
            else:
                planned_targets.add(result['new_file'])

        if status in ('updated', 'would_update'):
            results['updated'].append(result)
        elif status == 'collision':
            results['collisions'].append(result)
        elif status == 'error':
            results['errors'].append({'file': filename, **result})
        else:
            results['skipped'].append({'file': filename, **result})

    # Print results
    action = 'Would update' if args.dry_run else 'Updated'

    print(f"\n{'=' * 60}")
    print(f"REGION CODE FIX RESULTS {'(DRY RUN)' if args.dry_run else ''}")
    print(f"{'=' * 60}")

    if results['updated']:
        print(f"\n✅ {action}: {len(results['updated'])} files")
        for r in results['updated']:
            print(f" {r['old_file']}")
            print(f" -> {r['new_file']}")
            print(f" Code: {r['old_code']} -> {r['new_code']}")

    if results['collisions']:
        print(f"\n⚠️ Collisions: {len(results['collisions'])} files")
        for r in results['collisions']:
            print(f" {r['old_file']} -> {r['new_file']} (EXISTS)")

    if results['errors']:
        print(f"\n❌ Errors: {len(results['errors'])} files")
        for r in results['errors']:
            print(f" {r['file']}: {r['reason']}")

    print(f"\n📊 Summary:")
    print(f" {action}: {len(results['updated'])}")
    print(f" Collisions: {len(results['collisions'])}")
    print(f" Errors: {len(results['errors'])}")
    print(f" Skipped: {len(results['skipped'])}")
|
|
|
|
# Run the batch fix only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|