glam/scripts/fix_geonames_numeric_codes.py
2025-12-10 13:01:13 +01:00

279 lines
11 KiB
Python

#!/usr/bin/env python3
"""Fix GeoNames numeric admin1 codes to ISO 3166-2 codes for various countries."""
import os
import re
import yaml
from datetime import datetime, timezone
from pathlib import Path
# Directory that is scanned for custodian YAML records; fix_file() also builds
# the renamed "<GHCID>.yaml" path inside this directory.
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# GeoNames admin1 numeric -> ISO 3166-2 mappings
# Sources: Wikipedia ISO 3166-2 pages + GeoNames admin1 tables
#
# Structure: {country code: {region code found in a GHCID: replacement region code}}.
# - A region code that is absent from its country's table is left untouched.
# - An entry that maps a code to itself is treated as "already correct" and is
#   never rewritten, so such entries only document codes that are known-good.
# NOTE(review): the tables below were not re-verified against GeoNames
# admin1CodesASCII.txt in this review; anomalies that are visible in the data
# itself are flagged inline and should be confirmed.
COUNTRY_MAPPINGS = {
"BR": {
# Brazil: GeoNames uses 2-digit, ISO uses 2-letter
# NOTE(review): the numbers are assigned in sequence over an alphabetical
# state list — confirm this matches the GeoNames admin1 numbering for BR.
"01": "AC", "02": "AL", "03": "AP", "04": "AM", "05": "BA",
"06": "CE", "07": "DF", "08": "ES", "09": "GO", "10": "MA",
"11": "MT", "12": "MS", "13": "MG", "14": "PA", "15": "PB",
"16": "PR", "17": "PE", "18": "PI", "19": "RJ", "20": "RN",
"21": "RS", "22": "RO", "23": "RR", "24": "SC", "25": "SP",
"26": "SE", "27": "TO",
# Already correct codes
# NOTE(review): "RI" is not among the 27 target codes above — confirm it is intentional.
"AC": "AC", "AL": "AL", "AP": "AP", "AM": "AM", "BA": "BA",
"CE": "CE", "DF": "DF", "ES": "ES", "GO": "GO", "MA": "MA",
"MT": "MT", "MS": "MS", "MG": "MG", "PA": "PA", "PB": "PB",
"PR": "PR", "PE": "PE", "PI": "PI", "RJ": "RJ", "RN": "RN",
"RS": "RS", "RO": "RO", "RR": "RR", "SC": "SC", "SP": "SP",
"SE": "SE", "TO": "TO", "RI": "RI",
},
"MX": {
# Mexico: GeoNames uses 2-digit, ISO uses 3-letter
"01": "AGU", "02": "BCN", "03": "BCS", "04": "CAM", "05": "COA",
"06": "COL", "07": "CHP", "08": "CHH", "09": "CMX", "10": "DUR",
"11": "GUA", "12": "GRO", "13": "HID", "14": "JAL", "15": "MEX",
"16": "MIC", "17": "MOR", "18": "NAY", "19": "NLE", "20": "OAX",
"21": "PUE", "22": "QUE", "23": "ROO", "24": "SLP", "25": "SIN",
"26": "SON", "27": "TAB", "28": "TAM", "29": "TLA", "30": "VER",
"31": "YUC", "32": "ZAC",
# Partial codes to correct
"CI": "CMX", "DF": "CMX", # Ciudad de México
},
"CL": {
# Chile: ISO uses 2-letter codes
"01": "TA", "02": "AN", "03": "AT", "04": "CO", "05": "VA",
"06": "LI", "07": "ML", "08": "BI", "09": "AR", "10": "LL",
"11": "AI", "12": "MA", "13": "RM", "14": "LR", "15": "AP",
"16": "NB",
# 2017 new regions
# NOTE(review): both entries below are marked as fallbacks and reuse the
# targets of "01" and "08" — confirm they should not map to their own codes.
"17": "TA", # Fallback
"18": "BI", # Ñuble -> Biobío fallback
},
"FR": {
# France: Use new region codes (2016 reform)
"11": "IDF", "24": "CVL", "27": "BFC", "28": "NOR", "32": "HDF",
"44": "GES", "52": "PDL", "53": "BRE", "75": "NAQ", "76": "OCC",
"84": "ARA", "93": "PAC", "94": "COR",
},
"KR": {
# South Korea: GeoNames uses 2-digit, ISO uses 2-letter
# NOTE(review): "01" and "11" both map to "SO" — verify against the source tables.
"01": "SO", "02": "IC", "03": "PU", "04": "TG", "05": "KW",
"06": "GJ", "07": "TJ", "08": "US", "09": "UL", "10": "SJ",
"11": "SO", "12": "GG", "13": "GW", "14": "CB", "15": "CN",
"16": "JB", "17": "JN", "18": "GB", "19": "GN", "20": "CJ",
# Seoul special handling
# Identity entry: a GHCID whose region is already "SE" is left unchanged.
"SE": "SE",
},
"ES": {
# Spain: Use autonomous community codes
"29": "AN", "51": "CE", "52": "ML", "55": "PV",
"58": "MC", "60": "NC",
},
"PL": {
# Poland: GeoNames uses 2-digit, ISO uses 2-letter
"72": "DS", "73": "KP", "74": "LU", "75": "LB", "76": "LD",
"77": "MA", "78": "MZ", "79": "OP", "80": "PK", "81": "PD",
"82": "PM", "83": "SK", "84": "SL", "85": "WP", "86": "ZP",
},
"TR": {
# Turkey: Uses plate codes (01-81), match to ISO
# Every entry here maps a code to itself, so this table never triggers a rewrite.
"06": "06", "34": "34", "35": "35", "32": "32", "38": "38",
"68": "68", "71": "71",
# Turkey ISO codes ARE numeric, so most are valid
},
"IN": {
# India: GeoNames uses 2-digit, ISO uses 2-letter
"01": "AN", "02": "AP", "03": "AR", "04": "AS", "05": "BR",
"06": "CH", "07": "CT", "08": "DN", "09": "DD", "10": "GA",
"11": "GJ", "12": "HR", "13": "HP", "14": "JK", "15": "JH",
"16": "KA", "17": "KL", "19": "MP", "20": "MH", "21": "MN",
"22": "ML", "23": "MZ", "24": "NL", "25": "OR", "26": "PY",
"27": "PB", "28": "RJ", "29": "SK", "30": "TN", "31": "TG",
"32": "TR", "33": "UP", "34": "UT", "35": "WB", "36": "TS",
},
"SE": {
# Sweden: GeoNames uses 2-digit, ISO uses 1-2 letter
# Several numeric codes share a target ("12"/"26"/"27" -> "AB", "22"/"24" -> "AC",
# "23"/"25" -> "BD"); presumably older and newer county numbering — TODO confirm.
"01": "K", "02": "M", "03": "N", "04": "O", "05": "F",
"06": "G", "07": "H", "08": "I", "09": "D", "10": "E",
"12": "AB", "13": "C", "14": "S", "15": "T", "16": "U",
"17": "W", "18": "X", "19": "Y", "21": "Z", "22": "AC",
"23": "BD", "24": "AC", "25": "BD", "26": "AB", "27": "AB",
},
"IT": {
# Italy: GeoNames uses 2-digit, ISO uses 2-3 letter
# The second group ("62".."88") repeats targets already reachable from "11".."20".
"01": "PIE", "02": "VDA", "03": "LOM", "04": "TAA", "05": "VEN",
"06": "FVG", "07": "LIG", "08": "EMR", "09": "TOS", "10": "UMB",
"11": "MAR", "12": "LAZ", "13": "ABR", "14": "MOL", "15": "CAM",
"16": "PUG", "17": "BAS", "18": "CAL", "19": "SIC", "20": "SAR",
"62": "MAR", "65": "ABR", "66": "MOL", "67": "CAM", "72": "PUG",
"75": "BAS", "78": "CAL", "82": "SIC", "88": "SAR",
},
"GE": {
# Georgia: GeoNames uses 2-digit, ISO uses 2-letter
"01": "AB", "02": "AJ", "04": "GU", "05": "IM", "06": "KA",
"07": "KK", "08": "MM", "09": "RL", "10": "SZ", "11": "SJ",
"51": "TB", "52": "TB", "53": "KA",
},
"CA": {
# Canada: GeoNames uses 2-digit, ISO uses 2-letter
"01": "AB", "02": "BC", "03": "MB", "04": "NB", "05": "NL",
"07": "NS", "08": "ON", "09": "PE", "10": "QC", "11": "SK",
"12": "YT", "13": "NT", "14": "NU",
},
"RU": {
# Russia: GeoNames uses 2-digit, ISO uses 2-3 letter (complex)
# Only a handful of codes are covered; "48" and "77" both map to "MOW".
"47": "LEN", "48": "MOW", "66": "SVE", "77": "MOW",
"78": "SPE", "86": "KHM",
},
"VN": {
# Vietnam: GeoNames uses 2-digit, ISO uses 2-digit (some match)
# NOTE(review): "03" -> "01" and "04" -> "02" produce values that are also
# keys here; a second run would remap an already-fixed file — confirm this
# script is only meant to run once per data set.
"01": "44", "02": "43", "03": "01", "04": "02",
# Many Vietnam codes ARE numeric in ISO
},
"CN": {
# China: GeoNames uses 2-digit, ISO uses 2-letter
# NOTE(review): "09" and "11" both map to "HN", and "13" and "14" both map
# to "SX" — verify each entry against the GeoNames admin1 table for CN.
"01": "AH", "02": "ZJ", "03": "JX", "04": "JS", "05": "JL",
"06": "QH", "07": "FJ", "08": "HL", "09": "HN", "10": "HB",
"11": "HN", "12": "SD", "13": "SX", "14": "SX", "15": "SC",
"16": "YN", "18": "TW", "19": "NM", "20": "NX", "21": "BJ",
"22": "TJ", "23": "SH", "24": "CQ", "25": "HI", "26": "GX",
"28": "XZ", "29": "XJ", "30": "GD", "31": "MO", "32": "HK",
"33": "GS", "34": "GZ", "35": "LN", "36": "SN",
},
"EG": {
# Egypt: GeoNames uses 2-digit, ISO uses 2-3 letter
"01": "C", "02": "ALX", "03": "SHG", "04": "ASN", "05": "AST",
"06": "BH", "07": "BNS", "08": "DK", "09": "DT", "10": "FYM",
"11": "GH", "12": "GZ", "14": "IS", "15": "KFS", "16": "MN",
"17": "MNF", "18": "MT", "19": "KN", "20": "SIN", "21": "WAD",
"22": "JS", "23": "KB", "24": "SHR", "25": "SUZ", "26": "BA",
"27": "PTS", "28": "LX", "29": "SU",
},
}
def fix_file(filepath: Path, country: str) -> tuple[bool, str]:
    """Rewrite one custodian YAML record so its GHCID uses the mapped region code.

    The region segment of ``ghcid.ghcid_current`` (``<country>-<region>-<rest>``)
    is looked up in ``COUNTRY_MAPPINGS[country]``. When the lookup yields a
    different code, the record is updated in these places:

    * ``ghcid.ghcid_current``
    * ``ghcid.location_resolution.region_code`` (when that mapping exists)
    * ``ghcid.ghcid_history`` (a new entry is inserted at the front)
    * ``location.region_code`` (only when it still holds the old code)
    * every ``identifiers`` entry whose ``identifier_scheme`` is ``"GHCID"``

    The updated record is then stored as ``CUSTODIAN_DIR/<new GHCID>.yaml`` and
    the original file is removed.

    Args:
        filepath: Path of the YAML record to fix.
        country: Country code; must be a key of ``COUNTRY_MAPPINGS``.

    Returns:
        ``(True, "<old GHCID> -> <new GHCID>")`` when the record was rewritten,
        otherwise ``(False, reason)``. Callers match on the reason prefixes
        ``"No mapping"``, ``"Already correct"`` and ``"COLLISION"``; the other
        reasons are ``"No GHCID"``, ``"Invalid format"`` and ``"Invalid YAML"``.

    Raises:
        OSError: If the file cannot be read, written, replaced or removed.
    """
    if country not in COUNTRY_MAPPINGS:
        return False, f"No mapping for {country}"
    mapping = COUNTRY_MAPPINGS[country]

    # A single unparsable file must not abort a whole batch run, so report it
    # as a failure reason instead of letting the parser error propagate.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except yaml.YAMLError as exc:
        return False, f"Invalid YAML: {type(exc).__name__}"

    # Tolerate empty documents, non-mapping roots and a null/non-mapping
    # `ghcid` value: all of them mean there is no GHCID to fix.
    ghcid_block = data.get('ghcid') if isinstance(data, dict) else None
    if not isinstance(ghcid_block, dict):
        return False, "No GHCID"

    current = ghcid_block.get('ghcid_current', '')
    if not isinstance(current, str):
        return False, "Invalid format"
    match = re.match(rf'^{re.escape(country)}-([A-Z0-9]{{1,3}})-(.+)$', current)
    if not match:
        return False, "Invalid format"
    old_region, rest = match.group(1), match.group(2)

    if old_region not in mapping:
        return False, f"No mapping for region {old_region}"
    new_region = mapping[old_region]
    if old_region == new_region:
        return False, "Already correct"

    new_ghcid = f"{country}-{new_region}-{rest}"
    new_filepath = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

    # Refuse to overwrite a different record that already owns the new GHCID.
    if new_filepath.exists() and new_filepath != filepath:
        return False, f"COLLISION: {new_ghcid}"

    timestamp = datetime.now(timezone.utc).isoformat()
    ghcid_block['ghcid_current'] = new_ghcid

    resolution = ghcid_block.get('location_resolution')
    if isinstance(resolution, dict):
        resolution['region_code'] = new_region

    # Newest history entry goes first; a missing or null history becomes a list.
    history = ghcid_block.get('ghcid_history')
    if not isinstance(history, list):
        history = []
        ghcid_block['ghcid_history'] = history
    history.insert(0, {
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f"Fixed region code: {old_region} -> {new_region} (ISO 3166-2:{country})"
    })

    # Only touch location.region_code when it still carries the old code.
    location = data.get('location')
    if isinstance(location, dict) and location.get('region_code') == old_region:
        location['region_code'] = new_region

    identifiers = data.get('identifiers')
    if isinstance(identifiers, list):
        for ident in identifiers:
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
                ident['identifier_value'] = new_ghcid

    # Serialise before touching the disk so a dump failure cannot leave a
    # truncated record behind. NOTE: yaml.dump re-emits the whole document, so
    # comments and formatting of the original YAML are not preserved.
    serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Write to a temporary sibling and move it into place, then drop the old
    # file. If the process dies half-way the original record is still intact
    # under its old name (a re-run then reports a COLLISION), instead of new
    # content sitting under the old filename where a re-run would skip it.
    tmp_path = new_filepath.with_name(new_filepath.name + ".tmp")
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(serialized)
        os.replace(tmp_path, new_filepath)
    except OSError:
        tmp_path.unlink(missing_ok=True)
        raise
    if new_filepath != filepath:
        os.remove(filepath)

    return True, f"{current} -> {new_ghcid}"
def main():
    """Fix region codes in every custodian file of every mapped country.

    For each country in ``COUNTRY_MAPPINGS`` the files matching
    ``<country>-*.yaml`` in ``CUSTODIAN_DIR`` are processed in sorted order.
    A file is handed to ``fix_file`` only when the region segment of its
    *filename* has a mapping that differs from the code itself. Per-file
    results, a per-country summary and grand totals are printed to stdout.

    Failures are counted rather than raised: an ``OSError`` or YAML parse
    error on one file is reported as an error and the run continues.
    Results whose message contains "Already" or "No mapping" are ignored
    silently.
    """
    total_fixed = 0
    total_collisions = 0
    total_errors = 0

    for country, mapping in COUNTRY_MAPPINGS.items():
        files = sorted(CUSTODIAN_DIR.glob(f"{country}-*.yaml"))
        if not files:
            continue
        print(f"\n=== {country} ({len(files)} files) ===")

        # The filename pattern only depends on the country: build it once.
        region_re = re.compile(rf'^{re.escape(country)}-([A-Z0-9]{{1,3}})-')
        fixed = 0
        collisions = 0
        errors = 0

        for path in files:
            match = region_re.match(path.name)
            if not match:
                continue
            old_region = match.group(1)
            # Skip unmapped and identity-mapped regions without opening the file.
            if mapping.get(old_region, old_region) == old_region:
                continue

            # One unreadable or corrupt file must not abort the whole batch.
            try:
                success, msg = fix_file(path, country)
            except (OSError, yaml.YAMLError) as exc:
                print(f"  Error: {path.name}: {exc}")
                errors += 1
                continue

            if success:
                print(f"  Fixed: {msg}")
                fixed += 1
            elif "COLLISION" in msg:
                print(f"  {msg}")
                collisions += 1
            elif "Already" not in msg and "No mapping" not in msg:
                print(f"  Error: {path.name}: {msg}")
                errors += 1

        if fixed or collisions:
            print(f"  Summary: Fixed {fixed}, Collisions {collisions}")
        total_fixed += fixed
        total_collisions += collisions
        total_errors += errors

    print("\n=== TOTAL ===")
    print(f"Fixed: {total_fixed}, Collisions: {total_collisions}, Errors: {total_errors}")


if __name__ == "__main__":
    main()