#!/usr/bin/env python3
"""Fix GeoNames numeric admin1 codes to ISO 3166-2 codes for various countries."""

import os
import re
from datetime import datetime, timezone
from pathlib import Path

import yaml

CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# GeoNames admin1 numeric -> ISO 3166-2 mappings
# Sources: Wikipedia ISO 3166-2 pages + GeoNames admin1 tables
#
# An entry whose key equals its value marks a code that is already correct;
# such files are skipped by both main() and fix_file().
#
# NOTE(review): some tables map two different numeric codes onto the same
# target (e.g. CN "09" and "11" -> "HN", "13" and "14" -> "SX"). Files that
# would land on an existing filename are reported as collisions rather than
# overwritten, but please confirm these duplicates are intended.
COUNTRY_MAPPINGS = {
    "BR": {  # Brazil: GeoNames uses 2-digit, ISO uses 2-letter
        "01": "AC", "02": "AL", "03": "AP", "04": "AM", "05": "BA", "06": "CE",
        "07": "DF", "08": "ES", "09": "GO", "10": "MA", "11": "MT", "12": "MS",
        "13": "MG", "14": "PA", "15": "PB", "16": "PR", "17": "PE", "18": "PI",
        "19": "RJ", "20": "RN", "21": "RS", "22": "RO", "23": "RR", "24": "SC",
        "25": "SP", "26": "SE", "27": "TO",
        # Already correct codes
        "AC": "AC", "AL": "AL", "AP": "AP", "AM": "AM", "BA": "BA", "CE": "CE",
        "DF": "DF", "ES": "ES", "GO": "GO", "MA": "MA", "MT": "MT", "MS": "MS",
        "MG": "MG", "PA": "PA", "PB": "PB", "PR": "PR", "PE": "PE", "PI": "PI",
        "RJ": "RJ", "RN": "RN", "RS": "RS", "RO": "RO", "RR": "RR", "SC": "SC",
        "SP": "SP", "SE": "SE", "TO": "TO", "RI": "RI",
    },
    "MX": {  # Mexico: GeoNames uses 2-digit, ISO uses 3-letter
        "01": "AGU", "02": "BCN", "03": "BCS", "04": "CAM", "05": "COA",
        "06": "COL", "07": "CHP", "08": "CHH", "09": "CMX", "10": "DUR",
        "11": "GUA", "12": "GRO", "13": "HID", "14": "JAL", "15": "MEX",
        "16": "MIC", "17": "MOR", "18": "NAY", "19": "NLE", "20": "OAX",
        "21": "PUE", "22": "QUE", "23": "ROO", "24": "SLP", "25": "SIN",
        "26": "SON", "27": "TAB", "28": "TAM", "29": "TLA", "30": "VER",
        "31": "YUC", "32": "ZAC",
        # Partial codes to correct
        "CI": "CMX", "DF": "CMX",  # Ciudad de México
    },
    "CL": {  # Chile: ISO uses 2-letter codes
        "01": "TA", "02": "AN", "03": "AT", "04": "CO", "05": "VA", "06": "LI",
        "07": "ML", "08": "BI", "09": "AR", "10": "LL", "11": "AI", "12": "MA",
        "13": "RM", "14": "LR", "15": "AP", "16": "NB",  # 2017 new regions
        "17": "TA",  # Fallback
        "18": "BI",  # Ñuble -> Biobío fallback
    },
    "FR": {  # France: Use new region codes (2016 reform)
        "11": "IDF", "24": "CVL", "27": "BFC", "28": "NOR", "32": "HDF",
        "44": "GES", "52": "PDL", "53": "BRE", "75": "NAQ", "76": "OCC",
        "84": "ARA", "93": "PAC", "94": "COR",
    },
    "KR": {  # South Korea: GeoNames uses 2-digit, ISO uses 2-letter
        "01": "SO", "02": "IC", "03": "PU", "04": "TG", "05": "KW", "06": "GJ",
        "07": "TJ", "08": "US", "09": "UL", "10": "SJ", "11": "SO", "12": "GG",
        "13": "GW", "14": "CB", "15": "CN", "16": "JB", "17": "JN", "18": "GB",
        "19": "GN", "20": "CJ",
        # Seoul special handling
        "SE": "SE",
    },
    "ES": {  # Spain: Use autonomous community codes
        "29": "AN", "51": "CE", "52": "ML", "55": "PV", "58": "MC", "60": "NC",
    },
    "PL": {  # Poland: GeoNames uses 2-digit, ISO uses 2-letter
        "72": "DS", "73": "KP", "74": "LU", "75": "LB", "76": "LD", "77": "MA",
        "78": "MZ", "79": "OP", "80": "PK", "81": "PD", "82": "PM", "83": "SK",
        "84": "SL", "85": "WP", "86": "ZP",
    },
    "TR": {  # Turkey: Uses plate codes (01-81), match to ISO
        "06": "06", "34": "34", "35": "35", "32": "32", "38": "38", "68": "68",
        "71": "71",
        # Turkey ISO codes ARE numeric, so most are valid
    },
    "IN": {  # India: GeoNames uses 2-digit, ISO uses 2-letter
        "01": "AN", "02": "AP", "03": "AR", "04": "AS", "05": "BR", "06": "CH",
        "07": "CT", "08": "DN", "09": "DD", "10": "GA", "11": "GJ", "12": "HR",
        "13": "HP", "14": "JK", "15": "JH", "16": "KA", "17": "KL", "19": "MP",
        "20": "MH", "21": "MN", "22": "ML", "23": "MZ", "24": "NL", "25": "OR",
        "26": "PY", "27": "PB", "28": "RJ", "29": "SK", "30": "TN", "31": "TG",
        "32": "TR", "33": "UP", "34": "UT", "35": "WB", "36": "TS",
    },
    "SE": {  # Sweden: GeoNames uses 2-digit, ISO uses 1-2 letter
        "01": "K", "02": "M", "03": "N", "04": "O", "05": "F", "06": "G",
        "07": "H", "08": "I", "09": "D", "10": "E", "12": "AB", "13": "C",
        "14": "S", "15": "T", "16": "U", "17": "W", "18": "X", "19": "Y",
        "21": "Z", "22": "AC", "23": "BD", "24": "AC", "25": "BD", "26": "AB",
        "27": "AB",
    },
    "IT": {  # Italy: GeoNames uses 2-digit, ISO uses 2-3 letter
        "01": "PIE", "02": "VDA", "03": "LOM", "04": "TAA", "05": "VEN",
        "06": "FVG", "07": "LIG", "08": "EMR", "09": "TOS", "10": "UMB",
        "11": "MAR", "12": "LAZ", "13": "ABR", "14": "MOL", "15": "CAM",
        "16": "PUG", "17": "BAS", "18": "CAL", "19": "SIC", "20": "SAR",
        "62": "MAR", "65": "ABR", "66": "MOL", "67": "CAM", "72": "PUG",
        "75": "BAS", "78": "CAL", "82": "SIC", "88": "SAR",
    },
    "GE": {  # Georgia: GeoNames uses 2-digit, ISO uses 2-letter
        "01": "AB", "02": "AJ", "04": "GU", "05": "IM", "06": "KA", "07": "KK",
        "08": "MM", "09": "RL", "10": "SZ", "11": "SJ", "51": "TB", "52": "TB",
        "53": "KA",
    },
    "CA": {  # Canada: GeoNames uses 2-digit, ISO uses 2-letter
        "01": "AB", "02": "BC", "03": "MB", "04": "NB", "05": "NL", "07": "NS",
        "08": "ON", "09": "PE", "10": "QC", "11": "SK", "12": "YT", "13": "NT",
        "14": "NU",
    },
    "RU": {  # Russia: GeoNames uses 2-digit, ISO uses 2-3 letter (complex)
        "47": "LEN", "48": "MOW", "66": "SVE", "77": "MOW", "78": "SPE",
        "86": "KHM",
    },
    "VN": {  # Vietnam: GeoNames uses 2-digit, ISO uses 2-digit (some match)
        # NOTE(review): this table chains ("03" -> "01" and "01" -> "44"), so
        # running the script a second time would remap files that the first
        # run already fixed. Confirm before re-running on VN data.
        "01": "44", "02": "43", "03": "01", "04": "02",
        # Many Vietnam codes ARE numeric in ISO
    },
    "CN": {  # China: GeoNames uses 2-digit, ISO uses 2-letter
        "01": "AH", "02": "ZJ", "03": "JX", "04": "JS", "05": "JL", "06": "QH",
        "07": "FJ", "08": "HL", "09": "HN", "10": "HB", "11": "HN", "12": "SD",
        "13": "SX", "14": "SX", "15": "SC", "16": "YN", "18": "TW", "19": "NM",
        "20": "NX", "21": "BJ", "22": "TJ", "23": "SH", "24": "CQ", "25": "HI",
        "26": "GX", "28": "XZ", "29": "XJ", "30": "GD", "31": "MO", "32": "HK",
        "33": "GS", "34": "GZ", "35": "LN", "36": "SN",
    },
    "EG": {  # Egypt: GeoNames uses 2-digit, ISO uses 2-3 letter
        "01": "C", "02": "ALX", "03": "SHG", "04": "ASN", "05": "AST",
        "06": "BH", "07": "BNS", "08": "DK", "09": "DT", "10": "FYM",
        "11": "GH", "12": "GZ", "14": "IS", "15": "KFS", "16": "MN",
        "17": "MNF", "18": "MT", "19": "KN", "20": "SIN", "21": "WAD",
        "22": "JS", "23": "KB", "24": "SHR", "25": "SUZ", "26": "BA",
        "27": "PTS", "28": "LX", "29": "SU",
    },
}


def fix_file(filepath: Path, country: str) -> tuple[bool, str]:
    """Rewrite the region segment of one custodian file's GHCID and rename it.

    The file is parsed as YAML, the ``<country>-<region>-<rest>`` value in
    ``ghcid.ghcid_current`` is remapped through ``COUNTRY_MAPPINGS[country]``,
    the related fields are updated, the file is rewritten in place and then
    renamed to ``<new ghcid>.yaml`` inside ``CUSTODIAN_DIR``.

    Args:
        filepath: Path of the YAML file to fix.
        country: Two-letter country code selecting the mapping table.

    Returns:
        ``(True, "<old ghcid> -> <new ghcid>")`` when the file was changed,
        otherwise ``(False, reason)`` where ``reason`` is one of
        ``"No mapping for <country>"``, ``"No GHCID"``, ``"Invalid format"``,
        ``"No mapping for region <code>"``, ``"Already correct"`` or
        ``"COLLISION: <new ghcid>"``. Nothing is written in the ``False`` case.

    Raises:
        OSError: If the file cannot be read, written or renamed.
        yaml.YAMLError: If the file is not valid YAML.
    """
    mapping = COUNTRY_MAPPINGS.get(country)
    if mapping is None:
        return False, f"No mapping for {country}"

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty document, a non-mapping document, or a null/scalar `ghcid`
    # entry all mean there is no GHCID record to fix.
    ghcid_block = data.get('ghcid') if isinstance(data, dict) else None
    if not isinstance(ghcid_block, dict):
        return False, "No GHCID"

    current = ghcid_block.get('ghcid_current')
    if not isinstance(current, str):
        # A missing or null `ghcid_current` cannot match the expected pattern.
        return False, "Invalid format"

    match = re.match(rf'^{re.escape(country)}-([A-Z0-9]{{1,3}})-(.+)$', current)
    if not match:
        return False, "Invalid format"

    old_region, rest = match.groups()
    if old_region not in mapping:
        return False, f"No mapping for region {old_region}"

    new_region = mapping[old_region]
    if old_region == new_region:
        return False, "Already correct"

    new_ghcid = f"{country}-{new_region}-{rest}"
    new_filepath = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

    # Never overwrite a different record that already owns the target name.
    if new_filepath.exists() and new_filepath != filepath:
        return False, f"COLLISION: {new_ghcid}"

    timestamp = datetime.now(timezone.utc).isoformat()
    ghcid_block['ghcid_current'] = new_ghcid

    # `location_resolution:` may be present but null in the YAML.
    location_resolution = ghcid_block.get('location_resolution')
    if isinstance(location_resolution, dict):
        location_resolution['region_code'] = new_region

    # Newest history entry goes first; a missing or null list starts empty.
    if ghcid_block.get('ghcid_history') is None:
        ghcid_block['ghcid_history'] = []
    ghcid_block['ghcid_history'].insert(0, {
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f"Fixed region code: {old_region} -> {new_region} (ISO 3166-2:{country})"
    })

    # Only touch location.region_code when it still carries the old code.
    location = data.get('location')
    if isinstance(location, dict) and location.get('region_code') == old_region:
        location['region_code'] = new_region

    # `identifiers:` may be null, and entries are only updated if they are
    # mappings tagged with the GHCID scheme.
    for ident in data.get('identifiers') or []:
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
            ident['identifier_value'] = new_ghcid

    # Serialize before opening the file for writing, so a serialization
    # failure cannot leave a truncated file behind.
    serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    if new_filepath != filepath:
        os.rename(filepath, new_filepath)

    return True, f"{current} -> {new_ghcid}"


def main():
    """Fix every custodian file whose filename carries a remappable region code.

    For each country in ``COUNTRY_MAPPINGS`` the ``<country>-*.yaml`` files in
    ``CUSTODIAN_DIR`` are scanned; files whose filename region maps to a
    different code are passed to ``fix_file``. Per-file results and overall
    totals are printed. A file that cannot be read, parsed or written is
    counted as an error and does not stop the run.
    """
    total_fixed = 0
    total_collisions = 0
    total_errors = 0

    for country, mapping in COUNTRY_MAPPINGS.items():
        # Materialize the listing up front: files renamed during this pass
        # must not be picked up again.
        files = sorted(CUSTODIAN_DIR.glob(f"{country}-*.yaml"))
        if not files:
            continue

        print(f"\n=== {country} ({len(files)} files) ===")
        region_re = re.compile(rf'^{re.escape(country)}-([A-Z0-9]{{1,3}})-')
        fixed = 0
        collisions = 0
        errors = 0

        for path in files:
            match = region_re.match(path.name)
            if not match:
                continue
            old_region = match.group(1)
            # Skip regions that are unmapped or already map to themselves.
            if mapping.get(old_region, old_region) == old_region:
                continue

            try:
                success, msg = fix_file(path, country)
            except (OSError, yaml.YAMLError) as exc:
                # One unreadable or malformed file must not abort the batch.
                print(f" Error: {path.name}: {exc}")
                errors += 1
                continue

            if success:
                print(f" Fixed: {msg}")
                fixed += 1
            elif "COLLISION" in msg:
                print(f" {msg}")
                collisions += 1
            elif "Already" not in msg and "No mapping" not in msg:
                print(f" Error: {path.name}: {msg}")
                errors += 1

        if fixed > 0 or collisions > 0:
            print(f" Summary: Fixed {fixed}, Collisions {collisions}")

        total_fixed += fixed
        total_collisions += collisions
        total_errors += errors

    print("\n=== TOTAL ===")
    print(f"Fixed: {total_fixed}, Collisions: {total_collisions}, Errors: {total_errors}")


if __name__ == "__main__":
    main()