#!/usr/bin/env python3
"""Fix Argentina region codes from numeric/invalid codes to ISO 3166-2:AR codes."""

import os
import re
from datetime import datetime, timezone
from pathlib import Path

import yaml

CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# GHCID as stored in the YAML: "AR-<1-2 char region>-<rest>".
GHCID_PATTERN = re.compile(r'^AR-([A-Z0-9]{1,2})-(.+)$')
# Same region prefix, matched against the file name.
FILENAME_PATTERN = re.compile(r'^AR-([A-Z0-9]{1,2})-')

# Argentina GeoNames admin1 -> ISO 3166-2:AR mapping
# Reference: https://en.wikipedia.org/wiki/ISO_3166-2:AR
# GeoNames uses 2-digit numeric codes that follow the FIPS 10-4 numbering
# (https://download.geonames.org/export/dump/admin1CodesASCII.txt).
# In that numbering 07 is "Buenos Aires F.D." (the autonomous city), so the
# provinces after Corrientes are shifted by one compared with a plain
# alphabetical list, and 24 is Tucumán.
AR_CODE_MAPPING = {
    # Numeric GeoNames codes to ISO
    "01": "B",  # Buenos Aires Province
    "02": "K",  # Catamarca
    "03": "H",  # Chaco
    "04": "U",  # Chubut
    "05": "X",  # Córdoba
    "06": "W",  # Corrientes
    "07": "C",  # Ciudad Autónoma de Buenos Aires (GeoNames: Buenos Aires F.D.)
    "08": "E",  # Entre Ríos
    "09": "P",  # Formosa
    "10": "Y",  # Jujuy
    "11": "L",  # La Pampa
    "12": "F",  # La Rioja
    "13": "M",  # Mendoza
    "14": "N",  # Misiones
    "15": "Q",  # Neuquén
    "16": "R",  # Río Negro
    "17": "A",  # Salta
    "18": "J",  # San Juan
    "19": "D",  # San Luis
    "20": "Z",  # Santa Cruz
    "21": "S",  # Santa Fe
    "22": "G",  # Santiago del Estero
    "23": "V",  # Tierra del Fuego
    "24": "T",  # Tucumán
    # Invalid 2-letter codes that need fixing
    # NOTE(review): several of these prefixes are ambiguous (e.g. CH could be
    # Chaco or Chubut, CO Córdoba or Corrientes, LA La Pampa or La Rioja,
    # SA Salta/San Juan/San Luis/...). The choices below are kept as they
    # were; confirm them against the actual records before running.
    "BU": "B",  # Buenos Aires (should be single letter)
    "SA": "A",  # Salta (should be single letter)
    "EN": "E",  # Entre Ríos (should be single letter)
    "CI": "C",  # CABA (should be single letter)
    "LA": "L",  # La Pampa (should be single letter)
    "CO": "X",  # Córdoba (should be single letter X, not CO)
    "ME": "M",  # Mendoza (should be single letter)
    "CF": "C",  # Ciudad Federal - alias for CABA
    "TU": "T",  # Tucumán (should be single letter)
    "SF": "S",  # Santa Fe (should be single letter)
    "MI": "N",  # Misiones (should be N, not MI)
    "RI": "R",  # Río Negro (should be single letter)
    "NE": "Q",  # Neuquén (should be Q, not NE)
    "JU": "Y",  # Jujuy (should be Y, not JU)
    "CH": "U",  # Chubut (should be U, not CH)
    "CA": "K",  # Catamarca (should be K, not CA)
    # Already correct single-letter codes
    "A": "A", "B": "B", "C": "C", "D": "D", "E": "E", "F": "F",
    "G": "G", "H": "H", "J": "J", "K": "K", "L": "L", "M": "M",
    "N": "N", "P": "P", "Q": "Q", "R": "R", "S": "S", "T": "T",
    "U": "U", "V": "V", "W": "W", "X": "X", "Y": "Y", "Z": "Z",
}


def fix_file(filepath: Path) -> tuple[bool, str]:
    """Fix Argentina region code in file.

    Loads the YAML document at ``filepath``, rewrites the region segment of
    ``ghcid.ghcid_current`` through ``AR_CODE_MAPPING``, updates the related
    fields (``ghcid.location_resolution.region_code``, ``ghcid.ghcid_history``,
    ``location.region_code`` and any ``identifiers`` entry whose scheme is
    ``GHCID``), writes the document back and renames the file to
    ``<new ghcid>.yaml`` inside ``CUSTODIAN_DIR``.

    Args:
        filepath: Path of the custodian YAML file to fix.

    Returns:
        ``(True, "<old ghcid> -> <new ghcid>")`` when the file was changed,
        otherwise ``(False, reason)``. A reason starting with ``COLLISION``
        means the target file name already exists; nothing is written then.

    Raises:
        OSError: If the file cannot be read, written or renamed.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    data = yaml.safe_load(content)

    # Guard against empty documents, non-mapping documents and a null or
    # scalar `ghcid` entry instead of failing on attribute access below.
    ghcid_block = data.get('ghcid') if isinstance(data, dict) else None
    if not isinstance(ghcid_block, dict):
        return False, "No GHCID"

    current = ghcid_block.get('ghcid_current') or ''
    match = GHCID_PATTERN.match(current) if isinstance(current, str) else None
    if not match:
        return False, "Invalid format"

    old_region, rest = match.groups()

    if old_region not in AR_CODE_MAPPING:
        return False, f"Unknown region code {old_region}"

    new_region = AR_CODE_MAPPING[old_region]
    if old_region == new_region:
        return False, "Already correct"

    new_ghcid = f"AR-{new_region}-{rest}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = CUSTODIAN_DIR / new_filename

    # Check for collision
    if new_filepath.exists() and new_filepath != filepath:
        return False, f"COLLISION: {new_ghcid}"

    # Update GHCID
    timestamp = datetime.now(timezone.utc).isoformat()
    ghcid_block['ghcid_current'] = new_ghcid

    # Update location_resolution (skipped when the entry is null/not a mapping)
    location_resolution = ghcid_block.get('location_resolution')
    if isinstance(location_resolution, dict):
        location_resolution['region_code'] = new_region

    # Add history entry, newest first; a missing or null history starts empty
    if ghcid_block.get('ghcid_history') is None:
        ghcid_block['ghcid_history'] = []
    ghcid_block['ghcid_history'].insert(0, {
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f"Fixed region code: {old_region} -> {new_region} (ISO 3166-2:AR)"
    })

    # Update location.region_code if present and still holding the old code
    location = data.get('location')
    if isinstance(location, dict) and location.get('region_code') == old_region:
        location['region_code'] = new_region

    # Update identifiers
    for ident in data.get('identifiers') or []:
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
            ident['identifier_value'] = new_ghcid

    # Serialise before opening the file for writing, so a dump failure cannot
    # leave a truncated file behind.
    serialized = yaml.dump(
        data, allow_unicode=True, default_flow_style=False, sort_keys=False
    )
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    # Rename file
    if new_filepath != filepath:
        os.rename(filepath, new_filepath)

    return True, f"{current} -> {new_ghcid}"


def main():
    """Fix every ``AR-*.yaml`` file in ``CUSTODIAN_DIR`` and print a summary.

    Only files whose name carries a region code that ``AR_CODE_MAPPING``
    translates to a different code are passed to ``fix_file``. Collisions are
    collected and listed at the end; any other failure counts as an error.
    """
    files = list(CUSTODIAN_DIR.glob("AR-*.yaml"))
    print(f"Found {len(files)} Argentina files")

    fixed = 0
    errors = 0
    collisions = []

    for path in sorted(files):
        # Check if region needs correction
        match = FILENAME_PATTERN.match(path.name)
        if not match:
            continue
        old_region = match.group(1)

        # Skip codes that are unknown or already map to themselves
        # (only numeric or invalid 2-letter codes need fixing).
        if AR_CODE_MAPPING.get(old_region, old_region) == old_region:
            continue

        try:
            success, msg = fix_file(path)
        except (OSError, yaml.YAMLError) as exc:
            # One unreadable or unparsable file must not abort the batch.
            print(f" Error: {path.name}: {exc}")
            errors += 1
            continue

        if success:
            print(f" Fixed: {msg}")
            fixed += 1
        elif "COLLISION" in msg:
            print(f" {msg}")
            collisions.append((path.name, msg))
        else:
            print(f" Error: {path.name}: {msg}")
            errors += 1

    print(f"\nSummary: Fixed {fixed}, Collisions {len(collisions)}, Errors {errors}")

    if collisions:
        print("\nCollisions to resolve:")
        for name, msg in collisions:
            print(f" {name}: {msg}")


if __name__ == "__main__":
    main()