170 lines
5.8 KiB
Python
170 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Fix Argentina region codes from numeric/invalid codes to ISO 3166-2:AR codes."""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Directory holding the custodian YAML files. main() globs "AR-*.yaml" here and
# fix_file() builds the renamed file's path inside this same directory.
# NOTE(review): hard-coded absolute path to one user's checkout — adjust before
# running on another machine.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
|
|
|
|
# Argentina region-code normalisation table.
#
# Keys are region tokens that may appear in a GHCID ("AR-<region>-...");
# values are the ISO 3166-2:AR subdivision letter the token is rewritten to.
# Reference: https://en.wikipedia.org/wiki/ISO_3166-2:AR
#
# GeoNames admin1 codes for Argentina are the 2-digit FIPS 10-4 codes
# (see GeoNames admin1CodesASCII.txt, rows "AR.01" .. "AR.24").  That list is
# alphabetical with the federal district at 07 ("Buenos Aires F.D."), so the
# Ciudad Autónoma de Buenos Aires is 07 -- not 24 -- and 24 is Tucumán.
AR_CODE_MAPPING = {
    # Numeric GeoNames (FIPS 10-4) admin1 codes to ISO
    "01": "B",  # Buenos Aires Province
    "02": "K",  # Catamarca
    "03": "H",  # Chaco
    "04": "U",  # Chubut
    "05": "X",  # Córdoba
    "06": "W",  # Corrientes
    "07": "C",  # Ciudad Autónoma de Buenos Aires (Buenos Aires F.D.)
    "08": "E",  # Entre Ríos
    "09": "P",  # Formosa
    "10": "Y",  # Jujuy
    "11": "L",  # La Pampa
    "12": "F",  # La Rioja
    "13": "M",  # Mendoza
    "14": "N",  # Misiones
    "15": "Q",  # Neuquén
    "16": "R",  # Río Negro
    "17": "A",  # Salta
    "18": "J",  # San Juan
    "19": "D",  # San Luis
    "20": "Z",  # Santa Cruz
    "21": "S",  # Santa Fe
    "22": "G",  # Santiago del Estero
    "23": "V",  # Tierra del Fuego
    "24": "T",  # Tucumán
    # Invalid 2-letter codes that need fixing.
    # NOTE(review): some of these prefixes are ambiguous between provinces
    # (e.g. "CH" Chaco/Chubut, "CO" Córdoba/Corrientes, "SA" Salta/San Juan/
    # Santa Cruz/Santa Fe, "LA" La Pampa/La Rioja).  The targets below are the
    # ones this script has always used -- confirm against the data.
    "BU": "B",  # Buenos Aires (should be single letter)
    "SA": "A",  # Salta (should be single letter)
    "EN": "E",  # Entre Ríos (should be single letter)
    "CI": "C",  # CABA (should be single letter)
    "LA": "L",  # La Pampa (should be single letter)
    "CO": "X",  # Córdoba (should be single letter X, not CO)
    "ME": "M",  # Mendoza (should be single letter)
    "CF": "C",  # Ciudad Federal - alias for CABA
    "TU": "T",  # Tucumán (should be single letter)
    "SF": "S",  # Santa Fe (should be single letter)
    "MI": "N",  # Misiones (should be N, not MI)
    "RI": "R",  # Río Negro (should be single letter)
    "NE": "Q",  # Neuquén (should be Q, not NE)
    "JU": "Y",  # Jujuy (should be Y, not JU)
    "CH": "U",  # Chubut (should be U, not CH)
    "CA": "K",  # Catamarca (should be K, not CA)
    # Already correct single-letter codes (identity entries, so a lookup
    # distinguishes "known and already ISO" from "unknown")
    "A": "A", "B": "B", "C": "C", "D": "D", "E": "E", "F": "F", "G": "G",
    "H": "H", "J": "J", "K": "K", "L": "L", "M": "M", "N": "N", "P": "P",
    "Q": "Q", "R": "R", "S": "S", "T": "T", "U": "U", "V": "V", "W": "W",
    "X": "X", "Y": "Y", "Z": "Z",
}
|
|
|
|
def fix_file(filepath: Path) -> tuple[bool, str]:
    """Rewrite one custodian YAML file so its GHCID uses an ISO 3166-2:AR region.

    The region token of ``ghcid.ghcid_current`` (``AR-<region>-<rest>``) is
    translated through ``AR_CODE_MAPPING``.  When the token changes:

    * ``ghcid.ghcid_current`` is set to the new GHCID;
    * ``ghcid.location_resolution.region_code`` is set to the new region
      (when ``location_resolution`` is a mapping);
    * a history entry is prepended to ``ghcid.ghcid_history``;
    * ``location.region_code`` is updated if it still equals the old token;
    * every ``identifiers`` entry with scheme ``GHCID`` gets the new value;
    * the file is rewritten and renamed to ``<new GHCID>.yaml`` inside
      ``CUSTODIAN_DIR``.

    Args:
        filepath: Path of the YAML file to fix.

    Returns:
        ``(True, "<old GHCID> -> <new GHCID>")`` when the file was rewritten,
        otherwise ``(False, reason)`` with reason ``"No GHCID"``,
        ``"Invalid format"``, ``"Unknown region code <code>"``,
        ``"Already correct"`` or ``"COLLISION: <new GHCID>"``.  Nothing is
        written to disk in the ``False`` cases.

    Raises:
        OSError: If the file cannot be read, written or renamed.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f.read())

    # A null or non-mapping document / ghcid section cannot be fixed; report
    # it instead of raising AttributeError further down.
    ghcid = data.get('ghcid') if isinstance(data, dict) else None
    if not isinstance(ghcid, dict):
        return False, "No GHCID"

    current = ghcid.get('ghcid_current', '')
    if not isinstance(current, str):
        # e.g. ``ghcid_current: null`` -- re.match() would raise TypeError
        return False, "Invalid format"

    match = re.match(r'^AR-([A-Z0-9]{1,2})-(.+)$', current)
    if not match:
        return False, "Invalid format"

    old_region = match.group(1)
    rest = match.group(2)

    if old_region not in AR_CODE_MAPPING:
        return False, f"Unknown region code {old_region}"

    new_region = AR_CODE_MAPPING[old_region]
    if old_region == new_region:
        return False, "Already correct"

    new_ghcid = f"AR-{new_region}-{rest}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = CUSTODIAN_DIR / new_filename

    # Check for collision: never overwrite a different, already existing file
    if new_filepath.exists() and new_filepath != filepath:
        return False, f"COLLISION: {new_ghcid}"

    # Update GHCID
    timestamp = datetime.now(timezone.utc).isoformat()
    ghcid['ghcid_current'] = new_ghcid

    # Update location_resolution (skipped when the key is absent or null)
    location_resolution = ghcid.get('location_resolution')
    if isinstance(location_resolution, dict):
        location_resolution['region_code'] = new_region

    # Add history entry, newest first; a missing or null history starts empty
    if ghcid.get('ghcid_history') is None:
        ghcid['ghcid_history'] = []
    ghcid['ghcid_history'].insert(0, {
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f"Fixed region code: {old_region} -> {new_region} (ISO 3166-2:AR)"
    })

    # Update location.region_code only if it still carries the old token
    location = data.get('location')
    if isinstance(location, dict) and location.get('region_code') == old_region:
        location['region_code'] = new_region

    # Update identifiers (tolerate a null list and non-mapping entries)
    for ident in data.get('identifiers') or []:
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
            ident['identifier_value'] = new_ghcid

    # Serialise before opening the file for writing, so a dump failure cannot
    # leave the original file truncated.
    serialized = yaml.dump(
        data, allow_unicode=True, default_flow_style=False, sort_keys=False
    )
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    # Rename file so the filename matches the new GHCID
    if new_filepath != filepath:
        os.rename(filepath, new_filepath)

    return True, f"{current} -> {new_ghcid}"
|
|
|
|
def main():
    """Fix the region code of every ``AR-*.yaml`` file in ``CUSTODIAN_DIR``.

    Files are processed in sorted order.  Only files whose filename region
    token is a key of ``AR_CODE_MAPPING`` that maps to a different value are
    passed to ``fix_file``; every other file is skipped silently.  Each
    outcome is printed, followed by a summary line and, if any, the list of
    filename collisions that need manual resolution.

    A file that cannot be read, parsed or renamed is reported as an error
    and the run continues with the next file.
    """
    files = list(CUSTODIAN_DIR.glob("AR-*.yaml"))
    print(f"Found {len(files)} Argentina files")

    fixed = 0
    errors = 0
    collisions = []

    # Compiled once; applied to every filename in the loop below.
    region_pattern = re.compile(r'^AR-([A-Z0-9]{1,2})-')

    for f in sorted(files):
        # Check if region needs correction
        match = region_pattern.match(f.name)
        if not match:
            continue

        old_region = match.group(1)
        # Only numeric or invalid 2-letter codes need fixing; unknown codes
        # and codes that already map to themselves are left alone.
        if AR_CODE_MAPPING.get(old_region, old_region) == old_region:
            continue

        try:
            success, msg = fix_file(f)
        except (OSError, yaml.YAMLError) as exc:
            # One unreadable or malformed file must not abort the whole batch.
            success, msg = False, f"{type(exc).__name__}: {exc}"

        if success:
            print(f" Fixed: {msg}")
            fixed += 1
        elif "COLLISION" in msg:
            print(f" {msg}")
            collisions.append((f.name, msg))
        else:
            print(f" Error: {f.name}: {msg}")
            errors += 1

    print(f"\nSummary: Fixed {fixed}, Collisions {len(collisions)}, Errors {errors}")

    if collisions:
        print("\nCollisions to resolve:")
        for name, msg in collisions:
            print(f" {name}: {msg}")
|
|
|
|
# Run the migration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|