#!/usr/bin/env python3
"""Fix Belarus region codes from numeric GeoNames codes to ISO 3166-2:BY codes."""

import os
import re
from datetime import datetime, timezone
from pathlib import Path

import yaml

CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Belarus GeoNames admin1 -> ISO 3166-2:BY mapping
# Reference: https://en.wikipedia.org/wiki/ISO_3166-2:BY
#
# NOTE(review): the published GeoNames admin1 list appears to use
# 04 = Minsk City, 05 = Minsk Oblast, 06 = Mogilev, 07 = Vitebsk, which does
# not match the 04-07 rows below. The values are left untouched here because
# they may reflect how this particular dataset was coded -- confirm against
# the source data before the next run.
BY_CODE_MAPPING = {
    "00": "XX",  # Unknown - needs manual review
    "01": "BR",  # Brest Oblast
    "02": "HO",  # Gomel (Homel) Oblast
    "03": "HR",  # Grodno (Hrodna) Oblast
    "04": "MI",  # Minsk Oblast (region around Minsk)
    "05": "MA",  # Mogilev (Mahilyow) Oblast
    "06": "VI",  # Vitebsk (Vitsebsk) Oblast
    "07": "HR",  # Grodno (Hrodna) Oblast - some GeoNames use 07
    "HM": "HM",  # Minsk City (already correct)
    # Letter codes that are already correct
    "BR": "BR",
    "HO": "HO",
    "HR": "HR",
    "MA": "MA",
    "MI": "MI",
    "VI": "VI",
}

# "BY-<two-character region>-<rest>" as stored in ghcid_current.
_GHCID_PATTERN = re.compile(r'^BY-([A-Z0-9]{2})-(.+)$')
# File names whose region segment is still a two-digit numeric code.
_NUMERIC_REGION_FILENAME = re.compile(r'^BY-([0-9]{2})-')


def fix_file(filepath: Path) -> tuple[bool, str]:
    """Rewrite the region code of one Belarus custodian record and rename it.

    The record's ``ghcid.ghcid_current`` is parsed, its region segment is
    translated through ``BY_CODE_MAPPING``, and the new GHCID is written to
    ``ghcid_current``, ``location_resolution.region_code``,
    ``location.region_code`` (only when it equals the old code), any
    ``identifiers`` entry with scheme ``GHCID``, and a new first entry of
    ``ghcid_history``. The file is then renamed to ``<new GHCID>.yaml`` inside
    ``CUSTODIAN_DIR``.

    Args:
        filepath: Path of the YAML record to fix.

    Returns:
        ``(True, "<old> -> <new>")`` when the file was rewritten, otherwise
        ``(False, reason)``. A reason starting with ``COLLISION`` means the
        target file name already exists; nothing is written in that case.

    Raises:
        OSError: If the file cannot be read, written, or renamed.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # A single unparsable record should be reported, not abort the whole batch.
    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as exc:
        return False, f"Invalid YAML ({type(exc).__name__})"

    if not isinstance(data, dict) or 'ghcid' not in data:
        return False, "No GHCID"

    ghcid = data['ghcid']
    if not isinstance(ghcid, dict):
        return False, "No GHCID"

    # ghcid_current may be missing or explicitly null in the YAML.
    current = ghcid.get('ghcid_current') or ''
    if not isinstance(current, str):
        return False, "Invalid format"

    match = _GHCID_PATTERN.match(current)
    if not match:
        return False, "Invalid format"

    old_region = match.group(1)
    rest = match.group(2)

    if old_region not in BY_CODE_MAPPING:
        return False, f"Unknown region code {old_region}"

    new_region = BY_CODE_MAPPING[old_region]
    if old_region == new_region:
        return False, "Already correct"

    new_ghcid = f"BY-{new_region}-{rest}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = CUSTODIAN_DIR / new_filename

    # Check for collision
    if new_filepath.exists() and new_filepath != filepath:
        return False, f"COLLISION: {new_ghcid}"

    # Validate the history container before mutating anything: a null value is
    # treated like a missing key, any other non-list shape is left alone.
    history = ghcid.get('ghcid_history')
    if history is not None and not isinstance(history, list):
        return False, "Invalid ghcid_history"

    # Update GHCID
    timestamp = datetime.now(timezone.utc).isoformat()
    ghcid['ghcid_current'] = new_ghcid

    # Update location_resolution (skipped when the key holds null / a scalar)
    if isinstance(ghcid.get('location_resolution'), dict):
        ghcid['location_resolution']['region_code'] = new_region

    # Add history entry, newest first
    if history is None:
        history = []
        ghcid['ghcid_history'] = history
    history.insert(0, {
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f"Fixed region code: {old_region} -> {new_region} (ISO 3166-2:BY)"
    })

    # Update location.region_code if present
    if 'location' in data and isinstance(data['location'], dict):
        if data['location'].get('region_code') == old_region:
            data['location']['region_code'] = new_region

    # Update identifiers
    identifiers = data.get('identifiers')
    if isinstance(identifiers, list):
        for ident in identifiers:
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
                ident['identifier_value'] = new_ghcid

    # Serialize before opening the file for writing so that a dump failure
    # cannot leave a truncated record on disk.
    serialized = yaml.dump(
        data,
        allow_unicode=True,
        default_flow_style=False,
        sort_keys=False,
    )
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    # Rename file
    if new_filepath != filepath:
        os.rename(filepath, new_filepath)

    return True, f"{current} -> {new_ghcid}"


def main():
    """Fix every ``BY-<two digits>-*.yaml`` record in ``CUSTODIAN_DIR``.

    Files whose region segment is not a two-digit numeric code are ignored.
    Prints one line per processed file and a final summary of fixed files,
    collisions, and other skips (counted as errors).
    """
    files = sorted(CUSTODIAN_DIR.glob("BY-*.yaml"))
    print(f"Found {len(files)} Belarus files")

    fixed = 0
    errors = 0
    collisions = 0

    for f in files:
        # Check if region needs correction (numeric codes)
        if not _NUMERIC_REGION_FILENAME.match(f.name):
            continue

        success, msg = fix_file(f)
        if success:
            print(f"  Fixed: {msg}")
            fixed += 1
        elif "COLLISION" in msg:
            print(f"  {msg}")
            collisions += 1
        else:
            print(f"  Skip: {f.name}: {msg}")
            errors += 1

    print(f"\nSummary: Fixed {fixed}, Collisions {collisions}, Errors {errors}")


if __name__ == "__main__":
    main()