#!/usr/bin/env python3
"""Fix Belarus region codes - additional corrections for GO, GR, MO."""

import os
import re
from datetime import datetime, timezone
from pathlib import Path

import yaml

CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Belarus corrections for letter codes
BY_CODE_MAPPING = {
    "GO": "HO",  # Gomel → Homel (ISO uses Belarusian Romanization)
    "GR": "HR",  # Grodno → Hrodna (ISO uses Belarusian Romanization)
    "MO": "MA",  # Mogilev → Mahilyow (ISO uses Belarusian Romanization)
}

# Compiled once; main() and fix_file() apply them to every candidate file.
_GHCID_RE = re.compile(r'^BY-([A-Z]{2})-(.+)$')
_FILENAME_RE = re.compile(r'^BY-([A-Z]{2})-')


def fix_file(filepath: Path) -> tuple[bool, str]:
    """Rewrite the Belarus region code of one custodian YAML file.

    The record's ``ghcid.ghcid_current`` is parsed as ``BY-<region>-<rest>``.
    When ``<region>`` is a key of ``BY_CODE_MAPPING`` the GHCID is rebuilt
    with the mapped code, the related fields inside the document are
    updated, a history entry is prepended, and the file is renamed to
    ``<new ghcid>.yaml`` inside ``CUSTODIAN_DIR``.

    Args:
        filepath: Path of the YAML file to process.

    Returns:
        ``(True, "<old ghcid> -> <new ghcid>")`` when the file was rewritten,
        otherwise ``(False, reason)`` where ``reason`` is one of
        ``"No GHCID"``, ``"Invalid format"``,
        ``"No correction needed for <region>"`` or ``"COLLISION: <new ghcid>"``.
        Nothing is written when the first element is ``False``.

    Raises:
        OSError: If the file cannot be read, written or renamed.
        yaml.YAMLError: If the file content is not valid YAML.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    data = yaml.safe_load(content)
    # A non-mapping document or a null/non-mapping ``ghcid`` section cannot
    # be corrected; report it instead of failing on attribute access.
    if not isinstance(data, dict) or not isinstance(data.get('ghcid'), dict):
        return False, "No GHCID"
    ghcid = data['ghcid']

    current = ghcid.get('ghcid_current', '')
    # ``ghcid_current: null`` (or any non-string) would make the regex raise.
    match = _GHCID_RE.match(current) if isinstance(current, str) else None
    if not match:
        return False, "Invalid format"

    old_region = match.group(1)
    rest = match.group(2)

    if old_region not in BY_CODE_MAPPING:
        return False, f"No correction needed for {old_region}"

    new_region = BY_CODE_MAPPING[old_region]
    new_ghcid = f"BY-{new_region}-{rest}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = CUSTODIAN_DIR / new_filename

    # Check for collision: never overwrite a different, already existing record.
    if new_filepath.exists() and new_filepath != filepath:
        return False, f"COLLISION: {new_ghcid}"

    # Update GHCID
    timestamp = datetime.now(timezone.utc).isoformat()
    ghcid['ghcid_current'] = new_ghcid

    # Update location_resolution (skipped when the key holds null).
    location_resolution = ghcid.get('location_resolution')
    if isinstance(location_resolution, dict):
        location_resolution['region_code'] = new_region

    # Add history entry, newest first; a missing or null history starts empty.
    history = ghcid.get('ghcid_history')
    if history is None:
        history = []
        ghcid['ghcid_history'] = history
    history.insert(0, {
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f"Fixed region code: {old_region} -> {new_region} (ISO 3166-2:BY uses Belarusian Romanization)"
    })

    # Update location.region_code if present and still holding the old code.
    location = data.get('location')
    if isinstance(location, dict) and location.get('region_code') == old_region:
        location['region_code'] = new_region

    # Update identifiers; null lists and non-mapping entries are ignored.
    for ident in data.get('identifiers') or []:
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
            ident['identifier_value'] = new_ghcid

    # Serialize before opening the file for writing so that a serialization
    # failure cannot leave a truncated record on disk.
    serialized = yaml.dump(
        data,
        allow_unicode=True,
        default_flow_style=False,
        sort_keys=False,
    )
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    # Rename file so its name matches the corrected GHCID.
    if new_filepath != filepath:
        os.rename(filepath, new_filepath)

    return True, f"{current} -> {new_ghcid}"


def main():
    """Apply ``fix_file`` to every affected ``BY-*.yaml`` file and print a summary.

    Only files whose name starts with ``BY-<code>-`` where ``<code>`` is a key
    of ``BY_CODE_MAPPING`` are processed. A file that cannot be read or parsed
    is reported and counted as an error so the rest of the batch still runs.
    """
    files = list(CUSTODIAN_DIR.glob("BY-*.yaml"))
    print(f"Found {len(files)} Belarus files")

    fixed = 0
    errors = 0
    collisions = 0

    for path in sorted(files):
        match = _FILENAME_RE.match(path.name)
        if not match or match.group(1) not in BY_CODE_MAPPING:
            continue

        try:
            success, msg = fix_file(path)
        except (OSError, UnicodeError, yaml.YAMLError) as exc:
            # One broken file must not abort the remaining corrections.
            print(f" Skip: {path.name}: {exc}")
            errors += 1
            continue

        if success:
            print(f" Fixed: {msg}")
            fixed += 1
        elif "COLLISION" in msg:
            print(f" {msg}")
            collisions += 1
        else:
            print(f" Skip: {path.name}: {msg}")
            errors += 1

    print(f"\nSummary: Fixed {fixed}, Collisions {collisions}, Errors {errors}")


if __name__ == "__main__":
    main()