glam/scripts/fix_belarus_region_codes.py
2025-12-10 13:01:13 +01:00

125 lines
4 KiB
Python

#!/usr/bin/env python3
"""Fix Belarus region codes from numeric GeoNames codes to ISO 3166-2:BY codes."""
import os
import re
import yaml
from datetime import datetime, timezone
from pathlib import Path
# Directory holding the custodian YAML records. The script globs BY-*.yaml
# here and renames corrected files inside this same directory.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Belarus GeoNames admin1 -> ISO 3166-2:BY mapping
# Reference: https://en.wikipedia.org/wiki/ISO_3166-2:BY
# Keys are the two-character region segment found in an existing GHCID;
# values are the region segment to write. Identity entries (letter codes)
# make fix_file() report "Already correct" instead of "Unknown region code".
# NOTE(review): "03" and "07" both map to HR. The GeoNames admin1 list for
# Belarus may number the regions differently (possibly 04 = Minsk City,
# 05 = Minsk Oblast, 06 = Mogilev, 07 = Vitebsk) -- confirm this table
# against GeoNames admin1CodesASCII.txt before running the migration.
BY_CODE_MAPPING = {
    "00": "XX",  # Unknown - needs manual review
    "01": "BR",  # Brest Oblast
    "02": "HO",  # Gomel (Homel) Oblast
    "03": "HR",  # Grodno (Hrodna) Oblast
    "04": "MI",  # Minsk Oblast (region around Minsk)
    "05": "MA",  # Mogilev (Mahilyow) Oblast
    "06": "VI",  # Vitebsk (Vitsebsk) Oblast
    "07": "HR",  # Grodno (Hrodna) Oblast - some GeoNames use 07
    "HM": "HM",  # Minsk City (already correct)
    # Letter codes that are already correct
    "BR": "BR", "HO": "HO", "HR": "HR", "MA": "MA", "MI": "MI", "VI": "VI",
}
def fix_file(filepath: Path) -> tuple[bool, str]:
    """Rewrite one custodian record so its GHCID carries the mapped region code.

    The record's ``ghcid.ghcid_current`` must look like ``BY-<RR>-<rest>``.
    ``<RR>`` is translated through ``BY_CODE_MAPPING``; when the translation
    differs, the GHCID, the region codes, the GHCID identifier entries and the
    GHCID history are updated, the YAML is rewritten, and the file is renamed
    to ``<new GHCID>.yaml`` inside ``CUSTODIAN_DIR``.

    Args:
        filepath: Path of the YAML record to fix.

    Returns:
        ``(True, "<old> -> <new>")`` when the file was changed, otherwise
        ``(False, reason)`` where the reason is one of ``"No GHCID"``,
        ``"Invalid format"``, ``"Unknown region code ..."``,
        ``"Already correct"`` or ``"COLLISION: <new GHCID>"``.

    Raises:
        OSError: If the file cannot be read, written or renamed.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    data = yaml.safe_load(content)

    # A record whose root or 'ghcid' entry is not a mapping cannot carry a
    # GHCID; report it rather than crashing on attribute access.
    if not isinstance(data, dict) or 'ghcid' not in data:
        return False, "No GHCID"
    ghcid = data['ghcid']
    if not isinstance(ghcid, dict):
        return False, "No GHCID"

    # 'ghcid_current' may be present but null (or a non-string scalar).
    current = ghcid.get('ghcid_current') or ''
    if not isinstance(current, str):
        return False, "Invalid format"
    match = re.match(r'^BY-([A-Z0-9]{2})-(.+)$', current)
    if not match:
        return False, "Invalid format"

    old_region, rest = match.group(1), match.group(2)
    if old_region not in BY_CODE_MAPPING:
        return False, f"Unknown region code {old_region}"
    new_region = BY_CODE_MAPPING[old_region]
    if old_region == new_region:
        return False, "Already correct"

    new_ghcid = f"BY-{new_region}-{rest}"
    new_filepath = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

    # Check for collision: never overwrite a different, already existing record.
    if new_filepath.exists() and new_filepath != filepath:
        return False, f"COLLISION: {new_ghcid}"

    # Update GHCID
    timestamp = datetime.now(timezone.utc).isoformat()
    ghcid['ghcid_current'] = new_ghcid

    # Update location_resolution (only when it really is a mapping)
    location_resolution = ghcid.get('location_resolution')
    if isinstance(location_resolution, dict):
        location_resolution['region_code'] = new_region

    # Add history entry, newest first; a missing or null history starts empty.
    history = ghcid.get('ghcid_history')
    if history is None:
        history = ghcid['ghcid_history'] = []
    history.insert(0, {
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f"Fixed region code: {old_region} -> {new_region} (ISO 3166-2:BY)"
    })

    # Update location.region_code if present and still holding the old code
    location = data.get('location')
    if isinstance(location, dict) and location.get('region_code') == old_region:
        location['region_code'] = new_region

    # Update identifiers; tolerate a null list and non-mapping entries.
    for ident in data.get('identifiers') or []:
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'GHCID':
            ident['identifier_value'] = new_ghcid

    # Serialise before opening the file for writing, so a serialisation
    # failure cannot leave a truncated record on disk.
    serialized = yaml.dump(
        data, allow_unicode=True, default_flow_style=False, sort_keys=False
    )
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    # Rename file to match the new GHCID
    if new_filepath != filepath:
        os.rename(filepath, new_filepath)

    return True, f"{current} -> {new_ghcid}"
def main():
    """Fix every Belarus custodian file whose name carries a numeric region code.

    Scans ``CUSTODIAN_DIR`` for ``BY-*.yaml`` files, passes those named
    ``BY-<two digits>-...`` to ``fix_file`` in sorted order, prints one line
    per processed file and finishes with a summary of fixed files, collisions
    and errors. A file that cannot be read or parsed is reported and counted
    as an error instead of aborting the rest of the batch.
    """
    # Only files whose region segment is numeric need correction.
    numeric_region = re.compile(r'^BY-([0-9]{2})-')

    files = sorted(CUSTODIAN_DIR.glob("BY-*.yaml"))
    print(f"Found {len(files)} Belarus files")

    fixed = 0
    errors = 0
    collisions = 0
    for f in files:
        if not numeric_region.match(f.name):
            continue
        try:
            success, msg = fix_file(f)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            # One unreadable or malformed record must not stop the migration.
            print(f" Error: {f.name}: {exc}")
            errors += 1
            continue

        if success:
            print(f" Fixed: {msg}")
            fixed += 1
        elif "COLLISION" in msg:
            print(f" {msg}")
            collisions += 1
        else:
            print(f" Skip: {f.name}: {msg}")
            errors += 1

    print(f"\nSummary: Fixed {fixed}, Collisions {collisions}, Errors {errors}")
# Run the migration only when this file is executed as a script, so that
# importing the module has no side effects on the custodian directory.
if __name__ == "__main__":
    main()