glam/scripts/fix_ar_region_codes.py
2025-12-21 00:01:54 +01:00

246 lines
8.3 KiB
Python

#!/usr/bin/env python3
"""
Fix Argentina region code misclassifications.
This script fixes two issues:
1. Files coded as AR-A (Salta) that should be AR-B (Buenos Aires Province)
2. Files coded as AR-XX that have known province data
Usage:
python scripts/fix_ar_region_codes.py [--dry-run]
"""
import os
import re
import yaml
import uuid
import hashlib
from datetime import datetime, timezone
from pathlib import Path
# Configuration
# Directory holding the per-entity custodian YAML records; resolved relative
# to the current working directory, so run this script from the repo root.
CUSTODIAN_DIR = Path("data/custodian")
# Files that are AR-A but should be AR-B (Buenos Aires Province)
# AR-A is Salta; these records were misclassified. Processed by fix_ar_a_to_ar_b().
AR_A_TO_AR_B_FILES = [
"AR-A-AME-M-ZA.yaml",
"AR-A-AZU-M-MLGST.yaml",
"AR-A-FV-M-AD.yaml",
"AR-A-MAG-M-EA.yaml",
"AR-A-MOR-M-MMC.yaml",
"AR-A-PA-A-AHMPA.yaml",
]
# Manual fixes for specific files
# Maps filename -> replacement region data for AR-XX (unknown-province) records
# whose province was identified by hand. Processed by fix_manual_entries().
MANUAL_FIXES = {
"AR-XX-LAP-M-MCQ.yaml": {
"region_code": "H",
"region_name": "Chaco",
"notes": "Lapachito is in Chaco province"
}
}
# Namespace used for deterministic (namespaced) GHCID UUID derivation.
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")


def generate_ghcid_uuids(ghcid_string: str) -> dict:
    """Derive the deterministic identifiers for a GHCID string.

    Returns a dict with three keys:
      - ``ghcid_uuid``: namespaced UUID v5 (SHA-1 based), as a string
      - ``ghcid_uuid_sha256``: custom SHA-256-based UUID v8, as a string
      - ``ghcid_numeric``: 64-bit integer from the first 8 digest bytes
    """
    digest = hashlib.sha256(ghcid_string.encode()).digest()

    # Standard namespaced UUID v5 (RFC 4122, SHA-1 based).
    name_uuid = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)

    # Custom UUID v8: first 16 bytes of the SHA-256 digest, with the
    # version nibble forced to 8 and the variant bits set to 10xx.
    raw = bytearray(digest[:16])
    raw[6] = (raw[6] & 0x0F) | 0x80  # version = 8
    raw[8] = (raw[8] & 0x3F) | 0x80  # RFC 4122 variant
    sha_uuid = uuid.UUID(bytes=bytes(raw))

    return {
        "ghcid_uuid": str(name_uuid),
        "ghcid_uuid_sha256": str(sha_uuid),
        "ghcid_numeric": int.from_bytes(digest[:8], "big"),
    }
def fix_ar_a_to_ar_b(dry_run: bool = False):
    """Rename AR-A records that actually belong to Buenos Aires Province (AR-B).

    For every file in AR_A_TO_AR_B_FILES: archive the old GHCID into
    ghcid_history, rewrite the GHCID from AR-A-* to AR-B-*, regenerate the
    derived UUIDs, update the location fields, write the record under the
    new filename and delete the old one.

    Args:
        dry_run: When True, only report what would change; no files touched.

    Returns:
        Number of files actually fixed (always 0 in dry-run mode).
    """
    print("\n=== Fixing AR-A → AR-B (Buenos Aires Province) ===\n")
    fixed_count = 0
    for filename in AR_A_TO_AR_B_FILES:
        old_path = CUSTODIAN_DIR / filename
        if not old_path.exists():
            # BUG FIX: message previously printed a literal "(unknown)"
            # instead of interpolating the file name.
            print(f" ⚠️ File not found: {filename}")
            continue
        # Load the YAML record
        with open(old_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        # Generate the corrected GHCID (only the first AR-A- occurrence)
        old_ghcid = data['ghcid']['ghcid_current']
        new_ghcid = old_ghcid.replace('AR-A-', 'AR-B-', 1)
        new_filename = filename.replace('AR-A-', 'AR-B-', 1)
        new_path = CUSTODIAN_DIR / new_filename
        # BUG FIX: print the actual filename (was a literal "(unknown)").
        print(f" 📁 {filename}")
        print(f" Old GHCID: {old_ghcid}")
        print(f" New GHCID: {new_ghcid}")
        print(f" City: {data['ghcid']['location_resolution'].get('city_label', 'unknown')}")
        if dry_run:
            print(f" [DRY RUN] Would rename to {new_filename}\n")
            continue
        # Regenerate the derived identifiers for the corrected GHCID.
        uuids = generate_ghcid_uuids(new_ghcid)
        # Capture one timestamp so valid_to / generation_timestamp /
        # resolution_date are identical within this correction record
        # (previously each call to datetime.now() differed slightly).
        now_iso = datetime.now(timezone.utc).isoformat()
        # Store the superseded GHCID in history before overwriting it.
        if 'ghcid_history' not in data['ghcid']:
            data['ghcid']['ghcid_history'] = []
        data['ghcid']['ghcid_history'].append({
            'ghcid': old_ghcid,
            'ghcid_uuid': data['ghcid']['ghcid_uuid'],
            'valid_from': data['ghcid'].get('generation_timestamp'),
            'valid_to': now_iso,
            'reason': 'Region code correction: AR-A (Salta) → AR-B (Buenos Aires Province)'
        })
        # Update current GHCID and derived fields
        data['ghcid']['ghcid_current'] = new_ghcid
        data['ghcid']['ghcid_uuid'] = uuids['ghcid_uuid']
        data['ghcid']['ghcid_uuid_sha256'] = uuids['ghcid_uuid_sha256']
        data['ghcid']['ghcid_numeric'] = uuids['ghcid_numeric']
        data['ghcid']['generation_timestamp'] = now_iso
        data['ghcid']['location_resolution']['region_code'] = 'B'
        data['ghcid']['location_resolution']['resolution_date'] = now_iso
        data['ghcid']['location_resolution']['correction_note'] = 'Corrected from AR-A to AR-B based on province_name field'
        # Update location
        data['location']['region_code'] = 'B'
        # Write the updated record under the corrected filename
        with open(new_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        # Remove the misnamed file
        old_path.unlink()
        print(f" ✅ Renamed to {new_filename}\n")
        fixed_count += 1
    return fixed_count
def fix_manual_entries(dry_run: bool = False):
    """Apply hand-researched region corrections to AR-XX (unknown-province) records.

    For every entry in MANUAL_FIXES: archive the old GHCID into
    ghcid_history, rewrite AR-XX-* to AR-<region>-*, regenerate the derived
    UUIDs, update location fields, write the record under the new filename
    and delete the old one.

    Args:
        dry_run: When True, only report what would change; no files touched.

    Returns:
        Number of files actually fixed (always 0 in dry-run mode).
    """
    print("\n=== Fixing Manual Entries ===\n")
    fixed_count = 0
    for filename, fix_info in MANUAL_FIXES.items():
        old_path = CUSTODIAN_DIR / filename
        if not old_path.exists():
            # BUG FIX: message previously printed a literal "(unknown)"
            # instead of interpolating the file name.
            print(f" ⚠️ File not found: {filename}")
            continue
        # Load the YAML record
        with open(old_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        # Generate the corrected GHCID (only the first AR-XX- occurrence)
        old_ghcid = data['ghcid']['ghcid_current']
        new_region = fix_info['region_code']
        new_ghcid = re.sub(r'AR-XX-', f'AR-{new_region}-', old_ghcid, count=1)
        new_filename = re.sub(r'AR-XX-', f'AR-{new_region}-', filename, count=1)
        new_path = CUSTODIAN_DIR / new_filename
        # BUG FIX: print the actual filename (was a literal "(unknown)").
        print(f" 📁 {filename}")
        print(f" Old GHCID: {old_ghcid}")
        print(f" New GHCID: {new_ghcid}")
        print(f" Province: {fix_info['region_name']}")
        print(f" Notes: {fix_info['notes']}")
        if dry_run:
            print(f" [DRY RUN] Would rename to {new_filename}\n")
            continue
        # Regenerate the derived identifiers for the corrected GHCID.
        uuids = generate_ghcid_uuids(new_ghcid)
        # Capture one timestamp so valid_to / generation_timestamp /
        # resolution_date are identical within this correction record
        # (previously each call to datetime.now() differed slightly).
        now_iso = datetime.now(timezone.utc).isoformat()
        # Store the superseded GHCID in history before overwriting it.
        if 'ghcid_history' not in data['ghcid']:
            data['ghcid']['ghcid_history'] = []
        data['ghcid']['ghcid_history'].append({
            'ghcid': old_ghcid,
            'ghcid_uuid': data['ghcid']['ghcid_uuid'],
            'valid_from': data['ghcid'].get('generation_timestamp'),
            'valid_to': now_iso,
            'reason': f"Region code correction: AR-XX → AR-{new_region} ({fix_info['region_name']})"
        })
        # Update current GHCID and derived fields
        data['ghcid']['ghcid_current'] = new_ghcid
        data['ghcid']['ghcid_uuid'] = uuids['ghcid_uuid']
        data['ghcid']['ghcid_uuid_sha256'] = uuids['ghcid_uuid_sha256']
        data['ghcid']['ghcid_numeric'] = uuids['ghcid_numeric']
        data['ghcid']['generation_timestamp'] = now_iso
        data['ghcid']['location_resolution']['region_code'] = new_region
        data['ghcid']['location_resolution']['region_name'] = fix_info['region_name']
        data['ghcid']['location_resolution']['resolution_date'] = now_iso
        data['ghcid']['location_resolution']['resolution_method'] = 'MANUAL_RESEARCH'
        data['ghcid']['location_resolution']['correction_note'] = fix_info['notes']
        # Update location
        data['location']['region_code'] = new_region
        # Write the updated record under the corrected filename
        with open(new_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        # Remove the misnamed file
        old_path.unlink()
        print(f" ✅ Renamed to {new_filename}\n")
        fixed_count += 1
    return fixed_count
def main():
    """Run both correction passes and print a summary; '--dry-run' previews only."""
    import sys

    dry_run = '--dry-run' in sys.argv
    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    # Run the two correction passes.
    ar_b_fixed = fix_ar_a_to_ar_b(dry_run)
    manual_fixed = fix_manual_entries(dry_run)

    # Summary
    separator = "=" * 50
    print("\n" + separator)
    print("SUMMARY")
    print(separator)
    print(f" AR-A → AR-B fixes: {ar_b_fixed}")
    print(f" Manual fixes: {manual_fixed}")
    print(f" Total: {ar_b_fixed + manual_fixed}")
    if dry_run:
        print("\n Run without --dry-run to apply changes.")


if __name__ == "__main__":
    main()