#!/usr/bin/env python3
"""
Fix Argentina region code misclassifications.

This script fixes two issues:
1. Files coded as AR-A (Salta) that should be AR-B (Buenos Aires Province)
2. Files coded as AR-XX that have known province data

Each corrected file is rewritten under its new name (new GHCID, fresh
identifiers, previous GHCID appended to ``ghcid_history``) and the old file
is removed.

Usage:
    python scripts/fix_ar_region_codes.py [--dry-run]
"""

import argparse
import os
import re
import uuid
import hashlib
from datetime import datetime, timezone
from pathlib import Path
from typing import NamedTuple, Optional

try:
    import yaml
except ImportError:
    # Defer the failure to the first YAML read/write so the pure helpers
    # (e.g. generate_ghcid_uuids) stay importable without PyYAML.
    yaml = None

# Configuration
CUSTODIAN_DIR = Path("data/custodian")

# Files that are AR-A but should be AR-B (Buenos Aires Province)
AR_A_TO_AR_B_FILES = [
    "AR-A-AME-M-ZA.yaml",
    "AR-A-AZU-M-MLGST.yaml",
    "AR-A-FV-M-AD.yaml",
    "AR-A-MAG-M-EA.yaml",
    "AR-A-MOR-M-MMC.yaml",
    "AR-A-PA-A-AHMPA.yaml",
]

# Manual fixes for specific files
MANUAL_FIXES = {
    "AR-XX-LAP-M-MCQ.yaml": {
        "region_code": "H",
        "region_name": "Chaco",
        "notes": "Lapachito is in Chaco province"
    }
}

# GHCID namespace for UUID generation
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")


def generate_ghcid_uuids(ghcid_string: str) -> dict:
    """Generate UUID v5 and UUID v8 (SHA-256 based) for a GHCID string.

    Args:
        ghcid_string: The GHCID to derive identifiers from.

    Returns:
        A dict with three keys, in this order:
        ``ghcid_uuid`` (UUID v5 of the string under ``GHCID_NAMESPACE``),
        ``ghcid_uuid_sha256`` (first 16 bytes of the SHA-256 digest with the
        version nibble set to 8 and the RFC 4122 variant bits set) and
        ``ghcid_numeric`` (first 8 digest bytes as a big-endian integer).
    """
    # UUID v5 (SHA-1)
    uuid_v5 = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)

    # UUID v8 (SHA-256 based - custom implementation)
    sha256_hash = hashlib.sha256(ghcid_string.encode()).digest()
    uuid_v8_bytes = bytearray(sha256_hash[:16])
    uuid_v8_bytes[6] = (uuid_v8_bytes[6] & 0x0F) | 0x80  # Version 8
    uuid_v8_bytes[8] = (uuid_v8_bytes[8] & 0x3F) | 0x80  # Variant
    uuid_v8 = uuid.UUID(bytes=bytes(uuid_v8_bytes))

    # Numeric ID (64-bit from SHA-256)
    numeric_id = int.from_bytes(sha256_hash[:8], 'big')

    return {
        "ghcid_uuid": str(uuid_v5),
        "ghcid_uuid_sha256": str(uuid_v8),
        "ghcid_numeric": numeric_id
    }


class _PlannedFix(NamedTuple):
    """Everything needed to announce and then apply one region-code fix."""

    old_path: Path
    new_path: Path
    new_filename: str
    data: dict
    old_ghcid: str
    new_ghcid: str


def _require_yaml():
    """Return the PyYAML module, raising ImportError if it is not installed."""
    if yaml is None:
        raise ImportError(
            "PyYAML is required to read and write custodian files "
            "(pip install pyyaml)"
        )
    return yaml


def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def _swap_prefix(value: str, old_prefix: str, new_prefix: str) -> Optional[str]:
    """Replace a leading ``old_prefix`` with ``new_prefix``; None if absent."""
    if not value.startswith(old_prefix):
        return None
    return new_prefix + value[len(old_prefix):]


def _schema_problem(data) -> Optional[str]:
    """Describe the first missing section a fix relies on, or return None."""
    if not isinstance(data, dict):
        return "top level is not a mapping"
    ghcid = data.get('ghcid')
    if not isinstance(ghcid, dict):
        return "missing 'ghcid' mapping"
    if not isinstance(ghcid.get('ghcid_current'), str):
        return "missing 'ghcid.ghcid_current'"
    if 'ghcid_uuid' not in ghcid:
        return "missing 'ghcid.ghcid_uuid'"
    if not isinstance(ghcid.get('location_resolution'), dict):
        return "missing 'ghcid.location_resolution' mapping"
    if not isinstance(data.get('location'), dict):
        return "missing 'location' mapping"
    return None


def _write_yaml_atomically(data: dict, path: Path) -> None:
    """Dump ``data`` to ``path`` via a temporary sibling file.

    A failed dump therefore never leaves a truncated file at ``path``.
    """
    tmp_path = path.with_name(path.name + '.tmp')
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            _require_yaml().dump(
                data, f,
                allow_unicode=True, sort_keys=False, default_flow_style=False,
            )
        os.replace(tmp_path, path)
    finally:
        # Only still present when the dump or the replace failed.
        if tmp_path.exists():
            tmp_path.unlink()


def _plan_fix(filename: str, old_prefix: str, new_prefix: str) -> Optional[_PlannedFix]:
    """Load one custodian file and work out its corrected name and GHCID.

    Nothing on disk is modified. Prints a warning and returns None when the
    file is missing, when the filename or GHCID does not start with
    ``old_prefix``, when the target file already exists, or when the YAML
    lacks a section the fix needs.
    """
    old_path = CUSTODIAN_DIR / filename
    if not old_path.exists():
        print(f" ⚠️ File not found: {filename}")
        return None

    # Without this guard an unmatched name maps onto itself, and the caller
    # would rewrite the file and then delete it.
    new_filename = _swap_prefix(filename, old_prefix, new_prefix)
    if new_filename is None:
        print(f" ⚠️ Skipping {filename}: name does not start with {old_prefix}")
        return None

    new_path = CUSTODIAN_DIR / new_filename
    if new_path.exists():
        # Overwriting would destroy another record that already owns the name.
        print(f" ⚠️ Skipping {filename}: target already exists: {new_filename}")
        return None

    with open(old_path, 'r', encoding='utf-8') as f:
        data = _require_yaml().safe_load(f)

    problem = _schema_problem(data)
    if problem is not None:
        print(f" ⚠️ Skipping {filename}: {problem}")
        return None

    old_ghcid = data['ghcid']['ghcid_current']
    new_ghcid = _swap_prefix(old_ghcid, old_prefix, new_prefix)
    if new_ghcid is None:
        print(f" ⚠️ Skipping {filename}: GHCID {old_ghcid} does not start with {old_prefix}")
        return None

    return _PlannedFix(old_path, new_path, new_filename, data, old_ghcid, new_ghcid)


def _announce(plan: _PlannedFix, source_name: str, detail_lines) -> None:
    """Print the per-file header followed by caller-specific detail lines."""
    print(f" 📁 {source_name}")
    print(f" Old GHCID: {plan.old_ghcid}")
    print(f" New GHCID: {plan.new_ghcid}")
    for line in detail_lines:
        print(line)


def _commit_fix(plan: _PlannedFix, *, region_code: str, reason: str,
                timestamp: str, resolution_updates: dict) -> None:
    """Apply a planned fix: update the record, write it, remove the old file.

    Args:
        plan: Result of ``_plan_fix``.
        region_code: New value for ``location.region_code``.
        reason: Text stored in the new ``ghcid_history`` entry.
        timestamp: Single ISO timestamp used for the history ``valid_to`` and
            the new ``generation_timestamp`` so the two intervals meet.
        resolution_updates: Keys merged, in order, into
            ``ghcid.location_resolution``.
    """
    ghcid = plan.data['ghcid']

    # Store old GHCID in history (tolerates an absent or null history key)
    history = ghcid.get('ghcid_history') or []
    history.append({
        'ghcid': plan.old_ghcid,
        'ghcid_uuid': ghcid['ghcid_uuid'],
        'valid_from': ghcid.get('generation_timestamp'),
        'valid_to': timestamp,
        'reason': reason,
    })
    ghcid['ghcid_history'] = history

    # Update current GHCID
    ghcid['ghcid_current'] = plan.new_ghcid
    ghcid.update(generate_ghcid_uuids(plan.new_ghcid))
    ghcid['generation_timestamp'] = timestamp
    ghcid['location_resolution'].update(resolution_updates)

    # Update location
    plan.data['location']['region_code'] = region_code

    # Write the new file first; the old one is removed only after that worked.
    _write_yaml_atomically(plan.data, plan.new_path)
    plan.old_path.unlink()


def fix_ar_a_to_ar_b(dry_run: bool = False) -> int:
    """Fix files that are AR-A but should be AR-B.

    Args:
        dry_run: When true, only report what would change.

    Returns:
        Number of files actually rewritten (always 0 in a dry run).
    """
    print("\n=== Fixing AR-A → AR-B (Buenos Aires Province) ===\n")

    fixed_count = 0
    for filename in AR_A_TO_AR_B_FILES:
        plan = _plan_fix(filename, 'AR-A-', 'AR-B-')
        if plan is None:
            continue

        city = plan.data['ghcid']['location_resolution'].get('city_label', 'unknown')
        _announce(plan, filename, [f" City: {city}"])

        if dry_run:
            print(f" [DRY RUN] Would rename to {plan.new_filename}\n")
            continue

        now = _utc_now_iso()
        _commit_fix(
            plan,
            region_code='B',
            reason='Region code correction: AR-A (Salta) → AR-B (Buenos Aires Province)',
            timestamp=now,
            resolution_updates={
                'region_code': 'B',
                'resolution_date': now,
                'correction_note': 'Corrected from AR-A to AR-B based on province_name field',
            },
        )

        print(f" ✅ Renamed to {plan.new_filename}\n")
        fixed_count += 1

    return fixed_count


def fix_manual_entries(dry_run: bool = False) -> int:
    """Fix manually identified files.

    Args:
        dry_run: When true, only report what would change.

    Returns:
        Number of files actually rewritten (always 0 in a dry run).
    """
    print("\n=== Fixing Manual Entries ===\n")

    fixed_count = 0
    for filename, fix_info in MANUAL_FIXES.items():
        new_region = fix_info['region_code']
        plan = _plan_fix(filename, 'AR-XX-', f'AR-{new_region}-')
        if plan is None:
            continue

        _announce(plan, filename, [
            f" Province: {fix_info['region_name']}",
            f" Notes: {fix_info['notes']}",
        ])

        if dry_run:
            print(f" [DRY RUN] Would rename to {plan.new_filename}\n")
            continue

        now = _utc_now_iso()
        _commit_fix(
            plan,
            region_code=new_region,
            reason=f"Region code correction: AR-XX → AR-{new_region} ({fix_info['region_name']})",
            timestamp=now,
            resolution_updates={
                'region_code': new_region,
                'region_name': fix_info['region_name'],
                'resolution_date': now,
                'resolution_method': 'MANUAL_RESEARCH',
                'correction_note': fix_info['notes'],
            },
        )

        print(f" ✅ Renamed to {plan.new_filename}\n")
        fixed_count += 1

    return fixed_count


def main():
    """Parse the command line, run both fix passes and print a summary."""
    # argparse rejects unknown flags, so a typo such as --dryrun aborts
    # instead of silently running the destructive (non-dry) path.
    parser = argparse.ArgumentParser(
        description="Fix Argentina region code misclassifications."
    )
    parser.add_argument(
        '--dry-run', action='store_true',
        help="report the planned changes without modifying any file",
    )
    dry_run = parser.parse_args().dry_run

    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    # Fix AR-A → AR-B
    ar_b_fixed = fix_ar_a_to_ar_b(dry_run)

    # Fix manual entries
    manual_fixed = fix_manual_entries(dry_run)

    # Summary
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f" AR-A → AR-B fixes: {ar_b_fixed}")
    print(f" Manual fixes: {manual_fixed}")
    print(f" Total: {ar_b_fixed + manual_fixed}")

    if dry_run:
        print("\n Run without --dry-run to apply changes.")


if __name__ == "__main__":
    main()