246 lines
8.3 KiB
Python
246 lines
8.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix Argentina region code misclassifications.
|
|
|
|
This script fixes two issues:
|
|
1. Files coded as AR-A (Salta) that should be AR-B (Buenos Aires Province)
|
|
2. Files coded as AR-XX that have known province data
|
|
|
|
Usage:
|
|
python scripts/fix_ar_region_codes.py [--dry-run]
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
import uuid
|
|
import hashlib
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Configuration
# Directory that holds the custodian YAML records. The path is relative, so
# the script resolves it against the current working directory.
CUSTODIAN_DIR = Path("data/custodian")

# Files that are AR-A but should be AR-B (Buenos Aires Province)
# Each entry is a filename looked up inside CUSTODIAN_DIR by fix_ar_a_to_ar_b().
AR_A_TO_AR_B_FILES = [
    "AR-A-AME-M-ZA.yaml",
    "AR-A-AZU-M-MLGST.yaml",
    "AR-A-FV-M-AD.yaml",
    "AR-A-MAG-M-EA.yaml",
    "AR-A-MOR-M-MMC.yaml",
    "AR-A-PA-A-AHMPA.yaml",
]

# Manual fixes for specific files
# Maps a filename inside CUSTODIAN_DIR to the correction applied by
# fix_manual_entries():
#   region_code - replaces the "XX" placeholder in the GHCID and the filename
#   region_name - province name written into the location resolution
#   notes       - justification stored as the correction note
MANUAL_FIXES = {
    "AR-XX-LAP-M-MCQ.yaml": {
        "region_code": "H",
        "region_name": "Chaco",
        "notes": "Lapachito is in Chaco province"
    }
}

# GHCID namespace for UUID generation
# Used as the namespace argument of uuid.uuid5() in generate_ghcid_uuids().
# The value is the same UUID as the standard DNS namespace (uuid.NAMESPACE_DNS).
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")
|
|
|
|
|
|
def generate_ghcid_uuids(ghcid_string: str) -> dict:
    """Derive the identifiers stored alongside a GHCID string.

    Args:
        ghcid_string: The GHCID text to derive identifiers from.

    Returns:
        A dict with three entries:
          * ``ghcid_uuid`` - str form of the UUID v5 of ``ghcid_string``
            under ``GHCID_NAMESPACE``.
          * ``ghcid_uuid_sha256`` - str form of a UUID built from the first
            16 bytes of the SHA-256 digest, stamped as version 8 with the
            ``10`` variant bits.
          * ``ghcid_numeric`` - int made from the first 8 digest bytes,
            read big-endian (taken from the raw digest, before stamping).
    """
    # Name-based identifier (SHA-1 under the hood).
    name_based = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)

    digest = hashlib.sha256(ghcid_string.encode()).digest()

    # View the leading 128 digest bits as one integer and stamp the UUID
    # metadata fields in place.
    bits = int.from_bytes(digest[:16], 'big')
    # Version field: high nibble of byte 6 (bits 79..76) becomes 8.
    bits = (bits & ~(0xF << 76)) | (0x8 << 76)
    # Variant field: top two bits of byte 8 (bits 63..62) become 0b10.
    bits = (bits & ~(0x3 << 62)) | (0x2 << 62)
    hash_based = uuid.UUID(int=bits)

    return {
        "ghcid_uuid": str(name_based),
        "ghcid_uuid_sha256": str(hash_based),
        "ghcid_numeric": int.from_bytes(digest[:8], 'big'),
    }
|
|
|
|
|
|
def fix_ar_a_to_ar_b(dry_run: bool = False):
    """Re-code the files in AR_A_TO_AR_B_FILES from AR-A to AR-B.

    For every listed file found in CUSTODIAN_DIR the record is loaded, the
    current GHCID is appended to ``ghcid_history``, the GHCID and its derived
    identifiers are regenerated with the ``AR-B-`` prefix, the region code is
    set to ``'B'``, and the record is written under the new filename before
    the old file is removed.

    A file is skipped with a warning (and not counted) when it is missing,
    when the record has no usable ``ghcid.ghcid_current``, when the filename
    or GHCID does not start with ``AR-A-``, or when the target filename
    already exists, so an existing record is never overwritten.

    Args:
        dry_run: When true, only report what would change; nothing is
            written or deleted.

    Returns:
        The number of files actually rewritten (always 0 in dry-run mode).
    """
    print("\n=== Fixing AR-A → AR-B (Buenos Aires Province) ===\n")

    old_prefix = 'AR-A-'
    new_prefix = 'AR-B-'
    fixed_count = 0

    for filename in AR_A_TO_AR_B_FILES:
        old_path = CUSTODIAN_DIR / filename

        if not old_path.exists():
            print(f" ⚠️ File not found: {filename}")
            continue

        # Load the YAML
        with open(old_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # safe_load() yields None for an empty document; a record without a
        # GHCID cannot be corrected. Skip it rather than abort the batch.
        ghcid = data.get('ghcid') if isinstance(data, dict) else None
        if not isinstance(ghcid, dict) or not isinstance(ghcid.get('ghcid_current'), str):
            print(f" ⚠️ Skipping {filename}: no usable ghcid.ghcid_current")
            continue

        # Only swap the leading region prefix. An unanchored replace would
        # turn an already-corrected GHCID into a no-op "correction" that
        # still renames the file and pollutes the history.
        old_ghcid = ghcid['ghcid_current']
        if not (filename.startswith(old_prefix) and old_ghcid.startswith(old_prefix)):
            print(f" ⚠️ Skipping {filename}: filename or GHCID ({old_ghcid}) does not start with {old_prefix}")
            continue

        # Generate new GHCID
        new_ghcid = new_prefix + old_ghcid[len(old_prefix):]
        new_filename = new_prefix + filename[len(old_prefix):]
        new_path = CUSTODIAN_DIR / new_filename

        # The correction metadata lives in location_resolution; create the
        # section if the record lacks one instead of failing with KeyError.
        resolution = ghcid.get('location_resolution')
        if not isinstance(resolution, dict):
            resolution = ghcid['location_resolution'] = {}

        print(f" 📁 {filename}")
        print(f" Old GHCID: {old_ghcid}")
        print(f" New GHCID: {new_ghcid}")
        print(f" City: {resolution.get('city_label', 'unknown')}")

        # Checked before the dry-run branch so a dry run reports collisions too.
        if new_path.exists():
            print(f" ⚠️ Target already exists, not overwriting: {new_filename}\n")
            continue

        if dry_run:
            print(f" [DRY RUN] Would rename to {new_filename}\n")
            continue

        # Update GHCID data
        uuids = generate_ghcid_uuids(new_ghcid)

        # One timestamp per record, so the history entry's valid_to matches
        # the new generation_timestamp and resolution_date exactly.
        now = datetime.now(timezone.utc).isoformat()

        # Store old GHCID in history
        ghcid.setdefault('ghcid_history', []).append({
            'ghcid': old_ghcid,
            'ghcid_uuid': ghcid.get('ghcid_uuid'),
            'valid_from': ghcid.get('generation_timestamp'),
            'valid_to': now,
            'reason': 'Region code correction: AR-A (Salta) → AR-B (Buenos Aires Province)'
        })

        # Update current GHCID
        ghcid['ghcid_current'] = new_ghcid
        ghcid['ghcid_uuid'] = uuids['ghcid_uuid']
        ghcid['ghcid_uuid_sha256'] = uuids['ghcid_uuid_sha256']
        ghcid['ghcid_numeric'] = uuids['ghcid_numeric']
        ghcid['generation_timestamp'] = now
        # NOTE(review): any existing region_name in location_resolution is
        # left untouched here - confirm whether it also needs correcting.
        resolution['region_code'] = 'B'
        resolution['resolution_date'] = now
        resolution['correction_note'] = 'Corrected from AR-A to AR-B based on province_name field'

        # Update location (only when the record has such a section)
        location = data.get('location')
        if isinstance(location, dict):
            location['region_code'] = 'B'

        # Write updated YAML
        with open(new_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

        # Remove old file (only after the new one was written successfully)
        old_path.unlink()

        print(f" ✅ Renamed to {new_filename}\n")
        fixed_count += 1

    return fixed_count
|
|
|
|
|
|
def fix_manual_entries(dry_run: bool = False):
    """Apply the hand-researched region corrections listed in MANUAL_FIXES.

    For every filename in MANUAL_FIXES found in CUSTODIAN_DIR the record is
    loaded, the current GHCID is appended to ``ghcid_history``, the ``AR-XX-``
    prefix of the GHCID and filename is replaced with ``AR-<region_code>-``,
    the derived identifiers are regenerated, the region code/name and the
    correction note are recorded, and the record is written under the new
    filename before the old file is removed.

    A file is skipped with a warning (and not counted) when it is missing,
    when the record has no usable ``ghcid.ghcid_current``, when the filename
    or GHCID does not start with ``AR-XX-``, or when the target filename
    already exists, so an existing record is never overwritten.

    Args:
        dry_run: When true, only report what would change; nothing is
            written or deleted.

    Returns:
        The number of files actually rewritten (always 0 in dry-run mode).
    """
    print("\n=== Fixing Manual Entries ===\n")

    old_prefix = 'AR-XX-'
    fixed_count = 0

    for filename, fix_info in MANUAL_FIXES.items():
        old_path = CUSTODIAN_DIR / filename

        if not old_path.exists():
            print(f" ⚠️ File not found: {filename}")
            continue

        # Load the YAML
        with open(old_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # safe_load() yields None for an empty document; a record without a
        # GHCID cannot be corrected. Skip it rather than abort the batch.
        ghcid = data.get('ghcid') if isinstance(data, dict) else None
        if not isinstance(ghcid, dict) or not isinstance(ghcid.get('ghcid_current'), str):
            print(f" ⚠️ Skipping {filename}: no usable ghcid.ghcid_current")
            continue

        # Plain, anchored prefix swap. A regex substitution would match the
        # placeholder anywhere in the string and would interpret backslashes
        # or group references in the region code as replacement syntax.
        old_ghcid = ghcid['ghcid_current']
        if not (filename.startswith(old_prefix) and old_ghcid.startswith(old_prefix)):
            print(f" ⚠️ Skipping {filename}: filename or GHCID ({old_ghcid}) does not start with {old_prefix}")
            continue

        # Generate new GHCID
        new_region = fix_info['region_code']
        new_prefix = f'AR-{new_region}-'
        new_ghcid = new_prefix + old_ghcid[len(old_prefix):]
        new_filename = new_prefix + filename[len(old_prefix):]
        new_path = CUSTODIAN_DIR / new_filename

        print(f" 📁 {filename}")
        print(f" Old GHCID: {old_ghcid}")
        print(f" New GHCID: {new_ghcid}")
        print(f" Province: {fix_info['region_name']}")
        print(f" Notes: {fix_info['notes']}")

        # Checked before the dry-run branch so a dry run reports collisions too.
        if new_path.exists():
            print(f" ⚠️ Target already exists, not overwriting: {new_filename}\n")
            continue

        if dry_run:
            print(f" [DRY RUN] Would rename to {new_filename}\n")
            continue

        # Update GHCID data
        uuids = generate_ghcid_uuids(new_ghcid)

        # One timestamp per record, so the history entry's valid_to matches
        # the new generation_timestamp and resolution_date exactly.
        now = datetime.now(timezone.utc).isoformat()

        # Store old GHCID in history
        ghcid.setdefault('ghcid_history', []).append({
            'ghcid': old_ghcid,
            'ghcid_uuid': ghcid.get('ghcid_uuid'),
            'valid_from': ghcid.get('generation_timestamp'),
            'valid_to': now,
            'reason': f"Region code correction: AR-XX → AR-{new_region} ({fix_info['region_name']})"
        })

        # Update current GHCID
        ghcid['ghcid_current'] = new_ghcid
        ghcid['ghcid_uuid'] = uuids['ghcid_uuid']
        ghcid['ghcid_uuid_sha256'] = uuids['ghcid_uuid_sha256']
        ghcid['ghcid_numeric'] = uuids['ghcid_numeric']
        ghcid['generation_timestamp'] = now

        # The correction metadata lives in location_resolution; create the
        # section if the record lacks one instead of failing with KeyError.
        resolution = ghcid.get('location_resolution')
        if not isinstance(resolution, dict):
            resolution = ghcid['location_resolution'] = {}
        resolution['region_code'] = new_region
        resolution['region_name'] = fix_info['region_name']
        resolution['resolution_date'] = now
        resolution['resolution_method'] = 'MANUAL_RESEARCH'
        resolution['correction_note'] = fix_info['notes']

        # Update location (only when the record has such a section)
        location = data.get('location')
        if isinstance(location, dict):
            location['region_code'] = new_region

        # Write updated YAML
        with open(new_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

        # Remove old file (only after the new one was written successfully)
        old_path.unlink()

        print(f" ✅ Renamed to {new_filename}\n")
        fixed_count += 1

    return fixed_count
|
|
|
|
|
|
def main():
    """Run both correction passes and print a summary of what was fixed.

    Dry-run mode is enabled when the literal argument ``--dry-run`` appears
    anywhere on the command line; the flag is forwarded to both passes.
    """
    import sys

    preview_only = '--dry-run' in sys.argv
    if preview_only:
        print("🔍 DRY RUN MODE - No files will be modified\n")

    # Pass 1: AR-A → AR-B, pass 2: manually researched entries.
    region_count = fix_ar_a_to_ar_b(preview_only)
    manual_count = fix_manual_entries(preview_only)

    # Summary banner
    rule = "=" * 50
    print("\n" + rule)
    print("SUMMARY")
    print(rule)
    print(f" AR-A → AR-B fixes: {region_count}")
    print(f" Manual fixes: {manual_count}")
    print(f" Total: {region_count + manual_count}")

    if preview_only:
        print("\n Run without --dry-run to apply changes.")
|
|
|
|
|
|
# Run the fixes only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|