#!/usr/bin/env python3
"""
Fix resolved AR-XX-XXX files with researched location data.

This script fixes Argentina institution files where the location has been
researched and confirmed through web searches and Wikidata queries.

For each entry in RESOLVED_INSTITUTIONS it:
  1. loads the old custodian YAML file,
  2. builds a new GHCID with the researched region/city codes,
  3. archives the old GHCID in ``ghcid_history``,
  4. regenerates the GHCID UUIDs, location_resolution and location blocks,
  5. writes the file under its new name and removes the old one.

Usage:
    python scripts/fix_ar_xx_xxx_resolved.py [--dry-run]
"""

import hashlib
import os
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional

# Configuration
CUSTODIAN_DIR = Path("data/custodian")

# GHCID namespace for UUID generation (same as in other scripts).
# NOTE(review): this value is the RFC 4122 DNS namespace constant reused as
# the project namespace — confirm it matches the sibling GHCID scripts.
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")

# Resolved institutions with researched location data.
# Format: old_filename -> fix_info
RESOLVED_INSTITUTIONS: Dict[str, Dict[str, Any]] = {
    "AR-XX-XXX-M-MB.yaml": {
        "new_region_code": "C",
        "new_city_code": "LB",
        "new_city_label": "La Boca",
        "new_abbreviation": "MLB",  # MARCO La Boca
        "region_name": "Ciudad Autónoma de Buenos Aires",
        "address": "Av. Almirante Brown 1031, La Boca, Buenos Aires",
        "latitude": -34.6367,
        "longitude": -58.3633,
        "research_source": "Web search - museomarco.org",
        "research_notes": "MARCO La Boca art museum located in the La Boca neighborhood of Buenos Aires city"
    },
    "AR-XX-XXX-M-MUFCA.yaml": {
        "new_region_code": "S",
        "new_city_code": "ROS",
        "new_city_label": "Rosario",
        "new_abbreviation": "MUFCA",  # Keep existing abbreviation
        "region_name": "Santa Fe",
        "address": "FCEIA, Universidad Nacional de Rosario, Rosario",
        "latitude": -32.9468,
        "longitude": -60.6393,
        "research_source": "Web search - Universidad Nacional de Rosario",
        "research_notes": "Part of Facultad de Ciencias Exactas, Ingeniería y Agrimensura at UNR"
    },
    "AR-XX-XXX-M-ESS.yaml": {
        "new_region_code": "B",
        "new_city_code": "LC",
        "new_city_label": "Los Cardales",
        "new_abbreviation": "ESS",  # Keep existing abbreviation
        "region_name": "Provincia de Buenos Aires",
        "address": "Los Cardales, near Campana, Buenos Aires Province",
        "latitude": -34.3167,
        "longitude": -58.9667,
        "research_source": "Web search - tourism sites",
        "research_notes": "Tourist estancia with gaucho museum, ~70km NW of Buenos Aires city"
    },
    "AR-XX-XXX-A-CDVO.yaml": {
        "new_region_code": "B",
        "new_city_code": "SI",
        "new_city_label": "San Isidro",
        "new_abbreviation": "CDVO",  # Keep existing abbreviation
        "region_name": "Provincia de Buenos Aires",
        "address": "Elortondo 1837, Beccar, San Isidro",
        "latitude": -34.4756,
        "longitude": -58.5322,
        "research_source": "Web search - Villa Ocampo UNESCO documentation center",
        "research_notes": "Located at Villa Ocampo in Beccar (part of San Isidro partido). Note: Wikidata P276 incorrectly shows Houghton Library (Harvard) which is where correspondence copies are held, not the actual location."
    },
    # --- Batch 2: Researched 2025-12-19 ---
    "AR-XX-XXX-M-TOP.yaml": {
        "new_region_code": "C",
        "new_city_code": "LP",
        "new_city_label": "La Paternal",
        "new_abbreviation": "TOP",  # Templo del Otro Partido
        "region_name": "Ciudad Autónoma de Buenos Aires",
        "address": "Gavilán 2151 (Gate 10), Diego Armando Maradona Stadium, La Paternal, Buenos Aires",
        "latitude": -34.6062,
        "longitude": -58.4679,
        "research_source": "Web search - Tripadvisor, Whichmuseum, Evendo",
        "research_notes": "Sports museum located under Diego Armando Maradona Stadium (Argentinos Juniors). Focuses on club history and Argentine football culture. Also known as 'El Templo del Fútbol'."
    },
    "AR-XX-XXX-M-MAIAE.yaml": {
        "new_region_code": "M",
        "new_city_code": "MEN",
        "new_city_label": "Mendoza",
        "new_abbreviation": "MAIAE",  # Keep existing abbreviation
        "region_name": "Mendoza",
        "address": "Instituto de Arqueología y Etnología, Facultad de Filosofía y Letras, Universidad Nacional de Cuyo, Mendoza",
        "latitude": -32.8833,
        "longitude": -68.8333,
        "research_source": "Web search - PaleoArgentina, ResearchGate, hotel proximity listings",
        "research_notes": "Archaeological museum part of Instituto de Arqueología y Etnología at Facultad de Filosofía y Letras, Universidad Nacional de Cuyo. Contains pre-Columbian and regional archaeological collections."
    },
    "AR-XX-XXX-M-MHRSMBGDJM.yaml": {
        "new_region_code": "B",
        "new_city_code": "SM",
        "new_city_label": "San Martín",
        "new_abbreviation": "MHRSMBGDJM",  # Keep existing abbreviation
        "region_name": "Provincia de Buenos Aires",
        "address": "Diego Pombo 3324, San Andrés, Partido de General San Martín, Buenos Aires Province",
        "latitude": -34.5667,
        "longitude": -58.5333,
        "research_source": "Wikimedia Commons, Wikipedia, Buenos Aires Province cultural database",
        "research_notes": "Historic house museum - Casa de Rosas. Full name: Museo Histórico Regional de San Martín «Brig. Gral. Don Juan Manuel de Rosas». Located in San Andrés locality within General San Martín partido."
    },
}


def generate_ghcid_uuids(ghcid_string: str) -> Dict[str, Any]:
    """Generate UUID v5 and UUID v8 (SHA-256 based) for a GHCID string.

    Args:
        ghcid_string: The canonical GHCID, e.g. ``AR-C-LB-M-MLB``.

    Returns:
        Dict with ``ghcid_uuid`` (v5/SHA-1, str), ``ghcid_uuid_sha256``
        (v8/SHA-256, str) and ``ghcid_numeric`` (64-bit int from SHA-256).
    """
    # UUID v5 (SHA-1, namespaced)
    uuid_v5 = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)

    # UUID v8 (SHA-256 based - custom implementation): take the first 16
    # bytes of the digest, then stamp the version (8) and variant bits.
    sha256_hash = hashlib.sha256(ghcid_string.encode()).digest()
    uuid_v8_bytes = bytearray(sha256_hash[:16])
    uuid_v8_bytes[6] = (uuid_v8_bytes[6] & 0x0F) | 0x80  # Version 8
    uuid_v8_bytes[8] = (uuid_v8_bytes[8] & 0x3F) | 0x80  # Variant (RFC 4122)
    uuid_v8 = uuid.UUID(bytes=bytes(uuid_v8_bytes))

    # Numeric ID (64-bit, big-endian, from the same SHA-256 digest)
    numeric_id = int.from_bytes(sha256_hash[:8], 'big')

    return {
        "ghcid_uuid": str(uuid_v5),
        "ghcid_uuid_sha256": str(uuid_v8),
        "ghcid_numeric": numeric_id,
    }


def construct_new_ghcid(old_ghcid: str, fix_info: Dict[str, Any]) -> str:
    """Construct the new GHCID from the old one plus researched fix info.

    Old format: ``AR-XX-XXX-{type}-{abbrev}``
    New format: ``AR-{region}-{city}-{type}-{abbrev}``

    Raises:
        ValueError: if ``old_ghcid`` has too few ``-``-separated parts to
            carry an institution-type segment.
    """
    parts = old_ghcid.split('-')
    if len(parts) < 4:
        raise ValueError(f"Malformed GHCID (expected AR-XX-XXX-type-abbrev): {old_ghcid}")
    inst_type = parts[3]  # M, A, G, L, etc.
    return (
        f"AR-{fix_info['new_region_code']}-{fix_info['new_city_code']}"
        f"-{inst_type}-{fix_info['new_abbreviation']}"
    )


def fix_resolved_institution(filename: str, fix_info: Dict[str, Any], dry_run: bool = False) -> bool:
    """Fix a single resolved institution file.

    Loads ``filename`` from CUSTODIAN_DIR, rewrites its GHCID/location data
    from ``fix_info`` and renames it to match the new GHCID.

    Args:
        filename: Old AR-XX-XXX YAML filename inside CUSTODIAN_DIR.
        fix_info: Researched location data (see RESOLVED_INSTITUTIONS).
        dry_run: If True, report what would change without touching disk.

    Returns:
        True on success (or a successful dry run), False if the file is
        missing or the target filename already exists.
    """
    # Local import: PyYAML is only needed when files are actually processed,
    # so the module itself stays importable without the dependency.
    import yaml

    old_path = CUSTODIAN_DIR / filename
    if not old_path.exists():
        print(f" ⚠️ File not found: {filename}")
        return False

    # Load the YAML
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Get old GHCID and construct new one
    old_ghcid = data['ghcid']['ghcid_current']
    new_ghcid = construct_new_ghcid(old_ghcid, fix_info)

    # Construct new filename from the same components as the new GHCID
    parts = old_ghcid.split('-')
    inst_type = parts[3]
    new_filename = (
        f"AR-{fix_info['new_region_code']}-{fix_info['new_city_code']}"
        f"-{inst_type}-{fix_info['new_abbreviation']}.yaml"
    )
    new_path = CUSTODIAN_DIR / new_filename

    # Print info
    print(f"\n 📁 {filename}")
    print(f" Name: {data['custodian_name']['claim_value']}")
    print(f" Old GHCID: {old_ghcid}")
    print(f" New GHCID: {new_ghcid}")
    print(f" Location: {fix_info['new_city_label']}, {fix_info['region_name']}")
    if fix_info.get('address'):
        print(f" Address: {fix_info['address']}")
    print(f" Research: {fix_info['research_source']}")

    if dry_run:
        print(f" [DRY RUN] Would rename to {new_filename}")
        return True

    # Check for collision before writing anything
    if new_path.exists() and new_path != old_path:
        print(f" ⚠️ WARNING: Target file already exists: {new_filename}")
        print(" Skipping to avoid overwrite. Manual review needed.")
        return False

    # Generate new UUIDs
    uuids = generate_ghcid_uuids(new_ghcid)
    now = datetime.now(timezone.utc).isoformat()

    # Store old GHCID in history so the rename is traceable
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []
    data['ghcid']['ghcid_history'].append({
        'ghcid': old_ghcid,
        'ghcid_uuid': data['ghcid']['ghcid_uuid'],
        'valid_from': data['ghcid'].get('generation_timestamp'),
        'valid_to': now,
        'reason': f"Location resolved via research: {fix_info['research_notes']}"
    })

    # Update current GHCID
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['ghcid_uuid'] = uuids['ghcid_uuid']
    data['ghcid']['ghcid_uuid_sha256'] = uuids['ghcid_uuid_sha256']
    data['ghcid']['ghcid_numeric'] = uuids['ghcid_numeric']
    data['ghcid']['generation_timestamp'] = now

    # Update location_resolution
    data['ghcid']['location_resolution'] = {
        'method': 'MANUAL_RESEARCH',
        'country_code': 'AR',
        'region_code': fix_info['new_region_code'],
        'region_name': fix_info['region_name'],
        'city_code': fix_info['new_city_code'],
        'city_label': fix_info['new_city_label'],
        'resolution_date': now,
        'research_source': fix_info['research_source'],
        'research_notes': fix_info['research_notes']
    }

    # Update location block
    data['location'] = {
        'country': 'AR',
        'region_code': fix_info['new_region_code'],
        'city': fix_info['new_city_label']
    }
    if fix_info.get('address'):
        data['location']['address'] = fix_info['address']
    # Compare against None, not truthiness: a coordinate of exactly 0.0
    # (equator/prime meridian) must not be dropped.
    if fix_info.get('latitude') is not None and fix_info.get('longitude') is not None:
        data['location']['latitude'] = fix_info['latitude']
        data['location']['longitude'] = fix_info['longitude']

    # Write updated YAML
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    # Remove old file if it's different
    if old_path != new_path:
        old_path.unlink()

    print(f" ✅ Fixed → {new_filename}")
    return True


def main():
    """Apply all RESOLVED_INSTITUTIONS fixes and report a summary."""
    dry_run = '--dry-run' in sys.argv

    print("=" * 60)
    print("Argentina AR-XX-XXX Resolution Fix Script")
    print("=" * 60)
    if dry_run:
        print("\n🔍 DRY RUN MODE - No files will be modified\n")

    fixed_count = 0
    error_count = 0

    for filename, fix_info in RESOLVED_INSTITUTIONS.items():
        try:
            if fix_resolved_institution(filename, fix_info, dry_run):
                fixed_count += 1
        except Exception as e:
            # Keep going on per-file failures; report them in the summary.
            print(f"\n ❌ ERROR processing {filename}: {e}")
            error_count += 1

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f" Total processed: {len(RESOLVED_INSTITUTIONS)}")
    print(f" Successfully fixed: {fixed_count}")
    print(f" Errors: {error_count}")
    if dry_run:
        print("\n Run without --dry-run to apply changes.")

    # List remaining AR-XX-XXX files
    print("\n" + "=" * 60)
    print("REMAINING AR-XX-XXX FILES")
    print("=" * 60)
    remaining = list(CUSTODIAN_DIR.glob("AR-XX-*.yaml"))
    if remaining:
        for f in sorted(remaining):
            print(f" • {f.name}")
        print(f"\n Total remaining: {len(remaining)}")
    else:
        print(" None remaining! All AR-XX-XXX files have been resolved.")


if __name__ == "__main__":
    main()