glam/scripts/fix_ar_xx_xxx_resolved.py
2025-12-21 00:01:54 +01:00

304 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Fix resolved AR-XX-XXX files with researched location data.
This script fixes Argentina institution files where the location has been
researched and confirmed through web searches and Wikidata queries.
Usage:
python scripts/fix_ar_xx_xxx_resolved.py [--dry-run]
"""
import os
import re
import yaml
import uuid
import hashlib
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any, Optional
# Configuration
# Directory that holds the custodian YAML records. The path is relative, so
# it is resolved against the directory the script is launched from.
CUSTODIAN_DIR = Path("data/custodian")
# GHCID namespace for UUID generation (same as in other scripts)
# The value is identical to the standard library constant uuid.NAMESPACE_DNS.
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")
# Resolved institutions with researched location data
# Format: old_filename -> fix_info
#
# Each fix_info entry is consumed by fix_resolved_institution() and
# construct_new_ghcid() and uses these keys:
#   new_region_code, new_city_code, new_abbreviation
#       components of the new GHCID: AR-{region}-{city}-{type}-{abbrev}
#       (the {type} letter is taken from the record's current GHCID)
#   new_city_label, region_name
#       human-readable names written to the location / location_resolution
#       blocks of the record
#   address, latitude, longitude
#       read with .get(), i.e. optional; copied into the location block
#   research_source, research_notes
#       provenance stored in location_resolution; the notes are also used as
#       the reason text of the GHCID history entry
RESOLVED_INSTITUTIONS = {
    "AR-XX-XXX-M-MB.yaml": {
        "new_region_code": "C",
        "new_city_code": "LB",
        "new_city_label": "La Boca",
        "new_abbreviation": "MLB", # MARCO La Boca
        "region_name": "Ciudad Autónoma de Buenos Aires",
        "address": "Av. Almirante Brown 1031, La Boca, Buenos Aires",
        "latitude": -34.6367,
        "longitude": -58.3633,
        "research_source": "Web search - museomarco.org",
        "research_notes": "MARCO La Boca art museum located in the La Boca neighborhood of Buenos Aires city"
    },
    "AR-XX-XXX-M-MUFCA.yaml": {
        "new_region_code": "S",
        "new_city_code": "ROS",
        "new_city_label": "Rosario",
        "new_abbreviation": "MUFCA", # Keep existing abbreviation
        "region_name": "Santa Fe",
        "address": "FCEIA, Universidad Nacional de Rosario, Rosario",
        "latitude": -32.9468,
        "longitude": -60.6393,
        "research_source": "Web search - Universidad Nacional de Rosario",
        "research_notes": "Part of Facultad de Ciencias Exactas, Ingeniería y Agrimensura at UNR"
    },
    "AR-XX-XXX-M-ESS.yaml": {
        "new_region_code": "B",
        "new_city_code": "LC",
        "new_city_label": "Los Cardales",
        "new_abbreviation": "ESS", # Keep existing abbreviation
        "region_name": "Provincia de Buenos Aires",
        "address": "Los Cardales, near Campana, Buenos Aires Province",
        "latitude": -34.3167,
        "longitude": -58.9667,
        "research_source": "Web search - tourism sites",
        "research_notes": "Tourist estancia with gaucho museum, ~70km NW of Buenos Aires city"
    },
    "AR-XX-XXX-A-CDVO.yaml": {
        "new_region_code": "B",
        "new_city_code": "SI",
        "new_city_label": "San Isidro",
        "new_abbreviation": "CDVO", # Keep existing abbreviation
        "region_name": "Provincia de Buenos Aires",
        "address": "Elortondo 1837, Beccar, San Isidro",
        "latitude": -34.4756,
        "longitude": -58.5322,
        "research_source": "Web search - Villa Ocampo UNESCO documentation center",
        "research_notes": "Located at Villa Ocampo in Beccar (part of San Isidro partido). Note: Wikidata P276 incorrectly shows Houghton Library (Harvard) which is where correspondence copies are held, not the actual location."
    },
    # --- Batch 2: Researched 2025-12-19 ---
    "AR-XX-XXX-M-TOP.yaml": {
        "new_region_code": "C",
        "new_city_code": "LP",
        "new_city_label": "La Paternal",
        "new_abbreviation": "TOP", # Templo del Otro Partido
        "region_name": "Ciudad Autónoma de Buenos Aires",
        "address": "Gavilán 2151 (Gate 10), Diego Armando Maradona Stadium, La Paternal, Buenos Aires",
        "latitude": -34.6062,
        "longitude": -58.4679,
        "research_source": "Web search - Tripadvisor, Whichmuseum, Evendo",
        "research_notes": "Sports museum located under Diego Armando Maradona Stadium (Argentinos Juniors). Focuses on club history and Argentine football culture. Also known as 'El Templo del Fútbol'."
    },
    "AR-XX-XXX-M-MAIAE.yaml": {
        "new_region_code": "M",
        "new_city_code": "MEN",
        "new_city_label": "Mendoza",
        "new_abbreviation": "MAIAE", # Keep existing abbreviation
        "region_name": "Mendoza",
        "address": "Instituto de Arqueología y Etnología, Facultad de Filosofía y Letras, Universidad Nacional de Cuyo, Mendoza",
        "latitude": -32.8833,
        "longitude": -68.8333,
        "research_source": "Web search - PaleoArgentina, ResearchGate, hotel proximity listings",
        "research_notes": "Archaeological museum part of Instituto de Arqueología y Etnología at Facultad de Filosofía y Letras, Universidad Nacional de Cuyo. Contains pre-Columbian and regional archaeological collections."
    },
    "AR-XX-XXX-M-MHRSMBGDJM.yaml": {
        "new_region_code": "B",
        "new_city_code": "SM",
        "new_city_label": "San Martín",
        "new_abbreviation": "MHRSMBGDJM", # Keep existing abbreviation
        "region_name": "Provincia de Buenos Aires",
        "address": "Diego Pombo 3324, San Andrés, Partido de General San Martín, Buenos Aires Province",
        "latitude": -34.5667,
        "longitude": -58.5333,
        "research_source": "Wikimedia Commons, Wikipedia, Buenos Aires Province cultural database",
        "research_notes": "Historic house museum - Casa de Rosas. Full name: Museo Histórico Regional de San Martín «Brig. Gral. Don Juan Manuel de Rosas». Located in San Andrés locality within General San Martín partido."
    },
}
def generate_ghcid_uuids(ghcid_string: str) -> dict:
    """Derive the deterministic identifiers that belong to a GHCID string.

    Args:
        ghcid_string: The GHCID, e.g. ``AR-C-LB-M-MLB``.

    Returns:
        A dict with three entries:

        * ``ghcid_uuid`` - string form of the UUID v5 of ``ghcid_string``
          inside ``GHCID_NAMESPACE``.
        * ``ghcid_uuid_sha256`` - string form of a UUID built from the first
          16 bytes of the SHA-256 digest, with the version nibble set to 8
          and the variant bits set to ``10``.
        * ``ghcid_numeric`` - the first 8 digest bytes read as a big-endian
          unsigned integer.
    """
    name_based_uuid = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)

    digest = hashlib.sha256(ghcid_string.encode()).digest()

    # Stamp version and variant onto the truncated digest so that the result
    # is a well-formed UUID.
    raw = bytearray(digest[:16])
    raw[6] = 0x80 | (raw[6] & 0x0F)  # high nibble of byte 6 = version 8
    raw[8] = 0x80 | (raw[8] & 0x3F)  # top two bits of byte 8 = variant 10
    digest_based_uuid = uuid.UUID(bytes=bytes(raw))

    return {
        "ghcid_uuid": str(name_based_uuid),
        "ghcid_uuid_sha256": str(digest_based_uuid),
        "ghcid_numeric": int.from_bytes(digest[:8], 'big'),
    }
def construct_new_ghcid(old_ghcid: str, fix_info: Dict[str, Any]) -> str:
    """Build the resolved GHCID that replaces a placeholder GHCID.

    The placeholder has the shape ``AR-XX-XXX-{type}-{abbrev}``; the result
    has the shape ``AR-{region}-{city}-{type}-{abbrev}``. Only the type
    segment is carried over from ``old_ghcid``; region, city and
    abbreviation all come from ``fix_info``.

    Args:
        old_ghcid: Current GHCID of the record.
        fix_info: Mapping providing ``new_region_code``, ``new_city_code``
            and ``new_abbreviation``.

    Returns:
        The new GHCID string.

    Raises:
        IndexError: If ``old_ghcid`` has fewer than four hyphen-separated
            segments.
        KeyError: If ``fix_info`` lacks one of the required keys.
    """
    # Segment 3 is the institution type (M, A, G, L, etc.).
    type_code = old_ghcid.split('-')[3]
    segments = (
        "AR",
        fix_info['new_region_code'],
        fix_info['new_city_code'],
        type_code,
        fix_info['new_abbreviation'],
    )
    return "-".join(f"{segment}" for segment in segments)
def fix_resolved_institution(filename: str, fix_info: Dict[str, Any], dry_run: bool = False) -> bool:
    """Fix a single resolved institution file.

    Loads ``CUSTODIAN_DIR / filename``, derives the new GHCID from
    ``fix_info``, archives the previous GHCID in ``ghcid_history``, replaces
    the ``location_resolution`` and ``location`` blocks, writes the record
    to ``{new_ghcid}.yaml`` and removes the old file.

    Args:
        filename: Name of the existing YAML file inside ``CUSTODIAN_DIR``.
        fix_info: Researched data for the institution; see the key list in
            the function body (``new_region_code``, ``new_city_code``,
            ``new_city_label``, ``new_abbreviation``, ``region_name``,
            ``research_source``, ``research_notes`` are required;
            ``address``, ``latitude``, ``longitude`` are optional).
        dry_run: When true, only report what would happen; no file is
            written, renamed or deleted.

    Returns:
        True if the file was fixed (or, in a dry run, would be fixed).
        False if the source file does not exist or the target filename is
        already taken by another file.

    Raises:
        KeyError: If the record lacks ``ghcid.ghcid_current`` or
            ``custodian_name.claim_value``, or ``fix_info`` lacks a
            required key.
        IndexError: If the current GHCID has fewer than four segments.
    """
    old_path = CUSTODIAN_DIR / filename
    if not old_path.exists():
        print(f" ⚠️ File not found: {filename}")
        return False

    # Load the YAML
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Get old GHCID and construct new one
    ghcid_block = data['ghcid']
    old_ghcid = ghcid_block['ghcid_current']
    new_ghcid = construct_new_ghcid(old_ghcid, fix_info)

    # The file is named after its GHCID, so reuse the GHCID instead of
    # assembling the same string a second time.
    new_filename = f"{new_ghcid}.yaml"
    new_path = CUSTODIAN_DIR / new_filename

    # Print info
    print(f"\n 📁 {filename}")
    print(f" Name: {data['custodian_name']['claim_value']}")
    print(f" Old GHCID: {old_ghcid}")
    print(f" New GHCID: {new_ghcid}")
    print(f" Location: {fix_info['new_city_label']}, {fix_info['region_name']}")
    if fix_info.get('address'):
        print(f" Address: {fix_info['address']}")
    print(f" Research: {fix_info['research_source']}")

    # Check for collision. This runs before the dry-run exit so that a dry
    # run reports the same skip a real run would perform.
    if new_path.exists() and new_path != old_path:
        print(f" ⚠️ WARNING: Target file already exists: {new_filename}")
        print(f" Skipping to avoid overwrite. Manual review needed.")
        return False

    if dry_run:
        print(f" [DRY RUN] Would rename to {new_filename}")
        return True

    # Generate new UUIDs
    uuids = generate_ghcid_uuids(new_ghcid)
    now = datetime.now(timezone.utc).isoformat()

    # Store old GHCID in history. ``or []`` also covers a record whose
    # ghcid_history key is present but null.
    history = ghcid_block.get('ghcid_history') or []
    history.append({
        'ghcid': old_ghcid,
        'ghcid_uuid': ghcid_block.get('ghcid_uuid'),
        'valid_from': ghcid_block.get('generation_timestamp'),
        'valid_to': now,
        'reason': f"Location resolved via research: {fix_info['research_notes']}"
    })
    ghcid_block['ghcid_history'] = history

    # Update current GHCID
    ghcid_block['ghcid_current'] = new_ghcid
    ghcid_block['ghcid_uuid'] = uuids['ghcid_uuid']
    ghcid_block['ghcid_uuid_sha256'] = uuids['ghcid_uuid_sha256']
    ghcid_block['ghcid_numeric'] = uuids['ghcid_numeric']
    ghcid_block['generation_timestamp'] = now

    # Update location_resolution
    ghcid_block['location_resolution'] = {
        'method': 'MANUAL_RESEARCH',
        'country_code': 'AR',
        'region_code': fix_info['new_region_code'],
        'region_name': fix_info['region_name'],
        'city_code': fix_info['new_city_code'],
        'city_label': fix_info['new_city_label'],
        'resolution_date': now,
        'research_source': fix_info['research_source'],
        'research_notes': fix_info['research_notes']
    }

    # Update location block. The whole block is replaced, so keys that were
    # in the previous location block are not carried over.
    data['location'] = {
        'country': 'AR',
        'region_code': fix_info['new_region_code'],
        'city': fix_info['new_city_label']
    }
    if fix_info.get('address'):
        data['location']['address'] = fix_info['address']

    # Compare against None rather than relying on truthiness: 0.0 is a valid
    # coordinate and must not be dropped.
    latitude = fix_info.get('latitude')
    longitude = fix_info.get('longitude')
    if latitude is not None and longitude is not None:
        data['location']['latitude'] = latitude
        data['location']['longitude'] = longitude

    # Write updated YAML
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    # Remove old file if it's different
    if old_path != new_path:
        old_path.unlink()

    print(f" ✅ Fixed → {new_filename}")
    return True
def main():
    """Apply every fix in ``RESOLVED_INSTITUTIONS`` and print a report.

    Passing ``--dry-run`` on the command line makes each fix report-only.
    Each file is processed independently: an exception raised for one file
    is printed and counted, and processing continues with the next file.
    After the summary, every file in ``CUSTODIAN_DIR`` that still matches
    ``AR-XX-*.yaml`` is listed.
    """
    import sys

    dry_run = '--dry-run' in sys.argv

    print("=" * 60)
    print("Argentina AR-XX-XXX Resolution Fix Script")
    print("=" * 60)
    if dry_run:
        print("\n🔍 DRY RUN MODE - No files will be modified\n")

    fixed_count = 0
    skipped_count = 0
    error_count = 0
    for filename, fix_info in RESOLVED_INSTITUTIONS.items():
        try:
            if fix_resolved_institution(filename, fix_info, dry_run):
                fixed_count += 1
            else:
                # A False result means the source file was missing or the
                # target name was taken; count it so the summary adds up.
                skipped_count += 1
        except Exception as e:
            # Top-level boundary: report the failure and keep going so one
            # bad record does not block the remaining ones.
            print(f"\n ❌ ERROR processing {filename}: {e}")
            error_count += 1

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f" Total processed: {len(RESOLVED_INSTITUTIONS)}")
    print(f" Successfully fixed: {fixed_count}")
    print(f" Skipped: {skipped_count}")
    print(f" Errors: {error_count}")
    if dry_run:
        print("\n Run without --dry-run to apply changes.")

    # List remaining AR-XX-XXX files
    print("\n" + "=" * 60)
    print("REMAINING AR-XX-XXX FILES")
    print("=" * 60)
    remaining = sorted(CUSTODIAN_DIR.glob("AR-XX-*.yaml"))
    if remaining:
        for f in remaining:
            print(f"{f.name}")
        print(f"\n Total remaining: {len(remaining)}")
    else:
        print(" None remaining! All AR-XX-XXX files have been resolved.")
# Run the fixer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()