#!/usr/bin/env python3
"""
Fix resolved AR-XX-XXX files with researched location data.

This script fixes Argentina institution files where the location has been
researched and confirmed through web searches and Wikidata queries.

Usage:
    python scripts/fix_ar_xx_xxx_resolved.py [--dry-run]
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
import uuid
|
|
import hashlib
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional
|
|
|
|
# --- Configuration ---------------------------------------------------------

# Directory holding the custodian YAML records (relative path).
CUSTODIAN_DIR = Path("data/custodian")

# Namespace UUID fed to uuid5() when deriving GHCID UUIDs
# (same value as in the other scripts).
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")
|
|
|
|
# Institutions whose location has been researched and confirmed.
# Mapping: old (unresolved) filename -> the data needed to fix that record.
RESOLVED_INSTITUTIONS = {
    "AR-XX-XXX-M-MB.yaml": {
        "new_region_code": "C",
        "new_city_code": "LB",
        "new_city_label": "La Boca",
        # Abbreviation changes here: MARCO La Boca
        "new_abbreviation": "MLB",
        "region_name": "Ciudad Autónoma de Buenos Aires",
        "address": "Av. Almirante Brown 1031, La Boca, Buenos Aires",
        "latitude": -34.6367,
        "longitude": -58.3633,
        "research_source": "Web search - museomarco.org",
        "research_notes": "MARCO La Boca art museum located in the La Boca neighborhood of Buenos Aires city",
    },
    "AR-XX-XXX-M-MUFCA.yaml": {
        "new_region_code": "S",
        "new_city_code": "ROS",
        "new_city_label": "Rosario",
        # Existing abbreviation is kept
        "new_abbreviation": "MUFCA",
        "region_name": "Santa Fe",
        "address": "FCEIA, Universidad Nacional de Rosario, Rosario",
        "latitude": -32.9468,
        "longitude": -60.6393,
        "research_source": "Web search - Universidad Nacional de Rosario",
        "research_notes": "Part of Facultad de Ciencias Exactas, Ingeniería y Agrimensura at UNR",
    },
    "AR-XX-XXX-M-ESS.yaml": {
        "new_region_code": "B",
        "new_city_code": "LC",
        "new_city_label": "Los Cardales",
        # Existing abbreviation is kept
        "new_abbreviation": "ESS",
        "region_name": "Provincia de Buenos Aires",
        "address": "Los Cardales, near Campana, Buenos Aires Province",
        "latitude": -34.3167,
        "longitude": -58.9667,
        "research_source": "Web search - tourism sites",
        "research_notes": "Tourist estancia with gaucho museum, ~70km NW of Buenos Aires city",
    },
    "AR-XX-XXX-A-CDVO.yaml": {
        "new_region_code": "B",
        "new_city_code": "SI",
        "new_city_label": "San Isidro",
        # Existing abbreviation is kept
        "new_abbreviation": "CDVO",
        "region_name": "Provincia de Buenos Aires",
        "address": "Elortondo 1837, Beccar, San Isidro",
        "latitude": -34.4756,
        "longitude": -58.5322,
        "research_source": "Web search - Villa Ocampo UNESCO documentation center",
        "research_notes": "Located at Villa Ocampo in Beccar (part of San Isidro partido). Note: Wikidata P276 incorrectly shows Houghton Library (Harvard) which is where correspondence copies are held, not the actual location.",
    },
    # --- Batch 2: researched 2025-12-19 ---
    "AR-XX-XXX-M-TOP.yaml": {
        "new_region_code": "C",
        "new_city_code": "LP",
        "new_city_label": "La Paternal",
        # Templo del Otro Partido
        "new_abbreviation": "TOP",
        "region_name": "Ciudad Autónoma de Buenos Aires",
        "address": "Gavilán 2151 (Gate 10), Diego Armando Maradona Stadium, La Paternal, Buenos Aires",
        "latitude": -34.6062,
        "longitude": -58.4679,
        "research_source": "Web search - Tripadvisor, Whichmuseum, Evendo",
        "research_notes": "Sports museum located under Diego Armando Maradona Stadium (Argentinos Juniors). Focuses on club history and Argentine football culture. Also known as 'El Templo del Fútbol'.",
    },
    "AR-XX-XXX-M-MAIAE.yaml": {
        "new_region_code": "M",
        "new_city_code": "MEN",
        "new_city_label": "Mendoza",
        # Existing abbreviation is kept
        "new_abbreviation": "MAIAE",
        "region_name": "Mendoza",
        "address": "Instituto de Arqueología y Etnología, Facultad de Filosofía y Letras, Universidad Nacional de Cuyo, Mendoza",
        "latitude": -32.8833,
        "longitude": -68.8333,
        "research_source": "Web search - PaleoArgentina, ResearchGate, hotel proximity listings",
        "research_notes": "Archaeological museum part of Instituto de Arqueología y Etnología at Facultad de Filosofía y Letras, Universidad Nacional de Cuyo. Contains pre-Columbian and regional archaeological collections.",
    },
    "AR-XX-XXX-M-MHRSMBGDJM.yaml": {
        "new_region_code": "B",
        "new_city_code": "SM",
        "new_city_label": "San Martín",
        # Existing abbreviation is kept
        "new_abbreviation": "MHRSMBGDJM",
        "region_name": "Provincia de Buenos Aires",
        "address": "Diego Pombo 3324, San Andrés, Partido de General San Martín, Buenos Aires Province",
        "latitude": -34.5667,
        "longitude": -58.5333,
        "research_source": "Wikimedia Commons, Wikipedia, Buenos Aires Province cultural database",
        "research_notes": "Historic house museum - Casa de Rosas. Full name: Museo Histórico Regional de San Martín «Brig. Gral. Don Juan Manuel de Rosas». Located in San Andrés locality within General San Martín partido.",
    },
}
|
|
|
|
|
|
def generate_ghcid_uuids(ghcid_string: str, namespace: Optional[uuid.UUID] = None) -> dict:
    """Derive the deterministic identifiers for a GHCID string.

    Args:
        ghcid_string: The GHCID to derive identifiers from.
        namespace: Namespace passed to ``uuid.uuid5``. When omitted, the
            module-level ``GHCID_NAMESPACE`` is used.

    Returns:
        A dict with three entries:

        * ``ghcid_uuid`` - the UUID v5 (SHA-1) of ``ghcid_string`` in
          ``namespace``, as a string.
        * ``ghcid_uuid_sha256`` - a UUID built from the first 16 bytes of the
          SHA-256 digest of ``ghcid_string``, with the version nibble forced
          to 8 and the variant bits forced to ``10``, as a string.
        * ``ghcid_numeric`` - the first 8 bytes of that same digest read as a
          big-endian unsigned integer.
    """
    if namespace is None:
        namespace = GHCID_NAMESPACE

    # UUID v5 (SHA-1, namespace based)
    uuid_v5 = uuid.uuid5(namespace, ghcid_string)

    # UUID v8 (custom, SHA-256 based); does not depend on the namespace
    digest = hashlib.sha256(ghcid_string.encode("utf-8")).digest()
    v8_bytes = bytearray(digest[:16])
    v8_bytes[6] = (v8_bytes[6] & 0x0F) | 0x80  # high nibble of byte 6 -> version 8
    v8_bytes[8] = (v8_bytes[8] & 0x3F) | 0x80  # top two bits of byte 8 -> variant 10
    uuid_v8 = uuid.UUID(bytes=bytes(v8_bytes))

    # Numeric ID: leading 64 bits of the SHA-256 digest
    numeric_id = int.from_bytes(digest[:8], 'big')

    return {
        "ghcid_uuid": str(uuid_v5),
        "ghcid_uuid_sha256": str(uuid_v8),
        "ghcid_numeric": numeric_id,
    }
|
|
|
|
|
|
def construct_new_ghcid(old_ghcid: str, fix_info: Dict[str, Any]) -> str:
    """Build the resolved GHCID for an institution.

    The old identifier has the form ``AR-XX-XXX-{type}-{abbrev}``; the result
    has the form ``AR-{region}-{city}-{type}-{abbrev}``. Only the institution
    type (the fourth hyphen-separated segment) is taken from ``old_ghcid``;
    region, city and abbreviation come from ``fix_info``.

    Args:
        old_ghcid: The current (unresolved) GHCID.
        fix_info: Mapping providing ``new_region_code``, ``new_city_code``
            and ``new_abbreviation``.

    Returns:
        The new GHCID string.

    Raises:
        ValueError: If ``old_ghcid`` is not a string or has no non-empty
            institution-type segment.
        KeyError: If ``fix_info`` lacks one of the required keys.
    """
    segments = old_ghcid.split('-') if isinstance(old_ghcid, str) else []
    # Segment index 3 is the institution type (M, A, G, L, etc.)
    if len(segments) < 4 or not segments[3]:
        raise ValueError(
            f"Malformed GHCID (expected AR-XX-XXX-<type>-<abbrev>): {old_ghcid!r}"
        )
    inst_type = segments[3]

    return (
        f"AR-{fix_info['new_region_code']}-{fix_info['new_city_code']}"
        f"-{inst_type}-{fix_info['new_abbreviation']}"
    )
|
|
|
|
|
|
def fix_resolved_institution(filename: str, fix_info: Dict[str, Any], dry_run: bool = False) -> bool:
    """Fix a single resolved institution file.

    Loads ``CUSTODIAN_DIR / filename``, derives the new GHCID from
    ``fix_info``, records the old GHCID in ``ghcid_history``, replaces the
    current GHCID fields, ``location_resolution`` and ``location`` blocks,
    writes the result to ``<new GHCID>.yaml`` and removes the old file.

    Args:
        filename: Name of the existing file inside ``CUSTODIAN_DIR``.
        fix_info: Researched data for this institution. Reads
            ``new_region_code``, ``new_city_code``, ``new_city_label``,
            ``new_abbreviation``, ``region_name``, ``research_source`` and
            ``research_notes``; ``address``, ``latitude`` and ``longitude``
            are optional.
        dry_run: When true, only report what would happen; no file is
            written or removed.

    Returns:
        ``True`` if the file was fixed (or, in dry-run mode, would be fixed);
        ``False`` if the source file is missing or the target file already
        exists.

    Raises:
        ValueError: If the YAML document is not a mapping with a ``ghcid``
            mapping inside it.
        KeyError: If an expected key is missing from the record or from
            ``fix_info``.
    """
    old_path = CUSTODIAN_DIR / filename

    if not old_path.exists():
        print(f" ⚠️ File not found: {filename}")
        return False

    # Load the YAML
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty or non-mapping document would otherwise fail with an opaque TypeError
    if not isinstance(data, dict) or not isinstance(data.get('ghcid'), dict):
        raise ValueError(f"{filename} does not contain a 'ghcid' mapping")
    ghcid_block = data['ghcid']

    # Get old GHCID and construct new one
    old_ghcid = ghcid_block['ghcid_current']
    new_ghcid = construct_new_ghcid(old_ghcid, fix_info)

    # The filename is the GHCID plus extension, so derive it from the GHCID
    # instead of assembling the same pieces a second time.
    new_filename = f"{new_ghcid}.yaml"
    new_path = CUSTODIAN_DIR / new_filename

    # Print info
    print(f"\n 📁 {filename}")
    print(f" Name: {data['custodian_name']['claim_value']}")
    print(f" Old GHCID: {old_ghcid}")
    print(f" New GHCID: {new_ghcid}")
    print(f" Location: {fix_info['new_city_label']}, {fix_info['region_name']}")
    if fix_info.get('address'):
        print(f" Address: {fix_info['address']}")
    print(f" Research: {fix_info['research_source']}")

    # Check for collision before the dry-run exit, so a dry run reports the
    # same outcome the real run would have.
    if new_path.exists() and new_path != old_path:
        print(f" ⚠️ WARNING: Target file already exists: {new_filename}")
        print(f" Skipping to avoid overwrite. Manual review needed.")
        return False

    if dry_run:
        print(f" [DRY RUN] Would rename to {new_filename}")
        return True

    # Generate new UUIDs
    uuids = generate_ghcid_uuids(new_ghcid)
    now = datetime.now(timezone.utc).isoformat()

    # Store old GHCID in history (tolerate a missing or null history / uuid)
    if ghcid_block.get('ghcid_history') is None:
        ghcid_block['ghcid_history'] = []

    ghcid_block['ghcid_history'].append({
        'ghcid': old_ghcid,
        'ghcid_uuid': ghcid_block.get('ghcid_uuid'),
        'valid_from': ghcid_block.get('generation_timestamp'),
        'valid_to': now,
        'reason': f"Location resolved via research: {fix_info['research_notes']}"
    })

    # Update current GHCID
    ghcid_block['ghcid_current'] = new_ghcid
    ghcid_block['ghcid_uuid'] = uuids['ghcid_uuid']
    ghcid_block['ghcid_uuid_sha256'] = uuids['ghcid_uuid_sha256']
    ghcid_block['ghcid_numeric'] = uuids['ghcid_numeric']
    ghcid_block['generation_timestamp'] = now

    # Update location_resolution
    ghcid_block['location_resolution'] = {
        'method': 'MANUAL_RESEARCH',
        'country_code': 'AR',
        'region_code': fix_info['new_region_code'],
        'region_name': fix_info['region_name'],
        'city_code': fix_info['new_city_code'],
        'city_label': fix_info['new_city_label'],
        'resolution_date': now,
        'research_source': fix_info['research_source'],
        'research_notes': fix_info['research_notes']
    }

    # Update location block (replaces any existing block)
    data['location'] = {
        'country': 'AR',
        'region_code': fix_info['new_region_code'],
        'city': fix_info['new_city_label']
    }

    if fix_info.get('address'):
        data['location']['address'] = fix_info['address']

    # Compare against None: 0.0 is a valid coordinate and must not be dropped
    latitude = fix_info.get('latitude')
    longitude = fix_info.get('longitude')
    if latitude is not None and longitude is not None:
        data['location']['latitude'] = latitude
        data['location']['longitude'] = longitude

    # Write updated YAML
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    # Remove old file if it's different
    if old_path != new_path:
        old_path.unlink()

    print(f" ✅ Fixed → {new_filename}")
    return True
|
|
|
|
|
|
def main(argv: Optional[list] = None) -> None:
    """Run the fix over every entry in ``RESOLVED_INSTITUTIONS`` and report.

    Prints a per-file report, a summary (fixed / skipped / errors) and the
    list of files in ``CUSTODIAN_DIR`` still matching ``AR-XX-*.yaml``.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]`` when omitted. The only option is ``--dry-run``;
            unrecognised arguments abort with a usage error instead of being
            ignored, so a mistyped flag cannot trigger a real run.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Fix resolved AR-XX-XXX files with researched location data."
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="Show what would be changed without modifying any file.",
    )
    dry_run = parser.parse_args(argv).dry_run

    print("=" * 60)
    print("Argentina AR-XX-XXX Resolution Fix Script")
    print("=" * 60)

    if dry_run:
        print("\n🔍 DRY RUN MODE - No files will be modified\n")

    fixed_count = 0
    skipped_count = 0  # files that returned False (not found / target exists)
    error_count = 0

    for filename, fix_info in RESOLVED_INSTITUTIONS.items():
        # Top-level boundary: one bad file must not stop the rest of the batch
        try:
            if fix_resolved_institution(filename, fix_info, dry_run):
                fixed_count += 1
            else:
                skipped_count += 1
        except Exception as e:
            print(f"\n ❌ ERROR processing {filename}: {e}")
            error_count += 1

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f" Total processed: {len(RESOLVED_INSTITUTIONS)}")
    print(f" Successfully fixed: {fixed_count}")
    print(f" Skipped: {skipped_count}")
    print(f" Errors: {error_count}")

    if dry_run:
        print("\n Run without --dry-run to apply changes.")

    # List remaining AR-XX-XXX files
    print("\n" + "=" * 60)
    print("REMAINING AR-XX-XXX FILES")
    print("=" * 60)

    remaining = sorted(CUSTODIAN_DIR.glob("AR-XX-*.yaml"))
    if remaining:
        for f in remaining:
            print(f" • {f.name}")
        print(f"\n Total remaining: {len(remaining)}")
    else:
        print(" None remaining! All AR-XX-XXX files have been resolved.")
|
|
|
|
|
|
# Run the fix only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|