glam/scripts/generate_argentina_uuids.py
2025-11-19 23:25:22 +01:00

277 lines
9.9 KiB
Python

#!/usr/bin/env python3
"""
Generate UUIDs for Argentina CONABIP Libraries
This script:
1. Loads parsed Argentina CONABIP data (288 institutions)
2. Generates UUID v5, UUID v7, UUID v8 for each institution
3. Exports enriched data with all persistent identifiers
4. Creates activity log for tracking
UUID Types:
- UUID v5 (SHA-1): Primary persistent identifier (RFC 4122)
- UUID v8 (SHA-256): Secondary identifier (future-proofing)
- UUID v7 (time-ordered): Database record ID
Author: OpenCode AI Agent
Date: 2025-11-17
"""
import sys
import json
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import List
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
# Import directly to avoid broken __init__.py
import importlib.util
# Load the argentina_conabip parser module straight from its source file,
# bypassing the normal package import machinery.
parser_path = (
    Path(__file__).parent.parent
    / "src/glam_extractor/parsers/argentina_conabip.py"
)
spec = importlib.util.spec_from_file_location("argentina_conabip", parser_path)

# Guard clause: without a spec and a loader there is nothing to execute.
if not (spec and spec.loader):
    raise ImportError("Could not load argentina_conabip parser")

# The module object is registered in sys.modules before its code is executed.
argentina_module = importlib.util.module_from_spec(spec)
sys.modules["argentina_conabip"] = argentina_module
spec.loader.exec_module(argentina_module)
ArgentinaCONABIPParser = argentina_module.ArgentinaCONABIPParser
from glam_extractor.models import HeritageCustodian
def generate_uuid_v7() -> uuid.UUID:
    """
    Generate a UUID v7 (time-ordered, RFC 9562).

    UUID v7 embeds a millisecond Unix timestamp in its most significant
    bits so that values sort roughly by creation time, which makes them
    suitable as database record IDs.

    Uses the standard library's ``uuid.uuid7()`` when the running Python
    provides it (added in Python 3.14); otherwise builds the value manually
    with the RFC 9562 field layout:

        unix_ts_ms (48 bits) | version (4) | rand_a (12) | variant (2) | rand_b (62)

    Returns:
        UUID v7 with embedded timestamp
    """
    # Look the function up explicitly instead of catching AttributeError, so
    # an unrelated AttributeError raised inside the call is never swallowed.
    native_uuid7 = getattr(uuid, "uuid7", None)
    if native_uuid7 is not None:
        return native_uuid7()

    # Fallback for Python versions without uuid.uuid7()
    import secrets
    import time

    # Current Unix time in milliseconds, truncated to the 48-bit field width
    timestamp_ms = (time.time_ns() // 1_000_000) & ((1 << 48) - 1)

    # All 74 random bits come from a CSPRNG, as RFC 9562 recommends.
    rand_a = secrets.randbits(12)
    rand_b = secrets.randbits(62)

    uuid_int = timestamp_ms << 80   # bits 127-80: unix_ts_ms
    uuid_int |= 0x7 << 76           # bits 79-76:  version 7
    uuid_int |= rand_a << 64        # bits 75-64:  rand_a
    uuid_int |= 0b10 << 62          # bits 63-62:  variant (RFC 4122 / RFC 9562)
    uuid_int |= rand_b              # bits 61-0:   rand_b
    return uuid.UUID(int=uuid_int)
def enrich_with_uuids(custodian: HeritageCustodian) -> HeritageCustodian:
    """
    Populate the UUID identifier fields of a HeritageCustodian in place.

    Fields written:
    - ghcid_uuid: result of GHCIDComponents.to_uuid() (the UUID v5 in this script)
    - ghcid_uuid_sha256: result of GHCIDComponents.to_uuid_sha256() (the UUID v8)
    - record_id: fresh time-ordered UUID v7 for the database record

    The two GHCID-derived fields are only set when the custodian has a GHCID
    history whose first entry splits into at least five '-'-separated parts;
    record_id is always set.

    Args:
        custodian: HeritageCustodian instance to enrich

    Returns:
        The same HeritageCustodian instance, with UUID fields populated
    """
    history = custodian.ghcid_history
    if history:
        # The first history entry carries the GHCID string used here
        current_ghcid = history[0].ghcid

        from glam_extractor.identifiers.ghcid import GHCIDComponents

        # Example GHCID: AR-CA-CIU-L-BPHLR
        # maxsplit=4 leaves any further hyphens inside the abbreviation intact.
        fields = current_ghcid.split('-', 4)
        if len(fields) == 5:
            country, region, city, inst_type, abbreviation = fields
            ghcid_components = GHCIDComponents(
                country_code=country,
                region_code=region,
                city_locode=city,
                institution_type=inst_type,
                abbreviation=abbreviation
            )
            # Derive both GHCID-based UUIDs from the rebuilt components
            custodian.ghcid_uuid = ghcid_components.to_uuid()
            custodian.ghcid_uuid_sha256 = ghcid_components.to_uuid_sha256()

    # Every custodian receives a database record ID, with or without a GHCID
    custodian.record_id = generate_uuid_v7()
    return custodian
def _coverage_percent(count: int, total: int) -> float:
    """Return count as a percentage of total, or 0.0 when total is zero."""
    return 100 * count / total if total else 0.0


def _sample_record(custodian) -> dict:
    """Build the JSON-serialisable summary of one custodian for the sample export."""
    primary_location = custodian.locations[0] if custodian.locations else None
    # Compare against None explicitly: a latitude of 0.0 is a valid coordinate
    # and must not be dropped by a truthiness test.
    has_coordinates = (
        primary_location is not None and primary_location.latitude is not None
    )
    return {
        "name": custodian.name,
        "conabip_reg": custodian.id,
        "ghcid_current": custodian.ghcid_current,
        "ghcid_numeric": custodian.ghcid_numeric,
        "ghcid_uuid_v5": str(custodian.ghcid_uuid) if custodian.ghcid_uuid else None,
        "ghcid_uuid_v8": str(custodian.ghcid_uuid_sha256) if custodian.ghcid_uuid_sha256 else None,
        "record_id_v7": str(custodian.record_id) if custodian.record_id else None,
        "location": {
            "city": primary_location.city if primary_location is not None else None,
            "province": primary_location.region if primary_location is not None else None,
            "coordinates": [
                primary_location.latitude,
                primary_location.longitude
            ] if has_coordinates else None
        }
    }


def main() -> list:
    """
    Main execution: parse, enrich with UUIDs, export.

    Reads the CONABIP JSON file (path relative to the current working
    directory), enriches every parsed institution with UUID v5/v7/v8
    identifiers, prints coverage statistics, writes a five-institution sample
    and an activity log under data/instances/argentina, and prints the first
    three results.

    An empty parse result is reported as 0.0% coverage instead of raising
    ZeroDivisionError.

    Returns:
        The list of enriched HeritageCustodian instances.
    """
    print("=" * 70)
    print("ARGENTINA CONABIP - UUID GENERATION")
    print("=" * 70)
    print()

    # Paths
    input_file = Path("data/isil/AR/conabip_libraries_enhanced_FULL.json")
    output_dir = Path("data/instances/argentina")
    output_dir.mkdir(parents=True, exist_ok=True)

    activity_log = []

    # Step 1: Parse CONABIP data
    print("Step 1: Parsing CONABIP data...")
    parser = ArgentinaCONABIPParser()
    custodians = parser.parse_and_convert(input_file)
    print(f"✓ Parsed {len(custodians)} institutions")
    activity_log.append({
        "step": "parse",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "institutions_parsed": len(custodians),
        "input_file": str(input_file)
    })

    # Step 2: Generate UUIDs
    print("\nStep 2: Generating UUIDs (v5, v7, v8)...")
    enriched_custodians = []
    for i, custodian in enumerate(custodians):
        enriched_custodians.append(enrich_with_uuids(custodian))
        # Progress indicator
        if (i + 1) % 50 == 0:
            print(f" Processed {i + 1}/{len(custodians)} institutions...")
    total = len(enriched_custodians)
    print(f"✓ Generated UUIDs for {total} institutions")
    activity_log.append({
        "step": "uuid_generation",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "institutions_enriched": total,
        "uuid_types": ["v5", "v7", "v8"]
    })

    # Step 3: Validate UUID coverage
    print("\nStep 3: Validating UUID coverage...")
    with_uuid_v5 = sum(1 for c in enriched_custodians if c.ghcid_uuid)
    with_uuid_v7 = sum(1 for c in enriched_custodians if c.record_id)
    with_uuid_v8 = sum(1 for c in enriched_custodians if c.ghcid_uuid_sha256)
    print(f" UUID v5 coverage: {with_uuid_v5}/{total} ({_coverage_percent(with_uuid_v5, total):.1f}%)")
    print(f" UUID v7 coverage: {with_uuid_v7}/{total} ({_coverage_percent(with_uuid_v7, total):.1f}%)")
    print(f" UUID v8 coverage: {with_uuid_v8}/{total} ({_coverage_percent(with_uuid_v8, total):.1f}%)")
    activity_log.append({
        "step": "validation",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "uuid_v5_coverage": with_uuid_v5,
        "uuid_v7_coverage": with_uuid_v7,
        "uuid_v8_coverage": with_uuid_v8,
        "total_institutions": total
    })

    # Step 4: Export sample (first 5 institutions for verification)
    print("\nStep 4: Exporting sample data...")
    sample_output = output_dir / "argentina_conabip_sample_with_uuids.json"
    sample_data = [_sample_record(c) for c in enriched_custodians[:5]]
    with open(sample_output, 'w', encoding='utf-8') as f:
        json.dump(sample_data, f, indent=2, ensure_ascii=False)
    print(f"✓ Exported sample to {sample_output}")
    activity_log.append({
        "step": "export_sample",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "output_file": str(sample_output),
        "sample_size": len(sample_data)
    })

    # Show sample results on the console (no numbered step is printed here)
    print("\n" + "=" * 70)
    print("SAMPLE RESULTS (First 3 Institutions)")
    print("=" * 70)
    for i, custodian in enumerate(enriched_custodians[:3], 1):
        print(f"\n{i}. {custodian.name}")
        print(f" GHCID: {custodian.ghcid_current}")
        print(f" UUID v5: {custodian.ghcid_uuid}")
        print(f" UUID v8: {custodian.ghcid_uuid_sha256}")
        print(f" Record ID: {custodian.record_id}")
        print(f" Numeric: {custodian.ghcid_numeric}")

    # Step 5: Save activity log
    print("\n" + "=" * 70)
    print("Step 5: Saving activity log...")
    log_file = output_dir / "uuid_generation_log.json"
    # The success rate is bounded by the least-covered UUID type.
    success_rate = _coverage_percent(
        min(with_uuid_v5, with_uuid_v7, with_uuid_v8), total
    )
    with open(log_file, 'w', encoding='utf-8') as f:
        json.dump({
            "session": "Argentina CONABIP UUID Generation",
            "date": datetime.now(timezone.utc).isoformat(),
            "total_institutions": total,
            "activities": activity_log,
            "summary": {
                "uuid_v5_generated": with_uuid_v5,
                "uuid_v7_generated": with_uuid_v7,
                "uuid_v8_generated": with_uuid_v8,
                "success_rate": f"{success_rate:.1f}%"
            }
        }, f, indent=2, ensure_ascii=False)
    print(f"✓ Activity log saved to {log_file}")

    print("\n" + "=" * 70)
    print("✅ UUID GENERATION COMPLETE")
    print("=" * 70)
    print("\nNext step: Wikidata enrichment to add Q-numbers")

    # Return enriched custodians for potential further processing
    return enriched_custodians
if __name__ == "__main__":
main()