277 lines
9.9 KiB
Python
277 lines
9.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate UUIDs for Argentina CONABIP Libraries
|
|
|
|
This script:
|
|
1. Loads parsed Argentina CONABIP data (288 institutions)
|
|
2. Generates UUID v5, UUID v7, UUID v8 for each institution
|
|
3. Exports enriched data with all persistent identifiers
|
|
4. Creates activity log for tracking
|
|
|
|
UUID Types:
|
|
- UUID v5 (SHA-1): Primary persistent identifier (RFC 4122)
|
|
- UUID v8 (SHA-256): Secondary identifier (future-proofing)
|
|
- UUID v7 (time-ordered): Database record ID
|
|
|
|
Author: OpenCode AI Agent
|
|
Date: 2025-11-17
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import List
|
|
|
|
# Add src to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
# Import directly to avoid broken __init__.py
|
|
import importlib.util
|
|
|
|
# Load the argentina_conabip parser module straight from its source file
# instead of importing it through the glam_extractor package.
parser_path = (
    Path(__file__).parent.parent
    / "src"
    / "glam_extractor"
    / "parsers"
    / "argentina_conabip.py"
)
spec = importlib.util.spec_from_file_location("argentina_conabip", parser_path)

# Without a spec and a loader there is nothing to execute, so fail early.
if not (spec and spec.loader):
    raise ImportError("Could not load argentina_conabip parser")

argentina_module = importlib.util.module_from_spec(spec)
# Register the module under its name before executing its body.
sys.modules["argentina_conabip"] = argentina_module
spec.loader.exec_module(argentina_module)

# Re-export the parser class under the name the rest of this script uses.
ArgentinaCONABIPParser = argentina_module.ArgentinaCONABIPParser
|
|
|
|
from glam_extractor.models import HeritageCustodian
|
|
|
|
|
|
def generate_uuid_v7() -> uuid.UUID:
    """
    Generate a UUID v7 (time-ordered, RFC 9562).

    UUID v7 stores a 48-bit Unix timestamp in milliseconds in its most
    significant bits, so values sort by creation time, which suits
    database record IDs. Uses the standard library's ``uuid.uuid7()`` when
    the running interpreter provides it (added in Python 3.14); otherwise
    the value is assembled manually from the current time and random bits.

    Returns:
        UUID v7 with embedded timestamp
    """
    # Look the function up explicitly instead of catching AttributeError
    # around the call, so an unrelated AttributeError is never swallowed.
    native_uuid7 = getattr(uuid, "uuid7", None)
    if native_uuid7 is not None:
        return native_uuid7()

    # Fallback for interpreters without uuid.uuid7().
    import secrets
    import time

    # Current Unix time in milliseconds, truncated to the 48-bit field.
    timestamp_ms = (time.time_ns() // 1_000_000) & 0xFFFF_FFFF_FFFF

    # RFC 9562 layout, most significant bits first:
    #   unix_ts_ms (48) | ver (4) | rand_a (12) | var (2) | rand_b (62)
    uuid_int = timestamp_ms << 80            # 48-bit timestamp
    uuid_int |= 0x7 << 76                    # version 7
    uuid_int |= secrets.randbits(12) << 64   # rand_a (previously left as zero)
    uuid_int |= 0b10 << 62                   # RFC 4122 / RFC 9562 variant
    uuid_int |= secrets.randbits(62)         # rand_b

    return uuid.UUID(int=uuid_int)
|
|
|
|
|
|
def enrich_with_uuids(custodian: HeritageCustodian) -> HeritageCustodian:
    """
    Populate the UUID identifier fields on a HeritageCustodian.

    The custodian is modified in place and also returned.

    Fields written:
    - ghcid_uuid: result of GHCIDComponents.to_uuid() (UUID v5)
    - ghcid_uuid_sha256: result of GHCIDComponents.to_uuid_sha256() (UUID v8)
    - record_id: a fresh UUID v7 from generate_uuid_v7()

    The two GHCID-derived fields are only set when the custodian has a
    GHCID history whose first entry splits into at least five
    dash-separated parts; record_id is always set.

    Args:
        custodian: HeritageCustodian instance

    Returns:
        The same HeritageCustodian instance with UUID fields populated
    """
    history = custodian.ghcid_history
    if history and len(history) > 0:
        # The first history entry is treated as the current GHCID.
        current_ghcid = history[0].ghcid

        from glam_extractor.identifiers.ghcid import GHCIDComponents

        # Example GHCID: AR-CA-CIU-L-BPHLR. Splitting at most four times
        # keeps any further dashes inside the abbreviation segment.
        segments = current_ghcid.split('-', 4)
        if len(segments) == 5:
            country, region, city, inst_type, abbreviation = segments
            ghcid_components = GHCIDComponents(
                country_code=country,
                region_code=region,
                city_locode=city,
                institution_type=inst_type,
                abbreviation=abbreviation,
            )

            # Derive both persistent identifiers from the GHCID components.
            custodian.ghcid_uuid = ghcid_components.to_uuid()
            custodian.ghcid_uuid_sha256 = ghcid_components.to_uuid_sha256()

    # The database record ID does not depend on the GHCID.
    custodian.record_id = generate_uuid_v7()

    return custodian
|
|
|
|
|
|
def _coverage_percent(count, total):
    """Return count as a percentage of total, or 0.0 when total is zero."""
    return 100 * count / total if total else 0.0


def _sample_record(custodian):
    """Build the JSON-serialisable summary dict exported for one custodian."""
    locations = custodian.locations
    primary = locations[0] if locations else None

    # NOTE: the truthiness test also treats a latitude of exactly 0.0 as
    # "no coordinates"; kept as-is to preserve the existing export format.
    if primary is not None and primary.latitude:
        coordinates = [primary.latitude, primary.longitude]
    else:
        coordinates = None

    return {
        "name": custodian.name,
        "conabip_reg": custodian.id,
        "ghcid_current": custodian.ghcid_current,
        "ghcid_numeric": custodian.ghcid_numeric,
        "ghcid_uuid_v5": str(custodian.ghcid_uuid) if custodian.ghcid_uuid else None,
        "ghcid_uuid_v8": str(custodian.ghcid_uuid_sha256) if custodian.ghcid_uuid_sha256 else None,
        "record_id_v7": str(custodian.record_id) if custodian.record_id else None,
        "location": {
            "city": primary.city if primary is not None else None,
            "province": primary.region if primary is not None else None,
            "coordinates": coordinates,
        },
    }


def main():
    """
    Main execution: Parse, enrich with UUIDs, export.

    Parses the CONABIP input file, adds UUIDs to every institution, prints
    coverage statistics, writes a five-institution sample and an activity
    log as JSON under data/instances/argentina, and prints the first three
    results. Paths are relative to the current working directory.

    An empty parse result is reported as 0.0% coverage instead of raising
    ZeroDivisionError.

    Returns:
        List of enriched custodians, for potential further processing
    """
    print("=" * 70)
    print("ARGENTINA CONABIP - UUID GENERATION")
    print("=" * 70)
    print()

    # Paths
    input_file = Path("data/isil/AR/conabip_libraries_enhanced_FULL.json")
    output_dir = Path("data/instances/argentina")
    output_dir.mkdir(parents=True, exist_ok=True)

    activity_log = []

    # Step 1: Parse CONABIP data
    print("Step 1: Parsing CONABIP data...")
    parser = ArgentinaCONABIPParser()
    custodians = parser.parse_and_convert(input_file)
    print(f"✓ Parsed {len(custodians)} institutions")
    activity_log.append({
        "step": "parse",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "institutions_parsed": len(custodians),
        "input_file": str(input_file),
    })

    # Step 2: Generate UUIDs
    print("\nStep 2: Generating UUIDs (v5, v7, v8)...")
    enriched_custodians = []
    for position, custodian in enumerate(custodians, start=1):
        enriched_custodians.append(enrich_with_uuids(custodian))

        # Progress indicator every 50 institutions
        if position % 50 == 0:
            print(f" Processed {position}/{len(custodians)} institutions...")

    total = len(enriched_custodians)
    print(f"✓ Generated UUIDs for {total} institutions")
    activity_log.append({
        "step": "uuid_generation",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "institutions_enriched": total,
        "uuid_types": ["v5", "v7", "v8"],
    })

    # Step 3: Validate UUID coverage
    print("\nStep 3: Validating UUID coverage...")
    with_uuid_v5 = sum(1 for c in enriched_custodians if c.ghcid_uuid)
    with_uuid_v7 = sum(1 for c in enriched_custodians if c.record_id)
    with_uuid_v8 = sum(1 for c in enriched_custodians if c.ghcid_uuid_sha256)

    print(f" UUID v5 coverage: {with_uuid_v5}/{total} ({_coverage_percent(with_uuid_v5, total):.1f}%)")
    print(f" UUID v7 coverage: {with_uuid_v7}/{total} ({_coverage_percent(with_uuid_v7, total):.1f}%)")
    print(f" UUID v8 coverage: {with_uuid_v8}/{total} ({_coverage_percent(with_uuid_v8, total):.1f}%)")

    activity_log.append({
        "step": "validation",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "uuid_v5_coverage": with_uuid_v5,
        "uuid_v7_coverage": with_uuid_v7,
        "uuid_v8_coverage": with_uuid_v8,
        "total_institutions": total,
    })

    # Step 4: Export sample (first 5 institutions for verification)
    print("\nStep 4: Exporting sample data...")
    sample_output = output_dir / "argentina_conabip_sample_with_uuids.json"
    sample_data = [_sample_record(c) for c in enriched_custodians[:5]]

    with open(sample_output, 'w', encoding='utf-8') as f:
        json.dump(sample_data, f, indent=2, ensure_ascii=False)

    print(f"✓ Exported sample to {sample_output}")
    activity_log.append({
        "step": "export_sample",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "output_file": str(sample_output),
        "sample_size": len(sample_data),
    })

    # Show sample results on the console (not a numbered step in the output)
    print("\n" + "=" * 70)
    print("SAMPLE RESULTS (First 3 Institutions)")
    print("=" * 70)
    for i, custodian in enumerate(enriched_custodians[:3], 1):
        print(f"\n{i}. {custodian.name}")
        print(f" GHCID: {custodian.ghcid_current}")
        print(f" UUID v5: {custodian.ghcid_uuid}")
        print(f" UUID v8: {custodian.ghcid_uuid_sha256}")
        print(f" Record ID: {custodian.record_id}")
        print(f" Numeric: {custodian.ghcid_numeric}")

    # Step 5: Save activity log
    print("\n" + "=" * 70)
    print("Step 5: Saving activity log...")
    log_file = output_dir / "uuid_generation_log.json"

    # Success means an institution received all three UUIDs, so the rate is
    # bounded by the least-covered UUID type.
    fully_covered = min(with_uuid_v5, with_uuid_v7, with_uuid_v8)
    with open(log_file, 'w', encoding='utf-8') as f:
        json.dump({
            "session": "Argentina CONABIP UUID Generation",
            "date": datetime.now(timezone.utc).isoformat(),
            "total_institutions": total,
            "activities": activity_log,
            "summary": {
                "uuid_v5_generated": with_uuid_v5,
                "uuid_v7_generated": with_uuid_v7,
                "uuid_v8_generated": with_uuid_v8,
                "success_rate": f"{_coverage_percent(fully_covered, total):.1f}%",
            },
        }, f, indent=2, ensure_ascii=False)

    print(f"✓ Activity log saved to {log_file}")

    print("\n" + "=" * 70)
    print("✅ UUID GENERATION COMPLETE")
    print("=" * 70)
    print("\nNext step: Wikidata enrichment to add Q-numbers")

    # Return enriched custodians for potential further processing
    return enriched_custodians
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger parsing or file output via main().
if __name__ == "__main__":
    main()
|