#!/usr/bin/env python3
"""
Generate UUIDs for Argentina CONABIP Libraries

This script:
1. Loads parsed Argentina CONABIP data (288 institutions)
2. Generates UUID v5, UUID v7, UUID v8 for each institution
3. Exports a small sample of the enriched data (first 5 institutions)
   with all persistent identifiers, for verification
4. Creates activity log for tracking

UUID Types:
- UUID v5 (SHA-1): Primary persistent identifier (RFC 4122)
- UUID v8 (SHA-256): Secondary identifier (future-proofing)
- UUID v7 (time-ordered): Database record ID

Author: OpenCode AI Agent
Date: 2025-11-17
"""

import sys
import json
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import List

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

# Import directly to avoid broken __init__.py
import importlib.util

# Load argentina_conabip parser directly
parser_path = (
    Path(__file__).parent.parent
    / "src/glam_extractor/parsers/argentina_conabip.py"
)
spec = importlib.util.spec_from_file_location(
    "argentina_conabip",
    parser_path
)
if spec and spec.loader:
    argentina_module = importlib.util.module_from_spec(spec)
    sys.modules["argentina_conabip"] = argentina_module
    spec.loader.exec_module(argentina_module)
    ArgentinaCONABIPParser = argentina_module.ArgentinaCONABIPParser
else:
    raise ImportError("Could not load argentina_conabip parser")

from glam_extractor.models import HeritageCustodian


def generate_uuid_v7() -> uuid.UUID:
    """
    Generate UUID v7 (time-ordered, RFC 9562).

    UUID v7 embeds a millisecond Unix timestamp in its most significant
    bits, so values sort roughly by creation time.

    Uses the standard library's ``uuid.uuid7()`` when the running
    interpreter provides it (added in Python 3.14); otherwise builds the
    value manually from the RFC 9562 field layout.

    Returns:
        UUID v7 with embedded timestamp
    """
    native_uuid7 = getattr(uuid, "uuid7", None)
    if native_uuid7 is not None:
        return native_uuid7()

    # Fallback for interpreters without uuid.uuid7().
    import secrets
    import time

    # Current Unix time in milliseconds, limited to the 48-bit field.
    timestamp_ms = (time.time_ns() // 1_000_000) & ((1 << 48) - 1)

    # RFC 9562 layout (most significant bit first):
    #   unix_ts_ms (48) | ver (4) | rand_a (12) | var (2) | rand_b (62)
    uuid_int = timestamp_ms << 80          # 48-bit timestamp
    uuid_int |= 0x7 << 76                  # Version 7
    uuid_int |= secrets.randbits(12) << 64  # rand_a: 12 random bits
    uuid_int |= 0b10 << 62                 # Variant (RFC 4122 / RFC 9562)
    uuid_int |= secrets.randbits(62)       # rand_b: 62 random bits

    return uuid.UUID(int=uuid_int)


def enrich_with_uuids(custodian: HeritageCustodian) -> HeritageCustodian:
    """
    Add UUID identifiers to HeritageCustodian instance.

    Generates:
    - UUID v5 (from GHCID string via GHCIDComponents)
    - UUID v8 (from GHCID string via GHCIDComponents)
    - UUID v7 (time-ordered for database)

    The GHCID-derived UUIDs are only set when the custodian has a GHCID
    history entry whose GHCID splits into at least five '-'-separated
    parts; otherwise those fields are left untouched. The UUID v7
    ``record_id`` is always assigned.

    Args:
        custodian: HeritageCustodian instance (mutated in place)

    Returns:
        The same HeritageCustodian with UUID fields populated
    """
    # UUIDs are generated from GHCID via GHCIDComponents.
    # The parser already creates ghcid_numeric, but not UUID v5/v8,
    # so regenerate GHCIDComponents to get the UUIDs.
    if custodian.ghcid_history:
        # Extract GHCID string from the first history entry
        ghcid_str = custodian.ghcid_history[0].ghcid

        from glam_extractor.identifiers.ghcid import GHCIDComponents

        # Parse GHCID string: AR-CA-CIU-L-BPHLR
        parts = ghcid_str.split('-')

        if len(parts) >= 5:
            country, region, city, inst_type = parts[:4]
            # Everything after the fourth field belongs to the
            # abbreviation, which may itself contain '-'.
            abbreviation = '-'.join(parts[4:])

            components = GHCIDComponents(
                country_code=country,
                region_code=region,
                city_locode=city,
                institution_type=inst_type,
                abbreviation=abbreviation
            )

            # Generate UUIDs from GHCID
            custodian.ghcid_uuid = components.to_uuid()
            custodian.ghcid_uuid_sha256 = components.to_uuid_sha256()

    # Generate UUID v7 for database record
    custodian.record_id = generate_uuid_v7()

    return custodian


def _percentage(count: int, total: int) -> float:
    """Return ``count`` as a percentage of ``total`` (0.0 when total is 0)."""
    return 100 * count / total if total else 0.0


def _sample_record(custodian: HeritageCustodian) -> dict:
    """Build the JSON-serialisable verification record for one custodian."""
    location = custodian.locations[0] if custodian.locations else None
    # NOTE(review): a latitude of 0 / 0.0 is falsy and is therefore treated
    # the same as a missing coordinate here (behaviour kept from original).
    has_coordinates = location is not None and location.latitude
    return {
        "name": custodian.name,
        "conabip_reg": custodian.id,
        "ghcid_current": custodian.ghcid_current,
        "ghcid_numeric": custodian.ghcid_numeric,
        "ghcid_uuid_v5": str(custodian.ghcid_uuid) if custodian.ghcid_uuid else None,
        "ghcid_uuid_v8": (
            str(custodian.ghcid_uuid_sha256)
            if custodian.ghcid_uuid_sha256 else None
        ),
        "record_id_v7": str(custodian.record_id) if custodian.record_id else None,
        "location": {
            "city": location.city if location is not None else None,
            "province": location.region if location is not None else None,
            "coordinates": (
                [location.latitude, location.longitude]
                if has_coordinates else None
            )
        }
    }


def main() -> List[HeritageCustodian]:
    """
    Main execution: Parse, enrich with UUIDs, export.

    Reads the CONABIP JSON file, enriches every parsed institution with
    UUIDs, prints coverage statistics, writes a five-institution sample and
    an activity log under ``data/instances/argentina`` (paths are relative
    to the current working directory).

    Returns:
        The list of enriched HeritageCustodian instances.
    """
    print("=" * 70)
    print("ARGENTINA CONABIP - UUID GENERATION")
    print("=" * 70)
    print()

    # Paths
    input_file = Path("data/isil/AR/conabip_libraries_enhanced_FULL.json")
    output_dir = Path("data/instances/argentina")
    output_dir.mkdir(parents=True, exist_ok=True)

    activity_log = []

    # Step 1: Parse CONABIP data
    print("Step 1: Parsing CONABIP data...")
    parser = ArgentinaCONABIPParser()
    custodians = parser.parse_and_convert(input_file)
    print(f"✓ Parsed {len(custodians)} institutions")

    activity_log.append({
        "step": "parse",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "institutions_parsed": len(custodians),
        "input_file": str(input_file)
    })

    # Step 2: Generate UUIDs
    print("\nStep 2: Generating UUIDs (v5, v7, v8)...")
    enriched_custodians = []

    for i, custodian in enumerate(custodians):
        enriched = enrich_with_uuids(custodian)
        enriched_custodians.append(enriched)

        # Progress indicator
        if (i + 1) % 50 == 0:
            print(f" Processed {i + 1}/{len(custodians)} institutions...")

    print(f"✓ Generated UUIDs for {len(enriched_custodians)} institutions")

    activity_log.append({
        "step": "uuid_generation",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "institutions_enriched": len(enriched_custodians),
        "uuid_types": ["v5", "v7", "v8"]
    })

    # Step 3: Validate UUID coverage
    print("\nStep 3: Validating UUID coverage...")
    total = len(enriched_custodians)
    with_uuid_v5 = sum(1 for c in enriched_custodians if c.ghcid_uuid)
    with_uuid_v7 = sum(1 for c in enriched_custodians if c.record_id)
    with_uuid_v8 = sum(1 for c in enriched_custodians if c.ghcid_uuid_sha256)

    # _percentage() guards against an empty result set, which would
    # otherwise raise ZeroDivisionError here.
    print(f" UUID v5 coverage: {with_uuid_v5}/{total} ({_percentage(with_uuid_v5, total):.1f}%)")
    print(f" UUID v7 coverage: {with_uuid_v7}/{total} ({_percentage(with_uuid_v7, total):.1f}%)")
    print(f" UUID v8 coverage: {with_uuid_v8}/{total} ({_percentage(with_uuid_v8, total):.1f}%)")

    activity_log.append({
        "step": "validation",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "uuid_v5_coverage": with_uuid_v5,
        "uuid_v7_coverage": with_uuid_v7,
        "uuid_v8_coverage": with_uuid_v8,
        "total_institutions": total
    })

    # Step 4: Export sample (first 5 institutions for verification)
    print("\nStep 4: Exporting sample data...")
    sample_output = output_dir / "argentina_conabip_sample_with_uuids.json"

    sample_data = [_sample_record(c) for c in enriched_custodians[:5]]

    with open(sample_output, 'w', encoding='utf-8') as f:
        json.dump(sample_data, f, indent=2, ensure_ascii=False)

    print(f"✓ Exported sample to {sample_output}")

    activity_log.append({
        "step": "export_sample",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "output_file": str(sample_output),
        "sample_size": len(sample_data)
    })

    # Show sample results (console only; not a numbered step in the output)
    print("\n" + "=" * 70)
    print("SAMPLE RESULTS (First 3 Institutions)")
    print("=" * 70)

    for i, custodian in enumerate(enriched_custodians[:3], 1):
        print(f"\n{i}. {custodian.name}")
        print(f" GHCID: {custodian.ghcid_current}")
        print(f" UUID v5: {custodian.ghcid_uuid}")
        print(f" UUID v8: {custodian.ghcid_uuid_sha256}")
        print(f" Record ID: {custodian.record_id}")
        print(f" Numeric: {custodian.ghcid_numeric}")

    # Step 5: Save activity log
    print("\n" + "=" * 70)
    print("Step 5: Saving activity log...")
    log_file = output_dir / "uuid_generation_log.json"

    # The success rate is driven by the least-covered UUID type.
    lowest_coverage = min(with_uuid_v5, with_uuid_v7, with_uuid_v8)

    with open(log_file, 'w', encoding='utf-8') as f:
        json.dump({
            "session": "Argentina CONABIP UUID Generation",
            "date": datetime.now(timezone.utc).isoformat(),
            "total_institutions": total,
            "activities": activity_log,
            "summary": {
                "uuid_v5_generated": with_uuid_v5,
                "uuid_v7_generated": with_uuid_v7,
                "uuid_v8_generated": with_uuid_v8,
                "success_rate": f"{_percentage(lowest_coverage, total):.1f}%"
            }
        }, f, indent=2, ensure_ascii=False)

    print(f"✓ Activity log saved to {log_file}")

    print("\n" + "=" * 70)
    print("✅ UUID GENERATION COMPLETE")
    print("=" * 70)
    print("\nNext step: Wikidata enrichment to add Q-numbers")

    # Return enriched custodians for potential further processing
    return enriched_custodians


if __name__ == "__main__":
    main()