#!/usr/bin/env python3
"""
Export Argentina Heritage Institutions to LinkML YAML

Converts Argentine heritage institution data from JSON to LinkML-compliant
YAML format:
1. CONABIP libraries (288 popular libraries)
2. AGN (Archivo General de la Nación)

Output: data/instances/argentina/ directory with batch YAML files
"""
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import List
import uuid

import yaml

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / "src"))

from glam_extractor.parsers.argentina_conabip import ArgentinaCONABIPParser
from glam_extractor.models import (
    HeritageCustodian,
    Identifier,
    InstitutionTypeEnum,
    Location,
    Provenance,
    DataSourceEnum,
    DataTierEnum,
    GHCIDHistoryEntry,
)
from glam_extractor.identifiers.ghcid import (
    GHCIDGenerator,
    InstitutionType as GHCIDInstitutionType,
)


def generate_uuids_for_custodian(custodian: HeritageCustodian) -> None:
    """
    Generate all UUID variants for a custodian using GHCID components.

    Modifies custodian in-place with:
    - ghcid_uuid (UUID v5 from GHCID string)
    - ghcid_uuid_sha256 (UUID v8 from GHCID string)
    - record_id (UUID v7 time-ordered)
    """
    if not custodian.ghcid_current:
        print(f"Warning: No GHCID for {custodian.name}, skipping UUID generation")
        return

    # Parse GHCID string back into components.
    # GHCID format: AR-BA-BUE-L-BPH (country-region-city-type-abbrev)
    # NOTE(review): splitting on '-' assumes the abbreviation segment never
    # contains a hyphen itself — confirm against the GHCID spec.
    parts = custodian.ghcid_current.split('-')
    if len(parts) < 5:
        print(f"Warning: Invalid GHCID format for {custodian.name}: {custodian.ghcid_current}")
        return

    # Reconstruct GHCIDComponents (local import kept from the original code;
    # the package is importable via the sys.path insertion at module top)
    from glam_extractor.identifiers.ghcid import GHCIDComponents
    components = GHCIDComponents(
        country_code=parts[0],
        region_code=parts[1],
        city_locode=parts[2],
        institution_type=parts[3],
        abbreviation=parts[4],
        wikidata_qid=parts[5] if len(parts) > 5 else None,
    )

    # Generate UUIDs from GHCID components
    uuid_v5 = components.to_uuid()
    uuid_v8 = components.to_uuid_sha256()
    uuid_v7 = components.generate_uuid_v7()  # Time-ordered UUID for database record ID

    custodian.ghcid_uuid = str(uuid_v5)
    custodian.ghcid_uuid_sha256 = str(uuid_v8)
    custodian.record_id = str(uuid_v7)


def parse_agn_json(filepath: Path) -> HeritageCustodian:
    """
    Parse AGN (Archivo General de la Nación) JSON file.

    Returns single HeritageCustodian for the national archive.

    Raises:
        ValueError: if the JSON contains no institutions.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if 'institutions' not in data or len(data['institutions']) == 0:
        raise ValueError("AGN JSON must contain at least one institution")

    agn = data['institutions'][0]

    # Generate GHCID
    generator = GHCIDGenerator()
    ghcid_components = generator.generate(
        institution_name=agn['name'],
        english_name=agn.get('name_en', agn['name']),
        institution_type=GHCIDInstitutionType.ARCHIVE,
        country_code="AR",
        region_code="CA",   # Ciudad Autónoma de Buenos Aires
        city_locode="BUE",  # Buenos Aires
    )
    ghcid_string = ghcid_components.to_string()
    ghcid_numeric = ghcid_components.to_numeric()

    # Create identifiers
    identifiers = [
        Identifier(
            identifier_scheme="Website",
            identifier_value=agn['url'],
            identifier_url=agn['url'],
        )
    ]

    # Create location
    location = Location(
        location_type="Physical Address",
        city=agn['city'],
        region="AR-C",  # ISO 3166-2:AR for CABA
        country="AR",
        is_primary=True,
    )

    # Create provenance; normalize trailing 'Z' so fromisoformat accepts it
    # on Python < 3.11
    extraction_date = datetime.fromisoformat(agn['extraction_date'].replace('Z', '+00:00'))
    provenance = Provenance(
        data_source=DataSourceEnum.WEB_CRAWL,
        data_tier=DataTierEnum.TIER_2_VERIFIED,
        extraction_date=extraction_date,
        extraction_method="AGN web scraper (scrape_agn_argentina.py)",
        confidence_score=0.95,
        source_url=agn['url'],
    )

    # Create GHCID history
    ghcid_history = [
        GHCIDHistoryEntry(
            ghcid=ghcid_string,
            ghcid_numeric=ghcid_numeric,
            valid_from=extraction_date,
            valid_to=None,
            reason="Initial assignment from AGN web scraping",
            institution_name=agn['name'],
            location_city=agn['city'],
            location_country="AR",
        )
    ]

    # Create HeritageCustodian
    custodian = HeritageCustodian(
        id="AR-AGN-001",
        name=agn['name'],
        alternative_names=[agn['name_en']] if 'name_en' in agn else None,
        institution_type=InstitutionTypeEnum.ARCHIVE,
        description=agn.get('description'),
        homepage=agn['url'],
        identifiers=identifiers,
        locations=[location],
        provenance=provenance,
        ghcid_current=ghcid_string,
        ghcid_original=ghcid_string,
        ghcid_numeric=ghcid_numeric,
        ghcid_history=ghcid_history,
    )

    # Generate UUIDs
    generate_uuids_for_custodian(custodian)

    return custodian


def linkml_to_dict(obj):
    """
    Recursively convert LinkML objects to plain Python dicts.

    Handles:
    - LinkML dataclass objects
    - Enum PermissibleValues
    - Lists and nested structures
    - Datetime objects
    """
    from datetime import datetime
    from dataclasses import is_dataclass, fields
    from linkml_runtime.linkml_model.meta import PermissibleValue

    if obj is None:
        return None

    # Handle datetime objects
    if isinstance(obj, datetime):
        return obj.isoformat()

    # Handle LinkML enum PermissibleValues
    if isinstance(obj, PermissibleValue):
        return obj.text

    # Handle LinkML enum proxy objects (they have _code attribute)
    if hasattr(obj, '_code') and isinstance(obj._code, PermissibleValue):
        return obj._code.text

    # Handle dataclass objects; None-valued fields are dropped from the output
    if is_dataclass(obj):
        result = {}
        for field in fields(obj):
            value = getattr(obj, field.name)
            if value is not None:
                result[field.name] = linkml_to_dict(value)
        return result

    # Handle lists
    if isinstance(obj, list):
        return [linkml_to_dict(item) for item in obj]

    # Handle dicts
    if isinstance(obj, dict):
        return {k: linkml_to_dict(v) for k, v in obj.items()}

    # Handle simple types (str, int, float, bool)
    return obj


def export_to_yaml(custodians: List[HeritageCustodian], output_file: Path) -> None:
    """
    Export HeritageCustodian instances to LinkML-compliant YAML.

    Converts LinkML dataclass objects to plain Python dicts for clean
    YAML output. Creates parent directories as needed.
    """
    # Convert LinkML models to dicts
    data = [linkml_to_dict(custodian) for custodian in custodians]

    # Write YAML with a commented provenance header
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write("# Argentina Heritage Institutions - LinkML Export\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Total institutions: {len(custodians)}\n")
        f.write("\n")
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"✅ Exported {len(custodians)} institutions to {output_file}")


def main():
    """Main execution"""
    print("=" * 80)
    print("Argentina Heritage Institution LinkML Export")
    print("=" * 80)

    # Paths
    project_root = Path(__file__).parent.parent
    data_dir = project_root / "data"
    instances_dir = data_dir / "instances" / "argentina"
    conabip_json = data_dir / "isil" / "AR" / "conabip_libraries_wikidata_enriched.json"
    agn_json = data_dir / "isil" / "AR" / "agn_argentina_archives.json"

    # Check files exist
    if not conabip_json.exists():
        print(f"❌ CONABIP file not found: {conabip_json}")
        return 1
    if not agn_json.exists():
        print(f"❌ AGN file not found: {agn_json}")
        return 1

    print("\n📂 Input files:")
    print(f"  - CONABIP: {conabip_json}")
    print(f"  - AGN: {agn_json}")

    # Parse CONABIP libraries
    print("\n🔄 Parsing CONABIP libraries...")
    conabip_parser = ArgentinaCONABIPParser()
    conabip_custodians = conabip_parser.parse_and_convert(conabip_json)

    # Generate UUIDs for CONABIP libraries
    print(f"🔄 Generating UUIDs for {len(conabip_custodians)} CONABIP libraries...")
    for custodian in conabip_custodians:
        generate_uuids_for_custodian(custodian)
    print(f"✅ Parsed {len(conabip_custodians)} CONABIP libraries")

    # Parse AGN
    print("\n🔄 Parsing AGN archive...")
    agn_custodian = parse_agn_json(agn_json)
    print(f"✅ Parsed AGN: {agn_custodian.name}")

    # Export in batches (100 institutions per file to keep files manageable)
    BATCH_SIZE = 100
    print(f"\n📤 Exporting to YAML (batch size: {BATCH_SIZE})...")

    # Export CONABIP in batches (ceiling division for the batch count)
    total_batches = (len(conabip_custodians) + BATCH_SIZE - 1) // BATCH_SIZE
    for batch_num in range(total_batches):
        start_idx = batch_num * BATCH_SIZE
        end_idx = min(start_idx + BATCH_SIZE, len(conabip_custodians))
        batch = conabip_custodians[start_idx:end_idx]
        output_file = instances_dir / f"conabip_libraries_batch{batch_num + 1:02d}.yaml"
        export_to_yaml(batch, output_file)

    # Export AGN separately
    agn_output = instances_dir / "agn_archive.yaml"
    export_to_yaml([agn_custodian], agn_output)

    # Summary
    print("\n" + "=" * 80)
    print("✅ Export Complete")
    print("=" * 80)
    print("📊 Statistics:")
    print(f"  - CONABIP libraries: {len(conabip_custodians)}")
    print("  - AGN archive: 1")
    print(f"  - Total institutions: {len(conabip_custodians) + 1}")
    print(f"\n📁 Output directory: {instances_dir}")
    print(f"  - CONABIP batches: {total_batches} files")
    print("  - AGN: 1 file")
    print("\n🎯 Next Steps:")
    print("  1. Review exported YAML files")
    print("  2. Send IRAM email requesting official ISIL registry")
    print("  3. Wait for IRAM response (1-2 weeks)")
    print("  4. Cross-reference with Wikidata for enrichment")

    return 0


if __name__ == "__main__":
    sys.exit(main())