# glam/scripts/export_argentina_to_linkml.py
# Snapshot: 2025-11-19 23:25:22 +01:00
# 337 lines, 10 KiB, Python

#!/usr/bin/env python3
"""
Export Argentina Heritage Institutions to LinkML YAML
Converts Argentine heritage institution data from JSON to LinkML-compliant YAML format:
1. CONABIP libraries (288 popular libraries)
2. AGN (Archivo General de la Nación)
Output: data/instances/argentina/ directory with batch YAML files
"""
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import List
import uuid
import yaml
# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / "src"))
from glam_extractor.parsers.argentina_conabip import ArgentinaCONABIPParser
from glam_extractor.models import (
HeritageCustodian,
Identifier,
InstitutionTypeEnum,
Location,
Provenance,
DataSourceEnum,
DataTierEnum,
GHCIDHistoryEntry,
)
from glam_extractor.identifiers.ghcid import (
GHCIDGenerator,
InstitutionType as GHCIDInstitutionType,
)
def generate_uuids_for_custodian(custodian: HeritageCustodian) -> None:
    """
    Populate every UUID variant on a custodian, derived from its GHCID.

    Mutates the custodian in-place, setting:
    - ghcid_uuid        (UUID v5 derived from the GHCID string)
    - ghcid_uuid_sha256 (UUID v8 derived from the GHCID string)
    - record_id         (UUID v7, time-ordered, for database records)

    Prints a warning and leaves the custodian untouched when the GHCID
    is missing or malformed.
    """
    ghcid = custodian.ghcid_current
    if not ghcid:
        print(f"Warning: No GHCID for {custodian.name}, skipping UUID generation")
        return
    # A GHCID looks like AR-BA-BUE-L-BPH (country-region-city-type-abbrev),
    # optionally followed by a Wikidata QID as a sixth segment.
    segments = ghcid.split('-')
    if len(segments) < 5:
        print(f"Warning: Invalid GHCID format for {custodian.name}: {ghcid}")
        return
    # Imported lazily, mirroring the lazy-import style used elsewhere here.
    from glam_extractor.identifiers.ghcid import GHCIDComponents
    components = GHCIDComponents(
        country_code=segments[0],
        region_code=segments[1],
        city_locode=segments[2],
        institution_type=segments[3],
        abbreviation=segments[4],
        wikidata_qid=segments[5] if len(segments) > 5 else None
    )
    # Derive all three UUID variants from the reconstructed components.
    custodian.ghcid_uuid = str(components.to_uuid())
    custodian.ghcid_uuid_sha256 = str(components.to_uuid_sha256())
    # Time-ordered UUID, intended as the database record ID.
    custodian.record_id = str(components.generate_uuid_v7())
def parse_agn_json(filepath: Path) -> HeritageCustodian:
    """
    Parse the AGN (Archivo General de la Nación) JSON file.

    Args:
        filepath: Path to the scraped AGN JSON. Must contain a non-empty
            ``institutions`` list; only the first entry is used.

    Returns:
        A fully-populated HeritageCustodian for the national archive,
        including GHCID, identifiers, location, provenance and UUIDs.

    Raises:
        ValueError: If the JSON contains no institutions.
        KeyError: If required fields (``name``, ``url``, ``city``,
            ``extraction_date``) are missing from the institution record.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if 'institutions' not in data or len(data['institutions']) == 0:
        raise ValueError("AGN JSON must contain at least one institution")
    agn = data['institutions'][0]
    # Generate the GHCID: the AGN is fixed in Buenos Aires.
    generator = GHCIDGenerator()
    ghcid_components = generator.generate(
        institution_name=agn['name'],
        english_name=agn.get('name_en', agn['name']),
        institution_type=GHCIDInstitutionType.ARCHIVE,
        country_code="AR",
        region_code="CA",  # Ciudad Autónoma de Buenos Aires
        city_locode="BUE"  # Buenos Aires
    )
    ghcid_string = ghcid_components.to_string()
    ghcid_numeric = ghcid_components.to_numeric()
    # The archive's website is its only external identifier at this tier.
    identifiers = [
        Identifier(
            identifier_scheme="Website",
            identifier_value=agn['url'],
            identifier_url=agn['url']
        )
    ]
    # Physical location of the archive.
    location = Location(
        location_type="Physical Address",
        city=agn['city'],
        region="AR-C",  # ISO 3166-2:AR for CABA
        country="AR",
        is_primary=True
    )
    # Normalize a trailing 'Z' so datetime.fromisoformat() accepts it
    # on Python versions older than 3.11.
    extraction_date = datetime.fromisoformat(agn['extraction_date'].replace('Z', '+00:00'))
    provenance = Provenance(
        data_source=DataSourceEnum.WEB_CRAWL,
        data_tier=DataTierEnum.TIER_2_VERIFIED,
        extraction_date=extraction_date,
        extraction_method="AGN web scraper (scrape_agn_argentina.py)",
        confidence_score=0.95,
        source_url=agn['url']
    )
    # Open-ended history entry (valid_to=None) records the initial assignment.
    ghcid_history = [
        GHCIDHistoryEntry(
            ghcid=ghcid_string,
            ghcid_numeric=ghcid_numeric,
            valid_from=extraction_date,
            valid_to=None,
            reason="Initial assignment from AGN web scraping",
            institution_name=agn['name'],
            location_city=agn['city'],
            location_country="AR"
        )
    ]
    custodian = HeritageCustodian(
        id="AR-AGN-001",  # fixed: was a needless f-string (F541)
        name=agn['name'],
        alternative_names=[agn['name_en']] if 'name_en' in agn else None,
        institution_type=InstitutionTypeEnum.ARCHIVE,
        description=agn.get('description'),
        homepage=agn['url'],
        identifiers=identifiers,
        locations=[location],
        provenance=provenance,
        ghcid_current=ghcid_string,
        ghcid_original=ghcid_string,
        ghcid_numeric=ghcid_numeric,
        ghcid_history=ghcid_history
    )
    # Derive ghcid_uuid / ghcid_uuid_sha256 / record_id from the GHCID.
    generate_uuids_for_custodian(custodian)
    return custodian
def linkml_to_dict(obj):
    """
    Recursively convert a LinkML object graph to plain Python values.

    Handles:
    - LinkML dataclass objects (None-valued fields are dropped)
    - LinkML enum PermissibleValues and enum proxy objects
    - Lists, tuples, and dicts (converted element-wise; tuples become lists
      so yaml.dump emits no Python-specific tags)
    - datetime/date/time objects (rendered as ISO-8601 strings)
    - Simple scalars (str, int, float, bool), returned unchanged
    """
    from datetime import date, datetime, time
    from dataclasses import is_dataclass, fields
    # linkml_runtime is only needed for enum handling; degrade gracefully
    # when it is not installed so plain dataclasses still convert.
    try:
        from linkml_runtime.linkml_model.meta import PermissibleValue
    except ImportError:
        PermissibleValue = None
    if obj is None:
        return None
    # ISO-8601 text is what we want in YAML output. datetime subclasses
    # date, so (date, time) covers date, time and datetime alike.
    if isinstance(obj, (date, time)):
        return obj.isoformat()
    if PermissibleValue is not None:
        # LinkML enum PermissibleValues serialize as their text.
        if isinstance(obj, PermissibleValue):
            return obj.text
        # LinkML enum proxy objects carry their PermissibleValue in _code.
        if hasattr(obj, '_code') and isinstance(obj._code, PermissibleValue):
            return obj._code.text
    if is_dataclass(obj):
        # Drop None-valued fields to keep the YAML output compact.
        return {
            field.name: linkml_to_dict(getattr(obj, field.name))
            for field in fields(obj)
            if getattr(obj, field.name) is not None
        }
    if isinstance(obj, (list, tuple)):
        return [linkml_to_dict(item) for item in obj]
    if isinstance(obj, dict):
        return {k: linkml_to_dict(v) for k, v in obj.items()}
    # Simple scalar: pass through unchanged.
    return obj
def export_to_yaml(custodians: List[HeritageCustodian], output_file: Path) -> None:
    """
    Write HeritageCustodian instances to a LinkML-compliant YAML file.

    The custodians are first converted to plain dicts so the YAML contains
    no Python-specific object tags. A commented header carrying the
    generation timestamp and institution count precedes the document.
    """
    records = [linkml_to_dict(c) for c in custodians]
    # Ensure the target directory exists before writing.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    header = [
        "---\n",
        "# Argentina Heritage Institutions - LinkML Export\n",
        f"# Generated: {datetime.now(timezone.utc).isoformat()}\n",
        f"# Total institutions: {len(custodians)}\n",
        "\n",
    ]
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(header)
        yaml.dump(records, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f"✅ Exported {len(custodians)} institutions to {output_file}")
def main():
    """Entry point: parse Argentine sources and export LinkML YAML batches."""
    print("=" * 80)
    print("Argentina Heritage Institution LinkML Export")
    print("=" * 80)
    # Resolve input/output paths relative to the project root.
    root = Path(__file__).parent.parent
    data_dir = root / "data"
    instances_dir = data_dir / "instances" / "argentina"
    source_dir = data_dir / "isil" / "AR"
    conabip_json = source_dir / "conabip_libraries_wikidata_enriched.json"
    agn_json = source_dir / "agn_argentina_archives.json"
    # Bail out early if either input file is missing.
    if not conabip_json.exists():
        print(f"❌ CONABIP file not found: {conabip_json}")
        return 1
    if not agn_json.exists():
        print(f"❌ AGN file not found: {agn_json}")
        return 1
    print("\n📂 Input files:")
    print(f" - CONABIP: {conabip_json}")
    print(f" - AGN: {agn_json}")
    # Parse the CONABIP popular libraries and assign their UUIDs.
    print("\n🔄 Parsing CONABIP libraries...")
    parser = ArgentinaCONABIPParser()
    libraries = parser.parse_and_convert(conabip_json)
    print(f"🔄 Generating UUIDs for {len(libraries)} CONABIP libraries...")
    for library in libraries:
        generate_uuids_for_custodian(library)
    print(f"✅ Parsed {len(libraries)} CONABIP libraries")
    # Parse the AGN national archive.
    print("\n🔄 Parsing AGN archive...")
    agn = parse_agn_json(agn_json)
    print(f"✅ Parsed AGN: {agn.name}")
    # Export in fixed-size batches so individual YAML files stay manageable.
    BATCH_SIZE = 100
    print(f"\n📤 Exporting to YAML (batch size: {BATCH_SIZE})...")
    total_batches = (len(libraries) + BATCH_SIZE - 1) // BATCH_SIZE
    for batch_num, start in enumerate(range(0, len(libraries), BATCH_SIZE), start=1):
        batch = libraries[start:start + BATCH_SIZE]
        export_to_yaml(batch, instances_dir / f"conabip_libraries_batch{batch_num:02d}.yaml")
    # The AGN gets its own file, separate from the library batches.
    export_to_yaml([agn], instances_dir / "agn_archive.yaml")
    # Summary report.
    print("\n" + "=" * 80)
    print("✅ Export Complete")
    print("=" * 80)
    print("📊 Statistics:")
    print(f" - CONABIP libraries: {len(libraries)}")
    print(" - AGN archive: 1")
    print(f" - Total institutions: {len(libraries) + 1}")
    print(f"\n📁 Output directory: {instances_dir}")
    print(f" - CONABIP batches: {total_batches} files")
    print(" - AGN: 1 file")
    print("\n🎯 Next Steps:")
    print(" 1. Review exported YAML files")
    print(" 2. Send IRAM email requesting official ISIL registry")
    print(" 3. Wait for IRAM response (1-2 weeks)")
    print(" 4. Cross-reference with Wikidata for enrichment")
    return 0
if __name__ == "__main__":
    # Propagate main()'s return code (0 on success, 1 on missing inputs)
    # as the process exit status.
    sys.exit(main())