#!/usr/bin/env python3
|
|
"""
|
|
Export Argentina Heritage Institutions to LinkML YAML
|
|
|
|
Converts Argentine heritage institution data from JSON to LinkML-compliant YAML format:
|
|
1. CONABIP libraries (288 popular libraries)
|
|
2. AGN (Archivo General de la Nación)
|
|
|
|
Output: data/instances/argentina/ directory with batch YAML files
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import List
|
|
import uuid
|
|
|
|
import yaml
|
|
|
|
# Add project root to path
|
|
project_root = Path(__file__).parent.parent
|
|
sys.path.insert(0, str(project_root / "src"))
|
|
|
|
from glam_extractor.parsers.argentina_conabip import ArgentinaCONABIPParser
|
|
from glam_extractor.models import (
|
|
HeritageCustodian,
|
|
Identifier,
|
|
InstitutionTypeEnum,
|
|
Location,
|
|
Provenance,
|
|
DataSourceEnum,
|
|
DataTierEnum,
|
|
GHCIDHistoryEntry,
|
|
)
|
|
from glam_extractor.identifiers.ghcid import (
|
|
GHCIDGenerator,
|
|
InstitutionType as GHCIDInstitutionType,
|
|
)
|
|
|
|
|
|
def generate_uuids_for_custodian(custodian: HeritageCustodian) -> None:
    """
    Derive and attach every UUID variant for *custodian* from its GHCID.

    Mutates the custodian in place, setting:
    - ghcid_uuid: UUID v5 derived from the GHCID string
    - ghcid_uuid_sha256: UUID v8 (SHA-256 based) derived from the GHCID string
    - record_id: time-ordered UUID v7 for database record identity

    Prints a warning and returns without changes when the custodian has no
    GHCID, or when the GHCID does not match the expected
    country-region-city-type-abbrev[-QID] layout.
    """
    if not custodian.ghcid_current:
        print(f"Warning: No GHCID for {custodian.name}, skipping UUID generation")
        return

    # GHCID layout example: AR-BA-BUE-L-BPH (country-region-city-type-abbrev),
    # optionally followed by a Wikidata QID as a sixth segment.
    segments = custodian.ghcid_current.split('-')
    if len(segments) < 5:
        print(f"Warning: Invalid GHCID format for {custodian.name}: {custodian.ghcid_current}")
        return

    # Rebuild the structured components from the flat identifier string.
    from glam_extractor.identifiers.ghcid import GHCIDComponents

    components = GHCIDComponents(
        country_code=segments[0],
        region_code=segments[1],
        city_locode=segments[2],
        institution_type=segments[3],
        abbreviation=segments[4],
        wikidata_qid=segments[5] if len(segments) > 5 else None,
    )

    # Attach all three UUID flavours derived from the same components.
    custodian.ghcid_uuid = str(components.to_uuid())
    custodian.ghcid_uuid_sha256 = str(components.to_uuid_sha256())
    custodian.record_id = str(components.generate_uuid_v7())  # time-ordered record ID
|
|
|
|
|
|
def parse_agn_json(filepath: Path) -> HeritageCustodian:
    """
    Parse the AGN (Archivo General de la Nación) JSON file.

    Args:
        filepath: Path to the scraped AGN JSON. Expects an ``institutions``
            list; only the first entry is used.

    Returns:
        A single HeritageCustodian for the national archive, with GHCID,
        identifiers, location, provenance, history and UUID variants populated.

    Raises:
        ValueError: If the JSON has no (or an empty) ``institutions`` list.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Guard against a missing, empty, or null institutions list (the previous
    # `len(data['institutions'])` check crashed with TypeError on null).
    if not data.get('institutions'):
        raise ValueError("AGN JSON must contain at least one institution")

    agn = data['institutions'][0]

    # Generate the GHCID. AGN sits in the Autonomous City of Buenos Aires
    # (region "CA", UN/LOCODE city "BUE").
    generator = GHCIDGenerator()
    ghcid_components = generator.generate(
        institution_name=agn['name'],
        english_name=agn.get('name_en', agn['name']),
        institution_type=GHCIDInstitutionType.ARCHIVE,
        country_code="AR",
        region_code="CA",  # Ciudad Autónoma de Buenos Aires
        city_locode="BUE"  # Buenos Aires
    )

    ghcid_string = ghcid_components.to_string()
    ghcid_numeric = ghcid_components.to_numeric()

    # External identifiers: only the institutional website is available.
    identifiers = [
        Identifier(
            identifier_scheme="Website",
            identifier_value=agn['url'],
            identifier_url=agn['url']
        )
    ]

    # Primary physical location (CABA, ISO 3166-2 code AR-C).
    location = Location(
        location_type="Physical Address",
        city=agn['city'],
        region="AR-C",  # ISO 3166-2:AR for CABA
        country="AR",
        is_primary=True
    )

    # Provenance: web-scraped, verified (tier 2). The 'Z' suffix is rewritten
    # to '+00:00' so fromisoformat() accepts it on Python < 3.11.
    extraction_date = datetime.fromisoformat(agn['extraction_date'].replace('Z', '+00:00'))
    provenance = Provenance(
        data_source=DataSourceEnum.WEB_CRAWL,
        data_tier=DataTierEnum.TIER_2_VERIFIED,
        extraction_date=extraction_date,
        extraction_method="AGN web scraper (scrape_agn_argentina.py)",
        confidence_score=0.95,
        source_url=agn['url']
    )

    # Open-ended history entry recording the initial GHCID assignment.
    ghcid_history = [
        GHCIDHistoryEntry(
            ghcid=ghcid_string,
            ghcid_numeric=ghcid_numeric,
            valid_from=extraction_date,
            valid_to=None,
            reason="Initial assignment from AGN web scraping",
            institution_name=agn['name'],
            location_city=agn['city'],
            location_country="AR"
        )
    ]

    # Assemble the custodian record itself.
    custodian = HeritageCustodian(
        id="AR-AGN-001",  # plain literal (was a placeholder-free f-string)
        name=agn['name'],
        alternative_names=[agn['name_en']] if 'name_en' in agn else None,
        institution_type=InstitutionTypeEnum.ARCHIVE,
        description=agn.get('description'),
        homepage=agn['url'],
        identifiers=identifiers,
        locations=[location],
        provenance=provenance,
        ghcid_current=ghcid_string,
        ghcid_original=ghcid_string,
        ghcid_numeric=ghcid_numeric,
        ghcid_history=ghcid_history
    )

    # Attach the UUID v5 / v8 / v7 variants derived from the GHCID.
    generate_uuids_for_custodian(custodian)

    return custodian
|
|
|
|
|
|
def linkml_to_dict(obj):
    """
    Recursively convert a LinkML object graph into plain Python data.

    Handles LinkML dataclass instances, enum PermissibleValues (and the
    proxy objects that wrap one in ``_code``), lists, dicts, and datetime
    values. Scalars (str, int, float, bool) and None pass through unchanged;
    None-valued dataclass fields are omitted from the output dict.
    """
    from datetime import datetime
    from dataclasses import is_dataclass, fields
    from linkml_runtime.linkml_model.meta import PermissibleValue

    if obj is None:
        return None

    # Datetimes serialize as ISO-8601 strings.
    if isinstance(obj, datetime):
        return obj.isoformat()

    # LinkML enum values reduce to their text label.
    if isinstance(obj, PermissibleValue):
        return obj.text

    # Enum proxy objects carry the PermissibleValue in a `_code` attribute.
    code = getattr(obj, '_code', None)
    if isinstance(code, PermissibleValue):
        return code.text

    # Dataclasses become dicts, dropping fields whose value is None.
    if is_dataclass(obj):
        return {
            f.name: linkml_to_dict(getattr(obj, f.name))
            for f in fields(obj)
            if getattr(obj, f.name) is not None
        }

    # Containers recurse element-wise.
    if isinstance(obj, list):
        return [linkml_to_dict(elem) for elem in obj]
    if isinstance(obj, dict):
        return {key: linkml_to_dict(val) for key, val in obj.items()}

    # Anything else is assumed to be a plain scalar.
    return obj
|
|
|
|
|
|
def export_to_yaml(custodians: List[HeritageCustodian], output_file: Path) -> None:
    """
    Write HeritageCustodian instances to a LinkML-compliant YAML file.

    The LinkML dataclasses are first flattened to plain dicts for clean
    YAML output, then written beneath a short comment header. Parent
    directories of *output_file* are created as needed.
    """
    records = [linkml_to_dict(c) for c in custodians]

    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Comment header recording when and how many records were exported.
    header = [
        "---\n",
        "# Argentina Heritage Institutions - LinkML Export\n",
        f"# Generated: {datetime.now(timezone.utc).isoformat()}\n",
        f"# Total institutions: {len(custodians)}\n",
        "\n",
    ]
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(header)
        yaml.dump(records, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"✅ Exported {len(custodians)} institutions to {output_file}")
|
|
|
|
|
|
def main():
    """
    Run the full export pipeline.

    Parses the CONABIP library JSON and the AGN archive JSON, assigns
    GHCID-derived UUIDs, and writes LinkML YAML batches to
    data/instances/argentina/.

    Returns:
        Process exit code: 0 on success, 1 when an input file is missing.
    """
    print("=" * 80)
    print("Argentina Heritage Institution LinkML Export")
    print("=" * 80)

    # Resolve input/output paths relative to the project root.
    project_root = Path(__file__).parent.parent
    data_dir = project_root / "data"
    instances_dir = data_dir / "instances" / "argentina"

    conabip_json = data_dir / "isil" / "AR" / "conabip_libraries_wikidata_enriched.json"
    agn_json = data_dir / "isil" / "AR" / "agn_argentina_archives.json"

    # Fail fast when either input file is absent.
    if not conabip_json.exists():
        print(f"❌ CONABIP file not found: {conabip_json}")
        return 1

    if not agn_json.exists():
        print(f"❌ AGN file not found: {agn_json}")
        return 1

    # NOTE: placeholder-free f-strings below were demoted to plain literals
    # (ruff F541); output is byte-identical.
    print("\n📂 Input files:")
    print(f" - CONABIP: {conabip_json}")
    print(f" - AGN: {agn_json}")

    # Parse CONABIP libraries and stamp each with its UUID variants.
    print("\n🔄 Parsing CONABIP libraries...")
    conabip_parser = ArgentinaCONABIPParser()
    conabip_custodians = conabip_parser.parse_and_convert(conabip_json)

    print(f"🔄 Generating UUIDs for {len(conabip_custodians)} CONABIP libraries...")
    for custodian in conabip_custodians:
        generate_uuids_for_custodian(custodian)

    print(f"✅ Parsed {len(conabip_custodians)} CONABIP libraries")

    # Parse the single AGN archive record.
    print("\n🔄 Parsing AGN archive...")
    agn_custodian = parse_agn_json(agn_json)
    print(f"✅ Parsed AGN: {agn_custodian.name}")

    # Export in batches (100 institutions per file to keep files manageable).
    BATCH_SIZE = 100

    print(f"\n📤 Exporting to YAML (batch size: {BATCH_SIZE})...")

    # Ceiling division yields the number of CONABIP batch files.
    total_batches = (len(conabip_custodians) + BATCH_SIZE - 1) // BATCH_SIZE

    for batch_num in range(total_batches):
        start_idx = batch_num * BATCH_SIZE
        end_idx = min(start_idx + BATCH_SIZE, len(conabip_custodians))
        batch = conabip_custodians[start_idx:end_idx]

        output_file = instances_dir / f"conabip_libraries_batch{batch_num + 1:02d}.yaml"
        export_to_yaml(batch, output_file)

    # Export AGN separately from the CONABIP batches.
    agn_output = instances_dir / "agn_archive.yaml"
    export_to_yaml([agn_custodian], agn_output)

    # Summary
    print("\n" + "=" * 80)
    print("✅ Export Complete")
    print("=" * 80)
    print("📊 Statistics:")
    print(f" - CONABIP libraries: {len(conabip_custodians)}")
    print(" - AGN archive: 1")
    print(f" - Total institutions: {len(conabip_custodians) + 1}")
    print(f"\n📁 Output directory: {instances_dir}")
    print(f" - CONABIP batches: {total_batches} files")
    print(" - AGN: 1 file")
    print("\n🎯 Next Steps:")
    print(" 1. Review exported YAML files")
    print(" 2. Send IRAM email requesting official ISIL registry")
    print(" 3. Wait for IRAM response (1-2 weeks)")
    print(" 4. Cross-reference with Wikidata for enrichment")

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|