#!/usr/bin/env python3
"""
Export EU ISIL institutions to LinkML instance YAML file.

Reads the parsed EU ISIL directory and creates a LinkML-compliant YAML file
with all heritage custodian records.
"""

import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple

import yaml

from glam_extractor.parsers.eu_isil import EUIsilParser


def _serialize_custodians(custodians) -> list:
    """Turn custodian model instances into plain dicts ready for YAML dumping."""
    # Going through the model's own JSON output and loading it back leaves only
    # JSON-native values (datetimes and nested models are already rendered),
    # which yaml.dump can emit without custom representers.
    return [
        json.loads(custodian.json(exclude_none=True, by_alias=False))
        for custodian in custodians
    ]


def _write_instance_file(output_path: Path, custodian_dicts: list,
                         extraction_date: datetime) -> None:
    """Write the comment header followed by the institutions as a YAML list."""
    header = (
        "# EU Heritage Institutions - ISIL Registry\n"
        "# Source: Historical Archives of the European Union (HAEU)\n"
        f"# Extracted: {extraction_date.isoformat()}\n"
        f"# Total institutions: {len(custodian_dicts)}\n"
        "# Schema: schemas/heritage_custodian.yaml (v0.2.0)\n"
        "# Data Tier: TIER_1_AUTHORITATIVE\n"
        "---\n"
    )
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(
            custodian_dicts,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,  # keep the field order produced by the model
            width=100,
        )


def _print_sample(custodian) -> None:
    """Print a short console summary of one converted institution."""
    print("\nSample institution:")
    print(f" Name: {custodian.name}")
    print(f" GHCID: {custodian.ghcid}")
    print(f" UUID v5: {custodian.ghcid_uuid}")
    print(f" UUID v8: {custodian.ghcid_uuid_sha256}")
    print(f" Type: {custodian.institution_type}")
    identifiers = custodian.identifiers
    # The first identifier is reported as the ISIL; fall back when there is none.
    print(f" ISIL: {identifiers[0].identifier_value if identifiers else 'N/A'}")


def export_eu_institutions(
    txt_path: Optional[Path] = None,
    output_path: Optional[Path] = None,
) -> Tuple[Path, int]:
    """Export EU ISIL institutions to LinkML instance file.

    Parses the EU ISIL directory text file with ``EUIsilParser``, converts
    every record to a HeritageCustodian instance, and writes all of them to a
    single YAML file preceded by a comment header.

    Args:
        txt_path: ISIL directory text file to parse. Defaults to
            ``data/isil/EUR/isil-directory.txt`` under the project root
            (the parent of this script's directory).
        output_path: Destination YAML file. Defaults to
            ``data/instances/eu_institutions.yaml`` under the project root.
            Missing parent directories are created.

    Returns:
        A ``(output_path, count)`` tuple: the path that was written and the
        number of institutions exported.

    Errors raised by the parser or by the file system are not caught here and
    propagate to the caller.
    """
    # Paths
    project_root = Path(__file__).parent.parent
    if txt_path is None:
        txt_path = project_root / "data" / "isil" / "EUR" / "isil-directory.txt"
    else:
        txt_path = Path(txt_path)
    if output_path is None:
        output_path = project_root / "data" / "instances" / "eu_institutions.yaml"
    else:
        output_path = Path(output_path)

    # Ensure output directory exists
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Parse EU ISIL data
    parser = EUIsilParser()
    records = list(parser.parse_file(txt_path))
    print(f"Parsed {len(records)} EU ISIL records")

    # Convert to HeritageCustodian instances; one shared timestamp is used for
    # every record and for the file header.
    extraction_date = datetime.now(timezone.utc)
    custodians = [
        parser.to_heritage_custodian(rec, extraction_date)
        for rec in records
    ]
    print(f"Converted {len(custodians)} HeritageCustodian instances")

    custodian_dicts = _serialize_custodians(custodians)
    _write_instance_file(output_path, custodian_dicts, extraction_date)

    print(f"\n✅ Exported to: {output_path}")

    # An empty directory file yields no custodians; skip the sample instead of
    # failing with IndexError after the export has already been written.
    if custodians:
        _print_sample(custodians[0])
    else:
        print("\nNo institutions were parsed; nothing to sample.")

    return output_path, len(custodians)


if __name__ == "__main__":
    export_eu_institutions()