glam/scripts/export_eu_isil_to_linkml.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

85 lines
2.8 KiB
Python

#!/usr/bin/env python3
"""
Export EU ISIL institutions to LinkML instance YAML file.
Reads the parsed EU ISIL directory and creates a LinkML-compliant YAML file
with all heritage custodian records.
"""
from pathlib import Path
from datetime import datetime, timezone
import yaml
import json
from glam_extractor.parsers.eu_isil import EUIsilParser
def export_eu_institutions():
    """Export EU ISIL institutions to a LinkML instance YAML file.

    Reads the parsed EU ISIL directory, converts every record into a
    HeritageCustodian instance, and writes them all to
    ``data/instances/eu_institutions.yaml`` with a commented provenance
    header followed by a YAML document containing the list of records.

    Returns:
        tuple: ``(output_path, count)`` — the Path of the written YAML
        file and the number of institutions exported.
    """
    # Resolve paths relative to the project root (this script lives one
    # directory below it).
    project_root = Path(__file__).parent.parent
    txt_path = project_root / "data" / "isil" / "EUR" / "isil-directory.txt"
    output_path = project_root / "data" / "instances" / "eu_institutions.yaml"

    # Ensure the output directory exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Parse the EU ISIL directory file.
    parser = EUIsilParser()
    records = list(parser.parse_file(txt_path))
    print(f"Parsed {len(records)} EU ISIL records")

    # Convert to HeritageCustodian instances, all stamped with the same
    # timezone-aware extraction timestamp.
    extraction_date = datetime.now(timezone.utc)
    custodians = [
        parser.to_heritage_custodian(rec, extraction_date)
        for rec in records
    ]
    print(f"Converted {len(custodians)} HeritageCustodian instances")

    # Convert to plain dictionaries for YAML serialization.
    # Round-tripping through the model's json() output ensures proper
    # datetime/nested-model serialization without custom YAML representers.
    custodian_dicts = [
        json.loads(custodian.json(exclude_none=True, by_alias=False))
        for custodian in custodians
    ]

    # Write the YAML file: commented provenance header, then the records.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# EU Heritage Institutions - ISIL Registry\n")
        f.write("# Source: Historical Archives of the European Union (HAEU)\n")
        f.write(f"# Extracted: {extraction_date.isoformat()}\n")
        f.write(f"# Total institutions: {len(custodian_dicts)}\n")
        f.write("# Schema: schemas/heritage_custodian.yaml (v0.2.0)\n")
        f.write("# Data Tier: TIER_1_AUTHORITATIVE\n")
        f.write("---\n")
        yaml.dump(
            custodian_dicts,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=100
        )

    print(f"\n✅ Exported to: {output_path}")

    # Guard the sample printout: the original indexed custodians[0]
    # unconditionally, which raised IndexError on an empty directory.
    if custodians:
        sample = custodians[0]
        print("\nSample institution:")
        print(f"  Name: {sample.name}")
        print(f"  GHCID: {sample.ghcid}")
        print(f"  UUID v5: {sample.ghcid_uuid}")
        print(f"  UUID v8: {sample.ghcid_uuid_sha256}")
        print(f"  Type: {sample.institution_type}")
        # identifiers may be empty; fall back to a placeholder.
        isil = sample.identifiers[0].identifier_value if sample.identifiers else 'N/A'
        print(f"  ISIL: {isil}")

    return output_path, len(custodians)
if __name__ == "__main__":
    # Run the export only when invoked as a script, not on import.
    export_eu_institutions()