- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
85 lines · 2.8 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Export EU ISIL institutions to LinkML instance YAML file.
|
|
|
|
Reads the parsed EU ISIL directory and creates a LinkML-compliant YAML file
|
|
with all heritage custodian records.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
import yaml
|
|
import json
|
|
|
|
from glam_extractor.parsers.eu_isil import EUIsilParser
|
|
|
|
|
|
def export_eu_institutions():
    """Export EU ISIL institutions to a LinkML instance YAML file.

    Reads the parsed EU ISIL directory, converts each record to a
    HeritageCustodian instance via the parser, and writes all instances
    (preceded by a provenance comment header) to
    ``data/instances/eu_institutions.yaml``.

    Returns:
        tuple[Path, int]: The output file path and the number of
        custodian records exported.

    Raises:
        FileNotFoundError: If the ISIL directory file is missing.
    """
    # Resolve paths relative to the project root; this script lives one
    # directory level below it (e.g. in scripts/).
    project_root = Path(__file__).parent.parent
    txt_path = project_root / "data" / "isil" / "EUR" / "isil-directory.txt"
    output_path = project_root / "data" / "instances" / "eu_institutions.yaml"

    # Ensure the output directory exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Parse EU ISIL data.
    parser = EUIsilParser()
    records = list(parser.parse_file(txt_path))

    print(f"Parsed {len(records)} EU ISIL records")

    # Convert to HeritageCustodian instances, stamping every record with
    # the same timezone-aware extraction timestamp.
    extraction_date = datetime.now(timezone.utc)
    custodians = [
        parser.to_heritage_custodian(rec, extraction_date)
        for rec in records
    ]

    print(f"Converted {len(custodians)} HeritageCustodian instances")

    # Convert to plain dicts for YAML serialization.
    # Round-tripping through json() ensures datetimes and nested models
    # serialize to YAML-safe primitives.
    custodian_dicts = [
        json.loads(custodian.json(exclude_none=True, by_alias=False))
        for custodian in custodians
    ]

    # Write the YAML file: provenance comment header, document marker,
    # then the instance list.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# EU Heritage Institutions - ISIL Registry\n")
        f.write("# Source: Historical Archives of the European Union (HAEU)\n")
        f.write(f"# Extracted: {extraction_date.isoformat()}\n")
        f.write(f"# Total institutions: {len(custodian_dicts)}\n")
        f.write("# Schema: schemas/heritage_custodian.yaml (v0.2.0)\n")
        f.write("# Data Tier: TIER_1_AUTHORITATIVE\n")
        f.write("---\n")

        yaml.dump(
            custodian_dicts,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=100
        )

    print(f"\n✅ Exported to: {output_path}")

    # Guard the sample dump: with an empty directory file, indexing
    # custodians[0] would raise IndexError.
    if custodians:
        sample = custodians[0]
        print(f"\nSample institution:")
        print(f" Name: {sample.name}")
        print(f" GHCID: {sample.ghcid}")
        print(f" UUID v5: {sample.ghcid_uuid}")
        print(f" UUID v8: {sample.ghcid_uuid_sha256}")
        print(f" Type: {sample.institution_type}")
        print(f" ISIL: {sample.identifiers[0].identifier_value if sample.identifiers else 'N/A'}")

    return output_path, len(custodians)
|
|
|
|
|
|
def _main() -> None:
    """CLI entry point: run the full EU ISIL export."""
    export_eu_institutions()


if __name__ == "__main__":
    _main()
|