glam/scripts/test_eu_parser.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

74 lines
2.5 KiB
Python

#!/usr/bin/env python3
"""
Test script for EU ISIL directory parser.
Parses data/isil/EUR/isil-directory.txt and prints extracted records.
"""
from pathlib import Path
import sys
# Add src directory to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.parsers.eu_isil import EUIsilParser
def main() -> None:
    """Parse the EU ISIL directory and print what was extracted.

    Makes two passes over ``data/isil/EUR/isil-directory.txt`` (a path
    relative to the current working directory):

    1. ``parser.parse_file`` -- every record is listed with whichever of
       its optional fields (sub-unit, variants, location, address,
       approval date) are set.
    2. ``parser.parse_and_convert`` -- the converted objects are counted
       and the first one is printed field by field as a sample.

    Everything is written to stdout; nothing is returned.
    """
    rule = "=" * 100
    parser = EUIsilParser()
    file_path = Path("data/isil/EUR/isil-directory.txt")

    print(f"Parsing {file_path}...\n")
    # Materialised because the count is printed before and after the listing.
    records = list(parser.parse_file(file_path))
    print(f"Found {len(records)} EU institutions:\n")
    print(rule)
    for number, record in enumerate(records, 1):
        _print_record(number, record)
    print("\n" + rule)
    print(f"\nTotal: {len(records)} institutions")

    # Second pass: same file, this time through the conversion step.
    print("\n\nConverting to HeritageCustodian models...")
    custodians = list(parser.parse_and_convert(file_path))
    print(f"Successfully converted {len(custodians)} institutions")

    # Show the first converted object as a sample, if there is one.
    if custodians:
        print("\n" + rule)
        print("Sample HeritageCustodian record:")
        print(rule)
        _print_custodian(custodians[0])


def _join_present(*parts: object) -> str:
    """Format the truthy *parts* and join them with ``", "``.

    Missing values are dropped instead of being rendered as ``None``.
    Each part is formatted through an f-string so the text matches what a
    direct ``f"{a}, {b}"`` interpolation would have produced.
    """
    return ", ".join(f"{part}" for part in parts if part)


def _print_record(number: int, record) -> None:
    """Print one parsed record; optional fields appear only when set."""
    print(f"\n{number}. {record.isil}: {record.organisation_name}")
    if record.subunit:
        print(f" Sub-unit: {record.subunit}")
    if record.variants:
        print(f" Variants: {record.variants}")
    if record.city:
        # The state is left out when absent rather than shown as "None".
        print(f" Location: {_join_present(record.city, record.state)}")
    if record.address:
        print(f" Address: {record.address}")
    if record.approval_date:
        approval_dt = record.get_approval_datetime()
        # Checked again: the datetime may be falsy even though the raw
        # approval_date is set (presumably when it cannot be parsed --
        # TODO confirm against get_approval_datetime).
        if approval_dt:
            print(f" Approved: {approval_dt.strftime('%d %B %Y')}")


def _print_custodian(sample) -> None:
    """Print the fields of one converted custodian object."""
    print(f"ID: {sample.id}")
    print(f"Name: {sample.name}")
    print(f"Type: {sample.institution_type}")
    print(f"Alternative names: {sample.alternative_names}")
    print(f"GHCID: {sample.ghcid}")
    print(f"GHCID Numeric: {sample.ghcid_numeric}")
    print(f"Description: {sample.description}")
    print(f"Identifiers: {len(sample.identifiers)} identifier(s)")
    for ident in sample.identifiers:
        print(f" - {ident.identifier_scheme}: {ident.identifier_value}")
    if sample.locations:
        print(f"Locations: {len(sample.locations)} location(s)")
        for loc in sample.locations:
            # One line per location so the lines match the printed count;
            # a location with neither city nor country gets a placeholder.
            place = _join_present(loc.city, loc.country) or "(unknown)"
            print(f" - {place}")
    provenance = sample.provenance
    print(f"Provenance: {provenance.data_source} (Tier {provenance.data_tier})")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()