- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
74 lines · 2.5 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Test script for EU ISIL directory parser.
|
|
|
|
Parses data/isil/EUR/isil-directory.txt and prints extracted records.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
import sys
|
|
|
|
# Add src directory to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from glam_extractor.parsers.eu_isil import EUIsilParser
|
|
|
|
def main():
|
|
parser = EUIsilParser()
|
|
file_path = Path("data/isil/EUR/isil-directory.txt")
|
|
|
|
print(f"Parsing {file_path}...\n")
|
|
|
|
records = list(parser.parse_file(file_path))
|
|
|
|
print(f"Found {len(records)} EU institutions:\n")
|
|
print("=" * 100)
|
|
|
|
for i, record in enumerate(records, 1):
|
|
print(f"\n{i}. {record.isil}: {record.organisation_name}")
|
|
if record.subunit:
|
|
print(f" Sub-unit: {record.subunit}")
|
|
if record.variants:
|
|
print(f" Variants: {record.variants}")
|
|
if record.city:
|
|
print(f" Location: {record.city}, {record.state}")
|
|
if record.address:
|
|
print(f" Address: {record.address}")
|
|
if record.approval_date:
|
|
approval_dt = record.get_approval_datetime()
|
|
if approval_dt:
|
|
print(f" Approved: {approval_dt.strftime('%d %B %Y')}")
|
|
|
|
print("\n" + "=" * 100)
|
|
print(f"\nTotal: {len(records)} institutions")
|
|
|
|
# Now convert to HeritageCustodian
|
|
print("\n\nConverting to HeritageCustodian models...")
|
|
custodians = list(parser.parse_and_convert(file_path))
|
|
|
|
print(f"Successfully converted {len(custodians)} institutions")
|
|
|
|
# Show sample record
|
|
if custodians:
|
|
print("\n" + "=" * 100)
|
|
print("Sample HeritageCustodian record:")
|
|
print("=" * 100)
|
|
sample = custodians[0]
|
|
print(f"ID: {sample.id}")
|
|
print(f"Name: {sample.name}")
|
|
print(f"Type: {sample.institution_type}")
|
|
print(f"Alternative names: {sample.alternative_names}")
|
|
print(f"GHCID: {sample.ghcid}")
|
|
print(f"GHCID Numeric: {sample.ghcid_numeric}")
|
|
print(f"Description: {sample.description}")
|
|
print(f"Identifiers: {len(sample.identifiers)} identifier(s)")
|
|
for ident in sample.identifiers:
|
|
print(f" - {ident.identifier_scheme}: {ident.identifier_value}")
|
|
if sample.locations:
|
|
print(f"Locations: {len(sample.locations)} location(s)")
|
|
for loc in sample.locations:
|
|
print(f" - {loc.city}, {loc.country}")
|
|
print(f"Provenance: {sample.provenance.data_source} (Tier {sample.provenance.data_tier})")
|
|
|
|
# Script entry point: run the parser demo when executed directly.
if __name__ == "__main__":
    raise SystemExit(main())
|