glam/tests/exporters/test_rdf_exporter.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

291 lines
11 KiB
Python

"""
Tests for RDF exporter with Partnership support.
"""
import pytest
from datetime import date, datetime, timezone
from rdflib import Graph, Literal, URIRef, Namespace
from rdflib.namespace import RDF
from glam_extractor.models import (
HeritageCustodian,
Partnership,
Location,
Identifier,
Provenance,
InstitutionType,
DataSource,
DataTier
)
from glam_extractor.exporters.rdf_exporter import RDFExporter
# RDF vocabularies referenced by the assertions in this module.
ORG = Namespace("http://www.w3.org/ns/org#")  # W3C Organization Ontology terms
GHCID = Namespace("https://w3id.org/heritage/custodian/")  # custodian-specific terms
SCHEMA = Namespace("http://schema.org/")  # schema.org terms (dates, description)
class TestRDFExporterPartnership:
    """Test RDF exporter with Partnership serialization.

    Each test builds a ``HeritageCustodian`` carrying one or more
    ``Partnership`` entries, feeds it to ``RDFExporter`` and inspects the
    resulting triples (or the Turtle text) for the membership pattern:
    ``<custodian> org:hasMembership <node>`` where ``<node>`` carries the
    partner name, role, and optional dates/description.
    """

    @staticmethod
    def _make_provenance():
        """Return the provenance record shared by every fixture in this class."""
        return Provenance(
            data_source=DataSource.CSV_REGISTRY,
            data_tier=DataTier.TIER_1_AUTHORITATIVE,
            extraction_date=datetime.now(timezone.utc),
            extraction_method="Test data",
        )

    @staticmethod
    def _membership_nodes(exporter, custodian):
        """Return all ``org:hasMembership`` objects of *custodian* in the exporter graph."""
        custodian_uri = URIRef(custodian.id)
        return list(exporter.graph.objects(custodian_uri, ORG.hasMembership))

    def test_single_partnership_export(self):
        """Test exporting a custodian with one partnership."""
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-museum",
            name="Test Museum",
            institution_type=InstitutionType.MUSEUM,
            partnerships=[
                Partnership(
                    partner_name="Museum Register",
                    partnership_type="national_museum_certification",
                )
            ],
            provenance=self._make_provenance(),
        )
        exporter = RDFExporter()
        exporter.add_custodian(custodian)

        # The custodian should point at exactly one membership node.
        memberships = self._membership_nodes(exporter, custodian)
        assert len(memberships) == 1, "Should have exactly one membership"
        membership_node = memberships[0]

        # The node is typed both with the generic and the project-specific class.
        types = list(exporter.graph.objects(membership_node, RDF.type))
        assert ORG.Membership in types, "Should be typed as org:Membership"
        assert GHCID.Partnership in types, "Should be typed as ghcid:Partnership"

        # Partner name is attached to the membership node.
        partner_names = list(exporter.graph.objects(membership_node, GHCID.partner_name))
        assert len(partner_names) == 1
        assert str(partner_names[0]) == "Museum Register"

        # Partnership type is expected as the org:role value.
        roles = list(exporter.graph.objects(membership_node, ORG.role))
        assert len(roles) == 1
        assert str(roles[0]) == "national_museum_certification"

    def test_multiple_partnerships_export(self):
        """Test exporting a custodian with multiple partnerships."""
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/rijksmuseum",
            name="Rijksmuseum",
            institution_type=InstitutionType.MUSEUM,
            partnerships=[
                Partnership(
                    partner_name="Museum Register",
                    partnership_type="national_museum_certification",
                ),
                Partnership(
                    partner_name="Rijkscollectie",
                    partnership_type="national_collection_designation",
                ),
                Partnership(
                    partner_name="Collectie Nederland",
                    partnership_type="aggregator_participation",
                ),
            ],
            provenance=self._make_provenance(),
        )
        exporter = RDFExporter()
        exporter.add_custodian(custodian)

        # One membership node per partnership.
        memberships = self._membership_nodes(exporter, custodian)
        assert len(memberships) == 3, "Should have exactly three memberships"

        # Collect names into a set: graph iteration order is not part of the contract.
        partner_names = set()
        for membership in memberships:
            names = list(exporter.graph.objects(membership, GHCID.partner_name))
            assert len(names) == 1, "Each membership should have one partner name"
            partner_names.add(str(names[0]))

        expected_partners = {"Museum Register", "Rijkscollectie", "Collectie Nederland"}
        assert partner_names == expected_partners, "All partners should be present"

    def test_partnership_with_temporal_scope(self):
        """Test partnership with start and end dates."""
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-archive",
            name="Test Archive",
            institution_type=InstitutionType.ARCHIVE,
            partnerships=[
                Partnership(
                    partner_name="DC4EU Project",
                    partnership_type="digitization_program",
                    start_date=date(2022, 1, 1),
                    end_date=date(2025, 12, 31),
                    description="Participating in EU digitization initiative",
                )
            ],
            provenance=self._make_provenance(),
        )
        exporter = RDFExporter()
        exporter.add_custodian(custodian)

        memberships = self._membership_nodes(exporter, custodian)
        # Guard the index below: fail with a clear message instead of an
        # IndexError (no membership) or inspecting an arbitrary node (several).
        assert len(memberships) == 1, "Should have exactly one membership"
        membership_node = memberships[0]

        # Start date is compared through its string form.
        start_dates = list(exporter.graph.objects(membership_node, SCHEMA.startDate))
        assert len(start_dates) == 1
        assert str(start_dates[0]) == "2022-01-01"

        # End date, same approach.
        end_dates = list(exporter.graph.objects(membership_node, SCHEMA.endDate))
        assert len(end_dates) == 1
        assert str(end_dates[0]) == "2025-12-31"

        # Description only needs to contain the distinctive phrase.
        descriptions = list(exporter.graph.objects(membership_node, SCHEMA.description))
        assert len(descriptions) == 1
        assert "EU digitization initiative" in str(descriptions[0])

    def test_export_to_turtle(self):
        """Test full Turtle serialization with partnerships."""
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-museum",
            name="Test Museum",
            institution_type=InstitutionType.MUSEUM,
            description="A test museum for RDF export",
            locations=[
                Location(city="Amsterdam", country="NL")
            ],
            identifiers=[
                Identifier(
                    identifier_scheme="ISIL",
                    identifier_value="NL-AmsTEST",
                )
            ],
            partnerships=[
                Partnership(
                    partner_name="Museum Register",
                    partnership_type="national_museum_certification",
                )
            ],
            provenance=self._make_provenance(),
        )
        exporter = RDFExporter()
        turtle_output = exporter.export([custodian], format="turtle")

        # Round-trip through a fresh graph: parse() raises if the Turtle is invalid.
        test_graph = Graph()
        test_graph.parse(data=turtle_output, format="turtle")

        # Should have substantial content
        assert len(test_graph) > 10, "Graph should have multiple triples"

        # Textual checks; these rely on the serializer emitting the "org:" prefix.
        assert "org:hasMembership" in turtle_output
        assert "org:Membership" in turtle_output
        assert "Museum Register" in turtle_output
        assert "national_museum_certification" in turtle_output
class TestRDFExporterCompleteness:
    """Test that RDF exporter handles all custodian fields."""

    def test_full_custodian_export(self):
        """Test exporting a custodian with all fields populated.

        Builds a custodian with names, description, homepage, a full
        address, two identifiers and two partnerships, serializes it to
        Turtle, and checks that a distinctive value from each populated
        field appears in the output and that the parsed graph is large.
        """
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/complete-test",
            name="Complete Test Institution",
            alternative_names=["CTI", "Compleet Test Instituut"],
            institution_type=InstitutionType.ARCHIVE,
            description="A fully populated test institution",
            homepage="https://example.org",
            locations=[
                Location(
                    city="Utrecht",
                    country="NL",
                    street_address="Teststraat 123",
                    postal_code="1234 AB",
                )
            ],
            identifiers=[
                Identifier(
                    identifier_scheme="ISIL",
                    identifier_value="NL-UtrechtCTI",
                ),
                Identifier(
                    identifier_scheme="Wikidata",
                    identifier_value="Q12345",
                    identifier_url="https://www.wikidata.org/wiki/Q12345",
                ),
            ],
            partnerships=[
                Partnership(
                    partner_name="Archieven.nl",
                    partnership_type="aggregator_participation",
                ),
                Partnership(
                    partner_name="DC4EU",
                    partnership_type="digitization_program",
                ),
            ],
            provenance=Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="Comprehensive test",
            ),
        )
        exporter = RDFExporter()
        # NOTE(review): the custodian is added here and also passed to export()
        # below. If export() adds the custodians it receives to the same graph,
        # nodes may be emitted twice and inflate the triple count -- confirm
        # against RDFExporter before relying on the > 50 threshold.
        exporter.add_custodian(custodian)

        # Export to Turtle
        turtle_output = exporter.export([custodian], format="turtle")

        # Verify all components are present
        assert "Complete Test Institution" in turtle_output
        assert "CTI" in turtle_output
        # "CTI" is also a substring of the ISIL value "NL-UtrechtCTI", so the
        # check above cannot fail on its own; the second alternative name is
        # unambiguous and actually exercises alternative-name export.
        assert "Compleet Test Instituut" in turtle_output
        assert "Utrecht" in turtle_output
        assert "Teststraat 123" in turtle_output
        assert "NL-UtrechtCTI" in turtle_output
        assert "Q12345" in turtle_output
        assert "Archieven.nl" in turtle_output
        assert "DC4EU" in turtle_output
        # Relies on the serializer emitting the "org:" prefix.
        assert "org:hasMembership" in turtle_output

        # Parse and count triples; parse() also validates the Turtle syntax.
        test_graph = Graph()
        test_graph.parse(data=turtle_output, format="turtle")

        # Should have rich metadata (50+ triples for a complete institution)
        assert len(test_graph) > 50, f"Graph should have substantial content, got {len(test_graph)} triples"