- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
291 lines
11 KiB
Python
291 lines
11 KiB
Python
"""
|
|
Tests for RDF exporter with Partnership support.
|
|
"""
|
|
|
|
import pytest
|
|
from datetime import date, datetime, timezone
|
|
from rdflib import Graph, Literal, URIRef, Namespace
|
|
from rdflib.namespace import RDF
|
|
|
|
from glam_extractor.models import (
|
|
HeritageCustodian,
|
|
Partnership,
|
|
Location,
|
|
Identifier,
|
|
Provenance,
|
|
InstitutionType,
|
|
DataSource,
|
|
DataTier
|
|
)
|
|
from glam_extractor.exporters.rdf_exporter import RDFExporter
|
|
|
|
# RDF namespaces used by the assertions below to look up predicates and
# classes in the exported graph.
ORG = Namespace("http://www.w3.org/ns/org#")
GHCID = Namespace("https://w3id.org/heritage/custodian/")
SCHEMA = Namespace("http://schema.org/")
|
|
|
|
|
|
class TestRDFExporterPartnership:
    """Test RDF exporter with Partnership serialization.

    Each test builds a ``HeritageCustodian`` carrying one or more
    ``Partnership`` records, feeds it to ``RDFExporter`` and inspects the
    resulting graph (or its Turtle serialization) for the expected
    ``org:hasMembership`` structure.
    """

    def test_single_partnership_export(self):
        """Test exporting a custodian with one partnership."""
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-museum",
            name="Test Museum",
            institution_type=InstitutionType.MUSEUM,
            partnerships=[
                Partnership(
                    partner_name="Museum Register",
                    partnership_type="national_museum_certification",
                )
            ],
            provenance=Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="Test data",
            ),
        )

        exporter = RDFExporter()
        exporter.add_custodian(custodian)

        # Check that partnership triples exist
        custodian_uri = URIRef(custodian.id)

        # Should have org:hasMembership
        memberships = list(exporter.graph.objects(custodian_uri, ORG.hasMembership))
        assert len(memberships) == 1, "Should have exactly one membership"

        membership_node = memberships[0]

        # Check membership type
        types = list(exporter.graph.objects(membership_node, RDF.type))
        assert ORG.Membership in types, "Should be typed as org:Membership"
        assert GHCID.Partnership in types, "Should be typed as ghcid:Partnership"

        # Check partner name
        partner_names = list(exporter.graph.objects(membership_node, GHCID.partner_name))
        assert len(partner_names) == 1
        assert str(partner_names[0]) == "Museum Register"

        # Check partnership type
        roles = list(exporter.graph.objects(membership_node, ORG.role))
        assert len(roles) == 1
        assert str(roles[0]) == "national_museum_certification"

    def test_multiple_partnerships_export(self):
        """Test exporting a custodian with multiple partnerships."""
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/rijksmuseum",
            name="Rijksmuseum",
            institution_type=InstitutionType.MUSEUM,
            partnerships=[
                Partnership(
                    partner_name="Museum Register",
                    partnership_type="national_museum_certification",
                ),
                Partnership(
                    partner_name="Rijkscollectie",
                    partnership_type="national_collection_designation",
                ),
                Partnership(
                    partner_name="Collectie Nederland",
                    partnership_type="aggregator_participation",
                ),
            ],
            provenance=Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="Test data",
            ),
        )

        exporter = RDFExporter()
        exporter.add_custodian(custodian)

        custodian_uri = URIRef(custodian.id)

        # Should have 3 memberships
        memberships = list(exporter.graph.objects(custodian_uri, ORG.hasMembership))
        assert len(memberships) == 3, "Should have exactly three memberships"

        # Collect all partner names; a set is used because the graph does not
        # promise any particular ordering of the membership nodes.
        partner_names = set()
        for membership in memberships:
            names = list(exporter.graph.objects(membership, GHCID.partner_name))
            assert len(names) == 1, "Each membership should have one partner name"
            partner_names.add(str(names[0]))

        expected_partners = {"Museum Register", "Rijkscollectie", "Collectie Nederland"}
        assert partner_names == expected_partners, "All partners should be present"

    def test_partnership_with_temporal_scope(self):
        """Test partnership with start and end dates."""
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-archive",
            name="Test Archive",
            institution_type=InstitutionType.ARCHIVE,
            partnerships=[
                Partnership(
                    partner_name="DC4EU Project",
                    partnership_type="digitization_program",
                    start_date=date(2022, 1, 1),
                    end_date=date(2025, 12, 31),
                    description="Participating in EU digitization initiative",
                )
            ],
            provenance=Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="Test data",
            ),
        )

        exporter = RDFExporter()
        exporter.add_custodian(custodian)

        custodian_uri = URIRef(custodian.id)
        memberships = list(exporter.graph.objects(custodian_uri, ORG.hasMembership))
        # Guard the index below: without this, a missing membership would
        # surface as an opaque IndexError instead of a readable test failure.
        assert len(memberships) == 1, "Should have exactly one membership"
        membership_node = memberships[0]

        # Check start date
        start_dates = list(exporter.graph.objects(membership_node, SCHEMA.startDate))
        assert len(start_dates) == 1
        assert str(start_dates[0]) == "2022-01-01"

        # Check end date
        end_dates = list(exporter.graph.objects(membership_node, SCHEMA.endDate))
        assert len(end_dates) == 1
        assert str(end_dates[0]) == "2025-12-31"

        # Check description
        descriptions = list(exporter.graph.objects(membership_node, SCHEMA.description))
        assert len(descriptions) == 1
        assert "EU digitization initiative" in str(descriptions[0])

    def test_export_to_turtle(self):
        """Test full Turtle serialization with partnerships."""
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-museum",
            name="Test Museum",
            institution_type=InstitutionType.MUSEUM,
            description="A test museum for RDF export",
            locations=[
                Location(city="Amsterdam", country="NL")
            ],
            identifiers=[
                Identifier(
                    identifier_scheme="ISIL",
                    identifier_value="NL-AmsTEST",
                )
            ],
            partnerships=[
                Partnership(
                    partner_name="Museum Register",
                    partnership_type="national_museum_certification",
                )
            ],
            provenance=Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="Test data",
            ),
        )

        exporter = RDFExporter()
        turtle_output = exporter.export([custodian], format="turtle")

        # Parse output to verify it's valid Turtle
        test_graph = Graph()
        test_graph.parse(data=turtle_output, format="turtle")

        # Should have substantial content
        assert len(test_graph) > 10, "Graph should have multiple triples"

        # The re-parsed graph must still link the custodian to its membership;
        # substring checks alone cannot prove the triple survived the round trip.
        round_trip_memberships = list(
            test_graph.objects(URIRef(custodian.id), ORG.hasMembership)
        )
        assert len(round_trip_memberships) == 1, (
            "Parsed Turtle should contain exactly one membership"
        )

        # Verify key patterns in Turtle output
        assert "org:hasMembership" in turtle_output
        assert "org:Membership" in turtle_output
        assert "Museum Register" in turtle_output
        assert "national_museum_certification" in turtle_output
|
|
|
|
|
|
class TestRDFExporterCompleteness:
    """Check that a fully populated custodian survives RDF export intact."""

    def test_full_custodian_export(self):
        """Export a custodian with every field set and inspect the Turtle."""
        # Assemble the nested records first so the custodian constructor
        # below stays readable.
        address = Location(
            city="Utrecht",
            country="NL",
            street_address="Teststraat 123",
            postal_code="1234 AB",
        )
        isil_id = Identifier(
            identifier_scheme="ISIL",
            identifier_value="NL-UtrechtCTI",
        )
        wikidata_id = Identifier(
            identifier_scheme="Wikidata",
            identifier_value="Q12345",
            identifier_url="https://www.wikidata.org/wiki/Q12345",
        )
        aggregator = Partnership(
            partner_name="Archieven.nl",
            partnership_type="aggregator_participation",
        )
        digitization = Partnership(
            partner_name="DC4EU",
            partnership_type="digitization_program",
        )
        origin = Provenance(
            data_source=DataSource.CSV_REGISTRY,
            data_tier=DataTier.TIER_1_AUTHORITATIVE,
            extraction_date=datetime.now(timezone.utc),
            extraction_method="Comprehensive test",
        )

        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/complete-test",
            name="Complete Test Institution",
            alternative_names=["CTI", "Compleet Test Instituut"],
            institution_type=InstitutionType.ARCHIVE,
            description="A fully populated test institution",
            homepage="https://example.org",
            locations=[address],
            identifiers=[isil_id, wikidata_id],
            partnerships=[aggregator, digitization],
            provenance=origin,
        )

        exporter = RDFExporter()
        # NOTE(review): the custodian is added here and also passed to
        # export() below; whether export() re-adds it is not visible from
        # this file -- confirm the double registration is intended.
        exporter.add_custodian(custodian)

        # Serialize to Turtle.
        turtle_output = exporter.export([custodian], format="turtle")

        # Every populated component should leave a trace in the output.
        expected_fragments = (
            "Complete Test Institution",
            "CTI",
            "Utrecht",
            "Teststraat 123",
            "NL-UtrechtCTI",
            "Q12345",
            "Archieven.nl",
            "DC4EU",
            "org:hasMembership",
        )
        for fragment in expected_fragments:
            assert fragment in turtle_output

        # Re-parse the output and count the triples.
        test_graph = Graph()
        test_graph.parse(data=turtle_output, format="turtle")

        # A complete institution is expected to yield more than 50 triples.
        assert len(test_graph) > 50, f"Graph should have substantial content, got {len(test_graph)} triples"
|