- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering the extraction patterns (ISIL, Wikidata, VIAF, city names) and verifying correct classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and verified compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
371 lines · 16 KiB · Python
"""
|
||
End-to-end integration test: Conversation → Partnership Extraction → RDF Export
|
||
|
||
This test validates the complete pipeline from parsing a real conversation JSON file,
|
||
extracting partnership mentions, and serializing them to RDF/Turtle format using
|
||
the W3C Organization Ontology (ORG).
|
||
"""
|
||
|
||
from datetime import date, datetime, timezone
from pathlib import Path

import pytest
from rdflib import Graph, Namespace, URIRef

from glam_extractor.exporters.rdf_exporter import RDFExporter
from glam_extractor.models import (
    DataSource,
    DataTier,
    HeritageCustodian,
    InstitutionType,
    Partnership,
    Provenance,
)
from glam_extractor.parsers.conversation import ConversationParser


# Define namespaces
|
||
GHCID = Namespace("https://w3id.org/heritage/custodian/")
|
||
ORG = Namespace("http://www.w3.org/ns/org#")
|
||
SCHEMA = Namespace("http://schema.org/")
|
||
|
||
|
||
class TestPartnershipRDFIntegration:
|
||
"""Test end-to-end partnership extraction and RDF serialization."""
|
||
|
||
@pytest.fixture
|
||
def chilean_conversation_path(self):
|
||
"""Path to Chilean GLAM conversation JSON file."""
|
||
return Path(__file__).parent.parent / "data" / "raw" / "chilean_glam_conversation.json"
|
||
|
||
def test_extract_partnerships_from_chilean_conversation(self, chilean_conversation_path):
|
||
"""Test extracting partnerships from real Chilean conversation."""
|
||
if not chilean_conversation_path.exists():
|
||
pytest.skip(f"Chilean conversation file not found: {chilean_conversation_path}")
|
||
|
||
# Parse conversation
|
||
parser = ConversationParser()
|
||
conversation = parser.parse_file(str(chilean_conversation_path))
|
||
|
||
# Extract partnerships
|
||
partnerships = parser.extract_partnerships(conversation)
|
||
|
||
# Verify extraction (we know Chilean conversation mentions UNESCO)
|
||
assert len(partnerships) > 0, "Should extract at least one partnership"
|
||
|
||
# Check for UNESCO partnership
|
||
unesco_partnerships = [p for p in partnerships if "UNESCO" in p["partner_name"]]
|
||
assert len(unesco_partnerships) > 0, "Should find UNESCO partnership"
|
||
|
||
# Verify partnership structure
|
||
for partnership in partnerships:
|
||
assert "partner_name" in partnership
|
||
assert "partnership_type" in partnership
|
||
assert "description" in partnership
|
||
# Temporal fields are optional
|
||
if "start_date" in partnership:
|
||
assert partnership["start_date"] is None or isinstance(partnership["start_date"], str)
|
||
|
||
def test_conversation_to_rdf_export(self, chilean_conversation_path):
|
||
"""Test full pipeline: conversation → extract partnerships → RDF export."""
|
||
if not chilean_conversation_path.exists():
|
||
pytest.skip(f"Chilean conversation file not found: {chilean_conversation_path}")
|
||
|
||
# Step 1: Parse conversation
|
||
parser = ConversationParser()
|
||
conversation = parser.parse_file(str(chilean_conversation_path))
|
||
|
||
# Step 2: Extract partnerships
|
||
partnerships_data = parser.extract_partnerships(conversation)
|
||
|
||
# Step 3: Create HeritageCustodian with extracted partnerships
|
||
# Note: In real workflow, we'd extract institution names too, but for this test
|
||
# we create a test institution with extracted partnerships
|
||
|
||
# Convert extracted partnership dicts to Partnership model objects
|
||
partnership_models = []
|
||
for p_data in partnerships_data:
|
||
partnership = Partnership(
|
||
partner_name=p_data["partner_name"],
|
||
partnership_type=p_data["partnership_type"],
|
||
description=p_data["description"],
|
||
# Handle optional date fields - convert string dates to date objects if present
|
||
# For now, we'll keep them as None since our current extraction returns None
|
||
start_date=None, # Would parse p_data.get("start_date") if present
|
||
end_date=None # Would parse p_data.get("end_date") if present
|
||
)
|
||
partnership_models.append(partnership)
|
||
|
||
# Create test custodian with extracted partnerships
|
||
custodian = HeritageCustodian(
|
||
id="https://w3id.org/heritage/custodian/cl/test-institution",
|
||
name="Chilean Test Institution",
|
||
institution_type=InstitutionType.MUSEUM,
|
||
description="Test institution with partnerships extracted from conversation",
|
||
partnerships=partnership_models,
|
||
provenance=Provenance(
|
||
data_source=DataSource.CONVERSATION_NLP,
|
||
data_tier=DataTier.TIER_4_INFERRED,
|
||
extraction_date=datetime.now(timezone.utc),
|
||
extraction_method="ConversationParser.extract_partnerships() + pattern matching",
|
||
confidence_score=0.85,
|
||
conversation_id=conversation.uuid
|
||
)
|
||
)
|
||
|
||
# Step 4: Export to RDF
|
||
exporter = RDFExporter()
|
||
turtle_output = exporter.export([custodian], format="turtle")
|
||
|
||
# Step 5: Validate RDF output
|
||
# Parse Turtle to verify it's valid
|
||
test_graph = Graph()
|
||
test_graph.parse(data=turtle_output, format="turtle")
|
||
|
||
# Verify graph has substantial content
|
||
assert len(test_graph) > 10, "RDF graph should have multiple triples"
|
||
|
||
# Verify partnership-specific triples
|
||
custodian_uri = URIRef(custodian.id)
|
||
|
||
# Check for org:hasMembership triples
|
||
memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership))
|
||
assert len(memberships) == len(partnership_models), \
|
||
f"Should have {len(partnership_models)} membership(s), got {len(memberships)}"
|
||
|
||
# Verify each membership has required properties
|
||
for membership_node in memberships:
|
||
# Check ORG.Membership type
|
||
types = list(test_graph.objects(membership_node, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")))
|
||
type_uris = [str(t) for t in types]
|
||
assert "http://www.w3.org/ns/org#Membership" in type_uris, \
|
||
"Membership should be typed as org:Membership"
|
||
|
||
# Check partner name
|
||
partner_names = list(test_graph.objects(membership_node, GHCID.partner_name))
|
||
assert len(partner_names) >= 1, "Each membership should have a partner_name"
|
||
|
||
# Check partnership type
|
||
partnership_types = list(test_graph.objects(membership_node, GHCID.partnership_type))
|
||
assert len(partnership_types) >= 1, "Each membership should have a partnership_type"
|
||
|
||
# Verify specific content appears in Turtle output
|
||
assert "org:hasMembership" in turtle_output, "Turtle should contain org:hasMembership predicate"
|
||
assert "org:Membership" in turtle_output, "Turtle should contain org:Membership class"
|
||
|
||
# Check for at least one extracted partner name
|
||
assert any(p["partner_name"] in turtle_output for p in partnerships_data), \
|
||
"At least one partner name should appear in Turtle output"
|
||
|
||
def test_partnership_with_temporal_properties_in_rdf(self):
|
||
"""Test that partnerships with dates are correctly serialized to RDF."""
|
||
from datetime import date
|
||
|
||
# Create custodian with time-bounded partnership
|
||
custodian = HeritageCustodian(
|
||
id="https://w3id.org/heritage/custodian/cl/temporal-test",
|
||
name="Temporal Test Institution",
|
||
institution_type=InstitutionType.ARCHIVE,
|
||
partnerships=[
|
||
Partnership(
|
||
partner_name="DC4EU Project",
|
||
partnership_type="digitization_program",
|
||
start_date=date(2022, 1, 1),
|
||
end_date=date(2025, 12, 31),
|
||
description="EU digitization initiative 2022-2025"
|
||
)
|
||
],
|
||
provenance=Provenance(
|
||
data_source=DataSource.CONVERSATION_NLP,
|
||
data_tier=DataTier.TIER_4_INFERRED,
|
||
extraction_date=datetime.now(timezone.utc),
|
||
extraction_method="Test data"
|
||
)
|
||
)
|
||
|
||
# Export to RDF
|
||
exporter = RDFExporter()
|
||
turtle_output = exporter.export([custodian], format="turtle")
|
||
|
||
# Parse RDF
|
||
test_graph = Graph()
|
||
test_graph.parse(data=turtle_output, format="turtle")
|
||
|
||
# Verify temporal properties
|
||
custodian_uri = URIRef(custodian.id)
|
||
memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership))
|
||
membership_node = memberships[0]
|
||
|
||
# Check start date
|
||
start_dates = list(test_graph.objects(membership_node, SCHEMA.startDate))
|
||
assert len(start_dates) == 1, "Should have schema:startDate"
|
||
assert str(start_dates[0]) == "2022-01-01", "Start date should be 2022-01-01"
|
||
|
||
# Check end date
|
||
end_dates = list(test_graph.objects(membership_node, SCHEMA.endDate))
|
||
assert len(end_dates) == 1, "Should have schema:endDate"
|
||
assert str(end_dates[0]) == "2025-12-31", "End date should be 2025-12-31"
|
||
|
||
# Verify dates appear in Turtle
|
||
assert "2022-01-01" in turtle_output, "Start date should appear in Turtle"
|
||
assert "2025-12-31" in turtle_output, "End date should appear in Turtle"
|
||
assert "schema:startDate" in turtle_output or "startDate" in turtle_output
|
||
assert "schema:endDate" in turtle_output or "endDate" in turtle_output
|
||
|
||
def test_multiple_custodians_with_partnerships_to_rdf(self):
|
||
"""Test exporting multiple custodians with partnerships to a single RDF graph."""
|
||
custodians = [
|
||
HeritageCustodian(
|
||
id=f"https://w3id.org/heritage/custodian/cl/museum-{i}",
|
||
name=f"Chilean Museum {i}",
|
||
institution_type=InstitutionType.MUSEUM,
|
||
partnerships=[
|
||
Partnership(
|
||
partner_name="Museum Register",
|
||
partnership_type="national_certification"
|
||
),
|
||
Partnership(
|
||
partner_name="UNESCO",
|
||
partnership_type="international_certification"
|
||
)
|
||
],
|
||
provenance=Provenance(
|
||
data_source=DataSource.CONVERSATION_NLP,
|
||
data_tier=DataTier.TIER_4_INFERRED,
|
||
extraction_date=datetime.now(timezone.utc),
|
||
extraction_method="Test data"
|
||
)
|
||
)
|
||
for i in range(1, 4) # Create 3 test museums
|
||
]
|
||
|
||
# Export all to single RDF graph
|
||
exporter = RDFExporter()
|
||
turtle_output = exporter.export(custodians, format="turtle")
|
||
|
||
# Parse RDF
|
||
test_graph = Graph()
|
||
test_graph.parse(data=turtle_output, format="turtle")
|
||
|
||
# Verify all custodians are present
|
||
for custodian in custodians:
|
||
custodian_uri = URIRef(custodian.id)
|
||
|
||
# Check institution exists
|
||
names = list(test_graph.objects(custodian_uri, GHCID.name))
|
||
assert len(names) == 1, f"Custodian {custodian.id} should have a name"
|
||
|
||
# Check partnerships
|
||
memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership))
|
||
assert len(memberships) == 2, f"Custodian {custodian.id} should have 2 partnerships"
|
||
|
||
# Verify graph size (3 institutions × 2 partnerships + metadata = 50+ triples)
|
||
assert len(test_graph) > 50, \
|
||
f"Graph should have substantial content for 3 institutions, got {len(test_graph)} triples"
|
||
|
||
# Verify both UNESCO and Museum Register appear multiple times
|
||
turtle_str = str(turtle_output)
|
||
assert turtle_str.count("UNESCO") >= 3, "UNESCO should appear for each institution"
|
||
assert turtle_str.count("Museum Register") >= 3, "Museum Register should appear for each institution"
|
||
|
||
|
||
class TestPartnershipRDFValidation:
    """Checks that serialized partnerships follow W3C Organization Ontology shapes."""

    def test_org_ontology_conformance(self):
        """Test that exported RDF conforms to W3C ORG ontology patterns."""
        # A single-partnership custodian is enough to exercise the full
        # org:Organization / org:Membership / org:member / org:role shape.
        collaboration = Partnership(
            partner_name="Partner Organization",
            partnership_type="collaboration"
        )
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/test",
            name="Test Organization",
            institution_type=InstitutionType.MUSEUM,
            partnerships=[collaboration],
            provenance=Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="Test"
            )
        )

        ttl = RDFExporter().export([custodian], format="turtle")

        # Round-trip through rdflib to confirm the Turtle is well-formed.
        graph = Graph()
        graph.parse(data=ttl, format="turtle")

        subject = URIRef(custodian.id)
        rdf_type = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")

        # W3C ORG Ontology requirements:
        # 1. Organization should have type org:Organization
        type_uris = {str(t) for t in graph.objects(subject, rdf_type)}
        assert "http://www.w3.org/ns/org#Organization" in type_uris, \
            "Custodian should be typed as org:Organization"

        # 2. Membership node should link organization to member
        membership_nodes = list(graph.objects(subject, ORG.hasMembership))
        assert len(membership_nodes) == 1
        membership = membership_nodes[0]

        # 3. Membership should have org:organization property linking back to custodian
        assert subject in graph.objects(membership, ORG.organization), \
            "Membership should link back to organization via org:organization"

        # 4. Membership should have org:member property (partner)
        assert len(list(graph.objects(membership, ORG.member))) == 1, \
            "Membership should have an org:member"

        # 5. Optional org:role for partnership type
        roles = list(graph.objects(membership, ORG.role))
        assert len(roles) == 1, "Membership should have an org:role"
        assert str(roles[0]) == "collaboration"

    def test_prov_o_integration(self):
        """Test that partnerships are linked with PROV-O provenance."""
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/prov-test",
            name="Provenance Test",
            institution_type=InstitutionType.LIBRARY,
            partnerships=[
                Partnership(
                    partner_name="Test Partner",
                    partnership_type="network_membership"
                )
            ],
            provenance=Provenance(
                data_source=DataSource.CONVERSATION_NLP,
                data_tier=DataTier.TIER_4_INFERRED,
                extraction_date=datetime(2025, 11, 5, 14, 30, 0, tzinfo=timezone.utc),
                extraction_method="Pattern-based extraction from conversation",
                confidence_score=0.90,
                conversation_id="test-uuid-12345"
            )
        )

        ttl = RDFExporter().export([custodian], format="turtle")

        graph = Graph()
        graph.parse(data=ttl, format="turtle")

        # The PROV-O namespace must be declared or referenced in the output.
        assert "prov:" in ttl or "http://www.w3.org/ns/prov#" in ttl, \
            "PROV-O namespace should be present"

        PROV = Namespace("http://www.w3.org/ns/prov#")
        subject = URIRef(custodian.id)

        # Check prov:wasGeneratedBy
        assert len(list(graph.objects(subject, PROV.wasGeneratedBy))) >= 1, \
            "Custodian should have prov:wasGeneratedBy"

        # Check prov:generatedAtTime
        assert len(list(graph.objects(subject, PROV.generatedAtTime))) >= 1, \
            "Custodian should have prov:generatedAtTime"