glam/tests/test_partnership_rdf_integration.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

371 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
End-to-end integration test: Conversation → Partnership Extraction → RDF Export
This test validates the complete pipeline from parsing a real conversation JSON file,
extracting partnership mentions, and serializing them to RDF/Turtle format using
the W3C Organization Ontology (ORG).
"""
import pytest
from datetime import datetime, timezone, date
from pathlib import Path
from rdflib import Graph, Namespace, URIRef
from glam_extractor.parsers.conversation import ConversationParser
from glam_extractor.models import (
HeritageCustodian,
Partnership,
InstitutionType,
Provenance,
DataSource,
DataTier
)
from glam_extractor.exporters.rdf_exporter import RDFExporter
# RDF namespaces used to build the predicates and classes queried by the tests below.
# GHCID: heritage-custodian vocabulary; the tests read GHCID.name, GHCID.partner_name
# and GHCID.partnership_type from the exported graph.
GHCID = Namespace("https://w3id.org/heritage/custodian/")
# ORG: W3C Organization Ontology (org:hasMembership, org:organization, org:member, org:role).
ORG = Namespace("http://www.w3.org/ns/org#")
# SCHEMA: schema.org terms; the tests read schema:startDate / schema:endDate on memberships.
SCHEMA = Namespace("http://schema.org/")
class TestPartnershipRDFIntegration:
    """Test end-to-end partnership extraction and RDF serialization.

    Covers parsing a conversation file, extracting partnership dicts,
    wrapping them in model objects, exporting to Turtle and re-parsing the
    Turtle with rdflib to check the resulting triples.
    """

    @pytest.fixture
    def chilean_conversation_path(self):
        """Return the path to the Chilean GLAM conversation JSON file.

        The requesting test is skipped when the file is not present, so the
        individual tests do not each have to repeat the existence check.
        """
        path = Path(__file__).parent.parent / "data" / "raw" / "chilean_glam_conversation.json"
        if not path.exists():
            pytest.skip(f"Chilean conversation file not found: {path}")
        return path

    @staticmethod
    def _parse_optional_date(value):
        """Convert an extracted date value into a ``date`` (or ``None``).

        Args:
            value: ``None``, a ``date`` instance, or an ISO ``YYYY-MM-DD`` string.

        Returns:
            ``value`` unchanged when it is ``None`` or already a ``date``; the
            parsed ``date`` for a valid ISO string; ``None`` for anything that
            cannot be parsed.
        """
        if value is None or isinstance(value, date):
            return value
        try:
            return date.fromisoformat(value)
        except (TypeError, ValueError):
            # Deliberate best effort: free-text dates from the extraction
            # (e.g. just a year) are dropped rather than failing the test.
            return None

    def test_extract_partnerships_from_chilean_conversation(self, chilean_conversation_path):
        """Test extracting partnerships from real Chilean conversation."""
        # Parse conversation
        parser = ConversationParser()
        conversation = parser.parse_file(str(chilean_conversation_path))

        # Extract partnerships
        partnerships = parser.extract_partnerships(conversation)

        # Verify extraction (we know Chilean conversation mentions UNESCO)
        assert len(partnerships) > 0, "Should extract at least one partnership"

        # Check for UNESCO partnership
        unesco_partnerships = [p for p in partnerships if "UNESCO" in p["partner_name"]]
        assert len(unesco_partnerships) > 0, "Should find UNESCO partnership"

        # Verify partnership structure
        for partnership in partnerships:
            assert "partner_name" in partnership
            assert "partnership_type" in partnership
            assert "description" in partnership
            # Temporal fields are optional; when present they are None or a string
            if "start_date" in partnership:
                assert partnership["start_date"] is None or isinstance(partnership["start_date"], str)

    def test_conversation_to_rdf_export(self, chilean_conversation_path):
        """Test full pipeline: conversation → extract partnerships → RDF export."""
        # Step 1: Parse conversation
        parser = ConversationParser()
        conversation = parser.parse_file(str(chilean_conversation_path))

        # Step 2: Extract partnerships
        partnerships_data = parser.extract_partnerships(conversation)
        # Fail early with a clear message: every later check is meaningless
        # (or misleading) when nothing was extracted.
        assert partnerships_data, "Should extract at least one partnership"

        # Step 3: Convert extracted partnership dicts to Partnership model objects.
        # Note: In real workflow, we'd extract institution names too, but for this
        # test we attach the extracted partnerships to a synthetic institution.
        partnership_models = [
            Partnership(
                partner_name=p_data["partner_name"],
                partnership_type=p_data["partnership_type"],
                description=p_data["description"],
                # Optional temporal fields: parsed when the extraction supplies
                # an ISO date string, otherwise left as None.
                start_date=self._parse_optional_date(p_data.get("start_date")),
                end_date=self._parse_optional_date(p_data.get("end_date")),
            )
            for p_data in partnerships_data
        ]

        # Create test custodian with extracted partnerships
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/cl/test-institution",
            name="Chilean Test Institution",
            institution_type=InstitutionType.MUSEUM,
            description="Test institution with partnerships extracted from conversation",
            partnerships=partnership_models,
            provenance=Provenance(
                data_source=DataSource.CONVERSATION_NLP,
                data_tier=DataTier.TIER_4_INFERRED,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="ConversationParser.extract_partnerships() + pattern matching",
                confidence_score=0.85,
                conversation_id=conversation.uuid
            )
        )

        # Step 4: Export to RDF
        exporter = RDFExporter()
        turtle_output = exporter.export([custodian], format="turtle")

        # Step 5: Validate RDF output by re-parsing the Turtle
        test_graph = Graph()
        test_graph.parse(data=turtle_output, format="turtle")

        # Verify graph has substantial content
        assert len(test_graph) > 10, "RDF graph should have multiple triples"

        # Verify partnership-specific triples
        custodian_uri = URIRef(custodian.id)
        rdf_type = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")

        # Check for org:hasMembership triples
        memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership))
        assert len(memberships) == len(partnership_models), \
            f"Should have {len(partnership_models)} membership(s), got {len(memberships)}"

        # Verify each membership has required properties
        for membership_node in memberships:
            # Check ORG.Membership type
            type_uris = [str(t) for t in test_graph.objects(membership_node, rdf_type)]
            assert "http://www.w3.org/ns/org#Membership" in type_uris, \
                "Membership should be typed as org:Membership"

            # Check partner name
            partner_names = list(test_graph.objects(membership_node, GHCID.partner_name))
            assert len(partner_names) >= 1, "Each membership should have a partner_name"

            # Check partnership type
            partnership_types = list(test_graph.objects(membership_node, GHCID.partnership_type))
            assert len(partnership_types) >= 1, "Each membership should have a partnership_type"

        # Verify specific content appears in Turtle output
        assert "org:hasMembership" in turtle_output, "Turtle should contain org:hasMembership predicate"
        assert "org:Membership" in turtle_output, "Turtle should contain org:Membership class"

        # Check for at least one extracted partner name
        assert any(p["partner_name"] in turtle_output for p in partnerships_data), \
            "At least one partner name should appear in Turtle output"

    def test_partnership_with_temporal_properties_in_rdf(self):
        """Test that partnerships with dates are correctly serialized to RDF."""
        # Create custodian with time-bounded partnership
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/cl/temporal-test",
            name="Temporal Test Institution",
            institution_type=InstitutionType.ARCHIVE,
            partnerships=[
                Partnership(
                    partner_name="DC4EU Project",
                    partnership_type="digitization_program",
                    start_date=date(2022, 1, 1),
                    end_date=date(2025, 12, 31),
                    description="EU digitization initiative 2022-2025"
                )
            ],
            provenance=Provenance(
                data_source=DataSource.CONVERSATION_NLP,
                data_tier=DataTier.TIER_4_INFERRED,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="Test data"
            )
        )

        # Export to RDF
        exporter = RDFExporter()
        turtle_output = exporter.export([custodian], format="turtle")

        # Parse RDF
        test_graph = Graph()
        test_graph.parse(data=turtle_output, format="turtle")

        # Verify temporal properties
        custodian_uri = URIRef(custodian.id)
        memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership))
        # Guard the index below so a missing membership is reported as an
        # assertion failure rather than an IndexError.
        assert len(memberships) == 1, \
            f"Should have exactly one membership, got {len(memberships)}"
        membership_node = memberships[0]

        # Check start date
        start_dates = list(test_graph.objects(membership_node, SCHEMA.startDate))
        assert len(start_dates) == 1, "Should have schema:startDate"
        assert str(start_dates[0]) == "2022-01-01", "Start date should be 2022-01-01"

        # Check end date
        end_dates = list(test_graph.objects(membership_node, SCHEMA.endDate))
        assert len(end_dates) == 1, "Should have schema:endDate"
        assert str(end_dates[0]) == "2025-12-31", "End date should be 2025-12-31"

        # Verify dates appear in Turtle
        assert "2022-01-01" in turtle_output, "Start date should appear in Turtle"
        assert "2025-12-31" in turtle_output, "End date should appear in Turtle"
        # The bare local name also matches the prefixed "schema:startDate" form,
        # so one substring check covers both serializations.
        assert "startDate" in turtle_output
        assert "endDate" in turtle_output

    def test_multiple_custodians_with_partnerships_to_rdf(self):
        """Test exporting multiple custodians with partnerships to a single RDF graph."""
        custodians = [
            HeritageCustodian(
                id=f"https://w3id.org/heritage/custodian/cl/museum-{i}",
                name=f"Chilean Museum {i}",
                institution_type=InstitutionType.MUSEUM,
                partnerships=[
                    Partnership(
                        partner_name="Museum Register",
                        partnership_type="national_certification"
                    ),
                    Partnership(
                        partner_name="UNESCO",
                        partnership_type="international_certification"
                    )
                ],
                provenance=Provenance(
                    data_source=DataSource.CONVERSATION_NLP,
                    data_tier=DataTier.TIER_4_INFERRED,
                    extraction_date=datetime.now(timezone.utc),
                    extraction_method="Test data"
                )
            )
            for i in range(1, 4)  # Create 3 test museums
        ]

        # Export all to single RDF graph
        exporter = RDFExporter()
        turtle_output = exporter.export(custodians, format="turtle")

        # Parse RDF
        test_graph = Graph()
        test_graph.parse(data=turtle_output, format="turtle")

        # Verify all custodians are present
        for custodian in custodians:
            custodian_uri = URIRef(custodian.id)

            # Check institution exists
            names = list(test_graph.objects(custodian_uri, GHCID.name))
            assert len(names) == 1, f"Custodian {custodian.id} should have a name"

            # Check partnerships
            memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership))
            assert len(memberships) == 2, f"Custodian {custodian.id} should have 2 partnerships"

        # Verify graph size (3 institutions × 2 partnerships + metadata = 50+ triples)
        assert len(test_graph) > 50, \
            f"Graph should have substantial content for 3 institutions, got {len(test_graph)} triples"

        # Verify both UNESCO and Museum Register appear multiple times
        turtle_str = str(turtle_output)
        assert turtle_str.count("UNESCO") >= 3, "UNESCO should appear for each institution"
        assert turtle_str.count("Museum Register") >= 3, "Museum Register should appear for each institution"
class TestPartnershipRDFValidation:
    """Check that exported RDF follows the W3C ORG and PROV-O patterns the tests expect."""

    def test_org_ontology_conformance(self):
        """Export one custodian with one partnership and inspect its ORG triples.

        Verifies the custodian's org:Organization type, the single
        org:hasMembership node, and that node's org:organization, org:member
        and org:role values.
        """
        partner = Partnership(
            partner_name="Partner Organization",
            partnership_type="collaboration"
        )
        source_info = Provenance(
            data_source=DataSource.CSV_REGISTRY,
            data_tier=DataTier.TIER_1_AUTHORITATIVE,
            extraction_date=datetime.now(timezone.utc),
            extraction_method="Test"
        )
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/test",
            name="Test Organization",
            institution_type=InstitutionType.MUSEUM,
            partnerships=[partner],
            provenance=source_info
        )

        turtle_output = RDFExporter().export([custodian], format="turtle")

        # Round-trip the Turtle through rdflib so the checks run on triples
        graph = Graph()
        graph.parse(data=turtle_output, format="turtle")

        subject = URIRef(custodian.id)
        rdf_type = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")

        # 1. The custodian carries the org:Organization type
        declared_types = {str(node) for node in graph.objects(subject, rdf_type)}
        assert "http://www.w3.org/ns/org#Organization" in declared_types, \
            "Custodian should be typed as org:Organization"

        # 2. Exactly one membership node hangs off the custodian
        membership_nodes = list(graph.objects(subject, ORG.hasMembership))
        assert len(membership_nodes) == 1
        membership = membership_nodes[0]

        # 3. The membership points back at the custodian via org:organization
        linked_orgs = list(graph.objects(membership, ORG.organization))
        assert subject in linked_orgs, "Membership should link back to organization via org:organization"

        # 4. The membership names its partner through a single org:member
        partner_nodes = list(graph.objects(membership, ORG.member))
        assert len(partner_nodes) == 1, "Membership should have an org:member"

        # 5. This test requires exactly one org:role, holding the partnership type
        role_nodes = list(graph.objects(membership, ORG.role))
        assert len(role_nodes) == 1, "Membership should have an org:role"
        assert str(role_nodes[0]) == "collaboration"

    def test_prov_o_integration(self):
        """Export a custodian with provenance and check its PROV-O triples.

        Verifies that the PROV namespace shows up in the Turtle text and that
        the custodian has prov:wasGeneratedBy and prov:generatedAtTime values.
        """
        partner = Partnership(
            partner_name="Test Partner",
            partnership_type="network_membership"
        )
        source_info = Provenance(
            data_source=DataSource.CONVERSATION_NLP,
            data_tier=DataTier.TIER_4_INFERRED,
            extraction_date=datetime(2025, 11, 5, 14, 30, 0, tzinfo=timezone.utc),
            extraction_method="Pattern-based extraction from conversation",
            confidence_score=0.90,
            conversation_id="test-uuid-12345"
        )
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/prov-test",
            name="Provenance Test",
            institution_type=InstitutionType.LIBRARY,
            partnerships=[partner],
            provenance=source_info
        )

        turtle_output = RDFExporter().export([custodian], format="turtle")

        # Round-trip the Turtle through rdflib so the checks run on triples
        graph = Graph()
        graph.parse(data=turtle_output, format="turtle")

        # The namespace may be serialized as a prefix or as a full IRI
        prov_markers = ("prov:", "http://www.w3.org/ns/prov#")
        assert any(marker in turtle_output for marker in prov_markers), \
            "PROV-O namespace should be present"

        subject = URIRef(custodian.id)
        PROV = Namespace("http://www.w3.org/ns/prov#")

        # prov:wasGeneratedBy must be present on the custodian
        generating_activities = list(graph.objects(subject, PROV.wasGeneratedBy))
        assert len(generating_activities) >= 1, "Custodian should have prov:wasGeneratedBy"

        # prov:generatedAtTime must be present on the custodian
        generation_times = list(graph.objects(subject, PROV.generatedAtTime))
        assert len(generation_times) >= 1, "Custodian should have prov:generatedAtTime"