""" End-to-end integration test: Conversation → Partnership Extraction → RDF Export This test validates the complete pipeline from parsing a real conversation JSON file, extracting partnership mentions, and serializing them to RDF/Turtle format using the W3C Organization Ontology (ORG). """ import pytest from datetime import datetime, timezone, date from pathlib import Path from rdflib import Graph, Namespace, URIRef from glam_extractor.parsers.conversation import ConversationParser from glam_extractor.models import ( HeritageCustodian, Partnership, InstitutionType, Provenance, DataSource, DataTier ) from glam_extractor.exporters.rdf_exporter import RDFExporter # Define namespaces GHCID = Namespace("https://w3id.org/heritage/custodian/") ORG = Namespace("http://www.w3.org/ns/org#") SCHEMA = Namespace("http://schema.org/") class TestPartnershipRDFIntegration: """Test end-to-end partnership extraction and RDF serialization.""" @pytest.fixture def chilean_conversation_path(self): """Path to Chilean GLAM conversation JSON file.""" return Path(__file__).parent.parent / "data" / "raw" / "chilean_glam_conversation.json" def test_extract_partnerships_from_chilean_conversation(self, chilean_conversation_path): """Test extracting partnerships from real Chilean conversation.""" if not chilean_conversation_path.exists(): pytest.skip(f"Chilean conversation file not found: {chilean_conversation_path}") # Parse conversation parser = ConversationParser() conversation = parser.parse_file(str(chilean_conversation_path)) # Extract partnerships partnerships = parser.extract_partnerships(conversation) # Verify extraction (we know Chilean conversation mentions UNESCO) assert len(partnerships) > 0, "Should extract at least one partnership" # Check for UNESCO partnership unesco_partnerships = [p for p in partnerships if "UNESCO" in p["partner_name"]] assert len(unesco_partnerships) > 0, "Should find UNESCO partnership" # Verify partnership structure for partnership in partnerships: assert "partner_name" in partnership assert "partnership_type" in partnership assert "description" in partnership # Temporal fields are optional if "start_date" in partnership: assert partnership["start_date"] is None or isinstance(partnership["start_date"], str) def test_conversation_to_rdf_export(self, chilean_conversation_path): """Test full pipeline: conversation → extract partnerships → RDF export.""" if not chilean_conversation_path.exists(): pytest.skip(f"Chilean conversation file not found: {chilean_conversation_path}") # Step 1: Parse conversation parser = ConversationParser() conversation = parser.parse_file(str(chilean_conversation_path)) # Step 2: Extract partnerships partnerships_data = parser.extract_partnerships(conversation) # Step 3: Create HeritageCustodian with extracted partnerships # Note: In real workflow, we'd extract institution names too, but for this test # we create a test institution with extracted partnerships # Convert extracted partnership dicts to Partnership model objects partnership_models = [] for p_data in partnerships_data: partnership = Partnership( partner_name=p_data["partner_name"], partnership_type=p_data["partnership_type"], description=p_data["description"], # Handle optional date fields - convert string dates to date objects if present # For now, we'll keep them as None since our current extraction returns None start_date=None, # Would parse p_data.get("start_date") if present end_date=None # Would parse p_data.get("end_date") if present ) partnership_models.append(partnership) # Create test custodian with extracted partnerships custodian = HeritageCustodian( id="https://w3id.org/heritage/custodian/cl/test-institution", name="Chilean Test Institution", institution_type=InstitutionType.MUSEUM, description="Test institution with partnerships extracted from conversation", partnerships=partnership_models, provenance=Provenance( data_source=DataSource.CONVERSATION_NLP, data_tier=DataTier.TIER_4_INFERRED, extraction_date=datetime.now(timezone.utc), extraction_method="ConversationParser.extract_partnerships() + pattern matching", confidence_score=0.85, conversation_id=conversation.uuid ) ) # Step 4: Export to RDF exporter = RDFExporter() turtle_output = exporter.export([custodian], format="turtle") # Step 5: Validate RDF output # Parse Turtle to verify it's valid test_graph = Graph() test_graph.parse(data=turtle_output, format="turtle") # Verify graph has substantial content assert len(test_graph) > 10, "RDF graph should have multiple triples" # Verify partnership-specific triples custodian_uri = URIRef(custodian.id) # Check for org:hasMembership triples memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership)) assert len(memberships) == len(partnership_models), \ f"Should have {len(partnership_models)} membership(s), got {len(memberships)}" # Verify each membership has required properties for membership_node in memberships: # Check ORG.Membership type types = list(test_graph.objects(membership_node, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"))) type_uris = [str(t) for t in types] assert "http://www.w3.org/ns/org#Membership" in type_uris, \ "Membership should be typed as org:Membership" # Check partner name partner_names = list(test_graph.objects(membership_node, GHCID.partner_name)) assert len(partner_names) >= 1, "Each membership should have a partner_name" # Check partnership type partnership_types = list(test_graph.objects(membership_node, GHCID.partnership_type)) assert len(partnership_types) >= 1, "Each membership should have a partnership_type" # Verify specific content appears in Turtle output assert "org:hasMembership" in turtle_output, "Turtle should contain org:hasMembership predicate" assert "org:Membership" in turtle_output, "Turtle should contain org:Membership class" # Check for at least one extracted partner name assert any(p["partner_name"] in turtle_output for p in partnerships_data), \ "At least one partner name should appear in Turtle output" def test_partnership_with_temporal_properties_in_rdf(self): """Test that partnerships with dates are correctly serialized to RDF.""" from datetime import date # Create custodian with time-bounded partnership custodian = HeritageCustodian( id="https://w3id.org/heritage/custodian/cl/temporal-test", name="Temporal Test Institution", institution_type=InstitutionType.ARCHIVE, partnerships=[ Partnership( partner_name="DC4EU Project", partnership_type="digitization_program", start_date=date(2022, 1, 1), end_date=date(2025, 12, 31), description="EU digitization initiative 2022-2025" ) ], provenance=Provenance( data_source=DataSource.CONVERSATION_NLP, data_tier=DataTier.TIER_4_INFERRED, extraction_date=datetime.now(timezone.utc), extraction_method="Test data" ) ) # Export to RDF exporter = RDFExporter() turtle_output = exporter.export([custodian], format="turtle") # Parse RDF test_graph = Graph() test_graph.parse(data=turtle_output, format="turtle") # Verify temporal properties custodian_uri = URIRef(custodian.id) memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership)) membership_node = memberships[0] # Check start date start_dates = list(test_graph.objects(membership_node, SCHEMA.startDate)) assert len(start_dates) == 1, "Should have schema:startDate" assert str(start_dates[0]) == "2022-01-01", "Start date should be 2022-01-01" # Check end date end_dates = list(test_graph.objects(membership_node, SCHEMA.endDate)) assert len(end_dates) == 1, "Should have schema:endDate" assert str(end_dates[0]) == "2025-12-31", "End date should be 2025-12-31" # Verify dates appear in Turtle assert "2022-01-01" in turtle_output, "Start date should appear in Turtle" assert "2025-12-31" in turtle_output, "End date should appear in Turtle" assert "schema:startDate" in turtle_output or "startDate" in turtle_output assert "schema:endDate" in turtle_output or "endDate" in turtle_output def test_multiple_custodians_with_partnerships_to_rdf(self): """Test exporting multiple custodians with partnerships to a single RDF graph.""" custodians = [ HeritageCustodian( id=f"https://w3id.org/heritage/custodian/cl/museum-{i}", name=f"Chilean Museum {i}", institution_type=InstitutionType.MUSEUM, partnerships=[ Partnership( partner_name="Museum Register", partnership_type="national_certification" ), Partnership( partner_name="UNESCO", partnership_type="international_certification" ) ], provenance=Provenance( data_source=DataSource.CONVERSATION_NLP, data_tier=DataTier.TIER_4_INFERRED, extraction_date=datetime.now(timezone.utc), extraction_method="Test data" ) ) for i in range(1, 4) # Create 3 test museums ] # Export all to single RDF graph exporter = RDFExporter() turtle_output = exporter.export(custodians, format="turtle") # Parse RDF test_graph = Graph() test_graph.parse(data=turtle_output, format="turtle") # Verify all custodians are present for custodian in custodians: custodian_uri = URIRef(custodian.id) # Check institution exists names = list(test_graph.objects(custodian_uri, GHCID.name)) assert len(names) == 1, f"Custodian {custodian.id} should have a name" # Check partnerships memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership)) assert len(memberships) == 2, f"Custodian {custodian.id} should have 2 partnerships" # Verify graph size (3 institutions × 2 partnerships + metadata = 50+ triples) assert len(test_graph) > 50, \ f"Graph should have substantial content for 3 institutions, got {len(test_graph)} triples" # Verify both UNESCO and Museum Register appear multiple times turtle_str = str(turtle_output) assert turtle_str.count("UNESCO") >= 3, "UNESCO should appear for each institution" assert turtle_str.count("Museum Register") >= 3, "Museum Register should appear for each institution" class TestPartnershipRDFValidation: """Validate RDF output conforms to W3C Organization Ontology patterns.""" def test_org_ontology_conformance(self): """Test that exported RDF conforms to W3C ORG ontology patterns.""" custodian = HeritageCustodian( id="https://w3id.org/heritage/custodian/test", name="Test Organization", institution_type=InstitutionType.MUSEUM, partnerships=[ Partnership( partner_name="Partner Organization", partnership_type="collaboration" ) ], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc), extraction_method="Test" ) ) exporter = RDFExporter() turtle_output = exporter.export([custodian], format="turtle") # Parse RDF test_graph = Graph() test_graph.parse(data=turtle_output, format="turtle") # W3C ORG Ontology requirements: # 1. Organization should have type org:Organization custodian_uri = URIRef(custodian.id) types = list(test_graph.objects(custodian_uri, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"))) type_uris = [str(t) for t in types] assert "http://www.w3.org/ns/org#Organization" in type_uris, \ "Custodian should be typed as org:Organization" # 2. Membership node should link organization to member memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership)) assert len(memberships) == 1 membership_node = memberships[0] # 3. Membership should have org:organization property linking back to custodian orgs = list(test_graph.objects(membership_node, ORG.organization)) assert custodian_uri in orgs, "Membership should link back to organization via org:organization" # 4. Membership should have org:member property (partner) members = list(test_graph.objects(membership_node, ORG.member)) assert len(members) == 1, "Membership should have an org:member" # 5. Optional org:role for partnership type roles = list(test_graph.objects(membership_node, ORG.role)) assert len(roles) == 1, "Membership should have an org:role" assert str(roles[0]) == "collaboration" def test_prov_o_integration(self): """Test that partnerships are linked with PROV-O provenance.""" custodian = HeritageCustodian( id="https://w3id.org/heritage/custodian/prov-test", name="Provenance Test", institution_type=InstitutionType.LIBRARY, partnerships=[ Partnership( partner_name="Test Partner", partnership_type="network_membership" ) ], provenance=Provenance( data_source=DataSource.CONVERSATION_NLP, data_tier=DataTier.TIER_4_INFERRED, extraction_date=datetime(2025, 11, 5, 14, 30, 0, tzinfo=timezone.utc), extraction_method="Pattern-based extraction from conversation", confidence_score=0.90, conversation_id="test-uuid-12345" ) ) exporter = RDFExporter() turtle_output = exporter.export([custodian], format="turtle") # Parse RDF test_graph = Graph() test_graph.parse(data=turtle_output, format="turtle") # Verify PROV-O namespace is used assert "prov:" in turtle_output or "http://www.w3.org/ns/prov#" in turtle_output, \ "PROV-O namespace should be present" # Verify provenance activity exists custodian_uri = URIRef(custodian.id) PROV = Namespace("http://www.w3.org/ns/prov#") # Check prov:wasGeneratedBy activities = list(test_graph.objects(custodian_uri, PROV.wasGeneratedBy)) assert len(activities) >= 1, "Custodian should have prov:wasGeneratedBy" # Check prov:generatedAtTime gen_times = list(test_graph.objects(custodian_uri, PROV.generatedAtTime)) assert len(gen_times) >= 1, "Custodian should have prov:generatedAtTime"