- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering the extraction patterns (ISIL, Wikidata, VIAF, city names) and verifying correct classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and verified compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
371 lines · 16 KiB · Python
"""
|
||
End-to-end integration test: Conversation → Partnership Extraction → RDF Export
|
||
|
||
This test validates the complete pipeline from parsing a real conversation JSON file,
|
||
extracting partnership mentions, and serializing them to RDF/Turtle format using
|
||
the W3C Organization Ontology (ORG).
|
||
"""
|
||
|
||
from datetime import date, datetime, timezone
from pathlib import Path

import pytest
from rdflib import Graph, Namespace, URIRef

from glam_extractor.exporters.rdf_exporter import RDFExporter
from glam_extractor.models import (
    DataSource,
    DataTier,
    HeritageCustodian,
    InstitutionType,
    Partnership,
    Provenance,
)
from glam_extractor.parsers.conversation import ConversationParser


# Define namespaces
|
||
GHCID = Namespace("https://w3id.org/heritage/custodian/")
|
||
ORG = Namespace("http://www.w3.org/ns/org#")
|
||
SCHEMA = Namespace("http://schema.org/")
|
||
|
||
|
||
class TestPartnershipRDFIntegration:
|
||
"""Test end-to-end partnership extraction and RDF serialization."""
|
||
|
||
@pytest.fixture
|
||
def chilean_conversation_path(self):
|
||
"""Path to Chilean GLAM conversation JSON file."""
|
||
return Path(__file__).parent.parent / "data" / "raw" / "chilean_glam_conversation.json"
|
||
|
||
def test_extract_partnerships_from_chilean_conversation(self, chilean_conversation_path):
|
||
"""Test extracting partnerships from real Chilean conversation."""
|
||
if not chilean_conversation_path.exists():
|
||
pytest.skip(f"Chilean conversation file not found: {chilean_conversation_path}")
|
||
|
||
# Parse conversation
|
||
parser = ConversationParser()
|
||
conversation = parser.parse_file(str(chilean_conversation_path))
|
||
|
||
# Extract partnerships
|
||
partnerships = parser.extract_partnerships(conversation)
|
||
|
||
# Verify extraction (we know Chilean conversation mentions UNESCO)
|
||
assert len(partnerships) > 0, "Should extract at least one partnership"
|
||
|
||
# Check for UNESCO partnership
|
||
unesco_partnerships = [p for p in partnerships if "UNESCO" in p["partner_name"]]
|
||
assert len(unesco_partnerships) > 0, "Should find UNESCO partnership"
|
||
|
||
# Verify partnership structure
|
||
for partnership in partnerships:
|
||
assert "partner_name" in partnership
|
||
assert "partnership_type" in partnership
|
||
assert "description" in partnership
|
||
# Temporal fields are optional
|
||
if "start_date" in partnership:
|
||
assert partnership["start_date"] is None or isinstance(partnership["start_date"], str)
|
||
|
||
def test_conversation_to_rdf_export(self, chilean_conversation_path):
|
||
"""Test full pipeline: conversation → extract partnerships → RDF export."""
|
||
if not chilean_conversation_path.exists():
|
||
pytest.skip(f"Chilean conversation file not found: {chilean_conversation_path}")
|
||
|
||
# Step 1: Parse conversation
|
||
parser = ConversationParser()
|
||
conversation = parser.parse_file(str(chilean_conversation_path))
|
||
|
||
# Step 2: Extract partnerships
|
||
partnerships_data = parser.extract_partnerships(conversation)
|
||
|
||
# Step 3: Create HeritageCustodian with extracted partnerships
|
||
# Note: In real workflow, we'd extract institution names too, but for this test
|
||
# we create a test institution with extracted partnerships
|
||
|
||
# Convert extracted partnership dicts to Partnership model objects
|
||
partnership_models = []
|
||
for p_data in partnerships_data:
|
||
partnership = Partnership(
|
||
partner_name=p_data["partner_name"],
|
||
partnership_type=p_data["partnership_type"],
|
||
description=p_data["description"],
|
||
# Handle optional date fields - convert string dates to date objects if present
|
||
# For now, we'll keep them as None since our current extraction returns None
|
||
start_date=None, # Would parse p_data.get("start_date") if present
|
||
end_date=None # Would parse p_data.get("end_date") if present
|
||
)
|
||
partnership_models.append(partnership)
|
||
|
||
# Create test custodian with extracted partnerships
|
||
custodian = HeritageCustodian(
|
||
id="https://w3id.org/heritage/custodian/cl/test-institution",
|
||
name="Chilean Test Institution",
|
||
institution_type=InstitutionType.MUSEUM,
|
||
description="Test institution with partnerships extracted from conversation",
|
||
partnerships=partnership_models,
|
||
provenance=Provenance(
|
||
data_source=DataSource.CONVERSATION_NLP,
|
||
data_tier=DataTier.TIER_4_INFERRED,
|
||
extraction_date=datetime.now(timezone.utc),
|
||
extraction_method="ConversationParser.extract_partnerships() + pattern matching",
|
||
confidence_score=0.85,
|
||
conversation_id=conversation.uuid
|
||
)
|
||
)
|
||
|
||
# Step 4: Export to RDF
|
||
exporter = RDFExporter()
|
||
turtle_output = exporter.export([custodian], format="turtle")
|
||
|
||
# Step 5: Validate RDF output
|
||
# Parse Turtle to verify it's valid
|
||
test_graph = Graph()
|
||
test_graph.parse(data=turtle_output, format="turtle")
|
||
|
||
# Verify graph has substantial content
|
||
assert len(test_graph) > 10, "RDF graph should have multiple triples"
|
||
|
||
# Verify partnership-specific triples
|
||
custodian_uri = URIRef(custodian.id)
|
||
|
||
# Check for org:hasMembership triples
|
||
memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership))
|
||
assert len(memberships) == len(partnership_models), \
|
||
f"Should have {len(partnership_models)} membership(s), got {len(memberships)}"
|
||
|
||
# Verify each membership has required properties
|
||
for membership_node in memberships:
|
||
# Check ORG.Membership type
|
||
types = list(test_graph.objects(membership_node, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")))
|
||
type_uris = [str(t) for t in types]
|
||
assert "http://www.w3.org/ns/org#Membership" in type_uris, \
|
||
"Membership should be typed as org:Membership"
|
||
|
||
# Check partner name
|
||
partner_names = list(test_graph.objects(membership_node, GHCID.partner_name))
|
||
assert len(partner_names) >= 1, "Each membership should have a partner_name"
|
||
|
||
# Check partnership type
|
||
partnership_types = list(test_graph.objects(membership_node, GHCID.partnership_type))
|
||
assert len(partnership_types) >= 1, "Each membership should have a partnership_type"
|
||
|
||
# Verify specific content appears in Turtle output
|
||
assert "org:hasMembership" in turtle_output, "Turtle should contain org:hasMembership predicate"
|
||
assert "org:Membership" in turtle_output, "Turtle should contain org:Membership class"
|
||
|
||
# Check for at least one extracted partner name
|
||
assert any(p["partner_name"] in turtle_output for p in partnerships_data), \
|
||
"At least one partner name should appear in Turtle output"
|
||
|
||
def test_partnership_with_temporal_properties_in_rdf(self):
|
||
"""Test that partnerships with dates are correctly serialized to RDF."""
|
||
from datetime import date
|
||
|
||
# Create custodian with time-bounded partnership
|
||
custodian = HeritageCustodian(
|
||
id="https://w3id.org/heritage/custodian/cl/temporal-test",
|
||
name="Temporal Test Institution",
|
||
institution_type=InstitutionType.ARCHIVE,
|
||
partnerships=[
|
||
Partnership(
|
||
partner_name="DC4EU Project",
|
||
partnership_type="digitization_program",
|
||
start_date=date(2022, 1, 1),
|
||
end_date=date(2025, 12, 31),
|
||
description="EU digitization initiative 2022-2025"
|
||
)
|
||
],
|
||
provenance=Provenance(
|
||
data_source=DataSource.CONVERSATION_NLP,
|
||
data_tier=DataTier.TIER_4_INFERRED,
|
||
extraction_date=datetime.now(timezone.utc),
|
||
extraction_method="Test data"
|
||
)
|
||
)
|
||
|
||
# Export to RDF
|
||
exporter = RDFExporter()
|
||
turtle_output = exporter.export([custodian], format="turtle")
|
||
|
||
# Parse RDF
|
||
test_graph = Graph()
|
||
test_graph.parse(data=turtle_output, format="turtle")
|
||
|
||
# Verify temporal properties
|
||
custodian_uri = URIRef(custodian.id)
|
||
memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership))
|
||
membership_node = memberships[0]
|
||
|
||
# Check start date
|
||
start_dates = list(test_graph.objects(membership_node, SCHEMA.startDate))
|
||
assert len(start_dates) == 1, "Should have schema:startDate"
|
||
assert str(start_dates[0]) == "2022-01-01", "Start date should be 2022-01-01"
|
||
|
||
# Check end date
|
||
end_dates = list(test_graph.objects(membership_node, SCHEMA.endDate))
|
||
assert len(end_dates) == 1, "Should have schema:endDate"
|
||
assert str(end_dates[0]) == "2025-12-31", "End date should be 2025-12-31"
|
||
|
||
# Verify dates appear in Turtle
|
||
assert "2022-01-01" in turtle_output, "Start date should appear in Turtle"
|
||
assert "2025-12-31" in turtle_output, "End date should appear in Turtle"
|
||
assert "schema:startDate" in turtle_output or "startDate" in turtle_output
|
||
assert "schema:endDate" in turtle_output or "endDate" in turtle_output
|
||
|
||
def test_multiple_custodians_with_partnerships_to_rdf(self):
|
||
"""Test exporting multiple custodians with partnerships to a single RDF graph."""
|
||
custodians = [
|
||
HeritageCustodian(
|
||
id=f"https://w3id.org/heritage/custodian/cl/museum-{i}",
|
||
name=f"Chilean Museum {i}",
|
||
institution_type=InstitutionType.MUSEUM,
|
||
partnerships=[
|
||
Partnership(
|
||
partner_name="Museum Register",
|
||
partnership_type="national_certification"
|
||
),
|
||
Partnership(
|
||
partner_name="UNESCO",
|
||
partnership_type="international_certification"
|
||
)
|
||
],
|
||
provenance=Provenance(
|
||
data_source=DataSource.CONVERSATION_NLP,
|
||
data_tier=DataTier.TIER_4_INFERRED,
|
||
extraction_date=datetime.now(timezone.utc),
|
||
extraction_method="Test data"
|
||
)
|
||
)
|
||
for i in range(1, 4) # Create 3 test museums
|
||
]
|
||
|
||
# Export all to single RDF graph
|
||
exporter = RDFExporter()
|
||
turtle_output = exporter.export(custodians, format="turtle")
|
||
|
||
# Parse RDF
|
||
test_graph = Graph()
|
||
test_graph.parse(data=turtle_output, format="turtle")
|
||
|
||
# Verify all custodians are present
|
||
for custodian in custodians:
|
||
custodian_uri = URIRef(custodian.id)
|
||
|
||
# Check institution exists
|
||
names = list(test_graph.objects(custodian_uri, GHCID.name))
|
||
assert len(names) == 1, f"Custodian {custodian.id} should have a name"
|
||
|
||
# Check partnerships
|
||
memberships = list(test_graph.objects(custodian_uri, ORG.hasMembership))
|
||
assert len(memberships) == 2, f"Custodian {custodian.id} should have 2 partnerships"
|
||
|
||
# Verify graph size (3 institutions × 2 partnerships + metadata = 50+ triples)
|
||
assert len(test_graph) > 50, \
|
||
f"Graph should have substantial content for 3 institutions, got {len(test_graph)} triples"
|
||
|
||
# Verify both UNESCO and Museum Register appear multiple times
|
||
turtle_str = str(turtle_output)
|
||
assert turtle_str.count("UNESCO") >= 3, "UNESCO should appear for each institution"
|
||
assert turtle_str.count("Museum Register") >= 3, "Museum Register should appear for each institution"
|
||
|
||
|
||
class TestPartnershipRDFValidation:
    """Checks that serialized partnerships follow W3C Organization Ontology shapes."""

    def test_org_ontology_conformance(self):
        """Test that exported RDF conforms to W3C ORG ontology patterns."""
        # A single-partnership custodian is enough to exercise the full
        # org:Organization / org:Membership / org:member / org:role shape.
        collaboration = Partnership(
            partner_name="Partner Organization",
            partnership_type="collaboration"
        )
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/test",
            name="Test Organization",
            institution_type=InstitutionType.MUSEUM,
            partnerships=[collaboration],
            provenance=Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="Test"
            )
        )

        ttl = RDFExporter().export([custodian], format="turtle")

        # Round-trip through rdflib to confirm the Turtle is well-formed.
        graph = Graph()
        graph.parse(data=ttl, format="turtle")

        subject = URIRef(custodian.id)
        rdf_type = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")

        # W3C ORG Ontology requirements:
        # 1. Organization should have type org:Organization
        type_uris = {str(t) for t in graph.objects(subject, rdf_type)}
        assert "http://www.w3.org/ns/org#Organization" in type_uris, \
            "Custodian should be typed as org:Organization"

        # 2. Membership node should link organization to member
        membership_nodes = list(graph.objects(subject, ORG.hasMembership))
        assert len(membership_nodes) == 1
        membership = membership_nodes[0]

        # 3. Membership should have org:organization property linking back to custodian
        assert subject in graph.objects(membership, ORG.organization), \
            "Membership should link back to organization via org:organization"

        # 4. Membership should have org:member property (partner)
        assert len(list(graph.objects(membership, ORG.member))) == 1, \
            "Membership should have an org:member"

        # 5. Optional org:role for partnership type
        roles = list(graph.objects(membership, ORG.role))
        assert len(roles) == 1, "Membership should have an org:role"
        assert str(roles[0]) == "collaboration"

    def test_prov_o_integration(self):
        """Test that partnerships are linked with PROV-O provenance."""
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/prov-test",
            name="Provenance Test",
            institution_type=InstitutionType.LIBRARY,
            partnerships=[
                Partnership(
                    partner_name="Test Partner",
                    partnership_type="network_membership"
                )
            ],
            provenance=Provenance(
                data_source=DataSource.CONVERSATION_NLP,
                data_tier=DataTier.TIER_4_INFERRED,
                extraction_date=datetime(2025, 11, 5, 14, 30, 0, tzinfo=timezone.utc),
                extraction_method="Pattern-based extraction from conversation",
                confidence_score=0.90,
                conversation_id="test-uuid-12345"
            )
        )

        ttl = RDFExporter().export([custodian], format="turtle")

        graph = Graph()
        graph.parse(data=ttl, format="turtle")

        # The PROV-O namespace must be declared or referenced in the output.
        assert "prov:" in ttl or "http://www.w3.org/ns/prov#" in ttl, \
            "PROV-O namespace should be present"

        PROV = Namespace("http://www.w3.org/ns/prov#")
        subject = URIRef(custodian.id)

        # Check prov:wasGeneratedBy
        assert len(list(graph.objects(subject, PROV.wasGeneratedBy))) >= 1, \
            "Custodian should have prov:wasGeneratedBy"

        # Check prov:generatedAtTime
        assert len(list(graph.objects(subject, PROV.generatedAtTime))) >= 1, \
            "Custodian should have prov:generatedAtTime"