# glam/tests/exporters/test_rdf_exporter.py

"""
Tests for RDF exporter with Partnership support.
"""

import pytest
from datetime import date, datetime, timezone
from rdflib import Graph, Literal, URIRef, Namespace
from rdflib.namespace import RDF

from glam_extractor.models import (
    HeritageCustodian,
    Partnership,
    Location,
    Identifier,
    Provenance,
    InstitutionType,
    DataSource,
    DataTier
)
from glam_extractor.exporters.rdf_exporter import RDFExporter

# RDF vocabularies referenced by the assertions in this module:
# W3C Organization ontology, schema.org, and the heritage-custodian namespace.
ORG = Namespace("http://www.w3.org/ns/org#")
SCHEMA = Namespace("http://schema.org/")
GHCID = Namespace("https://w3id.org/heritage/custodian/")


class TestRDFExporterPartnership:
    """Test RDF exporter with Partnership serialization."""

    @staticmethod
    def _make_provenance():
        """Return the provenance record shared by every fixture in this class.

        Not collected by pytest (name does not start with ``test``).
        """
        return Provenance(
            data_source=DataSource.CSV_REGISTRY,
            data_tier=DataTier.TIER_1_AUTHORITATIVE,
            extraction_date=datetime.now(timezone.utc),
            extraction_method="Test data"
        )

    def test_single_partnership_export(self):
        """Test exporting a custodian with one partnership.

        Expects one ``org:hasMembership`` node typed as both
        ``org:Membership`` and ``ghcid:Partnership``, carrying the partner
        name and the partnership type (as ``org:role``).
        """
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-museum",
            name="Test Museum",
            institution_type=InstitutionType.MUSEUM,
            partnerships=[
                Partnership(
                    partner_name="Museum Register",
                    partnership_type="national_museum_certification"
                )
            ],
            provenance=self._make_provenance()
        )

        exporter = RDFExporter()
        exporter.add_custodian(custodian)

        # Check that partnership triples exist
        custodian_uri = URIRef(custodian.id)

        # Should have org:hasMembership
        memberships = list(exporter.graph.objects(custodian_uri, ORG.hasMembership))
        assert len(memberships) == 1, "Should have exactly one membership"

        membership_node = memberships[0]

        # Check membership type
        types = list(exporter.graph.objects(membership_node, RDF.type))
        assert ORG.Membership in types, "Should be typed as org:Membership"
        assert GHCID.Partnership in types, "Should be typed as ghcid:Partnership"

        # Check partner name
        partner_names = list(exporter.graph.objects(membership_node, GHCID.partner_name))
        assert len(partner_names) == 1
        assert str(partner_names[0]) == "Museum Register"

        # Check partnership type
        roles = list(exporter.graph.objects(membership_node, ORG.role))
        assert len(roles) == 1
        assert str(roles[0]) == "national_museum_certification"

    def test_multiple_partnerships_export(self):
        """Test exporting a custodian with multiple partnerships.

        Expects one membership node per partnership, each with exactly one
        partner name, and the set of names to match the fixture.
        """
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/rijksmuseum",
            name="Rijksmuseum",
            institution_type=InstitutionType.MUSEUM,
            partnerships=[
                Partnership(
                    partner_name="Museum Register",
                    partnership_type="national_museum_certification"
                ),
                Partnership(
                    partner_name="Rijkscollectie",
                    partnership_type="national_collection_designation"
                ),
                Partnership(
                    partner_name="Collectie Nederland",
                    partnership_type="aggregator_participation"
                )
            ],
            provenance=self._make_provenance()
        )

        exporter = RDFExporter()
        exporter.add_custodian(custodian)

        custodian_uri = URIRef(custodian.id)

        # Should have 3 memberships
        memberships = list(exporter.graph.objects(custodian_uri, ORG.hasMembership))
        assert len(memberships) == 3, "Should have exactly three memberships"

        # Collect all partner names; a set is used because graph iteration
        # order is not something this test relies on.
        partner_names = set()
        for membership in memberships:
            names = list(exporter.graph.objects(membership, GHCID.partner_name))
            assert len(names) == 1, "Each membership should have one partner name"
            partner_names.add(str(names[0]))

        expected_partners = {"Museum Register", "Rijkscollectie", "Collectie Nederland"}
        assert partner_names == expected_partners, "All partners should be present"

    def test_partnership_with_temporal_scope(self):
        """Test partnership with start and end dates.

        Expects the membership node to carry ``schema:startDate``,
        ``schema:endDate`` and ``schema:description`` for the partnership.
        """
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-archive",
            name="Test Archive",
            institution_type=InstitutionType.ARCHIVE,
            partnerships=[
                Partnership(
                    partner_name="DC4EU Project",
                    partnership_type="digitization_program",
                    start_date=date(2022, 1, 1),
                    end_date=date(2025, 12, 31),
                    description="Participating in EU digitization initiative"
                )
            ],
            provenance=self._make_provenance()
        )

        exporter = RDFExporter()
        exporter.add_custodian(custodian)

        custodian_uri = URIRef(custodian.id)
        memberships = list(exporter.graph.objects(custodian_uri, ORG.hasMembership))
        # Guard the index below so a missing membership fails with a clear
        # assertion message instead of an IndexError.
        assert len(memberships) == 1, "Should have exactly one membership"
        membership_node = memberships[0]

        # Check start date
        start_dates = list(exporter.graph.objects(membership_node, SCHEMA.startDate))
        assert len(start_dates) == 1
        assert str(start_dates[0]) == "2022-01-01"

        # Check end date
        end_dates = list(exporter.graph.objects(membership_node, SCHEMA.endDate))
        assert len(end_dates) == 1
        assert str(end_dates[0]) == "2025-12-31"

        # Check description
        descriptions = list(exporter.graph.objects(membership_node, SCHEMA.description))
        assert len(descriptions) == 1
        assert "EU digitization initiative" in str(descriptions[0])

    def test_export_to_turtle(self):
        """Test full Turtle serialization with partnerships.

        Serializes through ``RDFExporter.export`` only, re-parses the output
        with rdflib to prove it is valid Turtle, then checks for the
        membership terms and literal values in the serialized text.
        """
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-museum",
            name="Test Museum",
            institution_type=InstitutionType.MUSEUM,
            description="A test museum for RDF export",
            locations=[
                Location(city="Amsterdam", country="NL")
            ],
            identifiers=[
                Identifier(
                    identifier_scheme="ISIL",
                    identifier_value="NL-AmsTEST"
                )
            ],
            partnerships=[
                Partnership(
                    partner_name="Museum Register",
                    partnership_type="national_museum_certification"
                )
            ],
            provenance=self._make_provenance()
        )

        exporter = RDFExporter()
        turtle_output = exporter.export([custodian], format="turtle")

        # Parse output to verify it's valid Turtle
        test_graph = Graph()
        test_graph.parse(data=turtle_output, format="turtle")

        # Should have substantial content
        assert len(test_graph) > 10, "Graph should have multiple triples"

        # Verify key patterns in Turtle output. These are textual checks, so
        # they assume the serializer abbreviates the ontology with an "org:"
        # prefix.
        assert "org:hasMembership" in turtle_output
        assert "org:Membership" in turtle_output
        assert "Museum Register" in turtle_output
        assert "national_museum_certification" in turtle_output


class TestRDFExporterCompleteness:
    """Test that RDF exporter handles all custodian fields."""

    def test_full_custodian_export(self):
        """Test exporting a custodian with all fields populated.

        Builds a custodian with names, description, homepage, a full
        location, two identifiers and two partnerships, serializes it to
        Turtle, and checks that the expected values appear in the output and
        that the re-parsed graph holds more than 50 triples.
        """
        custodian = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/complete-test",
            name="Complete Test Institution",
            alternative_names=["CTI", "Compleet Test Instituut"],
            institution_type=InstitutionType.ARCHIVE,
            description="A fully populated test institution",
            homepage="https://example.org",
            locations=[
                Location(
                    city="Utrecht",
                    country="NL",
                    street_address="Teststraat 123",
                    postal_code="1234 AB"
                )
            ],
            identifiers=[
                Identifier(
                    identifier_scheme="ISIL",
                    identifier_value="NL-UtrechtCTI"
                ),
                Identifier(
                    identifier_scheme="Wikidata",
                    identifier_value="Q12345",
                    identifier_url="https://www.wikidata.org/wiki/Q12345"
                )
            ],
            partnerships=[
                Partnership(
                    partner_name="Archieven.nl",
                    partnership_type="aggregator_participation"
                ),
                Partnership(
                    partner_name="DC4EU",
                    partnership_type="digitization_program"
                )
            ],
            provenance=Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="Comprehensive test"
            )
        )

        exporter = RDFExporter()

        # Export to Turtle. The custodian is handed to export() only: adding
        # it via add_custodian() first would feed the same record to the
        # exporter twice, which could inflate the triple count checked below.
        turtle_output = exporter.export([custodian], format="turtle")

        # Verify all components are present
        assert "Complete Test Institution" in turtle_output
        # NOTE(review): "CTI" is also a substring of the ISIL value
        # "NL-UtrechtCTI", so this check alone does not prove that
        # alternative names are exported.
        assert "CTI" in turtle_output
        assert "Utrecht" in turtle_output
        assert "Teststraat 123" in turtle_output
        assert "NL-UtrechtCTI" in turtle_output
        assert "Q12345" in turtle_output
        assert "Archieven.nl" in turtle_output
        assert "DC4EU" in turtle_output
        assert "org:hasMembership" in turtle_output

        # Parse and count triples
        test_graph = Graph()
        test_graph.parse(data=turtle_output, format="turtle")

        # Should have rich metadata (50+ triples for a complete institution)
        assert len(test_graph) > 50, f"Graph should have substantial content, got {len(test_graph)} triples"