- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
553 lines · 21 KiB · Python
"""
|
|
Tests for ConversationParser
|
|
"""
|
|
|
|
import json
from datetime import datetime
from pathlib import Path

import pytest

from glam_extractor.parsers.conversation import (
    ChatMessage,
    Conversation,
    ConversationParser,
    MessageContent,
)
|
|
|
|
|
|
@pytest.fixture
def sample_conversation_path():
    """Path to the sample conversation fixture"""
    fixtures_dir = Path(__file__).parent.parent / "fixtures"
    return fixtures_dir / "sample_conversation.json"
|
|
|
|
|
|
@pytest.fixture
def sample_conversation_data():
    """Sample conversation data as dictionary"""
    messages = [
        {
            "uuid": "msg-001",
            "text": "Hello",
            "sender": "human",
            "content": [{"type": "text", "text": "Hello"}],
            "created_at": "2025-11-05T10:00:00.000000Z",
            "updated_at": "2025-11-05T10:00:00.000000Z",
        },
        {
            "uuid": "msg-002",
            "text": "Hi there!",
            "sender": "assistant",
            "content": [{"type": "text", "text": "Hi there!"}],
            "created_at": "2025-11-05T10:01:00.000000Z",
            "updated_at": "2025-11-05T10:01:00.000000Z",
        },
    ]
    return {
        "uuid": "test-uuid-001",
        "name": "Test Conversation",
        "summary": "A test conversation",
        "created_at": "2025-11-05T10:00:00.000000Z",
        "updated_at": "2025-11-05T10:30:00.000000Z",
        "chat_messages": messages,
    }
|
|
|
|
|
|
class TestMessageContent:
    """Unit tests for the MessageContent model"""

    def test_basic_text_content(self):
        block = MessageContent(type="text", text="Hello world")
        assert block.type == "text"
        assert block.text == "Hello world"

    def test_tool_use_content(self):
        # The model accepts extra fields (e.g. tool name and input payload)
        block = MessageContent(
            type="tool_use", name="web_search", input={"query": "museums"}
        )
        assert block.type == "tool_use"
        assert block.text is None
|
|
|
|
|
|
class TestChatMessage:
    """Unit tests for the ChatMessage model and its text extraction"""

    def test_basic_message(self):
        message = ChatMessage(
            uuid="msg-001",
            text="Hello",
            sender="human",
            content=[MessageContent(type="text", text="Hello")],
        )
        assert message.uuid == "msg-001"
        assert message.text == "Hello"
        assert message.sender == "human"
        assert len(message.content) == 1

    def test_extract_text_from_text_field(self):
        message = ChatMessage(
            uuid="msg-001", text="Main text", sender="human", content=[]
        )
        assert message.extract_text() == "Main text"

    def test_extract_text_from_content(self):
        blocks = [
            MessageContent(type="text", text="Content text"),
            MessageContent(type="tool_use", name="search"),
        ]
        message = ChatMessage(
            uuid="msg-001", text="", sender="assistant", content=blocks
        )
        # Only text-type content blocks contribute to the extraction
        assert message.extract_text() == "Content text"

    def test_extract_text_deduplication(self):
        # The top-level text field and content[0].text sometimes carry the
        # same string; extraction must not emit it twice.
        message = ChatMessage(
            uuid="msg-001",
            text="Same text",
            sender="human",
            content=[MessageContent(type="text", text="Same text")],
        )
        assert message.extract_text() == "Same text"

    def test_extract_text_combined(self):
        message = ChatMessage(
            uuid="msg-001",
            text="First part",
            sender="assistant",
            content=[
                MessageContent(type="text", text="Second part"),
                MessageContent(type="text", text="Third part"),
            ],
        )
        combined = message.extract_text()
        for fragment in ("First part", "Second part", "Third part"):
            assert fragment in combined
|
|
|
|
|
|
class TestConversation:
    """Unit tests for the Conversation model"""

    def test_basic_conversation(self, sample_conversation_data):
        conversation = Conversation(**sample_conversation_data)
        assert conversation.uuid == "test-uuid-001"
        assert conversation.name == "Test Conversation"
        assert conversation.summary == "A test conversation"
        assert len(conversation.chat_messages) == 2

    def test_datetime_parsing(self):
        conversation = Conversation(
            uuid="test",
            name="Test",
            created_at="2025-11-05T10:00:00.000000Z",
            updated_at="2025-11-05T10:30:00.000000Z",
        )
        # ISO-8601 timestamp strings are coerced into datetime objects
        assert isinstance(conversation.created_at, datetime)
        assert isinstance(conversation.updated_at, datetime)

    def test_get_assistant_messages(self, sample_conversation_data):
        conversation = Conversation(**sample_conversation_data)
        from_assistant = conversation.get_assistant_messages()
        assert len(from_assistant) == 1
        assert from_assistant[0].sender == "assistant"

    def test_get_human_messages(self, sample_conversation_data):
        conversation = Conversation(**sample_conversation_data)
        from_human = conversation.get_human_messages()
        assert len(from_human) == 1
        assert from_human[0].sender == "human"

    def test_extract_all_text(self, sample_conversation_data):
        conversation = Conversation(**sample_conversation_data)
        combined = conversation.extract_all_text()
        assert "Hello" in combined
        assert "Hi there!" in combined

    def test_extract_text_by_sender(self, sample_conversation_data):
        conversation = Conversation(**sample_conversation_data)
        assistant_only = conversation.extract_all_text(sender="assistant")
        # Only the assistant turn survives the sender filter
        assert "Hi there!" in assistant_only
        assert "Hello" not in assistant_only
|
|
|
|
|
|
class TestConversationParser:
    """Unit tests for ConversationParser parsing and extraction entry points"""

    def test_parse_dict(self, sample_conversation_data):
        result = ConversationParser().parse_dict(sample_conversation_data)
        assert isinstance(result, Conversation)
        assert result.uuid == "test-uuid-001"

    def test_parse_dict_missing_uuid(self):
        with pytest.raises(ValueError, match="uuid"):
            ConversationParser().parse_dict({"name": "Test"})

    def test_parse_dict_missing_name(self):
        with pytest.raises(ValueError, match="name"):
            ConversationParser().parse_dict({"uuid": "test-123"})

    def test_parse_dict_not_dict(self):
        with pytest.raises(ValueError, match="dictionary"):
            ConversationParser().parse_dict("not a dict")

    def test_parse_file(self, sample_conversation_path):
        result = ConversationParser().parse_file(sample_conversation_path)
        assert isinstance(result, Conversation)
        assert result.uuid == "test-uuid-001"
        assert result.name == "Test Dutch GLAM Institutions"

    def test_parse_file_not_found(self):
        with pytest.raises(FileNotFoundError):
            ConversationParser().parse_file("/nonexistent/file.json")

    def test_extract_institutions_context(self, sample_conversation_path):
        parser = ConversationParser()
        conversation = parser.parse_file(sample_conversation_path)
        context = parser.extract_institutions_context(conversation)

        # Assistant answers carry the institution details, ISIL codes included
        for expected in ("Rijksmuseum", "NL-AsdRM", "Nationaal Archief", "NL-HaNA"):
            assert expected in context

        # Human turns are short questions; the assistant-only filter is
        # implicitly confirmed by the assertions above.

    def test_get_conversation_metadata(self, sample_conversation_path):
        parser = ConversationParser()
        conversation = parser.parse_file(sample_conversation_path)
        metadata = parser.get_conversation_metadata(conversation)

        assert metadata["conversation_id"] == "test-uuid-001"
        assert metadata["conversation_name"] == "Test Dutch GLAM Institutions"
        assert metadata["message_count"] == 4
        assert metadata["assistant_message_count"] == 2
        assert metadata["human_message_count"] == 2
|
|
|
|
|
|
class TestRealWorldConversation:
    """Tests exercising the actual sample-conversation fixture structure"""

    @pytest.fixture
    def rijksmuseum_conversation(self, sample_conversation_path):
        return ConversationParser().parse_file(sample_conversation_path)

    def test_extract_rijksmuseum_info(self, rijksmuseum_conversation):
        """Test extracting information about Rijksmuseum"""
        text = rijksmuseum_conversation.extract_all_text(sender="assistant")

        expected_fragments = (
            "Rijksmuseum",      # institution name
            "NL-AsdRM",         # ISIL code
            "Museumstraat 1",   # address
            "Amsterdam",
            "1 million",        # collection info
            "SPECTRUM",         # metadata standards
            "LIDO",
            "rijksmuseum.nl",   # website
        )
        for fragment in expected_fragments:
            assert fragment in text

    def test_extract_nationaal_archief_info(self, rijksmuseum_conversation):
        """Test extracting information about Nationaal Archief"""
        text = rijksmuseum_conversation.extract_all_text(sender="assistant")

        assert "Nationaal Archief" in text  # institution name
        assert "NL-HaNA" in text            # ISIL code
        # Location may appear in English or Dutch
        assert "The Hague" in text or "Den Haag" in text
        # Metadata standards
        assert "EAD" in text
        assert "RiC-O" in text

    def test_message_count(self, rijksmuseum_conversation):
        """Test message counts"""
        assert len(rijksmuseum_conversation.chat_messages) == 4
        assert len(rijksmuseum_conversation.get_assistant_messages()) == 2
        assert len(rijksmuseum_conversation.get_human_messages()) == 2

    def test_message_order(self, rijksmuseum_conversation):
        """Test messages are in correct order"""
        messages = rijksmuseum_conversation.chat_messages
        for index, expected_sender in enumerate(
            ["human", "assistant", "human", "assistant"]
        ):
            assert messages[index].sender == expected_sender
|
|
|
|
|
|
class TestPartnershipExtraction:
    """Tests for partnership extraction from conversation text"""

    @staticmethod
    def _assistant_conversation(conv_uuid, conv_name, body):
        """Build a Conversation holding one assistant message whose text is *body*."""
        return Conversation(
            uuid=conv_uuid,
            name=conv_name,
            chat_messages=[
                ChatMessage(
                    uuid="msg-1",
                    text="",
                    sender="assistant",
                    content=[MessageContent(type="text", text=body)],
                ),
            ],
        )

    @pytest.fixture
    def parser(self):
        return ConversationParser()

    @pytest.fixture
    def conversation_with_europeana(self):
        """Conversation mentioning Europeana partnership"""
        answer = (
            "The museum participates in Europeana since 2018 and collaborates with DPLA."
        )
        return Conversation(
            uuid="test-europeana",
            name="Test Europeana",
            chat_messages=[
                ChatMessage(
                    uuid="msg-1",
                    text="Tell me about partnerships",
                    sender="human",
                    content=[],
                ),
                ChatMessage(
                    uuid="msg-2",
                    text=answer,
                    sender="assistant",
                    content=[MessageContent(type="text", text=answer)],
                ),
            ],
        )

    @pytest.fixture
    def conversation_with_temporal_info(self):
        """Conversation with temporal partnership information"""
        return self._assistant_conversation(
            "test-temporal",
            "Test Temporal",
            "The archive joined Archieven.nl from 2020 to 2025 and participated in DC4EU digitization program.",
        )

    @pytest.fixture
    def conversation_with_generic_partnerships(self):
        """Conversation with generic partnership phrases"""
        return self._assistant_conversation(
            "test-generic",
            "Test Generic",
            "The institution is part of the Digital Heritage Network and member of the International Museum Consortium.",
        )

    def test_extract_europeana_partnership(self, parser, conversation_with_europeana):
        """Test extracting Europeana partnership"""
        found = parser.extract_partnerships(conversation_with_europeana)

        europeana_hits = [p for p in found if "Europeana" in p["partner_name"]]
        assert len(europeana_hits) == 1
        assert europeana_hits[0]["partnership_type"] == "aggregator_participation"
        assert europeana_hits[0]["start_date"] == "2018-01-01"  # from "since 2018"

    def test_extract_dpla_partnership(self, parser, conversation_with_europeana):
        """Test extracting DPLA partnership"""
        found = parser.extract_partnerships(conversation_with_europeana)

        dpla_hits = [p for p in found if "DPLA" in p["partner_name"]]
        assert len(dpla_hits) == 1
        assert dpla_hits[0]["partnership_type"] == "aggregator_participation"

    def test_extract_temporal_info_from_to(self, parser, conversation_with_temporal_info):
        """Test extracting 'from YYYY to YYYY' temporal pattern"""
        found = parser.extract_partnerships(conversation_with_temporal_info)

        archieven_hits = [p for p in found if "Archieven.nl" in p["partner_name"]]
        assert len(archieven_hits) == 1
        assert archieven_hits[0]["start_date"] == "2020-01-01"
        assert archieven_hits[0]["end_date"] == "2025-12-31"

    def test_extract_digitization_program(self, parser, conversation_with_temporal_info):
        """Test extracting digitization program partnership"""
        found = parser.extract_partnerships(conversation_with_temporal_info)

        dc4eu_hits = [p for p in found if "DC4EU" in p["partner_name"]]
        assert len(dc4eu_hits) == 1
        assert dc4eu_hits[0]["partnership_type"] == "digitization_program"

    def test_extract_generic_network_partnership(
        self, parser, conversation_with_generic_partnerships
    ):
        """Test extracting generic 'part of Network' phrase"""
        found = parser.extract_partnerships(conversation_with_generic_partnerships)

        network_hits = [
            p for p in found if "Digital Heritage Network" in p["partner_name"]
        ]
        assert len(network_hits) >= 1

    def test_extract_generic_consortium_partnership(
        self, parser, conversation_with_generic_partnerships
    ):
        """Test extracting generic 'member of Consortium' phrase"""
        found = parser.extract_partnerships(conversation_with_generic_partnerships)

        consortium_hits = [
            p for p in found if "International Museum Consortium" in p["partner_name"]
        ]
        assert len(consortium_hits) >= 1

    def test_no_duplicates(self, parser):
        """Test that duplicate partnerships are not returned"""
        conversation = self._assistant_conversation(
            "test-dupe",
            "Test Duplicates",
            "Europeana is great. We work with Europeana. Europeana has many partners.",
        )

        found = parser.extract_partnerships(conversation)

        # Three mentions must collapse to a single partnership record
        europeana_hits = [p for p in found if "Europeana" in p["partner_name"]]
        assert len(europeana_hits) == 1

    def test_context_extraction(self, parser, conversation_with_europeana):
        """Test that description contains context sentence"""
        found = parser.extract_partnerships(conversation_with_europeana)
        europeana = [p for p in found if "Europeana" in p["partner_name"]][0]

        # The description field should hold the surrounding sentence
        assert "description" in europeana
        assert len(europeana["description"]) > 0
        assert "Europeana" in europeana["description"]

    def test_classify_partnership_type_aggregator(self, parser):
        """Test classification of aggregator partnerships"""
        result = parser._classify_partnership_type(
            "Archives Portal Europe", "joined the portal"
        )
        assert result == "aggregator_participation"

    def test_classify_partnership_type_network(self, parser):
        """Test classification of network partnerships"""
        result = parser._classify_partnership_type(
            "IIIF Consortium", "member of consortium"
        )
        assert result == "thematic_network"

    def test_classify_partnership_type_program(self, parser):
        """Test classification of program partnerships"""
        result = parser._classify_partnership_type(
            "Digitization Initiative", "participated in project"
        )
        assert result == "digitization_program"

    def test_temporal_extraction_since(self, parser):
        """Test 'since YYYY' pattern"""
        info = parser._extract_temporal_info("participating since 2019")
        assert info["start_date"] == "2019-01-01"
        assert "end_date" not in info

    def test_temporal_extraction_until(self, parser):
        """Test 'until YYYY' pattern"""
        info = parser._extract_temporal_info("active until 2024")
        assert info["end_date"] == "2024-12-31"
        assert "start_date" not in info

    def test_temporal_extraction_in(self, parser):
        """Test 'in YYYY' pattern (single year)"""
        info = parser._extract_temporal_info("joined in 2022")
        assert info["start_date"] == "2022-01-01"
        assert info["end_date"] == "2022-12-31"

    def test_temporal_extraction_no_match(self, parser):
        """Test text with no temporal patterns"""
        info = parser._extract_temporal_info("ongoing partnership")
        assert len(info) == 0

    def test_sentence_context_extraction(self, parser):
        """Test sentence context extraction"""
        text = "First sentence. The museum participates in Europeana since 2018. Third sentence."
        context = parser._extract_sentence_context(text, text.index("Europeana"))

        # The full middle sentence should be returned as context
        assert "museum participates" in context
        assert "Europeana" in context
        assert "since 2018" in context

    def test_real_world_mexican_partnerships(self, parser):
        """Test extracting partnerships from Mexican GLAM text"""
        conversation = self._assistant_conversation(
            "test-mexico",
            "Mexican GLAM",
            """
            The 2018 Open GLAM México Conference brought Europeana representatives
            to share best practices. Mexican libraries participate in OCLC WorldCat
            and the CONRICYT consortium serves 500+ institutions since 2010.
            Google Arts & Culture features major Mexican institutions.
            """,
        )

        found = parser.extract_partnerships(conversation)

        # Multiple distinct partnerships should surface from one message
        assert len(found) >= 3

        names = [p["partner_name"] for p in found]
        assert any("Europeana" in name for name in names)
        assert any("WorldCat" in name or "OCLC" in name for name in names)
        assert any("CONRICYT" in name for name in names)
        assert any("Google Arts & Culture" in name for name in names)
|
|
|