""" Tests for ConversationParser """ import json from datetime import datetime from pathlib import Path import pytest from glam_extractor.parsers.conversation import ( ConversationParser, Conversation, ChatMessage, MessageContent, ) @pytest.fixture def sample_conversation_path(): """Path to the sample conversation fixture""" return Path(__file__).parent.parent / "fixtures" / "sample_conversation.json" @pytest.fixture def sample_conversation_data(): """Sample conversation data as dictionary""" return { "uuid": "test-uuid-001", "name": "Test Conversation", "summary": "A test conversation", "created_at": "2025-11-05T10:00:00.000000Z", "updated_at": "2025-11-05T10:30:00.000000Z", "chat_messages": [ { "uuid": "msg-001", "text": "Hello", "sender": "human", "content": [{"type": "text", "text": "Hello"}], "created_at": "2025-11-05T10:00:00.000000Z", "updated_at": "2025-11-05T10:00:00.000000Z", }, { "uuid": "msg-002", "text": "Hi there!", "sender": "assistant", "content": [{"type": "text", "text": "Hi there!"}], "created_at": "2025-11-05T10:01:00.000000Z", "updated_at": "2025-11-05T10:01:00.000000Z", }, ], } class TestMessageContent: """Test MessageContent model""" def test_basic_text_content(self): content = MessageContent(type="text", text="Hello world") assert content.type == "text" assert content.text == "Hello world" def test_tool_use_content(self): # MessageContent allows extra fields content = MessageContent( type="tool_use", name="web_search", input={"query": "museums"} ) assert content.type == "tool_use" assert content.text is None class TestChatMessage: """Test ChatMessage model""" def test_basic_message(self): msg = ChatMessage( uuid="msg-001", text="Hello", sender="human", content=[MessageContent(type="text", text="Hello")], ) assert msg.uuid == "msg-001" assert msg.text == "Hello" assert msg.sender == "human" assert len(msg.content) == 1 def test_extract_text_from_text_field(self): msg = ChatMessage(uuid="msg-001", text="Main text", sender="human", content=[]) assert msg.extract_text() == "Main text" def test_extract_text_from_content(self): msg = ChatMessage( uuid="msg-001", text="", sender="assistant", content=[ MessageContent(type="text", text="Content text"), MessageContent(type="tool_use", name="search"), ], ) assert msg.extract_text() == "Content text" def test_extract_text_deduplication(self): # Sometimes text field and content[0].text are identical msg = ChatMessage( uuid="msg-001", text="Same text", sender="human", content=[MessageContent(type="text", text="Same text")], ) # Should not duplicate assert msg.extract_text() == "Same text" def test_extract_text_combined(self): msg = ChatMessage( uuid="msg-001", text="First part", sender="assistant", content=[ MessageContent(type="text", text="Second part"), MessageContent(type="text", text="Third part"), ], ) extracted = msg.extract_text() assert "First part" in extracted assert "Second part" in extracted assert "Third part" in extracted class TestConversation: """Test Conversation model""" def test_basic_conversation(self, sample_conversation_data): conv = Conversation(**sample_conversation_data) assert conv.uuid == "test-uuid-001" assert conv.name == "Test Conversation" assert conv.summary == "A test conversation" assert len(conv.chat_messages) == 2 def test_datetime_parsing(self): conv = Conversation( uuid="test", name="Test", created_at="2025-11-05T10:00:00.000000Z", updated_at="2025-11-05T10:30:00.000000Z", ) assert isinstance(conv.created_at, datetime) assert isinstance(conv.updated_at, datetime) def test_get_assistant_messages(self, sample_conversation_data): conv = Conversation(**sample_conversation_data) assistant_msgs = conv.get_assistant_messages() assert len(assistant_msgs) == 1 assert assistant_msgs[0].sender == "assistant" def test_get_human_messages(self, sample_conversation_data): conv = Conversation(**sample_conversation_data) human_msgs = conv.get_human_messages() assert len(human_msgs) == 1 assert human_msgs[0].sender == "human" def test_extract_all_text(self, sample_conversation_data): conv = Conversation(**sample_conversation_data) all_text = conv.extract_all_text() assert "Hello" in all_text assert "Hi there!" in all_text def test_extract_text_by_sender(self, sample_conversation_data): conv = Conversation(**sample_conversation_data) assistant_text = conv.extract_all_text(sender="assistant") assert "Hi there!" in assistant_text assert "Hello" not in assistant_text class TestConversationParser: """Test ConversationParser""" def test_parse_dict(self, sample_conversation_data): parser = ConversationParser() conv = parser.parse_dict(sample_conversation_data) assert isinstance(conv, Conversation) assert conv.uuid == "test-uuid-001" def test_parse_dict_missing_uuid(self): parser = ConversationParser() with pytest.raises(ValueError, match="uuid"): parser.parse_dict({"name": "Test"}) def test_parse_dict_missing_name(self): parser = ConversationParser() with pytest.raises(ValueError, match="name"): parser.parse_dict({"uuid": "test-123"}) def test_parse_dict_not_dict(self): parser = ConversationParser() with pytest.raises(ValueError, match="dictionary"): parser.parse_dict("not a dict") def test_parse_file(self, sample_conversation_path): parser = ConversationParser() conv = parser.parse_file(sample_conversation_path) assert isinstance(conv, Conversation) assert conv.uuid == "test-uuid-001" assert conv.name == "Test Dutch GLAM Institutions" def test_parse_file_not_found(self): parser = ConversationParser() with pytest.raises(FileNotFoundError): parser.parse_file("/nonexistent/file.json") def test_extract_institutions_context(self, sample_conversation_path): parser = ConversationParser() conv = parser.parse_file(sample_conversation_path) context = parser.extract_institutions_context(conv) # Should contain assistant messages with institution details assert "Rijksmuseum" in context assert "NL-AsdRM" in context # ISIL code assert "Nationaal Archief" in context assert "NL-HaNA" in context # ISIL code # Should NOT contain human questions (since we filter to assistant only) # But human messages are short questions, so this is implicit def test_get_conversation_metadata(self, sample_conversation_path): parser = ConversationParser() conv = parser.parse_file(sample_conversation_path) metadata = parser.get_conversation_metadata(conv) assert metadata["conversation_id"] == "test-uuid-001" assert metadata["conversation_name"] == "Test Dutch GLAM Institutions" assert metadata["message_count"] == 4 assert metadata["assistant_message_count"] == 2 assert metadata["human_message_count"] == 2 class TestRealWorldConversation: """Test with actual conversation structure from sample""" @pytest.fixture def rijksmuseum_conversation(self, sample_conversation_path): parser = ConversationParser() return parser.parse_file(sample_conversation_path) def test_extract_rijksmuseum_info(self, rijksmuseum_conversation): """Test extracting information about Rijksmuseum""" text = rijksmuseum_conversation.extract_all_text(sender="assistant") # Institution name assert "Rijksmuseum" in text # ISIL code assert "NL-AsdRM" in text # Address assert "Museumstraat 1" in text assert "Amsterdam" in text # Collection info assert "1 million" in text # Metadata standards assert "SPECTRUM" in text assert "LIDO" in text # Website assert "rijksmuseum.nl" in text def test_extract_nationaal_archief_info(self, rijksmuseum_conversation): """Test extracting information about Nationaal Archief""" text = rijksmuseum_conversation.extract_all_text(sender="assistant") # Institution name assert "Nationaal Archief" in text # ISIL code assert "NL-HaNA" in text # Location assert "The Hague" in text or "Den Haag" in text # Metadata standards assert "EAD" in text assert "RiC-O" in text def test_message_count(self, rijksmuseum_conversation): """Test message counts""" assert len(rijksmuseum_conversation.chat_messages) == 4 assert len(rijksmuseum_conversation.get_assistant_messages()) == 2 assert len(rijksmuseum_conversation.get_human_messages()) == 2 def test_message_order(self, rijksmuseum_conversation): """Test messages are in correct order""" messages = rijksmuseum_conversation.chat_messages assert messages[0].sender == "human" assert messages[1].sender == "assistant" assert messages[2].sender == "human" assert messages[3].sender == "assistant" class TestPartnershipExtraction: """Test partnership extraction from conversation text""" @pytest.fixture def parser(self): return ConversationParser() @pytest.fixture def conversation_with_europeana(self): """Conversation mentioning Europeana partnership""" return Conversation( uuid="test-europeana", name="Test Europeana", chat_messages=[ ChatMessage( uuid="msg-1", text="Tell me about partnerships", sender="human", content=[], ), ChatMessage( uuid="msg-2", text="The museum participates in Europeana since 2018 and collaborates with DPLA.", sender="assistant", content=[ MessageContent( type="text", text="The museum participates in Europeana since 2018 and collaborates with DPLA." ) ], ), ], ) @pytest.fixture def conversation_with_temporal_info(self): """Conversation with temporal partnership information""" return Conversation( uuid="test-temporal", name="Test Temporal", chat_messages=[ ChatMessage( uuid="msg-1", text="", sender="assistant", content=[ MessageContent( type="text", text="The archive joined Archieven.nl from 2020 to 2025 and participated in DC4EU digitization program." ) ], ), ], ) @pytest.fixture def conversation_with_generic_partnerships(self): """Conversation with generic partnership phrases""" return Conversation( uuid="test-generic", name="Test Generic", chat_messages=[ ChatMessage( uuid="msg-1", text="", sender="assistant", content=[ MessageContent( type="text", text="The institution is part of the Digital Heritage Network and member of the International Museum Consortium." ) ], ), ], ) def test_extract_europeana_partnership(self, parser, conversation_with_europeana): """Test extracting Europeana partnership""" partnerships = parser.extract_partnerships(conversation_with_europeana) # Should find Europeana europeana = [p for p in partnerships if "Europeana" in p["partner_name"]] assert len(europeana) == 1 assert europeana[0]["partnership_type"] == "aggregator_participation" assert europeana[0]["start_date"] == "2018-01-01" # "since 2018" def test_extract_dpla_partnership(self, parser, conversation_with_europeana): """Test extracting DPLA partnership""" partnerships = parser.extract_partnerships(conversation_with_europeana) # Should find DPLA dpla = [p for p in partnerships if "DPLA" in p["partner_name"]] assert len(dpla) == 1 assert dpla[0]["partnership_type"] == "aggregator_participation" def test_extract_temporal_info_from_to(self, parser, conversation_with_temporal_info): """Test extracting 'from YYYY to YYYY' temporal pattern""" partnerships = parser.extract_partnerships(conversation_with_temporal_info) # Find Archieven.nl archieven = [p for p in partnerships if "Archieven.nl" in p["partner_name"]] assert len(archieven) == 1 assert archieven[0]["start_date"] == "2020-01-01" assert archieven[0]["end_date"] == "2025-12-31" def test_extract_digitization_program(self, parser, conversation_with_temporal_info): """Test extracting digitization program partnership""" partnerships = parser.extract_partnerships(conversation_with_temporal_info) # Find DC4EU dc4eu = [p for p in partnerships if "DC4EU" in p["partner_name"]] assert len(dc4eu) == 1 assert dc4eu[0]["partnership_type"] == "digitization_program" def test_extract_generic_network_partnership(self, parser, conversation_with_generic_partnerships): """Test extracting generic 'part of Network' phrase""" partnerships = parser.extract_partnerships(conversation_with_generic_partnerships) # Should find "Digital Heritage Network" network = [p for p in partnerships if "Digital Heritage Network" in p["partner_name"]] assert len(network) >= 1 def test_extract_generic_consortium_partnership(self, parser, conversation_with_generic_partnerships): """Test extracting generic 'member of Consortium' phrase""" partnerships = parser.extract_partnerships(conversation_with_generic_partnerships) # Should find "International Museum Consortium" consortium = [p for p in partnerships if "International Museum Consortium" in p["partner_name"]] assert len(consortium) >= 1 def test_no_duplicates(self, parser): """Test that duplicate partnerships are not returned""" conv = Conversation( uuid="test-dupe", name="Test Duplicates", chat_messages=[ ChatMessage( uuid="msg-1", text="", sender="assistant", content=[ MessageContent( type="text", text="Europeana is great. We work with Europeana. Europeana has many partners." ) ], ), ], ) partnerships = parser.extract_partnerships(conv) # Should only return one Europeana partnership europeana_count = len([p for p in partnerships if "Europeana" in p["partner_name"]]) assert europeana_count == 1 def test_context_extraction(self, parser, conversation_with_europeana): """Test that description contains context sentence""" partnerships = parser.extract_partnerships(conversation_with_europeana) europeana = [p for p in partnerships if "Europeana" in p["partner_name"]][0] # Description should contain the context assert "description" in europeana assert len(europeana["description"]) > 0 assert "Europeana" in europeana["description"] def test_classify_partnership_type_aggregator(self, parser): """Test classification of aggregator partnerships""" ptype = parser._classify_partnership_type("Archives Portal Europe", "joined the portal") assert ptype == "aggregator_participation" def test_classify_partnership_type_network(self, parser): """Test classification of network partnerships""" ptype = parser._classify_partnership_type("IIIF Consortium", "member of consortium") assert ptype == "thematic_network" def test_classify_partnership_type_program(self, parser): """Test classification of program partnerships""" ptype = parser._classify_partnership_type("Digitization Initiative", "participated in project") assert ptype == "digitization_program" def test_temporal_extraction_since(self, parser): """Test 'since YYYY' pattern""" temporal = parser._extract_temporal_info("participating since 2019") assert temporal["start_date"] == "2019-01-01" assert "end_date" not in temporal def test_temporal_extraction_until(self, parser): """Test 'until YYYY' pattern""" temporal = parser._extract_temporal_info("active until 2024") assert temporal["end_date"] == "2024-12-31" assert "start_date" not in temporal def test_temporal_extraction_in(self, parser): """Test 'in YYYY' pattern (single year)""" temporal = parser._extract_temporal_info("joined in 2022") assert temporal["start_date"] == "2022-01-01" assert temporal["end_date"] == "2022-12-31" def test_temporal_extraction_no_match(self, parser): """Test text with no temporal patterns""" temporal = parser._extract_temporal_info("ongoing partnership") assert len(temporal) == 0 def test_sentence_context_extraction(self, parser): """Test sentence context extraction""" text = "First sentence. The museum participates in Europeana since 2018. Third sentence." position = text.index("Europeana") context = parser._extract_sentence_context(text, position) # Should extract the full sentence assert "museum participates" in context assert "Europeana" in context assert "since 2018" in context def test_real_world_mexican_partnerships(self, parser): """Test extracting partnerships from Mexican GLAM text""" conv = Conversation( uuid="test-mexico", name="Mexican GLAM", chat_messages=[ ChatMessage( uuid="msg-1", text="", sender="assistant", content=[ MessageContent( type="text", text=""" The 2018 Open GLAM México Conference brought Europeana representatives to share best practices. Mexican libraries participate in OCLC WorldCat and the CONRICYT consortium serves 500+ institutions since 2010. Google Arts & Culture features major Mexican institutions. """ ) ], ), ], ) partnerships = parser.extract_partnerships(conv) # Should find multiple partnerships assert len(partnerships) >= 3 # Check for specific partners partner_names = [p["partner_name"] for p in partnerships] assert any("Europeana" in name for name in partner_names) assert any("WorldCat" in name or "OCLC" in name for name in partner_names) assert any("CONRICYT" in name for name in partner_names) assert any("Google Arts & Culture" in name for name in partner_names)