# glam/tests/parsers/test_conversation.py
#
# kempersc e5a532a8bc — Add comprehensive tests for NLP institution extraction
# and RDF partnership integration:
# - Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor,
#   covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and
#   ensuring proper classification of institutions (museum, library, archive).
# - Added tests for extracted entities and result handling to validate the
#   extraction process.
# - Created `test_partnership_rdf_integration.py` to validate the end-to-end
#   process of extracting partnerships from a conversation and exporting them
#   to RDF format.
# - Implemented tests for temporal properties in partnerships and ensured
#   compliance with W3C Organization Ontology patterns.
# - Verified that extracted partnerships are correctly linked with PROV-O
#   provenance metadata.
# 2025-11-19 23:20:47 +01:00
# (553 lines, 21 KiB, Python)

"""
Tests for ConversationParser
"""
import json
from datetime import datetime
from pathlib import Path
import pytest
from glam_extractor.parsers.conversation import (
ConversationParser,
Conversation,
ChatMessage,
MessageContent,
)
@pytest.fixture
def sample_conversation_path():
    """Filesystem location of the bundled sample conversation JSON fixture."""
    tests_dir = Path(__file__).parent.parent
    return tests_dir / "fixtures" / "sample_conversation.json"
@pytest.fixture
def sample_conversation_data():
    """In-memory dict mirroring an exported two-message conversation."""

    def _message(uuid, text, sender, timestamp):
        # Build one chat-message dict; created_at and updated_at coincide.
        return {
            "uuid": uuid,
            "text": text,
            "sender": sender,
            "content": [{"type": "text", "text": text}],
            "created_at": timestamp,
            "updated_at": timestamp,
        }

    return {
        "uuid": "test-uuid-001",
        "name": "Test Conversation",
        "summary": "A test conversation",
        "created_at": "2025-11-05T10:00:00.000000Z",
        "updated_at": "2025-11-05T10:30:00.000000Z",
        "chat_messages": [
            _message("msg-001", "Hello", "human", "2025-11-05T10:00:00.000000Z"),
            _message("msg-002", "Hi there!", "assistant", "2025-11-05T10:01:00.000000Z"),
        ],
    }
class TestMessageContent:
    """Unit tests for the MessageContent model."""

    def test_basic_text_content(self):
        """A plain text item keeps both its type and its text."""
        item = MessageContent(type="text", text="Hello world")
        assert item.text == "Hello world"
        assert item.type == "text"

    def test_tool_use_content(self):
        """Extra fields (name/input) are tolerated; text stays None."""
        item = MessageContent(
            type="tool_use", name="web_search", input={"query": "museums"}
        )
        assert item.text is None
        assert item.type == "tool_use"
class TestChatMessage:
    """Unit tests for ChatMessage and its extract_text() behavior."""

    def test_basic_message(self):
        """Constructor stores uuid, text, sender and the content list."""
        message = ChatMessage(
            uuid="msg-001",
            text="Hello",
            sender="human",
            content=[MessageContent(type="text", text="Hello")],
        )
        assert message.sender == "human"
        assert message.uuid == "msg-001"
        assert message.text == "Hello"
        assert len(message.content) == 1

    def test_extract_text_from_text_field(self):
        """With no content items, extraction falls back to the text field."""
        message = ChatMessage(uuid="msg-001", text="Main text", sender="human", content=[])
        assert message.extract_text() == "Main text"

    def test_extract_text_from_content(self):
        """Text-typed content is extracted; tool_use items contribute nothing."""
        message = ChatMessage(
            uuid="msg-001",
            text="",
            sender="assistant",
            content=[
                MessageContent(type="text", text="Content text"),
                MessageContent(type="tool_use", name="search"),
            ],
        )
        assert message.extract_text() == "Content text"

    def test_extract_text_deduplication(self):
        """Identical text field and content text are not duplicated."""
        message = ChatMessage(
            uuid="msg-001",
            text="Same text",
            sender="human",
            content=[MessageContent(type="text", text="Same text")],
        )
        assert message.extract_text() == "Same text"

    def test_extract_text_combined(self):
        """Distinct text field and content parts all survive extraction."""
        message = ChatMessage(
            uuid="msg-001",
            text="First part",
            sender="assistant",
            content=[
                MessageContent(type="text", text="Second part"),
                MessageContent(type="text", text="Third part"),
            ],
        )
        combined = message.extract_text()
        for fragment in ("First part", "Second part", "Third part"):
            assert fragment in combined
class TestConversation:
    """Unit tests for the Conversation model."""

    def test_basic_conversation(self, sample_conversation_data):
        """All top-level fields and the message list survive construction."""
        conversation = Conversation(**sample_conversation_data)
        assert conversation.uuid == "test-uuid-001"
        assert conversation.name == "Test Conversation"
        assert conversation.summary == "A test conversation"
        assert len(conversation.chat_messages) == 2

    def test_datetime_parsing(self):
        """ISO timestamp strings are coerced into datetime objects."""
        conversation = Conversation(
            uuid="test",
            name="Test",
            created_at="2025-11-05T10:00:00.000000Z",
            updated_at="2025-11-05T10:30:00.000000Z",
        )
        for stamp in (conversation.created_at, conversation.updated_at):
            assert isinstance(stamp, datetime)

    def test_get_assistant_messages(self, sample_conversation_data):
        """Only the single assistant turn is returned."""
        conversation = Conversation(**sample_conversation_data)
        from_assistant = conversation.get_assistant_messages()
        assert len(from_assistant) == 1
        assert from_assistant[0].sender == "assistant"

    def test_get_human_messages(self, sample_conversation_data):
        """Only the single human turn is returned."""
        conversation = Conversation(**sample_conversation_data)
        from_human = conversation.get_human_messages()
        assert len(from_human) == 1
        assert from_human[0].sender == "human"

    def test_extract_all_text(self, sample_conversation_data):
        """Unfiltered extraction covers both sides of the dialogue."""
        conversation = Conversation(**sample_conversation_data)
        full_text = conversation.extract_all_text()
        assert "Hello" in full_text
        assert "Hi there!" in full_text

    def test_extract_text_by_sender(self, sample_conversation_data):
        """Filtering by sender keeps only that side of the dialogue."""
        conversation = Conversation(**sample_conversation_data)
        assistant_only = conversation.extract_all_text(sender="assistant")
        assert "Hi there!" in assistant_only
        assert "Hello" not in assistant_only
class TestConversationParser:
    """Unit tests for the ConversationParser entry points."""

    def test_parse_dict(self, sample_conversation_data):
        """A well-formed dict parses into a Conversation instance."""
        result = ConversationParser().parse_dict(sample_conversation_data)
        assert isinstance(result, Conversation)
        assert result.uuid == "test-uuid-001"

    def test_parse_dict_missing_uuid(self):
        """A dict without a uuid is rejected with a ValueError."""
        with pytest.raises(ValueError, match="uuid"):
            ConversationParser().parse_dict({"name": "Test"})

    def test_parse_dict_missing_name(self):
        """A dict without a name is rejected with a ValueError."""
        with pytest.raises(ValueError, match="name"):
            ConversationParser().parse_dict({"uuid": "test-123"})

    def test_parse_dict_not_dict(self):
        """Non-dict input is rejected with a ValueError mentioning 'dictionary'."""
        with pytest.raises(ValueError, match="dictionary"):
            ConversationParser().parse_dict("not a dict")

    def test_parse_file(self, sample_conversation_path):
        """The on-disk fixture parses and carries the expected identifiers."""
        result = ConversationParser().parse_file(sample_conversation_path)
        assert isinstance(result, Conversation)
        assert result.uuid == "test-uuid-001"
        assert result.name == "Test Dutch GLAM Institutions"

    def test_parse_file_not_found(self):
        """A missing path surfaces as FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            ConversationParser().parse_file("/nonexistent/file.json")

    def test_extract_institutions_context(self, sample_conversation_path):
        """Context text carries institution names and their ISIL codes."""
        parser = ConversationParser()
        conversation = parser.parse_file(sample_conversation_path)
        context = parser.extract_institutions_context(conversation)
        # Names paired with ISIL identifiers from the assistant replies.
        for expected in ("Rijksmuseum", "NL-AsdRM", "Nationaal Archief", "NL-HaNA"):
            assert expected in context
        # Human questions are filtered to assistant-only output; since the
        # human turns are short questions, that exclusion is implicit here.

    def test_get_conversation_metadata(self, sample_conversation_path):
        """Metadata reports ids, name and per-sender message counts."""
        parser = ConversationParser()
        conversation = parser.parse_file(sample_conversation_path)
        metadata = parser.get_conversation_metadata(conversation)
        assert metadata["conversation_id"] == "test-uuid-001"
        assert metadata["conversation_name"] == "Test Dutch GLAM Institutions"
        assert metadata["message_count"] == 4
        assert metadata["assistant_message_count"] == 2
        assert metadata["human_message_count"] == 2
class TestRealWorldConversation:
    """Tests exercising the realistic sample conversation fixture."""

    @pytest.fixture
    def rijksmuseum_conversation(self, sample_conversation_path):
        # Parsed once per test from the shared fixture file.
        return ConversationParser().parse_file(sample_conversation_path)

    def test_extract_rijksmuseum_info(self, rijksmuseum_conversation):
        """Assistant text carries Rijksmuseum name, ISIL, address and standards."""
        assistant_text = rijksmuseum_conversation.extract_all_text(sender="assistant")
        expected_fragments = (
            "Rijksmuseum",     # institution name
            "NL-AsdRM",        # ISIL code
            "Museumstraat 1",  # street address
            "Amsterdam",       # city
            "1 million",       # collection size
            "SPECTRUM",        # metadata standard
            "LIDO",            # metadata standard
            "rijksmuseum.nl",  # website
        )
        for fragment in expected_fragments:
            assert fragment in assistant_text

    def test_extract_nationaal_archief_info(self, rijksmuseum_conversation):
        """Assistant text carries Nationaal Archief name, ISIL and standards."""
        assistant_text = rijksmuseum_conversation.extract_all_text(sender="assistant")
        assert "Nationaal Archief" in assistant_text
        assert "NL-HaNA" in assistant_text  # ISIL code
        # Location may be rendered in English or Dutch.
        assert "The Hague" in assistant_text or "Den Haag" in assistant_text
        assert "EAD" in assistant_text
        assert "RiC-O" in assistant_text

    def test_message_count(self, rijksmuseum_conversation):
        """Fixture holds four messages split evenly between senders."""
        assert len(rijksmuseum_conversation.chat_messages) == 4
        assert len(rijksmuseum_conversation.get_assistant_messages()) == 2
        assert len(rijksmuseum_conversation.get_human_messages()) == 2

    def test_message_order(self, rijksmuseum_conversation):
        """Turns alternate human/assistant in conversation order."""
        senders = [m.sender for m in rijksmuseum_conversation.chat_messages]
        assert senders == ["human", "assistant", "human", "assistant"]
class TestPartnershipExtraction:
    """Tests for extracting partnership mentions from conversation text."""

    @pytest.fixture
    def parser(self):
        return ConversationParser()

    @pytest.fixture
    def conversation_with_europeana(self):
        """Conversation whose assistant turn mentions Europeana and DPLA."""
        reply = "The museum participates in Europeana since 2018 and collaborates with DPLA."
        return Conversation(
            uuid="test-europeana",
            name="Test Europeana",
            chat_messages=[
                ChatMessage(
                    uuid="msg-1",
                    text="Tell me about partnerships",
                    sender="human",
                    content=[],
                ),
                ChatMessage(
                    uuid="msg-2",
                    text=reply,
                    sender="assistant",
                    content=[MessageContent(type="text", text=reply)],
                ),
            ],
        )

    @pytest.fixture
    def conversation_with_temporal_info(self):
        """Conversation carrying a 'from YYYY to YYYY' partnership span."""
        reply = "The archive joined Archieven.nl from 2020 to 2025 and participated in DC4EU digitization program."
        return Conversation(
            uuid="test-temporal",
            name="Test Temporal",
            chat_messages=[
                ChatMessage(
                    uuid="msg-1",
                    text="",
                    sender="assistant",
                    content=[MessageContent(type="text", text=reply)],
                ),
            ],
        )

    @pytest.fixture
    def conversation_with_generic_partnerships(self):
        """Conversation using generic 'part of' / 'member of' phrasing."""
        reply = "The institution is part of the Digital Heritage Network and member of the International Museum Consortium."
        return Conversation(
            uuid="test-generic",
            name="Test Generic",
            chat_messages=[
                ChatMessage(
                    uuid="msg-1",
                    text="",
                    sender="assistant",
                    content=[MessageContent(type="text", text=reply)],
                ),
            ],
        )

    def test_extract_europeana_partnership(self, parser, conversation_with_europeana):
        """Europeana is found once, typed as aggregator, dated from 2018."""
        results = parser.extract_partnerships(conversation_with_europeana)
        hits = [p for p in results if "Europeana" in p["partner_name"]]
        assert len(hits) == 1
        assert hits[0]["partnership_type"] == "aggregator_participation"
        assert hits[0]["start_date"] == "2018-01-01"  # derived from "since 2018"

    def test_extract_dpla_partnership(self, parser, conversation_with_europeana):
        """DPLA is found once and typed as aggregator participation."""
        results = parser.extract_partnerships(conversation_with_europeana)
        hits = [p for p in results if "DPLA" in p["partner_name"]]
        assert len(hits) == 1
        assert hits[0]["partnership_type"] == "aggregator_participation"

    def test_extract_temporal_info_from_to(self, parser, conversation_with_temporal_info):
        """'from 2020 to 2025' yields both a start and an end date."""
        results = parser.extract_partnerships(conversation_with_temporal_info)
        hits = [p for p in results if "Archieven.nl" in p["partner_name"]]
        assert len(hits) == 1
        assert hits[0]["start_date"] == "2020-01-01"
        assert hits[0]["end_date"] == "2025-12-31"

    def test_extract_digitization_program(self, parser, conversation_with_temporal_info):
        """DC4EU is classified as a digitization program."""
        results = parser.extract_partnerships(conversation_with_temporal_info)
        hits = [p for p in results if "DC4EU" in p["partner_name"]]
        assert len(hits) == 1
        assert hits[0]["partnership_type"] == "digitization_program"

    def test_extract_generic_network_partnership(self, parser, conversation_with_generic_partnerships):
        """'part of the ... Network' phrasing is picked up."""
        results = parser.extract_partnerships(conversation_with_generic_partnerships)
        assert any("Digital Heritage Network" in p["partner_name"] for p in results)

    def test_extract_generic_consortium_partnership(self, parser, conversation_with_generic_partnerships):
        """'member of the ... Consortium' phrasing is picked up."""
        results = parser.extract_partnerships(conversation_with_generic_partnerships)
        assert any(
            "International Museum Consortium" in p["partner_name"] for p in results
        )

    def test_no_duplicates(self, parser):
        """Repeated mentions of the same partner collapse to one entry."""
        repeated = "Europeana is great. We work with Europeana. Europeana has many partners."
        conversation = Conversation(
            uuid="test-dupe",
            name="Test Duplicates",
            chat_messages=[
                ChatMessage(
                    uuid="msg-1",
                    text="",
                    sender="assistant",
                    content=[MessageContent(type="text", text=repeated)],
                ),
            ],
        )
        results = parser.extract_partnerships(conversation)
        europeana_hits = [p for p in results if "Europeana" in p["partner_name"]]
        assert len(europeana_hits) == 1

    def test_context_extraction(self, parser, conversation_with_europeana):
        """Each partnership carries a non-empty description naming the partner."""
        results = parser.extract_partnerships(conversation_with_europeana)
        europeana = [p for p in results if "Europeana" in p["partner_name"]][0]
        assert "description" in europeana
        assert len(europeana["description"]) > 0
        assert "Europeana" in europeana["description"]

    def test_classify_partnership_type_aggregator(self, parser):
        """Portal-style partners classify as aggregator participation."""
        result = parser._classify_partnership_type("Archives Portal Europe", "joined the portal")
        assert result == "aggregator_participation"

    def test_classify_partnership_type_network(self, parser):
        """Consortium membership classifies as a thematic network."""
        result = parser._classify_partnership_type("IIIF Consortium", "member of consortium")
        assert result == "thematic_network"

    def test_classify_partnership_type_program(self, parser):
        """Project participation classifies as a digitization program."""
        result = parser._classify_partnership_type("Digitization Initiative", "participated in project")
        assert result == "digitization_program"

    def test_temporal_extraction_since(self, parser):
        """'since YYYY' produces a start date and no end date."""
        info = parser._extract_temporal_info("participating since 2019")
        assert info["start_date"] == "2019-01-01"
        assert "end_date" not in info

    def test_temporal_extraction_until(self, parser):
        """'until YYYY' produces an end date and no start date."""
        info = parser._extract_temporal_info("active until 2024")
        assert info["end_date"] == "2024-12-31"
        assert "start_date" not in info

    def test_temporal_extraction_in(self, parser):
        """'in YYYY' bounds the partnership to that single year."""
        info = parser._extract_temporal_info("joined in 2022")
        assert info["start_date"] == "2022-01-01"
        assert info["end_date"] == "2022-12-31"

    def test_temporal_extraction_no_match(self, parser):
        """Text without any year pattern yields an empty result."""
        info = parser._extract_temporal_info("ongoing partnership")
        assert len(info) == 0

    def test_sentence_context_extraction(self, parser):
        """The full sentence surrounding a match is returned as context."""
        text = "First sentence. The museum participates in Europeana since 2018. Third sentence."
        context = parser._extract_sentence_context(text, text.index("Europeana"))
        assert "museum participates" in context
        assert "Europeana" in context
        assert "since 2018" in context

    def test_real_world_mexican_partnerships(self, parser):
        """A realistic multi-partner paragraph yields several partnerships."""
        conversation = Conversation(
            uuid="test-mexico",
            name="Mexican GLAM",
            chat_messages=[
                ChatMessage(
                    uuid="msg-1",
                    text="",
                    sender="assistant",
                    content=[
                        MessageContent(
                            type="text",
                            text="""
The 2018 Open GLAM México Conference brought Europeana representatives
to share best practices. Mexican libraries participate in OCLC WorldCat
and the CONRICYT consortium serves 500+ institutions since 2010.
Google Arts & Culture features major Mexican institutions.
"""
                        )
                    ],
                ),
            ],
        )
        results = parser.extract_partnerships(conversation)
        assert len(results) >= 3
        names = [p["partner_name"] for p in results]
        assert any("Europeana" in name for name in names)
        assert any("WorldCat" in name or "OCLC" in name for name in names)
        assert any("CONRICYT" in name for name in names)
        assert any("Google Arts & Culture" in name for name in names)