- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
553 lines · 21 KiB · Python
"""
|
|
Tests for ConversationParser
|
|
"""
|
|
|
|
import json
from datetime import datetime
from pathlib import Path

import pytest

from glam_extractor.parsers.conversation import (
    ChatMessage,
    Conversation,
    ConversationParser,
    MessageContent,
)
|
|
|
|
|
|
@pytest.fixture
def sample_conversation_path():
    """Path to the sample conversation fixture"""
    fixtures_dir = Path(__file__).parent.parent / "fixtures"
    return fixtures_dir / "sample_conversation.json"
|
|
|
|
|
|
@pytest.fixture
def sample_conversation_data():
    """Sample conversation data as dictionary"""
    messages = [
        {
            "uuid": "msg-001",
            "text": "Hello",
            "sender": "human",
            "content": [{"type": "text", "text": "Hello"}],
            "created_at": "2025-11-05T10:00:00.000000Z",
            "updated_at": "2025-11-05T10:00:00.000000Z",
        },
        {
            "uuid": "msg-002",
            "text": "Hi there!",
            "sender": "assistant",
            "content": [{"type": "text", "text": "Hi there!"}],
            "created_at": "2025-11-05T10:01:00.000000Z",
            "updated_at": "2025-11-05T10:01:00.000000Z",
        },
    ]
    return {
        "uuid": "test-uuid-001",
        "name": "Test Conversation",
        "summary": "A test conversation",
        "created_at": "2025-11-05T10:00:00.000000Z",
        "updated_at": "2025-11-05T10:30:00.000000Z",
        "chat_messages": messages,
    }
|
|
|
|
|
|
class TestMessageContent:
    """Unit tests for the MessageContent model"""

    def test_basic_text_content(self):
        block = MessageContent(type="text", text="Hello world")
        assert block.type == "text"
        assert block.text == "Hello world"

    def test_tool_use_content(self):
        # The model accepts extra fields (e.g. tool name and input payload)
        block = MessageContent(
            type="tool_use", name="web_search", input={"query": "museums"}
        )
        assert block.type == "tool_use"
        assert block.text is None
|
|
|
|
|
|
class TestChatMessage:
    """Unit tests for the ChatMessage model and its text extraction"""

    def test_basic_message(self):
        message = ChatMessage(
            uuid="msg-001",
            text="Hello",
            sender="human",
            content=[MessageContent(type="text", text="Hello")],
        )
        assert message.uuid == "msg-001"
        assert message.text == "Hello"
        assert message.sender == "human"
        assert len(message.content) == 1

    def test_extract_text_from_text_field(self):
        message = ChatMessage(
            uuid="msg-001", text="Main text", sender="human", content=[]
        )
        assert message.extract_text() == "Main text"

    def test_extract_text_from_content(self):
        blocks = [
            MessageContent(type="text", text="Content text"),
            MessageContent(type="tool_use", name="search"),
        ]
        message = ChatMessage(
            uuid="msg-001", text="", sender="assistant", content=blocks
        )
        # Only text-type content blocks contribute to the extraction
        assert message.extract_text() == "Content text"

    def test_extract_text_deduplication(self):
        # The top-level text field and content[0].text sometimes carry the
        # same string; extraction must not emit it twice.
        message = ChatMessage(
            uuid="msg-001",
            text="Same text",
            sender="human",
            content=[MessageContent(type="text", text="Same text")],
        )
        assert message.extract_text() == "Same text"

    def test_extract_text_combined(self):
        message = ChatMessage(
            uuid="msg-001",
            text="First part",
            sender="assistant",
            content=[
                MessageContent(type="text", text="Second part"),
                MessageContent(type="text", text="Third part"),
            ],
        )
        combined = message.extract_text()
        for fragment in ("First part", "Second part", "Third part"):
            assert fragment in combined
|
|
|
|
|
|
class TestConversation:
    """Unit tests for the Conversation model"""

    def test_basic_conversation(self, sample_conversation_data):
        conversation = Conversation(**sample_conversation_data)
        assert conversation.uuid == "test-uuid-001"
        assert conversation.name == "Test Conversation"
        assert conversation.summary == "A test conversation"
        assert len(conversation.chat_messages) == 2

    def test_datetime_parsing(self):
        conversation = Conversation(
            uuid="test",
            name="Test",
            created_at="2025-11-05T10:00:00.000000Z",
            updated_at="2025-11-05T10:30:00.000000Z",
        )
        # ISO-8601 timestamp strings are coerced into datetime objects
        assert isinstance(conversation.created_at, datetime)
        assert isinstance(conversation.updated_at, datetime)

    def test_get_assistant_messages(self, sample_conversation_data):
        conversation = Conversation(**sample_conversation_data)
        from_assistant = conversation.get_assistant_messages()
        assert len(from_assistant) == 1
        assert from_assistant[0].sender == "assistant"

    def test_get_human_messages(self, sample_conversation_data):
        conversation = Conversation(**sample_conversation_data)
        from_human = conversation.get_human_messages()
        assert len(from_human) == 1
        assert from_human[0].sender == "human"

    def test_extract_all_text(self, sample_conversation_data):
        conversation = Conversation(**sample_conversation_data)
        combined = conversation.extract_all_text()
        assert "Hello" in combined
        assert "Hi there!" in combined

    def test_extract_text_by_sender(self, sample_conversation_data):
        conversation = Conversation(**sample_conversation_data)
        assistant_only = conversation.extract_all_text(sender="assistant")
        # Only the assistant turn survives the sender filter
        assert "Hi there!" in assistant_only
        assert "Hello" not in assistant_only
|
|
|
|
|
|
class TestConversationParser:
    """Unit tests for ConversationParser parsing and extraction entry points"""

    def test_parse_dict(self, sample_conversation_data):
        result = ConversationParser().parse_dict(sample_conversation_data)
        assert isinstance(result, Conversation)
        assert result.uuid == "test-uuid-001"

    def test_parse_dict_missing_uuid(self):
        with pytest.raises(ValueError, match="uuid"):
            ConversationParser().parse_dict({"name": "Test"})

    def test_parse_dict_missing_name(self):
        with pytest.raises(ValueError, match="name"):
            ConversationParser().parse_dict({"uuid": "test-123"})

    def test_parse_dict_not_dict(self):
        with pytest.raises(ValueError, match="dictionary"):
            ConversationParser().parse_dict("not a dict")

    def test_parse_file(self, sample_conversation_path):
        result = ConversationParser().parse_file(sample_conversation_path)
        assert isinstance(result, Conversation)
        assert result.uuid == "test-uuid-001"
        assert result.name == "Test Dutch GLAM Institutions"

    def test_parse_file_not_found(self):
        with pytest.raises(FileNotFoundError):
            ConversationParser().parse_file("/nonexistent/file.json")

    def test_extract_institutions_context(self, sample_conversation_path):
        parser = ConversationParser()
        conversation = parser.parse_file(sample_conversation_path)
        context = parser.extract_institutions_context(conversation)

        # Assistant answers carry the institution details, ISIL codes included
        for expected in ("Rijksmuseum", "NL-AsdRM", "Nationaal Archief", "NL-HaNA"):
            assert expected in context

        # Human turns are short questions; the assistant-only filter is
        # implicitly confirmed by the assertions above.

    def test_get_conversation_metadata(self, sample_conversation_path):
        parser = ConversationParser()
        conversation = parser.parse_file(sample_conversation_path)
        metadata = parser.get_conversation_metadata(conversation)

        assert metadata["conversation_id"] == "test-uuid-001"
        assert metadata["conversation_name"] == "Test Dutch GLAM Institutions"
        assert metadata["message_count"] == 4
        assert metadata["assistant_message_count"] == 2
        assert metadata["human_message_count"] == 2
|
|
|
|
|
|
class TestRealWorldConversation:
    """Tests exercising the actual sample-conversation fixture structure"""

    @pytest.fixture
    def rijksmuseum_conversation(self, sample_conversation_path):
        return ConversationParser().parse_file(sample_conversation_path)

    def test_extract_rijksmuseum_info(self, rijksmuseum_conversation):
        """Test extracting information about Rijksmuseum"""
        text = rijksmuseum_conversation.extract_all_text(sender="assistant")

        expected_fragments = (
            "Rijksmuseum",      # institution name
            "NL-AsdRM",         # ISIL code
            "Museumstraat 1",   # address
            "Amsterdam",
            "1 million",        # collection info
            "SPECTRUM",         # metadata standards
            "LIDO",
            "rijksmuseum.nl",   # website
        )
        for fragment in expected_fragments:
            assert fragment in text

    def test_extract_nationaal_archief_info(self, rijksmuseum_conversation):
        """Test extracting information about Nationaal Archief"""
        text = rijksmuseum_conversation.extract_all_text(sender="assistant")

        assert "Nationaal Archief" in text  # institution name
        assert "NL-HaNA" in text            # ISIL code
        # Location may appear in English or Dutch
        assert "The Hague" in text or "Den Haag" in text
        # Metadata standards
        assert "EAD" in text
        assert "RiC-O" in text

    def test_message_count(self, rijksmuseum_conversation):
        """Test message counts"""
        assert len(rijksmuseum_conversation.chat_messages) == 4
        assert len(rijksmuseum_conversation.get_assistant_messages()) == 2
        assert len(rijksmuseum_conversation.get_human_messages()) == 2

    def test_message_order(self, rijksmuseum_conversation):
        """Test messages are in correct order"""
        messages = rijksmuseum_conversation.chat_messages
        for index, expected_sender in enumerate(
            ["human", "assistant", "human", "assistant"]
        ):
            assert messages[index].sender == expected_sender
|
|
|
|
|
|
class TestPartnershipExtraction:
    """Tests for partnership extraction from conversation text"""

    @staticmethod
    def _assistant_conversation(conv_uuid, conv_name, body):
        """Build a Conversation holding one assistant message whose text is *body*."""
        return Conversation(
            uuid=conv_uuid,
            name=conv_name,
            chat_messages=[
                ChatMessage(
                    uuid="msg-1",
                    text="",
                    sender="assistant",
                    content=[MessageContent(type="text", text=body)],
                ),
            ],
        )

    @pytest.fixture
    def parser(self):
        return ConversationParser()

    @pytest.fixture
    def conversation_with_europeana(self):
        """Conversation mentioning Europeana partnership"""
        answer = (
            "The museum participates in Europeana since 2018 and collaborates with DPLA."
        )
        return Conversation(
            uuid="test-europeana",
            name="Test Europeana",
            chat_messages=[
                ChatMessage(
                    uuid="msg-1",
                    text="Tell me about partnerships",
                    sender="human",
                    content=[],
                ),
                ChatMessage(
                    uuid="msg-2",
                    text=answer,
                    sender="assistant",
                    content=[MessageContent(type="text", text=answer)],
                ),
            ],
        )

    @pytest.fixture
    def conversation_with_temporal_info(self):
        """Conversation with temporal partnership information"""
        return self._assistant_conversation(
            "test-temporal",
            "Test Temporal",
            "The archive joined Archieven.nl from 2020 to 2025 and participated in DC4EU digitization program.",
        )

    @pytest.fixture
    def conversation_with_generic_partnerships(self):
        """Conversation with generic partnership phrases"""
        return self._assistant_conversation(
            "test-generic",
            "Test Generic",
            "The institution is part of the Digital Heritage Network and member of the International Museum Consortium.",
        )

    def test_extract_europeana_partnership(self, parser, conversation_with_europeana):
        """Test extracting Europeana partnership"""
        found = parser.extract_partnerships(conversation_with_europeana)

        europeana_hits = [p for p in found if "Europeana" in p["partner_name"]]
        assert len(europeana_hits) == 1
        assert europeana_hits[0]["partnership_type"] == "aggregator_participation"
        assert europeana_hits[0]["start_date"] == "2018-01-01"  # from "since 2018"

    def test_extract_dpla_partnership(self, parser, conversation_with_europeana):
        """Test extracting DPLA partnership"""
        found = parser.extract_partnerships(conversation_with_europeana)

        dpla_hits = [p for p in found if "DPLA" in p["partner_name"]]
        assert len(dpla_hits) == 1
        assert dpla_hits[0]["partnership_type"] == "aggregator_participation"

    def test_extract_temporal_info_from_to(self, parser, conversation_with_temporal_info):
        """Test extracting 'from YYYY to YYYY' temporal pattern"""
        found = parser.extract_partnerships(conversation_with_temporal_info)

        archieven_hits = [p for p in found if "Archieven.nl" in p["partner_name"]]
        assert len(archieven_hits) == 1
        assert archieven_hits[0]["start_date"] == "2020-01-01"
        assert archieven_hits[0]["end_date"] == "2025-12-31"

    def test_extract_digitization_program(self, parser, conversation_with_temporal_info):
        """Test extracting digitization program partnership"""
        found = parser.extract_partnerships(conversation_with_temporal_info)

        dc4eu_hits = [p for p in found if "DC4EU" in p["partner_name"]]
        assert len(dc4eu_hits) == 1
        assert dc4eu_hits[0]["partnership_type"] == "digitization_program"

    def test_extract_generic_network_partnership(
        self, parser, conversation_with_generic_partnerships
    ):
        """Test extracting generic 'part of Network' phrase"""
        found = parser.extract_partnerships(conversation_with_generic_partnerships)

        network_hits = [
            p for p in found if "Digital Heritage Network" in p["partner_name"]
        ]
        assert len(network_hits) >= 1

    def test_extract_generic_consortium_partnership(
        self, parser, conversation_with_generic_partnerships
    ):
        """Test extracting generic 'member of Consortium' phrase"""
        found = parser.extract_partnerships(conversation_with_generic_partnerships)

        consortium_hits = [
            p for p in found if "International Museum Consortium" in p["partner_name"]
        ]
        assert len(consortium_hits) >= 1

    def test_no_duplicates(self, parser):
        """Test that duplicate partnerships are not returned"""
        conversation = self._assistant_conversation(
            "test-dupe",
            "Test Duplicates",
            "Europeana is great. We work with Europeana. Europeana has many partners.",
        )

        found = parser.extract_partnerships(conversation)

        # Three mentions must collapse to a single partnership record
        europeana_hits = [p for p in found if "Europeana" in p["partner_name"]]
        assert len(europeana_hits) == 1

    def test_context_extraction(self, parser, conversation_with_europeana):
        """Test that description contains context sentence"""
        found = parser.extract_partnerships(conversation_with_europeana)
        europeana = [p for p in found if "Europeana" in p["partner_name"]][0]

        # The description field should hold the surrounding sentence
        assert "description" in europeana
        assert len(europeana["description"]) > 0
        assert "Europeana" in europeana["description"]

    def test_classify_partnership_type_aggregator(self, parser):
        """Test classification of aggregator partnerships"""
        result = parser._classify_partnership_type(
            "Archives Portal Europe", "joined the portal"
        )
        assert result == "aggregator_participation"

    def test_classify_partnership_type_network(self, parser):
        """Test classification of network partnerships"""
        result = parser._classify_partnership_type(
            "IIIF Consortium", "member of consortium"
        )
        assert result == "thematic_network"

    def test_classify_partnership_type_program(self, parser):
        """Test classification of program partnerships"""
        result = parser._classify_partnership_type(
            "Digitization Initiative", "participated in project"
        )
        assert result == "digitization_program"

    def test_temporal_extraction_since(self, parser):
        """Test 'since YYYY' pattern"""
        info = parser._extract_temporal_info("participating since 2019")
        assert info["start_date"] == "2019-01-01"
        assert "end_date" not in info

    def test_temporal_extraction_until(self, parser):
        """Test 'until YYYY' pattern"""
        info = parser._extract_temporal_info("active until 2024")
        assert info["end_date"] == "2024-12-31"
        assert "start_date" not in info

    def test_temporal_extraction_in(self, parser):
        """Test 'in YYYY' pattern (single year)"""
        info = parser._extract_temporal_info("joined in 2022")
        assert info["start_date"] == "2022-01-01"
        assert info["end_date"] == "2022-12-31"

    def test_temporal_extraction_no_match(self, parser):
        """Test text with no temporal patterns"""
        info = parser._extract_temporal_info("ongoing partnership")
        assert len(info) == 0

    def test_sentence_context_extraction(self, parser):
        """Test sentence context extraction"""
        text = "First sentence. The museum participates in Europeana since 2018. Third sentence."
        context = parser._extract_sentence_context(text, text.index("Europeana"))

        # The full middle sentence should be returned as context
        assert "museum participates" in context
        assert "Europeana" in context
        assert "since 2018" in context

    def test_real_world_mexican_partnerships(self, parser):
        """Test extracting partnerships from Mexican GLAM text"""
        conversation = self._assistant_conversation(
            "test-mexico",
            "Mexican GLAM",
            """
            The 2018 Open GLAM México Conference brought Europeana representatives
            to share best practices. Mexican libraries participate in OCLC WorldCat
            and the CONRICYT consortium serves 500+ institutions since 2010.
            Google Arts & Culture features major Mexican institutions.
            """,
        )

        found = parser.extract_partnerships(conversation)

        # Multiple distinct partnerships should surface from one message
        assert len(found) >= 3

        names = [p["partner_name"] for p in found]
        assert any("Europeana" in name for name in names)
        assert any("WorldCat" in name or "OCLC" in name for name in names)
        assert any("CONRICYT" in name for name in names)
        assert any("Google Arts & Culture" in name for name in names)
|
|
|