1160 lines
46 KiB
Python
1160 lines
46 KiB
Python
"""
Unit tests for the Template-Based SPARQL Query Generation System.

Tests the critical ordering of the pipeline:

1. ConversationContextResolver (FIRST - resolves follow-ups)
2. FykeFilter (on the RESOLVED question, not raw input!)
3. TemplateClassifier
4. SlotExtractor
5. TemplateInstantiator

Run with: pytest tests/test_template_sparql.py -v
"""
|
|
|
|
import json
|
|
import pytest
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
# Add project root to path for imports
|
|
import sys
|
|
PROJECT_ROOT = Path(__file__).parent.parent
|
|
sys.path.insert(0, str(PROJECT_ROOT))
|
|
|
|
# Try to import the module under test
|
|
try:
|
|
from backend.rag.template_sparql import (
|
|
SynonymResolver,
|
|
get_synonym_resolver,
|
|
ConversationState,
|
|
ConversationTurn,
|
|
ResolvedQuestion,
|
|
FykeResult,
|
|
TemplateMatchResult,
|
|
FykeFilterConfig,
|
|
TemplateInstantiator,
|
|
TemplateClassifier,
|
|
SPARQL_PREFIXES,
|
|
)
|
|
TEMPLATE_SPARQL_AVAILABLE = True
|
|
except ImportError as e:
|
|
# Module may not be importable in all environments (missing dspy, etc.)
|
|
TEMPLATE_SPARQL_AVAILABLE = False
|
|
IMPORT_ERROR = str(e)
|
|
|
|
# Create placeholder classes for tests that don't need full module
|
|
from pydantic import BaseModel, Field
|
|
from typing import Optional, Literal
|
|
from dataclasses import dataclass
|
|
|
|
class ConversationTurn(BaseModel):
    """Placeholder model for one message in a conversation.

    Used only when the real ``backend.rag.template_sparql`` module cannot be
    imported, so that tests which do not need the full module still run.
    """

    # Author of the message; only these two values validate.
    role: Literal["user", "assistant"]
    # Message text exactly as given.
    content: str
    # Optional; presumably the context-resolved form of ``content`` -- confirm
    # against the real module.
    resolved_question: Optional[str] = None
    # Optional identifier of the template associated with this turn.
    template_id: Optional[str] = None
    # Slot values attached to this turn; a fresh empty dict per instance.
    slots: dict = Field(default_factory=dict)
    # Results attached to this turn; a fresh empty list per instance.
    results: list = Field(default_factory=list)
|
|
|
|
class ConversationState(BaseModel):
    """Placeholder model that accumulates turns and the slots they carry.

    Used only when the real ``backend.rag.template_sparql`` module cannot be
    imported.
    """

    # Every turn added so far, oldest first.
    turns: list = Field(default_factory=list)
    # Slot values merged from user turns; later turns overwrite earlier keys.
    current_slots: dict = Field(default_factory=dict)
    # Template id of the most recent turn that carried one.
    current_template_id: Optional[str] = None
    # Language code; defaults to "nl".
    language: str = "nl"

    def add_turn(self, turn):
        """Append ``turn`` and fold its slots/template id into the state.

        Slots are merged only for user turns that actually carry slots, so
        keys missing from a later turn keep their earlier value.
        """
        self.turns.append(turn)

        carries_user_slots = turn.role == "user" and turn.slots
        if carries_user_slots:
            self.current_slots.update(turn.slots)

        # NOTE(review): the source's indentation was lost; this treats the
        # template-id update as independent of the slot update -- confirm.
        if turn.template_id:
            self.current_template_id = turn.template_id

    def get_previous_user_turn(self):
        """Return the most recently added user turn, or None if there is none."""
        newest_first = reversed(self.turns)
        return next((t for t in newest_first if t.role == "user"), None)

    def to_dspy_history(self):
        """Return the last six turns as ``{"messages": [{role, content}, ...]}``."""
        messages = []
        for recent_turn in self.turns[-6:]:
            messages.append(
                {"role": recent_turn.role, "content": recent_turn.content}
            )
        return {"messages": messages}
|
|
|
|
class ResolvedQuestion(BaseModel):
    """Placeholder model pairing a raw question with its resolved form.

    Used only when the real ``backend.rag.template_sparql`` module cannot be
    imported.
    """

    # Question text as originally asked.
    original: str
    # Question text after resolution.
    resolved: str
    # Whether the question was treated as a follow-up; False by default.
    is_follow_up: bool = False
    # Optional label for the kind of follow-up (the tests in this file use
    # values such as "location_swap" and "count_from_list").
    follow_up_type: Optional[str] = None
    # Slots carried over from earlier turns; a fresh empty dict per instance.
    inherited_slots: dict = Field(default_factory=dict)
    # Confidence score; defaults to 1.0.
    confidence: float = 1.0
|
|
|
|
class FykeResult(BaseModel):
    """Placeholder model for the outcome of a relevance (Fyke) filter check.

    Used only when the real ``backend.rag.template_sparql`` module cannot be
    imported.
    """

    # Required verdict flag.
    is_relevant: bool
    # Required confidence score.
    confidence: float
    # Required free-text explanation.
    reasoning: str
    # Optional canned reply; None when not provided.
    standard_response: Optional[str] = None
|
|
|
|
class TemplateMatchResult(BaseModel):
    """Placeholder model for the result of matching a question to a template.

    Used only when the real ``backend.rag.template_sparql`` module cannot be
    imported.
    """

    # Required flag: whether a template matched.
    matched: bool
    # Identifier of the matched template, if any.
    template_id: Optional[str] = None
    # Match confidence; defaults to 0.0.
    confidence: float = 0.0
    # Slot values for the match; a fresh empty dict per instance.
    slots: dict = Field(default_factory=dict)
    # SPARQL text, if any.
    sparql: Optional[str] = None
    # Free-text explanation; empty by default.
    reasoning: str = ""
|
|
|
|
class FykeFilterConfig(BaseModel):
    """Placeholder model holding the keyword lists used by the Fyke filter tests.

    Used only when the real ``backend.rag.template_sparql`` module cannot be
    imported. All four fields are required.
    """

    # Keywords that mark a question as out of scope.
    out_of_scope_keywords: list
    # Category names considered out of scope.
    out_of_scope_categories: list
    # Keywords that mark a question as heritage-related.
    heritage_keywords: list
    # Canned replies; the tests in this file key them by language code.
    standard_response: dict
|
|
|
|
class SynonymResolver:
    """Placeholder synonym resolver used when the real module is unavailable.

    Maps free-text terms (Dutch, English, German) onto canonical values using
    small built-in tables. All lookups ignore case and surrounding whitespace.
    """

    # Single-letter institution type codes that pass through unchanged.
    # A set (rather than a string) so that only exact one-letter codes match;
    # a substring test would also accept "" and runs such as "FIX" or "ML".
    _TYPE_CODES = frozenset("MLAGORCUBESFIXPHDNT")

    # Institution type synonym -> single-letter type code.
    _INSTITUTION_TYPES = {
        "musea": "M", "museum": "M", "museums": "M",
        "archieven": "A", "archief": "A", "archives": "A",
        "bibliotheken": "L", "bibliotheek": "L", "libraries": "L",
        "galerie": "G", "galleries": "G",
    }

    # Alternative city spellings -> canonical city name.
    _CITY_CORRECTIONS = {
        "den haag": "Den Haag",
        "the hague": "Den Haag",
        "'s-gravenhage": "Den Haag",
    }

    # Canonical budget category -> synonyms (Dutch, English, German).
    _BUDGET_SYNONYMS = {
        "innovation": (
            "innovatie", "innovaties", "vernieuwing",
            "innovation", "innovations", "r_and_d", "technology",
            "innovationen", "erneuerung",
        ),
        "digitization": (
            "digitalisering",
            "digitization", "digitisation",
            "digitalisierung",
        ),
        "preservation": (
            "conservering", "restauratie",
            "preservation", "conservation",
            "konservierung",
        ),
        "personnel": (
            "personeel", "salarissen",
            "personnel", "staff", "salaries",
            "personal",
        ),
        "acquisition": (
            "aanwinsten", "aankopen",
            "acquisition", "acquisitions",
            "erwerbungen",
        ),
        "operating": (
            "operationeel", "exploitatie",
            "operating", "operations",
            "betriebskosten",
        ),
        "capital": (
            "kapitaal", "investeringen",
            "capital", "capex",
            "investitionen",
        ),
    }

    # Flattened synonym -> canonical category lookup, built once per class.
    _BUDGET_CATEGORIES = {
        synonym: category
        for category, synonyms in _BUDGET_SYNONYMS.items()
        for synonym in synonyms
    }

    def __init__(self):
        self._loaded = False

    def load(self):
        """No-op; the placeholder has nothing to load."""
        pass

    def resolve_institution_type(self, term):
        """Return the single-letter type code for ``term``, or None.

        Known synonyms are mapped to their code; an exact one-letter code
        (any case) is returned upper-cased. Anything else yields None.
        """
        cleaned = term.strip()
        code = self._INSTITUTION_TYPES.get(cleaned.lower())
        if code is not None:
            return code
        candidate = cleaned.upper()
        if candidate in self._TYPE_CODES:
            return candidate
        return None

    def resolve_city(self, term):
        """Return the canonical city name for ``term``.

        Known alternative spellings map to their canonical name; any other
        input is returned title-cased.
        """
        cleaned = term.strip()
        corrected = self._CITY_CORRECTIONS.get(cleaned.lower())
        if corrected is not None:
            return corrected
        return cleaned.title()

    def resolve_subregion(self, term):
        """Always return None; the placeholder has no subregion data."""
        return None

    def resolve_country(self, term):
        """Return ``term`` if it is a Q-number ("Q" followed by digits), else None."""
        # Require digits after the "Q" so that ordinary words starting with
        # "Q" (e.g. "Qatar") are not passed through as identifiers.
        if term[:1] == "Q" and term[1:].isdigit():
            return term
        return None

    def resolve_budget_category(self, term):
        """Resolve budget category term to canonical slot name.

        Returns None for unknown terms, including the empty string.
        """
        return self._BUDGET_CATEGORIES.get(term.lower().strip())
|
|
|
|
def get_synonym_resolver():
    """Return a newly constructed placeholder SynonymResolver."""
    resolver = SynonymResolver()
    return resolver
|
|
|
|
# SPARQL namespace prefix declarations, one per line.
SPARQL_PREFIXES = "\n".join(
    (
        "PREFIX hc: <https://nde.nl/ontology/hc/>",
        "PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>",
        "PREFIX schema: <http://schema.org/>",
        "PREFIX skos: <http://www.w3.org/2004/02/skos/core#>",
    )
)
|
|
|
|
class TemplateInstantiator:
    """Empty placeholder so tests can construct a TemplateInstantiator."""

    pass


class TemplateClassifier:
    """Placeholder that skips any test trying to use the real classifier.

    The guarded import provides ``TemplateClassifier`` when the real module is
    available; without a fallback definition, tests that construct it would
    fail with NameError instead of being reported as skipped.
    """

    def __init__(self, *args, **kwargs):
        # Imported here so the placeholder carries its own dependency.
        import pytest

        # IMPORT_ERROR is set by the surrounding import guard; fall back to a
        # generic reason if this class is used without it.
        reason = globals().get("IMPORT_ERROR", "unknown import error")
        pytest.skip(f"backend.rag.template_sparql is not importable: {reason}")
|
|
|
|
|
|
# =============================================================================
|
|
# SYNONYM RESOLVER TESTS
|
|
# =============================================================================
|
|
|
|
class TestSynonymResolver:
    """Checks for the term-normalisation methods of SynonymResolver."""

    def test_resolve_institution_type_dutch(self):
        """Dutch singular and plural institution names map to type codes."""
        resolver = SynonymResolver()
        expected_codes = {
            "musea": "M",
            "museum": "M",
            "archieven": "A",
            "archief": "A",
            "bibliotheken": "L",
            "bibliotheek": "L",
            "galerie": "G",
        }
        for term, code in expected_codes.items():
            assert resolver.resolve_institution_type(term) == code

    def test_resolve_institution_type_english(self):
        """English plural institution names map to type codes."""
        resolver = SynonymResolver()
        expected_codes = {
            "museums": "M",
            "archives": "A",
            "libraries": "L",
            "galleries": "G",
        }
        for term, code in expected_codes.items():
            assert resolver.resolve_institution_type(term) == code

    def test_resolve_institution_type_code_passthrough(self):
        """A single-letter code is returned unchanged."""
        resolver = SynonymResolver()
        for code in ("M", "A", "L"):
            assert resolver.resolve_institution_type(code) == code

    def test_resolve_institution_type_case_insensitive(self):
        """Letter case of the input term does not affect the result."""
        resolver = SynonymResolver()
        expected_codes = {
            "MUSEA": "M",
            "Archieven": "A",
            "BIBLIOTHEKEN": "L",
        }
        for term, code in expected_codes.items():
            assert resolver.resolve_institution_type(term) == code

    def test_resolve_city_corrections(self):
        """Alternative spellings and casing resolve to canonical city names."""
        resolver = SynonymResolver()
        expected_names = {
            "den haag": "Den Haag",
            "the hague": "Den Haag",
            "'s-gravenhage": "Den Haag",
            "amsterdam": "Amsterdam",
            "ROTTERDAM": "Rotterdam",
        }
        for term, name in expected_names.items():
            assert resolver.resolve_city(term) == name

    def test_resolve_subregion_dutch_provinces(self):
        """A Dutch province resolves to its code, or to None without rules."""
        resolver = SynonymResolver()

        # Resolution may require the validation rules to be loaded, so both
        # outcomes are accepted.
        outcome = resolver.resolve_subregion("noord-holland")
        assert outcome in (None, "NL-NH")

    def test_resolve_country(self):
        """A Wikidata Q-number is passed through unchanged."""
        resolver = SynonymResolver()
        assert resolver.resolve_country("Q55") == "Q55"
|
|
|
|
|
|
# =============================================================================
|
|
# CONVERSATION STATE TESTS
|
|
# =============================================================================
|
|
|
|
class TestConversationState:
    """Checks for turn bookkeeping and slot tracking in ConversationState."""

    def test_empty_state(self):
        """A new state has no turns, no slots, no template and no user turn."""
        fresh = ConversationState()

        assert len(fresh.turns) == 0
        assert fresh.current_slots == {}
        assert fresh.current_template_id is None
        assert fresh.get_previous_user_turn() is None

    def test_add_user_turn(self):
        """Adding a user turn records its slots and template id."""
        question = "Welke archieven zijn er in Den Haag?"
        state = ConversationState()
        state.add_turn(
            ConversationTurn(
                role="user",
                content=question,
                resolved_question=question,
                template_id="list_institutions_by_type_city",
                slots={"institution_type": "A", "city": "Den Haag"},
            )
        )

        assert len(state.turns) == 1
        assert state.current_slots["institution_type"] == "A"
        assert state.current_slots["city"] == "Den Haag"
        assert state.current_template_id == "list_institutions_by_type_city"

    def test_slot_inheritance(self):
        """Slots from an earlier turn survive when a later turn omits them."""
        state = ConversationState()

        # The opening question supplies both institution_type and city.
        opening = ConversationTurn(
            role="user",
            content="Welke archieven zijn er in Den Haag?",
            slots={"institution_type": "A", "city": "Den Haag"},
        )
        # The follow-up supplies only a new city.
        follow_up = ConversationTurn(
            role="user",
            content="En in Enschede?",
            slots={"city": "Enschede"},
        )
        for turn in (opening, follow_up):
            state.add_turn(turn)

        # institution_type is carried over; city is replaced.
        assert state.current_slots["institution_type"] == "A"
        assert state.current_slots["city"] == "Enschede"

    def test_get_previous_user_turn(self):
        """The latest user turn is found even after an assistant turn."""
        state = ConversationState()
        for role, content in (("user", "Question?"), ("assistant", "Answer.")):
            state.add_turn(ConversationTurn(role=role, content=content))

        latest_user_turn = state.get_previous_user_turn()
        assert latest_user_turn is not None
        assert latest_user_turn.content == "Question?"

    def test_to_dspy_history(self):
        """History conversion keeps every turn in order."""
        state = ConversationState()
        for role, content in (("user", "Q1"), ("assistant", "A1"), ("user", "Q2")):
            state.add_turn(ConversationTurn(role=role, content=content))

        history = state.to_dspy_history()

        # The real implementation returns a DSPy History object; the
        # placeholder returns a plain dict.
        messages = (
            history.get("messages", [])
            if isinstance(history, dict)
            else history.messages
        )

        assert len(messages) == 3
        first = messages[0]
        assert first["role"] == "user"
        assert first["content"] == "Q1"
|
|
|
|
|
|
# =============================================================================
|
|
# FYKE FILTER TESTS (Critical: Must operate on RESOLVED question!)
|
|
# =============================================================================
|
|
|
|
class TestFykeFilter:
    """Tests for FykeFilter.

    CRITICAL: the Fyke filter must be applied to RESOLVED questions, not raw
    input. A short follow-up such as "En in Enschede?" must NOT be filtered
    out when it resolves to a valid heritage question.
    """

    @staticmethod
    def _mentions_any(text, keywords):
        """Return True if any keyword occurs in the lower-cased text."""
        lowered = text.lower()
        return any(keyword in lowered for keyword in keywords)

    def test_fyke_config_loads(self):
        """A config built from explicit values exposes those values."""
        config = FykeFilterConfig(
            out_of_scope_keywords=["tandpasta", "supermarkt"],
            out_of_scope_categories=["shopping"],
            heritage_keywords=["museum", "archief"],
            standard_response={
                "nl": "Ik help met erfgoed.",
                "en": "I help with heritage.",
            },
        )

        assert "tandpasta" in config.out_of_scope_keywords
        assert "museum" in config.heritage_keywords

    def test_heritage_keywords_pass(self):
        """A question containing a heritage keyword counts as relevant."""
        config = FykeFilterConfig(
            out_of_scope_keywords=[],
            out_of_scope_categories=[],
            # Dutch plural forms are listed alongside the singular ones.
            heritage_keywords=[
                "museum", "musea",
                "archief", "archieven",
                "bibliotheek", "bibliotheken",
            ],
            standard_response={},
        )

        # Keyword check only; no DSPy call is made here.
        is_relevant = self._mentions_any(
            "Welke musea zijn er in Amsterdam?", config.heritage_keywords
        )

        assert is_relevant is True

    def test_out_of_scope_blocked(self):
        """A question containing an out-of-scope keyword counts as blocked."""
        config = FykeFilterConfig(
            out_of_scope_keywords=["tandpasta", "supermarkt", "restaurant"],
            out_of_scope_categories=[],
            heritage_keywords=["museum"],
            standard_response={"nl": "Ik help met erfgoed."},
        )

        is_blocked = self._mentions_any(
            "Waar kan ik tandpasta kopen?", config.out_of_scope_keywords
        )

        assert is_blocked is True

    def test_resolved_follow_up_passes(self):
        """CRITICAL: a resolved follow-up must pass the filter.

        Raw: "En in Enschede?" (ambiguous on its own)
        Resolved: "Welke archieven zijn er in Enschede?" (clearly relevant)

        The Fyke filter MUST see the resolved question.
        """
        config = FykeFilterConfig(
            out_of_scope_keywords=["tandpasta"],
            out_of_scope_categories=[],
            heritage_keywords=["archieven", "musea", "bibliotheken"],
            standard_response={},
        )

        # The text the Fyke filter is expected to receive.
        resolved_question = "Welke archieven zijn er in Enschede?"
        is_relevant = self._mentions_any(resolved_question, config.heritage_keywords)

        assert is_relevant is True

    def test_short_follow_up_without_resolution_would_fail(self):
        """Show why ConversationContextResolver must run FIRST.

        The raw follow-up "En in Enschede?" contains no heritage keyword, so
        a keyword check on it fails; the resolved question passes.
        """
        config = FykeFilterConfig(
            out_of_scope_keywords=[],
            out_of_scope_categories=[],
            # Dutch plural forms are listed alongside the singular ones.
            heritage_keywords=[
                "museum", "musea",
                "archief", "archieven",
                "bibliotheek", "bibliotheken",
                "galerie", "galerijen",
            ],
            standard_response={},
        )

        # Unresolved follow-up: no keyword matches.
        would_match_heritage = self._mentions_any(
            "En in Enschede?", config.heritage_keywords
        )
        assert would_match_heritage is False

        # Resolved follow-up: "archieven" matches.
        matches_after_resolution = self._mentions_any(
            "Welke archieven zijn er in Enschede?", config.heritage_keywords
        )
        assert matches_after_resolution is True
|
|
|
|
|
|
# =============================================================================
|
|
# TEMPLATE INSTANTIATOR TESTS
|
|
# =============================================================================
|
|
|
|
class TestTemplateInstantiator:
    """Checks around SPARQL template rendering and the shared prefixes."""

    def test_simple_template_render(self):
        """A Jinja2 template renders prefixes and slot values into the query."""
        from jinja2 import Environment, BaseLoader

        # The instance itself is not used below; rendering goes through
        # Jinja2 directly.
        instantiator = TemplateInstantiator()

        query_template = """{{ prefixes }}
SELECT ?institution ?name WHERE {
    ?institution hc:institutionType "{{ institution_type }}" ;
                 schema:addressLocality "{{ city }}" .
}"""

        jinja_env = Environment(loader=BaseLoader())
        rendered = jinja_env.from_string(query_template).render(
            prefixes=SPARQL_PREFIXES,
            institution_type="A",
            city="Den Haag",
        )

        for fragment in (
            'hc:institutionType "A"',
            'schema:addressLocality "Den Haag"',
            "PREFIX hc:",
        ):
            assert fragment in rendered

    def test_prefixes_included(self):
        """The shared prefix block declares the hc, crm and schema namespaces."""
        required_declarations = (
            "PREFIX hc: <https://nde.nl/ontology/hc/>",
            "PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>",
            "PREFIX schema: <http://schema.org/>",
        )
        for declaration in required_declarations:
            assert declaration in SPARQL_PREFIXES
|
|
|
|
|
|
# =============================================================================
|
|
# PIPELINE ORDERING TESTS
|
|
# =============================================================================
|
|
|
|
class TestPipelineOrdering:
    """Tests to verify correct pipeline ordering.

    CRITICAL: The pipeline MUST follow this order:
    1. ConversationContextResolver (resolve follow-ups)
    2. FykeFilter (on RESOLVED question)
    3. TemplateClassifier
    4. SlotExtractor
    5. TemplateInstantiator
    """

    def test_follow_up_flow(self):
        """Test complete flow for a follow-up question.

        Scenario:
            Turn 1: "Welke archieven zijn er in Den Haag?"
            Turn 2: "En in Enschede?"

        Expected flow:
        1. ConversationContextResolver: "En in Enschede?" → "Welke archieven zijn er in Enschede?"
        2. FykeFilter: "Welke archieven zijn er in Enschede?" → PASS (contains "archieven")
        3. TemplateClassifier: → list_institutions_by_type_city
        4. SlotExtractor: → {institution_type: "A", city: "Enschede"}
        5. TemplateInstantiator: → SPARQL query

        Only steps 1, 2 and the slot inheritance of step 4 are checked here;
        steps 3 and 5 are documented but not executed.
        """
        # Step 1: simulate context resolution.
        raw_question = "En in Enschede?"
        previous_slots = {"institution_type": "A", "city": "Den Haag"}

        # The resolved question carries over the institution type.
        resolved = ResolvedQuestion(
            original=raw_question,
            resolved="Welke archieven zijn er in Enschede?",
            is_follow_up=True,
            follow_up_type="location_swap",
            inherited_slots={"institution_type": "A"},
            confidence=0.95,
        )

        # Step 2: Fyke should pass the RESOLVED question.
        heritage_keywords = ["archieven", "musea", "bibliotheken"]
        passes_fyke = any(kw in resolved.resolved.lower() for kw in heritage_keywords)
        assert passes_fyke is True

        # Step 4: final slots are the inherited ones plus the new city.
        expected_slots = {
            "institution_type": "A",  # Inherited
            "city": "Enschede",  # New
        }

        assert resolved.is_follow_up is True
        # The inherited value must be the one from the previous turn ...
        assert (
            resolved.inherited_slots["institution_type"]
            == previous_slots["institution_type"]
        )
        # ... and every inherited slot must appear unchanged in the final slots.
        assert resolved.inherited_slots.items() <= expected_slots.items()

    def test_count_follow_up_flow(self):
        """Test flow for count follow-up.

        Turn 1: "Welke musea zijn er in Amsterdam?" (returns list)
        Turn 2: "Hoeveel?" (count follow-up)

        Expected:
        1. Resolve: "Hoeveel?" → "Hoeveel musea zijn er in Amsterdam?"
        2. Fyke: PASS (resolved contains "musea")
        3. Template: count_institutions_by_type_location
        """
        raw = "Hoeveel?"
        previous_slots = {"institution_type": "M", "city": "Amsterdam"}

        # After resolution.
        resolved = ResolvedQuestion(
            original=raw,
            resolved="Hoeveel musea zijn er in Amsterdam?",
            is_follow_up=True,
            follow_up_type="count_from_list",
            inherited_slots={"institution_type": "M", "city": "Amsterdam"},
            confidence=0.9,
        )

        # Fyke should pass.
        heritage_keywords = ["musea"]
        passes = any(kw in resolved.resolved.lower() for kw in heritage_keywords)
        assert passes is True

        # A bare count follow-up inherits every slot of the previous turn.
        assert resolved.inherited_slots == previous_slots

        # Template should change to count variant.
        assert resolved.follow_up_type == "count_from_list"
|
|
|
|
|
|
# =============================================================================
|
|
# GOLDEN TEST CASES
|
|
# =============================================================================
|
|
|
|
class TestGoldenCases:
    """Golden cases that document expected pipeline behaviour."""

    # (question, expected template id, expected slots)
    _QUESTION_CASES = [
        (
            "Welke musea zijn er in Amsterdam?",
            "list_institutions_by_type_city",
            {"institution_type": "M", "city": "Amsterdam"},
        ),
        (
            "Welke archieven zijn er in Den Haag?",
            "list_institutions_by_type_city",
            {"institution_type": "A", "city": "Den Haag"},
        ),
        (
            "Hoeveel bibliotheken zijn er in Rotterdam?",
            "count_institutions_by_type_location",
            {"institution_type": "L", "city": "Rotterdam"},  # Changed from location
        ),
        (
            "What museums are in Amsterdam?",
            "list_institutions_by_type_city",
            {"institution_type": "M", "city": "Amsterdam"},
        ),
    ]

    # (raw follow-up, previous question, expected resolved question)
    _FOLLOW_UP_CASES = [
        (
            "En in Enschede?",
            "Welke archieven zijn er in Den Haag?",
            "Welke archieven zijn er in Enschede?",
        ),
        (
            "En de musea?",
            "Welke archieven zijn er in Amsterdam?",
            "Welke musea zijn er in Amsterdam?",
        ),
        (
            "Hoeveel?",
            "Welke bibliotheken zijn er in Utrecht?",
            "Hoeveel bibliotheken zijn er in Utrecht?",
        ),
    ]

    @pytest.mark.parametrize(
        "question,expected_template,expected_slots", _QUESTION_CASES
    )
    def test_golden_question_parsing(self, question, expected_template, expected_slots):
        """Document the template and slots each golden question should yield.

        Note: this is a structural check on the test data only. Full DSPy
        integration tests require a running LLM backend.
        """
        # The actual pipeline is not invoked here.
        assert expected_template is not None
        assert not expected_slots or "institution_type" in expected_slots

    @pytest.mark.parametrize(
        "raw_follow_up,previous_question,expected_resolved", _FOLLOW_UP_CASES
    )
    def test_golden_follow_up_resolution(self, raw_follow_up, previous_question, expected_resolved):
        """Document how each follow-up should resolve.

        These cases describe expected ConversationContextResolver behaviour;
        the resolver itself is not invoked here.
        """
        assert raw_follow_up != expected_resolved
        assert len(raw_follow_up) < len(expected_resolved)
|
|
|
|
|
|
# =============================================================================
|
|
# BUDGET CATEGORY TESTS
|
|
# =============================================================================
|
|
|
|
class TestBudgetCategoryResolution:
    """Tests for budget category synonym resolution.

    Multilingual budget/expense category terms must resolve to the canonical
    slot names used by financial queries.

    Example competency question:
    "Which Custodians spend more than 5000 euros on innovations in 2024?"
    """

    @staticmethod
    def _assert_all_resolve(terms, category):
        """Assert that every term resolves to ``category``."""
        resolver = SynonymResolver()
        for term in terms:
            assert resolver.resolve_budget_category(term) == category

    def test_resolve_budget_category_dutch_innovation(self):
        """Dutch innovation terms resolve to "innovation"."""
        self._assert_all_resolve(
            ("innovatie", "innovaties", "vernieuwing"), "innovation"
        )

    def test_resolve_budget_category_english_innovation(self):
        """English innovation terms resolve to "innovation"."""
        self._assert_all_resolve(
            ("innovation", "innovations", "r_and_d", "technology"), "innovation"
        )

    def test_resolve_budget_category_german_innovation(self):
        """German innovation terms resolve to "innovation"."""
        self._assert_all_resolve(("innovationen", "erneuerung"), "innovation")

    def test_resolve_budget_category_digitization(self):
        """Digitization terms in several languages resolve to "digitization"."""
        self._assert_all_resolve(
            (
                "digitalisering",  # Dutch
                "digitization",  # English (US)
                "digitisation",  # English (UK)
                "digitalisierung",  # German
            ),
            "digitization",
        )

    def test_resolve_budget_category_preservation(self):
        """Preservation/conservation terms resolve to "preservation"."""
        self._assert_all_resolve(
            (
                "conservering", "restauratie",  # Dutch
                "preservation", "conservation",  # English
                "konservierung",  # German
            ),
            "preservation",
        )

    def test_resolve_budget_category_personnel(self):
        """Personnel/staff terms resolve to "personnel"."""
        self._assert_all_resolve(
            (
                "personeel", "salarissen",  # Dutch
                "personnel", "staff", "salaries",  # English
                "personal",  # German
            ),
            "personnel",
        )

    def test_resolve_budget_category_acquisition(self):
        """Acquisition/collection development terms resolve to "acquisition"."""
        self._assert_all_resolve(
            (
                "aanwinsten", "aankopen",  # Dutch
                "acquisition", "acquisitions",  # English
                "erwerbungen",  # German
            ),
            "acquisition",
        )

    def test_resolve_budget_category_operating(self):
        """Operating/running cost terms resolve to "operating"."""
        self._assert_all_resolve(
            (
                "operationeel", "exploitatie",  # Dutch
                "operating", "operations",  # English
                "betriebskosten",  # German
            ),
            "operating",
        )

    def test_resolve_budget_category_capital(self):
        """Capital/investment terms resolve to "capital"."""
        self._assert_all_resolve(
            (
                "kapitaal", "investeringen",  # Dutch
                "capital", "capex",  # English
                "investitionen",  # German
            ),
            "capital",
        )

    def test_resolve_budget_category_case_insensitive(self):
        """Letter case of the input term does not affect the result."""
        resolver = SynonymResolver()
        expected = {
            "INNOVATIE": "innovation",
            "Digitalisering": "digitization",
            "PRESERVATION": "preservation",
        }
        for term, category in expected.items():
            assert resolver.resolve_budget_category(term) == category

    def test_resolve_budget_category_unknown_returns_none(self):
        """Unknown terms, including the empty string, resolve to None."""
        resolver = SynonymResolver()
        for unknown in ("tandpasta", "xyz123", ""):
            assert resolver.resolve_budget_category(unknown) is None
|
|
|
|
|
|
# =============================================================================
|
|
# BUDGET THRESHOLD TEMPLATE TESTS
|
|
# =============================================================================
|
|
|
|
class TestBudgetThresholdTemplate:
    """Tests for the find_custodians_by_budget_threshold template.

    This template answers competency questions like:
    "Which Custodians spend more than 5000 euros on innovations in 2024?"
    """

    @staticmethod
    def _load_template_config():
        """Return the parsed ``data/sparql_templates.yaml`` as a dict.

        Skips the calling test when the file does not exist, so a missing
        config is reported as a skip rather than as a silent pass. An empty
        YAML document yields an empty dict.
        """
        templates_path = PROJECT_ROOT / "data" / "sparql_templates.yaml"
        if not templates_path.exists():
            pytest.skip(f"Template config not found: {templates_path}")

        import yaml

        # Explicit encoding: the config contains non-English terms and must
        # not depend on the platform's default encoding.
        with open(templates_path, encoding="utf-8") as f:
            data = yaml.safe_load(f)
        return data or {}

    @pytest.mark.parametrize("question,expected_slots", [
        (
            "Welke instellingen geven meer dan 5000 euro uit aan innovatie?",
            {"budget_category": "innovation", "amount": 5000, "comparison": ">"},
        ),
        (
            "Which museums spend more than 10000 on digitization in 2024?",
            {"budget_category": "digitization", "amount": 10000, "institution_type": "M", "year": 2024},
        ),
        (
            "Welke archieven hebben een personeelsbudget van meer dan 100000 euro?",
            {"budget_category": "personnel", "amount": 100000, "institution_type": "A"},
        ),
    ])
    def test_budget_threshold_slot_extraction(self, question, expected_slots):
        """Test that budget threshold questions extract correct slots.

        Note: This documents expected behavior and checks only the test data.
        Full extraction requires the DSPy SlotExtractor component.
        """
        assert "budget_category" in expected_slots
        assert "amount" in expected_slots

    def test_budget_template_exists_in_config(self):
        """Verify the budget threshold template is defined."""
        data = self._load_template_config()

        # Templates are under the "templates" key.
        templates = data.get("templates", {})
        assert "find_custodians_by_budget_threshold" in templates
        template = templates["find_custodians_by_budget_threshold"]
        assert template.get("id") == "find_custodians_by_budget_threshold"

    def test_budget_category_slot_type_defined(self):
        """Verify budget_category slot type is defined in templates."""
        data = self._load_template_config()

        # Slot types are under "_slot_types" key (with underscore prefix).
        slot_types = data.get("_slot_types", {})
        assert "budget_category" in slot_types

        budget_category = slot_types["budget_category"]
        assert "synonyms" in budget_category
        assert "innovatie" in budget_category["synonyms"]
        assert budget_category["synonyms"]["innovatie"] == "innovation"
|
|
|
|
|
|
# =============================================================================
|
|
# PATTERN-BASED TEMPLATE MATCHING TESTS
|
|
# =============================================================================
|
|
|
|
class TestPatternBasedMatching:
    """Pattern-based template matching (the fast path tried before the LLM).

    These tests call ``TemplateClassifier._match_by_patterns()`` with the
    templates returned by ``_load_templates()`` and check which template id
    (if any) is selected for a given question. One test goes through
    ``forward()`` to confirm the pattern path is reported in the reasoning.
    """

    _BUDGET_TEMPLATE = "find_custodians_by_budget_threshold"

    @staticmethod
    def _match(question):
        """Build a fresh classifier, load its templates and pattern-match."""
        matcher = TemplateClassifier()
        loaded_templates = matcher._load_templates()
        return matcher._match_by_patterns(question, loaded_templates)

    def test_exact_budget_pattern_match(self):
        """A Dutch budget threshold question matches the budget template."""
        outcome = self._match(
            "Welke instellingen geven meer dan 5000 euro uit aan innovatie?"
        )

        assert outcome is not None, "Pattern match should succeed"
        assert outcome.matched is True
        assert outcome.template_id == self._BUDGET_TEMPLATE
        assert outcome.confidence >= 0.9

    def test_english_budget_pattern_match(self):
        """An English budget threshold question matches the budget template."""
        outcome = self._match(
            "Which custodians spend more than 10000 on digitization?"
        )

        assert outcome is not None
        assert outcome.template_id == self._BUDGET_TEMPLATE
        assert outcome.confidence >= 0.9

    def test_list_institutions_pattern_match(self):
        """A list-institutions question matches the type/city template."""
        outcome = self._match("Welke archieven zijn er in Amsterdam?")

        assert outcome is not None
        assert outcome.template_id == "list_institutions_by_type_city"
        assert outcome.confidence >= 0.9

    def test_pattern_match_case_insensitive(self):
        """An all-uppercase question still matches the budget template."""
        # Uppercase version of a pattern
        outcome = self._match(
            "WELKE INSTELLINGEN GEVEN MEER DAN 5000 EURO UIT AAN INNOVATIE?"
        )

        assert outcome is not None
        assert outcome.template_id == self._BUDGET_TEMPLATE

    def test_pattern_match_returns_none_for_unknown(self):
        """A question unrelated to heritage data yields no pattern match."""
        # "What time does the train to Utrecht leave?" - deliberately
        # unrelated so it should not hit any heritage pattern.
        outcome = self._match("Hoe laat vertrekt de trein naar Utrecht?")

        assert outcome is None, "Unrelated question should not match any pattern"

    def test_forward_uses_pattern_match_before_llm(self):
        """forward() reports a pattern match for an exactly matching question."""
        # The reasoning text is used as the signal that the pattern path,
        # not the LLM, produced the result.
        outcome = TemplateClassifier().forward(
            "Welke instellingen geven meer dan 5000 euro uit aan innovatie?"
        )

        assert outcome.matched is True
        assert outcome.template_id == self._BUDGET_TEMPLATE
        assert "Pattern match" in outcome.reasoning  # Indicates pattern was used, not LLM
|
|
|
|
|
|
# =============================================================================
|
|
# INTEGRATION SMOKE TEST
|
|
# =============================================================================
|
|
|
|
class TestIntegrationSmoke:
    """Smoke tests for integration (require the project data files).

    Each test is skipped when its data file is absent, so a missing file is
    reported as a skip rather than as a vacuous pass.
    """

    def test_templates_file_exists(self):
        """Verify templates YAML exists and has the expected top-level keys."""
        templates_path = PROJECT_ROOT / "data" / "sparql_templates.yaml"

        # May not exist in CI
        if not templates_path.exists():
            pytest.skip(f"Templates file not found: {templates_path}")

        import yaml

        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        with open(templates_path, encoding="utf-8") as f:
            data = yaml.safe_load(f)

        assert "templates" in data
        assert len(data["templates"]) >= 10  # We defined 10 templates
        assert "fyke_filter" in data
        assert "follow_up_patterns" in data

    def test_validation_rules_file_exists(self):
        """Verify validation rules JSON exists and has the mapping sections."""
        validation_path = PROJECT_ROOT / "data" / "validation" / "sparql_validation_rules.json"

        if not validation_path.exists():
            pytest.skip(f"Validation rules file not found: {validation_path}")

        # JSON text is UTF-8; do not rely on the locale default.
        with open(validation_path, encoding="utf-8") as f:
            data = json.load(f)

        assert "institution_type_mappings" in data
        assert "subregion_mappings" in data
|
|
|
|
|
|
# =============================================================================
|
|
# REGION VARIANT TESTS
|
|
# =============================================================================
|
|
|
|
class TestRegionVariantSelection:
    """Region-based queries must render with the matching template variant.

    The tests check that rendering with ``variant="region"`` produces SPARQL
    that filters on the GHCID prefix, while rendering with ``variant=None``
    produces SPARQL that filters on addressLocality.
    """

    @staticmethod
    def _render(template_id, slots, variant):
        """Render *template_id* with a fresh TemplateInstantiator."""
        return TemplateInstantiator().render(
            template_id=template_id,
            slots=slots,
            variant=variant,
        )

    def test_is_region_recognizes_dutch_provinces(self):
        """is_region accepts Dutch provinces and rejects city names."""
        resolver = get_synonym_resolver()

        # Province names (any casing) and the ISO code format count as regions.
        for region_name in ("Noord-Holland", "zuid-holland", "Limburg", "Gelderland", "NL-NH"):
            assert resolver.is_region(region_name) is True

        # Cities must not be mistaken for regions.
        for city_name in ("Amsterdam", "Rotterdam", "Den Haag"):
            assert resolver.is_region(city_name) is False

    def test_template_instantiator_region_variant(self):
        """The count template's region variant filters on the GHCID prefix."""
        query = self._render(
            "count_institutions_by_type_location",
            {"institution_type": "M", "location": "NL-NH"},
            "region",
        )

        assert query is not None
        # GHCID prefix filtering is expected for regions ...
        assert "hc:ghcid" in query
        assert 'FILTER(STRSTARTS(?ghcid, "NL-NH"))' in query
        # ... and addressLocality must be absent.
        assert "addressLocality" not in query

    def test_template_instantiator_city_default(self):
        """The count template's default (city) variant uses addressLocality."""
        query = self._render(
            "count_institutions_by_type_location",
            {"institution_type": "M", "location": "Amsterdam"},
            None,  # no variant: falls back to the city template
        )

        assert query is not None
        # addressLocality with the quoted city name is expected ...
        assert "addressLocality" in query
        assert '"Amsterdam"' in query
        # ... and GHCID filtering must be absent.
        assert "hc:ghcid" not in query

    def test_find_institutions_by_founding_date_region_variant(self):
        """The founding date region variant filters on the GHCID prefix."""
        # Corresponds to a question like "Oudste musea in Limburg".
        query = self._render(
            "find_institutions_by_founding_date",
            {"institution_type": "M", "location": "NL-LI", "order": "ASC"},
            "region",
        )

        assert query is not None
        # GHCID prefix filtering for the region
        assert "hc:ghcid" in query
        assert 'FILTER(STRSTARTS(?ghcid, "NL-LI"))' in query
        # Founding date and ordering must be part of the query
        assert "schema:foundingDate" in query
        assert "ORDER BY" in query
        # The region code must not be used as an addressLocality value
        assert 'addressLocality "NL-LI"' not in query

    def test_find_institutions_by_founding_date_city_default(self):
        """The founding date default (city) variant uses addressLocality."""
        # Corresponds to a question like "Oudste musea in Amsterdam".
        query = self._render(
            "find_institutions_by_founding_date",
            {"institution_type": "M", "location": "Amsterdam", "order": "ASC"},
            None,
        )

        assert query is not None
        # addressLocality with the quoted city name is expected
        assert "addressLocality" in query
        assert '"Amsterdam"' in query
        # No GHCID prefix filter for the location
        assert 'FILTER(STRSTARTS(?ghcid' not in query

    def test_compare_locations_region_variant(self):
        """The comparison region variant filters on GHCID prefixes."""
        # Corresponds to "Compare Noord-Holland and Zuid-Holland".
        query = self._render(
            "compare_locations",
            {"location1": "NL-NH", "location2": "NL-ZH", "institution_type": "M"},
            "region",
        )

        assert query is not None
        # GHCID prefix filtering for regions
        assert "hc:ghcid" in query
        assert "STRSTARTS" in query
        # Both region codes must appear quoted in the query
        assert '"NL-NH"' in query
        assert '"NL-ZH"' in query
        # addressLocality must be absent
        assert "addressLocality" not in query

    def test_compare_locations_city_default(self):
        """The comparison default (city) variant uses addressLocality."""
        # Corresponds to "Compare Amsterdam and Rotterdam".
        query = self._render(
            "compare_locations",
            {"location1": "Amsterdam", "location2": "Rotterdam"},
            None,
        )

        assert query is not None
        # addressLocality is expected for cities
        assert "addressLocality" in query
        # Both city names must appear quoted in the query
        assert '"Amsterdam"' in query
        assert '"Rotterdam"' in query
|
|
|
|
|
|
if __name__ == "__main__":
    # pytest.main() returns the exit code instead of exiting; propagate it so
    # running this file as a script reports failure to the calling shell.
    raise SystemExit(pytest.main([__file__, "-v"]))
|