glam/backend/rag/test_semantic_routing.py
kempersc 99dc608826 Refactor RAG to template-based SPARQL generation
Major architectural changes based on Formica et al. (2023) research:
- Add TemplateClassifier for deterministic SPARQL template matching
- Add SlotExtractor with synonym resolution for slot values
- Add TemplateInstantiator using Jinja2 for query rendering
- Refactor dspy_heritage_rag.py to use template system
- Update main.py with streamlined pipeline
- Fix semantic_router.py ordering issues
- Add comprehensive metrics tracking

Template-based approach achieves 65% precision vs 10% LLM-only
per Formica et al. research on SPARQL generation.
2026-01-07 22:04:43 +01:00

500 lines
21 KiB
Python

"""
Tests for Semantic Routing (Signal-Decision Pattern)
Tests the SemanticSignalExtractor and SemanticDecisionRouter classes
which enable fast LLM-free query routing for high-confidence queries.
"""
import pytest
from .semantic_router import (
QuerySignals,
RouteConfig,
SemanticSignalExtractor,
SemanticDecisionRouter,
get_signal_extractor,
get_decision_router,
)
class TestSemanticSignalExtractor:
    """Checks for SemanticSignalExtractor.extract_signals.

    Grouped into entity-type detection, intent classification, entity
    extraction, language detection, and confidence scoring.
    """

    @pytest.fixture
    def extractor(self):
        """Build the SemanticSignalExtractor under test."""
        return SemanticSignalExtractor()

    # ===== Entity Type Detection =====

    def test_detect_person_query(self, extractor):
        """A query with only person indicators is typed as 'person'."""
        # Clear person indicator, no institution indicator present.
        query = "Wie werkt daar als medewerker?"
        result = extractor.extract_signals(query)
        assert result.entity_type == "person"

    def test_detect_person_query_with_institution_is_mixed(self, extractor):
        """A person query that also names an institution is typed as 'mixed'."""
        query = "Wie is de archivaris bij het Noord-Hollands Archief?"
        result = extractor.extract_signals(query)
        # "archief" counts as an institution indicator, hence mixed.
        assert result.entity_type == "mixed"

    def test_detect_person_query_with_organisatie_is_mixed(self, extractor):
        """A person query containing 'organisatie' is typed as 'mixed'."""
        query = "Wie is de directeur van deze organisatie?"
        result = extractor.extract_signals(query)
        # "organisatie" counts as an institution indicator.
        assert result.entity_type == "mixed"

    def test_detect_institution_query(self, extractor):
        """Institution indicators alone give entity type 'institution'."""
        query = "Welke musea zijn er in Amsterdam?"
        result = extractor.extract_signals(query)
        assert result.entity_type == "institution"

    def test_detect_mixed_query(self, extractor):
        """Person and institution indicators together give 'mixed'."""
        query = "Welke curatoren werken bij musea in Utrecht?"
        result = extractor.extract_signals(query)
        assert result.entity_type == "mixed"

    def test_default_to_institution(self, extractor):
        """Without a clear indicator the entity type falls back to 'institution'."""
        query = "Vertel me over cultureel erfgoed"
        result = extractor.extract_signals(query)
        assert result.entity_type == "institution"

    # ===== Intent Classification =====

    def test_statistical_intent(self, extractor):
        """Aggregation wording gives statistical intent and flags aggregation."""
        query = "Hoeveel archieven zijn er in Nederland?"
        result = extractor.extract_signals(query)
        assert result.intent == "statistical"
        assert result.requires_aggregation is True

    def test_temporal_intent(self, extractor):
        """Temporal wording gives temporal intent and a temporal constraint."""
        query = "Wanneer is het Rijksmuseum opgericht?"
        result = extractor.extract_signals(query)
        assert result.intent == "temporal"
        assert result.has_temporal_constraint is True

    def test_temporal_intent_with_oldest(self, extractor):
        """An 'oldest' style question is classified as temporal."""
        query = "Wat is het oudste museum in Nederland?"
        result = extractor.extract_signals(query)
        assert result.intent == "temporal"
        assert result.has_temporal_constraint is True

    def test_geographic_intent(self, extractor):
        """Geographic wording gives geographic intent and constraint."""
        # "waar" (where) is a geographic indicator.
        query = "Waar staat dit museum?"
        result = extractor.extract_signals(query)
        assert result.intent == "geographic"
        assert result.has_geographic_constraint is True

    def test_geographic_intent_with_location(self, extractor):
        """Mentioning a location sets the geographic constraint flag."""
        query = "Vertel me over musea in Amsterdam"
        result = extractor.extract_signals(query)
        assert result.has_geographic_constraint is True

    def test_temporal_indicator_substring_fixed(self, extractor):
        """Regression: 'na' inside 'nationaal' must not count as temporal.

        Indicators are expected to match on word boundaries, so the query
        below is geographic and carries no temporal constraint.
        """
        query = "In welke stad ligt het Nationaal Archief?"
        result = extractor.extract_signals(query)
        # The leading "In" is a whole-word match for the geographic indicator;
        # there is no whole-word "na" in the query.
        assert result.intent == "geographic"
        assert result.has_temporal_constraint is False

    def test_entity_lookup_intent(self, extractor):
        """A 'what is X' question is classified as entity lookup."""
        query = "Wat is het Rijksmuseum?"
        result = extractor.extract_signals(query)
        assert result.intent == "entity_lookup"

    def test_comparative_intent(self, extractor):
        """A comparison request is classified as comparative."""
        query = "Vergelijk het Rijksmuseum met het Van Gogh Museum"
        result = extractor.extract_signals(query)
        assert result.intent == "comparative"

    def test_exploration_default_intent(self, extractor):
        """Open questions without clear indicators fall back to exploration."""
        # The query deliberately carries no geographic, temporal, or
        # aggregation indicator.
        query = "Welke schilderijen vallen op?"
        result = extractor.extract_signals(query)
        assert result.intent == "exploration"

    def test_geographic_indicator_substring_fixed(self, extractor):
        """Regression: 'in' inside 'interessant' must not count as geographic.

        With word-boundary matching the query stays an exploration query
        without a geographic constraint.
        """
        query = "Welke schilderijen zijn interessant?"
        result = extractor.extract_signals(query)
        assert result.intent == "exploration"
        assert result.has_geographic_constraint is False

    def test_word_boundary_in_works_correctly(self, extractor):
        """A standalone 'in' still triggers geographic intent."""
        query = "Welke musea zijn er in Amsterdam?"
        result = extractor.extract_signals(query)
        assert result.intent == "geographic"
        assert result.has_geographic_constraint is True

    def test_word_boundary_na_works_correctly(self, extractor):
        """A standalone 'na' still triggers temporal intent."""
        # Dutch "na de fusie" = "after the merger".
        query = "Wat gebeurde er na de fusie met het archief?"
        result = extractor.extract_signals(query)
        assert result.intent == "temporal"
        assert result.has_temporal_constraint is True

    # ===== Entity Extraction =====

    def test_extract_institution_mention(self, extractor):
        """At least one institution mention is extracted from the query."""
        query = "Vertel me over het Noord-Hollands Archief"
        result = extractor.extract_signals(query)
        # Only the count is checked; the expected mention is
        # "Noord-Hollands Archief" or something similar.
        assert len(result.institution_mentions) >= 1

    def test_extract_location_mention(self, extractor):
        """A known Dutch location is extracted and flags a geographic constraint."""
        query = "Welke musea zijn er in Amsterdam?"
        result = extractor.extract_signals(query)
        assert "Amsterdam" in result.location_mentions
        assert result.has_geographic_constraint is True

    def test_extract_multiple_locations(self, extractor):
        """Every location named in the query is extracted."""
        query = "Archieven in Utrecht en Haarlem"
        result = extractor.extract_signals(query)
        assert "Utrecht" in result.location_mentions
        assert "Haarlem" in result.location_mentions

    # ===== Language Detection =====

    def test_detect_dutch_language(self, extractor):
        """A Dutch query is reported with language code 'nl'."""
        query = "Hoeveel musea zijn er in Nederland?"
        result = extractor.extract_signals(query)
        assert result.language == "nl"

    def test_detect_english_language(self, extractor):
        """An English query is reported with language code 'en'."""
        query = "How many museums are there in Amsterdam?"
        result = extractor.extract_signals(query)
        assert result.language == "en"

    # ===== Confidence Scoring =====

    def test_high_confidence_clear_query(self, extractor):
        """An unambiguous query scores a confidence of at least 0.8."""
        query = "Hoeveel archieven zijn er in Noord-Holland?"
        result = extractor.extract_signals(query)
        assert result.confidence >= 0.8

    def test_moderate_confidence_ambiguous_query(self, extractor):
        """An ambiguous query scores a confidence below 0.9."""
        query = "erfgoed informatie"
        result = extractor.extract_signals(query)
        assert result.confidence < 0.9

    def test_confidence_capped_at_095(self, extractor):
        """Confidence never exceeds 0.95."""
        query = "Hoeveel musea zijn er in Amsterdam?"
        result = extractor.extract_signals(query)
        assert result.confidence <= 0.95
class TestSemanticDecisionRouter:
    """Checks that SemanticDecisionRouter.route picks the expected backend."""

    @pytest.fixture
    def router(self):
        """Build the SemanticDecisionRouter under test."""
        return SemanticDecisionRouter()

    def test_person_query_routes_to_qdrant_persons(self, router):
        """Person signals are routed to the heritage_persons Qdrant collection."""
        person_signals = QuerySignals(
            entity_type="person",
            intent="entity_lookup",
            institution_mentions=["Noord-Hollands Archief"],
        )
        route = router.route(person_signals)
        assert route.primary_backend == "qdrant"
        assert route.qdrant_collection == "heritage_persons"

    def test_person_query_with_institution_filter(self, router):
        """Person signals naming an institution get a custodian_slug filter."""
        person_signals = QuerySignals(
            entity_type="person",
            intent="entity_lookup",
            institution_mentions=["Noord-Hollands Archief"],
        )
        filters = router.route(person_signals).qdrant_filters
        assert "custodian_slug" in filters
        assert "noord-hollands-archief" in filters["custodian_slug"]

    def test_statistical_query_routes_to_sparql(self, router):
        """Statistical signals are routed to SPARQL for aggregation.

        NOTE: DuckLake was removed from RAG (offline analytics only), so
        statistical queries rely on SPARQL aggregations
        (COUNT, SUM, AVG, GROUP BY).
        """
        stat_signals = QuerySignals(
            entity_type="institution",
            intent="statistical",
            requires_aggregation=True,
        )
        route = router.route(stat_signals)
        assert route.primary_backend == "sparql"

    def test_temporal_query_uses_temporal_templates(self, router):
        """Temporal signals go to SPARQL with temporal templates enabled."""
        temporal_signals = QuerySignals(
            entity_type="institution",
            intent="temporal",
            has_temporal_constraint=True,
        )
        route = router.route(temporal_signals)
        assert route.primary_backend == "sparql"
        assert route.use_temporal_templates is True

    def test_geographic_query_routes_to_sparql(self, router):
        """Geographic signals are routed to SPARQL."""
        geo_signals = QuerySignals(
            entity_type="institution",
            intent="geographic",
            has_geographic_constraint=True,
            location_mentions=["Amsterdam"],
        )
        route = router.route(geo_signals)
        assert route.primary_backend == "sparql"

    def test_default_hybrid_routing(self, router):
        """Exploration signals use Qdrant first with SPARQL as secondary."""
        default_signals = QuerySignals(
            entity_type="institution",
            intent="exploration",
        )
        route = router.route(default_signals)
        assert route.primary_backend == "qdrant"
        assert route.secondary_backend == "sparql"
class TestSlugGeneration:
    """Checks for SemanticDecisionRouter._to_slug on institution names."""

    @pytest.fixture
    def router(self):
        """Build the SemanticDecisionRouter whose slug helper is tested."""
        return SemanticDecisionRouter()

    def test_simple_slug(self, router):
        """A single-word name becomes its lowercase form."""
        assert router._to_slug("Rijksmuseum") == "rijksmuseum"

    def test_slug_with_spaces(self, router):
        """Spaces turn into hyphens; existing hyphens are kept."""
        assert router._to_slug("Noord-Hollands Archief") == "noord-hollands-archief"

    def test_slug_with_article(self, router):
        """A leading Dutch article stays part of the slug."""
        assert router._to_slug("Het Utrechts Archief") == "het-utrechts-archief"

    def test_slug_with_diacritics(self, router):
        """Diacritics are stripped; the expected slug also has no apostrophe."""
        assert router._to_slug("Musée d'Orsay") == "musee-dorsay"
class TestSingletonInstances:
    """Checks that the module-level accessors hand out one shared instance."""

    def test_signal_extractor_singleton(self):
        """Two get_signal_extractor() calls return the identical object."""
        first = get_signal_extractor()
        second = get_signal_extractor()
        assert first is second

    def test_decision_router_singleton(self):
        """Two get_decision_router() calls return the identical object."""
        first = get_decision_router()
        second = get_decision_router()
        assert first is second
class TestIntegration:
    """End-to-end checks: extract signals from a query, then route them.

    Every test uses the shared instances returned by get_signal_extractor()
    and get_decision_router().
    """

    def test_full_person_query_flow(self):
        """A person query that names an institution flows through as 'mixed'."""
        extractor = get_signal_extractor()
        router = get_decision_router()
        # Clear person indicator plus an institution mention.
        signals = extractor.extract_signals(
            "Wie is de archivaris bij het Noord-Hollands Archief?"
        )
        config = router.route(signals)
        # Both person and institution indicators are present.
        assert signals.entity_type == "mixed"
        # Mixed queries are expected to take the default (qdrant hybrid)
        # route; the assertion accepts either backend.
        assert config.primary_backend in ["qdrant", "sparql"]

    def test_full_pure_person_query_flow(self):
        """A person query without an institution reaches heritage_persons."""
        extractor = get_signal_extractor()
        router = get_decision_router()
        signals = extractor.extract_signals("Wie werkt daar als medewerker?")
        config = router.route(signals)
        assert signals.entity_type == "person"
        assert config.primary_backend == "qdrant"
        assert config.qdrant_collection == "heritage_persons"

    def test_full_statistical_query_flow(self):
        """A statistical query is detected and routed to SPARQL.

        NOTE: DuckLake was removed from RAG, so statistical queries use
        SPARQL aggregations.
        """
        extractor = get_signal_extractor()
        router = get_decision_router()
        signals = extractor.extract_signals(
            "Hoeveel musea zijn er per provincie in Nederland?"
        )
        config = router.route(signals)
        assert signals.intent == "statistical"
        assert signals.requires_aggregation is True
        assert config.primary_backend == "sparql"

    def test_full_temporal_query_flow(self):
        """A temporal query is detected and routed with temporal templates."""
        extractor = get_signal_extractor()
        router = get_decision_router()
        signals = extractor.extract_signals(
            "Wat is het oudste archief in Noord-Holland?"
        )
        config = router.route(signals)
        assert signals.intent == "temporal"
        assert signals.has_temporal_constraint is True
        assert config.use_temporal_templates is True

    # Parametrized so each query is reported as its own test case; a single
    # loop would stop at the first failing query and hide the others.
    @pytest.mark.parametrize(
        "query",
        [
            "Hoeveel archieven zijn er in Nederland?",  # clear aggregation
            "Wanneer is het Nationaal Archief opgericht?",  # clear temporal
            "Welke musea zijn er in Amsterdam?",  # clear geographic + institution
        ],
    )
    def test_high_confidence_skip_llm_threshold(self, query):
        """Clear, unambiguous queries reach the 0.8 confidence threshold."""
        extractor = get_signal_extractor()
        signals = extractor.extract_signals(query)
        assert signals.confidence >= 0.8, (
            f"Query '{query}' has confidence {signals.confidence}, expected >= 0.8"
        )

    def test_moderate_confidence_for_mixed_queries(self):
        """A mixed (person + institution) query scores below 0.9 confidence."""
        extractor = get_signal_extractor()
        # Mixed queries are more ambiguous than single-entity ones.
        signals = extractor.extract_signals("Wie is de directeur van het Rijksmuseum?")
        assert signals.entity_type == "mixed"
        assert signals.confidence < 0.9  # Not as high as clear queries
class TestYearPatternDetection:
    """Checks for year-based temporal detection.

    A year mention (1000-2029) is expected to produce temporal intent, even
    when the query also contains a geographic indicator such as 'in'.
    """

    @pytest.fixture
    def extractor(self):
        """Build the SemanticSignalExtractor under test."""
        return SemanticSignalExtractor()

    def test_year_triggers_temporal_intent(self, extractor):
        """A query mentioning a year is classified as temporal."""
        query = "Wat was de status van het Rijksmuseum in 1990?"
        result = extractor.extract_signals(query)
        # The year 1990 wins over the geographic indicator "in".
        assert result.intent == "temporal"
        assert result.has_temporal_constraint is True

    def test_year_1850_triggers_temporal(self, extractor):
        """A historical year is classified as temporal."""
        query = "Welke musea bestonden in 1850?"
        result = extractor.extract_signals(query)
        assert result.intent == "temporal"
        assert result.has_temporal_constraint is True

    def test_year_2020_with_aggregation_is_statistical(self, extractor):
        """Aggregation plus a year stays statistical but keeps the temporal flag.

        'Hoeveel' (how many) drives the statistical intent, while the year
        2020 is still recorded as a temporal constraint.
        """
        query = "Hoeveel archieven waren er in 2020?"
        result = extractor.extract_signals(query)
        assert result.intent == "statistical"
        assert result.requires_aggregation is True
        # The year is detected even though the intent is statistical.
        assert result.has_temporal_constraint is True

    def test_year_2020_pure_temporal(self, extractor):
        """A recent year without aggregation wording is temporal."""
        query = "Welke archieven bestonden in 2020?"
        result = extractor.extract_signals(query)
        assert result.intent == "temporal"
        assert result.has_temporal_constraint is True

    def test_geographic_without_year_stays_geographic(self, extractor):
        """Without a year, a location query remains geographic."""
        query = "Welke musea zijn er in Amsterdam?"
        result = extractor.extract_signals(query)
        assert result.intent == "geographic"
        assert result.has_temporal_constraint is False

    def test_year_overrides_geographic_in(self, extractor):
        """A year makes the query temporal even next to 'in <location>'."""
        query = "Welke musea waren er in Amsterdam in 1900?"
        result = extractor.extract_signals(query)
        # The year 1900 takes precedence over "in Amsterdam" for the intent...
        assert result.intent == "temporal"
        assert result.has_temporal_constraint is True
        # ...while the geographic constraint is still recorded.
        assert result.has_geographic_constraint is True

    def test_year_in_english_query(self, extractor):
        """Year detection also applies to English queries."""
        query = "What museums existed in 1920?"
        result = extractor.extract_signals(query)
        assert result.intent == "temporal"
        assert result.has_temporal_constraint is True

    def test_year_range_boundary_1000(self, extractor):
        """The lower boundary year 1000 is detected."""
        query = "Bestond dit klooster al in 1000?"
        result = extractor.extract_signals(query)
        assert result.has_temporal_constraint is True

    def test_year_range_boundary_2029(self, extractor):
        """The upper boundary year 2029 is detected (future planning)."""
        query = "Wat zijn de plannen voor 2029?"
        result = extractor.extract_signals(query)
        assert result.has_temporal_constraint is True

    def test_non_year_number_ignored(self, extractor):
        """A number outside the year range does not make the query temporal."""
        query = "Hoeveel van de 500 musea hebben een website?"
        result = extractor.extract_signals(query)
        # 500 lies outside 1000-2029, so this is a statistical query.
        # Only the intent is asserted; has_temporal_constraint is not checked.
        assert result.intent == "statistical"

    def test_year_combined_with_temporal_keyword(self, extractor):
        """A year plus a temporal keyword is temporal with high confidence."""
        query = "Wanneer in 1945 werd het museum gesloten?"
        result = extractor.extract_signals(query)
        assert result.intent == "temporal"
        assert result.has_temporal_constraint is True
        # The combined signals are expected to reach the 0.8 threshold.
        assert result.confidence >= 0.8
# Run with: pytest backend/rag/test_semantic_routing.py -v