- Fix scope_note → finding_aid_scope_note in FindingAid.yaml - Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead) - Remove duplicate rico_record_set_type from class_metadata_slots.yaml - Fix range types for equals_string compatibility (uriorcurie → string) - Move class names from close_mappings to see_also in 10 RecordSetTypes files - Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context - Sync schemas to frontend/public/schemas/ Files: 1,151 changed (includes prior CustodianType migration)
493 lines
21 KiB
Python
493 lines
21 KiB
Python
"""
|
|
Tests for Semantic Routing (Signal-Decision Pattern)
|
|
|
|
Tests the SemanticSignalExtractor and SemanticDecisionRouter classes
|
|
which enable fast LLM-free query routing for high-confidence queries.
|
|
"""
|
|
|
|
import pytest
|
|
from .semantic_router import (
|
|
QuerySignals,
|
|
RouteConfig,
|
|
SemanticSignalExtractor,
|
|
SemanticDecisionRouter,
|
|
get_signal_extractor,
|
|
get_decision_router,
|
|
)
|
|
|
|
|
|
class TestSemanticSignalExtractor:
    """Checks on the signals that SemanticSignalExtractor derives from a query.

    Every test passes one raw query string to ``extract_signals`` and
    inspects one or more attributes of the returned signals object.
    """

    @pytest.fixture
    def extractor(self):
        """Provide a newly constructed extractor to each test."""
        return SemanticSignalExtractor()

    # ----- Entity type detection -----

    def test_detect_person_query(self, extractor):
        """A person indicator on its own yields the "person" entity type."""
        # The query carries a person indicator and no institution indicator.
        query = "Wie werkt daar als medewerker?"
        extracted = extractor.extract_signals(query)
        assert extracted.entity_type == "person"

    def test_detect_person_query_with_institution_is_mixed(self, extractor):
        """A person question that names an institution is typed "mixed"."""
        query = "Wie is de archivaris bij het Noord-Hollands Archief?"
        extracted = extractor.extract_signals(query)
        # "archief" counts as an institution indicator, hence mixed.
        assert extracted.entity_type == "mixed"

    def test_detect_person_query_with_organisatie_is_mixed(self, extractor):
        """A person question containing 'organisatie' is typed "mixed"."""
        query = "Wie is de directeur van deze organisatie?"
        extracted = extractor.extract_signals(query)
        # "organisatie" counts as an institution indicator.
        assert extracted.entity_type == "mixed"

    def test_detect_institution_query(self, extractor):
        """An institution indicator yields the "institution" entity type."""
        query = "Welke musea zijn er in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert extracted.entity_type == "institution"

    def test_detect_mixed_query(self, extractor):
        """Person and institution indicators together yield "mixed"."""
        query = "Welke curatoren werken bij musea in Utrecht?"
        extracted = extractor.extract_signals(query)
        assert extracted.entity_type == "mixed"

    def test_default_to_institution(self, extractor):
        """An ambiguous query falls back to the "institution" entity type."""
        query = "Vertel me over cultureel erfgoed"
        extracted = extractor.extract_signals(query)
        assert extracted.entity_type == "institution"

    # ----- Intent classification -----

    def test_statistical_intent(self, extractor):
        """An aggregation indicator gives statistical intent."""
        query = "Hoeveel archieven zijn er in Nederland?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "statistical"
        assert extracted.requires_aggregation is True

    def test_temporal_intent(self, extractor):
        """A temporal indicator gives temporal intent."""
        query = "Wanneer is het Rijksmuseum opgericht?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_temporal_intent_with_oldest(self, extractor):
        """Oldest/newest style questions are classified as temporal."""
        query = "Wat is het oudste museum in Nederland?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_geographic_intent(self, extractor):
        """A geographic indicator gives geographic intent."""
        # "waar" (where) is a geographic indicator.
        query = "Waar staat dit museum?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "geographic"
        assert extracted.has_geographic_constraint is True

    def test_geographic_intent_with_location(self, extractor):
        """Mentioning a location sets the geographic constraint flag."""
        query = "Vertel me over musea in Amsterdam"
        extracted = extractor.extract_signals(query)
        assert extracted.has_geographic_constraint is True

    def test_temporal_indicator_substring_fixed(self, extractor):
        """Regression check: 'na' inside a longer word is not a temporal hit.

        'nationaal' contains the letters 'na', yet it must not be treated as
        the temporal indicator "na" because matching uses word boundaries.
        """
        query = "In welke stad ligt het Nationaal Archief?"
        extracted = extractor.extract_signals(query)
        # No word-boundary match for "na", so the query is not temporal.
        # The leading "In" is a word-boundary match for a geographic indicator.
        assert extracted.intent == "geographic"
        assert extracted.has_temporal_constraint is False

    def test_entity_lookup_intent(self, extractor):
        """An entity lookup question is classified as entity_lookup."""
        query = "Wat is het Rijksmuseum?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "entity_lookup"

    def test_comparative_intent(self, extractor):
        """A comparison request is classified as comparative."""
        query = "Vergelijk het Rijksmuseum met het Van Gogh Museum"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "comparative"

    def test_exploration_default_intent(self, extractor):
        """An open question with no clear indicator falls back to exploration."""
        # The query has no geographic, temporal or aggregation indicator;
        # in particular it avoids the geographic indicator "in".
        query = "Welke schilderijen vallen op?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "exploration"

    def test_geographic_indicator_substring_fixed(self, extractor):
        """Regression check: 'in' inside a longer word is not a geographic hit.

        'interessant' contains the letters 'in', yet it must not be treated
        as the geographic indicator "in" because matching uses word boundaries.
        """
        query = "Welke schilderijen zijn interessant?"
        extracted = extractor.extract_signals(query)
        # Expected outcome is exploration rather than geographic.
        assert extracted.intent == "exploration"
        assert extracted.has_geographic_constraint is False

    def test_word_boundary_in_works_correctly(self, extractor):
        """A standalone 'in' still counts as a geographic indicator."""
        query = "Welke musea zijn er in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "geographic"
        assert extracted.has_geographic_constraint is True

    def test_word_boundary_na_works_correctly(self, extractor):
        """A standalone 'na' still counts as a temporal indicator."""
        # Dutch "na de fusie" means "after the merger".
        query = "Wat gebeurde er na de fusie met het archief?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    # ----- Entity extraction -----

    def test_extract_institution_mention(self, extractor):
        """At least one institution mention is pulled out of the query."""
        query = "Vertel me over het Noord-Hollands Archief"
        extracted = extractor.extract_signals(query)
        # Expected to contain "Noord-Hollands Archief" or similar; only the
        # presence of a mention is asserted.
        assert len(extracted.institution_mentions) >= 1

    def test_extract_location_mention(self, extractor):
        """A known Dutch location is reported among the location mentions."""
        query = "Welke musea zijn er in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert "Amsterdam" in extracted.location_mentions
        assert extracted.has_geographic_constraint is True

    def test_extract_multiple_locations(self, extractor):
        """Several locations in one query are all reported."""
        query = "Archieven in Utrecht en Haarlem"
        extracted = extractor.extract_signals(query)
        assert "Utrecht" in extracted.location_mentions
        assert "Haarlem" in extracted.location_mentions

    # ----- Language detection -----

    def test_detect_dutch_language(self, extractor):
        """A Dutch query is tagged with language "nl"."""
        query = "Hoeveel musea zijn er in Nederland?"
        extracted = extractor.extract_signals(query)
        assert extracted.language == "nl"

    def test_detect_english_language(self, extractor):
        """An English query is tagged with language "en"."""
        query = "How many museums are there in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert extracted.language == "en"

    # ----- Confidence scoring -----

    def test_high_confidence_clear_query(self, extractor):
        """A clear query scores a confidence of at least 0.8."""
        query = "Hoeveel archieven zijn er in Noord-Holland?"
        extracted = extractor.extract_signals(query)
        assert extracted.confidence >= 0.8

    def test_moderate_confidence_ambiguous_query(self, extractor):
        """An ambiguous query stays below a confidence of 0.9."""
        query = "erfgoed informatie"
        extracted = extractor.extract_signals(query)
        assert extracted.confidence < 0.9

    def test_confidence_capped_at_095(self, extractor):
        """Confidence never goes above 0.95."""
        query = "Hoeveel musea zijn er in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert extracted.confidence <= 0.95
|
|
|
|
|
|
class TestSemanticDecisionRouter:
    """Checks on the route configuration chosen for hand-built signals.

    Each test constructs a ``QuerySignals`` value directly, passes it to
    ``SemanticDecisionRouter.route`` and inspects the returned configuration.
    """

    @pytest.fixture
    def router(self):
        """Provide a newly constructed router to each test."""
        return SemanticDecisionRouter()

    def test_person_query_routes_to_qdrant_persons(self, router):
        """Person signals are sent to qdrant's heritage_persons collection."""
        person_lookup = QuerySignals(
            entity_type="person",
            intent="entity_lookup",
            institution_mentions=["Noord-Hollands Archief"],
        )
        route = router.route(person_lookup)
        assert route.primary_backend == "qdrant"
        assert route.qdrant_collection == "heritage_persons"

    def test_person_query_with_institution_filter(self, router):
        """Person signals naming an institution get a custodian_slug filter."""
        person_lookup = QuerySignals(
            entity_type="person",
            intent="entity_lookup",
            institution_mentions=["Noord-Hollands Archief"],
        )
        route = router.route(person_lookup)
        assert "custodian_slug" in route.qdrant_filters
        assert "noord-hollands-archief" in route.qdrant_filters["custodian_slug"]

    def test_statistical_query_routes_to_ducklake(self, router):
        """Statistical signals are sent to the ducklake backend."""
        counting = QuerySignals(
            entity_type="institution",
            intent="statistical",
            requires_aggregation=True,
        )
        route = router.route(counting)
        assert route.primary_backend == "ducklake"

    def test_temporal_query_uses_temporal_templates(self, router):
        """Temporal signals go to sparql with temporal templates switched on."""
        dated = QuerySignals(
            entity_type="institution",
            intent="temporal",
            has_temporal_constraint=True,
        )
        route = router.route(dated)
        assert route.primary_backend == "sparql"
        assert route.use_temporal_templates is True

    def test_geographic_query_routes_to_sparql(self, router):
        """Geographic signals are sent to the sparql backend."""
        located = QuerySignals(
            entity_type="institution",
            intent="geographic",
            has_geographic_constraint=True,
            location_mentions=["Amsterdam"],
        )
        route = router.route(located)
        assert route.primary_backend == "sparql"

    def test_default_hybrid_routing(self, router):
        """Exploration signals get qdrant as primary and sparql as secondary."""
        open_ended = QuerySignals(
            entity_type="institution",
            intent="exploration",
        )
        route = router.route(open_ended)
        assert route.primary_backend == "qdrant"
        assert route.secondary_backend == "sparql"
|
|
|
|
|
|
class TestSlugGeneration:
    """Checks on the slug the router derives from an institution name."""

    @pytest.fixture
    def router(self):
        """Provide a newly constructed router to each test."""
        return SemanticDecisionRouter()

    def test_simple_slug(self, router):
        """A single-word name becomes its lowercase form."""
        assert router._to_slug("Rijksmuseum") == "rijksmuseum"

    def test_slug_with_spaces(self, router):
        """A space in the name turns into a hyphen."""
        assert router._to_slug("Noord-Hollands Archief") == "noord-hollands-archief"

    def test_slug_with_article(self, router):
        """A leading Dutch article stays part of the slug."""
        assert router._to_slug("Het Utrechts Archief") == "het-utrechts-archief"

    def test_slug_with_diacritics(self, router):
        """Accented characters lose their diacritics in the slug."""
        assert router._to_slug("Musée d'Orsay") == "musee-dorsay"
|
|
|
|
|
|
class TestSingletonInstances:
    """Checks that the module-level accessors hand back a shared instance."""

    def test_signal_extractor_singleton(self):
        """Two calls to get_signal_extractor yield the very same object."""
        first = get_signal_extractor()
        second = get_signal_extractor()
        assert first is second

    def test_decision_router_singleton(self):
        """Two calls to get_decision_router yield the very same object."""
        first = get_decision_router()
        second = get_decision_router()
        assert first is second
|
|
|
|
|
|
class TestIntegration:
    """Integration tests for the full signal-decision flow.

    Each test obtains the shared extractor (and, where routing is checked,
    the shared router) through the module-level accessors, extracts signals
    from a raw query and asserts on the signals and/or the resulting route.
    """

    def test_full_person_query_flow(self):
        """Test complete flow for a person query that names an institution."""
        extractor = get_signal_extractor()
        router = get_decision_router()

        # Query with clear person indicator but also institution mention (mixed)
        signals = extractor.extract_signals(
            "Wie is de archivaris bij het Noord-Hollands Archief?"
        )
        config = router.route(signals)

        # Mixed entity type because both person and institution indicators present
        assert signals.entity_type == "mixed"
        # The exact route for mixed queries is not pinned down here: either
        # qdrant or sparql is accepted as the primary backend.
        assert config.primary_backend in ["qdrant", "sparql"]

    def test_full_pure_person_query_flow(self):
        """Test complete flow for pure person query (no institution mention)."""
        extractor = get_signal_extractor()
        router = get_decision_router()

        signals = extractor.extract_signals("Wie werkt daar als medewerker?")
        config = router.route(signals)

        assert signals.entity_type == "person"
        assert config.primary_backend == "qdrant"
        assert config.qdrant_collection == "heritage_persons"

    def test_full_statistical_query_flow(self):
        """Test complete flow for statistical query."""
        extractor = get_signal_extractor()
        router = get_decision_router()

        signals = extractor.extract_signals(
            "Hoeveel musea zijn er per provincie in Nederland?"
        )
        config = router.route(signals)

        assert signals.intent == "statistical"
        assert signals.requires_aggregation is True
        assert config.primary_backend == "ducklake"

    def test_full_temporal_query_flow(self):
        """Test complete flow for temporal query."""
        extractor = get_signal_extractor()
        router = get_decision_router()

        signals = extractor.extract_signals(
            "Wat is het oudste archief in Noord-Holland?"
        )
        config = router.route(signals)

        assert signals.intent == "temporal"
        assert signals.has_temporal_constraint is True
        assert config.use_temporal_templates is True

    # Parametrized so that every query is reported as its own test case; a
    # plain loop would stop at the first failing query and hide the others.
    @pytest.mark.parametrize(
        "query",
        [
            "Hoeveel archieven zijn er in Nederland?",  # clear aggregation
            "Wanneer is het Nationaal Archief opgericht?",  # clear temporal
            "Welke musea zijn er in Amsterdam?",  # clear geographic + institution
        ],
    )
    def test_high_confidence_skip_llm_threshold(self, query):
        """Verify high-confidence queries meet the 0.8 skip threshold.

        Args:
            query: A query with clear, unambiguous indicators.
        """
        extractor = get_signal_extractor()

        signals = extractor.extract_signals(query)
        assert signals.confidence >= 0.8, (
            f"Query '{query}' has confidence {signals.confidence}, expected >= 0.8"
        )

    def test_moderate_confidence_for_mixed_queries(self):
        """Mixed entity type queries should have lower confidence."""
        extractor = get_signal_extractor()

        # Mixed queries are more ambiguous
        signals = extractor.extract_signals("Wie is de directeur van het Rijksmuseum?")
        # Mixed entity type (person + institution) reduces confidence
        assert signals.entity_type == "mixed"
        assert signals.confidence < 0.9  # Not as high as clear queries
|
|
|
|
|
|
class TestYearPatternDetection:
    """Checks on temporal detection driven by a year in the query.

    A year in the range 1000-2029 is expected to produce temporal intent,
    including when the query also has a geographic indicator such as 'in'.
    """

    @pytest.fixture
    def extractor(self):
        """Provide a newly constructed extractor to each test."""
        return SemanticSignalExtractor()

    def test_year_triggers_temporal_intent(self, extractor):
        """Mentioning a year makes the intent temporal."""
        query = "Wat was de status van het Rijksmuseum in 1990?"
        extracted = extractor.extract_signals(query)
        # The year 1990 should win over the geographic indicator "in".
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_year_1850_triggers_temporal(self, extractor):
        """A historical year makes the intent temporal."""
        query = "Welke musea bestonden in 1850?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_year_2020_with_aggregation_is_statistical(self, extractor):
        """An aggregation query with a year stays statistical.

        'Hoeveel' (how many) is an aggregation indicator, so the intent is
        statistical; the year 2020 still sets the temporal constraint flag.
        """
        query = "Hoeveel archieven waren er in 2020?"
        extracted = extractor.extract_signals(query)
        # Intent is decided by "Hoeveel"; the year is kept as a constraint.
        assert extracted.intent == "statistical"
        assert extracted.requires_aggregation is True
        assert extracted.has_temporal_constraint is True

    def test_year_2020_pure_temporal(self, extractor):
        """A recent year without an aggregation indicator is temporal."""
        query = "Welke archieven bestonden in 2020?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_geographic_without_year_stays_geographic(self, extractor):
        """Without a year, a geographic query remains geographic."""
        query = "Welke musea zijn er in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "geographic"
        assert extracted.has_temporal_constraint is False

    def test_year_overrides_geographic_in(self, extractor):
        """A year makes the query temporal even next to 'in <location>'."""
        query = "Welke musea waren er in Amsterdam in 1900?"
        extracted = extractor.extract_signals(query)
        # The year 1900 takes precedence over "in Amsterdam" for the intent.
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True
        # The location is still recorded as a geographic constraint.
        assert extracted.has_geographic_constraint is True

    def test_year_in_english_query(self, extractor):
        """Year detection also applies to an English query."""
        query = "What museums existed in 1920?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_year_range_boundary_1000(self, extractor):
        """The lower boundary year 1000 is detected."""
        query = "Bestond dit klooster al in 1000?"
        extracted = extractor.extract_signals(query)
        assert extracted.has_temporal_constraint is True

    def test_year_range_boundary_2029(self, extractor):
        """The upper boundary year 2029 is detected (future planning)."""
        query = "Wat zijn de plannen voor 2029?"
        extracted = extractor.extract_signals(query)
        assert extracted.has_temporal_constraint is True

    def test_non_year_number_ignored(self, extractor):
        """A number outside the year range does not change the intent."""
        query = "Hoeveel van de 500 musea hebben een website?"
        extracted = extractor.extract_signals(query)
        # 500 lies outside 1000-2029, so this is a plain statistical query.
        # Only the intent is checked; has_temporal_constraint is left
        # unasserted here.
        assert extracted.intent == "statistical"

    def test_year_combined_with_temporal_keyword(self, extractor):
        """A year plus a temporal keyword gives confident temporal intent."""
        query = "Wanneer in 1945 werd het museum gesloten?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True
        # Two agreeing signals should reach the 0.8 confidence level.
        assert extracted.confidence >= 0.8
|
|
|
|
|
|
# Run with: pytest backend/rag/test_semantic_routing.py -v
|