"""
Test suite for semantic routing (the Signal-Decision pattern).

Exercises SemanticSignalExtractor, which turns a raw query string into a
QuerySignals record, and SemanticDecisionRouter, which maps those signals to
a routing configuration. Together they are meant to route high-confidence
queries without calling an LLM.
"""

import pytest

from .semantic_router import (
    QuerySignals,
    RouteConfig,
    SemanticSignalExtractor,
    SemanticDecisionRouter,
    get_signal_extractor,
    get_decision_router,
)


class TestSemanticSignalExtractor:
    """Signal extraction: entity type, intent, mentions, language, confidence."""

    @pytest.fixture
    def extractor(self):
        """Provide a fresh SemanticSignalExtractor for each test."""
        return SemanticSignalExtractor()

    # ===== Entity Type Detection =====

    def test_detect_person_query(self, extractor):
        """A query with only a person indicator is typed as 'person'."""
        # The query carries a person indicator and no institution indicator.
        extracted = extractor.extract_signals("Wie werkt daar als medewerker?")

        assert extracted.entity_type == "person"

    def test_detect_person_query_with_institution_is_mixed(self, extractor):
        """A person query that also names an institution is typed as 'mixed'."""
        extracted = extractor.extract_signals(
            "Wie is de archivaris bij het Noord-Hollands Archief?"
        )

        # "archief" counts as an institution indicator, hence mixed.
        assert extracted.entity_type == "mixed"

    def test_detect_person_query_with_organisatie_is_mixed(self, extractor):
        """A person query containing 'organisatie' is typed as 'mixed'."""
        extracted = extractor.extract_signals(
            "Wie is de directeur van deze organisatie?"
        )

        # "organisatie" counts as an institution indicator.
        assert extracted.entity_type == "mixed"

    def test_detect_institution_query(self, extractor):
        """A query with institution indicators is typed as 'institution'."""
        extracted = extractor.extract_signals("Welke musea zijn er in Amsterdam?")

        assert extracted.entity_type == "institution"

    def test_detect_mixed_query(self, extractor):
        """Person and institution indicators together give 'mixed'."""
        extracted = extractor.extract_signals(
            "Welke curatoren werken bij musea in Utrecht?"
        )

        assert extracted.entity_type == "mixed"

    def test_default_to_institution(self, extractor):
        """An ambiguous query falls back to the 'institution' entity type."""
        extracted = extractor.extract_signals("Vertel me over cultureel erfgoed")

        assert extracted.entity_type == "institution"

    # ===== Intent Classification =====

    def test_statistical_intent(self, extractor):
        """An aggregation indicator yields statistical intent plus aggregation flag."""
        extracted = extractor.extract_signals(
            "Hoeveel archieven zijn er in Nederland?"
        )

        assert extracted.intent == "statistical"
        assert extracted.requires_aggregation is True

    def test_temporal_intent(self, extractor):
        """A temporal indicator yields temporal intent plus temporal constraint."""
        extracted = extractor.extract_signals("Wanneer is het Rijksmuseum opgericht?")

        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_temporal_intent_with_oldest(self, extractor):
        """An 'oldest' style question is classified as temporal."""
        extracted = extractor.extract_signals(
            "Wat is het oudste museum in Nederland?"
        )

        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_geographic_intent(self, extractor):
        """A geographic indicator yields geographic intent plus geo constraint."""
        # "waar" (where) is a geographic indicator.
        extracted = extractor.extract_signals("Waar staat dit museum?")

        assert extracted.intent == "geographic"
        assert extracted.has_geographic_constraint is True

    def test_geographic_intent_with_location(self, extractor):
        """Mentioning a location sets the geographic constraint flag."""
        extracted = extractor.extract_signals("Vertel me over musea in Amsterdam")

        assert extracted.has_geographic_constraint is True

    def test_temporal_indicator_substring_fixed(self, extractor):
        """Regression check: 'na' inside a longer word must not count as temporal.

        'Nationaal' contains the letters 'na', yet the query is expected to be
        classified as geographic and to carry no temporal constraint.
        """
        extracted = extractor.extract_signals(
            "In welke stad ligt het Nationaal Archief?"
        )

        # No standalone "na" in the query, so temporal must not fire; the
        # leading "In" is a standalone word and matches the geographic indicator.
        assert extracted.intent == "geographic"
        assert extracted.has_temporal_constraint is False

    def test_entity_lookup_intent(self, extractor):
        """A 'what is X' question is classified as entity_lookup."""
        extracted = extractor.extract_signals("Wat is het Rijksmuseum?")

        assert extracted.intent == "entity_lookup"

    def test_comparative_intent(self, extractor):
        """A comparison request is classified as comparative."""
        extracted = extractor.extract_signals(
            "Vergelijk het Rijksmuseum met het Van Gogh Museum"
        )

        assert extracted.intent == "comparative"

    def test_exploration_default_intent(self, extractor):
        """An open question without clear indicators defaults to exploration."""
        # The query has no geographic, temporal, or aggregation indicator.
        # "in" is a geographic indicator, so the query avoids it entirely.
        extracted = extractor.extract_signals("Welke schilderijen vallen op?")

        assert extracted.intent == "exploration"

    def test_geographic_indicator_substring_fixed(self, extractor):
        """Regression check: 'in' inside a longer word must not count as geographic.

        'interessant' contains the letters 'in', yet the query is expected to
        stay exploration and to carry no geographic constraint.
        """
        extracted = extractor.extract_signals("Welke schilderijen zijn interessant?")

        # Expected outcome is exploration rather than geographic.
        assert extracted.intent == "exploration"
        assert extracted.has_geographic_constraint is False

    def test_word_boundary_in_works_correctly(self, extractor):
        """A standalone 'in' still triggers geographic intent."""
        extracted = extractor.extract_signals("Welke musea zijn er in Amsterdam?")

        # "in" appears here as a word of its own.
        assert extracted.intent == "geographic"
        assert extracted.has_geographic_constraint is True

    def test_word_boundary_na_works_correctly(self, extractor):
        """A standalone 'na' still triggers temporal intent."""
        # Dutch: "Na de fusie..." = "After the merger..."
        extracted = extractor.extract_signals(
            "Wat gebeurde er na de fusie met het archief?"
        )

        # "na" appears here as a word of its own.
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    # ===== Entity Extraction =====

    def test_extract_institution_mention(self, extractor):
        """At least one institution mention is extracted from the query."""
        extracted = extractor.extract_signals(
            "Vertel me over het Noord-Hollands Archief"
        )

        # Expected to pick up "Noord-Hollands Archief" or something similar.
        assert len(extracted.institution_mentions) >= 1

    def test_extract_location_mention(self, extractor):
        """A known Dutch location is extracted and sets the geo constraint."""
        extracted = extractor.extract_signals("Welke musea zijn er in Amsterdam?")

        assert "Amsterdam" in extracted.location_mentions
        assert extracted.has_geographic_constraint is True

    def test_extract_multiple_locations(self, extractor):
        """Every location named in the query is extracted."""
        extracted = extractor.extract_signals("Archieven in Utrecht en Haarlem")

        assert "Utrecht" in extracted.location_mentions
        assert "Haarlem" in extracted.location_mentions

    # ===== Language Detection =====

    def test_detect_dutch_language(self, extractor):
        """A Dutch query is reported with language code 'nl'."""
        extracted = extractor.extract_signals("Hoeveel musea zijn er in Nederland?")

        assert extracted.language == "nl"

    def test_detect_english_language(self, extractor):
        """An English query is reported with language code 'en'."""
        extracted = extractor.extract_signals(
            "How many museums are there in Amsterdam?"
        )

        assert extracted.language == "en"

    # ===== Confidence Scoring =====

    def test_high_confidence_clear_query(self, extractor):
        """A clear-cut query scores a confidence of at least 0.8."""
        extracted = extractor.extract_signals(
            "Hoeveel archieven zijn er in Noord-Holland?"
        )

        assert extracted.confidence >= 0.8

    def test_moderate_confidence_ambiguous_query(self, extractor):
        """An ambiguous query scores a confidence below 0.9."""
        extracted = extractor.extract_signals("erfgoed informatie")

        assert extracted.confidence < 0.9

    def test_confidence_capped_at_095(self, extractor):
        """Confidence never exceeds 0.95."""
        extracted = extractor.extract_signals("Hoeveel musea zijn er in Amsterdam?")

        assert extracted.confidence <= 0.95


class TestSemanticDecisionRouter:
    """Routing decisions made by SemanticDecisionRouter for hand-built signals."""

    @pytest.fixture
    def router(self):
        """Provide a fresh SemanticDecisionRouter for each test."""
        return SemanticDecisionRouter()

    def test_person_query_routes_to_qdrant_persons(self, router):
        """Person signals route to qdrant and the heritage_persons collection."""
        route = router.route(
            QuerySignals(
                entity_type="person",
                intent="entity_lookup",
                institution_mentions=["Noord-Hollands Archief"],
            )
        )

        assert route.primary_backend == "qdrant"
        assert route.qdrant_collection == "heritage_persons"

    def test_person_query_with_institution_filter(self, router):
        """Person signals with an institution mention add a custodian_slug filter."""
        route = router.route(
            QuerySignals(
                entity_type="person",
                intent="entity_lookup",
                institution_mentions=["Noord-Hollands Archief"],
            )
        )

        assert "custodian_slug" in route.qdrant_filters
        assert "noord-hollands-archief" in route.qdrant_filters["custodian_slug"]

    def test_statistical_query_routes_to_sparql(self, router):
        """Statistical signals route to SPARQL, which handles the aggregation.

        NOTE: DuckLake removed from RAG - it's for offline analytics only.
        Statistical queries now use SPARQL aggregations
        (COUNT, SUM, AVG, GROUP BY).
        """
        route = router.route(
            QuerySignals(
                entity_type="institution",
                intent="statistical",
                requires_aggregation=True,
            )
        )

        assert route.primary_backend == "sparql"

    def test_temporal_query_uses_temporal_templates(self, router):
        """Temporal signals route to SPARQL with temporal templates enabled."""
        route = router.route(
            QuerySignals(
                entity_type="institution",
                intent="temporal",
                has_temporal_constraint=True,
            )
        )

        assert route.primary_backend == "sparql"
        assert route.use_temporal_templates is True

    def test_geographic_query_routes_to_sparql(self, router):
        """Geographic signals route to SPARQL."""
        route = router.route(
            QuerySignals(
                entity_type="institution",
                intent="geographic",
                has_geographic_constraint=True,
                location_mentions=["Amsterdam"],
            )
        )

        assert route.primary_backend == "sparql"

    def test_default_hybrid_routing(self, router):
        """Exploration signals get hybrid routing: qdrant first, SPARQL second."""
        route = router.route(
            QuerySignals(
                entity_type="institution",
                intent="exploration",
            )
        )

        assert route.primary_backend == "qdrant"
        assert route.secondary_backend == "sparql"


class TestSlugGeneration:
    """Institution-name slugs produced by the router's _to_slug helper."""

    @pytest.fixture
    def router(self):
        """Provide a fresh SemanticDecisionRouter for each test."""
        return SemanticDecisionRouter()

    def test_simple_slug(self, router):
        """A single-word name is lowercased."""
        assert router._to_slug("Rijksmuseum") == "rijksmuseum"

    def test_slug_with_spaces(self, router):
        """Spaces become hyphens."""
        assert router._to_slug("Noord-Hollands Archief") == "noord-hollands-archief"

    def test_slug_with_article(self, router):
        """A leading Dutch article is kept in the slug."""
        assert router._to_slug("Het Utrechts Archief") == "het-utrechts-archief"

    def test_slug_with_diacritics(self, router):
        """Diacritics are stripped from the slug."""
        assert router._to_slug("Musée d'Orsay") == "musee-dorsay"


class TestSingletonInstances:
    """The module-level accessors hand back one shared instance each."""

    def test_signal_extractor_singleton(self):
        """Two calls to get_signal_extractor return the identical object."""
        assert get_signal_extractor() is get_signal_extractor()

    def test_decision_router_singleton(self):
        """Two calls to get_decision_router return the identical object."""
        assert get_decision_router() is get_decision_router()


class TestIntegration:
    """End-to-end checks: extract signals from text, then route them."""

    def test_full_person_query_flow(self):
        """Person query that also names an institution, from text to route."""
        extractor, router = get_signal_extractor(), get_decision_router()

        # Clear person indicator plus an institution mention, so mixed.
        signals = extractor.extract_signals(
            "Wie is de archivaris bij het Noord-Hollands Archief?"
        )
        route = router.route(signals)

        # Both person and institution indicators are present.
        assert signals.entity_type == "mixed"
        # Mixed queries go through default routing; either backend is accepted.
        assert route.primary_backend in ("qdrant", "sparql")

    def test_full_pure_person_query_flow(self):
        """Pure person query (no institution mention), from text to route."""
        extractor, router = get_signal_extractor(), get_decision_router()

        signals = extractor.extract_signals("Wie werkt daar als medewerker?")
        route = router.route(signals)

        assert signals.entity_type == "person"
        assert route.primary_backend == "qdrant"
        assert route.qdrant_collection == "heritage_persons"

    def test_full_statistical_query_flow(self):
        """Statistical query, from text to route.

        NOTE: DuckLake removed from RAG - statistical queries now use SPARQL
        aggregations.
        """
        extractor, router = get_signal_extractor(), get_decision_router()

        signals = extractor.extract_signals(
            "Hoeveel musea zijn er per provincie in Nederland?"
        )
        route = router.route(signals)

        assert signals.intent == "statistical"
        assert signals.requires_aggregation is True
        assert route.primary_backend == "sparql"

    def test_full_temporal_query_flow(self):
        """Temporal query, from text to route."""
        extractor, router = get_signal_extractor(), get_decision_router()

        signals = extractor.extract_signals(
            "Wat is het oudste archief in Noord-Holland?"
        )
        route = router.route(signals)

        assert signals.intent == "temporal"
        assert signals.has_temporal_constraint is True
        assert route.use_temporal_templates is True

    def test_high_confidence_skip_llm_threshold(self):
        """Unambiguous queries reach the 0.8 confidence needed to skip the LLM."""
        extractor = get_signal_extractor()

        # Each query carries a clear indicator and no competing one, so a
        # confidence of at least 0.8 is expected.
        high_confidence_queries = [
            "Hoeveel archieven zijn er in Nederland?",  # clear aggregation
            "Wanneer is het Nationaal Archief opgericht?",  # clear temporal
            "Welke musea zijn er in Amsterdam?",  # clear geographic + institution
        ]

        for query in high_confidence_queries:
            signals = extractor.extract_signals(query)
            assert signals.confidence >= 0.8, (
                f"Query '{query}' has confidence {signals.confidence}, expected >= 0.8"
            )

    def test_moderate_confidence_for_mixed_queries(self):
        """A mixed entity-type query scores below 0.9 confidence."""
        extractor = get_signal_extractor()

        # Person plus institution makes the query more ambiguous.
        signals = extractor.extract_signals("Wie is de directeur van het Rijksmuseum?")

        assert signals.entity_type == "mixed"
        assert signals.confidence < 0.9  # lower than for clear queries


class TestYearPatternDetection:
    """Year-based temporal detection.

    A year mention (1000-2029) is expected to produce temporal intent, even
    when the query also contains a geographic indicator such as 'in'.
    """

    @pytest.fixture
    def extractor(self):
        """Provide a fresh SemanticSignalExtractor for each test."""
        return SemanticSignalExtractor()

    def test_year_triggers_temporal_intent(self, extractor):
        """A year in the query gives temporal intent."""
        extracted = extractor.extract_signals(
            "Wat was de status van het Rijksmuseum in 1990?"
        )

        # The year 1990 must win over "in", which alone would suggest geographic.
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_year_1850_triggers_temporal(self, extractor):
        """A historical year gives temporal intent."""
        extracted = extractor.extract_signals("Welke musea bestonden in 1850?")

        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_year_2020_with_aggregation_is_statistical(self, extractor):
        """Aggregation plus a year gives statistical intent with a temporal flag.

        'Hoeveel' (how many) makes the intent statistical, while the year 2020
        is still expected to set the temporal constraint.
        """
        extracted = extractor.extract_signals("Hoeveel archieven waren er in 2020?")

        # "Hoeveel" decides the intent; the year is still picked up as a constraint.
        assert extracted.intent == "statistical"
        assert extracted.requires_aggregation is True
        assert extracted.has_temporal_constraint is True  # year still detected

    def test_year_2020_pure_temporal(self, extractor):
        """A recent year without aggregation gives temporal intent."""
        extracted = extractor.extract_signals("Welke archieven bestonden in 2020?")

        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_geographic_without_year_stays_geographic(self, extractor):
        """Without a year, a geographic query stays geographic."""
        extracted = extractor.extract_signals("Welke musea zijn er in Amsterdam?")

        assert extracted.intent == "geographic"
        assert extracted.has_temporal_constraint is False

    def test_year_overrides_geographic_in(self, extractor):
        """A year makes the query temporal even alongside 'in <location>'."""
        extracted = extractor.extract_signals(
            "Welke musea waren er in Amsterdam in 1900?"
        )

        # The year 1900 takes precedence over the geographic "in Amsterdam".
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True
        # The geographic constraint is still expected to be reported.
        assert extracted.has_geographic_constraint is True

    def test_year_in_english_query(self, extractor):
        """Year detection also applies to English queries."""
        extracted = extractor.extract_signals("What museums existed in 1920?")

        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_year_range_boundary_1000(self, extractor):
        """The lower boundary year 1000 is detected."""
        extracted = extractor.extract_signals("Bestond dit klooster al in 1000?")

        assert extracted.has_temporal_constraint is True

    def test_year_range_boundary_2029(self, extractor):
        """The upper boundary year 2029 is detected (future planning)."""
        extracted = extractor.extract_signals("Wat zijn de plannen voor 2029?")

        assert extracted.has_temporal_constraint is True

    def test_non_year_number_ignored(self, extractor):
        """A number outside the year range does not change the intent."""
        extracted = extractor.extract_signals(
            "Hoeveel van de 500 musea hebben een website?"
        )

        # 500 lies outside 1000-2029, so it is not a year; the query is
        # statistical. Only the intent is checked here; the temporal flag is
        # deliberately left unasserted.
        assert extracted.intent == "statistical"

    def test_year_combined_with_temporal_keyword(self, extractor):
        """A year plus a temporal keyword gives high-confidence temporal intent."""
        extracted = extractor.extract_signals(
            "Wanneer in 1945 werd het museum gesloten?"
        )

        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True
        # Two agreeing signals are expected to push confidence to 0.8 or above.
        assert extracted.confidence >= 0.8


# Run with: pytest backend/rag/test_semantic_routing.py -v