# glam/backend/rag/test_semantic_routing.py

"""
Tests for Semantic Routing (Signal-Decision Pattern)

Tests the SemanticSignalExtractor and SemanticDecisionRouter classes
which enable fast LLM-free query routing for high-confidence queries.
"""

import pytest
from .semantic_router import (
    QuerySignals,
    RouteConfig,
    SemanticSignalExtractor,
    SemanticDecisionRouter,
    get_signal_extractor,
    get_decision_router,
)


class TestSemanticSignalExtractor:
    """Checks on the signals SemanticSignalExtractor derives from query text.

    Grouped by signal: entity type, intent, extracted mentions, language
    and confidence.
    """

    @pytest.fixture
    def extractor(self):
        """Build the SemanticSignalExtractor under test."""
        return SemanticSignalExtractor()

    # ===== Entity Type Detection =====

    def test_detect_person_query(self, extractor):
        """A query with only a person indicator yields entity type 'person'."""
        # The query is meant to carry a person indicator and no institution one.
        query = "Wie werkt daar als medewerker?"
        extracted = extractor.extract_signals(query)
        assert extracted.entity_type == "person"

    def test_detect_person_query_with_institution_is_mixed(self, extractor):
        """A person question that names an institution is classified 'mixed'."""
        query = "Wie is de archivaris bij het Noord-Hollands Archief?"
        extracted = extractor.extract_signals(query)
        # "archief" is expected to count as an institution indicator.
        assert extracted.entity_type == "mixed"

    def test_detect_person_query_with_organisatie_is_mixed(self, extractor):
        """A person question containing 'organisatie' is classified 'mixed'."""
        query = "Wie is de directeur van deze organisatie?"
        extracted = extractor.extract_signals(query)
        # "organisatie" is expected to count as an institution indicator.
        assert extracted.entity_type == "mixed"

    def test_detect_institution_query(self, extractor):
        """A query about museums yields entity type 'institution'."""
        query = "Welke musea zijn er in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert extracted.entity_type == "institution"

    def test_detect_mixed_query(self, extractor):
        """Person and institution indicators together yield 'mixed'."""
        query = "Welke curatoren werken bij musea in Utrecht?"
        extracted = extractor.extract_signals(query)
        assert extracted.entity_type == "mixed"

    def test_default_to_institution(self, extractor):
        """Without a clear indicator the entity type falls back to 'institution'."""
        query = "Vertel me over cultureel erfgoed"
        extracted = extractor.extract_signals(query)
        assert extracted.entity_type == "institution"

    # ===== Intent Classification =====

    def test_statistical_intent(self, extractor):
        """A 'how many' question is statistical and requires aggregation."""
        query = "Hoeveel archieven zijn er in Nederland?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "statistical"
        assert extracted.requires_aggregation is True

    def test_temporal_intent(self, extractor):
        """A 'when' question is temporal and carries a temporal constraint."""
        query = "Wanneer is het Rijksmuseum opgericht?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_temporal_intent_with_oldest(self, extractor):
        """An 'oldest' question is treated as temporal."""
        query = "Wat is het oudste museum in Nederland?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_geographic_intent(self, extractor):
        """A 'where' question is geographic and carries a geographic constraint."""
        # "waar" (where) is expected to act as a geographic indicator.
        query = "Waar staat dit museum?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "geographic"
        assert extracted.has_geographic_constraint is True

    def test_geographic_intent_with_location(self, extractor):
        """Mentioning a location sets the geographic constraint flag."""
        query = "Vertel me over musea in Amsterdam"
        extracted = extractor.extract_signals(query)
        assert extracted.has_geographic_constraint is True

    def test_temporal_indicator_substring_fixed(self, extractor):
        """Regression: 'na' inside a longer word must not count as temporal.

        'Nationaal' contains 'na', but only a standalone 'na' should match.
        The query is expected to be geographic with no temporal constraint.
        """
        query = "In welke stad ligt het Nationaal Archief?"
        extracted = extractor.extract_signals(query)
        # The leading "In" is a standalone word and is expected to match the
        # geographic indicator; nothing here should match "na" on its own.
        assert extracted.intent == "geographic"
        assert extracted.has_temporal_constraint is False

    def test_entity_lookup_intent(self, extractor):
        """A 'what is X' question is classified as entity_lookup."""
        query = "Wat is het Rijksmuseum?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "entity_lookup"

    def test_comparative_intent(self, extractor):
        """A 'compare X with Y' request is classified as comparative."""
        query = "Vergelijk het Rijksmuseum met het Van Gogh Museum"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "comparative"

    def test_exploration_default_intent(self, extractor):
        """An open question without clear indicators falls back to exploration."""
        # The query is chosen to avoid geographic, temporal and aggregation
        # indicators; "in" is a geographic indicator, so it is avoided here.
        query = "Welke schilderijen vallen op?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "exploration"

    def test_geographic_indicator_substring_fixed(self, extractor):
        """Regression: 'in' inside a longer word must not count as geographic.

        'interessant' contains 'in', but only a standalone 'in' should match.
        """
        query = "Welke schilderijen zijn interessant?"
        extracted = extractor.extract_signals(query)
        # Expected outcome is exploration with no geographic constraint.
        assert extracted.intent == "exploration"
        assert extracted.has_geographic_constraint is False

    def test_word_boundary_in_works_correctly(self, extractor):
        """A standalone 'in' still triggers geographic intent."""
        query = "Welke musea zijn er in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "geographic"
        assert extracted.has_geographic_constraint is True

    def test_word_boundary_na_works_correctly(self, extractor):
        """A standalone 'na' still triggers temporal intent."""
        # Dutch "na de fusie" means "after the merger".
        query = "Wat gebeurde er na de fusie met het archief?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    # ===== Entity Extraction =====

    def test_extract_institution_mention(self, extractor):
        """At least one institution mention is extracted from the query."""
        query = "Vertel me over het Noord-Hollands Archief"
        extracted = extractor.extract_signals(query)
        # Only the count is checked; the exact mention text is not pinned.
        assert len(extracted.institution_mentions) >= 1

    def test_extract_location_mention(self, extractor):
        """A named city appears in location_mentions and sets the geo flag."""
        query = "Welke musea zijn er in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert "Amsterdam" in extracted.location_mentions
        assert extracted.has_geographic_constraint is True

    def test_extract_multiple_locations(self, extractor):
        """Every city named in the query appears in location_mentions."""
        query = "Archieven in Utrecht en Haarlem"
        found = extractor.extract_signals(query).location_mentions
        assert "Utrecht" in found
        assert "Haarlem" in found

    # ===== Language Detection =====

    def test_detect_dutch_language(self, extractor):
        """A Dutch query is tagged with language 'nl'."""
        query = "Hoeveel musea zijn er in Nederland?"
        extracted = extractor.extract_signals(query)
        assert extracted.language == "nl"

    def test_detect_english_language(self, extractor):
        """An English query is tagged with language 'en'."""
        query = "How many museums are there in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert extracted.language == "en"

    # ===== Confidence Scoring =====

    def test_high_confidence_clear_query(self, extractor):
        """A clear query scores a confidence of at least 0.8."""
        query = "Hoeveel archieven zijn er in Noord-Holland?"
        extracted = extractor.extract_signals(query)
        assert extracted.confidence >= 0.8

    def test_moderate_confidence_ambiguous_query(self, extractor):
        """An ambiguous query scores a confidence below 0.9."""
        query = "erfgoed informatie"
        extracted = extractor.extract_signals(query)
        assert extracted.confidence < 0.9

    def test_confidence_capped_at_095(self, extractor):
        """Confidence never exceeds 0.95."""
        query = "Hoeveel musea zijn er in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert extracted.confidence <= 0.95


class TestSemanticDecisionRouter:
    """Checks on the route SemanticDecisionRouter picks for given QuerySignals."""

    @pytest.fixture
    def router(self):
        """Build the SemanticDecisionRouter under test."""
        return SemanticDecisionRouter()

    def test_person_query_routes_to_qdrant_persons(self, router):
        """Person signals route to qdrant and the heritage_persons collection."""
        route = router.route(
            QuerySignals(
                entity_type="person",
                intent="entity_lookup",
                institution_mentions=["Noord-Hollands Archief"],
            )
        )
        assert route.primary_backend == "qdrant"
        assert route.qdrant_collection == "heritage_persons"

    def test_person_query_with_institution_filter(self, router):
        """A person query naming an institution gets a custodian_slug filter."""
        route = router.route(
            QuerySignals(
                entity_type="person",
                intent="entity_lookup",
                institution_mentions=["Noord-Hollands Archief"],
            )
        )
        assert "custodian_slug" in route.qdrant_filters
        # The filter value must contain the slug form of the institution name.
        assert "noord-hollands-archief" in route.qdrant_filters["custodian_slug"]

    def test_statistical_query_routes_to_sparql(self, router):
        """Statistical signals route to SPARQL as the primary backend.

        NOTE: DuckLake was removed from RAG (it is for offline analytics only).
        Statistical queries now rely on SPARQL aggregations
        (COUNT, SUM, AVG, GROUP BY).
        """
        route = router.route(
            QuerySignals(
                entity_type="institution",
                intent="statistical",
                requires_aggregation=True,
            )
        )
        assert route.primary_backend == "sparql"

    def test_temporal_query_uses_temporal_templates(self, router):
        """Temporal signals route to SPARQL with temporal templates enabled."""
        route = router.route(
            QuerySignals(
                entity_type="institution",
                intent="temporal",
                has_temporal_constraint=True,
            )
        )
        assert route.primary_backend == "sparql"
        assert route.use_temporal_templates is True

    def test_geographic_query_routes_to_sparql(self, router):
        """Geographic signals route to SPARQL as the primary backend."""
        route = router.route(
            QuerySignals(
                entity_type="institution",
                intent="geographic",
                has_geographic_constraint=True,
                location_mentions=["Amsterdam"],
            )
        )
        assert route.primary_backend == "sparql"

    def test_default_hybrid_routing(self, router):
        """Exploration signals get qdrant as primary and sparql as secondary."""
        route = router.route(
            QuerySignals(
                entity_type="institution",
                intent="exploration",
            )
        )
        assert route.primary_backend == "qdrant"
        assert route.secondary_backend == "sparql"


class TestSlugGeneration:
    """Checks on SemanticDecisionRouter._to_slug for institution names."""

    @pytest.fixture
    def router(self):
        """Build the SemanticDecisionRouter whose slug helper is tested."""
        return SemanticDecisionRouter()

    def test_simple_slug(self, router):
        """A single-word name is lowercased."""
        assert router._to_slug("Rijksmuseum") == "rijksmuseum"

    def test_slug_with_spaces(self, router):
        """Spaces become hyphens; existing hyphens are kept."""
        assert router._to_slug("Noord-Hollands Archief") == "noord-hollands-archief"

    def test_slug_with_article(self, router):
        """A leading Dutch article stays part of the slug."""
        assert router._to_slug("Het Utrechts Archief") == "het-utrechts-archief"

    def test_slug_with_diacritics(self, router):
        """Diacritics are stripped and the apostrophe is dropped."""
        assert router._to_slug("Musée d'Orsay") == "musee-dorsay"


class TestSingletonInstances:
    """Checks that the module-level accessors hand out one shared instance."""

    def test_signal_extractor_singleton(self):
        """Two calls to get_signal_extractor return the identical object."""
        first, second = get_signal_extractor(), get_signal_extractor()
        assert first is second

    def test_decision_router_singleton(self):
        """Two calls to get_decision_router return the identical object."""
        first, second = get_decision_router(), get_decision_router()
        assert first is second


class TestIntegration:
    """End-to-end checks: extract signals from a query, then route them.

    Every test uses the shared instances returned by get_signal_extractor()
    and get_decision_router().
    """

    def test_full_person_query_flow(self):
        """A person question naming an institution is mixed and still routable."""
        extractor = get_signal_extractor()
        router = get_decision_router()

        # Person indicator plus an institution mention, so 'mixed' is expected.
        signals = extractor.extract_signals(
            "Wie is de archivaris bij het Noord-Hollands Archief?"
        )
        config = router.route(signals)

        assert signals.entity_type == "mixed"
        # Mixed queries are expected to take the default (qdrant hybrid) route;
        # the assertion accepts either backend as primary.
        assert config.primary_backend in ["qdrant", "sparql"]

    def test_full_pure_person_query_flow(self):
        """A person question without an institution routes to heritage_persons."""
        extractor = get_signal_extractor()
        router = get_decision_router()

        signals = extractor.extract_signals("Wie werkt daar als medewerker?")
        config = router.route(signals)

        assert signals.entity_type == "person"
        assert config.primary_backend == "qdrant"
        assert config.qdrant_collection == "heritage_persons"

    def test_full_statistical_query_flow(self):
        """A 'how many per province' question is statistical and routes to SPARQL.

        NOTE: DuckLake was removed from RAG; statistical queries now use
        SPARQL aggregations.
        """
        extractor = get_signal_extractor()
        router = get_decision_router()

        signals = extractor.extract_signals(
            "Hoeveel musea zijn er per provincie in Nederland?"
        )
        config = router.route(signals)

        assert signals.intent == "statistical"
        assert signals.requires_aggregation is True
        assert config.primary_backend == "sparql"

    def test_full_temporal_query_flow(self):
        """An 'oldest' question is temporal and enables temporal templates."""
        extractor = get_signal_extractor()
        router = get_decision_router()

        signals = extractor.extract_signals(
            "Wat is het oudste archief in Noord-Holland?"
        )
        config = router.route(signals)

        assert signals.intent == "temporal"
        assert signals.has_temporal_constraint is True
        assert config.use_temporal_templates is True

    def test_high_confidence_skip_llm_threshold(self):
        """Every clear-cut query must reach the 0.8 confidence threshold.

        All queries are evaluated before asserting, so one run reports every
        query that falls short instead of stopping at the first failure.
        """
        extractor = get_signal_extractor()

        # Queries with clear indicators and no ambiguity.
        high_confidence_queries = [
            "Hoeveel archieven zijn er in Nederland?",  # clear aggregation
            "Wanneer is het Nationaal Archief opgericht?",  # clear temporal
            "Welke musea zijn er in Amsterdam?",  # clear geographic + institution
        ]

        confidences = {
            query: extractor.extract_signals(query).confidence
            for query in high_confidence_queries
        }
        # "not >= 0.8" mirrors a failing "assert confidence >= 0.8" exactly.
        below_threshold = {
            query: confidence
            for query, confidence in confidences.items()
            if not confidence >= 0.8
        }
        assert not below_threshold, (
            f"Queries with confidence below 0.8: {below_threshold}"
        )

    def test_moderate_confidence_for_mixed_queries(self):
        """A mixed (person + institution) query scores below 0.9 confidence."""
        extractor = get_signal_extractor()

        # Mixed queries are more ambiguous than single-entity ones.
        signals = extractor.extract_signals("Wie is de directeur van het Rijksmuseum?")
        assert signals.entity_type == "mixed"
        assert signals.confidence < 0.9  # Not as high as clear queries


class TestYearPatternDetection:
    """Checks on temporal detection driven by year mentions.

    A year in the range 1000-2029 is expected to trigger temporal intent,
    even when the query also contains a geographic indicator such as 'in'.
    """

    @pytest.fixture
    def extractor(self):
        """Build the SemanticSignalExtractor under test."""
        return SemanticSignalExtractor()

    def test_year_triggers_temporal_intent(self, extractor):
        """A year mention makes the query temporal."""
        query = "Wat was de status van het Rijksmuseum in 1990?"
        extracted = extractor.extract_signals(query)
        # The year 1990 should win over the geographic reading of "in".
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_year_1850_triggers_temporal(self, extractor):
        """A nineteenth-century year makes the query temporal."""
        query = "Welke musea bestonden in 1850?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_year_2020_with_aggregation_is_statistical(self, extractor):
        """Aggregation plus a year is statistical with a temporal constraint.

        'Hoeveel' (how many) sets the intent to statistical, while the year
        2020 is still expected to set the temporal constraint flag.
        """
        query = "Hoeveel archieven waren er in 2020?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "statistical"
        assert extracted.requires_aggregation is True
        # The year must still be picked up even though intent is statistical.
        assert extracted.has_temporal_constraint is True

    def test_year_2020_pure_temporal(self, extractor):
        """A recent year without an aggregation word makes the query temporal."""
        query = "Welke archieven bestonden in 2020?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_geographic_without_year_stays_geographic(self, extractor):
        """Without a year, a location query stays geographic."""
        query = "Welke musea zijn er in Amsterdam?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "geographic"
        assert extracted.has_temporal_constraint is False

    def test_year_overrides_geographic_in(self, extractor):
        """A year makes the query temporal even when a location is named."""
        query = "Welke musea waren er in Amsterdam in 1900?"
        extracted = extractor.extract_signals(query)
        # The year 1900 takes precedence over "in Amsterdam" for the intent...
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True
        # ...but the geographic constraint must still be reported.
        assert extracted.has_geographic_constraint is True

    def test_year_in_english_query(self, extractor):
        """Year detection also applies to English queries."""
        query = "What museums existed in 1920?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True

    def test_year_range_boundary_1000(self, extractor):
        """The lower boundary year 1000 sets the temporal constraint."""
        query = "Bestond dit klooster al in 1000?"
        extracted = extractor.extract_signals(query)
        assert extracted.has_temporal_constraint is True

    def test_year_range_boundary_2029(self, extractor):
        """The upper boundary year 2029 (future planning) sets the constraint."""
        query = "Wat zijn de plannen voor 2029?"
        extracted = extractor.extract_signals(query)
        assert extracted.has_temporal_constraint is True

    def test_non_year_number_ignored(self, extractor):
        """A number outside the year range leaves the query statistical."""
        query = "Hoeveel van de 500 musea hebben een website?"
        extracted = extractor.extract_signals(query)
        # 500 lies outside 1000-2029, so it is not a valid year. Only the
        # intent is checked here; has_temporal_constraint is not asserted.
        assert extracted.intent == "statistical"

    def test_year_combined_with_temporal_keyword(self, extractor):
        """A year plus a temporal keyword is temporal with confidence >= 0.8."""
        query = "Wanneer in 1945 werd het museum gesloten?"
        extracted = extractor.extract_signals(query)
        assert extracted.intent == "temporal"
        assert extracted.has_temporal_constraint is True
        # Both signals together are expected to yield high confidence.
        assert extracted.confidence >= 0.8


# Run with: pytest backend/rag/test_semantic_routing.py -v