glam/tests/rag/test_specificity_context_selector.py
kempersc 11983014bb Enhance specificity scoring system integration with existing infrastructure
- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework.
- Added detailed mapping of SPARQL templates to context templates for improved specificity filtering.
- Implemented wrapper patterns around existing classifiers to extend functionality without duplication.
- Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality.
- Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
2026-01-05 17:37:49 +01:00

675 lines
25 KiB
Python

"""
Tests for backend.rag.specificity.context_selector module.
This module tests the dynamic context template selection system, which:
- Maps query intent (geographic, statistical, etc.) to context templates
- Prioritizes entity_type for person queries
- Refines context based on custodian_type (A/M/L/etc.)
- Provides per-template threshold defaults
- Integrates with HeritageQueryRouter predictions
Coverage:
- DynamicContextSelector.select() with various combinations
- Selection priority ordering (person > custodian > intent > fallback)
- select_from_prediction() with mock router output
- Threshold override behavior
- Custom map injection via constructor
- Singleton and convenience function behavior
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Optional
import pytest
from backend.rag.specificity.context_selector import (
INTENT_TO_CONTEXT_MAP,
ENTITY_TYPE_TO_CONTEXT_MAP,
DEFAULT_THRESHOLDS,
ContextSelectionResult,
DynamicContextSelector,
get_dynamic_context_selector,
select_context_for_query,
)
from backend.rag.specificity.models import (
ContextTemplate,
INSTITUTION_TYPE_TO_CONTEXT,
)
# =============================================================================
# Fixtures
# =============================================================================
@pytest.fixture
def selector() -> DynamicContextSelector:
    """Provide a newly built DynamicContextSelector constructed with no arguments."""
    fresh_selector = DynamicContextSelector()
    return fresh_selector
@pytest.fixture
def custom_intent_map() -> dict[str, ContextTemplate]:
    """Provide an intent map for exercising constructor injection.

    The map holds two entries: a "custom_intent" key and a "geographic" key
    pointing at MUSEUM_SEARCH.
    """
    overrides: dict[str, ContextTemplate] = {}
    overrides["custom_intent"] = ContextTemplate.COLLECTION_DISCOVERY
    # Replaces the LOCATION_BROWSE target that the default-selector tests
    # in this file expect for "geographic".
    overrides["geographic"] = ContextTemplate.MUSEUM_SEARCH
    return overrides
@pytest.fixture
def custom_thresholds() -> dict[ContextTemplate, float]:
    """Provide per-template thresholds for exercising constructor injection."""
    injected: dict[ContextTemplate, float] = {}
    # 0.30 is below the 0.45 the default-selector tests expect here.
    injected[ContextTemplate.PERSON_RESEARCH] = 0.30
    # 0.80 is above the 0.60 the default-selector tests expect here.
    injected[ContextTemplate.GENERAL_HERITAGE] = 0.80
    return injected
@dataclass
class MockPrediction:
    """Stand-in for a HeritageQueryRouter prediction used by these tests.

    Every field has a default, so a test only sets the attributes it needs.
    All nullable fields use ``Optional[...]`` so the annotation style is
    uniform within the class.
    """

    # Attributes the tests in this file populate on the mock.
    intent: Optional[str] = None
    entity_type: Optional[str] = None
    target_custodian_type: Optional[str] = None
    # Additional fields that might be present on a prediction.
    sources: Optional[list[str]] = None
    entities: Optional[list[str]] = None
    language: str = "nl"
# =============================================================================
# Test: Intent to Context Mapping
# =============================================================================
class TestIntentMapping:
    """Check which context template a bare intent resolves to."""

    def test_geographic_intent_maps_to_location_browse(
        self, selector: DynamicContextSelector
    ):
        """A "geographic" intent yields LOCATION_BROWSE and is cited in the reason."""
        outcome = selector.select(intent="geographic")

        assert outcome.template == ContextTemplate.LOCATION_BROWSE
        assert "geographic" in outcome.selection_reason

    def test_statistical_intent_maps_to_general(
        self, selector: DynamicContextSelector
    ):
        """A "statistical" intent yields GENERAL_HERITAGE."""
        chosen = selector.select(intent="statistical").template
        assert chosen == ContextTemplate.GENERAL_HERITAGE

    def test_relational_intent_maps_to_organizational_change(
        self, selector: DynamicContextSelector
    ):
        """A "relational" intent yields ORGANIZATIONAL_CHANGE."""
        chosen = selector.select(intent="relational").template
        assert chosen == ContextTemplate.ORGANIZATIONAL_CHANGE

    def test_temporal_intent_maps_to_organizational_change(
        self, selector: DynamicContextSelector
    ):
        """A "temporal" intent yields ORGANIZATIONAL_CHANGE."""
        chosen = selector.select(intent="temporal").template
        assert chosen == ContextTemplate.ORGANIZATIONAL_CHANGE

    def test_entity_lookup_intent_maps_to_identifier_lookup(
        self, selector: DynamicContextSelector
    ):
        """An "entity_lookup" intent yields IDENTIFIER_LOOKUP."""
        chosen = selector.select(intent="entity_lookup").template
        assert chosen == ContextTemplate.IDENTIFIER_LOOKUP

    def test_comparative_intent_maps_to_general(
        self, selector: DynamicContextSelector
    ):
        """A "comparative" intent yields GENERAL_HERITAGE."""
        chosen = selector.select(intent="comparative").template
        assert chosen == ContextTemplate.GENERAL_HERITAGE

    def test_exploration_intent_maps_to_general(
        self, selector: DynamicContextSelector
    ):
        """An "exploration" intent yields GENERAL_HERITAGE."""
        chosen = selector.select(intent="exploration").template
        assert chosen == ContextTemplate.GENERAL_HERITAGE

    def test_unknown_intent_falls_back_to_general(
        self, selector: DynamicContextSelector
    ):
        """An intent name with no mapping yields the GENERAL_HERITAGE fallback."""
        chosen = selector.select(intent="nonexistent_intent").template
        assert chosen == ContextTemplate.GENERAL_HERITAGE
# =============================================================================
# Test: Entity Type Priority
# =============================================================================
class TestEntityTypePriority:
    """Check how entity_type interacts with intent and custodian_type."""

    def test_person_entity_type_takes_priority_over_intent(
        self, selector: DynamicContextSelector
    ):
        """entity_type="person" wins over an intent that alone gives LOCATION_BROWSE."""
        outcome = selector.select(intent="geographic", entity_type="person")

        assert outcome.template == ContextTemplate.PERSON_RESEARCH
        assert "person" in outcome.selection_reason

    def test_person_entity_type_takes_priority_over_custodian_type(
        self, selector: DynamicContextSelector
    ):
        """entity_type="person" wins over custodian_type "A"."""
        # For an institution, "A" is expected to give ARCHIVE_SEARCH instead.
        chosen = selector.select(entity_type="person", custodian_type="A").template
        assert chosen == ContextTemplate.PERSON_RESEARCH

    def test_both_entity_type_uses_general_heritage(
        self, selector: DynamicContextSelector
    ):
        """entity_type="both" yields GENERAL_HERITAGE and is cited in the reason."""
        outcome = selector.select(entity_type="both")

        assert outcome.template == ContextTemplate.GENERAL_HERITAGE
        assert "both" in outcome.selection_reason

    def test_institution_entity_type_allows_intent_mapping(
        self, selector: DynamicContextSelector
    ):
        """entity_type="institution" without a custodian type defers to the intent."""
        chosen = selector.select(
            intent="geographic", entity_type="institution"
        ).template
        assert chosen == ContextTemplate.LOCATION_BROWSE
# =============================================================================
# Test: Custodian Type Refinement
# =============================================================================
class TestCustodianTypeRefinement:
    """Check custodian-type (GLAMORCUBESFIXPHDNT letter) refinement for institutions."""

    def test_archive_custodian_type_maps_to_archive_search(
        self, selector: DynamicContextSelector
    ):
        """Institution + "A" yields ARCHIVE_SEARCH and the letter appears in the reason."""
        outcome = selector.select(entity_type="institution", custodian_type="A")

        assert outcome.template == ContextTemplate.ARCHIVE_SEARCH
        assert "A" in outcome.selection_reason

    def test_museum_custodian_type_maps_to_museum_search(
        self, selector: DynamicContextSelector
    ):
        """Institution + "M" yields MUSEUM_SEARCH."""
        chosen = selector.select(
            entity_type="institution", custodian_type="M"
        ).template
        assert chosen == ContextTemplate.MUSEUM_SEARCH

    def test_library_custodian_type_maps_to_library_search(
        self, selector: DynamicContextSelector
    ):
        """Institution + "L" yields LIBRARY_SEARCH."""
        chosen = selector.select(
            entity_type="institution", custodian_type="L"
        ).template
        assert chosen == ContextTemplate.LIBRARY_SEARCH

    def test_lowercase_custodian_type_is_normalized(
        self, selector: DynamicContextSelector
    ):
        """A lowercase "m" is treated the same as "M"."""
        chosen = selector.select(
            entity_type="institution", custodian_type="m"
        ).template
        assert chosen == ContextTemplate.MUSEUM_SEARCH

    def test_custodian_type_without_entity_type_uses_intent(
        self, selector: DynamicContextSelector
    ):
        """With no entity_type given, the custodian type is ignored and intent decides."""
        # entity_type is deliberately omitted from this call.
        chosen = selector.select(intent="geographic", custodian_type="A").template
        assert chosen == ContextTemplate.LOCATION_BROWSE

    def test_unknown_custodian_type_falls_back_to_intent(
        self, selector: DynamicContextSelector
    ):
        """An unmapped custodian letter ("Z") leaves the decision to the intent."""
        chosen = selector.select(
            intent="geographic",
            entity_type="institution",
            custodian_type="Z",
        ).template
        assert chosen == ContextTemplate.LOCATION_BROWSE
# =============================================================================
# Test: Threshold Selection
# =============================================================================
class TestThresholdSelection:
    """Check the threshold attached to each selection.

    Thresholds are floats, so every comparison uses ``pytest.approx``
    instead of exact ``==``. This keeps the tests from failing on
    representation noise if a threshold is ever derived rather than
    stored as a literal.
    """

    def test_person_research_has_lower_threshold(
        self, selector: DynamicContextSelector
    ):
        """PERSON_RESEARCH should have threshold 0.45."""
        result = selector.select(entity_type="person")
        assert result.threshold == pytest.approx(0.45)

    def test_identifier_lookup_has_lowest_threshold(
        self, selector: DynamicContextSelector
    ):
        """IDENTIFIER_LOOKUP should have threshold 0.40."""
        result = selector.select(intent="entity_lookup")
        assert result.threshold == pytest.approx(0.40)

    def test_archive_search_has_medium_threshold(
        self, selector: DynamicContextSelector
    ):
        """ARCHIVE_SEARCH should have threshold 0.50."""
        result = selector.select(
            entity_type="institution",
            custodian_type="A",
        )
        assert result.threshold == pytest.approx(0.50)

    def test_general_heritage_has_highest_default_threshold(
        self, selector: DynamicContextSelector
    ):
        """GENERAL_HERITAGE should have threshold 0.60."""
        # No arguments: exercises the fallback selection.
        result = selector.select()
        assert result.threshold == pytest.approx(0.60)

    def test_threshold_override_takes_precedence(
        self, selector: DynamicContextSelector
    ):
        """An explicit threshold_override should replace the template default."""
        result = selector.select(
            entity_type="person",
            threshold_override=0.75,
        )
        assert result.threshold == pytest.approx(0.75)

    def test_global_threshold_override_applies_to_all(self):
        """A constructor-level override should apply to every selection."""
        selector = DynamicContextSelector(global_threshold_override=0.99)

        # Two selections that resolve to different templates.
        person_result = selector.select(entity_type="person")
        geographic_result = selector.select(intent="geographic")

        assert person_result.threshold == pytest.approx(0.99)
        assert geographic_result.threshold == pytest.approx(0.99)

    def test_get_threshold_for_template(
        self, selector: DynamicContextSelector
    ):
        """get_threshold_for_template() should return the per-template value."""
        person = selector.get_threshold_for_template(ContextTemplate.PERSON_RESEARCH)
        general = selector.get_threshold_for_template(ContextTemplate.GENERAL_HERITAGE)

        assert person == pytest.approx(0.45)
        assert general == pytest.approx(0.60)

    def test_get_all_thresholds_returns_dict(
        self, selector: DynamicContextSelector
    ):
        """get_all_thresholds() should return a dict keyed by template name."""
        thresholds = selector.get_all_thresholds()

        assert isinstance(thresholds, dict)
        assert "person_research" in thresholds
        assert "general_heritage" in thresholds
        assert thresholds["person_research"] == pytest.approx(0.45)
# =============================================================================
# Test: select_from_prediction()
# =============================================================================
class TestSelectFromPrediction:
    """Check selection driven by a router-prediction-like object."""

    def test_select_from_prediction_extracts_intent(
        self, selector: DynamicContextSelector
    ):
        """The prediction's intent drives the template and is echoed on the result."""
        outcome = selector.select_from_prediction(MockPrediction(intent="geographic"))

        assert outcome.template == ContextTemplate.LOCATION_BROWSE
        assert outcome.intent == "geographic"

    def test_select_from_prediction_extracts_entity_type(
        self, selector: DynamicContextSelector
    ):
        """The prediction's entity_type drives the template and is echoed on the result."""
        outcome = selector.select_from_prediction(MockPrediction(entity_type="person"))

        assert outcome.template == ContextTemplate.PERSON_RESEARCH
        assert outcome.entity_type == "person"

    def test_select_from_prediction_extracts_custodian_type(
        self, selector: DynamicContextSelector
    ):
        """target_custodian_type is read from the prediction and echoed as custodian_type."""
        archive_query = MockPrediction(
            entity_type="institution",
            target_custodian_type="A",
        )
        outcome = selector.select_from_prediction(archive_query)

        assert outcome.template == ContextTemplate.ARCHIVE_SEARCH
        assert outcome.custodian_type == "A"

    def test_select_from_prediction_normalizes_unknown_custodian_type(
        self, selector: DynamicContextSelector
    ):
        """A target_custodian_type of "UNKNOWN" is reported as None on the result."""
        unknown_custodian = MockPrediction(
            intent="geographic",
            entity_type="institution",
            target_custodian_type="UNKNOWN",
        )
        outcome = selector.select_from_prediction(unknown_custodian)

        # With no usable custodian type, the intent decides the template.
        assert outcome.template == ContextTemplate.LOCATION_BROWSE
        assert outcome.custodian_type is None

    def test_select_from_prediction_with_threshold_override(
        self, selector: DynamicContextSelector
    ):
        """threshold_override passed alongside the prediction sets the threshold."""
        person_query = MockPrediction(entity_type="person")
        outcome = selector.select_from_prediction(
            person_query, threshold_override=0.33
        )
        assert outcome.threshold == 0.33

    def test_select_from_prediction_handles_missing_attributes(
        self, selector: DynamicContextSelector
    ):
        """An object lacking the expected attributes yields GENERAL_HERITAGE."""

        @dataclass
        class MinimalPrediction:
            # Exposes none of intent / entity_type / target_custodian_type.
            sparql: str = "SELECT * WHERE { ?s ?p ?o }"

        outcome = selector.select_from_prediction(MinimalPrediction())
        assert outcome.template == ContextTemplate.GENERAL_HERITAGE
# =============================================================================
# Test: Custom Map Injection
# =============================================================================
class TestCustomMapInjection:
    """Check that maps passed to the constructor are honoured."""

    def test_custom_intent_map(
        self, custom_intent_map: dict[str, ContextTemplate]
    ):
        """An injected intent_map supplies both new and redefined intents."""
        injected = DynamicContextSelector(intent_map=custom_intent_map)

        # The "custom_intent" key from the injected map.
        novel = injected.select(intent="custom_intent")
        assert novel.template == ContextTemplate.COLLECTION_DISCOVERY

        # The "geographic" key, which the injected map points at MUSEUM_SEARCH.
        remapped = injected.select(intent="geographic")
        assert remapped.template == ContextTemplate.MUSEUM_SEARCH

    def test_custom_thresholds(
        self, custom_thresholds: dict[ContextTemplate, float]
    ):
        """Injected default_thresholds set the threshold for a selection."""
        injected = DynamicContextSelector(default_thresholds=custom_thresholds)
        person_threshold = injected.select(entity_type="person").threshold
        assert person_threshold == 0.30

    def test_custom_entity_type_map(self):
        """An injected entity_type_map must not redirect entity_type="person".

        The map passed here points "person" at ARCHIVE_SEARCH, yet the
        selection is still expected to be PERSON_RESEARCH: the person case
        is treated as the top-priority rule and is not overridable through
        the custom map.
        """
        hijack_attempt = {"person": ContextTemplate.ARCHIVE_SEARCH}
        injected = DynamicContextSelector(entity_type_map=hijack_attempt)

        chosen = injected.select(entity_type="person").template
        assert chosen == ContextTemplate.PERSON_RESEARCH

    def test_custom_custodian_type_map(self):
        """An injected custodian_type_map redefines the target for "A"."""
        replacement_map = {
            # Entry exercised by the assertion below.
            "A": ContextTemplate.DIGITAL_PLATFORM,
            # Extra entry carried in the map; this test does not query it.
            "X": ContextTemplate.COLLECTION_DISCOVERY,
        }
        injected = DynamicContextSelector(custodian_type_map=replacement_map)

        chosen = injected.select(
            entity_type="institution", custodian_type="A"
        ).template
        assert chosen == ContextTemplate.DIGITAL_PLATFORM
# =============================================================================
# Test: Singleton and Convenience Function
# =============================================================================
class TestSingletonAndConvenience:
    """Check the shared-instance accessor and the module-level helper."""

    def test_get_dynamic_context_selector_returns_same_instance(self):
        """Two calls to the accessor hand back the identical object."""
        first = get_dynamic_context_selector()
        second = get_dynamic_context_selector()
        assert first is second

    def test_select_context_for_query_works(self):
        """The helper returns a ContextSelectionResult with the expected template."""
        outcome = select_context_for_query(
            intent="geographic",
            entity_type="institution",
            custodian_type="A",
        )

        assert isinstance(outcome, ContextSelectionResult)
        assert outcome.template == ContextTemplate.ARCHIVE_SEARCH

    def test_select_context_for_query_with_threshold_override(self):
        """The helper forwards threshold_override to the result."""
        overridden = select_context_for_query(
            entity_type="person", threshold_override=0.25
        ).threshold
        assert overridden == 0.25
# =============================================================================
# Test: ContextSelectionResult
# =============================================================================
class TestContextSelectionResult:
    """Check the ContextSelectionResult value returned by the selector."""

    def test_str_representation(self, selector: DynamicContextSelector):
        """str() of a result mentions the template name, threshold and reason."""
        rendered = str(selector.select(entity_type="person"))

        # Checked in the same order as listed: name, threshold, reason marker.
        for fragment in ("person_research", "0.45", "reason="):
            assert fragment in rendered

    def test_result_contains_all_input_values(
        self, selector: DynamicContextSelector
    ):
        """The result echoes back every argument passed to select()."""
        outcome = selector.select(
            intent="geographic",
            entity_type="institution",
            custodian_type="A",
        )

        assert outcome.intent == "geographic"
        assert outcome.entity_type == "institution"
        assert outcome.custodian_type == "A"
# =============================================================================
# Test: Selection Priority Order
# =============================================================================
class TestSelectionPriorityOrder:
    """Check the precedence chain: person > custodian > intent > fallback."""

    def test_priority_order_person_beats_all(
        self, selector: DynamicContextSelector
    ):
        """With person, a custodian type and an intent all given, person wins."""
        chosen = selector.select(
            intent="geographic",
            entity_type="person",
            custodian_type="M",
        ).template
        assert chosen == ContextTemplate.PERSON_RESEARCH

    def test_priority_order_custodian_beats_intent(
        self, selector: DynamicContextSelector
    ):
        """For an institution, custodian type "M" outranks a "geographic" intent."""
        # The intent alone would give LOCATION_BROWSE.
        chosen = selector.select(
            intent="geographic",
            entity_type="institution",
            custodian_type="M",
        ).template
        assert chosen == ContextTemplate.MUSEUM_SEARCH

    def test_priority_order_intent_used_when_no_custodian(
        self, selector: DynamicContextSelector
    ):
        """For an institution with no custodian type, the intent decides."""
        # custodian_type is deliberately omitted from this call.
        chosen = selector.select(
            intent="geographic", entity_type="institution"
        ).template
        assert chosen == ContextTemplate.LOCATION_BROWSE

    def test_priority_order_fallback_when_nothing_specified(
        self, selector: DynamicContextSelector
    ):
        """With no arguments at all, the GENERAL_HERITAGE fallback is used."""
        chosen = selector.select().template
        assert chosen == ContextTemplate.GENERAL_HERITAGE
# =============================================================================
# Test: Module Constants
# =============================================================================
class TestModuleConstants:
    """Check the shape of the module-level lookup tables."""

    def test_intent_to_context_map_has_expected_intents(self):
        """INTENT_TO_CONTEXT_MAP has exactly the seven expected intent keys."""
        expected_intents = {
            "geographic",
            "statistical",
            "relational",
            "temporal",
            "entity_lookup",
            "comparative",
            "exploration",
        }
        actual_intents = set(INTENT_TO_CONTEXT_MAP.keys())
        assert expected_intents == actual_intents

    def test_entity_type_to_context_map_has_person(self):
        """ENTITY_TYPE_TO_CONTEXT_MAP maps "person" to PERSON_RESEARCH."""
        assert "person" in ENTITY_TYPE_TO_CONTEXT_MAP

        person_target = ENTITY_TYPE_TO_CONTEXT_MAP["person"]
        assert person_target == ContextTemplate.PERSON_RESEARCH

    def test_default_thresholds_covers_all_templates(self):
        """Every ContextTemplate member has an entry in DEFAULT_THRESHOLDS."""
        for template in ContextTemplate:
            assert template in DEFAULT_THRESHOLDS, f"Missing threshold for {template}"

    def test_default_thresholds_in_valid_range(self):
        """Every default threshold lies within the closed interval [0, 1]."""
        for template, threshold in DEFAULT_THRESHOLDS.items():
            assert 0.0 <= threshold <= 1.0, f"Invalid threshold {threshold} for {template}"
# =============================================================================
# Test: Edge Cases
# =============================================================================
class TestEdgeCases:
    """Check behaviour for degenerate inputs."""

    def test_none_values_handled_gracefully(
        self, selector: DynamicContextSelector
    ):
        """Explicit None for every argument yields GENERAL_HERITAGE without raising."""
        chosen = selector.select(
            intent=None,
            entity_type=None,
            custodian_type=None,
        ).template
        assert chosen == ContextTemplate.GENERAL_HERITAGE

    def test_empty_string_intent_treated_as_unknown(
        self, selector: DynamicContextSelector
    ):
        """An empty-string intent yields the GENERAL_HERITAGE fallback."""
        chosen = selector.select(intent="").template
        assert chosen == ContextTemplate.GENERAL_HERITAGE

    def test_whitespace_custodian_type_handled(
        self, selector: DynamicContextSelector
    ):
        """A single-space custodian type leaves the decision to the intent."""
        chosen = selector.select(
            intent="geographic",
            entity_type="institution",
            custodian_type=" ",
        ).template
        # Upper-casing " " still gives " ", which is not a custodian letter.
        assert chosen == ContextTemplate.LOCATION_BROWSE