glam/backend/rag/specificity/context_selector.py
kempersc 11983014bb Enhance specificity scoring system integration with existing infrastructure
- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework.
- Added detailed mapping of SPARQL templates to context templates for improved specificity filtering.
- Implemented wrapper patterns around existing classifiers to extend functionality without duplication.
- Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality.
- Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
2026-01-05 17:37:49 +01:00

353 lines
13 KiB
Python

"""
Dynamic context template selection based on query classification.
This module bridges HeritageQueryRouter output to specificity filtering by:
1. Mapping query intent (geographic, statistical, etc.) to context templates
2. Considering entity_type (person, institution, both) for refinement
3. Optionally using target_custodian_type for institution-specific contexts
This enables automatic selection of the most relevant schema context for each query.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Optional, TYPE_CHECKING
from .models import ContextTemplate, INSTITUTION_TYPE_TO_CONTEXT
if TYPE_CHECKING:
from dspy import Prediction
logger = logging.getLogger(__name__)
# Lookup table: query intent label -> context template.
# Keys are the intent strings produced by the query router
# (HeritageQueryIntent.intent); intents missing from this table are
# handled by the selector's fallback logic.
INTENT_TO_CONTEXT_MAP: dict[str, ContextTemplate] = {
    "geographic": ContextTemplate.LOCATION_BROWSE,  # location browsing
    "statistical": ContextTemplate.GENERAL_HERITAGE,  # counts across types
    "relational": ContextTemplate.ORGANIZATIONAL_CHANGE,  # organizational relationships
    "temporal": ContextTemplate.ORGANIZATIONAL_CHANGE,  # organizational history
    "entity_lookup": ContextTemplate.IDENTIFIER_LOOKUP,  # identifier-based search
    "comparative": ContextTemplate.GENERAL_HERITAGE,  # comparing across types
    "exploration": ContextTemplate.GENERAL_HERITAGE,  # general discovery
}
# Lookup table: entity type -> context template.
# Only "person" has a dedicated template here:
#   - "institution" is resolved through the intent-based mapping
#   - "both" resolves to general heritage
# NOTE(review): DynamicContextSelector stores this table (or an override)
# but its select() method checks entity_type == "person" directly rather
# than consulting it.
ENTITY_TYPE_TO_CONTEXT_MAP: dict[str, ContextTemplate] = {
    "person": ContextTemplate.PERSON_RESEARCH,
}
# Recommended specificity threshold for each context template.
#   lower value  -> more selective (fewer classes)
#   higher value -> more inclusive (more classes)
# Templates absent from this table fall back to 0.5 in the selector.
DEFAULT_THRESHOLDS: dict[ContextTemplate, float] = {
    ContextTemplate.PERSON_RESEARCH: 0.45,  # person queries: focused context
    ContextTemplate.IDENTIFIER_LOOKUP: 0.40,  # identifier lookups: core classes
    ContextTemplate.ARCHIVE_SEARCH: 0.50,  # institution-type searches:
    ContextTemplate.MUSEUM_SEARCH: 0.50,  #   moderately selective
    ContextTemplate.LIBRARY_SEARCH: 0.50,
    ContextTemplate.COLLECTION_DISCOVERY: 0.55,  # collection/location: broader context
    ContextTemplate.LOCATION_BROWSE: 0.55,
    ContextTemplate.ORGANIZATIONAL_CHANGE: 0.55,  # needs temporal classes
    ContextTemplate.DIGITAL_PLATFORM: 0.50,  # specialized
    ContextTemplate.GENERAL_HERITAGE: 0.60,  # most inclusive
}
@dataclass
class ContextSelectionResult:
    """Outcome of a dynamic context selection.

    Attributes:
        template: The context template that was chosen.
        threshold: Threshold recommended for use with that template.
        selection_reason: Human-readable explanation of why it was chosen.
        intent: Query intent the selection was based on, if any.
        entity_type: Entity type the selection was based on, if any.
        custodian_type: Target custodian type supplied by the caller, if any.
    """

    template: ContextTemplate
    threshold: float
    selection_reason: str
    intent: Optional[str] = None
    entity_type: Optional[str] = None
    custodian_type: Optional[str] = None

    def __str__(self) -> str:
        """Return a compact one-line summary (template value, threshold, reason)."""
        return (
            f"ContextSelection({self.template.value}, "
            f"threshold={self.threshold}, "
            f"reason='{self.selection_reason}')"
        )
class DynamicContextSelector:
    """Selects appropriate context template based on query classification.

    This class integrates with HeritageQueryRouter output to automatically
    determine the best context template and threshold for specificity filtering.

    Usage:
        selector = DynamicContextSelector()

        # From HeritageQueryRouter prediction
        router_result = router.forward(question)
        selection = selector.select_from_prediction(router_result)

        # Or manually specify parameters
        selection = selector.select(
            intent="geographic",
            entity_type="institution",
            custodian_type="A"  # Archive
        )

        # Use with specificity-aware signatures
        sig = get_schema_aware_sparql_signature(
            context_template=selection.template.value,
            threshold=selection.threshold
        )
    """

    # Threshold used when a template has no entry in the per-template table.
    _FALLBACK_THRESHOLD: float = 0.5

    def __init__(
        self,
        intent_map: Optional[dict[str, ContextTemplate]] = None,
        entity_type_map: Optional[dict[str, ContextTemplate]] = None,
        custodian_type_map: Optional[dict[str, ContextTemplate]] = None,
        default_thresholds: Optional[dict[ContextTemplate, float]] = None,
        global_threshold_override: Optional[float] = None,
    ):
        """Initialize the selector.

        Each mapping argument replaces the corresponding module-level default
        when it is not None. An explicitly supplied empty mapping is honoured
        (it is not silently replaced by the default).

        Args:
            intent_map: Override intent → context mapping
            entity_type_map: Override entity_type → context mapping.
                NOTE(review): stored, but select() currently checks
                entity_type == "person" directly and does not consult it.
            custodian_type_map: Override custodian_type → context mapping
            default_thresholds: Override per-template thresholds
            global_threshold_override: If set, use this threshold for all templates
        """
        # Use `is None` rather than `or`: an empty dict is a valid override.
        self._intent_map = INTENT_TO_CONTEXT_MAP if intent_map is None else intent_map
        self._entity_type_map = (
            ENTITY_TYPE_TO_CONTEXT_MAP if entity_type_map is None else entity_type_map
        )
        self._custodian_type_map = (
            INSTITUTION_TYPE_TO_CONTEXT if custodian_type_map is None else custodian_type_map
        )
        self._default_thresholds = (
            DEFAULT_THRESHOLDS if default_thresholds is None else default_thresholds
        )
        self._global_threshold = global_threshold_override

    def select(
        self,
        intent: Optional[str] = None,
        entity_type: Optional[str] = None,
        custodian_type: Optional[str] = None,
        threshold_override: Optional[float] = None,
    ) -> ContextSelectionResult:
        """Select context template based on query classification.

        Selection priority:
            1. entity_type="person" → PERSON_RESEARCH (highest priority)
            2. entity_type="institution" with a custodian_type (A/M/L/etc.)
               → type-specific context; if the code is not in the custodian
               map, fall back to the intent mapping, then GENERAL_HERITAGE
            3. intent → intent-based context
            4. entity_type="both" → GENERAL_HERITAGE
            5. Fallback → GENERAL_HERITAGE

        Threshold priority: threshold_override, then the selector's global
        override, then the per-template default (0.5 if the template has none).

        Args:
            intent: Query intent from HeritageQueryRouter
            entity_type: "person", "institution", or "both"
            custodian_type: GLAMORCUBESFIXPHDNT single-letter code; matched
                case-insensitively (upper-cased before lookup) and only
                consulted when entity_type is "institution"
            threshold_override: Override the default threshold

        Returns:
            ContextSelectionResult with template, threshold, and reasoning.
            The intent, entity_type and custodian_type fields echo the
            arguments exactly as passed in.
        """
        template: ContextTemplate
        reason: str

        # Priority 1: Person queries always use PERSON_RESEARCH
        if entity_type == "person":
            template = ContextTemplate.PERSON_RESEARCH
            reason = "entity_type='person' → PERSON_RESEARCH"
        # Priority 2: Custodian type refinement for institutions
        elif entity_type == "institution" and custodian_type:
            custodian_upper = custodian_type.upper()
            if custodian_upper in self._custodian_type_map:
                template = self._custodian_type_map[custodian_upper]
                reason = f"custodian_type='{custodian_upper}' → {template.value}"
            elif intent and intent in self._intent_map:
                template = self._intent_map[intent]
                reason = f"intent='{intent}' (custodian_type unknown) → {template.value}"
            else:
                template = ContextTemplate.GENERAL_HERITAGE
                reason = "Unknown custodian_type, no intent → GENERAL_HERITAGE"
        # Priority 3: Intent-based mapping
        elif intent and intent in self._intent_map:
            template = self._intent_map[intent]
            reason = f"intent='{intent}' → {template.value}"
        # Priority 4: entity_type="both" uses general
        elif entity_type == "both":
            template = ContextTemplate.GENERAL_HERITAGE
            reason = "entity_type='both' → GENERAL_HERITAGE"
        # Fallback
        else:
            template = ContextTemplate.GENERAL_HERITAGE
            reason = "No classification available → GENERAL_HERITAGE"

        # Single log call for every branch; %-args defer formatting until
        # DEBUG is actually enabled.
        logger.debug("Context selection: %s", reason)

        # A per-call override wins; otherwise defer to the shared
        # global/per-template lookup.
        if threshold_override is not None:
            threshold = threshold_override
        else:
            threshold = self.get_threshold_for_template(template)

        return ContextSelectionResult(
            template=template,
            threshold=threshold,
            selection_reason=reason,
            intent=intent,
            entity_type=entity_type,
            custodian_type=custodian_type,
        )

    def select_from_prediction(
        self,
        prediction: "Prediction",
        threshold_override: Optional[float] = None,
    ) -> ContextSelectionResult:
        """Select context from HeritageQueryRouter prediction.

        Reads the ``intent``, ``entity_type`` and ``target_custodian_type``
        attributes from the prediction (each treated as None when absent)
        and calls select().

        Args:
            prediction: Output from HeritageQueryRouter.forward()
            threshold_override: Override the default threshold

        Returns:
            ContextSelectionResult with template, threshold, and reasoning
        """
        intent = getattr(prediction, "intent", None)
        entity_type = getattr(prediction, "entity_type", None)
        custodian_type = getattr(prediction, "target_custodian_type", None)

        # Normalize "UNKNOWN" to None
        if custodian_type == "UNKNOWN":
            custodian_type = None

        return self.select(
            intent=intent,
            entity_type=entity_type,
            custodian_type=custodian_type,
            threshold_override=threshold_override,
        )

    def get_threshold_for_template(self, template: ContextTemplate) -> float:
        """Get the effective threshold for a context template.

        Args:
            template: Context template

        Returns:
            The selector's global override when one is set; otherwise the
            per-template default, or 0.5 when the template has no entry.
        """
        if self._global_threshold is not None:
            return self._global_threshold
        return self._default_thresholds.get(template, self._FALLBACK_THRESHOLD)

    def get_all_thresholds(self) -> dict[str, float]:
        """Get the effective threshold for every context template.

        Uses the same rules as get_threshold_for_template(), so a global
        override (if set) is reported for every template.

        Returns:
            Dict mapping each ContextTemplate member's ``value`` to its threshold
        """
        return {
            template.value: self.get_threshold_for_template(template)
            for template in ContextTemplate
        }
# Module-wide shared selector; created lazily on first request.
_selector_instance: Optional[DynamicContextSelector] = None


def get_dynamic_context_selector() -> DynamicContextSelector:
    """Return the shared DynamicContextSelector, creating it on first use.

    The instance is built with default arguments and cached in the
    module-level ``_selector_instance`` variable for subsequent calls.
    """
    global _selector_instance
    if _selector_instance is not None:
        return _selector_instance
    _selector_instance = DynamicContextSelector()
    return _selector_instance
def select_context_for_query(
    intent: Optional[str] = None,
    entity_type: Optional[str] = None,
    custodian_type: Optional[str] = None,
    threshold_override: Optional[float] = None,
) -> ContextSelectionResult:
    """Pick a context template for a query using the shared selector.

    Thin wrapper that forwards all arguments to the ``select`` method of
    the instance returned by get_dynamic_context_selector().

    Args:
        intent: Query intent from HeritageQueryRouter
        entity_type: "person", "institution", or "both"
        custodian_type: GLAMORCUBESFIXPHDNT single-letter code
        threshold_override: Override the default threshold

    Returns:
        ContextSelectionResult with template, threshold, and reasoning

    Example:
        from backend.rag.specificity import select_context_for_query

        # After query routing
        result = select_context_for_query(
            intent="geographic",
            entity_type="institution",
            custodian_type="A"
        )
        # Expected (assuming "A" maps to ARCHIVE_SEARCH in
        # INSTITUTION_TYPE_TO_CONTEXT):
        # result.template = ContextTemplate.ARCHIVE_SEARCH
        # result.threshold = 0.5
    """
    selector = get_dynamic_context_selector()
    return selector.select(
        intent=intent,
        entity_type=entity_type,
        custodian_type=custodian_type,
        threshold_override=threshold_override,
    )