- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework.
- Added detailed mapping of SPARQL templates to context templates for improved specificity filtering.
- Implemented wrapper patterns around existing classifiers to extend functionality without duplication.
- Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality.
- Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
353 lines
13 KiB
Python
353 lines
13 KiB
Python
"""
|
|
Dynamic context template selection based on query classification.
|
|
|
|
This module bridges HeritageQueryRouter output to specificity filtering by:
|
|
1. Mapping query intent (geographic, statistical, etc.) to context templates
|
|
2. Considering entity_type (person, institution, both) for refinement
|
|
3. Optionally using target_custodian_type for institution-specific contexts
|
|
|
|
This enables automatic selection of the most relevant schema context for each query.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional, TYPE_CHECKING
|
|
|
|
from .models import ContextTemplate, INSTITUTION_TYPE_TO_CONTEXT
|
|
|
|
if TYPE_CHECKING:
|
|
from dspy import Prediction
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Routing table from a HeritageQueryIntent.intent value to the ContextTemplate
# used for specificity filtering. Several intents share the same template.
INTENT_TO_CONTEXT_MAP: dict[str, ContextTemplate] = {
    "geographic": ContextTemplate.LOCATION_BROWSE,  # location browsing
    "statistical": ContextTemplate.GENERAL_HERITAGE,  # counts across types
    "relational": ContextTemplate.ORGANIZATIONAL_CHANGE,  # organizational relationships
    "temporal": ContextTemplate.ORGANIZATIONAL_CHANGE,  # organizational history
    "entity_lookup": ContextTemplate.IDENTIFIER_LOOKUP,  # identifier-based search
    "comparative": ContextTemplate.GENERAL_HERITAGE,  # comparing across types
    "exploration": ContextTemplate.GENERAL_HERITAGE,  # general discovery
}
|
|
|
|
|
|
# Routing table from a known entity_type to a ContextTemplate; an entry here
# can override or refine the intent-based choice.
#
# Only "person" has an entry:
#   - "institution" is resolved through the intent-based mapping
#   - "both" resolves to general heritage
ENTITY_TYPE_TO_CONTEXT_MAP: dict[str, ContextTemplate] = {
    "person": ContextTemplate.PERSON_RESEARCH,
}
|
|
|
|
|
|
# Per-template default thresholds.
#   lower value  -> more selective (fewer classes)
#   higher value -> more inclusive (more classes)
DEFAULT_THRESHOLDS: dict[ContextTemplate, float] = {
    ContextTemplate.PERSON_RESEARCH: 0.45,  # person queries benefit from focused context
    ContextTemplate.IDENTIFIER_LOOKUP: 0.40,  # identifier lookups need core classes
    # Institution-type searches are moderately selective.
    ContextTemplate.ARCHIVE_SEARCH: 0.50,
    ContextTemplate.MUSEUM_SEARCH: 0.50,
    ContextTemplate.LIBRARY_SEARCH: 0.50,
    # Collection and location queries need broader context.
    ContextTemplate.COLLECTION_DISCOVERY: 0.55,
    ContextTemplate.LOCATION_BROWSE: 0.55,
    ContextTemplate.ORGANIZATIONAL_CHANGE: 0.55,  # needs temporal classes
    ContextTemplate.DIGITAL_PLATFORM: 0.50,  # specialized
    ContextTemplate.GENERAL_HERITAGE: 0.60,  # most inclusive
}
|
|
|
|
|
|
@dataclass
class ContextSelectionResult:
    """Outcome of a dynamic context selection.

    Attributes:
        template: The context template that was chosen.
        threshold: The threshold recommended for that template.
        selection_reason: Human-readable explanation of why it was chosen.
        intent: The query intent the selection was based on, if any.
        entity_type: The entity type the selection was based on, if any.
        custodian_type: The target custodian type, if any.
    """

    template: ContextTemplate
    threshold: float
    selection_reason: str
    intent: Optional[str] = None
    entity_type: Optional[str] = None
    custodian_type: Optional[str] = None

    def __str__(self) -> str:
        """Return a compact one-line summary of template, threshold and reason."""
        summary_parts = (
            f"{self.template.value}",
            f"threshold={self.threshold}",
            f"reason='{self.selection_reason}'",
        )
        return "ContextSelection(" + ", ".join(summary_parts) + ")"
|
|
|
|
|
|
class DynamicContextSelector:
    """Selects appropriate context template based on query classification.

    This class integrates with HeritageQueryRouter output to automatically
    determine the best context template and threshold for specificity filtering.

    Usage:
        selector = DynamicContextSelector()

        # From HeritageQueryRouter prediction
        router_result = router.forward(question)
        selection = selector.select_from_prediction(router_result)

        # Or manually specify parameters
        selection = selector.select(
            intent="geographic",
            entity_type="institution",
            custodian_type="A"  # Archive
        )

        # Use with specificity-aware signatures
        sig = get_schema_aware_sparql_signature(
            context_template=selection.template.value,
            threshold=selection.threshold
        )
    """

    def __init__(
        self,
        intent_map: Optional[dict[str, ContextTemplate]] = None,
        entity_type_map: Optional[dict[str, ContextTemplate]] = None,
        custodian_type_map: Optional[dict[str, ContextTemplate]] = None,
        default_thresholds: Optional[dict[ContextTemplate, float]] = None,
        global_threshold_override: Optional[float] = None,
    ) -> None:
        """Initialize the selector.

        Each mapping argument replaces the corresponding module-level default
        only when it is not ``None``. An explicitly passed empty mapping is
        honoured (it disables that lookup) instead of silently falling back
        to the default.

        Args:
            intent_map: Override intent → context mapping
            entity_type_map: Override entity_type → context mapping
            custodian_type_map: Override custodian_type → context mapping
            default_thresholds: Override per-template thresholds
            global_threshold_override: If set, use this threshold for all templates
        """
        # ``is None`` rather than ``or``: an empty dict is falsy, so ``or``
        # would discard a deliberate "no entries" override.
        self._intent_map = INTENT_TO_CONTEXT_MAP if intent_map is None else intent_map
        self._entity_type_map = (
            ENTITY_TYPE_TO_CONTEXT_MAP if entity_type_map is None else entity_type_map
        )
        self._custodian_type_map = (
            INSTITUTION_TYPE_TO_CONTEXT if custodian_type_map is None else custodian_type_map
        )
        self._default_thresholds = (
            DEFAULT_THRESHOLDS if default_thresholds is None else default_thresholds
        )
        self._global_threshold = global_threshold_override

    def select(
        self,
        intent: Optional[str] = None,
        entity_type: Optional[str] = None,
        custodian_type: Optional[str] = None,
        threshold_override: Optional[float] = None,
    ) -> ContextSelectionResult:
        """Select context template based on query classification.

        Selection priority:
        1. entity_type found in the entity-type map (by default only
           "person" → PERSON_RESEARCH) — highest priority
        2. entity_type="institution" with a custodian_type: the upper-cased
           code is looked up in the custodian-type map; if it is not there,
           the intent mapping is tried, then GENERAL_HERITAGE
        3. intent → intent-based context
        4. entity_type="both" → GENERAL_HERITAGE
        5. Fallback → GENERAL_HERITAGE

        Threshold priority: ``threshold_override``, then the selector-wide
        global override, then the per-template default (0.5 if the template
        has no entry).

        Args:
            intent: Query intent from HeritageQueryRouter
            entity_type: "person", "institution", or "both"
            custodian_type: GLAMORCUBESFIXPHDNT single-letter code
            threshold_override: Override the default threshold

        Returns:
            ContextSelectionResult with template, threshold, and reasoning
        """
        template: ContextTemplate
        reason: str

        # Priority 1: entity-type mapping. Consulting the configured map
        # (instead of hard-coding "person") makes the ``entity_type_map``
        # constructor override effective; with the default map the outcome
        # and the reason text are the same as the hard-coded check.
        if entity_type is not None and entity_type in self._entity_type_map:
            template = self._entity_type_map[entity_type]
            reason = f"entity_type='{entity_type}' → {template.name}"

        # Priority 2: Custodian type refinement for institutions
        elif entity_type == "institution" and custodian_type:
            custodian_upper = custodian_type.upper()
            if custodian_upper in self._custodian_type_map:
                template = self._custodian_type_map[custodian_upper]
                reason = f"custodian_type='{custodian_upper}' → {template.value}"
            elif intent and intent in self._intent_map:
                template = self._intent_map[intent]
                reason = f"intent='{intent}' (custodian_type unknown) → {template.value}"
            else:
                template = ContextTemplate.GENERAL_HERITAGE
                reason = "Unknown custodian_type, no intent → GENERAL_HERITAGE"

        # Priority 3: Intent-based mapping
        elif intent and intent in self._intent_map:
            template = self._intent_map[intent]
            reason = f"intent='{intent}' → {template.value}"

        # Priority 4: entity_type="both" uses general
        elif entity_type == "both":
            template = ContextTemplate.GENERAL_HERITAGE
            reason = "entity_type='both' → GENERAL_HERITAGE"

        # Fallback
        else:
            template = ContextTemplate.GENERAL_HERITAGE
            reason = "No classification available → GENERAL_HERITAGE"

        # Exactly one branch ran, so log once; %-style args defer formatting
        # until a handler actually emits the record.
        logger.debug("Context selection: %s", reason)

        # A per-call override wins; otherwise defer to the shared
        # global-override / per-template-default resolution.
        if threshold_override is not None:
            threshold = threshold_override
        else:
            threshold = self.get_threshold_for_template(template)

        return ContextSelectionResult(
            template=template,
            threshold=threshold,
            selection_reason=reason,
            intent=intent,
            entity_type=entity_type,
            custodian_type=custodian_type,
        )

    def select_from_prediction(
        self,
        prediction: "Prediction",
        threshold_override: Optional[float] = None,
    ) -> ContextSelectionResult:
        """Select context from HeritageQueryRouter prediction.

        Reads the ``intent``, ``entity_type`` and ``target_custodian_type``
        attributes from the prediction object (each defaults to ``None`` when
        absent) and calls select().

        Args:
            prediction: Output from HeritageQueryRouter.forward()
            threshold_override: Override the default threshold

        Returns:
            ContextSelectionResult with template, threshold, and reasoning
        """
        intent = getattr(prediction, "intent", None)
        entity_type = getattr(prediction, "entity_type", None)
        custodian_type = getattr(prediction, "target_custodian_type", None)

        # Normalize the "UNKNOWN" placeholder to None. select() upper-cases
        # custodian codes before lookup, so the placeholder is matched
        # case-insensitively (and ignoring surrounding whitespace) as well.
        if isinstance(custodian_type, str) and custodian_type.strip().upper() == "UNKNOWN":
            custodian_type = None

        return self.select(
            intent=intent,
            entity_type=entity_type,
            custodian_type=custodian_type,
            threshold_override=threshold_override,
        )

    def get_threshold_for_template(self, template: ContextTemplate) -> float:
        """Get the effective threshold for a context template.

        Args:
            template: Context template

        Returns:
            The global threshold override if one was configured, otherwise
            the per-template default, otherwise 0.5.
        """
        if self._global_threshold is not None:
            return self._global_threshold
        return self._default_thresholds.get(template, 0.5)

    def get_all_thresholds(self) -> dict[str, float]:
        """Get all default thresholds by template name.

        The global threshold override is not applied here; templates without
        a configured default are reported as 0.5.

        Returns:
            Dict mapping each template's ``value`` to its default threshold
        """
        return {
            template.value: self._default_thresholds.get(template, 0.5)
            for template in ContextTemplate
        }
|
|
|
|
|
|
# Module-wide selector shared by all callers; created on first request.
_selector_instance: Optional[DynamicContextSelector] = None


def get_dynamic_context_selector() -> DynamicContextSelector:
    """Return the shared selector, constructing it with defaults on first use."""
    global _selector_instance
    existing = _selector_instance
    if existing is not None:
        return existing
    _selector_instance = DynamicContextSelector()
    return _selector_instance
|
|
|
|
|
|
def select_context_for_query(
    intent: Optional[str] = None,
    entity_type: Optional[str] = None,
    custodian_type: Optional[str] = None,
    threshold_override: Optional[float] = None,
) -> ContextSelectionResult:
    """Pick a context template using the shared selector instance.

    Thin convenience wrapper: obtains the module-wide selector and forwards
    all arguments, unchanged, to its ``select`` method.

    Args:
        intent: Query intent from HeritageQueryRouter
        entity_type: "person", "institution", or "both"
        custodian_type: GLAMORCUBESFIXPHDNT single-letter code
        threshold_override: Override the default threshold

    Returns:
        ContextSelectionResult with template, threshold, and reasoning

    Example:
        from backend.rag.specificity import select_context_for_query

        # After query routing
        result = select_context_for_query(
            intent="geographic",
            entity_type="institution",
            custodian_type="A"
        )
        # Assuming "A" maps to ARCHIVE_SEARCH in INSTITUTION_TYPE_TO_CONTEXT:
        # result.template = ContextTemplate.ARCHIVE_SEARCH
        # result.threshold = 0.5
    """
    shared_selector = get_dynamic_context_selector()
    selection = shared_selector.select(
        intent=intent,
        entity_type=entity_type,
        custodian_type=custodian_type,
        threshold_override=threshold_override,
    )
    return selection
|