glam/backend/rag/specificity/classifier.py
kempersc 11983014bb Enhance specificity scoring system integration with existing infrastructure
- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework.
- Added detailed mapping of SPARQL templates to context templates for improved specificity filtering.
- Implemented wrapper patterns around existing classifiers to extend functionality without duplication.
- Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality.
- Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
2026-01-05 17:37:49 +01:00

233 lines
8.9 KiB
Python

"""
Specificity-aware classifier wrapper.

This module provides SpecificityAwareClassifier, which wraps the existing
TemplateClassifier and adds specificity scoring to the output.

The wrapper:
1. Delegates classification to the existing TemplateClassifier
2. Maps SPARQL template → context template via SPARQLToContextMapper
3. Looks up specificity scores via SpecificityLookup
4. Returns ClassificationWithScores with filtered classes

This follows the Option C (Hybrid) architecture: the existing classifier
is preserved unchanged, and we add a thin wrapper layer.
"""
from __future__ import annotations
import logging
from typing import Any, Optional
from .models import (
ClassificationResult,
ClassificationWithScores,
ContextTemplate,
)
from .mapper import get_sparql_to_context_mapper, SPARQLToContextMapper
from .lookup import get_specificity_lookup, SpecificityLookup
logger = logging.getLogger(__name__)
class SpecificityAwareClassifier:
    """Wrapper that adds specificity scoring to TemplateClassifier.

    This class wraps the existing TemplateClassifier (from template_sparql.py)
    and enriches its output with specificity scores, allowing the RAG pipeline
    to filter schema classes based on query relevance.

    Architecture (Option C - Hybrid):
    ```
    User Question
            ↓
    ┌─────────────────────────────────────┐
    │ EXISTING: TemplateClassifier        │ ← Unchanged
    │ (Classifies to SPARQL template ID)  │
    └─────────────────────────────────────┘
            ↓ sparql_template_id + slots
    ┌─────────────────────────────────────┐
    │ SPARQLToContextMapper               │ ← New (mapper.py)
    │ (Maps SPARQL → Context templates)   │
    └─────────────────────────────────────┘
            ↓ context_template_id
    ┌─────────────────────────────────────┐
    │ SpecificityLookup                   │ ← New (lookup.py)
    │ (Filters classes by score threshold)│
    └─────────────────────────────────────┘
            ↓ filtered_classes + scores
    ┌─────────────────────────────────────┐
    │ RAG Pipeline / UML Visualization    │
    └─────────────────────────────────────┘
    ```

    Usage:
        from backend.rag.template_sparql import TemplateClassifier
        from backend.rag.specificity import SpecificityAwareClassifier

        # Wrap existing classifier
        base_classifier = TemplateClassifier()
        classifier = SpecificityAwareClassifier(base_classifier)

        # Classify with specificity scoring
        result = classifier.classify_with_scores(
            question="Welke archieven zijn er in Amsterdam?",
            threshold=0.6
        )
        print(result.context_template)  # ContextTemplate.ARCHIVE_SEARCH
        print(result.filtered_classes)  # ['Archive', 'CustodianObservation', ...]
    """

    def __init__(
        self,
        template_classifier: Optional["TemplateClassifier"] = None,
        mapper: Optional[SPARQLToContextMapper] = None,
        lookup: Optional[SpecificityLookup] = None,
        default_threshold: float = 0.6,
    ):
        """Initialize the specificity-aware classifier.

        Args:
            template_classifier: Existing TemplateClassifier (lazy-loaded on
                first access of ``template_classifier`` if None)
            mapper: SPARQL to context mapper (uses singleton if None)
            lookup: Specificity score lookup (uses singleton if None)
            default_threshold: Threshold applied when a call does not pass one
        """
        self._template_classifier = template_classifier
        # Test against None explicitly: a caller-supplied collaborator that
        # happens to be falsy (e.g. it defines __len__ and is currently empty)
        # must not be silently swapped for the global singleton.
        self._mapper = (
            mapper if mapper is not None else get_sparql_to_context_mapper()
        )
        self._lookup = (
            lookup if lookup is not None else get_specificity_lookup()
        )
        self.default_threshold = default_threshold

    @property
    def template_classifier(self) -> "TemplateClassifier":
        """Lazy-load TemplateClassifier to avoid circular imports."""
        if self._template_classifier is None:
            # Imported here rather than at module top to avoid an import cycle.
            from backend.rag.template_sparql import TemplateClassifier

            self._template_classifier = TemplateClassifier()
        return self._template_classifier

    def classify_with_scores(
        self,
        question: str,
        language: str = "nl",
        threshold: Optional[float] = None,
        conversation_state: Optional[object] = None,
    ) -> ClassificationWithScores:
        """Classify question and return result with specificity scores.

        This is the main entry point for the RAG pipeline. If the wrapped
        classifier raises, the error is logged and a fallback classification
        with template ID "none" and confidence 0.0 is used instead, so this
        method does not propagate classifier failures.

        Args:
            question: User's natural language question
            language: Language code (nl, en, de, fr)
            threshold: Specificity threshold (uses default if None)
            conversation_state: Optional conversation state for context

        Returns:
            ClassificationWithScores with filtered classes
        """
        threshold = threshold if threshold is not None else self.default_threshold

        # Step 1: Classify using existing TemplateClassifier
        try:
            match_result = self.template_classifier.forward(
                question=question,
                language=language,
                conversation_state=conversation_state,
            )
            classification = ClassificationResult(
                template_id=match_result.template_id,
                confidence=match_result.confidence,
                reasoning=match_result.reasoning,
                slots=match_result.slots or {},
            )
        except Exception as e:
            # Deliberate best-effort boundary: any classifier failure degrades
            # to the "none" template. exc_info keeps the traceback in the log.
            logger.warning(
                "Template classification failed: %s", e, exc_info=True
            )
            # Fallback to "none" template
            classification = ClassificationResult(
                template_id="none",
                confidence=0.0,
                reasoning=f"Classification failed: {e}",
                slots={},
            )

        # Step 2: Map to context template
        context_template = self._mapper.map(
            classification.template_id,
            classification.slots,
        )
        logger.debug(
            "Mapped %s -> %s (slots: %s)",
            classification.template_id,
            context_template.value,
            classification.slots,
        )

        # Step 3: Get filtered classes
        filtered_scores = self._lookup.get_filtered_scores(context_template, threshold)
        filtered_classes = list(filtered_scores.keys())
        logger.info(
            "Specificity filter: %d classes pass threshold %s for %s",
            len(filtered_classes),
            threshold,
            context_template.value,
        )

        # NOTE(review): all_scores receives the threshold-filtered scores, not
        # an unfiltered table - confirm this is what consumers expect.
        return ClassificationWithScores(
            classification=classification,
            context_template=context_template,
            filtered_classes=filtered_classes,
            all_scores=filtered_scores,
            threshold_used=threshold,
        )

    def get_context_template(
        self,
        sparql_template_id: str,
        slots: Optional[dict[str, str]] = None,
    ) -> ContextTemplate:
        """Get context template for a SPARQL template ID.

        Convenience method for direct mapping without full classification.

        Args:
            sparql_template_id: SPARQL template ID
            slots: Optional slots for refinement

        Returns:
            Context template
        """
        return self._mapper.map(sparql_template_id, slots)

    def get_filtered_classes(
        self,
        context_template: ContextTemplate,
        threshold: Optional[float] = None,
    ) -> list[str]:
        """Get filtered classes for a context template.

        Convenience method for direct lookup without classification.

        Args:
            context_template: Context template to filter by
            threshold: Specificity threshold (uses default if None)

        Returns:
            List of class names that pass threshold
        """
        threshold = threshold if threshold is not None else self.default_threshold
        return self._lookup.get_classes_for_template(context_template, threshold)
# Module-wide cached classifier; remains None until first requested.
_classifier_instance: Optional[SpecificityAwareClassifier] = None


def get_specificity_aware_classifier() -> SpecificityAwareClassifier:
    """Return the shared SpecificityAwareClassifier, creating it on first use.

    The instance is built with default constructor arguments and stored in the
    module-level ``_classifier_instance``; subsequent calls return that same
    object.
    """
    global _classifier_instance
    # Guard clause: hand back the cached instance when one already exists.
    if _classifier_instance is not None:
        return _classifier_instance
    _classifier_instance = SpecificityAwareClassifier()
    return _classifier_instance