- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework. - Added detailed mapping of SPARQL templates to context templates for improved specificity filtering. - Implemented wrapper patterns around existing classifiers to extend functionality without duplication. - Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality. - Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
233 lines
8.9 KiB
Python
233 lines
8.9 KiB
Python
"""
|
|
Specificity-aware classifier wrapper.

This module provides SpecificityAwareClassifier, which wraps the existing
TemplateClassifier and adds specificity scoring to the output.

The wrapper:
1. Delegates classification to the existing TemplateClassifier
2. Maps SPARQL template → context template via SPARQLToContextMapper
3. Looks up specificity scores via SpecificityLookup
4. Returns ClassificationWithScores with filtered classes

This follows the Option C (Hybrid) architecture: the existing classifier
is preserved unchanged, and we add a thin wrapper layer.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import Any, Optional
|
|
|
|
from .models import (
|
|
ClassificationResult,
|
|
ClassificationWithScores,
|
|
ContextTemplate,
|
|
)
|
|
from .mapper import get_sparql_to_context_mapper, SPARQLToContextMapper
|
|
from .lookup import get_specificity_lookup, SpecificityLookup
|
|
|
|
|
|
# Module-level logger, named after this module's import path (``__name__``).
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SpecificityAwareClassifier:
    """Wrapper that adds specificity scoring to TemplateClassifier.

    This class wraps the existing TemplateClassifier (imported lazily from
    ``backend.rag.template_sparql``) and enriches its output with specificity
    scores, allowing the RAG pipeline to filter schema classes based on
    query relevance.

    Architecture (Option C - Hybrid):
        ```
        User Question
              ↓
        ┌─────────────────────────────────────┐
        │ EXISTING: TemplateClassifier        │ ← Unchanged
        │ (Classifies to SPARQL template ID)  │
        └─────────────────────────────────────┘
              ↓ sparql_template_id + slots
        ┌─────────────────────────────────────┐
        │ SPARQLToContextMapper               │ ← New (this module)
        │ (Maps SPARQL → Context templates)   │
        └─────────────────────────────────────┘
              ↓ context_template_id
        ┌─────────────────────────────────────┐
        │ SpecificityLookup                   │ ← New (lookup.py)
        │ (Filters classes by score threshold)│
        └─────────────────────────────────────┘
              ↓ filtered_classes + scores
        ┌─────────────────────────────────────┐
        │ RAG Pipeline / UML Visualization    │
        └─────────────────────────────────────┘
        ```

    Usage:
        from backend.rag.template_sparql import TemplateClassifier
        from backend.rag.specificity import SpecificityAwareClassifier

        # Wrap existing classifier
        base_classifier = TemplateClassifier()
        classifier = SpecificityAwareClassifier(base_classifier)

        # Classify with specificity scoring
        result = classifier.classify_with_scores(
            question="Welke archieven zijn er in Amsterdam?",
            threshold=0.6
        )

        print(result.context_template)  # ContextTemplate.ARCHIVE_SEARCH
        print(result.filtered_classes)  # ['Archive', 'CustodianObservation', ...]
    """

    def __init__(
        self,
        template_classifier: Optional["TemplateClassifier"] = None,
        mapper: Optional[SPARQLToContextMapper] = None,
        lookup: Optional[SpecificityLookup] = None,
        default_threshold: float = 0.6,
    ):
        """Initialize the specificity-aware classifier.

        Args:
            template_classifier: Existing TemplateClassifier (lazy-loaded on
                first access of ``template_classifier`` if None)
            mapper: SPARQL to context mapper (uses singleton if None)
            lookup: Specificity score lookup (uses singleton if None)
            default_threshold: Threshold applied when a call passes
                ``threshold=None``
        """
        self._template_classifier = template_classifier
        # Compare against None explicitly: a truthiness test (``x or default``)
        # would discard an injected collaborator that happens to be falsy
        # (e.g. one defining __len__/__bool__) and silently use the singleton.
        self._mapper = (
            mapper if mapper is not None else get_sparql_to_context_mapper()
        )
        self._lookup = (
            lookup if lookup is not None else get_specificity_lookup()
        )
        self.default_threshold = default_threshold

    @property
    def template_classifier(self) -> "TemplateClassifier":
        """Return the wrapped TemplateClassifier, creating it on first use.

        The import is deferred to call time (the original author notes this
        avoids circular imports); the created instance is cached on ``self``.
        """
        if self._template_classifier is None:
            from backend.rag.template_sparql import TemplateClassifier

            self._template_classifier = TemplateClassifier()
        return self._template_classifier

    def _resolve_threshold(self, threshold: Optional[float]) -> float:
        """Return ``threshold``, or ``self.default_threshold`` when it is None.

        An explicit ``0.0`` is kept as-is; only ``None`` triggers the default.
        """
        return self.default_threshold if threshold is None else threshold

    def classify_with_scores(
        self,
        question: str,
        language: str = "nl",
        threshold: Optional[float] = None,
        conversation_state: Optional[object] = None,
    ) -> ClassificationWithScores:
        """Classify question and return result with specificity scores.

        This is the main entry point for the RAG pipeline.

        Any exception raised while classifying is caught, logged as a warning
        (with traceback) and replaced by a fallback classification with
        ``template_id="none"`` and ``confidence=0.0``; mapping and lookup
        then proceed on that fallback.

        Args:
            question: User's natural language question
            language: Language code (nl, en, de, fr)
            threshold: Specificity threshold (uses default if None)
            conversation_state: Optional conversation state for context

        Returns:
            ClassificationWithScores with filtered classes
        """
        threshold = self._resolve_threshold(threshold)

        # Step 1: Classify using existing TemplateClassifier
        try:
            match_result = self.template_classifier.forward(
                question=question,
                language=language,
                conversation_state=conversation_state,
            )

            classification = ClassificationResult(
                template_id=match_result.template_id,
                confidence=match_result.confidence,
                reasoning=match_result.reasoning,
                slots=match_result.slots or {},
            )

        except Exception as e:
            # Deliberate best-effort: never fail the pipeline here. Keep the
            # traceback in the log so the root cause is not reduced to str(e).
            logger.warning(
                "Template classification failed: %s", e, exc_info=True
            )
            # Fallback to "none" template
            classification = ClassificationResult(
                template_id="none",
                confidence=0.0,
                reasoning=f"Classification failed: {e}",
                slots={},
            )

        # Step 2: Map to context template
        context_template = self._mapper.map(
            classification.template_id,
            classification.slots,
        )

        logger.debug(
            "Mapped %s → %s (slots: %s)",
            classification.template_id,
            context_template.value,
            classification.slots,
        )

        # Step 3: Get filtered classes
        filtered_scores = self._lookup.get_filtered_scores(context_template, threshold)
        filtered_classes = list(filtered_scores.keys())

        logger.info(
            "Specificity filter: %s classes pass threshold %s for %s",
            len(filtered_classes),
            threshold,
            context_template.value,
        )

        # NOTE(review): ``all_scores`` receives the *filtered* scores, not an
        # unfiltered score table — confirm this is the intended contract.
        return ClassificationWithScores(
            classification=classification,
            context_template=context_template,
            filtered_classes=filtered_classes,
            all_scores=filtered_scores,
            threshold_used=threshold,
        )

    def get_context_template(
        self,
        sparql_template_id: str,
        slots: Optional[dict[str, str]] = None,
    ) -> ContextTemplate:
        """Get context template for a SPARQL template ID.

        Convenience method for direct mapping without full classification;
        delegates to the mapper's ``map`` method.

        Args:
            sparql_template_id: SPARQL template ID
            slots: Optional slots for refinement (passed through unchanged,
                including None)

        Returns:
            Context template
        """
        return self._mapper.map(sparql_template_id, slots)

    def get_filtered_classes(
        self,
        context_template: ContextTemplate,
        threshold: Optional[float] = None,
    ) -> list[str]:
        """Get filtered classes for a context template.

        Convenience method for direct lookup without classification;
        delegates to the lookup's ``get_classes_for_template`` method.

        Args:
            context_template: Context template to filter by
            threshold: Specificity threshold (uses default if None)

        Returns:
            List of class names that pass threshold
        """
        return self._lookup.get_classes_for_template(
            context_template, self._resolve_threshold(threshold)
        )
|
|
|
|
|
|
# Process-wide cached classifier; stays None until the accessor below is
# called for the first time.
_classifier_instance: Optional[SpecificityAwareClassifier] = None


def get_specificity_aware_classifier() -> SpecificityAwareClassifier:
    """Return the shared SpecificityAwareClassifier, creating it on first call.

    The instance is built with the constructor's default arguments and stored
    in the module-level ``_classifier_instance`` so later calls reuse it.
    """
    global _classifier_instance
    if _classifier_instance is not None:
        return _classifier_instance
    _classifier_instance = SpecificityAwareClassifier()
    return _classifier_instance
|