glam/backend/rag/specificity/models.py
kempersc 11983014bb Enhance specificity scoring system integration with existing infrastructure
- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework.
- Added detailed mapping of SPARQL templates to context templates for improved specificity filtering.
- Implemented wrapper patterns around existing classifiers to extend functionality without duplication.
- Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality.
- Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
2026-01-05 17:37:49 +01:00

173 lines
5.7 KiB
Python

"""
Dataclass-based models for the specificity score system.
This module defines the data models for:
- Context templates (10 conversation template types)
- Specificity scores for LinkML classes
- Classification results with scores
These models support the RAG pipeline's ability to filter schema classes
based on relevance to user queries.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
class ContextTemplate(str, Enum):
    """Context templates used for specificity scoring.

    Each of the 10 members names one kind of heritage-related query.
    A LinkML class carries a specificity score per template, which lets the
    RAG pipeline keep only the classes relevant to the query at hand.

    The templates are derived from common SPARQL query patterns in the
    Heritage Custodian RAG system.

    Members also subclass ``str``, so each one compares equal to its
    string value (e.g. ``ContextTemplate.ARCHIVE_SEARCH == "archive_search"``).
    """

    # Institution-type specific searches
    ARCHIVE_SEARCH = "archive_search"
    MUSEUM_SEARCH = "museum_search"
    LIBRARY_SEARCH = "library_search"

    # Cross-cutting query types
    COLLECTION_DISCOVERY = "collection_discovery"
    PERSON_RESEARCH = "person_research"
    LOCATION_BROWSE = "location_browse"
    IDENTIFIER_LOOKUP = "identifier_lookup"
    ORGANIZATIONAL_CHANGE = "organizational_change"
    DIGITAL_PLATFORM = "digital_platform"

    # Fallback
    GENERAL_HERITAGE = "general_heritage"
@dataclass
class SpecificityScore:
    """Specificity score for a single LinkML class.

    Lower scores indicate broader relevance (more likely to be included).
    Higher scores indicate narrower relevance (filtered out more often).

    Attributes:
        class_name: Name of the LinkML class (e.g., "HeritageCustodian").
        base_score: Default specificity score. The documented range is
            0.0-1.0; the range is not enforced by this class.
        template_scores: Optional per-template overrides of ``base_score``.
            Defaults to a fresh empty dict per instance.
        rationale: Optional explanation for the assigned scores.
    """

    class_name: str
    base_score: float
    template_scores: dict[ContextTemplate, float] = field(default_factory=dict)
    rationale: Optional[str] = None

    def get_score(self, template: ContextTemplate) -> float:
        """Return the specificity score that applies to ``template``.

        Args:
            template: The context template to get the score for.

        Returns:
            The override stored in ``template_scores`` when one exists for
            ``template``, otherwise ``base_score``.
        """
        return self.template_scores.get(template, self.base_score)

    def passes_threshold(self, template: ContextTemplate, threshold: float) -> bool:
        """Check whether the class passes the specificity threshold.

        Lower scores = more broadly relevant = more likely to pass.

        Args:
            template: Context template to check.
            threshold: Maximum specificity score to include (e.g., 0.6).

        Returns:
            True if the class should be included, i.e. its score for
            ``template`` is less than or equal to ``threshold``.
        """
        return self.get_score(template) <= threshold
@dataclass
class ClassificationResult:
    """Result from template classification (from existing TemplateClassifier).

    This mirrors the output from TemplateClassifier in template_sparql.py.

    Attributes:
        template_id: SPARQL template ID
            (e.g., "list_institutions_by_type_city").
        confidence: Confidence value reported for the classification.
        reasoning: Free-text reasoning accompanying the classification.
        slots: Slot name to slot value mapping. Defaults to a fresh empty
            dict per instance.
    """

    template_id: str
    confidence: float
    reasoning: str
    slots: dict[str, str] = field(default_factory=dict)
@dataclass
class ClassificationWithScores:
    """Extended classification result with specificity scores.

    This is the output of SpecificityAwareClassifier, which wraps
    the existing TemplateClassifier and adds specificity scoring.

    Attributes:
        classification: Original classification from TemplateClassifier.
        context_template: Mapped context template for specificity lookup.
        filtered_classes: Classes that pass the specificity threshold.
        all_scores: Complete score data for all classes, keyed by class
            name. Defaults to a fresh empty dict per instance.
        threshold_used: The specificity threshold that was applied
            (defaults to 0.6).
    """

    classification: ClassificationResult
    context_template: ContextTemplate
    filtered_classes: list[str]
    all_scores: dict[str, SpecificityScore] = field(default_factory=dict)
    threshold_used: float = 0.6

    @property
    def sparql_template_id(self) -> str:
        """Get the original SPARQL template ID from the wrapped classification."""
        return self.classification.template_id

    @property
    def slots(self) -> dict[str, str]:
        """Get the extracted slots from the wrapped classification."""
        return self.classification.slots

    def get_filtered_class_names(self) -> list[str]:
        """Get the list of class names that passed filtering.

        Returns:
            The ``filtered_classes`` list itself (not a copy).
        """
        return self.filtered_classes
# Institution type to context template refinement mapping.
# Every institution type is listed under two aliases (single-letter code and
# full name); both aliases of a type resolve to the same context template.
INSTITUTION_TYPE_TO_CONTEXT: dict[str, ContextTemplate] = {
    institution_type: context
    for type_aliases, context in (
        # Archives
        (("A", "ARCHIVE"), ContextTemplate.ARCHIVE_SEARCH),
        # Museums
        (("M", "MUSEUM"), ContextTemplate.MUSEUM_SEARCH),
        # Galleries reuse the museum context
        (("G", "GALLERY"), ContextTemplate.MUSEUM_SEARCH),
        # Libraries
        (("L", "LIBRARY"), ContextTemplate.LIBRARY_SEARCH),
        # Research centers (often have library-like collections)
        (("R", "RESEARCH_CENTER"), ContextTemplate.LIBRARY_SEARCH),
        # Digital platforms
        (("D", "DIGITAL_PLATFORM"), ContextTemplate.DIGITAL_PLATFORM),
        # Education providers (mixed, use general)
        (("E", "EDUCATION_PROVIDER"), ContextTemplate.GENERAL_HERITAGE),
        # Holy sites (often have archival collections)
        (("H", "HOLY_SACRED_SITE"), ContextTemplate.ARCHIVE_SEARCH),
    )
    for institution_type in type_aliases
}