- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework. - Added detailed mapping of SPARQL templates to context templates for improved specificity filtering. - Implemented wrapper patterns around existing classifiers to extend functionality without duplication. - Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality. - Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
173 lines
5.7 KiB
Python
173 lines
5.7 KiB
Python
"""
Data models for the specificity score system.

This module defines the data models (standard-library dataclasses and an
enum) for:
- Context templates (10 conversation template types)
- Specificity scores for LinkML classes
- Classification results with scores

These models support the RAG pipeline's ability to filter schema classes
based on relevance to user queries.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from typing import Optional
|
|
|
|
|
|
class ContextTemplate(str, Enum):
    """Context templates for specificity scoring.

    These 10 templates represent different types of heritage-related queries.
    Each LinkML class has a specificity score per template, allowing the RAG
    pipeline to filter classes based on query relevance.

    The templates are derived from common SPARQL query patterns in the
    Heritage Custodian RAG system.

    Members mix in ``str``, so each member compares equal to its plain
    string value (e.g. ``ContextTemplate.ARCHIVE_SEARCH == "archive_search"``).
    """

    # Institution-type specific searches
    ARCHIVE_SEARCH = "archive_search"
    MUSEUM_SEARCH = "museum_search"
    LIBRARY_SEARCH = "library_search"

    # Cross-cutting query types
    COLLECTION_DISCOVERY = "collection_discovery"
    PERSON_RESEARCH = "person_research"
    LOCATION_BROWSE = "location_browse"
    IDENTIFIER_LOOKUP = "identifier_lookup"
    ORGANIZATIONAL_CHANGE = "organizational_change"
    DIGITAL_PLATFORM = "digital_platform"

    # Fallback
    GENERAL_HERITAGE = "general_heritage"
|
|
|
|
|
|
@dataclass
class SpecificityScore:
    """Specificity score for a single LinkML class.

    Lower scores indicate broader relevance (more likely to be included).
    Higher scores indicate narrower relevance (filtered out more often).

    Scores are stored as given; this class performs no range validation.

    Attributes:
        class_name: Name of the LinkML class (e.g., "HeritageCustodian")
        base_score: Default specificity score (0.0-1.0)
        template_scores: Optional per-template score overrides
        rationale: Explanation for the assigned scores
    """

    class_name: str
    base_score: float
    # default_factory gives every instance its own dict of overrides.
    template_scores: dict[ContextTemplate, float] = field(default_factory=dict)
    rationale: Optional[str] = None

    def get_score(self, template: ContextTemplate) -> float:
        """Get specificity score for a given template.

        Args:
            template: The context template to get score for

        Returns:
            Template-specific score if defined, otherwise base_score
        """
        return self.template_scores.get(template, self.base_score)

    def passes_threshold(self, template: ContextTemplate, threshold: float) -> bool:
        """Check if class passes specificity threshold for template.

        Lower scores = more broadly relevant = more likely to pass.

        Args:
            template: Context template to check
            threshold: Maximum specificity score to include (e.g., 0.6)

        Returns:
            True if class should be included (score <= threshold)
        """
        # The comparison is inclusive: a score exactly at the threshold passes.
        return self.get_score(template) <= threshold
|
|
|
|
|
|
@dataclass
class ClassificationResult:
    """Result from template classification (from existing TemplateClassifier).

    This mirrors the output from TemplateClassifier in template_sparql.py.

    Attributes:
        template_id: SPARQL template ID
            (e.g., "list_institutions_by_type_city")
        confidence: Confidence value reported for the classification
        reasoning: Free-text reasoning accompanying the classification
        slots: Slot name to slot value mapping; empty by default
    """

    template_id: str  # SPARQL template ID (e.g., "list_institutions_by_type_city")
    confidence: float
    reasoning: str
    # default_factory gives every instance its own slots dict.
    slots: dict[str, str] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class ClassificationWithScores:
    """Extended classification result with specificity scores.

    This is the output of SpecificityAwareClassifier, which wraps
    the existing TemplateClassifier and adds specificity scoring.

    Attributes:
        classification: Original classification from TemplateClassifier
        context_template: Mapped context template for specificity lookup
        filtered_classes: Classes that pass the specificity threshold
        all_scores: Complete score data for all classes
        threshold_used: The specificity threshold that was applied
    """

    classification: ClassificationResult
    context_template: ContextTemplate
    filtered_classes: list[str]
    # default_factory gives every instance its own score mapping.
    all_scores: dict[str, SpecificityScore] = field(default_factory=dict)
    threshold_used: float = 0.6

    @property
    def sparql_template_id(self) -> str:
        """Get the original SPARQL template ID."""
        return self.classification.template_id

    @property
    def slots(self) -> dict[str, str]:
        """Get extracted slots from classification.

        Returns the wrapped classification's own dict, not a copy.
        """
        return self.classification.slots

    def get_filtered_class_names(self) -> list[str]:
        """Get list of class names that passed filtering.

        Returns the ``filtered_classes`` list itself, not a copy.
        """
        return self.filtered_classes
|
|
|
|
|
|
# Institution type to context template refinement mapping.
# Keys are upper-case strings: each mapped institution type appears both as
# a single-letter code and as its full name. Types not listed here have no
# entry in this mapping.
INSTITUTION_TYPE_TO_CONTEXT: dict[str, ContextTemplate] = {
    # Archives
    "A": ContextTemplate.ARCHIVE_SEARCH,
    "ARCHIVE": ContextTemplate.ARCHIVE_SEARCH,

    # Museums
    "M": ContextTemplate.MUSEUM_SEARCH,
    "MUSEUM": ContextTemplate.MUSEUM_SEARCH,
    "G": ContextTemplate.MUSEUM_SEARCH,  # Gallery → museum context
    "GALLERY": ContextTemplate.MUSEUM_SEARCH,

    # Libraries
    "L": ContextTemplate.LIBRARY_SEARCH,
    "LIBRARY": ContextTemplate.LIBRARY_SEARCH,

    # Research centers (often have library-like collections)
    "R": ContextTemplate.LIBRARY_SEARCH,
    "RESEARCH_CENTER": ContextTemplate.LIBRARY_SEARCH,

    # Digital platforms
    "D": ContextTemplate.DIGITAL_PLATFORM,
    "DIGITAL_PLATFORM": ContextTemplate.DIGITAL_PLATFORM,

    # Education providers (mixed, use general)
    "E": ContextTemplate.GENERAL_HERITAGE,
    "EDUCATION_PROVIDER": ContextTemplate.GENERAL_HERITAGE,

    # Holy sites (often have archival collections)
    "H": ContextTemplate.ARCHIVE_SEARCH,
    "HOLY_SACRED_SITE": ContextTemplate.ARCHIVE_SEARCH,
}
|