"""Data models for the specificity score system.

The models in this module are plain standard-library dataclasses plus one
string-valued enum. They cover:

- Context templates (10 conversation template types)
- Specificity scores for LinkML classes
- Classification results with scores

These models support the RAG pipeline's ability to filter schema classes
based on relevance to user queries.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import Optional


class ContextTemplate(str, Enum):
    """Context templates for specificity scoring.

    These 10 templates represent different types of heritage-related queries.
    Each LinkML class has a specificity score per template, allowing the RAG
    pipeline to filter classes based on query relevance.

    The templates are derived from common SPARQL query patterns in the
    Heritage Custodian RAG system.

    Because the enum also subclasses ``str``, every member compares equal to
    (and hashes like) its string value.
    """

    # Institution-type specific searches
    ARCHIVE_SEARCH = "archive_search"
    MUSEUM_SEARCH = "museum_search"
    LIBRARY_SEARCH = "library_search"

    # Cross-cutting query types
    COLLECTION_DISCOVERY = "collection_discovery"
    PERSON_RESEARCH = "person_research"
    LOCATION_BROWSE = "location_browse"
    IDENTIFIER_LOOKUP = "identifier_lookup"
    ORGANIZATIONAL_CHANGE = "organizational_change"
    DIGITAL_PLATFORM = "digital_platform"

    # Fallback
    GENERAL_HERITAGE = "general_heritage"


@dataclass
class SpecificityScore:
    """Specificity score for a single LinkML class.

    Lower scores indicate broader relevance (more likely to be included).
    Higher scores indicate narrower relevance (filtered out more often).

    Attributes:
        class_name: Name of the LinkML class (e.g., "HeritageCustodian")
        base_score: Default specificity score (0.0-1.0)
        template_scores: Optional per-template score overrides
        rationale: Explanation for the assigned scores
    """

    class_name: str
    base_score: float
    # default_factory gives every instance its own dict of overrides.
    template_scores: dict[ContextTemplate, float] = field(default_factory=dict)
    rationale: Optional[str] = None

    def get_score(self, template: ContextTemplate) -> float:
        """Get specificity score for a given template.

        Args:
            template: The context template to get score for

        Returns:
            Template-specific score if defined, otherwise base_score
        """
        return self.template_scores.get(template, self.base_score)

    def passes_threshold(self, template: ContextTemplate, threshold: float) -> bool:
        """Check if class passes specificity threshold for template.

        Lower scores = more broadly relevant = more likely to pass.

        Args:
            template: Context template to check
            threshold: Maximum specificity score to include (e.g., 0.6)

        Returns:
            True if class should be included (score <= threshold)
        """
        # The comparison is inclusive: a score equal to the threshold passes.
        return self.get_score(template) <= threshold


@dataclass
class ClassificationResult:
    """Result from template classification (from existing TemplateClassifier).

    This mirrors the output from TemplateClassifier in template_sparql.py.
    """

    template_id: str  # SPARQL template ID (e.g., "list_institutions_by_type_city")
    confidence: float
    reasoning: str
    slots: dict[str, str] = field(default_factory=dict)


@dataclass
class ClassificationWithScores:
    """Extended classification result with specificity scores.

    This is the output of SpecificityAwareClassifier, which wraps the existing
    TemplateClassifier and adds specificity scoring.

    Attributes:
        classification: Original classification from TemplateClassifier
        context_template: Mapped context template for specificity lookup
        filtered_classes: Classes that pass the specificity threshold
        all_scores: Complete score data for all classes
        threshold_used: The specificity threshold that was applied
    """

    classification: ClassificationResult
    context_template: ContextTemplate
    filtered_classes: list[str]
    all_scores: dict[str, SpecificityScore] = field(default_factory=dict)
    threshold_used: float = 0.6

    @property
    def sparql_template_id(self) -> str:
        """Get the original SPARQL template ID."""
        return self.classification.template_id

    @property
    def slots(self) -> dict[str, str]:
        """Get extracted slots from classification."""
        return self.classification.slots

    def get_filtered_class_names(self) -> list[str]:
        """Get list of class names that passed filtering.

        Returns:
            The ``filtered_classes`` list itself (not a copy).
        """
        return self.filtered_classes


# Institution type to context template refinement mapping.
# Keys are upper-case type codes and type names; a plain dict lookup is
# case-sensitive.
INSTITUTION_TYPE_TO_CONTEXT: dict[str, ContextTemplate] = {
    # Archives
    "A": ContextTemplate.ARCHIVE_SEARCH,
    "ARCHIVE": ContextTemplate.ARCHIVE_SEARCH,
    # Museums
    "M": ContextTemplate.MUSEUM_SEARCH,
    "MUSEUM": ContextTemplate.MUSEUM_SEARCH,
    "G": ContextTemplate.MUSEUM_SEARCH,  # Gallery → museum context
    "GALLERY": ContextTemplate.MUSEUM_SEARCH,
    # Libraries
    "L": ContextTemplate.LIBRARY_SEARCH,
    "LIBRARY": ContextTemplate.LIBRARY_SEARCH,
    # Research centers (often have library-like collections)
    "R": ContextTemplate.LIBRARY_SEARCH,
    "RESEARCH_CENTER": ContextTemplate.LIBRARY_SEARCH,
    # Digital platforms
    "D": ContextTemplate.DIGITAL_PLATFORM,
    "DIGITAL_PLATFORM": ContextTemplate.DIGITAL_PLATFORM,
    # Education providers (mixed, use general)
    "E": ContextTemplate.GENERAL_HERITAGE,
    "EDUCATION_PROVIDER": ContextTemplate.GENERAL_HERITAGE,
    # Holy sites (often have archival collections)
    "H": ContextTemplate.ARCHIVE_SEARCH,
    "HOLY_SACRED_SITE": ContextTemplate.ARCHIVE_SEARCH,
}