# glam/backend/rag/specificity/models.py

"""
Dataclass models for the specificity score system.

This module defines the data models for:
- Context templates (10 conversation template types)
- Specificity scores for LinkML classes
- Classification results with scores

These models support the RAG pipeline's ability to filter schema classes
based on relevance to user queries.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import Optional


class ContextTemplate(str, Enum):
    """Closed set of query contexts used to look up specificity scores.

    Ten members are defined: three aimed at one institution type, six that
    cut across institution types, and a single general-purpose fallback.
    A LinkML class can carry a separate specificity score for each member,
    which lets the RAG pipeline decide per query context which classes are
    relevant enough to keep.

    The members are derived from common SPARQL query patterns in the
    Heritage Custodian RAG system. Because the enum mixes in ``str``, each
    member compares equal to its lowercase string value.
    """

    # --- Searches scoped to one kind of institution ---
    ARCHIVE_SEARCH = "archive_search"
    MUSEUM_SEARCH = "museum_search"
    LIBRARY_SEARCH = "library_search"

    # --- Query types that apply across institution kinds ---
    COLLECTION_DISCOVERY = "collection_discovery"
    PERSON_RESEARCH = "person_research"
    LOCATION_BROWSE = "location_browse"
    IDENTIFIER_LOOKUP = "identifier_lookup"
    ORGANIZATIONAL_CHANGE = "organizational_change"
    DIGITAL_PLATFORM = "digital_platform"

    # --- Catch-all when nothing more specific applies ---
    GENERAL_HERITAGE = "general_heritage"


@dataclass
class SpecificityScore:
    """How narrowly one LinkML class applies, overall and per template.

    A low score marks the class as broadly relevant, so it survives
    filtering more often. A high score marks it as niche, so it is
    dropped more often.

    Attributes:
        class_name: LinkML class being scored (e.g., "HeritageCustodian")
        base_score: Fallback score used when a template has no override
            (intended range 0.0-1.0; the range is not enforced here)
        template_scores: Per-template overrides; empty unless supplied
        rationale: Optional free-text justification for the scores
    """

    class_name: str
    base_score: float
    template_scores: dict[ContextTemplate, float] = field(default_factory=dict)
    rationale: Optional[str] = None

    def get_score(self, template: ContextTemplate) -> float:
        """Resolve the score that applies under the given template.

        Args:
            template: Context template whose score is wanted

        Returns:
            The override stored for ``template`` when one exists,
            otherwise ``base_score``
        """
        overrides = self.template_scores
        if template in overrides:
            return overrides[template]
        return self.base_score

    def passes_threshold(self, template: ContextTemplate, threshold: float) -> bool:
        """Decide whether the class is kept for a template at a threshold.

        The comparison is inclusive: a score exactly equal to the
        threshold still passes.

        Args:
            template: Context template to evaluate against
            threshold: Highest specificity score still included (e.g., 0.6)

        Returns:
            True when the resolved score is at or below ``threshold``
        """
        effective_score = self.get_score(template)
        return effective_score <= threshold


@dataclass
class ClassificationResult:
    """Outcome of classifying a query against the SPARQL templates.

    The shape follows the output of the existing TemplateClassifier in
    template_sparql.py.

    Attributes:
        template_id: SPARQL template ID
            (e.g., "list_institutions_by_type_city")
        confidence: Confidence value attached to the classification
        reasoning: Text explaining the classification
        slots: Slot name to slot value mapping; empty unless supplied
    """

    template_id: str
    confidence: float
    reasoning: str
    slots: dict[str, str] = field(default_factory=dict)


@dataclass
class ClassificationWithScores:
    """Template classification bundled with specificity filtering data.

    SpecificityAwareClassifier produces this: it wraps the existing
    TemplateClassifier and layers specificity scoring on top of its result.

    Attributes:
        classification: Unmodified result from TemplateClassifier
        context_template: Context template chosen for specificity lookup
        filtered_classes: Names of classes that passed the threshold
        all_scores: Score data for every class, keyed by class name;
            empty unless supplied
        threshold_used: Specificity threshold that was applied
            (defaults to 0.6)
    """

    classification: ClassificationResult
    context_template: ContextTemplate
    filtered_classes: list[str]
    all_scores: dict[str, SpecificityScore] = field(default_factory=dict)
    threshold_used: float = 0.6

    @property
    def sparql_template_id(self) -> str:
        """SPARQL template ID taken from the wrapped classification."""
        wrapped = self.classification
        return wrapped.template_id

    @property
    def slots(self) -> dict[str, str]:
        """Slots taken from the wrapped classification (same dict object)."""
        wrapped = self.classification
        return wrapped.slots

    def get_filtered_class_names(self) -> list[str]:
        """Return the names of classes that passed filtering.

        The stored ``filtered_classes`` list itself is returned, not a copy.
        """
        passed = self.filtered_classes
        return passed


# Refines an institution type into the context template used for specificity
# lookup. Every institution type listed here is registered under two keys:
# its one-letter code and its spelled-out upper-case name. Groups are listed
# in the order the keys should appear in the resulting dict.
INSTITUTION_TYPE_TO_CONTEXT: dict[str, ContextTemplate] = {
    type_key: context
    for context, type_keys in (
        # Archives
        (ContextTemplate.ARCHIVE_SEARCH, ("A", "ARCHIVE")),
        # Museums; galleries are folded into the museum context
        (ContextTemplate.MUSEUM_SEARCH, ("M", "MUSEUM", "G", "GALLERY")),
        # Libraries; research centers often have library-like collections
        (
            ContextTemplate.LIBRARY_SEARCH,
            ("L", "LIBRARY", "R", "RESEARCH_CENTER"),
        ),
        # Digital platforms
        (ContextTemplate.DIGITAL_PLATFORM, ("D", "DIGITAL_PLATFORM")),
        # Education providers are mixed, so they use the general fallback
        (ContextTemplate.GENERAL_HERITAGE, ("E", "EDUCATION_PROVIDER")),
        # Holy sites often have archival collections
        (ContextTemplate.ARCHIVE_SEARCH, ("H", "HOLY_SACRED_SITE")),
    )
    for type_key in type_keys
}