""" Specificity score lookup from LinkML schema annotations. This module reads specificity scores from LinkML class YAML files and provides filtered class lists based on context templates. Scores are read from the `annotations` section of each class file: ```yaml classes: ClassName: annotations: specificity_score: 0.75 specificity_rationale: "Fairly specific to archival contexts" template_specificity: archive_search: 0.95 museum_search: 0.20 ``` """ from __future__ import annotations import logging from functools import lru_cache from pathlib import Path from typing import Any, Optional import yaml from .models import ContextTemplate, SpecificityScore logger = logging.getLogger(__name__) # Default schema directory SCHEMA_BASE_DIR = Path(__file__).parent.parent.parent.parent / "schemas" / "20251121" / "linkml" class SpecificityLookup: """Looks up specificity scores from LinkML schema annotations. This class: 1. Scans LinkML class YAML files for specificity annotations 2. Caches scores in memory for fast lookup 3. Provides filtering by context template and threshold Usage: lookup = SpecificityLookup() # Get all classes that pass threshold for a template classes = lookup.get_classes_for_template( ContextTemplate.ARCHIVE_SEARCH, threshold=0.6 ) # Get score for a specific class score = lookup.get_score("HeritageCustodian") """ def __init__( self, schema_dir: Optional[Path] = None, default_score: float = 0.5, ): """Initialize the lookup. Args: schema_dir: Path to LinkML schema directory default_score: Default score for classes without annotations """ self.schema_dir = schema_dir or SCHEMA_BASE_DIR self.default_score = default_score self._scores: Optional[dict[str, SpecificityScore]] = None def _load_scores(self) -> dict[str, SpecificityScore]: """Load specificity scores from all class YAML files. Returns: Dictionary mapping class name to SpecificityScore """ if self._scores is not None: return self._scores self._scores = {} classes_dir = self.schema_dir / "modules" / "classes" if not classes_dir.exists(): logger.warning(f"Classes directory not found: {classes_dir}") return self._scores # Scan all YAML files in classes directory yaml_files = list(classes_dir.glob("*.yaml")) logger.info(f"Scanning {len(yaml_files)} class files for specificity annotations") for yaml_file in yaml_files: try: with open(yaml_file, "r", encoding="utf-8") as f: data = yaml.safe_load(f) if not data: continue classes = data.get("classes", {}) for class_name, class_def in classes.items(): if not class_def: continue score = self._parse_score_from_class(class_name, class_def) if score: self._scores[class_name] = score except Exception as e: logger.warning(f"Error reading {yaml_file}: {e}") logger.info(f"Loaded specificity scores for {len(self._scores)} classes") return self._scores def _parse_score_from_class( self, class_name: str, class_def: dict[str, Any] ) -> Optional[SpecificityScore]: """Parse specificity score from class definition. Args: class_name: Name of the class class_def: Class definition from YAML Returns: SpecificityScore if annotations found, None otherwise """ annotations = class_def.get("annotations", {}) # Check for specificity_score annotation base_score = annotations.get("specificity_score") if base_score is None: # No annotation - return default score return SpecificityScore( class_name=class_name, base_score=self.default_score, rationale="No specificity annotation (using default)", ) # Parse base score try: base_score_float = float(base_score) except (ValueError, TypeError): logger.warning(f"Invalid specificity_score for {class_name}: {base_score}") return None # Parse template-specific scores template_scores: dict[ContextTemplate, float] = {} template_specificity = annotations.get("template_specificity", {}) if isinstance(template_specificity, dict): for template_name, score in template_specificity.items(): try: template = ContextTemplate(template_name) template_scores[template] = float(score) except (ValueError, KeyError): logger.warning( f"Invalid template specificity for {class_name}: " f"{template_name}={score}" ) # Get rationale rationale = annotations.get("specificity_rationale") return SpecificityScore( class_name=class_name, base_score=base_score_float, template_scores=template_scores, rationale=rationale, ) def get_score(self, class_name: str) -> SpecificityScore: """Get specificity score for a class. Args: class_name: Name of the LinkML class Returns: SpecificityScore (default if not found) """ scores = self._load_scores() return scores.get( class_name, SpecificityScore( class_name=class_name, base_score=self.default_score, rationale="Class not found in schema", ) ) def get_all_scores(self) -> dict[str, SpecificityScore]: """Get all loaded specificity scores.""" return self._load_scores().copy() def get_classes_for_template( self, template: ContextTemplate, threshold: float = 0.6, ) -> list[str]: """Get classes that pass specificity threshold for a template. Lower scores = more broadly relevant = more likely to be included. Args: template: Context template to filter by threshold: Maximum specificity score to include Returns: List of class names that pass the threshold """ scores = self._load_scores() passing_classes = [ class_name for class_name, score in scores.items() if score.passes_threshold(template, threshold) ] logger.debug( f"Template {template.value}: {len(passing_classes)}/{len(scores)} " f"classes pass threshold {threshold}" ) return sorted(passing_classes) def get_filtered_scores( self, template: ContextTemplate, threshold: float = 0.6, ) -> dict[str, SpecificityScore]: """Get scores for classes that pass threshold. Args: template: Context template to filter by threshold: Maximum specificity score to include Returns: Dictionary of class name → SpecificityScore for passing classes """ scores = self._load_scores() return { class_name: score for class_name, score in scores.items() if score.passes_threshold(template, threshold) } def reload(self) -> None: """Force reload of scores from disk.""" self._scores = None self._load_scores() # Singleton instance _lookup_instance: Optional[SpecificityLookup] = None def get_specificity_lookup() -> SpecificityLookup: """Get singleton lookup instance.""" global _lookup_instance if _lookup_instance is None: _lookup_instance = SpecificityLookup() return _lookup_instance @lru_cache(maxsize=128) def get_classes_for_template_cached( template: str, threshold: float = 0.6, ) -> tuple[str, ...]: """Cached version of get_classes_for_template. Args: template: Template name (string for hashability) threshold: Specificity threshold Returns: Tuple of class names (tuple for hashability) """ lookup = get_specificity_lookup() try: context_template = ContextTemplate(template) except ValueError: context_template = ContextTemplate.GENERAL_HERITAGE return tuple(lookup.get_classes_for_template(context_template, threshold))