glam/backend/rag/specificity/lookup.py
kempersc 11983014bb Enhance specificity scoring system integration with existing infrastructure
- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework.
- Added detailed mapping of SPARQL templates to context templates for improved specificity filtering.
- Implemented wrapper patterns around existing classifiers to extend functionality without duplication.
- Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality.
- Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
2026-01-05 17:37:49 +01:00

289 lines
9.1 KiB
Python

"""
Specificity score lookup from LinkML schema annotations.

This module reads specificity scores from LinkML class YAML files
and provides filtered class lists based on context templates.

Scores are read from the `annotations` section of each class file:

```yaml
classes:
  ClassName:
    annotations:
      specificity_score: 0.75
      specificity_rationale: "Fairly specific to archival contexts"
      template_specificity:
        archive_search: 0.95
        museum_search: 0.20
```
"""
from __future__ import annotations
import logging
from functools import lru_cache
from pathlib import Path
from typing import Any, Optional
import yaml
from .models import ContextTemplate, SpecificityScore
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Default LinkML schema location: four directory levels above this file,
# then schemas/20251121/linkml.
SCHEMA_BASE_DIR = Path(__file__).parent.parent.parent.parent.joinpath(
    "schemas", "20251121", "linkml"
)
class SpecificityLookup:
    """Looks up specificity scores from LinkML schema annotations.

    This class:
    1. Scans LinkML class YAML files for specificity annotations
    2. Caches scores in memory for fast lookup
    3. Provides filtering by context template and threshold

    Loading is best-effort: an unreadable file or a malformed class
    definition is logged and skipped, it never aborts the scan.

    Usage:
        lookup = SpecificityLookup()

        # Get all classes that pass threshold for a template
        classes = lookup.get_classes_for_template(
            ContextTemplate.ARCHIVE_SEARCH,
            threshold=0.6
        )

        # Get score for a specific class
        score = lookup.get_score("HeritageCustodian")
    """

    def __init__(
        self,
        schema_dir: Optional[Path] = None,
        default_score: float = 0.5,
    ):
        """Initialize the lookup.

        Nothing is read from disk here; scores are loaded lazily on the
        first query.

        Args:
            schema_dir: Path to LinkML schema directory (falls back to
                ``SCHEMA_BASE_DIR`` when omitted)
            default_score: Default score for classes without annotations
        """
        self.schema_dir = schema_dir or SCHEMA_BASE_DIR
        self.default_score = default_score
        # None means "not loaded yet"; an empty dict means "loaded, nothing found".
        self._scores: Optional[dict[str, SpecificityScore]] = None

    def _load_scores(self) -> dict[str, SpecificityScore]:
        """Load specificity scores from all class YAML files.

        The result is cached on the instance; later calls return the cached
        dictionary without touching the filesystem.

        Returns:
            Dictionary mapping class name to SpecificityScore
        """
        if self._scores is not None:
            return self._scores

        scores: dict[str, SpecificityScore] = {}
        self._scores = scores

        classes_dir = self.schema_dir / "modules" / "classes"
        if not classes_dir.exists():
            logger.warning("Classes directory not found: %s", classes_dir)
            return scores

        # glob() yields files in arbitrary order; sorting makes the winner
        # deterministic when the same class name appears in several files.
        yaml_files = sorted(classes_dir.glob("*.yaml"))
        logger.info(
            "Scanning %d class files for specificity annotations", len(yaml_files)
        )

        for yaml_file in yaml_files:
            class_defs = self._read_class_definitions(yaml_file)
            for class_name, class_def in class_defs.items():
                if not class_def:
                    continue
                # Isolate failures per class so that one malformed definition
                # does not discard the remaining classes of the same file.
                try:
                    score = self._parse_score_from_class(class_name, class_def)
                except Exception as e:
                    logger.warning(
                        "Error parsing class %s in %s: %s", class_name, yaml_file, e
                    )
                    continue
                if score is not None:
                    scores[class_name] = score

        logger.info("Loaded specificity scores for %d classes", len(scores))
        return scores

    def _read_class_definitions(self, yaml_file: Path) -> dict[str, Any]:
        """Return the ``classes`` mapping of one YAML file, or ``{}``.

        An empty dict is returned when the file cannot be read or parsed
        (a warning is logged), or when it has no mapping under ``classes``.
        """
        try:
            with open(yaml_file, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)
        except Exception as e:
            # Deliberately broad: a single bad file must not stop the scan.
            logger.warning("Error reading %s: %s", yaml_file, e)
            return {}

        if not isinstance(data, dict):
            return {}
        classes = data.get("classes")
        # ``classes:`` with no value parses to None, not to a missing key.
        return classes if isinstance(classes, dict) else {}

    def _parse_score_from_class(
        self,
        class_name: str,
        class_def: dict[str, Any]
    ) -> Optional[SpecificityScore]:
        """Parse specificity score from class definition.

        Args:
            class_name: Name of the class
            class_def: Class definition from YAML

        Returns:
            A SpecificityScore built from the annotations; a default-valued
            SpecificityScore when the class has no ``specificity_score``
            annotation; None when ``specificity_score`` is present but is
            not convertible to float.
        """
        annotations = class_def.get("annotations")
        if not isinstance(annotations, dict):
            # Covers both a missing key and an explicit ``annotations:`` null.
            annotations = {}

        base_score = annotations.get("specificity_score")
        if base_score is None:
            # No annotation - return default score
            return SpecificityScore(
                class_name=class_name,
                base_score=self.default_score,
                rationale="No specificity annotation (using default)",
            )

        try:
            base_score_float = float(base_score)
        except (ValueError, TypeError):
            logger.warning(
                "Invalid specificity_score for %s: %s", class_name, base_score
            )
            return None

        # Template-specific overrides; invalid entries are skipped one by one.
        template_scores: dict[ContextTemplate, float] = {}
        template_specificity = annotations.get("template_specificity", {})
        if isinstance(template_specificity, dict):
            for template_name, score in template_specificity.items():
                try:
                    template = ContextTemplate(template_name)
                    template_scores[template] = float(score)
                except (ValueError, KeyError, TypeError):
                    # TypeError: float(None) / float([...]) for an empty or
                    # non-scalar YAML value.
                    logger.warning(
                        "Invalid template specificity for %s: %s=%s",
                        class_name,
                        template_name,
                        score,
                    )

        return SpecificityScore(
            class_name=class_name,
            base_score=base_score_float,
            template_scores=template_scores,
            rationale=annotations.get("specificity_rationale"),
        )

    def get_score(self, class_name: str) -> SpecificityScore:
        """Get specificity score for a class.

        Args:
            class_name: Name of the LinkML class

        Returns:
            SpecificityScore (default if not found)
        """
        found = self._load_scores().get(class_name)
        if found is not None:
            return found
        # Build the fallback only on a miss.
        return SpecificityScore(
            class_name=class_name,
            base_score=self.default_score,
            rationale="Class not found in schema",
        )

    def get_all_scores(self) -> dict[str, SpecificityScore]:
        """Get all loaded specificity scores.

        Returns a shallow copy, so adding or removing keys on the result
        does not affect the internal cache.
        """
        return self._load_scores().copy()

    def get_classes_for_template(
        self,
        template: ContextTemplate,
        threshold: float = 0.6,
    ) -> list[str]:
        """Get classes that pass specificity threshold for a template.

        Lower scores = more broadly relevant = more likely to be included.
        The pass/fail decision is delegated to
        ``SpecificityScore.passes_threshold``.

        Args:
            template: Context template to filter by
            threshold: Maximum specificity score to include

        Returns:
            Alphabetically sorted list of class names that pass the threshold
        """
        passing = self.get_filtered_scores(template, threshold)
        logger.debug(
            "Template %s: %d/%d classes pass threshold %s",
            template.value,
            len(passing),
            len(self._load_scores()),
            threshold,
        )
        return sorted(passing)

    def get_filtered_scores(
        self,
        template: ContextTemplate,
        threshold: float = 0.6,
    ) -> dict[str, SpecificityScore]:
        """Get scores for classes that pass threshold.

        Args:
            template: Context template to filter by
            threshold: Maximum specificity score to include

        Returns:
            Dictionary of class name → SpecificityScore for passing classes
        """
        return {
            class_name: score
            for class_name, score in self._load_scores().items()
            if score.passes_threshold(template, threshold)
        }

    def reload(self) -> None:
        """Force reload of scores from disk.

        Only this instance's cache is refreshed. Results memoized by the
        module-level ``get_classes_for_template_cached`` are not cleared.
        """
        self._scores = None
        self._load_scores()
# Shared module-wide lookup; stays None until first requested.
_lookup_instance: Optional[SpecificityLookup] = None


def get_specificity_lookup() -> SpecificityLookup:
    """Return the shared SpecificityLookup, creating it on first use.

    The instance is built with default arguments and stored in the
    module-level ``_lookup_instance`` so later calls return the same object.
    """
    global _lookup_instance
    if _lookup_instance is not None:
        return _lookup_instance
    _lookup_instance = SpecificityLookup()
    return _lookup_instance
@lru_cache(maxsize=128)
def get_classes_for_template_cached(
    template: str,
    threshold: float = 0.6,
) -> tuple[str, ...]:
    """Memoized wrapper around ``get_classes_for_template`` on the shared lookup.

    Up to 128 distinct ``(template, threshold)`` combinations are kept by
    ``lru_cache``. A name that is not a valid ``ContextTemplate`` value falls
    back to ``ContextTemplate.GENERAL_HERITAGE``.

    NOTE: memoized results are not invalidated by ``SpecificityLookup.reload()``;
    call ``get_classes_for_template_cached.cache_clear()`` to drop them.

    Args:
        template: Template name (a string, so the argument is hashable)
        threshold: Specificity threshold

    Returns:
        Tuple of class names (a tuple, so the cached value is immutable)
    """
    shared_lookup = get_specificity_lookup()
    try:
        resolved = ContextTemplate(template)
    except ValueError:
        # Unknown template name: use the general-purpose template instead.
        resolved = ContextTemplate.GENERAL_HERITAGE
    class_names = shared_lookup.get_classes_for_template(resolved, threshold)
    return tuple(class_names)