- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework. - Added detailed mapping of SPARQL templates to context templates for improved specificity filtering. - Implemented wrapper patterns around existing classifiers to extend functionality without duplication. - Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality. - Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
289 lines
9.1 KiB
Python
289 lines
9.1 KiB
Python
"""
Specificity score lookup from LinkML schema annotations.

This module reads specificity scores from LinkML class YAML files
and provides filtered class lists based on context templates.

Scores are read from the `annotations` section of each class file:

```yaml
classes:
  ClassName:
    annotations:
      specificity_score: 0.75
      specificity_rationale: "Fairly specific to archival contexts"
      template_specificity:
        archive_search: 0.95
        museum_search: 0.20
```
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from functools import lru_cache
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import yaml
|
|
|
|
from .models import ContextTemplate, SpecificityScore
|
|
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default schema directory: climb four parents from this file, then descend
# into schemas/20251121/linkml.
SCHEMA_BASE_DIR = Path(__file__).parent.parent.parent.parent.joinpath(
    "schemas", "20251121", "linkml"
)
|
|
|
|
|
|
class SpecificityLookup:
    """Look up specificity scores stored as LinkML schema annotations.

    This class:
    1. Scans the ``*.yaml`` files in ``<schema_dir>/modules/classes`` for
       specificity annotations
    2. Caches the parsed scores in memory (loading is lazy: nothing is read
       from disk until the first query)
    3. Provides filtering by context template and threshold

    Usage:
        lookup = SpecificityLookup()

        # Get all classes that pass threshold for a template
        classes = lookup.get_classes_for_template(
            ContextTemplate.ARCHIVE_SEARCH,
            threshold=0.6
        )

        # Get score for a specific class
        score = lookup.get_score("HeritageCustodian")
    """

    def __init__(
        self,
        schema_dir: Optional[Path] = None,
        default_score: float = 0.5,
    ):
        """Initialize the lookup.

        Args:
            schema_dir: Path to LinkML schema directory. Falls back to
                ``SCHEMA_BASE_DIR`` when omitted.
            default_score: Default score for classes without annotations
        """
        self.schema_dir = schema_dir or SCHEMA_BASE_DIR
        self.default_score = default_score
        # None means "not loaded yet"; an empty dict is a valid loaded state.
        self._scores: Optional[dict[str, SpecificityScore]] = None

    def _load_scores(self) -> dict[str, SpecificityScore]:
        """Load specificity scores from all class YAML files.

        The result is cached on the instance; later calls return the cached
        dictionary without touching the disk. Loading is best-effort: an
        unreadable file or a malformed class is logged and skipped, it never
        raises.

        Returns:
            Dictionary mapping class name to SpecificityScore
        """
        if self._scores is not None:
            return self._scores

        loaded: dict[str, SpecificityScore] = {}
        self._scores = loaded
        classes_dir = self.schema_dir / "modules" / "classes"

        if not classes_dir.exists():
            logger.warning("Classes directory not found: %s", classes_dir)
            return loaded

        # Sorted so that, if two files define the same class name, the winner
        # (the later file name) does not depend on filesystem listing order.
        yaml_files = sorted(classes_dir.glob("*.yaml"))
        logger.info(
            "Scanning %d class files for specificity annotations",
            len(yaml_files),
        )

        for yaml_file in yaml_files:
            try:
                with open(yaml_file, "r", encoding="utf-8") as f:
                    data = yaml.safe_load(f)
            except Exception as e:  # best-effort scan: skip unreadable files
                logger.warning("Error reading %s: %s", yaml_file, e)
                continue

            if not data:
                continue
            if not isinstance(data, dict):
                logger.warning(
                    "Error reading %s: top-level YAML value is not a mapping",
                    yaml_file,
                )
                continue

            # ``classes: null`` yields None from .get(); treat it as empty.
            classes = data.get("classes") or {}
            if not isinstance(classes, dict):
                logger.warning(
                    "Error reading %s: 'classes' is not a mapping", yaml_file
                )
                continue

            for class_name, class_def in classes.items():
                if not class_def:
                    continue
                # Isolate failures per class so one malformed definition does
                # not discard the remaining classes of the same file.
                try:
                    score = self._parse_score_from_class(class_name, class_def)
                except Exception as e:
                    logger.warning(
                        "Error parsing class %s in %s: %s",
                        class_name,
                        yaml_file,
                        e,
                    )
                    continue
                if score is not None:
                    loaded[class_name] = score

        logger.info("Loaded specificity scores for %d classes", len(loaded))
        return loaded

    def _parse_score_from_class(
        self,
        class_name: str,
        class_def: dict[str, Any]
    ) -> Optional[SpecificityScore]:
        """Parse specificity score from class definition.

        Args:
            class_name: Name of the class
            class_def: Class definition from YAML

        Returns:
            A SpecificityScore carrying ``default_score`` when the class has
            no ``specificity_score`` annotation, a SpecificityScore built from
            the annotations when it has one, or None when the annotated base
            score cannot be converted to a float.
        """
        annotations = class_def.get("annotations")
        # ``annotations: null`` or a non-mapping value counts as "no annotations".
        if not isinstance(annotations, dict):
            annotations = {}

        # Check for specificity_score annotation
        base_score = annotations.get("specificity_score")
        if base_score is None:
            # No annotation - return default score
            return SpecificityScore(
                class_name=class_name,
                base_score=self.default_score,
                rationale="No specificity annotation (using default)",
            )

        # Parse base score
        try:
            base_score_float = float(base_score)
        except (ValueError, TypeError):
            logger.warning(
                "Invalid specificity_score for %s: %s", class_name, base_score
            )
            return None

        # Parse template-specific scores
        template_scores: dict[ContextTemplate, float] = {}
        template_specificity = annotations.get("template_specificity", {})

        if isinstance(template_specificity, dict):
            for template_name, score in template_specificity.items():
                try:
                    template = ContextTemplate(template_name)
                    template_scores[template] = float(score)
                except (ValueError, KeyError, TypeError):
                    # TypeError covers null / non-numeric values such as
                    # ``archive_search:`` with nothing after the colon.
                    logger.warning(
                        "Invalid template specificity for %s: %s=%s",
                        class_name,
                        template_name,
                        score,
                    )

        # Get rationale
        rationale = annotations.get("specificity_rationale")

        return SpecificityScore(
            class_name=class_name,
            base_score=base_score_float,
            template_scores=template_scores,
            rationale=rationale,
        )

    def get_score(self, class_name: str) -> SpecificityScore:
        """Get specificity score for a class.

        Args:
            class_name: Name of the LinkML class

        Returns:
            The loaded SpecificityScore, or a fresh one carrying
            ``default_score`` when the class was not loaded
        """
        found = self._load_scores().get(class_name)
        if found is not None:
            return found
        # Build the fallback only on a miss.
        return SpecificityScore(
            class_name=class_name,
            base_score=self.default_score,
            rationale="Class not found in schema",
        )

    def get_all_scores(self) -> dict[str, SpecificityScore]:
        """Get all loaded specificity scores as a shallow copy of the cache."""
        return self._load_scores().copy()

    def get_classes_for_template(
        self,
        template: ContextTemplate,
        threshold: float = 0.6,
    ) -> list[str]:
        """Get classes that pass specificity threshold for a template.

        Lower scores = more broadly relevant = more likely to be included.
        The pass/fail decision itself is delegated to
        ``SpecificityScore.passes_threshold``.

        Args:
            template: Context template to filter by
            threshold: Maximum specificity score to include

        Returns:
            Sorted list of class names that pass the threshold
        """
        scores = self._load_scores()

        passing_classes = sorted(
            class_name
            for class_name, score in scores.items()
            if score.passes_threshold(template, threshold)
        )

        logger.debug(
            "Template %s: %d/%d classes pass threshold %s",
            template.value,
            len(passing_classes),
            len(scores),
            threshold,
        )

        return passing_classes

    def get_filtered_scores(
        self,
        template: ContextTemplate,
        threshold: float = 0.6,
    ) -> dict[str, SpecificityScore]:
        """Get scores for classes that pass threshold.

        Args:
            template: Context template to filter by
            threshold: Maximum specificity score to include

        Returns:
            Dictionary of class name → SpecificityScore for passing classes
        """
        return {
            class_name: score
            for class_name, score in self._load_scores().items()
            if score.passes_threshold(template, threshold)
        }

    def reload(self) -> None:
        """Force reload of scores from disk.

        Only this instance's in-memory scores are refreshed.
        """
        self._scores = None
        self._load_scores()
|
|
|
|
|
|
# Module-level shared lookup; stays None until first requested.
_lookup_instance: Optional[SpecificityLookup] = None


def get_specificity_lookup() -> SpecificityLookup:
    """Return the shared SpecificityLookup, creating it on first use.

    The instance is built with the default constructor arguments and stored
    in the module-level ``_lookup_instance`` so later calls reuse it.
    """
    global _lookup_instance
    if _lookup_instance is not None:
        return _lookup_instance
    _lookup_instance = SpecificityLookup()
    return _lookup_instance
|
|
|
|
|
|
@lru_cache(maxsize=128)
def get_classes_for_template_cached(
    template: str,
    threshold: float = 0.6,
) -> tuple[str, ...]:
    """Memoized wrapper around the shared lookup's get_classes_for_template.

    Results are cached per ``(template, threshold)`` pair by ``lru_cache``.
    The cache is not cleared when the lookup's ``reload()`` is called; use
    ``get_classes_for_template_cached.cache_clear()`` to drop stale entries.

    Args:
        template: Template name (a string so the argument is hashable). A
            name that is not a valid ContextTemplate value falls back to
            ``ContextTemplate.GENERAL_HERITAGE``.
        threshold: Specificity threshold

    Returns:
        Tuple of class names (a tuple so the cached value is immutable)
    """
    shared_lookup = get_specificity_lookup()

    try:
        resolved_template = ContextTemplate(template)
    except ValueError:
        # Unknown template name: use the general-purpose template instead.
        resolved_template = ContextTemplate.GENERAL_HERITAGE

    class_names = shared_lookup.get_classes_for_template(
        resolved_template, threshold
    )
    return tuple(class_names)
|