glam/backend/rag/specificity/mapper.py
kempersc 11983014bb Enhance specificity scoring system integration with existing infrastructure
- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework.
- Added detailed mapping of SPARQL templates to context templates for improved specificity filtering.
- Implemented wrapper patterns around existing classifiers to extend functionality without duplication.
- Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality.
- Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
2026-01-05 17:37:49 +01:00

184 lines
6.4 KiB
Python

"""
SPARQL template to context template mapper.
This module provides the mapping layer between:
- SPARQL template IDs (output from existing TemplateClassifier)
- Context template IDs (input to specificity scoring)
The mapper also handles institution-type-based refinement, e.g.,
mapping "list_institutions_by_type_city" + slot "A" (archive) to "archive_search".
"""
from __future__ import annotations
import logging
from typing import Optional
from .models import ContextTemplate, INSTITUTION_TYPE_TO_CONTEXT
logger = logging.getLogger(__name__)
# Lookup table: SPARQL template ID -> context template used for specificity
# scoring. Keys are the template IDs produced by TemplateClassifier.
#
# The table is declared as ordered (context, template IDs) groups and then
# flattened, so the resulting dict keeps the same insertion order as a flat
# literal listing the IDs top to bottom.
SPARQL_TO_CONTEXT_MAP: dict[str, ContextTemplate] = {
    template_id: context
    for context, template_ids in (
        # Location-based listings and comparisons, plus location-scoped counts
        (
            ContextTemplate.LOCATION_BROWSE,
            [
                "list_institutions_by_type_city",
                "list_institutions_by_type_region",
                "list_institutions_by_type_country",
                "list_all_institutions_in_city",
                "compare_locations",
                "count_institutions_by_type_location",
            ],
        ),
        # Counts that are not tied to a location
        (
            ContextTemplate.GENERAL_HERITAGE,
            ["count_institutions_by_type"],
        ),
        # Lookups by name or identifier
        (
            ContextTemplate.IDENTIFIER_LOOKUP,
            [
                "find_institution_by_name",
                "find_institution_by_identifier",
            ],
        ),
        # Temporal / organizational queries
        (
            ContextTemplate.ORGANIZATIONAL_CHANGE,
            ["find_institutions_by_founding_date"],
        ),
        # Budget queries are treated as general heritage
        (
            ContextTemplate.GENERAL_HERITAGE,
            ["find_custodians_by_budget_threshold"],
        ),
        # Person and staff queries
        (
            ContextTemplate.PERSON_RESEARCH,
            [
                "find_person_by_role",
                "find_people_at_institution",
                "list_staff_by_role_category",
            ],
        ),
        # Collection queries
        (
            ContextTemplate.COLLECTION_DISCOVERY,
            [
                "list_collections_by_type",
                "find_collections_by_subject",
            ],
        ),
        # Digital platform queries
        (
            ContextTemplate.DIGITAL_PLATFORM,
            [
                "find_digital_platforms",
                "list_platform_integrations",
            ],
        ),
        # Explicit fallback template ID
        (
            ContextTemplate.GENERAL_HERITAGE,
            ["none"],
        ),
    )
    for template_id in template_ids
}
class SPARQLToContextMapper:
    """Maps SPARQL template IDs to context templates for specificity scoring.

    This class:
    1. Takes a SPARQL template ID (from TemplateClassifier)
    2. Optionally refines based on the ``institution_type`` slot
    3. Returns the appropriate context template for specificity lookup

    Usage:
        mapper = SPARQLToContextMapper()

        # Basic mapping
        context = mapper.map("list_institutions_by_type_city")
        # Returns: ContextTemplate.LOCATION_BROWSE

        # Refined mapping with institution type
        context = mapper.map(
            "list_institutions_by_type_city",
            slots={"institution_type": "A"}
        )
        # Returns whatever the type refinement map holds for "A"
        # (e.g. ContextTemplate.ARCHIVE_SEARCH)
    """

    # Template IDs whose result may be refined by the institution_type slot.
    # Built once at class creation instead of on every lookup.
    _REFINABLE_TEMPLATES = frozenset(
        {
            "list_institutions_by_type_city",
            "list_institutions_by_type_region",
            "list_institutions_by_type_country",
            "count_institutions_by_type_location",
            "find_institutions_by_founding_date",
        }
    )

    def __init__(
        self,
        sparql_map: Optional[dict[str, ContextTemplate]] = None,
        type_refinement_map: Optional[dict[str, ContextTemplate]] = None,
    ):
        """Initialize the mapper.

        Args:
            sparql_map: Override the default SPARQL → context mapping.
                ``None`` selects the module default; an explicitly passed
                mapping (even an empty one) is used as given.
            type_refinement_map: Override the institution type refinement
                mapping. ``None`` selects the default; an explicitly passed
                mapping (even an empty one) is used as given.
        """
        # Compare against None rather than relying on truthiness, so that an
        # intentionally empty override is not silently replaced by the default.
        self._sparql_map = (
            SPARQL_TO_CONTEXT_MAP if sparql_map is None else sparql_map
        )
        self._type_refinement_map = (
            INSTITUTION_TYPE_TO_CONTEXT
            if type_refinement_map is None
            else type_refinement_map
        )

    def map(
        self,
        sparql_template_id: str,
        slots: Optional[dict[str, str]] = None,
    ) -> ContextTemplate:
        """Map SPARQL template ID to context template.

        Args:
            sparql_template_id: Template ID from TemplateClassifier
            slots: Optional extracted slots (for refinement). Only the
                ``"institution_type"`` key is consulted.

        Returns:
            The refined context template when the template is refinable and
            the ``institution_type`` slot (stripped and upper-cased) is a key
            of the type refinement map; otherwise the base mapping, which
            falls back to ``ContextTemplate.GENERAL_HERITAGE`` for unknown
            template IDs.
        """
        # Base mapping; unknown template IDs fall back to general heritage
        base_context = self._sparql_map.get(
            sparql_template_id,
            ContextTemplate.GENERAL_HERITAGE,
        )

        if not slots or not self._should_refine_by_institution_type(
            sparql_template_id
        ):
            return base_context

        # A slot that is present but None / not a string must not crash the
        # lookup; it simply yields no refinement.
        raw_type = slots.get("institution_type")
        if not isinstance(raw_type, str):
            return base_context

        institution_type = raw_type.strip().upper()
        if not institution_type:
            return base_context

        refined = self._type_refinement_map.get(institution_type)
        if refined is None:
            return base_context

        logger.debug(
            "Refined context: %s → %s (institution_type=%s)",
            base_context,
            refined,
            institution_type,
        )
        return refined

    def _should_refine_by_institution_type(self, template_id: str) -> bool:
        """Check if template should be refined by institution_type slot.

        Only the template IDs listed in ``_REFINABLE_TEMPLATES`` are refined.
        """
        return template_id in self._REFINABLE_TEMPLATES

    def get_all_context_templates(self) -> list[ContextTemplate]:
        """Get list of all context templates."""
        return list(ContextTemplate)

    def get_sparql_templates_for_context(
        self,
        context: ContextTemplate,
    ) -> list[str]:
        """Get all SPARQL template IDs that map to a context template.

        Useful for debugging and documentation. Institution-type refinement
        is not taken into account; only the base SPARQL → context mapping
        is searched.

        Args:
            context: Context template to look up

        Returns:
            List of SPARQL template IDs that map to this context, in the
            iteration order of the underlying mapping
        """
        return [
            sparql_id
            for sparql_id, ctx in self._sparql_map.items()
            if ctx == context
        ]
# Module-level cache for the shared mapper; stays None until first requested
_mapper_instance: Optional[SPARQLToContextMapper] = None


def get_sparql_to_context_mapper() -> SPARQLToContextMapper:
    """Return the shared mapper instance, creating it on the first call.

    Subsequent calls return the same object that the first call created.
    """
    global _mapper_instance
    if _mapper_instance is not None:
        return _mapper_instance
    _mapper_instance = SPARQLToContextMapper()
    return _mapper_instance