- Updated documentation to clarify integration points with existing components in the RAG pipeline and DSPy framework. - Added detailed mapping of SPARQL templates to context templates for improved specificity filtering. - Implemented wrapper patterns around existing classifiers to extend functionality without duplication. - Introduced new tests for the SpecificityAwareClassifier and SPARQLToContextMapper to ensure proper integration and functionality. - Enhanced the CustodianRDFConverter to include ISO country and subregion codes from GHCID for better geospatial data handling.
184 lines
6.4 KiB
Python
184 lines
6.4 KiB
Python
"""
SPARQL template to context template mapper.

This module provides the mapping layer between:
- SPARQL template IDs (output from existing TemplateClassifier)
- Context template IDs (input to specificity scoring)

The mapper also handles institution-type-based refinement, e.g.,
mapping "list_institutions_by_type_city" + slot "A" (archive) to "archive_search".
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import Optional
|
|
|
|
from .models import ContextTemplate, INSTITUTION_TYPE_TO_CONTEXT
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Lookup table: SPARQL template ID (as produced by TemplateClassifier)
# -> context template used for specificity scoring.
# Insertion order is preserved and is visible to callers that iterate the map.
SPARQL_TO_CONTEXT_MAP: dict[str, ContextTemplate] = {
    # --- Browsing institutions by location ---
    "list_institutions_by_type_city": ContextTemplate.LOCATION_BROWSE,
    "list_institutions_by_type_region": ContextTemplate.LOCATION_BROWSE,
    "list_institutions_by_type_country": ContextTemplate.LOCATION_BROWSE,
    "list_all_institutions_in_city": ContextTemplate.LOCATION_BROWSE,
    "compare_locations": ContextTemplate.LOCATION_BROWSE,

    # --- Counting institutions ---
    "count_institutions_by_type_location": ContextTemplate.LOCATION_BROWSE,
    "count_institutions_by_type": ContextTemplate.GENERAL_HERITAGE,

    # --- Looking up a single institution by name or identifier ---
    "find_institution_by_name": ContextTemplate.IDENTIFIER_LOOKUP,
    "find_institution_by_identifier": ContextTemplate.IDENTIFIER_LOOKUP,

    # --- Temporal / organizational history ---
    "find_institutions_by_founding_date": ContextTemplate.ORGANIZATIONAL_CHANGE,

    # --- Budget thresholds (treated as general heritage) ---
    "find_custodians_by_budget_threshold": ContextTemplate.GENERAL_HERITAGE,

    # --- People and staff ---
    "find_person_by_role": ContextTemplate.PERSON_RESEARCH,
    "find_people_at_institution": ContextTemplate.PERSON_RESEARCH,
    "list_staff_by_role_category": ContextTemplate.PERSON_RESEARCH,

    # --- Collections ---
    "list_collections_by_type": ContextTemplate.COLLECTION_DISCOVERY,
    "find_collections_by_subject": ContextTemplate.COLLECTION_DISCOVERY,

    # --- Digital platforms ---
    "find_digital_platforms": ContextTemplate.DIGITAL_PLATFORM,
    "list_platform_integrations": ContextTemplate.DIGITAL_PLATFORM,

    # --- Explicit fallback entry for the "none" template ID ---
    "none": ContextTemplate.GENERAL_HERITAGE,
}
|
|
|
|
|
|
class SPARQLToContextMapper:
    """Map SPARQL template IDs to context templates for specificity scoring.

    The mapper:
    1. Takes a SPARQL template ID (from TemplateClassifier).
    2. Optionally refines the result using the ``institution_type`` slot.
    3. Returns the context template to use for specificity lookup.

    Usage:
        mapper = SPARQLToContextMapper()

        # Basic mapping
        context = mapper.map("list_institutions_by_type_city")
        # Returns: ContextTemplate.LOCATION_BROWSE

        # Refined mapping with institution type
        context = mapper.map(
            "list_institutions_by_type_city",
            slots={"institution_type": "A"}
        )
        # Returns: ContextTemplate.ARCHIVE_SEARCH
        # (as configured in INSTITUTION_TYPE_TO_CONTEXT)
    """

    # Template IDs whose base context may be overridden by the
    # ``institution_type`` slot. Built once instead of on every call.
    _REFINABLE_TEMPLATES = frozenset({
        "list_institutions_by_type_city",
        "list_institutions_by_type_region",
        "list_institutions_by_type_country",
        "count_institutions_by_type_location",
        "find_institutions_by_founding_date",
    })

    def __init__(
        self,
        sparql_map: Optional[dict[str, ContextTemplate]] = None,
        type_refinement_map: Optional[dict[str, ContextTemplate]] = None,
    ):
        """Initialize the mapper.

        Args:
            sparql_map: Override the default SPARQL → context mapping.
                ``None`` selects ``SPARQL_TO_CONTEXT_MAP``.
            type_refinement_map: Override the institution type refinement
                mapping. ``None`` selects ``INSTITUTION_TYPE_TO_CONTEXT``.
                Keys are looked up with the slot value upper-cased.
        """
        # Compare against None rather than relying on truthiness: an
        # explicitly supplied empty dict is a valid override (e.g. an empty
        # refinement map disables refinement) and must not be replaced by
        # the module defaults.
        self._sparql_map = (
            SPARQL_TO_CONTEXT_MAP if sparql_map is None else sparql_map
        )
        self._type_refinement_map = (
            INSTITUTION_TYPE_TO_CONTEXT
            if type_refinement_map is None
            else type_refinement_map
        )

    def map(
        self,
        sparql_template_id: str,
        slots: Optional[dict[str, str]] = None,
    ) -> ContextTemplate:
        """Map SPARQL template ID to context template.

        Args:
            sparql_template_id: Template ID from TemplateClassifier.
            slots: Optional extracted slots (for refinement). Only the
                ``"institution_type"`` key is consulted; a missing, empty,
                ``None`` or non-string value means "no refinement".

        Returns:
            The refined context when the template is refinable and the
            upper-cased institution type has an entry in the refinement
            map; otherwise the base context for the template ID. Template
            IDs without a mapping fall back to
            ``ContextTemplate.GENERAL_HERITAGE``.
        """
        # Base mapping; the fallback is resolved only when actually needed.
        base_context = self._sparql_map.get(sparql_template_id)
        if base_context is None:
            base_context = ContextTemplate.GENERAL_HERITAGE

        # Refinement applies only when slots were supplied and the template
        # is one of the refinable ones.
        if not slots or not self._should_refine_by_institution_type(
            sparql_template_id
        ):
            return base_context

        # Slot values come from upstream extraction and may be None or a
        # non-string; treat those as absent instead of failing on .upper().
        raw_type = slots.get("institution_type")
        if not isinstance(raw_type, str) or not raw_type:
            return base_context

        institution_type = raw_type.upper()
        refined = self._type_refinement_map.get(institution_type)
        if refined is None:
            return base_context

        logger.debug(
            f"Refined context: {base_context} → {refined} "
            f"(institution_type={institution_type})"
        )
        return refined

    def _should_refine_by_institution_type(self, template_id: str) -> bool:
        """Check if template should be refined by institution_type slot.

        Only the template IDs listed in ``_REFINABLE_TEMPLATES`` are refined.
        """
        return template_id in self._REFINABLE_TEMPLATES

    def get_all_context_templates(self) -> list[ContextTemplate]:
        """Get list of all context templates (every ``ContextTemplate`` member)."""
        return list(ContextTemplate)

    def get_sparql_templates_for_context(
        self,
        context: ContextTemplate
    ) -> list[str]:
        """Get all SPARQL template IDs that map to a context template.

        Useful for debugging and documentation. Only the base mapping is
        searched; institution-type refinement is not taken into account.

        Args:
            context: Context template to look up.

        Returns:
            List of SPARQL template IDs that map to this context, in the
            iteration order of the underlying mapping.
        """
        return [
            sparql_id
            for sparql_id, ctx in self._sparql_map.items()
            if ctx == context
        ]
|
|
|
|
|
|
# Module-wide shared mapper; stays None until the accessor below is first called.
_mapper_instance: Optional[SPARQLToContextMapper] = None


def get_sparql_to_context_mapper() -> SPARQLToContextMapper:
    """Return the shared SPARQLToContextMapper, creating it on first use.

    The instance is built with default arguments and cached in the
    module-level ``_mapper_instance`` so later calls return the same object.
    """
    global _mapper_instance
    if _mapper_instance is not None:
        return _mapper_instance
    _mapper_instance = SPARQLToContextMapper()
    return _mapper_instance
|