glam/backend/rag/atomic_decomposer.py
2025-12-11 22:32:09 +01:00

555 lines
20 KiB
Python

"""
Atomic Query Decomposer for Heritage RAG Semantic Caching
Decomposes complex heritage queries into atomic sub-tasks for higher cache hit rates.
Based on research showing 40-70% cache hit rates with atomic decomposition vs 5-15% for full queries.
Architecture:
1. Parse query into intent + entities + constraints
2. Generate cacheable sub-task keys
3. Check cache for each sub-task independently
4. Merge cached sub-results with fresh retrievals
5. Cache new sub-results for future queries
Example decomposition:
"Hoeveel musea in Amsterdam hebben een ISIL code?"
├── IntentTask(intent="statistical", entity_type="MUSEUM")
├── LocationTask(location="Amsterdam", level="city")
├── FilterTask(filter="has_isil", value=True)
└── AggregateTask(aggregation="count")
Each sub-task can be cached independently:
- Statistical queries for "MUSEUM" entities → cacheable
- Location filtering for "Amsterdam" → cacheable
- ISIL filter application → cacheable
"""
from __future__ import annotations
import hashlib
import logging
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
logger = logging.getLogger(__name__)
class SubTaskType(str, Enum):
    """Types of atomic sub-tasks for heritage queries.

    The string values double as the prefix of the human-readable task keys
    built in HeritageQueryDecomposer.decompose (e.g. "intent:statistical",
    "type:MUSEUM") and of the generated cache keys.
    """

    INTENT_CLASSIFICATION = "intent"    # classify overall query intent
    ENTITY_EXTRACTION = "entity"        # declared but not emitted by decompose() in this module
    LOCATION_FILTER = "location"        # filter by city/province/region
    TYPE_FILTER = "type"                # filter by institution type (MUSEUM, LIBRARY, ...)
    TEMPORAL_FILTER = "temporal"        # declared but not emitted by decompose() in this module
    IDENTIFIER_FILTER = "identifier"    # filter by identifier (isil/wikidata/ghcid)
    AGGREGATION = "aggregate"           # count/sum/average/min/max over results
    RELATIONSHIP = "relation"           # relationship/network resolution
    COMPARISON = "compare"              # declared but not emitted by decompose() in this module
@dataclass
class AtomicSubTask:
    """A single atomic sub-task that can be cached independently."""

    task_type: SubTaskType
    # Human-readable unique key used for cache lookup and dependency links.
    task_key: str
    parameters: dict[str, Any]
    # task_keys of sub-tasks that must resolve before this one.
    depends_on: list[str] = field(default_factory=list)
    # Filled in by the cache layer when a lookup succeeds.
    cached_result: Any = None
    cache_hit: bool = False

    def __hash__(self) -> int:
        # Identity is determined solely by the task key.
        return hash(self.task_key)

    def to_cache_key(self) -> str:
        """Return a deterministic 16-hex-char cache key for this sub-task."""
        sorted_items = sorted(self.parameters.items())
        encoded = "&".join(f"{k}={v}" for k, v in sorted_items)
        digest = hashlib.sha256(f"{self.task_type.value}:{encoded}".encode())
        return digest.hexdigest()[:16]
@dataclass
class DecomposedQuery:
    """A query decomposed into atomic sub-tasks."""

    original_query: str
    language: str
    sub_tasks: list[AtomicSubTask]
    # Components extracted from the raw query text.
    intent: str | None = None
    entities: list[str] = field(default_factory=list)
    location: str | None = None
    institution_type: str | None = None
    temporal_range: tuple[str, str] | None = None
    # Execution metadata, updated by the cache manager.
    fully_cached: bool = False
    partial_cache_hits: int = 0

    def get_cache_keys(self) -> list[str]:
        """Return the deterministic cache key of every sub-task, in order."""
        keys: list[str] = []
        for sub_task in self.sub_tasks:
            keys.append(sub_task.to_cache_key())
        return keys
class HeritageQueryDecomposer:
    """Decomposes heritage queries into cacheable atomic sub-tasks.

    Uses pattern matching and linguistic analysis to break down
    complex queries into independent, cacheable components.

    Strategies:
    1. Intent decomposition - Separate intent from parameters
    2. Entity decomposition - Each entity is a separate lookup
    3. Filter decomposition - Each filter is independent
    4. Aggregation decomposition - Aggregation separate from data
    """

    def __init__(self) -> None:
        # Intent detection patterns (Dutch + English keyword regexes).
        self.intent_patterns = {
            "statistical": [
                r"hoeveel\b", r"how many\b", r"aantal\b", r"count\b",
                r"totaal\b", r"total\b", r"gemiddeld\b", r"average\b",
            ],
            "geographic": [
                r"waar\b", r"where\b", r"locatie\b", r"location\b",
                r"in de buurt\b", r"nearby\b", r"kaart\b", r"map\b",
            ],
            "temporal": [
                r"wanneer\b", r"when\b", r"opgericht\b", r"founded\b",
                r"geschiedenis\b", r"history\b", r"tijdlijn\b", r"timeline\b",
            ],
            "entity_lookup": [
                r"wat is\b", r"what is\b", r"informatie over\b", r"about\b",
                r"details\b", r"tell me\b", r"vertel\b",
            ],
            "relational": [
                r"relatie\b", r"relationship\b", r"connected\b", r"verbonden\b",
                r"netwerk\b", r"network\b", r"fusie\b", r"merger\b",
            ],
            "comparative": [
                r"vergelijk\b", r"compare\b", r"versus\b", r"verschil\b",
                r"difference\b", r"between\b", r"tussen\b",
            ],
        }
        # Institution type patterns (Dutch + English); first matching type wins.
        self.type_patterns = {
            "MUSEUM": [r"museum", r"musea", r"museums"],
            "LIBRARY": [r"bibliothe[e]?k", r"librar(y|ies)", r"bieb"],
            "ARCHIVE": [r"archief", r"archiv(e|es)", r"archieven"],
            "GALLERY": [r"galerie", r"galler(y|ies)", r"kunsthal"],
            "RESEARCH_CENTER": [r"onderzoek", r"research", r"kenniscentrum"],
            "BOTANICAL_ZOO": [r"dierentuin", r"zoo", r"botanisch", r"artis"],
            "HOLY_SITES": [r"kerk", r"church", r"kathedraal", r"cathedral"],
        }
        # Location patterns, checked in order; first match wins.
        self.location_patterns = [
            # Dutch cities
            r"\b(amsterdam|rotterdam|den haag|utrecht|eindhoven|tilburg|groningen|almere|breda|nijmegen|apeldoorn|haarlem|arnhem|zaanstad|amersfoort|haarlemmermeer|enschede|the hague|maastricht|leiden|dordrecht|zoetermeer|zwolle|deventer|delft|alkmaar|leeuwarden|venlo|hilversum|middelburg|sittard|assen|helmond|lelystad|emmen)\b",
            # Dutch provinces
            r"\b(noord-holland|zuid-holland|utrecht|gelderland|brabant|limburg|overijssel|friesland|groningen|drenthe|flevoland|zeeland)\b",
            # Dutch regions
            r"\b(randstad|veluwe|achterhoek|twente|brabant)\b",
        ]
        # Identifier patterns
        self.identifier_patterns = {
            "isil": r"isil|NL-[A-Z][a-z]+[A-Z]*",
            "wikidata": r"Q\d+|wikidata",
            "ghcid": r"NL-[A-Z]{2}-[A-Z]{3}-[A-Z]-",
        }
        # Pre-compile once; the compiled regexes are reused for every query.
        self._compile_patterns()

    def _compile_patterns(self) -> None:
        """Compile regex patterns for efficiency (case-insensitive)."""
        self._intent_regexes = {
            intent: [re.compile(p, re.IGNORECASE) for p in patterns]
            for intent, patterns in self.intent_patterns.items()
        }
        self._type_regexes = {
            inst_type: [re.compile(p, re.IGNORECASE) for p in patterns]
            for inst_type, patterns in self.type_patterns.items()
        }
        self._location_regexes = [
            re.compile(p, re.IGNORECASE) for p in self.location_patterns
        ]
        self._identifier_regexes = {
            id_type: re.compile(p, re.IGNORECASE)
            for id_type, p in self.identifier_patterns.items()
        }

    def decompose(self, query: str, language: str = "nl") -> DecomposedQuery:
        """Decompose a query into atomic sub-tasks.

        Args:
            query: Natural language query
            language: Query language (nl, en)

        Returns:
            DecomposedQuery with atomic sub-tasks
        """
        sub_tasks: list[AtomicSubTask] = []

        # 1. Detect intent (always present; the other sub-tasks depend on it).
        intent = self._detect_intent(query)
        intent_task = AtomicSubTask(
            task_type=SubTaskType.INTENT_CLASSIFICATION,
            task_key=f"intent:{intent}",
            parameters={"intent": intent, "language": language},
        )
        sub_tasks.append(intent_task)

        # 2. Extract institution type
        inst_type = self._extract_institution_type(query)
        if inst_type:
            sub_tasks.append(AtomicSubTask(
                task_type=SubTaskType.TYPE_FILTER,
                task_key=f"type:{inst_type}",
                parameters={"institution_type": inst_type},
                # BUG FIX: depends_on entries are task_keys (see the
                # aggregation task below, which collects t.task_key). The
                # previous bare literal "intent" never matched the intent
                # task's actual key "intent:<name>".
                depends_on=[intent_task.task_key],
            ))

        # 3. Extract location
        location = self._extract_location(query)
        if location:
            sub_tasks.append(AtomicSubTask(
                task_type=SubTaskType.LOCATION_FILTER,
                task_key=f"location:{location.lower()}",
                parameters={"location": location, "level": self._infer_location_level(location)},
                depends_on=[intent_task.task_key],
            ))

        # 4. Extract identifier filters
        identifiers = self._extract_identifiers(query)
        for id_type, id_value in identifiers.items():
            sub_tasks.append(AtomicSubTask(
                task_type=SubTaskType.IDENTIFIER_FILTER,
                task_key=f"identifier:{id_type}:{id_value}",
                parameters={"identifier_type": id_type, "value": id_value},
                depends_on=[intent_task.task_key],
            ))

        # 5. Statistical queries get an aggregation step that depends on
        #    every filter sub-task gathered so far.
        if intent == "statistical":
            agg_type = self._infer_aggregation_type(query)
            sub_tasks.append(AtomicSubTask(
                task_type=SubTaskType.AGGREGATION,
                task_key=f"aggregate:{agg_type}",
                parameters={"aggregation": agg_type},
                depends_on=[
                    t.task_key for t in sub_tasks
                    if t.task_type != SubTaskType.INTENT_CLASSIFICATION
                ],
            ))

        # 6. Relational queries get a network-resolution step that depends
        #    on the type/location filters only.
        if intent == "relational":
            sub_tasks.append(AtomicSubTask(
                task_type=SubTaskType.RELATIONSHIP,
                task_key="relation:network",
                parameters={"relation_type": "network"},
                depends_on=[
                    t.task_key for t in sub_tasks
                    if t.task_type in (SubTaskType.TYPE_FILTER, SubTaskType.LOCATION_FILTER)
                ],
            ))

        return DecomposedQuery(
            original_query=query,
            language=language,
            sub_tasks=sub_tasks,
            intent=intent,
            entities=self._extract_entities(query),
            location=location,
            institution_type=inst_type,
        )

    def _detect_intent(self, query: str) -> str:
        """Detect query intent using pattern matching.

        Each matching keyword pattern scores one point for its intent; the
        highest score wins (ties go to the first-declared intent). Falls
        back to "exploration" when nothing matches at all.
        """
        intent_scores: dict[str, int] = {intent: 0 for intent in self.intent_patterns}
        for intent, regexes in self._intent_regexes.items():
            for regex in regexes:
                if regex.search(query):
                    intent_scores[intent] += 1
        best_intent = max(intent_scores, key=lambda name: intent_scores[name])
        return best_intent if intent_scores[best_intent] > 0 else "exploration"

    def _extract_institution_type(self, query: str) -> str | None:
        """Return the first institution type whose pattern matches, else None."""
        for inst_type, regexes in self._type_regexes.items():
            for regex in regexes:
                if regex.search(query):
                    return inst_type
        return None

    def _extract_location(self, query: str) -> str | None:
        """Return the first matched location, title-cased, else None."""
        for regex in self._location_regexes:
            match = regex.search(query)
            if match:
                return match.group(0).title()
        return None

    def _infer_location_level(self, location: str) -> str:
        """Infer whether location is city, province, or region.

        Anything not recognised as a Dutch province or region is treated
        as a city.
        """
        location_lower = location.lower()
        provinces = [
            "noord-holland", "zuid-holland", "utrecht", "gelderland",
            "brabant", "limburg", "overijssel", "friesland",
            "groningen", "drenthe", "flevoland", "zeeland",
        ]
        regions = ["randstad", "veluwe", "achterhoek", "twente"]
        if location_lower in provinces:
            return "province"
        elif location_lower in regions:
            return "region"
        else:
            return "city"

    def _extract_identifiers(self, query: str) -> dict[str, str]:
        """Extract identifier references (isil/wikidata/ghcid) from query.

        When the query contains "has", the sentinel value "exists" is stored
        instead of the matched text, signalling an existence filter rather
        than a concrete identifier value.
        NOTE(review): only the English "has" triggers existence mode; Dutch
        forms ("hebben"/"heeft") are not detected — confirm this is intended.
        """
        identifiers: dict[str, str] = {}
        query_lower = query.lower()  # hoisted: checked once per identifier type
        for id_type, regex in self._identifier_regexes.items():
            match = regex.search(query)
            if match:
                identifiers[id_type] = "exists" if "has" in query_lower else match.group(0)
        return identifiers

    def _extract_entities(self, query: str) -> list[str]:
        """Extract named entities from query (simplified NER).

        Collects quoted strings, then the detected institution type and
        location (re-running the same extractors used by decompose()).
        """
        entities: list[str] = []
        # Double-quoted spans are taken verbatim as entities.
        quoted = re.findall(r'"([^"]+)"', query)
        entities.extend(quoted)
        inst_type = self._extract_institution_type(query)
        if inst_type:
            entities.append(inst_type)
        location = self._extract_location(query)
        if location:
            entities.append(location)
        return entities

    def _infer_aggregation_type(self, query: str) -> str:
        """Infer aggregation type from query; defaults to "count".

        Matching is substring-based on the lowercased query, checked in
        priority order: average, sum, max, min.
        """
        query_lower = query.lower()
        if any(w in query_lower for w in ["gemiddeld", "average"]):
            return "average"
        elif any(w in query_lower for w in ["totaal", "total", "sum"]):
            return "sum"
        elif any(w in query_lower for w in ["maximum", "max", "meest", "most"]):
            return "max"
        elif any(w in query_lower for w in ["minimum", "min", "minst", "least"]):
            return "min"
        else:
            return "count"

    def generate_sub_task_cache_key(
        self,
        task: AtomicSubTask,
        language: str = "nl",
    ) -> str:
        """Generate a cache key for a sub-task.

        Unlike AtomicSubTask.to_cache_key, this key also mixes in the query
        language, so identical sub-tasks in different languages cache
        separately.

        Format: heritage:subtask:{type}:{hash}
        """
        params_with_lang = {**task.parameters, "lang": language}
        params_str = "&".join(f"{k}={v}" for k, v in sorted(params_with_lang.items()))
        hash_input = f"{task.task_type.value}:{params_str}"
        hash_value = hashlib.sha256(hash_input.encode()).hexdigest()[:12]
        return f"heritage:subtask:{task.task_type.value}:{hash_value}"
class AtomicCacheManager:
    """Manages atomic sub-task caching for the heritage RAG pipeline.

    Integrates with HeritageSemanticCache for storing and retrieving
    atomic sub-task results; falls back to a plain in-process dict when
    no semantic cache backend is supplied.
    """

    def __init__(self, semantic_cache: Any = None):
        """Initialize with optional semantic cache backend.

        Args:
            semantic_cache: HeritageSemanticCache instance (or None to use
                only the in-memory fallback)
        """
        self.decomposer = HeritageQueryDecomposer()
        self.semantic_cache = semantic_cache
        # In-memory fallback, keyed by the generated "heritage:subtask:..." key.
        self._subtask_cache: dict[str, Any] = {}
        # Counters surfaced by get_stats().
        self.stats = {
            "queries_decomposed": 0,
            "subtask_hits": 0,
            "subtask_misses": 0,
            "full_query_reassemblies": 0,
        }

    async def process_query(
        self,
        query: str,
        language: str = "nl",
    ) -> tuple[DecomposedQuery, dict[str, Any]]:
        """Process a query with atomic caching.

        Decomposes the query, probes the cache for every sub-task, and marks
        each hit in place on the sub-task.

        Returns:
            (decomposed query, mapping of task_key -> cached sub-result)
        """
        self.stats["queries_decomposed"] += 1
        decomposed = self.decomposer.decompose(query, language)

        cached_results: dict[str, Any] = {}
        for task in decomposed.sub_tasks:
            cache_key = self.decomposer.generate_sub_task_cache_key(task, language)
            cached_value = await self._get_cached_subtask(cache_key)
            if cached_value is not None:
                task.cached_result = cached_value
                task.cache_hit = True
                cached_results[task.task_key] = cached_value
                decomposed.partial_cache_hits += 1
                self.stats["subtask_hits"] += 1
            else:
                self.stats["subtask_misses"] += 1

        # Fully cached => the answer can be reassembled without fresh retrieval.
        decomposed.fully_cached = all(t.cache_hit for t in decomposed.sub_tasks)
        if decomposed.fully_cached:
            self.stats["full_query_reassemblies"] += 1
        return decomposed, cached_results

    async def cache_subtask_result(
        self,
        task: AtomicSubTask,
        result: Any,
        language: str = "nl",
        ttl: int = 3600,
    ) -> bool:
        """Cache a sub-task result.

        Args:
            task: The atomic sub-task
            result: Result to cache
            language: Query language
            ttl: Time-to-live in seconds. NOTE(review): not currently
                forwarded to the semantic cache backend — confirm whether
                the backend applies its own TTL.

        Returns:
            True if cached successfully
        """
        cache_key = self.decomposer.generate_sub_task_cache_key(task, language)
        if self.semantic_cache:
            try:
                await self.semantic_cache.set(
                    # BUG FIX: store under the same generated cache key that
                    # _get_cached_subtask() queries with. The original stored
                    # under task.task_key, so semantic-cache reads could never
                    # find entries written by this method.
                    query=cache_key,
                    response={"subtask_result": result},
                    intent=task.task_type.value,
                    language=language,
                )
                return True
            except Exception as e:
                # Best-effort: on backend failure fall through to memory.
                logger.warning(f"Failed to cache subtask in semantic cache: {e}")
        self._subtask_cache[cache_key] = result
        return True

    async def _get_cached_subtask(self, cache_key: str) -> Any | None:
        """Return the cached result for cache_key, or None on a miss."""
        if self.semantic_cache:
            try:
                result = await self.semantic_cache.get(query=cache_key)
                if result and "subtask_result" in result:
                    return result["subtask_result"]
            except Exception as e:
                logger.debug(f"Semantic cache lookup failed: {e}")
        # Fall back to the in-process dict.
        return self._subtask_cache.get(cache_key)

    def get_stats(self) -> dict[str, Any]:
        """Get atomic caching statistics, including the sub-task hit rate (%)."""
        total_subtasks = self.stats["subtask_hits"] + self.stats["subtask_misses"]
        hit_rate = self.stats["subtask_hits"] / total_subtasks if total_subtasks > 0 else 0
        return {
            **self.stats,
            "subtask_hit_rate": round(hit_rate * 100, 2),
            "memory_cache_size": len(self._subtask_cache),
        }
# Convenience functions
_decomposer_instance: HeritageQueryDecomposer | None = None


def get_decomposer() -> HeritageQueryDecomposer:
    """Return the process-wide decomposer, creating it on first use."""
    global _decomposer_instance
    if _decomposer_instance is not None:
        return _decomposer_instance
    _decomposer_instance = HeritageQueryDecomposer()
    return _decomposer_instance
def decompose_query(query: str, language: str = "nl") -> DecomposedQuery:
    """Convenience wrapper: decompose via the shared decomposer instance."""
    decomposer = get_decomposer()
    return decomposer.decompose(query, language)
# Example usage and testing
if __name__ == "__main__":
    # Smoke-test decomposition on a mix of Dutch and English queries.
    decomposer = HeritageQueryDecomposer()
    test_queries = [
        "Hoeveel musea zijn er in Amsterdam?",
        "Where is the Rijksmuseum located?",
        "Show me archives in Noord-Holland with ISIL codes",
        "Welke bibliotheken zijn gefuseerd sinds 2000?",
        "Find heritage institutions near Rotterdam Centraal",
        "Compare the collections of Rijksmuseum and Van Gogh Museum",
    ]
    for query in test_queries:
        print(f"\nQuery: {query}")
        decomposed = decomposer.decompose(query)
        # Summarise the extracted components, then list each sub-task.
        print(f" Intent: {decomposed.intent}")
        print(f" Institution type: {decomposed.institution_type}")
        print(f" Location: {decomposed.location}")
        print(f" Sub-tasks: {len(decomposed.sub_tasks)}")
        for task in decomposed.sub_tasks:
            print(f" - {task.task_type.value}: {task.parameters}")