glam/backend/rag/atomic_decomposer.py
2025-12-11 22:32:09 +01:00

555 lines
20 KiB
Python

"""
Atomic Query Decomposer for Heritage RAG Semantic Caching
Decomposes complex heritage queries into atomic sub-tasks for higher cache hit rates.
Based on research showing 40-70% cache hit rates with atomic decomposition vs 5-15% for full queries.
Architecture:
1. Parse query into intent + entities + constraints
2. Generate cacheable sub-task keys
3. Check cache for each sub-task independently
4. Merge cached sub-results with fresh retrievals
5. Cache new sub-results for future queries
Example decomposition:
"Hoeveel musea in Amsterdam hebben een ISIL code?"
├── IntentTask(intent="statistical", entity_type="MUSEUM")
├── LocationTask(location="Amsterdam", level="city")
├── FilterTask(filter="has_isil", value=True)
└── AggregateTask(aggregation="count")
Each sub-task can be cached independently:
- Statistical queries for "MUSEUM" entities → cacheable
- Location filtering for "Amsterdam" → cacheable
- ISIL filter application → cacheable
"""
from __future__ import annotations
import hashlib
import logging
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
logger = logging.getLogger(__name__)
class SubTaskType(str, Enum):
    """Types of atomic sub-tasks for heritage queries.

    The string values double as the prefix of the human-readable task keys
    built in HeritageQueryDecomposer.decompose (e.g. "intent:statistical",
    "type:MUSEUM") and of the generated cache keys.
    """

    INTENT_CLASSIFICATION = "intent"    # classify overall query intent
    ENTITY_EXTRACTION = "entity"        # declared but not emitted by decompose() in this module
    LOCATION_FILTER = "location"        # filter by city/province/region
    TYPE_FILTER = "type"                # filter by institution type (MUSEUM, LIBRARY, ...)
    TEMPORAL_FILTER = "temporal"        # declared but not emitted by decompose() in this module
    IDENTIFIER_FILTER = "identifier"    # filter by identifier (isil/wikidata/ghcid)
    AGGREGATION = "aggregate"           # count/sum/average/min/max over results
    RELATIONSHIP = "relation"           # relationship/network resolution
    COMPARISON = "compare"              # declared but not emitted by decompose() in this module
@dataclass
class AtomicSubTask:
    """A single atomic sub-task that can be cached independently."""

    task_type: SubTaskType
    # Human-readable unique key used for cache lookup and dependency links.
    task_key: str
    parameters: dict[str, Any]
    # task_keys of sub-tasks that must resolve before this one.
    depends_on: list[str] = field(default_factory=list)
    # Filled in by the cache layer when a lookup succeeds.
    cached_result: Any = None
    cache_hit: bool = False

    def __hash__(self) -> int:
        # Identity is determined solely by the task key.
        return hash(self.task_key)

    def to_cache_key(self) -> str:
        """Return a deterministic 16-hex-char cache key for this sub-task."""
        sorted_items = sorted(self.parameters.items())
        encoded = "&".join(f"{k}={v}" for k, v in sorted_items)
        digest = hashlib.sha256(f"{self.task_type.value}:{encoded}".encode())
        return digest.hexdigest()[:16]
@dataclass
class DecomposedQuery:
    """A query decomposed into atomic sub-tasks."""

    original_query: str
    language: str
    sub_tasks: list[AtomicSubTask]
    # Components extracted from the raw query text.
    intent: str | None = None
    entities: list[str] = field(default_factory=list)
    location: str | None = None
    institution_type: str | None = None
    temporal_range: tuple[str, str] | None = None
    # Execution metadata, updated by the cache manager.
    fully_cached: bool = False
    partial_cache_hits: int = 0

    def get_cache_keys(self) -> list[str]:
        """Return the deterministic cache key of every sub-task, in order."""
        keys: list[str] = []
        for sub_task in self.sub_tasks:
            keys.append(sub_task.to_cache_key())
        return keys
class HeritageQueryDecomposer:
    """Decomposes heritage queries into cacheable atomic sub-tasks.

    Uses pattern matching and linguistic analysis to break down
    complex queries into independent, cacheable components.

    Strategies:
    1. Intent decomposition - Separate intent from parameters
    2. Entity decomposition - Each entity is a separate lookup
    3. Filter decomposition - Each filter is independent
    4. Aggregation decomposition - Aggregation separate from data
    """

    def __init__(self) -> None:
        # Intent detection patterns (Dutch + English keyword regexes).
        self.intent_patterns = {
            "statistical": [
                r"hoeveel\b", r"how many\b", r"aantal\b", r"count\b",
                r"totaal\b", r"total\b", r"gemiddeld\b", r"average\b",
            ],
            "geographic": [
                r"waar\b", r"where\b", r"locatie\b", r"location\b",
                r"in de buurt\b", r"nearby\b", r"kaart\b", r"map\b",
            ],
            "temporal": [
                r"wanneer\b", r"when\b", r"opgericht\b", r"founded\b",
                r"geschiedenis\b", r"history\b", r"tijdlijn\b", r"timeline\b",
            ],
            "entity_lookup": [
                r"wat is\b", r"what is\b", r"informatie over\b", r"about\b",
                r"details\b", r"tell me\b", r"vertel\b",
            ],
            "relational": [
                r"relatie\b", r"relationship\b", r"connected\b", r"verbonden\b",
                r"netwerk\b", r"network\b", r"fusie\b", r"merger\b",
            ],
            "comparative": [
                r"vergelijk\b", r"compare\b", r"versus\b", r"verschil\b",
                r"difference\b", r"between\b", r"tussen\b",
            ],
        }
        # Institution type patterns (Dutch + English); first matching type wins.
        self.type_patterns = {
            "MUSEUM": [r"museum", r"musea", r"museums"],
            "LIBRARY": [r"bibliothe[e]?k", r"librar(y|ies)", r"bieb"],
            "ARCHIVE": [r"archief", r"archiv(e|es)", r"archieven"],
            "GALLERY": [r"galerie", r"galler(y|ies)", r"kunsthal"],
            "RESEARCH_CENTER": [r"onderzoek", r"research", r"kenniscentrum"],
            "BOTANICAL_ZOO": [r"dierentuin", r"zoo", r"botanisch", r"artis"],
            "HOLY_SITES": [r"kerk", r"church", r"kathedraal", r"cathedral"],
        }
        # Location patterns, checked in order; first match wins.
        self.location_patterns = [
            # Dutch cities
            r"\b(amsterdam|rotterdam|den haag|utrecht|eindhoven|tilburg|groningen|almere|breda|nijmegen|apeldoorn|haarlem|arnhem|zaanstad|amersfoort|haarlemmermeer|enschede|the hague|maastricht|leiden|dordrecht|zoetermeer|zwolle|deventer|delft|alkmaar|leeuwarden|venlo|hilversum|middelburg|sittard|assen|helmond|lelystad|emmen)\b",
            # Dutch provinces
            r"\b(noord-holland|zuid-holland|utrecht|gelderland|brabant|limburg|overijssel|friesland|groningen|drenthe|flevoland|zeeland)\b",
            # Dutch regions
            r"\b(randstad|veluwe|achterhoek|twente|brabant)\b",
        ]
        # Identifier patterns
        self.identifier_patterns = {
            "isil": r"isil|NL-[A-Z][a-z]+[A-Z]*",
            "wikidata": r"Q\d+|wikidata",
            "ghcid": r"NL-[A-Z]{2}-[A-Z]{3}-[A-Z]-",
        }
        # Pre-compile once; the compiled regexes are reused for every query.
        self._compile_patterns()

    def _compile_patterns(self) -> None:
        """Compile regex patterns for efficiency (case-insensitive)."""
        self._intent_regexes = {
            intent: [re.compile(p, re.IGNORECASE) for p in patterns]
            for intent, patterns in self.intent_patterns.items()
        }
        self._type_regexes = {
            inst_type: [re.compile(p, re.IGNORECASE) for p in patterns]
            for inst_type, patterns in self.type_patterns.items()
        }
        self._location_regexes = [
            re.compile(p, re.IGNORECASE) for p in self.location_patterns
        ]
        self._identifier_regexes = {
            id_type: re.compile(p, re.IGNORECASE)
            for id_type, p in self.identifier_patterns.items()
        }

    def decompose(self, query: str, language: str = "nl") -> DecomposedQuery:
        """Decompose a query into atomic sub-tasks.

        Args:
            query: Natural language query
            language: Query language (nl, en)

        Returns:
            DecomposedQuery with atomic sub-tasks
        """
        sub_tasks: list[AtomicSubTask] = []

        # 1. Detect intent (always present; the other sub-tasks depend on it).
        intent = self._detect_intent(query)
        intent_task = AtomicSubTask(
            task_type=SubTaskType.INTENT_CLASSIFICATION,
            task_key=f"intent:{intent}",
            parameters={"intent": intent, "language": language},
        )
        sub_tasks.append(intent_task)

        # 2. Extract institution type
        inst_type = self._extract_institution_type(query)
        if inst_type:
            sub_tasks.append(AtomicSubTask(
                task_type=SubTaskType.TYPE_FILTER,
                task_key=f"type:{inst_type}",
                parameters={"institution_type": inst_type},
                # BUG FIX: depends_on entries are task_keys (see the
                # aggregation task below, which collects t.task_key). The
                # previous bare literal "intent" never matched the intent
                # task's actual key "intent:<name>".
                depends_on=[intent_task.task_key],
            ))

        # 3. Extract location
        location = self._extract_location(query)
        if location:
            sub_tasks.append(AtomicSubTask(
                task_type=SubTaskType.LOCATION_FILTER,
                task_key=f"location:{location.lower()}",
                parameters={"location": location, "level": self._infer_location_level(location)},
                depends_on=[intent_task.task_key],
            ))

        # 4. Extract identifier filters
        identifiers = self._extract_identifiers(query)
        for id_type, id_value in identifiers.items():
            sub_tasks.append(AtomicSubTask(
                task_type=SubTaskType.IDENTIFIER_FILTER,
                task_key=f"identifier:{id_type}:{id_value}",
                parameters={"identifier_type": id_type, "value": id_value},
                depends_on=[intent_task.task_key],
            ))

        # 5. Statistical queries get an aggregation step that depends on
        #    every filter sub-task gathered so far.
        if intent == "statistical":
            agg_type = self._infer_aggregation_type(query)
            sub_tasks.append(AtomicSubTask(
                task_type=SubTaskType.AGGREGATION,
                task_key=f"aggregate:{agg_type}",
                parameters={"aggregation": agg_type},
                depends_on=[
                    t.task_key for t in sub_tasks
                    if t.task_type != SubTaskType.INTENT_CLASSIFICATION
                ],
            ))

        # 6. Relational queries get a network-resolution step that depends
        #    on the type/location filters only.
        if intent == "relational":
            sub_tasks.append(AtomicSubTask(
                task_type=SubTaskType.RELATIONSHIP,
                task_key="relation:network",
                parameters={"relation_type": "network"},
                depends_on=[
                    t.task_key for t in sub_tasks
                    if t.task_type in (SubTaskType.TYPE_FILTER, SubTaskType.LOCATION_FILTER)
                ],
            ))

        return DecomposedQuery(
            original_query=query,
            language=language,
            sub_tasks=sub_tasks,
            intent=intent,
            entities=self._extract_entities(query),
            location=location,
            institution_type=inst_type,
        )

    def _detect_intent(self, query: str) -> str:
        """Detect query intent using pattern matching.

        Each matching keyword pattern scores one point for its intent; the
        highest score wins (ties go to the first-declared intent). Falls
        back to "exploration" when nothing matches at all.
        """
        intent_scores: dict[str, int] = {intent: 0 for intent in self.intent_patterns}
        for intent, regexes in self._intent_regexes.items():
            for regex in regexes:
                if regex.search(query):
                    intent_scores[intent] += 1
        best_intent = max(intent_scores, key=lambda name: intent_scores[name])
        return best_intent if intent_scores[best_intent] > 0 else "exploration"

    def _extract_institution_type(self, query: str) -> str | None:
        """Return the first institution type whose pattern matches, else None."""
        for inst_type, regexes in self._type_regexes.items():
            for regex in regexes:
                if regex.search(query):
                    return inst_type
        return None

    def _extract_location(self, query: str) -> str | None:
        """Return the first matched location, title-cased, else None."""
        for regex in self._location_regexes:
            match = regex.search(query)
            if match:
                return match.group(0).title()
        return None

    def _infer_location_level(self, location: str) -> str:
        """Infer whether location is city, province, or region.

        Anything not recognised as a Dutch province or region is treated
        as a city.
        """
        location_lower = location.lower()
        provinces = [
            "noord-holland", "zuid-holland", "utrecht", "gelderland",
            "brabant", "limburg", "overijssel", "friesland",
            "groningen", "drenthe", "flevoland", "zeeland",
        ]
        regions = ["randstad", "veluwe", "achterhoek", "twente"]
        if location_lower in provinces:
            return "province"
        elif location_lower in regions:
            return "region"
        else:
            return "city"

    def _extract_identifiers(self, query: str) -> dict[str, str]:
        """Extract identifier references (isil/wikidata/ghcid) from query.

        When the query contains "has", the sentinel value "exists" is stored
        instead of the matched text, signalling an existence filter rather
        than a concrete identifier value.
        NOTE(review): only the English "has" triggers existence mode; Dutch
        forms ("hebben"/"heeft") are not detected — confirm this is intended.
        """
        identifiers: dict[str, str] = {}
        query_lower = query.lower()  # hoisted: checked once per identifier type
        for id_type, regex in self._identifier_regexes.items():
            match = regex.search(query)
            if match:
                identifiers[id_type] = "exists" if "has" in query_lower else match.group(0)
        return identifiers

    def _extract_entities(self, query: str) -> list[str]:
        """Extract named entities from query (simplified NER).

        Collects quoted strings, then the detected institution type and
        location (re-running the same extractors used by decompose()).
        """
        entities: list[str] = []
        # Double-quoted spans are taken verbatim as entities.
        quoted = re.findall(r'"([^"]+)"', query)
        entities.extend(quoted)
        inst_type = self._extract_institution_type(query)
        if inst_type:
            entities.append(inst_type)
        location = self._extract_location(query)
        if location:
            entities.append(location)
        return entities

    def _infer_aggregation_type(self, query: str) -> str:
        """Infer aggregation type from query; defaults to "count".

        Matching is substring-based on the lowercased query, checked in
        priority order: average, sum, max, min.
        """
        query_lower = query.lower()
        if any(w in query_lower for w in ["gemiddeld", "average"]):
            return "average"
        elif any(w in query_lower for w in ["totaal", "total", "sum"]):
            return "sum"
        elif any(w in query_lower for w in ["maximum", "max", "meest", "most"]):
            return "max"
        elif any(w in query_lower for w in ["minimum", "min", "minst", "least"]):
            return "min"
        else:
            return "count"

    def generate_sub_task_cache_key(
        self,
        task: AtomicSubTask,
        language: str = "nl",
    ) -> str:
        """Generate a cache key for a sub-task.

        Unlike AtomicSubTask.to_cache_key, this key also mixes in the query
        language, so identical sub-tasks in different languages cache
        separately.

        Format: heritage:subtask:{type}:{hash}
        """
        params_with_lang = {**task.parameters, "lang": language}
        params_str = "&".join(f"{k}={v}" for k, v in sorted(params_with_lang.items()))
        hash_input = f"{task.task_type.value}:{params_str}"
        hash_value = hashlib.sha256(hash_input.encode()).hexdigest()[:12]
        return f"heritage:subtask:{task.task_type.value}:{hash_value}"
class AtomicCacheManager:
    """Manages atomic sub-task caching for the heritage RAG pipeline.

    Integrates with HeritageSemanticCache for storing and retrieving
    atomic sub-task results; falls back to a plain in-process dict when
    no semantic cache backend is supplied.
    """

    def __init__(self, semantic_cache: Any = None):
        """Initialize with optional semantic cache backend.

        Args:
            semantic_cache: HeritageSemanticCache instance (or None to use
                only the in-memory fallback)
        """
        self.decomposer = HeritageQueryDecomposer()
        self.semantic_cache = semantic_cache
        # In-memory fallback, keyed by the generated "heritage:subtask:..." key.
        self._subtask_cache: dict[str, Any] = {}
        # Counters surfaced by get_stats().
        self.stats = {
            "queries_decomposed": 0,
            "subtask_hits": 0,
            "subtask_misses": 0,
            "full_query_reassemblies": 0,
        }

    async def process_query(
        self,
        query: str,
        language: str = "nl",
    ) -> tuple[DecomposedQuery, dict[str, Any]]:
        """Process a query with atomic caching.

        Decomposes the query, probes the cache for every sub-task, and marks
        each hit in place on the sub-task.

        Returns:
            (decomposed query, mapping of task_key -> cached sub-result)
        """
        self.stats["queries_decomposed"] += 1
        decomposed = self.decomposer.decompose(query, language)

        cached_results: dict[str, Any] = {}
        for task in decomposed.sub_tasks:
            cache_key = self.decomposer.generate_sub_task_cache_key(task, language)
            cached_value = await self._get_cached_subtask(cache_key)
            if cached_value is not None:
                task.cached_result = cached_value
                task.cache_hit = True
                cached_results[task.task_key] = cached_value
                decomposed.partial_cache_hits += 1
                self.stats["subtask_hits"] += 1
            else:
                self.stats["subtask_misses"] += 1

        # Fully cached => the answer can be reassembled without fresh retrieval.
        decomposed.fully_cached = all(t.cache_hit for t in decomposed.sub_tasks)
        if decomposed.fully_cached:
            self.stats["full_query_reassemblies"] += 1
        return decomposed, cached_results

    async def cache_subtask_result(
        self,
        task: AtomicSubTask,
        result: Any,
        language: str = "nl",
        ttl: int = 3600,
    ) -> bool:
        """Cache a sub-task result.

        Args:
            task: The atomic sub-task
            result: Result to cache
            language: Query language
            ttl: Time-to-live in seconds. NOTE(review): not currently
                forwarded to the semantic cache backend — confirm whether
                the backend applies its own TTL.

        Returns:
            True if cached successfully
        """
        cache_key = self.decomposer.generate_sub_task_cache_key(task, language)
        if self.semantic_cache:
            try:
                await self.semantic_cache.set(
                    # BUG FIX: store under the same generated cache key that
                    # _get_cached_subtask() queries with. The original stored
                    # under task.task_key, so semantic-cache reads could never
                    # find entries written by this method.
                    query=cache_key,
                    response={"subtask_result": result},
                    intent=task.task_type.value,
                    language=language,
                )
                return True
            except Exception as e:
                # Best-effort: on backend failure fall through to memory.
                logger.warning(f"Failed to cache subtask in semantic cache: {e}")
        self._subtask_cache[cache_key] = result
        return True

    async def _get_cached_subtask(self, cache_key: str) -> Any | None:
        """Return the cached result for cache_key, or None on a miss."""
        if self.semantic_cache:
            try:
                result = await self.semantic_cache.get(query=cache_key)
                if result and "subtask_result" in result:
                    return result["subtask_result"]
            except Exception as e:
                logger.debug(f"Semantic cache lookup failed: {e}")
        # Fall back to the in-process dict.
        return self._subtask_cache.get(cache_key)

    def get_stats(self) -> dict[str, Any]:
        """Get atomic caching statistics, including the sub-task hit rate (%)."""
        total_subtasks = self.stats["subtask_hits"] + self.stats["subtask_misses"]
        hit_rate = self.stats["subtask_hits"] / total_subtasks if total_subtasks > 0 else 0
        return {
            **self.stats,
            "subtask_hit_rate": round(hit_rate * 100, 2),
            "memory_cache_size": len(self._subtask_cache),
        }
# Convenience functions
_decomposer_instance: HeritageQueryDecomposer | None = None


def get_decomposer() -> HeritageQueryDecomposer:
    """Return the process-wide decomposer, creating it on first use."""
    global _decomposer_instance
    if _decomposer_instance is not None:
        return _decomposer_instance
    _decomposer_instance = HeritageQueryDecomposer()
    return _decomposer_instance
def decompose_query(query: str, language: str = "nl") -> DecomposedQuery:
    """Convenience wrapper: decompose via the shared decomposer instance."""
    decomposer = get_decomposer()
    return decomposer.decompose(query, language)
# Example usage and testing
if __name__ == "__main__":
    # Smoke-test decomposition on a mix of Dutch and English queries.
    decomposer = HeritageQueryDecomposer()
    test_queries = [
        "Hoeveel musea zijn er in Amsterdam?",
        "Where is the Rijksmuseum located?",
        "Show me archives in Noord-Holland with ISIL codes",
        "Welke bibliotheken zijn gefuseerd sinds 2000?",
        "Find heritage institutions near Rotterdam Centraal",
        "Compare the collections of Rijksmuseum and Van Gogh Museum",
    ]
    for query in test_queries:
        print(f"\nQuery: {query}")
        decomposed = decomposer.decompose(query)
        # Summarise the extracted components, then list each sub-task.
        print(f" Intent: {decomposed.intent}")
        print(f" Institution type: {decomposed.institution_type}")
        print(f" Location: {decomposed.location}")
        print(f" Sub-tasks: {len(decomposed.sub_tasks)}")
        for task in decomposed.sub_tasks:
            print(f" - {task.task_type.value}: {task.parameters}")