# glam/backend/rag/cache_config.py
# Last modified: 2025-12-11 22:32:09 +01:00
# 285 lines, 8.6 KiB, Python
"""
Cache Configuration for Heritage RAG Semantic Caching
Configuration settings for the hybrid semantic cache system including:
- Redis/Valkey connection settings
- Distance thresholds for semantic matching
- TTL policies for cache invalidation
- Quality control filters
Based on research from:
- DeepLearning.AI Semantic Caching course
- Banking RAG case study (99% -> 3.8% false positive reduction)
- GPTCache and vCache patterns
"""
from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from functools import lru_cache
from typing import Any

from pydantic import Field
from pydantic_settings import BaseSettings
class CacheBackend(str, Enum):
    """Supported storage backends for the semantic cache."""

    # Production backends (Valkey is the Redis-compatible fork)
    REDIS = "redis"
    VALKEY = "valkey"
    # In-memory backend, intended for test suites only
    MEMORY = "memory"
class DistanceMetric(str, Enum):
    """Vector-distance metrics usable for semantic similarity matching."""

    COSINE = "cosine"
    L2 = "l2"
    # Inner product (dot-product similarity)
    IP = "ip"
class CacheSettings(BaseSettings):
    """Semantic cache configuration, loaded from HERITAGE_CACHE_* environment variables."""

    # --- Redis / Valkey connection ---
    cache_backend: CacheBackend = CacheBackend.VALKEY
    redis_url: str = "redis://localhost:6379"
    redis_password: str | None = None
    redis_db: int = 0

    # --- Cache index configuration ---
    cache_index_name: str = "heritage_rag_cache"
    cache_prefix: str = "heritage:cache:"

    # --- Embedding model (dimension must match redis/langcache-embed-v1) ---
    cache_embedding_model: str = "redis/langcache-embed-v1"
    cache_embedding_dim: int = 768

    # --- Distance thresholds (critical for false-positive prevention) ---
    # Lower = stricter matching, fewer false positives.
    # 0.1 is very strict; 0.3 allows more semantic similarity.
    # For the heritage domain, 0.25 balances accuracy with hit rate.
    distance_threshold: float = 0.25  # Cosine distance threshold
    distance_metric: DistanceMetric = DistanceMetric.COSINE

    # --- Quality-control thresholds ---
    min_query_length: int = 10  # Skip very short queries
    max_query_length: int = 500  # Skip extremely long queries

    # --- TTL policies (seconds) ---
    ttl_default: int = 86400  # 24 hours
    ttl_statistical: int = 3600  # 1 hour (counts may change)
    ttl_temporal: int = 86400  # 24 hours (historical data stable)
    ttl_geographic: int = 604800  # 7 days (locations very stable)
    ttl_entity: int = 604800  # 7 days (entity details stable)

    # --- Cache behavior toggles ---
    cache_enabled: bool = True
    validation_enabled: bool = True  # Enable cross-encoder validation of hits
    atomic_decomposition_enabled: bool = True  # Enable sub-query caching

    # --- Warmup configuration (see FAQ_CATEGORIES for the seed queries) ---
    warmup_on_startup: bool = True
    warmup_batch_size: int = 50

    # --- Metrics & observability ---
    metrics_enabled: bool = True
    log_cache_hits: bool = True
    log_cache_misses: bool = True

    class Config:
        # Pydantic v1-style settings config: env vars are read with this
        # prefix (e.g. HERITAGE_CACHE_REDIS_URL); unknown keys are ignored.
        env_prefix = "HERITAGE_CACHE_"
        env_file = ".env"
        extra = "ignore"
@dataclass
class CacheEntry:
    """A single cached query/response pair plus its bookkeeping metadata."""

    # Core payload
    query: str
    query_hash: str
    response: dict[str, Any]
    intent: str
    language: str
    sources: list[str]

    # Filterable metadata for narrowing cache lookups
    institution_type: str | None = None
    country_code: str | None = None
    region_code: str | None = None

    # Provenance / lifecycle
    created_at: str = ""  # presumably an ISO timestamp — set by the writer; TODO confirm format
    ttl_seconds: int = 86400  # default 24 h; see get_ttl_for_intent for per-intent values
    hit_count: int = 0

    # Quality metrics
    confidence: float = 0.0
    validation_score: float | None = None  # populated only when validation_enabled
@dataclass
class CacheStats:
    """Running performance counters for the semantic cache."""

    # Raw counters
    total_queries: int = 0
    cache_hits: int = 0
    cache_misses: int = 0
    validation_passes: int = 0
    validation_failures: int = 0

    # Latency tracking (milliseconds)
    avg_hit_latency_ms: float = 0.0
    avg_miss_latency_ms: float = 0.0

    # Derived quality metrics; call the update_* helpers to refresh them
    false_positive_rate: float = 0.0
    hit_rate: float = 0.0

    def update_hit_rate(self) -> None:
        """Refresh ``hit_rate`` from the hit/query counters."""
        total = self.total_queries
        if total > 0:
            self.hit_rate = self.cache_hits / total

    def update_false_positive_rate(self) -> None:
        """Refresh ``false_positive_rate`` from the validation counters."""
        validated = self.validation_passes + self.validation_failures
        if validated > 0:
            self.false_positive_rate = self.validation_failures / validated
# Heritage-specific skip patterns for cache bypass.
# Queries matching any of these should skip the semantic cache entirely
# (see should_bypass_cache()). Each alternation is wrapped in \b word
# boundaries so that e.g. "now" does not fire on "known", "api" does not
# fire on "rapid", and "my" does not fire on "mystery".
CACHE_BYPASS_PATTERNS = [
    # Temporal/dynamic queries (results change frequently)
    r"\b(?:vandaag|today|gisteren|yesterday|nu|now)\b",
    r"\b(?:recent|latest|newest|nieuwste|current|actueel|huidige)\b",
    r"\b(?:dit jaar|this year|vorige week|last week)\b",
    # User-specific queries
    r"\b(?:mijn|my|ik heb|i have)\b",
    # Code/technical queries (complex, not cacheable)
    r"\b(?:sparql|query|api|endpoint|code)\b",
    # Highly specific numeric queries
    r"\b(?:exact|precies|specifically)\b",
]
# Heritage FAQ categories for cache warmup.
# Representative queries per intent category, used to pre-populate the
# semantic cache on startup (see CacheSettings.warmup_on_startup).
# The Dutch/English mix mirrors expected bilingual heritage-portal traffic.
FAQ_CATEGORIES = {
    "statistical": [
        "Hoeveel musea zijn er in Nederland?",
        "Hoeveel archieven heeft Noord-Holland?",
        "What is the total number of heritage institutions?",
        "How many libraries are there in Amsterdam?",
        "Hoeveel erfgoedinstellingen heeft Limburg?",
    ],
    "geographic": [
        "Where is the Rijksmuseum located?",
        "Welke musea zijn er in Rotterdam?",
        "Find archives in Utrecht province",
        "Show heritage institutions near Amsterdam Centraal",
        "Waar ligt het Nationaal Archief?",
    ],
    "entity_lookup": [
        "What is the ISIL code of the Rijksmuseum?",
        "Tell me about the Nationaal Archief",
        "Information about Eye Filmmuseum",
        "Details van het Zuiderzeemuseum",
        "Wat is het adres van de Koninklijke Bibliotheek?",
    ],
    "relational": [
        "Which institutions merged to form Noord-Hollands Archief?",
        "What museums are part of the Rijkscollectie?",
        "Show relationships between archives in Amsterdam",
        "Welke instellingen behoren tot Collectie Nederland?",
    ],
    "temporal": [
        "When was the Rijksmuseum founded?",
        "Which archives closed in the past decade?",
        "Timeline of museum mergers in the Netherlands",
        "Wanneer is het Stedelijk Museum opgericht?",
    ],
}
# Strategic distractors for cache boundary testing.
# Each pair is semantically similar but must NOT match in the cache:
# a semantic-cache hit across a pair would be a false positive. Useful
# for tuning CacheSettings.distance_threshold.
DISTRACTOR_PAIRS = [
    # Same intent, different entity
    ("Hoeveel musea zijn er in Amsterdam?", "Hoeveel musea zijn er in Rotterdam?"),
    ("Where is the Rijksmuseum?", "Where is the Van Gogh Museum?"),
    # Same entity, different intent
    ("Where is the Nationaal Archief?", "When was the Nationaal Archief founded?"),
    ("How many items in the Rijksmuseum?", "What type is the Rijksmuseum?"),
    # Similar phrasing, different meaning
    ("Archives in Amsterdam", "Archives about Amsterdam"),
    ("Museums with ISIL codes", "Museum ISIL code lookup"),
]
@lru_cache(maxsize=1)
def get_cache_settings() -> CacheSettings:
    """Return the process-wide cache settings singleton.

    The settings are read from the environment on first call and memoised
    via ``lru_cache`` so the documented singleton behaviour actually holds;
    previously a fresh ``CacheSettings`` instance was built on every call.

    Returns:
        The shared ``CacheSettings`` instance.
    """
    return CacheSettings()
def get_ttl_for_intent(intent: str, settings: CacheSettings | None = None) -> int:
    """Resolve the cache TTL (in seconds) for a query intent.

    Args:
        intent: Query intent label (statistical, geographic, etc.).
        settings: Optional cache settings override; defaults to the
            process-wide settings.

    Returns:
        TTL in seconds; unknown intents fall back to ``ttl_default``.
    """
    cfg = get_cache_settings() if settings is None else settings
    per_intent = {
        "statistical": cfg.ttl_statistical,
        "temporal": cfg.ttl_temporal,
        "geographic": cfg.ttl_geographic,
        "entity_lookup": cfg.ttl_entity,
        "relational": cfg.ttl_default,
        "comparative": cfg.ttl_default,
        "exploration": cfg.ttl_default,
    }
    return per_intent.get(intent, cfg.ttl_default)
def should_bypass_cache(query: str, settings: CacheSettings | None = None) -> bool:
    """Decide whether a query must skip the semantic cache.

    Bypassed queries are those whose cached answers would be wrong or
    useless: temporal/dynamic queries, user-specific queries,
    technical/code queries, and anything outside the configured
    length bounds.

    Args:
        query: The query string to check.
        settings: Optional cache settings override; defaults to the
            process-wide settings.

    Returns:
        True when the cache should be bypassed.
    """
    import re

    cfg = get_cache_settings() if settings is None else settings

    # Length guard: too short or too long is not worth caching/matching.
    if not (cfg.min_query_length <= len(query) <= cfg.max_query_length):
        return True

    # Any bypass pattern firing means the answer is volatile or personal.
    return any(
        re.search(pattern, query, re.IGNORECASE)
        for pattern in CACHE_BYPASS_PATTERNS
    )