glam/backend/rag/metrics.py
kempersc 99dc608826 Refactor RAG to template-based SPARQL generation
Major architectural changes based on Formica et al. (2023) research:
- Add TemplateClassifier for deterministic SPARQL template matching
- Add SlotExtractor with synonym resolution for slot values
- Add TemplateInstantiator using Jinja2 for query rendering
- Refactor dspy_heritage_rag.py to use template system
- Update main.py with streamlined pipeline
- Fix semantic_router.py ordering issues
- Add comprehensive metrics tracking

Template-based approach achieves 65% precision vs 10% LLM-only
per Formica et al. research on SPARQL generation.
2026-01-07 22:04:43 +01:00

719 lines
24 KiB
Python

"""
Prometheus Metrics for Heritage RAG API
Exposes metrics for monitoring template-based SPARQL generation,
session management, caching, and overall API performance.
Metrics exposed:
- rag_queries_total: Total queries by type (template/llm), status, endpoint
- rag_template_hits_total: Template SPARQL hits by template_id
- rag_template_tier_total: Template matching by tier (pattern/embedding/rag/llm)
- rag_query_duration_seconds: Query latency histogram
- rag_session_active: Active sessions gauge
- rag_cache_hits_total: Cache hit/miss counter
- rag_atomic_cache_total: Atomic sub-task cache hits/misses
- rag_atomic_subtasks_total: Sub-task cache operations
- rag_connection_pool_size: Connection pool utilization gauge
- rag_embedding_warmup_seconds: Embedding model warmup time
Usage:
from backend.rag.metrics import (
record_query, record_atomic_cache, record_template_tier,
create_metrics_endpoint, PROMETHEUS_AVAILABLE
)
# Record a query
record_query(
endpoint="dspy_query",
template_used=True,
template_id="count_by_province",
cache_hit=False,
status="success",
duration_seconds=1.5
)
# Record template tier
record_template_tier(tier="pattern", template_id="list_by_city")
# Record atomic cache stats
record_atomic_cache(
query_hit=False,
subtask_hits=3,
subtask_misses=1,
fully_assembled=False
)
"""
from __future__ import annotations
import logging
from functools import lru_cache
from typing import Any
logger = logging.getLogger(__name__)
# ============================================================================
# Prometheus Client Import (Lazy/Optional)
# ============================================================================
PROMETHEUS_AVAILABLE = False
_prometheus_client = None
try:
import prometheus_client as _prometheus_client
PROMETHEUS_AVAILABLE = True
logger.info("Prometheus metrics enabled")
except ImportError:
logger.warning("prometheus_client not installed - metrics disabled")
# ============================================================================
# Metric Initialization
# ============================================================================
def _init_metrics():
    """Create and register all Prometheus metric objects.

    Runs once at module import. Returns an empty dict when the
    prometheus_client package is unavailable so every recording helper
    can cheaply no-op.

    Returns:
        Dict mapping internal metric keys to Prometheus collector objects.
    """
    if not PROMETHEUS_AVAILABLE or _prometheus_client is None:
        return {}
    pc = _prometheus_client
    metrics = {}

    # --- Query-level metrics ---------------------------------------------
    metrics["query_counter"] = pc.Counter(
        "rag_queries_total",
        "Total RAG queries processed",
        labelnames=["endpoint", "method", "status"],
    )
    metrics["template_hit_counter"] = pc.Counter(
        "rag_template_hits_total",
        "Template SPARQL hits by template ID",
        labelnames=["template_id", "intent"],
    )
    metrics["cache_counter"] = pc.Counter(
        "rag_cache_total",
        "Cache hits and misses",
        labelnames=["result"],
    )
    metrics["query_duration"] = pc.Histogram(
        "rag_query_duration_seconds",
        "Query processing time in seconds",
        labelnames=["endpoint", "method"],
        buckets=(0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0),
    )

    # --- Template matching tier metrics ----------------------------------
    metrics["template_tier_counter"] = pc.Counter(
        "rag_template_tier_total",
        "Template matching attempts by tier",
        labelnames=["tier", "matched"],  # tier: pattern, embedding, rag, llm
    )
    metrics["template_matching_duration"] = pc.Histogram(
        "rag_template_matching_seconds",
        "Time to match query to template",
        labelnames=["tier", "matched"],
        buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0),
    )

    # --- Atomic sub-task cache metrics ------------------------------------
    metrics["atomic_query_counter"] = pc.Counter(
        "rag_atomic_queries_total",
        "Atomic decomposition query attempts",
        labelnames=["result"],  # full_hit, partial_hit, miss
    )
    metrics["atomic_subtask_counter"] = pc.Counter(
        "rag_atomic_subtasks_total",
        "Atomic sub-task cache operations",
        labelnames=["operation"],  # hit, miss, cached
    )
    metrics["atomic_reassembly_counter"] = pc.Counter(
        "rag_atomic_reassemblies_total",
        "Full query reassemblies from cached sub-tasks",
    )
    metrics["atomic_subtask_hit_rate"] = pc.Gauge(
        "rag_atomic_subtask_hit_rate",
        "Current atomic sub-task cache hit rate (0-1)",
    )

    # --- Connection pool metrics ------------------------------------------
    metrics["connection_pool_size"] = pc.Gauge(
        "rag_connection_pool_size",
        "Current connection pool size by client type",
        labelnames=["client"],  # sparql, postgis (ducklake removed from RAG)
    )
    metrics["connection_pool_available"] = pc.Gauge(
        "rag_connection_pool_available",
        "Available connections in pool by client type",
        labelnames=["client"],
    )

    # --- Warmup / initialization metrics ----------------------------------
    metrics["embedding_warmup_duration"] = pc.Gauge(
        "rag_embedding_warmup_seconds",
        "Time taken to warm up embedding model",
        labelnames=["model"],
    )
    metrics["template_embedding_warmup_duration"] = pc.Gauge(
        "rag_template_embedding_warmup_seconds",
        "Time taken to pre-compute template embeddings",
    )
    metrics["warmup_status"] = pc.Gauge(
        "rag_warmup_complete",
        "Whether warmup is complete (1) or not (0)",
        labelnames=["component"],  # embedding_model, template_embeddings
    )

    # --- Session metrics ---------------------------------------------------
    metrics["active_sessions_gauge"] = pc.Gauge(
        "rag_sessions_active",
        "Number of active conversation sessions",
    )
    return metrics
# Initialize metrics at module load
_metrics = _init_metrics()
# ============================================================================
# Helper Functions
# ============================================================================
def record_query(
    endpoint: str,
    template_used: bool,
    template_id: str | None,
    cache_hit: bool,
    status: str,
    duration_seconds: float,
    intent: str | None = None,
) -> None:
    """Record metrics for a completed query.

    Args:
        endpoint: API endpoint name (e.g., "dspy_query", "dspy_query_stream")
        template_used: Whether template SPARQL was used vs LLM generation
        template_id: Template ID if template was used
        cache_hit: Whether response was served from cache
        status: Query status ("success", "error", "timeout")
        duration_seconds: Total query duration in seconds
        intent: Query intent classification if available
    """
    if not PROMETHEUS_AVAILABLE or not _metrics:
        return

    generation_method = "template" if template_used else "llm"

    # Query-level counter and latency histogram share the same labels.
    _metrics["query_counter"].labels(
        endpoint=endpoint, method=generation_method, status=status
    ).inc()
    _metrics["query_duration"].labels(
        endpoint=endpoint, method=generation_method
    ).observe(duration_seconds)

    # Per-template counter, only when a template actually fired.
    if template_used and template_id:
        _metrics["template_hit_counter"].labels(
            template_id=template_id, intent=intent or "unknown"
        ).inc()

    cache_result = "hit" if cache_hit else "miss"
    _metrics["cache_counter"].labels(result=cache_result).inc()
def record_template_matching(matched: bool, duration_seconds: float, tier: str = "unknown") -> None:
    """Record a template matching attempt.

    Thin compatibility wrapper around record_template_tier(), which
    updates the same two collectors (rag_template_tier_total counter and
    rag_template_matching_seconds histogram) with identical labels. The
    previous body duplicated that logic verbatim; delegating keeps the
    two entry points from drifting apart.

    Args:
        matched: Whether a template was successfully matched
        duration_seconds: Time taken to attempt template matching
        tier: Which matching tier was used (pattern, embedding, rag, llm)
    """
    record_template_tier(tier=tier, matched=matched, duration_seconds=duration_seconds)
def set_active_sessions(count: int) -> None:
    """Set the active-sessions gauge to an absolute value.

    Args:
        count: Current number of active sessions
    """
    if PROMETHEUS_AVAILABLE and _metrics:
        _metrics["active_sessions_gauge"].set(count)
def increment_active_sessions() -> None:
    """Bump the active-sessions gauge up by one."""
    if PROMETHEUS_AVAILABLE and _metrics:
        _metrics["active_sessions_gauge"].inc()
def decrement_active_sessions() -> None:
    """Bump the active-sessions gauge down by one."""
    if PROMETHEUS_AVAILABLE and _metrics:
        _metrics["active_sessions_gauge"].dec()
# ============================================================================
# Template Tier Metrics (NEW)
# ============================================================================
def record_template_tier(
    tier: str,
    matched: bool,
    template_id: str | None = None,
    duration_seconds: float | None = None,
) -> None:
    """Record which template matching tier handled a query.

    Args:
        tier: Matching tier - "pattern", "embedding", "rag", or "llm"
        matched: Whether the tier successfully matched
        template_id: Template ID if matched. NOTE(review): currently
            accepted but not fed into any metric — presumably reserved
            for a future per-template breakdown; confirm before removal.
        duration_seconds: Optional time taken for this tier
    """
    if not PROMETHEUS_AVAILABLE or not _metrics:
        return

    # Both collectors share the same (tier, matched) label pair.
    tier_labels = {"tier": tier, "matched": "true" if matched else "false"}
    _metrics["template_tier_counter"].labels(**tier_labels).inc()
    if duration_seconds is not None:
        _metrics["template_matching_duration"].labels(**tier_labels).observe(duration_seconds)
# ============================================================================
# Atomic Sub-task Cache Metrics (NEW)
# ============================================================================
def record_atomic_cache(
    query_hit: bool,
    subtask_hits: int = 0,
    subtask_misses: int = 0,
    fully_assembled: bool = False,
) -> None:
    """Record atomic sub-task cache metrics.

    Args:
        query_hit: Whether full query was reassembled from cache.
            NOTE(review): this argument is not read — the query-level
            outcome is derived from fully_assembled/subtask_hits instead;
            kept for interface compatibility.
        subtask_hits: Number of sub-task cache hits
        subtask_misses: Number of sub-task cache misses
        fully_assembled: Whether all sub-tasks were cached (full reassembly)
    """
    if not PROMETHEUS_AVAILABLE or not _metrics:
        return

    # Classify the query-level outcome.
    if fully_assembled:
        outcome = "full_hit"
    else:
        outcome = "partial_hit" if subtask_hits > 0 else "miss"
    _metrics["atomic_query_counter"].labels(result=outcome).inc()

    # Sub-task level counters; skip zero increments.
    for operation, count in (("hit", subtask_hits), ("miss", subtask_misses)):
        if count > 0:
            _metrics["atomic_subtask_counter"].labels(operation=operation).inc(count)

    if fully_assembled:
        _metrics["atomic_reassembly_counter"].inc()

    # Refresh the rolling hit-rate gauge when anything was attempted.
    attempted = subtask_hits + subtask_misses
    if attempted > 0:
        _metrics["atomic_subtask_hit_rate"].set(subtask_hits / attempted)
def record_atomic_subtask_cached(count: int = 1) -> None:
    """Record that sub-tasks were cached for future use.

    Args:
        count: Number of sub-tasks cached
    """
    if PROMETHEUS_AVAILABLE and _metrics:
        _metrics["atomic_subtask_counter"].labels(operation="cached").inc(count)
# ============================================================================
# Connection Pool Metrics (NEW)
# ============================================================================
def record_connection_pool(
    client: str,
    pool_size: int,
    available: int | None = None,
) -> None:
    """Record connection pool utilization.

    Args:
        client: Client type - "sparql", "postgis" (ducklake removed from RAG)
        pool_size: Current total pool size
        available: Number of available connections (if known)
    """
    if not PROMETHEUS_AVAILABLE or not _metrics:
        return
    _metrics["connection_pool_size"].labels(client=client).set(pool_size)
    # The "available" gauge is optional; only set when the caller knows it.
    if available is not None:
        _metrics["connection_pool_available"].labels(client=client).set(available)
# ============================================================================
# Warmup Metrics (NEW)
# ============================================================================
def record_embedding_warmup(
    model: str,
    duration_seconds: float,
    success: bool = True,
) -> None:
    """Record embedding model warmup time.

    Args:
        model: Model name/identifier
        duration_seconds: Time taken to warm up
        success: Whether warmup completed successfully
    """
    if not PROMETHEUS_AVAILABLE or not _metrics:
        return
    _metrics["embedding_warmup_duration"].labels(model=model).set(duration_seconds)
    status_flag = 1 if success else 0
    _metrics["warmup_status"].labels(component="embedding_model").set(status_flag)
def record_template_embedding_warmup(
    duration_seconds: float,
    template_count: int = 0,
    success: bool = True,
) -> None:
    """Record template embedding pre-computation time.

    Args:
        duration_seconds: Time taken to compute template embeddings
        template_count: Number of templates processed. NOTE(review): not
            currently exported to any metric; kept for interface
            compatibility (callers may pass it for logging).
        success: Whether warmup completed successfully
    """
    if not PROMETHEUS_AVAILABLE or not _metrics:
        return
    _metrics["template_embedding_warmup_duration"].set(duration_seconds)
    status_flag = 1 if success else 0
    _metrics["warmup_status"].labels(component="template_embeddings").set(status_flag)
def set_warmup_status(component: str, complete: bool) -> None:
    """Set warmup status for a component.

    Args:
        component: Component name - "embedding_model", "template_embeddings"
        complete: Whether warmup is complete
    """
    if PROMETHEUS_AVAILABLE and _metrics:
        _metrics["warmup_status"].labels(component=component).set(1 if complete else 0)
# ============================================================================
# Metrics Endpoint
# ============================================================================
def _get_metrics_bytes() -> tuple[bytes, str]:
    """Render the current Prometheus registry to wire format.

    Returns:
        Tuple of (metrics_bytes, content_type); a plain-text placeholder
        when prometheus_client is not installed.
    """
    if not PROMETHEUS_AVAILABLE or _prometheus_client is None:
        return b"# Prometheus metrics not available\n", "text/plain"
    return (
        _prometheus_client.generate_latest(_prometheus_client.REGISTRY),
        _prometheus_client.CONTENT_TYPE_LATEST,
    )


def get_metrics_response() -> tuple[bytes, str]:
    """Generate a fresh Prometheus metrics response.

    The previous implementation wrapped _get_metrics_bytes in
    lru_cache(maxsize=1) and then cleared that cache on every call, so
    the cache never served a hit (pure overhead) and any direct caller
    of _get_metrics_bytes could receive a stale snapshot. The cache has
    been removed; each call renders the registry anew.

    Returns:
        Tuple of (metrics_bytes, content_type)
    """
    return _get_metrics_bytes()
def create_metrics_endpoint():
    """Build a FastAPI router exposing GET /metrics for scraping.

    Usage:
        from backend.rag.metrics import create_metrics_endpoint
        app.include_router(create_metrics_endpoint())

    Returns:
        FastAPI APIRouter with /metrics endpoint
    """
    # Imported lazily so this module works without FastAPI installed
    # when the endpoint is never created.
    from fastapi import APIRouter
    from fastapi.responses import Response

    metrics_router = APIRouter(tags=["monitoring"])

    @metrics_router.get("/metrics")
    async def metrics():
        """Prometheus metrics endpoint for scraping."""
        payload, media_type = get_metrics_response()
        return Response(content=payload, media_type=media_type)

    return metrics_router
# ============================================================================
# Metric Summary Helpers (for logging/debugging)
# ============================================================================
def get_template_hit_rate() -> dict[str, Any]:
    """Summarize template vs LLM query counts from the live counters.

    Returns:
        Dict with hit rate statistics
    """
    if not PROMETHEUS_AVAILABLE or not _metrics:
        return {"available": False}

    # Sum counter samples per generation method across all other labels.
    totals = {"template": 0.0, "llm": 0.0}
    for metric in _metrics["query_counter"].collect():
        for sample in metric.samples:
            if sample.name != "rag_queries_total":
                continue
            method = sample.labels.get("method")
            if method in totals:
                totals[method] += sample.value

    overall = totals["template"] + totals["llm"]
    rate = totals["template"] / overall if overall > 0 else 0.0
    return {
        "available": True,
        "total_queries": int(overall),
        "template_queries": int(totals["template"]),
        "llm_queries": int(totals["llm"]),
        "template_hit_rate": round(rate, 4),
        "template_hit_rate_percent": round(rate * 100, 2),
    }
def get_template_breakdown() -> dict[str, int]:
    """Get breakdown of template usage by template_id.

    The underlying counter is labelled by (template_id, intent), so one
    template can appear in several samples; values are summed per
    template_id. (The previous version assigned instead of summing,
    which silently dropped counts for any template used under more than
    one intent.)

    Returns:
        Dict mapping template_id to hit count
    """
    if not PROMETHEUS_AVAILABLE or not _metrics:
        return {}
    template_counter = _metrics["template_hit_counter"]
    breakdown: dict[str, int] = {}
    for metric in template_counter.collect():
        for sample in metric.samples:
            if sample.name == "rag_template_hits_total":
                template_id = sample.labels.get("template_id", "unknown")
                breakdown[template_id] = breakdown.get(template_id, 0) + int(sample.value)
    return breakdown
def get_template_tier_stats() -> dict[str, Any]:
    """Get template matching tier statistics.

    Returns:
        Dict with tier breakdown and hit rates
    """
    if not PROMETHEUS_AVAILABLE or not _metrics:
        return {"available": False}

    # "rag" is Tier 2.5: RAG-enhanced matching between embedding and llm.
    known_tiers = ("pattern", "embedding", "rag", "llm")
    stats: dict[str, dict[str, int]] = {
        name: {"matched": 0, "unmatched": 0} for name in known_tiers
    }

    # Each (tier, matched) label combination yields one counter sample.
    for metric in _metrics["template_tier_counter"].collect():
        for sample in metric.samples:
            if sample.name != "rag_template_tier_total":
                continue
            tier = sample.labels.get("tier", "unknown")
            if tier not in stats:
                continue
            bucket = "matched" if sample.labels.get("matched") == "true" else "unmatched"
            stats[tier][bucket] = int(sample.value)

    result: dict[str, Any] = {"available": True, "tiers": {}}
    for tier, counts in stats.items():
        attempts = counts["matched"] + counts["unmatched"]
        rate = counts["matched"] / attempts if attempts > 0 else 0.0
        result["tiers"][tier] = {
            "matched": counts["matched"],
            "unmatched": counts["unmatched"],
            "total": attempts,
            "hit_rate": round(rate, 4),
        }

    # Roll-up across all tiers.
    total_matched = sum(c["matched"] for c in stats.values())
    total_attempts = sum(c["matched"] + c["unmatched"] for c in stats.values())
    result["total_matched"] = total_matched
    result["total_attempts"] = total_attempts
    result["overall_hit_rate"] = (
        round(total_matched / total_attempts, 4) if total_attempts > 0 else 0.0
    )
    return result
def get_atomic_cache_stats() -> dict[str, Any]:
    """Get atomic sub-task cache statistics.

    Returns:
        Dict with cache hit rates and operation counts
    """
    if not PROMETHEUS_AVAILABLE or not _metrics:
        return {"available": False}

    def _read_counter(metric_key: str, sample_name: str, label: str, buckets: dict[str, int]) -> None:
        # Fill `buckets` in place from the counter's current samples.
        for metric in _metrics[metric_key].collect():
            for sample in metric.samples:
                if sample.name == sample_name:
                    value = sample.labels.get(label, "unknown")
                    if value in buckets:
                        buckets[value] = int(sample.value)

    query_stats = {"full_hit": 0, "partial_hit": 0, "miss": 0}
    _read_counter("atomic_query_counter", "rag_atomic_queries_total", "result", query_stats)

    subtask_stats = {"hit": 0, "miss": 0, "cached": 0}
    _read_counter("atomic_subtask_counter", "rag_atomic_subtasks_total", "operation", subtask_stats)

    # Unlabelled counter: a single sample carries the reassembly total.
    reassemblies = 0
    for metric in _metrics["atomic_reassembly_counter"].collect():
        for sample in metric.samples:
            if sample.name == "rag_atomic_reassemblies_total":
                reassemblies = int(sample.value)

    total_queries = sum(query_stats.values())
    total_subtasks = subtask_stats["hit"] + subtask_stats["miss"]
    subtask_hit_rate = subtask_stats["hit"] / total_subtasks if total_subtasks > 0 else 0.0
    full_rate = query_stats["full_hit"] / total_queries if total_queries > 0 else 0.0
    any_rate = (
        (query_stats["full_hit"] + query_stats["partial_hit"]) / total_queries
        if total_queries > 0
        else 0.0
    )

    return {
        "available": True,
        "queries": {
            "full_hits": query_stats["full_hit"],
            "partial_hits": query_stats["partial_hit"],
            "misses": query_stats["miss"],
            "total": total_queries,
            "full_hit_rate": round(full_rate, 4),
            "any_hit_rate": round(any_rate, 4),
        },
        "subtasks": {
            "hits": subtask_stats["hit"],
            "misses": subtask_stats["miss"],
            "cached": subtask_stats["cached"],
            "total": total_subtasks,
            "hit_rate": round(subtask_hit_rate, 4),
            "hit_rate_percent": round(subtask_hit_rate * 100, 2),
        },
        "reassemblies": reassemblies,
    }
def get_all_performance_stats() -> dict[str, Any]:
    """Get comprehensive performance statistics.

    Returns:
        Dict with all performance metrics for monitoring dashboards
    """
    sections = {
        "template_hit_rate": get_template_hit_rate,
        "template_breakdown": get_template_breakdown,
        "template_tiers": get_template_tier_stats,
        "atomic_cache": get_atomic_cache_stats,
    }
    return {section: fetch() for section, fetch in sections.items()}