glam/backend/rag/benchmark_performance.py
2026-01-02 02:10:18 +01:00

597 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Performance Benchmark Suite for Heritage RAG Pipeline
Measures latency improvements from:
1. Embedding model warmup (eliminates 3-15s cold start)
2. Template embedding warmup (eliminates 2-5s first-query delay)
3. SPARQL connection pooling (reduces per-request overhead)
4. Atomic sub-task caching (40-70% vs 5-15% hit rate)
5. Full-query semantic caching
Usage:
# Quick benchmark (no server required - tests components directly)
python benchmark_performance.py --quick
# Full API benchmark (requires server running on localhost:8000)
python benchmark_performance.py --api
# Atomic cache benchmark only
python benchmark_performance.py --atomic-cache
Requirements:
- For --api: Start server with `python main.py` first
- For --quick: No server needed (tests components in-process)
"""
import argparse
import asyncio
import json
import os
import statistics
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
# Add parent paths for imports
sys.path.insert(0, str(Path(__file__).parent))
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src"))
# Import local modules with fallback handling.
# The four names start as ``Any = None`` sentinels so this module still imports
# even when both the package import and the file-based fallback fail; each
# benchmark that needs one of them will then fail (and be counted as an error)
# individually instead of crashing the whole script at import time.
TemplateClassifier: Any = None
get_template_embedding_matcher: Any = None
AtomicCacheManager: Any = None
HeritageQueryDecomposer: Any = None
try:
    from template_sparql import TemplateClassifier, get_template_embedding_matcher  # type: ignore
except ImportError:
    # Fallback for when running from a different directory: load the module
    # directly from its file path next to this script via importlib.
    import importlib.util
    _spec = importlib.util.spec_from_file_location(
        "template_sparql",
        Path(__file__).parent / "template_sparql.py"
    )
    if _spec and _spec.loader:
        _template_sparql = importlib.util.module_from_spec(_spec)
        _spec.loader.exec_module(_template_sparql)
        TemplateClassifier = _template_sparql.TemplateClassifier
        get_template_embedding_matcher = _template_sparql.get_template_embedding_matcher
try:
    from atomic_decomposer import AtomicCacheManager, HeritageQueryDecomposer  # type: ignore
except ImportError:
    # Same file-path fallback for the atomic decomposer module.
    import importlib.util
    _spec2 = importlib.util.spec_from_file_location(
        "atomic_decomposer",
        Path(__file__).parent / "atomic_decomposer.py"
    )
    if _spec2 and _spec2.loader:
        _atomic_decomposer = importlib.util.module_from_spec(_spec2)
        _spec2.loader.exec_module(_atomic_decomposer)
        AtomicCacheManager = _atomic_decomposer.AtomicCacheManager
        HeritageQueryDecomposer = _atomic_decomposer.HeritageQueryDecomposer
@dataclass
class BenchmarkResult:
    """Aggregated latency and cache statistics for a single benchmark run.

    Attributes:
        name: Benchmark identifier; used as the key in the JSON report.
        iterations: Number of planned iterations (informational).
        latencies_ms: Raw per-iteration latencies in milliseconds.
        cache_hits: Count of cache-hit outcomes.
        cache_misses: Count of cache-miss outcomes.
        errors: Number of failed iterations.
    """
    name: str
    iterations: int
    latencies_ms: list[float] = field(default_factory=list)
    cache_hits: int = 0
    cache_misses: int = 0
    errors: int = 0

    @property
    def mean_ms(self) -> float:
        """Arithmetic mean latency, or 0 when no samples were recorded."""
        return statistics.mean(self.latencies_ms) if self.latencies_ms else 0

    @property
    def median_ms(self) -> float:
        """Median latency, or 0 when no samples were recorded."""
        return statistics.median(self.latencies_ms) if self.latencies_ms else 0

    @property
    def p95_ms(self) -> float:
        """95th-percentile latency (nearest-rank); falls back to mean for <2 samples."""
        if len(self.latencies_ms) < 2:
            return self.mean_ms
        sorted_latencies = sorted(self.latencies_ms)
        idx = int(len(sorted_latencies) * 0.95)
        return sorted_latencies[min(idx, len(sorted_latencies) - 1)]

    @property
    def min_ms(self) -> float:
        """Fastest observed latency, or 0 when no samples were recorded."""
        return min(self.latencies_ms) if self.latencies_ms else 0

    @property
    def max_ms(self) -> float:
        """Slowest observed latency, or 0 when no samples were recorded."""
        return max(self.latencies_ms) if self.latencies_ms else 0

    @property
    def cache_hit_rate(self) -> float:
        """Cache hit percentage over all recorded lookups; 0 when none were made."""
        total = self.cache_hits + self.cache_misses
        return (self.cache_hits / total * 100) if total > 0 else 0

    def to_dict(self) -> dict:
        """Serialize summary statistics (plus raw latencies) for the JSON report.

        BUGFIX: ``latencies_ms`` was previously omitted, which made the
        report's ``data.get("latencies_ms", [])`` insight check always see
        an empty list. The raw samples are now exported (rounded to 2dp).
        """
        return {
            "name": self.name,
            "iterations": self.iterations,
            "mean_ms": round(self.mean_ms, 2),
            "median_ms": round(self.median_ms, 2),
            "p95_ms": round(self.p95_ms, 2),
            "min_ms": round(self.min_ms, 2),
            "max_ms": round(self.max_ms, 2),
            "cache_hit_rate": round(self.cache_hit_rate, 2),
            "errors": self.errors,
            "latencies_ms": [round(x, 2) for x in self.latencies_ms],
        }
# =============================================================================
# TEST QUERIES - Diverse set covering all query types
# =============================================================================
# Each entry pairs a natural-language question with its expected routing
# category ("type") and language, so benchmarks can report behavior per
# query class (template fast-path, atomic decomposition, cache reuse, LLM).
TEST_QUERIES = [
    # Template-matchable queries (should use fast template SPARQL)
    {"query": "Welke musea zijn er in Amsterdam?", "type": "template", "lang": "nl"},
    {"query": "Hoeveel bibliotheken zijn er in Rotterdam?", "type": "template", "lang": "nl"},
    {"query": "Welke archieven zijn er in Utrecht?", "type": "template", "lang": "nl"},
    {"query": "What museums are in The Hague?", "type": "template", "lang": "en"},
    # Atomic-decomposable queries (benefit from sub-task caching)
    {"query": "Hoeveel musea in Amsterdam hebben een ISIL code?", "type": "atomic", "lang": "nl"},
    {"query": "Welke archieven in Noord-Holland hebben een website?", "type": "atomic", "lang": "nl"},
    {"query": "How many libraries in Groningen have a Wikidata ID?", "type": "atomic", "lang": "en"},
    # Similar queries (test sub-task cache reuse against the atomic set above)
    {"query": "Hoeveel musea in Amsterdam hebben een website?", "type": "atomic_reuse", "lang": "nl"},
    {"query": "Welke musea in Amsterdam zijn open?", "type": "atomic_reuse", "lang": "nl"},
    # LLM-required queries (complex, no template match expected)
    {"query": "Vergelijk de collecties van het Rijksmuseum en het Van Gogh Museum", "type": "llm", "lang": "nl"},
    {"query": "What is the history of archives in Friesland?", "type": "llm", "lang": "en"},
]
# =============================================================================
# BENCHMARK 1: Embedding Model Warmup
# =============================================================================
def benchmark_embedding_warmup() -> BenchmarkResult:
    """Benchmark embedding model cold start vs warm encodes.

    Loads the sentence-transformers model once (cold start includes model
    load + first encode), then times four warm encodes on the same instance.

    FIX: uses time.perf_counter() instead of time.time() — monotonic and
    higher resolution, the correct clock for latency measurement. Also guards
    the speedup print against division by zero on sub-resolution warm runs.

    Returns:
        BenchmarkResult whose latencies_ms[0] is the cold start and
        latencies_ms[1:] are the warm runs; errors > 0 on failure.
    """
    print("\n📊 Benchmark: Embedding Model Warmup")
    print("-" * 50)
    result = BenchmarkResult(name="embedding_warmup", iterations=5)
    try:
        from sentence_transformers import SentenceTransformer
        # Cold start: model construction and first encode timed together
        print(" Testing cold start...")
        start = time.perf_counter()
        model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
        _ = model.encode("test query")
        cold_ms = (time.perf_counter() - start) * 1000
        result.latencies_ms.append(cold_ms)
        print(f" Cold start: {cold_ms:.0f}ms")
        # Warm queries (model already loaded)
        print(" Testing warm queries...")
        for i in range(4):
            start = time.perf_counter()
            _ = model.encode(f"test query {i}")
            warm_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(warm_ms)
        warm_avg = statistics.mean(result.latencies_ms[1:])
        print(f" Warm average: {warm_avg:.1f}ms")
        if warm_avg > 0:  # avoid ZeroDivisionError on extremely fast encodes
            print(f" Speedup: {cold_ms / warm_avg:.1f}x faster when warm")
    except Exception as e:
        # Broad catch is intentional: a benchmark failure (e.g. missing
        # sentence-transformers) should be recorded, not abort the suite.
        print(f" ERROR: {e}")
        result.errors += 1
    return result
# =============================================================================
# BENCHMARK 2: Template Embedding Warmup
# =============================================================================
def benchmark_template_embedding_warmup() -> BenchmarkResult:
    """Benchmark template embedding computation cold vs warm.

    Clears the matcher's embedding cache to force one cold recompute, then
    times four warm lookups against the cached embeddings.

    FIX: uses time.perf_counter() instead of time.time() for monotonic,
    high-resolution timing, and guards the speedup print against division
    by zero on sub-resolution warm lookups.

    Returns:
        BenchmarkResult whose latencies_ms[0] is the cold computation and
        latencies_ms[1:] are warm lookups; errors > 0 on failure.
    """
    print("\n📊 Benchmark: Template Embedding Warmup")
    print("-" * 50)
    result = BenchmarkResult(name="template_embedding_warmup", iterations=5)
    try:
        # Uses module-level TemplateClassifier / get_template_embedding_matcher
        classifier = TemplateClassifier()
        templates = classifier._load_templates()
        if not templates:
            print(" No templates found!")
            result.errors += 1
            return result
        # Cold start: empty the private embedding cache to force a recompute.
        # NOTE(review): reaches into matcher internals — confirm attribute
        # name stays in sync with template_sparql.
        print(f" Testing cold start ({len(templates)} templates)...")
        matcher = get_template_embedding_matcher()
        matcher._template_embeddings = {}  # Clear cache
        start = time.perf_counter()
        matcher._ensure_embeddings_computed(templates)
        cold_ms = (time.perf_counter() - start) * 1000
        result.latencies_ms.append(cold_ms)
        print(f" Cold computation: {cold_ms:.0f}ms")
        # Warm lookups: embeddings are now cached
        print(" Testing warm lookups...")
        for _ in range(4):
            start = time.perf_counter()
            matcher.match("Welke musea zijn er in Amsterdam?", templates)
            warm_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(warm_ms)
        warm_avg = statistics.mean(result.latencies_ms[1:])
        print(f" Warm lookup average: {warm_avg:.1f}ms")
        if warm_avg > 0:  # avoid ZeroDivisionError on sub-ms lookups
            print(f" Speedup: {cold_ms / warm_avg:.1f}x faster when warm")
    except Exception as e:
        # Record the failure and keep the suite running.
        print(f" ERROR: {e}")
        result.errors += 1
    return result
# =============================================================================
# BENCHMARK 3: Atomic Sub-task Caching
# =============================================================================
async def benchmark_atomic_cache() -> BenchmarkResult:
    """Benchmark atomic sub-task cache hit rates.

    Decomposes each test query into sub-tasks, records per-query latency
    and per-sub-task cache hits/misses, then caches dummy results for the
    missed sub-tasks so later similar queries can exercise cache reuse.

    FIX: uses time.perf_counter() instead of time.time() for monotonic,
    high-resolution latency measurement.

    Returns:
        BenchmarkResult with per-query latencies and sub-task cache counters.
    """
    print("\n📊 Benchmark: Atomic Sub-task Caching")
    print("-" * 50)
    result = BenchmarkResult(name="atomic_cache", iterations=len(TEST_QUERIES))
    try:
        # In-memory cache only: no semantic-cache backend for the benchmark.
        cache_mgr = AtomicCacheManager(semantic_cache=None)
        # NOTE(review): decomposer is not referenced below — cache_mgr appears
        # to do its own decomposition; kept for parity with the original in
        # case construction has side effects. Confirm before removing.
        decomposer = HeritageQueryDecomposer()
        print(f" Processing {len(TEST_QUERIES)} queries...")
        for i, q in enumerate(TEST_QUERIES):
            query = q["query"]
            lang = q["lang"]
            start = time.perf_counter()
            decomposed, cached = await cache_mgr.process_query(query, lang)
            elapsed_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(elapsed_ms)
            # Track per-sub-task cache outcomes
            for task in decomposed.sub_tasks:
                if task.cache_hit:
                    result.cache_hits += 1
                else:
                    result.cache_misses += 1
            # Cache dummy results for missed sub-tasks so that subsequent
            # similar queries can hit the cache.
            for task in decomposed.sub_tasks:
                if not task.cache_hit:
                    await cache_mgr.cache_subtask_result(
                        task=task,
                        result={"dummy": True, "query_idx": i},
                        language=lang,
                    )
            status = "✓ partial hit" if decomposed.partial_cache_hits > 0 else "○ miss"
            if decomposed.fully_cached:
                status = "★ full hit"
            print(f" [{i+1:2d}] {status} - {decomposed.partial_cache_hits}/{len(decomposed.sub_tasks)} sub-tasks - {q['type']}")
        # Final aggregate stats from the cache manager itself
        stats = cache_mgr.get_stats()
        print("\n Final stats:")
        print(f" Sub-task hit rate: {stats['subtask_hit_rate']:.1f}%")
        print(f" Total sub-tasks: {stats['subtask_hits'] + stats['subtask_misses']}")
        print(f" Full reassemblies: {stats['full_query_reassemblies']}")
    except Exception as e:
        # Record the failure and keep the suite running.
        print(f" ERROR: {e}")
        result.errors += 1
    return result
# =============================================================================
# BENCHMARK 4: Template Pattern Matching Speed
# =============================================================================
def benchmark_template_matching() -> BenchmarkResult:
    """Benchmark 3-tier template matching: exact → pattern → embedding.

    Note: This benchmark tests only pattern and embedding matching (Tiers 1-2).
    The full TemplateClassifier.forward() also includes LLM classification
    (Tier 3), but we skip that here to isolate the fast-path performance.

    FIX: uses time.perf_counter() instead of time.time() for monotonic,
    high-resolution latency measurement.

    Returns:
        BenchmarkResult where cache_hits counts queries matched by either
        tier and cache_misses counts queries that matched neither.
    """
    print("\n📊 Benchmark: Template Pattern Matching (3-tier)")
    print("-" * 50)
    result = BenchmarkResult(name="template_matching", iterations=len(TEST_QUERIES))
    try:
        # Uses module-level TemplateClassifier / get_template_embedding_matcher
        classifier = TemplateClassifier()
        templates = classifier._load_templates()
        if not templates:
            print(" No templates found!")
            result.errors += 1
            return result
        # Track which tier produced the match
        tier_counts = {"pattern": 0, "embedding": 0, "none": 0}
        embedding_matcher = get_template_embedding_matcher()
        for q in TEST_QUERIES:
            query = q["query"]
            start = time.perf_counter()
            # Tier 1: pattern matching (cheapest path)
            pattern_match = classifier._match_by_patterns(query, templates)
            if pattern_match and pattern_match.confidence >= 0.75:
                elapsed_ms = (time.perf_counter() - start) * 1000
                result.latencies_ms.append(elapsed_ms)
                tier_counts["pattern"] += 1
                result.cache_hits += 1
                continue
            # Tier 2: embedding similarity matching
            embedding_match = embedding_matcher.match(query, templates, min_similarity=0.70)
            elapsed_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(elapsed_ms)
            if embedding_match and embedding_match.confidence >= 0.70:
                tier_counts["embedding"] += 1
                result.cache_hits += 1
            else:
                tier_counts["none"] += 1
                result.cache_misses += 1
        print(" Match distribution:")
        for tier, count in tier_counts.items():
            pct = count / len(TEST_QUERIES) * 100
            print(f" {tier}: {count} ({pct:.0f}%)")
        print(f" Average latency: {result.mean_ms:.1f}ms")
    except Exception as e:
        # Record the failure and keep the suite running.
        print(f" ERROR: {e}")
        result.errors += 1
    return result
# =============================================================================
# BENCHMARK 5: Full API Endpoint (requires server)
# =============================================================================
async def benchmark_api_endpoint(base_url: str = "http://localhost:8000") -> BenchmarkResult:
    """Benchmark full API endpoint latency.

    Sends one warmup request (which also verifies the server is reachable),
    then times each TEST_QUERIES request end to end.

    FIX: uses time.perf_counter() instead of time.time() for monotonic,
    high-resolution latency measurement.

    Args:
        base_url: Root URL of the running API server.

    Returns:
        BenchmarkResult; errors is incremented per failed/non-200 request,
        or once if httpx is missing or the server does not respond.
    """
    print("\n📊 Benchmark: Full API Endpoint (/api/rag/dspy/query)")
    print("-" * 50)
    result = BenchmarkResult(name="api_endpoint", iterations=len(TEST_QUERIES))
    try:
        import httpx
        async with httpx.AsyncClient(timeout=60.0) as client:
            # Warm up — also serves as a reachability probe
            print(" Warming up API...")
            try:
                await client.post(
                    f"{base_url}/api/rag/dspy/query",
                    json={"question": "test", "language": "nl"},
                )
            except Exception:
                print(" WARNING: API not responding, skipping benchmark")
                result.errors += 1
                return result
            print(f" Processing {len(TEST_QUERIES)} queries...")
            for i, q in enumerate(TEST_QUERIES):
                start = time.perf_counter()
                try:
                    response = await client.post(
                        f"{base_url}/api/rag/dspy/query",
                        json={
                            "question": q["query"],
                            "language": q["lang"],
                            "skip_cache": False,  # Use cache
                        },
                    )
                    elapsed_ms = (time.perf_counter() - start) * 1000
                    result.latencies_ms.append(elapsed_ms)
                    if response.status_code == 200:
                        data = response.json()
                        cache_hit = data.get("cache_hit", False)
                        template_used = data.get("template_used", False)
                        # NOTE: template responses count as neither cache hit
                        # nor miss, so cache_hit_rate reflects cache-vs-LLM.
                        if cache_hit:
                            result.cache_hits += 1
                            status = "★ cache hit"
                        elif template_used:
                            status = "✓ template"
                        else:
                            result.cache_misses += 1
                            status = "○ LLM"
                        print(f" [{i+1:2d}] {status} - {elapsed_ms:.0f}ms - {q['type']}")
                    else:
                        print(f" [{i+1:2d}] ERROR: HTTP {response.status_code}")
                        result.errors += 1
                except Exception as e:
                    # Per-request failures are counted but do not stop the run.
                    print(f" [{i+1:2d}] ERROR: {e}")
                    result.errors += 1
        print("\n Results:")
        print(f" Mean latency: {result.mean_ms:.0f}ms")
        print(f" P95 latency: {result.p95_ms:.0f}ms")
        print(f" Cache hit rate: {result.cache_hit_rate:.1f}%")
    except ImportError:
        print(" ERROR: httpx not installed")
        result.errors += 1
    return result
# =============================================================================
# MAIN BENCHMARK RUNNER
# =============================================================================
async def run_all_benchmarks(quick: bool = True, api: bool = False, atomic_only: bool = False) -> dict:
    """Run all benchmarks and generate report.

    Args:
        quick: Run the in-process component benchmarks (no server needed).
        api: Additionally run the full API benchmark (server required).
        atomic_only: Run only the atomic-cache benchmark (takes precedence
            over ``quick``).

    Returns:
        Dict with timestamp, selected mode, and per-benchmark summaries;
        also written as a JSON file under benchmark_results/.
    """
    print("\n" + "=" * 70)
    print("Heritage RAG Performance Benchmark Suite")
    print(f"Timestamp: {datetime.now().isoformat()}")
    print("=" * 70)
    results: dict[str, Any] = {
        "timestamp": datetime.now().isoformat(),
        "mode": "quick" if quick else ("api" if api else "atomic"),
        "benchmarks": {},
    }
    if atomic_only:
        # Only run atomic cache benchmark
        atomic_result = await benchmark_atomic_cache()
        results["benchmarks"]["atomic_cache"] = atomic_result.to_dict()
    elif quick:
        # Quick benchmarks (no server required)
        embed_result = benchmark_embedding_warmup()
        results["benchmarks"]["embedding_warmup"] = embed_result.to_dict()
        template_embed_result = benchmark_template_embedding_warmup()
        results["benchmarks"]["template_embedding_warmup"] = template_embed_result.to_dict()
        template_match_result = benchmark_template_matching()
        results["benchmarks"]["template_matching"] = template_match_result.to_dict()
        atomic_result = await benchmark_atomic_cache()
        results["benchmarks"]["atomic_cache"] = atomic_result.to_dict()
    if api:
        # Full API benchmark (requires server)
        api_result = await benchmark_api_endpoint()
        results["benchmarks"]["api_endpoint"] = api_result.to_dict()
    # ----- Summary report -----
    print("\n" + "=" * 70)
    print("BENCHMARK SUMMARY")
    print("=" * 70)
    print(f"\n{'Benchmark':<35} {'Mean':>10} {'P95':>10} {'Cache%':>10}")
    print("-" * 70)
    for name, data in results["benchmarks"].items():
        mean = f"{data['mean_ms']:.1f}ms"
        p95 = f"{data['p95_ms']:.1f}ms"
        cache = f"{data['cache_hit_rate']:.1f}%" if data['cache_hit_rate'] > 0 else "N/A"
        print(f"{name:<35} {mean:>10} {p95:>10} {cache:>10}")
    print("-" * 70)
    # Performance insights
    print("\n📈 Performance Insights:")
    if "embedding_warmup" in results["benchmarks"]:
        data = results["benchmarks"]["embedding_warmup"]
        # BUGFIX: the previous check `len(data.get("latencies_ms", [])) > 1`
        # never fired because to_dict() did not export raw latencies.
        # Compare the exported summary stats instead: max_ms is the cold
        # start and min_ms a warm run, so a spread means warmup mattered.
        if data["max_ms"] > data["min_ms"]:
            print(f" • Embedding warmup eliminates {data['max_ms'] - data['min_ms']:.0f}ms cold start")
    if "template_embedding_warmup" in results["benchmarks"]:
        data = results["benchmarks"]["template_embedding_warmup"]
        if data["max_ms"] > 0:
            # max(min_ms, 1) avoids division by zero for sub-ms warm lookups
            speedup = data["max_ms"] / max(data["min_ms"], 1)
            print(f" • Template pre-computation provides {speedup:.0f}x speedup")
    if "atomic_cache" in results["benchmarks"]:
        data = results["benchmarks"]["atomic_cache"]
        print(f" • Atomic cache hit rate: {data['cache_hit_rate']:.1f}% (target: 40-70%)")
    if "api_endpoint" in results["benchmarks"]:
        data = results["benchmarks"]["api_endpoint"]
        print(f" • API P95 latency: {data['p95_ms']:.0f}ms")
        print(f" • Full query cache hit rate: {data['cache_hit_rate']:.1f}%")
    # Persist results as JSON (timestamped file under benchmark_results/)
    results_path = Path(__file__).parent / "benchmark_results" / f"perf_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    # parents=True makes directory creation robust if intermediates are missing
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with open(results_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n💾 Results saved to: {results_path}")
    print("=" * 70)
    return results
def main():
    """CLI entry point: parse flags and dispatch the benchmark suite."""
    parser = argparse.ArgumentParser(description="Heritage RAG Performance Benchmark")
    parser.add_argument(
        "--quick",
        action="store_true",
        default=True,
        help="Run quick benchmarks (no server required)",
    )
    parser.add_argument(
        "--api",
        action="store_true",
        help="Run API endpoint benchmark (requires server)",
    )
    parser.add_argument(
        "--atomic-cache",
        action="store_true",
        help="Run only atomic cache benchmark",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Run all benchmarks including API",
    )
    args = parser.parse_args()

    if args.all:
        # --all means both suites, and overrides --atomic-cache.
        args.api = True
        args.quick = True
        args.atomic_cache = False

    run_quick = args.quick and not args.atomic_cache
    asyncio.run(
        run_all_benchmarks(
            quick=run_quick,
            api=args.api,
            atomic_only=args.atomic_cache,
        )
    )


if __name__ == "__main__":
    main()