glam/backend/rag/benchmark_performance.py
2026-01-02 02:10:18 +01:00

597 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Performance Benchmark Suite for Heritage RAG Pipeline
Measures latency improvements from:
1. Embedding model warmup (eliminates 3-15s cold start)
2. Template embedding warmup (eliminates 2-5s first-query delay)
3. SPARQL connection pooling (reduces per-request overhead)
4. Atomic sub-task caching (40-70% vs 5-15% hit rate)
5. Full-query semantic caching
Usage:
# Quick benchmark (no server required - tests components directly)
python benchmark_performance.py --quick
# Full API benchmark (requires server running on localhost:8000)
python benchmark_performance.py --api
# Atomic cache benchmark only
python benchmark_performance.py --atomic-cache
Requirements:
- For --api: Start server with `python main.py` first
- For --quick: No server needed (tests components in-process)
"""
import argparse
import asyncio
import json
import os
import statistics
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
# Add parent paths for imports
sys.path.insert(0, str(Path(__file__).parent))
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src"))
# Import local modules with fallback handling.
# The four names start as ``Any = None`` sentinels so this module still imports
# even when both the package import and the file-based fallback fail; each
# benchmark that needs one of them will then fail (and be counted as an error)
# individually instead of crashing the whole script at import time.
TemplateClassifier: Any = None
get_template_embedding_matcher: Any = None
AtomicCacheManager: Any = None
HeritageQueryDecomposer: Any = None
try:
    from template_sparql import TemplateClassifier, get_template_embedding_matcher  # type: ignore
except ImportError:
    # Fallback for when running from a different directory: load the module
    # directly from its file path next to this script via importlib.
    import importlib.util
    _spec = importlib.util.spec_from_file_location(
        "template_sparql",
        Path(__file__).parent / "template_sparql.py"
    )
    if _spec and _spec.loader:
        _template_sparql = importlib.util.module_from_spec(_spec)
        _spec.loader.exec_module(_template_sparql)
        TemplateClassifier = _template_sparql.TemplateClassifier
        get_template_embedding_matcher = _template_sparql.get_template_embedding_matcher
try:
    from atomic_decomposer import AtomicCacheManager, HeritageQueryDecomposer  # type: ignore
except ImportError:
    # Same file-path fallback for the atomic decomposer module.
    import importlib.util
    _spec2 = importlib.util.spec_from_file_location(
        "atomic_decomposer",
        Path(__file__).parent / "atomic_decomposer.py"
    )
    if _spec2 and _spec2.loader:
        _atomic_decomposer = importlib.util.module_from_spec(_spec2)
        _spec2.loader.exec_module(_atomic_decomposer)
        AtomicCacheManager = _atomic_decomposer.AtomicCacheManager
        HeritageQueryDecomposer = _atomic_decomposer.HeritageQueryDecomposer
@dataclass
class BenchmarkResult:
    """Aggregated latency and cache statistics for a single benchmark run.

    Attributes:
        name: Benchmark identifier; used as the key in the JSON report.
        iterations: Number of planned iterations (informational).
        latencies_ms: Raw per-iteration latencies in milliseconds.
        cache_hits: Count of cache-hit outcomes.
        cache_misses: Count of cache-miss outcomes.
        errors: Number of failed iterations.
    """
    name: str
    iterations: int
    latencies_ms: list[float] = field(default_factory=list)
    cache_hits: int = 0
    cache_misses: int = 0
    errors: int = 0

    @property
    def mean_ms(self) -> float:
        """Arithmetic mean latency, or 0 when no samples were recorded."""
        return statistics.mean(self.latencies_ms) if self.latencies_ms else 0

    @property
    def median_ms(self) -> float:
        """Median latency, or 0 when no samples were recorded."""
        return statistics.median(self.latencies_ms) if self.latencies_ms else 0

    @property
    def p95_ms(self) -> float:
        """95th-percentile latency (nearest-rank); falls back to mean for <2 samples."""
        if len(self.latencies_ms) < 2:
            return self.mean_ms
        sorted_latencies = sorted(self.latencies_ms)
        idx = int(len(sorted_latencies) * 0.95)
        return sorted_latencies[min(idx, len(sorted_latencies) - 1)]

    @property
    def min_ms(self) -> float:
        """Fastest observed latency, or 0 when no samples were recorded."""
        return min(self.latencies_ms) if self.latencies_ms else 0

    @property
    def max_ms(self) -> float:
        """Slowest observed latency, or 0 when no samples were recorded."""
        return max(self.latencies_ms) if self.latencies_ms else 0

    @property
    def cache_hit_rate(self) -> float:
        """Cache hit percentage over all recorded lookups; 0 when none were made."""
        total = self.cache_hits + self.cache_misses
        return (self.cache_hits / total * 100) if total > 0 else 0

    def to_dict(self) -> dict:
        """Serialize summary statistics (plus raw latencies) for the JSON report.

        BUGFIX: ``latencies_ms`` was previously omitted, which made the
        report's ``data.get("latencies_ms", [])`` insight check always see
        an empty list. The raw samples are now exported (rounded to 2dp).
        """
        return {
            "name": self.name,
            "iterations": self.iterations,
            "mean_ms": round(self.mean_ms, 2),
            "median_ms": round(self.median_ms, 2),
            "p95_ms": round(self.p95_ms, 2),
            "min_ms": round(self.min_ms, 2),
            "max_ms": round(self.max_ms, 2),
            "cache_hit_rate": round(self.cache_hit_rate, 2),
            "errors": self.errors,
            "latencies_ms": [round(x, 2) for x in self.latencies_ms],
        }
# =============================================================================
# TEST QUERIES - Diverse set covering all query types
# =============================================================================
# Each entry pairs a natural-language question with its expected routing
# category ("type") and language, so benchmarks can report behavior per
# query class (template fast-path, atomic decomposition, cache reuse, LLM).
TEST_QUERIES = [
    # Template-matchable queries (should use fast template SPARQL)
    {"query": "Welke musea zijn er in Amsterdam?", "type": "template", "lang": "nl"},
    {"query": "Hoeveel bibliotheken zijn er in Rotterdam?", "type": "template", "lang": "nl"},
    {"query": "Welke archieven zijn er in Utrecht?", "type": "template", "lang": "nl"},
    {"query": "What museums are in The Hague?", "type": "template", "lang": "en"},
    # Atomic-decomposable queries (benefit from sub-task caching)
    {"query": "Hoeveel musea in Amsterdam hebben een ISIL code?", "type": "atomic", "lang": "nl"},
    {"query": "Welke archieven in Noord-Holland hebben een website?", "type": "atomic", "lang": "nl"},
    {"query": "How many libraries in Groningen have a Wikidata ID?", "type": "atomic", "lang": "en"},
    # Similar queries (test sub-task cache reuse against the atomic set above)
    {"query": "Hoeveel musea in Amsterdam hebben een website?", "type": "atomic_reuse", "lang": "nl"},
    {"query": "Welke musea in Amsterdam zijn open?", "type": "atomic_reuse", "lang": "nl"},
    # LLM-required queries (complex, no template match expected)
    {"query": "Vergelijk de collecties van het Rijksmuseum en het Van Gogh Museum", "type": "llm", "lang": "nl"},
    {"query": "What is the history of archives in Friesland?", "type": "llm", "lang": "en"},
]
# =============================================================================
# BENCHMARK 1: Embedding Model Warmup
# =============================================================================
def benchmark_embedding_warmup() -> BenchmarkResult:
    """Benchmark embedding model cold start vs warm encodes.

    Loads the sentence-transformers model once (cold start includes model
    load + first encode), then times four warm encodes on the same instance.

    FIX: uses time.perf_counter() instead of time.time() — monotonic and
    higher resolution, the correct clock for latency measurement. Also guards
    the speedup print against division by zero on sub-resolution warm runs.

    Returns:
        BenchmarkResult whose latencies_ms[0] is the cold start and
        latencies_ms[1:] are the warm runs; errors > 0 on failure.
    """
    print("\n📊 Benchmark: Embedding Model Warmup")
    print("-" * 50)
    result = BenchmarkResult(name="embedding_warmup", iterations=5)
    try:
        from sentence_transformers import SentenceTransformer
        # Cold start: model construction and first encode timed together
        print(" Testing cold start...")
        start = time.perf_counter()
        model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
        _ = model.encode("test query")
        cold_ms = (time.perf_counter() - start) * 1000
        result.latencies_ms.append(cold_ms)
        print(f" Cold start: {cold_ms:.0f}ms")
        # Warm queries (model already loaded)
        print(" Testing warm queries...")
        for i in range(4):
            start = time.perf_counter()
            _ = model.encode(f"test query {i}")
            warm_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(warm_ms)
        warm_avg = statistics.mean(result.latencies_ms[1:])
        print(f" Warm average: {warm_avg:.1f}ms")
        if warm_avg > 0:  # avoid ZeroDivisionError on extremely fast encodes
            print(f" Speedup: {cold_ms / warm_avg:.1f}x faster when warm")
    except Exception as e:
        # Broad catch is intentional: a benchmark failure (e.g. missing
        # sentence-transformers) should be recorded, not abort the suite.
        print(f" ERROR: {e}")
        result.errors += 1
    return result
# =============================================================================
# BENCHMARK 2: Template Embedding Warmup
# =============================================================================
def benchmark_template_embedding_warmup() -> BenchmarkResult:
    """Benchmark template embedding computation cold vs warm.

    Clears the matcher's embedding cache to force one cold recompute, then
    times four warm lookups against the cached embeddings.

    FIX: uses time.perf_counter() instead of time.time() for monotonic,
    high-resolution timing, and guards the speedup print against division
    by zero on sub-resolution warm lookups.

    Returns:
        BenchmarkResult whose latencies_ms[0] is the cold computation and
        latencies_ms[1:] are warm lookups; errors > 0 on failure.
    """
    print("\n📊 Benchmark: Template Embedding Warmup")
    print("-" * 50)
    result = BenchmarkResult(name="template_embedding_warmup", iterations=5)
    try:
        # Uses module-level TemplateClassifier / get_template_embedding_matcher
        classifier = TemplateClassifier()
        templates = classifier._load_templates()
        if not templates:
            print(" No templates found!")
            result.errors += 1
            return result
        # Cold start: empty the private embedding cache to force a recompute.
        # NOTE(review): reaches into matcher internals — confirm attribute
        # name stays in sync with template_sparql.
        print(f" Testing cold start ({len(templates)} templates)...")
        matcher = get_template_embedding_matcher()
        matcher._template_embeddings = {}  # Clear cache
        start = time.perf_counter()
        matcher._ensure_embeddings_computed(templates)
        cold_ms = (time.perf_counter() - start) * 1000
        result.latencies_ms.append(cold_ms)
        print(f" Cold computation: {cold_ms:.0f}ms")
        # Warm lookups: embeddings are now cached
        print(" Testing warm lookups...")
        for _ in range(4):
            start = time.perf_counter()
            matcher.match("Welke musea zijn er in Amsterdam?", templates)
            warm_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(warm_ms)
        warm_avg = statistics.mean(result.latencies_ms[1:])
        print(f" Warm lookup average: {warm_avg:.1f}ms")
        if warm_avg > 0:  # avoid ZeroDivisionError on sub-ms lookups
            print(f" Speedup: {cold_ms / warm_avg:.1f}x faster when warm")
    except Exception as e:
        # Record the failure and keep the suite running.
        print(f" ERROR: {e}")
        result.errors += 1
    return result
# =============================================================================
# BENCHMARK 3: Atomic Sub-task Caching
# =============================================================================
async def benchmark_atomic_cache() -> BenchmarkResult:
    """Benchmark atomic sub-task cache hit rates.

    Decomposes each test query into sub-tasks, records per-query latency
    and per-sub-task cache hits/misses, then caches dummy results for the
    missed sub-tasks so later similar queries can exercise cache reuse.

    FIX: uses time.perf_counter() instead of time.time() for monotonic,
    high-resolution latency measurement.

    Returns:
        BenchmarkResult with per-query latencies and sub-task cache counters.
    """
    print("\n📊 Benchmark: Atomic Sub-task Caching")
    print("-" * 50)
    result = BenchmarkResult(name="atomic_cache", iterations=len(TEST_QUERIES))
    try:
        # In-memory cache only: no semantic-cache backend for the benchmark.
        cache_mgr = AtomicCacheManager(semantic_cache=None)
        # NOTE(review): decomposer is not referenced below — cache_mgr appears
        # to do its own decomposition; kept for parity with the original in
        # case construction has side effects. Confirm before removing.
        decomposer = HeritageQueryDecomposer()
        print(f" Processing {len(TEST_QUERIES)} queries...")
        for i, q in enumerate(TEST_QUERIES):
            query = q["query"]
            lang = q["lang"]
            start = time.perf_counter()
            decomposed, cached = await cache_mgr.process_query(query, lang)
            elapsed_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(elapsed_ms)
            # Track per-sub-task cache outcomes
            for task in decomposed.sub_tasks:
                if task.cache_hit:
                    result.cache_hits += 1
                else:
                    result.cache_misses += 1
            # Cache dummy results for missed sub-tasks so that subsequent
            # similar queries can hit the cache.
            for task in decomposed.sub_tasks:
                if not task.cache_hit:
                    await cache_mgr.cache_subtask_result(
                        task=task,
                        result={"dummy": True, "query_idx": i},
                        language=lang,
                    )
            status = "✓ partial hit" if decomposed.partial_cache_hits > 0 else "○ miss"
            if decomposed.fully_cached:
                status = "★ full hit"
            print(f" [{i+1:2d}] {status} - {decomposed.partial_cache_hits}/{len(decomposed.sub_tasks)} sub-tasks - {q['type']}")
        # Final aggregate stats from the cache manager itself
        stats = cache_mgr.get_stats()
        print("\n Final stats:")
        print(f" Sub-task hit rate: {stats['subtask_hit_rate']:.1f}%")
        print(f" Total sub-tasks: {stats['subtask_hits'] + stats['subtask_misses']}")
        print(f" Full reassemblies: {stats['full_query_reassemblies']}")
    except Exception as e:
        # Record the failure and keep the suite running.
        print(f" ERROR: {e}")
        result.errors += 1
    return result
# =============================================================================
# BENCHMARK 4: Template Pattern Matching Speed
# =============================================================================
def benchmark_template_matching() -> BenchmarkResult:
    """Benchmark 3-tier template matching: exact → pattern → embedding.

    Note: This benchmark tests only pattern and embedding matching (Tiers 1-2).
    The full TemplateClassifier.forward() also includes LLM classification
    (Tier 3), but we skip that here to isolate the fast-path performance.

    FIX: uses time.perf_counter() instead of time.time() for monotonic,
    high-resolution latency measurement.

    Returns:
        BenchmarkResult where cache_hits counts queries matched by either
        tier and cache_misses counts queries that matched neither.
    """
    print("\n📊 Benchmark: Template Pattern Matching (3-tier)")
    print("-" * 50)
    result = BenchmarkResult(name="template_matching", iterations=len(TEST_QUERIES))
    try:
        # Uses module-level TemplateClassifier / get_template_embedding_matcher
        classifier = TemplateClassifier()
        templates = classifier._load_templates()
        if not templates:
            print(" No templates found!")
            result.errors += 1
            return result
        # Track which tier produced the match
        tier_counts = {"pattern": 0, "embedding": 0, "none": 0}
        embedding_matcher = get_template_embedding_matcher()
        for q in TEST_QUERIES:
            query = q["query"]
            start = time.perf_counter()
            # Tier 1: pattern matching (cheapest path)
            pattern_match = classifier._match_by_patterns(query, templates)
            if pattern_match and pattern_match.confidence >= 0.75:
                elapsed_ms = (time.perf_counter() - start) * 1000
                result.latencies_ms.append(elapsed_ms)
                tier_counts["pattern"] += 1
                result.cache_hits += 1
                continue
            # Tier 2: embedding similarity matching
            embedding_match = embedding_matcher.match(query, templates, min_similarity=0.70)
            elapsed_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(elapsed_ms)
            if embedding_match and embedding_match.confidence >= 0.70:
                tier_counts["embedding"] += 1
                result.cache_hits += 1
            else:
                tier_counts["none"] += 1
                result.cache_misses += 1
        print(" Match distribution:")
        for tier, count in tier_counts.items():
            pct = count / len(TEST_QUERIES) * 100
            print(f" {tier}: {count} ({pct:.0f}%)")
        print(f" Average latency: {result.mean_ms:.1f}ms")
    except Exception as e:
        # Record the failure and keep the suite running.
        print(f" ERROR: {e}")
        result.errors += 1
    return result
# =============================================================================
# BENCHMARK 5: Full API Endpoint (requires server)
# =============================================================================
async def benchmark_api_endpoint(base_url: str = "http://localhost:8000") -> BenchmarkResult:
    """Benchmark full API endpoint latency.

    Sends one warmup request (which also verifies the server is reachable),
    then times each TEST_QUERIES request end to end.

    FIX: uses time.perf_counter() instead of time.time() for monotonic,
    high-resolution latency measurement.

    Args:
        base_url: Root URL of the running API server.

    Returns:
        BenchmarkResult; errors is incremented per failed/non-200 request,
        or once if httpx is missing or the server does not respond.
    """
    print("\n📊 Benchmark: Full API Endpoint (/api/rag/dspy/query)")
    print("-" * 50)
    result = BenchmarkResult(name="api_endpoint", iterations=len(TEST_QUERIES))
    try:
        import httpx
        async with httpx.AsyncClient(timeout=60.0) as client:
            # Warm up — also serves as a reachability probe
            print(" Warming up API...")
            try:
                await client.post(
                    f"{base_url}/api/rag/dspy/query",
                    json={"question": "test", "language": "nl"},
                )
            except Exception:
                print(" WARNING: API not responding, skipping benchmark")
                result.errors += 1
                return result
            print(f" Processing {len(TEST_QUERIES)} queries...")
            for i, q in enumerate(TEST_QUERIES):
                start = time.perf_counter()
                try:
                    response = await client.post(
                        f"{base_url}/api/rag/dspy/query",
                        json={
                            "question": q["query"],
                            "language": q["lang"],
                            "skip_cache": False,  # Use cache
                        },
                    )
                    elapsed_ms = (time.perf_counter() - start) * 1000
                    result.latencies_ms.append(elapsed_ms)
                    if response.status_code == 200:
                        data = response.json()
                        cache_hit = data.get("cache_hit", False)
                        template_used = data.get("template_used", False)
                        # NOTE: template responses count as neither cache hit
                        # nor miss, so cache_hit_rate reflects cache-vs-LLM.
                        if cache_hit:
                            result.cache_hits += 1
                            status = "★ cache hit"
                        elif template_used:
                            status = "✓ template"
                        else:
                            result.cache_misses += 1
                            status = "○ LLM"
                        print(f" [{i+1:2d}] {status} - {elapsed_ms:.0f}ms - {q['type']}")
                    else:
                        print(f" [{i+1:2d}] ERROR: HTTP {response.status_code}")
                        result.errors += 1
                except Exception as e:
                    # Per-request failures are counted but do not stop the run.
                    print(f" [{i+1:2d}] ERROR: {e}")
                    result.errors += 1
        print("\n Results:")
        print(f" Mean latency: {result.mean_ms:.0f}ms")
        print(f" P95 latency: {result.p95_ms:.0f}ms")
        print(f" Cache hit rate: {result.cache_hit_rate:.1f}%")
    except ImportError:
        print(" ERROR: httpx not installed")
        result.errors += 1
    return result
# =============================================================================
# MAIN BENCHMARK RUNNER
# =============================================================================
async def run_all_benchmarks(quick: bool = True, api: bool = False, atomic_only: bool = False) -> dict:
    """Run all benchmarks and generate report.

    Args:
        quick: Run the in-process component benchmarks (no server needed).
        api: Additionally run the full API benchmark (server required).
        atomic_only: Run only the atomic-cache benchmark (takes precedence
            over ``quick``).

    Returns:
        Dict with timestamp, selected mode, and per-benchmark summaries;
        also written as a JSON file under benchmark_results/.
    """
    print("\n" + "=" * 70)
    print("Heritage RAG Performance Benchmark Suite")
    print(f"Timestamp: {datetime.now().isoformat()}")
    print("=" * 70)
    results: dict[str, Any] = {
        "timestamp": datetime.now().isoformat(),
        "mode": "quick" if quick else ("api" if api else "atomic"),
        "benchmarks": {},
    }
    if atomic_only:
        # Only run atomic cache benchmark
        atomic_result = await benchmark_atomic_cache()
        results["benchmarks"]["atomic_cache"] = atomic_result.to_dict()
    elif quick:
        # Quick benchmarks (no server required)
        embed_result = benchmark_embedding_warmup()
        results["benchmarks"]["embedding_warmup"] = embed_result.to_dict()
        template_embed_result = benchmark_template_embedding_warmup()
        results["benchmarks"]["template_embedding_warmup"] = template_embed_result.to_dict()
        template_match_result = benchmark_template_matching()
        results["benchmarks"]["template_matching"] = template_match_result.to_dict()
        atomic_result = await benchmark_atomic_cache()
        results["benchmarks"]["atomic_cache"] = atomic_result.to_dict()
    if api:
        # Full API benchmark (requires server)
        api_result = await benchmark_api_endpoint()
        results["benchmarks"]["api_endpoint"] = api_result.to_dict()
    # ----- Summary report -----
    print("\n" + "=" * 70)
    print("BENCHMARK SUMMARY")
    print("=" * 70)
    print(f"\n{'Benchmark':<35} {'Mean':>10} {'P95':>10} {'Cache%':>10}")
    print("-" * 70)
    for name, data in results["benchmarks"].items():
        mean = f"{data['mean_ms']:.1f}ms"
        p95 = f"{data['p95_ms']:.1f}ms"
        cache = f"{data['cache_hit_rate']:.1f}%" if data['cache_hit_rate'] > 0 else "N/A"
        print(f"{name:<35} {mean:>10} {p95:>10} {cache:>10}")
    print("-" * 70)
    # Performance insights
    print("\n📈 Performance Insights:")
    if "embedding_warmup" in results["benchmarks"]:
        data = results["benchmarks"]["embedding_warmup"]
        # BUGFIX: the previous check `len(data.get("latencies_ms", [])) > 1`
        # never fired because to_dict() did not export raw latencies.
        # Compare the exported summary stats instead: max_ms is the cold
        # start and min_ms a warm run, so a spread means warmup mattered.
        if data["max_ms"] > data["min_ms"]:
            print(f" • Embedding warmup eliminates {data['max_ms'] - data['min_ms']:.0f}ms cold start")
    if "template_embedding_warmup" in results["benchmarks"]:
        data = results["benchmarks"]["template_embedding_warmup"]
        if data["max_ms"] > 0:
            # max(min_ms, 1) avoids division by zero for sub-ms warm lookups
            speedup = data["max_ms"] / max(data["min_ms"], 1)
            print(f" • Template pre-computation provides {speedup:.0f}x speedup")
    if "atomic_cache" in results["benchmarks"]:
        data = results["benchmarks"]["atomic_cache"]
        print(f" • Atomic cache hit rate: {data['cache_hit_rate']:.1f}% (target: 40-70%)")
    if "api_endpoint" in results["benchmarks"]:
        data = results["benchmarks"]["api_endpoint"]
        print(f" • API P95 latency: {data['p95_ms']:.0f}ms")
        print(f" • Full query cache hit rate: {data['cache_hit_rate']:.1f}%")
    # Persist results as JSON (timestamped file under benchmark_results/)
    results_path = Path(__file__).parent / "benchmark_results" / f"perf_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    # parents=True makes directory creation robust if intermediates are missing
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with open(results_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n💾 Results saved to: {results_path}")
    print("=" * 70)
    return results
def main():
    """CLI entry point: parse flags and dispatch the benchmark suite."""
    parser = argparse.ArgumentParser(description="Heritage RAG Performance Benchmark")
    parser.add_argument(
        "--quick",
        action="store_true",
        default=True,
        help="Run quick benchmarks (no server required)",
    )
    parser.add_argument(
        "--api",
        action="store_true",
        help="Run API endpoint benchmark (requires server)",
    )
    parser.add_argument(
        "--atomic-cache",
        action="store_true",
        help="Run only atomic cache benchmark",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Run all benchmarks including API",
    )
    args = parser.parse_args()

    if args.all:
        # --all means both suites, and overrides --atomic-cache.
        args.api = True
        args.quick = True
        args.atomic_cache = False

    run_quick = args.quick and not args.atomic_cache
    asyncio.run(
        run_all_benchmarks(
            quick=run_quick,
            api=args.api,
            atomic_only=args.atomic_cache,
        )
    )


if __name__ == "__main__":
    main()