#!/usr/bin/env python3
"""Performance Benchmark Suite for Heritage RAG Pipeline.

Measures latency improvements from:
1. Embedding model warmup (eliminates 3-15s cold start)
2. Template embedding warmup (eliminates 2-5s first-query delay)
3. SPARQL connection pooling (reduces per-request overhead)
4. Atomic sub-task caching (40-70% vs 5-15% hit rate)
5. Full-query semantic caching

Usage:
    # Quick benchmark (no server required - tests components directly)
    python benchmark_performance.py --quick

    # Full API benchmark (requires server running on localhost:8000)
    python benchmark_performance.py --api

    # Atomic cache benchmark only
    python benchmark_performance.py --atomic-cache

Requirements:
    - For --api: Start server with `python main.py` first
    - For --quick: No server needed (tests components in-process)
"""

import argparse
import asyncio
import json
import statistics
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any

# Make sibling modules and the project src/ tree importable when this script
# is launched from another working directory.
sys.path.insert(0, str(Path(__file__).parent))
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src"))

# Import local modules with fallback handling. Names stay None when the module
# cannot be found at all, so individual benchmarks can fail gracefully instead
# of the whole suite dying at import time.
TemplateClassifier: Any = None
get_template_embedding_matcher: Any = None
AtomicCacheManager: Any = None
HeritageQueryDecomposer: Any = None


def _load_sibling_module(module_name: str) -> Any:
    """Load ``<module_name>.py`` from this script's own directory.

    Fallback used when the plain ``import`` fails because the benchmark was
    started from a different directory. Returns the executed module object,
    or None when the file is missing or no loadable spec can be built
    (previously a missing file raised an uncaught FileNotFoundError here).
    """
    import importlib.util

    spec = importlib.util.spec_from_file_location(
        module_name, Path(__file__).parent / f"{module_name}.py"
    )
    if spec and spec.loader:
        module = importlib.util.module_from_spec(spec)
        try:
            spec.loader.exec_module(module)
        except FileNotFoundError:
            return None  # sibling file absent: leave benchmark hooks unset
        return module
    return None


try:
    from template_sparql import TemplateClassifier, get_template_embedding_matcher  # type: ignore
except ImportError:
    _template_sparql = _load_sibling_module("template_sparql")
    if _template_sparql is not None:
        TemplateClassifier = _template_sparql.TemplateClassifier
        get_template_embedding_matcher = _template_sparql.get_template_embedding_matcher

try:
    from atomic_decomposer import AtomicCacheManager, HeritageQueryDecomposer  # type: ignore
except ImportError:
    _atomic_decomposer = _load_sibling_module("atomic_decomposer")
    if _atomic_decomposer is not None:
        AtomicCacheManager = _atomic_decomposer.AtomicCacheManager
        HeritageQueryDecomposer = _atomic_decomposer.HeritageQueryDecomposer


@dataclass
class BenchmarkResult:
    """Result from a single benchmark run.

    Collects raw per-call latencies plus cache/error counters and exposes
    derived summary statistics as read-only properties.
    """

    name: str
    iterations: int
    # Raw latency samples in milliseconds, appended per measured call.
    latencies_ms: list[float] = field(default_factory=list)
    cache_hits: int = 0
    cache_misses: int = 0
    errors: int = 0

    @property
    def mean_ms(self) -> float:
        """Arithmetic mean latency; 0 when no samples were recorded."""
        return statistics.mean(self.latencies_ms) if self.latencies_ms else 0

    @property
    def median_ms(self) -> float:
        """Median latency; 0 when no samples were recorded."""
        return statistics.median(self.latencies_ms) if self.latencies_ms else 0

    @property
    def p95_ms(self) -> float:
        """95th-percentile latency (nearest-rank); mean when fewer than 2 samples."""
        if len(self.latencies_ms) < 2:
            return self.mean_ms
        sorted_latencies = sorted(self.latencies_ms)
        idx = int(len(sorted_latencies) * 0.95)
        return sorted_latencies[min(idx, len(sorted_latencies) - 1)]

    @property
    def min_ms(self) -> float:
        """Fastest observed latency; 0 when no samples were recorded."""
        return min(self.latencies_ms) if self.latencies_ms else 0

    @property
    def max_ms(self) -> float:
        """Slowest observed latency; 0 when no samples were recorded."""
        return max(self.latencies_ms) if self.latencies_ms else 0

    @property
    def cache_hit_rate(self) -> float:
        """Cache hit percentage over all tracked lookups (0 when none tracked)."""
        total = self.cache_hits + self.cache_misses
        return (self.cache_hits / total * 100) if total > 0 else 0

    def to_dict(self) -> dict:
        """Summarize the run as a JSON-serializable dict.

        Note: only summary statistics are exported — the raw ``latencies_ms``
        samples are deliberately not included.
        """
        return {
            "name": self.name,
            "iterations": self.iterations,
            "mean_ms": round(self.mean_ms, 2),
            "median_ms": round(self.median_ms, 2),
            "p95_ms": round(self.p95_ms, 2),
            "min_ms": round(self.min_ms, 2),
            "max_ms": round(self.max_ms, 2),
            "cache_hit_rate": round(self.cache_hit_rate, 2),
            "errors": self.errors,
        }
# =============================================================================
# TEST QUERIES - Diverse set covering all query types
# =============================================================================

TEST_QUERIES = [
    # Template-matchable queries (should use fast template SPARQL)
    {"query": "Welke musea zijn er in Amsterdam?", "type": "template", "lang": "nl"},
    {"query": "Hoeveel bibliotheken zijn er in Rotterdam?", "type": "template", "lang": "nl"},
    {"query": "Welke archieven zijn er in Utrecht?", "type": "template", "lang": "nl"},
    {"query": "What museums are in The Hague?", "type": "template", "lang": "en"},
    # Atomic-decomposable queries (benefit from sub-task caching)
    {"query": "Hoeveel musea in Amsterdam hebben een ISIL code?", "type": "atomic", "lang": "nl"},
    {"query": "Welke archieven in Noord-Holland hebben een website?", "type": "atomic", "lang": "nl"},
    {"query": "How many libraries in Groningen have a Wikidata ID?", "type": "atomic", "lang": "en"},
    # Similar queries (test sub-task cache reuse)
    {"query": "Hoeveel musea in Amsterdam hebben een website?", "type": "atomic_reuse", "lang": "nl"},
    {"query": "Welke musea in Amsterdam zijn open?", "type": "atomic_reuse", "lang": "nl"},
    # LLM-required queries (complex, no template match)
    {"query": "Vergelijk de collecties van het Rijksmuseum en het Van Gogh Museum", "type": "llm", "lang": "nl"},
    {"query": "What is the history of archives in Friesland?", "type": "llm", "lang": "en"},
]


# =============================================================================
# BENCHMARK 1: Embedding Model Warmup
# =============================================================================

def benchmark_embedding_warmup() -> "BenchmarkResult":
    """Benchmark embedding model cold start vs warm.

    First sample is the cold start (model load + first encode); the remaining
    four samples are warm encodes on the already-loaded model.
    """
    print("\nšŸ“Š Benchmark: Embedding Model Warmup")
    print("-" * 50)
    result = BenchmarkResult(name="embedding_warmup", iterations=5)
    try:
        from sentence_transformers import SentenceTransformer

        # Cold start: instantiate a fresh model and encode once.
        print(" Testing cold start...")
        # perf_counter() is monotonic — immune to wall-clock adjustments,
        # unlike the time.time() the original used for latency measurement.
        start = time.perf_counter()
        model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
        _ = model.encode("test query")
        cold_ms = (time.perf_counter() - start) * 1000
        result.latencies_ms.append(cold_ms)
        print(f" Cold start: {cold_ms:.0f}ms")

        # Warm queries: model already loaded, only encode cost remains.
        print(" Testing warm queries...")
        for i in range(4):
            start = time.perf_counter()
            _ = model.encode(f"test query {i}")
            warm_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(warm_ms)

        warm_avg = statistics.mean(result.latencies_ms[1:])
        print(f" Warm average: {warm_avg:.1f}ms")
        print(f" Speedup: {cold_ms / warm_avg:.1f}x faster when warm")
    except Exception as e:
        print(f" ERROR: {e}")
        result.errors += 1
    return result


# =============================================================================
# BENCHMARK 2: Template Embedding Warmup
# =============================================================================

def benchmark_template_embedding_warmup() -> "BenchmarkResult":
    """Benchmark template embedding computation cold vs warm.

    Clears the matcher's embedding cache, times a full recompute (cold), then
    times four lookups against the now-cached embeddings (warm).
    """
    print("\nšŸ“Š Benchmark: Template Embedding Warmup")
    print("-" * 50)
    result = BenchmarkResult(name="template_embedding_warmup", iterations=5)
    try:
        # Uses module-level TemplateClassifier / get_template_embedding_matcher.
        classifier = TemplateClassifier()
        templates = classifier._load_templates()
        if not templates:
            print(" No templates found!")
            result.errors += 1
            return result

        # Cold start: wipe the embedding cache and force a full recompute.
        print(f" Testing cold start ({len(templates)} templates)...")
        matcher = get_template_embedding_matcher()
        matcher._template_embeddings = {}  # Clear cache
        start = time.perf_counter()
        matcher._ensure_embeddings_computed(templates)
        cold_ms = (time.perf_counter() - start) * 1000
        result.latencies_ms.append(cold_ms)
        print(f" Cold computation: {cold_ms:.0f}ms")

        # Warm lookups: embeddings are cached now.
        print(" Testing warm lookups...")
        for _ in range(4):
            start = time.perf_counter()
            _ = matcher.match("Welke musea zijn er in Amsterdam?", templates)
            warm_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(warm_ms)

        warm_avg = statistics.mean(result.latencies_ms[1:])
        print(f" Warm lookup average: {warm_avg:.1f}ms")
        print(f" Speedup: {cold_ms / warm_avg:.1f}x faster when warm")
    except Exception as e:
        print(f" ERROR: {e}")
        result.errors += 1
    return result


# =============================================================================
# BENCHMARK 3: Atomic Sub-task Caching
# =============================================================================

async def benchmark_atomic_cache() -> "BenchmarkResult":
    """Benchmark atomic sub-task cache hit rates.

    Runs every TEST_QUERIES entry through the cache manager (which decomposes
    the query into sub-tasks internally), counts per-sub-task hits/misses,
    and seeds dummy results so later similar queries can reuse sub-tasks.
    """
    print("\nšŸ“Š Benchmark: Atomic Sub-task Caching")
    print("-" * 50)
    result = BenchmarkResult(name="atomic_cache", iterations=len(TEST_QUERIES))
    try:
        # In-memory only for the benchmark (no semantic cache backend).
        # NOTE: the original also instantiated an unused HeritageQueryDecomposer
        # here; decomposition happens inside cache_mgr.process_query.
        cache_mgr = AtomicCacheManager(semantic_cache=None)

        print(f" Processing {len(TEST_QUERIES)} queries...")
        for i, q in enumerate(TEST_QUERIES):
            query = q["query"]
            lang = q["lang"]

            start = time.perf_counter()
            decomposed, _ = await cache_mgr.process_query(query, lang)
            elapsed_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(elapsed_ms)

            # Track per-sub-task cache stats.
            for task in decomposed.sub_tasks:
                if task.cache_hit:
                    result.cache_hits += 1
                else:
                    result.cache_misses += 1

            # Cache a dummy result for every miss so the next similar query
            # can hit on shared sub-tasks.
            for task in decomposed.sub_tasks:
                if not task.cache_hit:
                    await cache_mgr.cache_subtask_result(
                        task=task,
                        result={"dummy": True, "query_idx": i},
                        language=lang,
                    )

            status = "āœ“ partial hit" if decomposed.partial_cache_hits > 0 else "ā—‹ miss"
            if decomposed.fully_cached:
                status = "ā˜… full hit"
            print(f" [{i+1:2d}] {status} - {decomposed.partial_cache_hits}/{len(decomposed.sub_tasks)} sub-tasks - {q['type']}")

        # Final aggregate stats from the cache manager itself.
        stats = cache_mgr.get_stats()
        print("\n Final stats:")
        print(f" Sub-task hit rate: {stats['subtask_hit_rate']:.1f}%")
        print(f" Total sub-tasks: {stats['subtask_hits'] + stats['subtask_misses']}")
        print(f" Full reassemblies: {stats['full_query_reassemblies']}")
    except Exception as e:
        print(f" ERROR: {e}")
        result.errors += 1
    return result


# =============================================================================
# BENCHMARK 4: Template Pattern Matching Speed
# =============================================================================

def benchmark_template_matching() -> "BenchmarkResult":
    """Benchmark 3-tier template matching: exact → pattern → embedding.

    Note: This benchmark tests only pattern and embedding matching
    (Tiers 1-2). The full TemplateClassifier.forward() also includes LLM
    classification (Tier 3), but we skip that here to isolate the fast-path
    performance. Matches count as cache_hits, non-matches as cache_misses.
    """
    print("\nšŸ“Š Benchmark: Template Pattern Matching (3-tier)")
    print("-" * 50)
    result = BenchmarkResult(name="template_matching", iterations=len(TEST_QUERIES))
    try:
        classifier = TemplateClassifier()
        templates = classifier._load_templates()
        if not templates:
            print(" No templates found!")
            result.errors += 1
            return result

        # Track which tier produced the match.
        tier_counts = {"pattern": 0, "embedding": 0, "none": 0}
        embedding_matcher = get_template_embedding_matcher()

        for q in TEST_QUERIES:
            query = q["query"]
            start = time.perf_counter()

            # Tier 1: Pattern matching (cheapest path).
            pattern_match = classifier._match_by_patterns(query, templates)
            if pattern_match and pattern_match.confidence >= 0.75:
                elapsed_ms = (time.perf_counter() - start) * 1000
                result.latencies_ms.append(elapsed_ms)
                tier_counts["pattern"] += 1
                result.cache_hits += 1
                continue

            # Tier 2: Embedding matching.
            embedding_match = embedding_matcher.match(query, templates, min_similarity=0.70)
            elapsed_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(elapsed_ms)
            if embedding_match and embedding_match.confidence >= 0.70:
                tier_counts["embedding"] += 1
                result.cache_hits += 1
            else:
                tier_counts["none"] += 1
                result.cache_misses += 1

        print(" Match distribution:")
        for tier, count in tier_counts.items():
            pct = count / len(TEST_QUERIES) * 100
            print(f" {tier}: {count} ({pct:.0f}%)")
        print(f" Average latency: {result.mean_ms:.1f}ms")
    except Exception as e:
        print(f" ERROR: {e}")
        result.errors += 1
    return result


# =============================================================================
# BENCHMARK 5: Full API Endpoint (requires server)
# =============================================================================

async def benchmark_api_endpoint(base_url: str = "http://localhost:8000") -> "BenchmarkResult":
    """Benchmark full API endpoint latency against a running server.

    Args:
        base_url: Root URL of the running Heritage RAG server.
    """
    print("\nšŸ“Š Benchmark: Full API Endpoint (/api/rag/dspy/query)")
    print("-" * 50)
    result = BenchmarkResult(name="api_endpoint", iterations=len(TEST_QUERIES))
    try:
        import httpx

        async with httpx.AsyncClient(timeout=60.0) as client:
            # Warm up (and probe that the server is actually reachable).
            print(" Warming up API...")
            try:
                await client.post(
                    f"{base_url}/api/rag/dspy/query",
                    json={"question": "test", "language": "nl"},
                )
            except Exception:
                print(" WARNING: API not responding, skipping benchmark")
                result.errors += 1
                return result

            print(f" Processing {len(TEST_QUERIES)} queries...")
            for i, q in enumerate(TEST_QUERIES):
                start = time.perf_counter()
                try:
                    response = await client.post(
                        f"{base_url}/api/rag/dspy/query",
                        json={
                            "question": q["query"],
                            "language": q["lang"],
                            "skip_cache": False,  # Use cache
                        },
                    )
                    elapsed_ms = (time.perf_counter() - start) * 1000
                    result.latencies_ms.append(elapsed_ms)

                    if response.status_code == 200:
                        data = response.json()
                        cache_hit = data.get("cache_hit", False)
                        template_used = data.get("template_used", False)
                        # Template answers count as neither cache hit nor miss,
                        # so cache_hit_rate reflects the semantic cache only.
                        if cache_hit:
                            result.cache_hits += 1
                            status = "ā˜… cache hit"
                        elif template_used:
                            status = "āœ“ template"
                        else:
                            result.cache_misses += 1
                            status = "ā—‹ LLM"
                        print(f" [{i+1:2d}] {status} - {elapsed_ms:.0f}ms - {q['type']}")
                    else:
                        print(f" [{i+1:2d}] ERROR: HTTP {response.status_code}")
                        result.errors += 1
                except Exception as e:
                    print(f" [{i+1:2d}] ERROR: {e}")
                    result.errors += 1

            print("\n Results:")
            print(f" Mean latency: {result.mean_ms:.0f}ms")
            print(f" P95 latency: {result.p95_ms:.0f}ms")
            print(f" Cache hit rate: {result.cache_hit_rate:.1f}%")
    except ImportError:
        print(" ERROR: httpx not installed")
        result.errors += 1
    return result


# =============================================================================
# MAIN BENCHMARK RUNNER
# =============================================================================

async def run_all_benchmarks(quick: bool = True, api: bool = False, atomic_only: bool = False) -> dict:
    """Run the selected benchmarks, print a summary, and save JSON results.

    Args:
        quick: Run the in-process component benchmarks (no server needed).
        api: Also run the full API endpoint benchmark (requires a server).
        atomic_only: Run only the atomic cache benchmark (overrides ``quick``).

    Returns:
        Dict with timestamp, mode label, and per-benchmark summary stats.
    """
    print("\n" + "=" * 70)
    print("Heritage RAG Performance Benchmark Suite")
    print(f"Timestamp: {datetime.now().isoformat()}")
    print("=" * 70)

    results: dict[str, Any] = {
        "timestamp": datetime.now().isoformat(),
        "mode": "quick" if quick else ("api" if api else "atomic"),
        "benchmarks": {},
    }

    if atomic_only:
        # Only run atomic cache benchmark
        results["benchmarks"]["atomic_cache"] = (await benchmark_atomic_cache()).to_dict()
    elif quick:
        # Quick benchmarks (no server required)
        results["benchmarks"]["embedding_warmup"] = benchmark_embedding_warmup().to_dict()
        results["benchmarks"]["template_embedding_warmup"] = benchmark_template_embedding_warmup().to_dict()
        results["benchmarks"]["template_matching"] = benchmark_template_matching().to_dict()
        results["benchmarks"]["atomic_cache"] = (await benchmark_atomic_cache()).to_dict()

    if api:
        # Full API benchmark (requires server)
        results["benchmarks"]["api_endpoint"] = (await benchmark_api_endpoint()).to_dict()

    # ==========================================================================
    # SUMMARY REPORT
    # ==========================================================================
    print("\n" + "=" * 70)
    print("BENCHMARK SUMMARY")
    print("=" * 70)
    print(f"\n{'Benchmark':<35} {'Mean':>10} {'P95':>10} {'Cache%':>10}")
    print("-" * 70)
    for name, data in results["benchmarks"].items():
        mean = f"{data['mean_ms']:.1f}ms"
        p95 = f"{data['p95_ms']:.1f}ms"
        cache = f"{data['cache_hit_rate']:.1f}%" if data["cache_hit_rate"] > 0 else "N/A"
        print(f"{name:<35} {mean:>10} {p95:>10} {cache:>10}")
    print("-" * 70)

    # Performance insights
    print("\nšŸ“ˆ Performance Insights:")
    benchmarks = results["benchmarks"]
    if "embedding_warmup" in benchmarks:
        data = benchmarks["embedding_warmup"]
        # BUGFIX: to_dict() exports summary stats only, so the original check
        # on data.get("latencies_ms") was always empty and this insight never
        # printed. The max/min spread carries the same cold-vs-warm signal.
        if data["max_ms"] > data["min_ms"]:
            print(f" • Embedding warmup eliminates {data['max_ms'] - data['min_ms']:.0f}ms cold start")
    if "template_embedding_warmup" in benchmarks:
        data = benchmarks["template_embedding_warmup"]
        if data["max_ms"] > 0:
            speedup = data["max_ms"] / max(data["min_ms"], 1)
            print(f" • Template pre-computation provides {speedup:.0f}x speedup")
    if "atomic_cache" in benchmarks:
        data = benchmarks["atomic_cache"]
        print(f" • Atomic cache hit rate: {data['cache_hit_rate']:.1f}% (target: 40-70%)")
    if "api_endpoint" in benchmarks:
        data = benchmarks["api_endpoint"]
        print(f" • API P95 latency: {data['p95_ms']:.0f}ms")
        print(f" • Full query cache hit rate: {data['cache_hit_rate']:.1f}%")

    # Save results. parents=True so a missing intermediate directory cannot
    # crash the save (original used exist_ok=True only).
    results_path = (
        Path(__file__).parent
        / "benchmark_results"
        / f"perf_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    )
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with open(results_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nšŸ’¾ Results saved to: {results_path}")
    print("=" * 70)
    return results


def main():
    """Parse CLI flags and dispatch to the benchmark runner.

    Note: --quick defaults to True, so quick benchmarks also run alongside
    --api unless --atomic-cache is given (preserved original behavior).
    """
    parser = argparse.ArgumentParser(description="Heritage RAG Performance Benchmark")
    parser.add_argument("--quick", action="store_true", default=True, help="Run quick benchmarks (no server required)")
    parser.add_argument("--api", action="store_true", help="Run API endpoint benchmark (requires server)")
    parser.add_argument("--atomic-cache", action="store_true", help="Run only atomic cache benchmark")
    parser.add_argument("--all", action="store_true", help="Run all benchmarks including API")
    args = parser.parse_args()

    if args.all:
        args.api = True
        args.quick = True
        args.atomic_cache = False

    asyncio.run(run_all_benchmarks(
        quick=args.quick and not args.atomic_cache,
        api=args.api,
        atomic_only=args.atomic_cache,
    ))


if __name__ == "__main__":
    main()