597 lines
22 KiB
Python
597 lines
22 KiB
Python
#!/usr/bin/env python3
"""
Performance Benchmark Suite for Heritage RAG Pipeline

Measures latency improvements from:
1. Embedding model warmup (eliminates 3-15s cold start)
2. Template embedding warmup (eliminates 2-5s first-query delay)
3. SPARQL connection pooling (reduces per-request overhead)
4. Atomic sub-task caching (40-70% vs 5-15% hit rate)
5. Full-query semantic caching

Usage:
    # Quick benchmark (no server required - tests components directly)
    python benchmark_performance.py --quick

    # Full API benchmark (requires server running on localhost:8000)
    python benchmark_performance.py --api

    # Atomic cache benchmark only
    python benchmark_performance.py --atomic-cache

Requirements:
- For --api: Start server with `python main.py` first
- For --quick: No server needed (tests components in-process)
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import statistics
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# Add parent paths for imports
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src"))
|
|
|
|
# Import local modules with fallback handling.
# The names default to None so they are always defined at module level even
# if both the package import and the file-location fallback fail; callers
# wrap their use in try/except and report the failure as a benchmark error.
TemplateClassifier: Any = None
get_template_embedding_matcher: Any = None
AtomicCacheManager: Any = None
HeritageQueryDecomposer: Any = None

try:
    from template_sparql import TemplateClassifier, get_template_embedding_matcher  # type: ignore
except ImportError:
    # Fallback for when running from different directory:
    # load template_sparql.py directly from this script's own directory.
    import importlib.util
    _spec = importlib.util.spec_from_file_location(
        "template_sparql",
        Path(__file__).parent / "template_sparql.py"
    )
    if _spec and _spec.loader:
        _template_sparql = importlib.util.module_from_spec(_spec)
        _spec.loader.exec_module(_template_sparql)
        TemplateClassifier = _template_sparql.TemplateClassifier
        get_template_embedding_matcher = _template_sparql.get_template_embedding_matcher

try:
    from atomic_decomposer import AtomicCacheManager, HeritageQueryDecomposer  # type: ignore
except ImportError:
    # Same file-location fallback strategy for atomic_decomposer.py.
    import importlib.util
    _spec2 = importlib.util.spec_from_file_location(
        "atomic_decomposer",
        Path(__file__).parent / "atomic_decomposer.py"
    )
    if _spec2 and _spec2.loader:
        _atomic_decomposer = importlib.util.module_from_spec(_spec2)
        _spec2.loader.exec_module(_atomic_decomposer)
        AtomicCacheManager = _atomic_decomposer.AtomicCacheManager
        HeritageQueryDecomposer = _atomic_decomposer.HeritageQueryDecomposer
|
|
|
|
|
|
@dataclass
class BenchmarkResult:
    """Latency samples and cache counters collected by one benchmark run."""

    name: str        # benchmark identifier used in reports
    iterations: int  # planned number of runs
    latencies_ms: list[float] = field(default_factory=list)  # one sample per run
    cache_hits: int = 0
    cache_misses: int = 0
    errors: int = 0

    @property
    def mean_ms(self) -> float:
        """Arithmetic mean of the samples (0 when there are none)."""
        if not self.latencies_ms:
            return 0
        return statistics.mean(self.latencies_ms)

    @property
    def median_ms(self) -> float:
        """Median of the samples (0 when there are none)."""
        if not self.latencies_ms:
            return 0
        return statistics.median(self.latencies_ms)

    @property
    def p95_ms(self) -> float:
        """Nearest-rank 95th percentile; falls back to the mean for <2 samples."""
        samples = self.latencies_ms
        if len(samples) < 2:
            return self.mean_ms
        ordered = sorted(samples)
        rank = min(int(len(ordered) * 0.95), len(ordered) - 1)
        return ordered[rank]

    @property
    def min_ms(self) -> float:
        """Fastest sample (0 when there are none)."""
        return min(self.latencies_ms) if self.latencies_ms else 0

    @property
    def max_ms(self) -> float:
        """Slowest sample (0 when there are none)."""
        return max(self.latencies_ms) if self.latencies_ms else 0

    @property
    def cache_hit_rate(self) -> float:
        """Cache hits as a percentage of all cache lookups (0 when none)."""
        lookups = self.cache_hits + self.cache_misses
        if lookups == 0:
            return 0
        return self.cache_hits / lookups * 100

    def to_dict(self) -> dict:
        """Serialize the summary statistics (rounded to 2 decimals) for JSON output."""
        summary = {"name": self.name, "iterations": self.iterations}
        for metric in ("mean_ms", "median_ms", "p95_ms", "min_ms", "max_ms", "cache_hit_rate"):
            summary[metric] = round(getattr(self, metric), 2)
        summary["errors"] = self.errors
        return summary
|
|
|
|
|
|
# =============================================================================
# TEST QUERIES - Diverse set covering all query types
# =============================================================================
# Each entry: {"query": natural-language question, "type": expected handling
# path, "lang": language code}. The "type" field is only used for labeling
# benchmark output, not for routing.

TEST_QUERIES = [
    # Template-matchable queries (should use fast template SPARQL)
    {"query": "Welke musea zijn er in Amsterdam?", "type": "template", "lang": "nl"},
    {"query": "Hoeveel bibliotheken zijn er in Rotterdam?", "type": "template", "lang": "nl"},
    {"query": "Welke archieven zijn er in Utrecht?", "type": "template", "lang": "nl"},
    {"query": "What museums are in The Hague?", "type": "template", "lang": "en"},

    # Atomic-decomposable queries (benefit from sub-task caching)
    {"query": "Hoeveel musea in Amsterdam hebben een ISIL code?", "type": "atomic", "lang": "nl"},
    {"query": "Welke archieven in Noord-Holland hebben een website?", "type": "atomic", "lang": "nl"},
    {"query": "How many libraries in Groningen have a Wikidata ID?", "type": "atomic", "lang": "en"},

    # Similar queries (test sub-task cache reuse)
    {"query": "Hoeveel musea in Amsterdam hebben een website?", "type": "atomic_reuse", "lang": "nl"},
    {"query": "Welke musea in Amsterdam zijn open?", "type": "atomic_reuse", "lang": "nl"},

    # LLM-required queries (complex, no template match)
    {"query": "Vergelijk de collecties van het Rijksmuseum en het Van Gogh Museum", "type": "llm", "lang": "nl"},
    {"query": "What is the history of archives in Friesland?", "type": "llm", "lang": "en"},
]
|
|
|
|
|
|
# =============================================================================
|
|
# BENCHMARK 1: Embedding Model Warmup
|
|
# =============================================================================
|
|
|
|
def benchmark_embedding_warmup() -> BenchmarkResult:
    """Benchmark embedding model cold start vs warm encodes.

    Loads a fresh SentenceTransformer and measures the cold start (model
    load + first encode in one sample), then four warm encodes on the same
    instance. The first recorded latency is the cold start; the rest are warm.

    Returns:
        BenchmarkResult with up to 5 samples (1 cold + 4 warm); ``errors``
        is incremented if the model cannot be loaded.
    """
    print("\n📊 Benchmark: Embedding Model Warmup")
    print("-" * 50)

    result = BenchmarkResult(name="embedding_warmup", iterations=5)

    try:
        # Imported lazily so the rest of the suite works without the package.
        from sentence_transformers import SentenceTransformer

        # Cold start: model load + first encode in a single measurement.
        print(" Testing cold start...")
        # perf_counter is monotonic; time.time() can jump with wall-clock changes.
        start = time.perf_counter()
        model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
        _ = model.encode("test query")
        cold_ms = (time.perf_counter() - start) * 1000
        result.latencies_ms.append(cold_ms)
        print(f" Cold start: {cold_ms:.0f}ms")

        # Warm queries: model already resident, so only encode cost remains.
        print(" Testing warm queries...")
        for i in range(4):
            start = time.perf_counter()
            _ = model.encode(f"test query {i}")
            warm_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(warm_ms)

        warm_avg = statistics.mean(result.latencies_ms[1:])
        print(f" Warm average: {warm_avg:.1f}ms")
        if warm_avg > 0:  # guard against ZeroDivisionError on sub-resolution timings
            print(f" Speedup: {cold_ms / warm_avg:.1f}x faster when warm")

    except Exception as e:
        # Best-effort benchmark: report the failure instead of crashing the suite.
        print(f" ERROR: {e}")
        result.errors += 1

    return result
|
|
|
|
|
|
# =============================================================================
|
|
# BENCHMARK 2: Template Embedding Warmup
|
|
# =============================================================================
|
|
|
|
def benchmark_template_embedding_warmup() -> BenchmarkResult:
    """Benchmark template embedding computation cold vs warm.

    Clears the matcher's embedding cache, times a forced recomputation of
    all template embeddings (cold), then times four matches against the
    warm cache.

    Returns:
        BenchmarkResult with up to 5 samples (1 cold + 4 warm); ``errors``
        is incremented when no templates are found or a step fails.
    """
    print("\n📊 Benchmark: Template Embedding Warmup")
    print("-" * 50)

    result = BenchmarkResult(name="template_embedding_warmup", iterations=5)

    try:
        # Uses the module-level TemplateClassifier / get_template_embedding_matcher
        # resolved at import time (with file-location fallback).
        classifier = TemplateClassifier()
        templates = classifier._load_templates()

        if not templates:
            print(" No templates found!")
            result.errors += 1
            return result

        # Cold start: clear the private embedding cache to force recomputation.
        print(f" Testing cold start ({len(templates)} templates)...")
        matcher = get_template_embedding_matcher()
        matcher._template_embeddings = {}  # deliberately reaches into a private attr

        # perf_counter is monotonic; time.time() can jump with wall-clock changes.
        start = time.perf_counter()
        matcher._ensure_embeddings_computed(templates)
        cold_ms = (time.perf_counter() - start) * 1000
        result.latencies_ms.append(cold_ms)
        print(f" Cold computation: {cold_ms:.0f}ms")

        # Warm lookups: embeddings are cached, only similarity search remains.
        print(" Testing warm lookups...")
        for _ in range(4):
            start = time.perf_counter()
            _ = matcher.match("Welke musea zijn er in Amsterdam?", templates)
            warm_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(warm_ms)

        warm_avg = statistics.mean(result.latencies_ms[1:])
        print(f" Warm lookup average: {warm_avg:.1f}ms")
        if warm_avg > 0:  # guard against ZeroDivisionError on sub-resolution timings
            print(f" Speedup: {cold_ms / warm_avg:.1f}x faster when warm")

    except Exception as e:
        # Best-effort benchmark: report the failure instead of crashing the suite.
        print(f" ERROR: {e}")
        result.errors += 1

    return result
|
|
|
|
|
|
# =============================================================================
|
|
# BENCHMARK 3: Atomic Sub-task Caching
|
|
# =============================================================================
|
|
|
|
async def benchmark_atomic_cache() -> BenchmarkResult:
    """Benchmark atomic sub-task cache hit rates over TEST_QUERIES.

    Processes every test query through an in-memory AtomicCacheManager,
    records per-query latency and per-sub-task hit/miss counts, and caches
    a dummy result for every miss so that later similar queries can get
    partial or full cache hits.

    Returns:
        BenchmarkResult with one latency sample per test query.
    """
    print("\n📊 Benchmark: Atomic Sub-task Caching")
    print("-" * 50)

    result = BenchmarkResult(name="atomic_cache", iterations=len(TEST_QUERIES))

    try:
        # In-memory cache only: no semantic backend needed for the benchmark.
        cache_mgr = AtomicCacheManager(semantic_cache=None)
        # NOTE(review): decomposer is never used below — confirm whether the
        # constructor has required side effects or this can be removed.
        decomposer = HeritageQueryDecomposer()

        print(f" Processing {len(TEST_QUERIES)} queries...")

        for i, q in enumerate(TEST_QUERIES):
            query = q["query"]
            lang = q["lang"]

            # perf_counter is monotonic; time.time() can jump with clock changes.
            start = time.perf_counter()
            decomposed, _cached = await cache_mgr.process_query(query, lang)
            elapsed_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(elapsed_ms)

            # Tally per-sub-task cache outcomes.
            for task in decomposed.sub_tasks:
                if task.cache_hit:
                    result.cache_hits += 1
                else:
                    result.cache_misses += 1

            # Populate the cache for missed sub-tasks so subsequent similar
            # queries can reuse them.
            for task in decomposed.sub_tasks:
                if not task.cache_hit:
                    # Cache a dummy result
                    await cache_mgr.cache_subtask_result(
                        task=task,
                        result={"dummy": True, "query_idx": i},
                        language=lang,
                    )

            status = "✓ partial hit" if decomposed.partial_cache_hits > 0 else "○ miss"
            if decomposed.fully_cached:
                status = "★ full hit"

            print(f" [{i+1:2d}] {status} - {decomposed.partial_cache_hits}/{len(decomposed.sub_tasks)} sub-tasks - {q['type']}")

        # Summarize the cache manager's own statistics.
        stats = cache_mgr.get_stats()
        print("\n Final stats:")
        print(f" Sub-task hit rate: {stats['subtask_hit_rate']:.1f}%")
        print(f" Total sub-tasks: {stats['subtask_hits'] + stats['subtask_misses']}")
        print(f" Full reassemblies: {stats['full_query_reassemblies']}")

    except Exception as e:
        # Best-effort benchmark: report the failure instead of crashing the suite.
        print(f" ERROR: {e}")
        result.errors += 1

    return result
|
|
|
|
|
|
# =============================================================================
|
|
# BENCHMARK 4: Template Pattern Matching Speed
|
|
# =============================================================================
|
|
|
|
def benchmark_template_matching() -> BenchmarkResult:
    """Benchmark 3-tier template matching: exact → pattern → embedding.

    Runs each test query through Tier 1 (pattern matching, accepted at
    confidence >= 0.75) and, when that fails, Tier 2 (embedding similarity
    at >= 0.70). The full TemplateClassifier.forward() also includes LLM
    classification (Tier 3), but that is skipped here to isolate the
    fast-path performance. ``cache_hits``/``cache_misses`` are repurposed
    to count matched vs unmatched queries.

    Returns:
        BenchmarkResult with one latency sample per test query.
    """
    print("\n📊 Benchmark: Template Pattern Matching (3-tier)")
    print("-" * 50)

    result = BenchmarkResult(name="template_matching", iterations=len(TEST_QUERIES))

    try:
        classifier = TemplateClassifier()
        templates = classifier._load_templates()

        if not templates:
            print(" No templates found!")
            result.errors += 1
            return result

        # How many queries each tier resolved ("none" = would fall to LLM).
        tier_counts = {"pattern": 0, "embedding": 0, "none": 0}

        embedding_matcher = get_template_embedding_matcher()

        for q in TEST_QUERIES:
            query = q["query"]

            # perf_counter is monotonic; time.time() can jump with clock changes.
            start = time.perf_counter()

            # Tier 1: cheap pattern matching; accept only confident matches.
            pattern_match = classifier._match_by_patterns(query, templates)
            if pattern_match and pattern_match.confidence >= 0.75:
                elapsed_ms = (time.perf_counter() - start) * 1000
                result.latencies_ms.append(elapsed_ms)
                tier_counts["pattern"] += 1
                result.cache_hits += 1
                continue

            # Tier 2: embedding similarity (sample includes Tier 1 time).
            embedding_match = embedding_matcher.match(query, templates, min_similarity=0.70)
            elapsed_ms = (time.perf_counter() - start) * 1000
            result.latencies_ms.append(elapsed_ms)

            if embedding_match and embedding_match.confidence >= 0.70:
                tier_counts["embedding"] += 1
                result.cache_hits += 1
            else:
                tier_counts["none"] += 1
                result.cache_misses += 1

        print(" Match distribution:")
        for tier, count in tier_counts.items():
            pct = count / len(TEST_QUERIES) * 100
            print(f" {tier}: {count} ({pct:.0f}%)")

        print(f" Average latency: {result.mean_ms:.1f}ms")

    except Exception as e:
        # Best-effort benchmark: report the failure instead of crashing the suite.
        print(f" ERROR: {e}")
        result.errors += 1

    return result
|
|
|
|
|
|
# =============================================================================
|
|
# BENCHMARK 5: Full API Endpoint (requires server)
|
|
# =============================================================================
|
|
|
|
async def benchmark_api_endpoint(base_url: str = "http://localhost:8000") -> BenchmarkResult:
    """Benchmark the full RAG API endpoint end to end.

    Sends one warm-up request (which doubles as a reachability probe), then
    posts every test query to ``{base_url}/api/rag/dspy/query`` and records
    wall latency plus the server-reported cache/template status.

    Args:
        base_url: Root URL of a running server (default localhost:8000).

    Returns:
        BenchmarkResult; ``errors`` is incremented when httpx is missing,
        the API does not respond, or individual requests fail.
    """
    print("\n📊 Benchmark: Full API Endpoint (/api/rag/dspy/query)")
    print("-" * 50)

    result = BenchmarkResult(name="api_endpoint", iterations=len(TEST_QUERIES))

    try:
        import httpx  # optional dependency: only needed for this benchmark

        async with httpx.AsyncClient(timeout=60.0) as client:
            # Warm up (and verify the server is reachable at all).
            print(" Warming up API...")
            try:
                await client.post(
                    f"{base_url}/api/rag/dspy/query",
                    json={"question": "test", "language": "nl"},
                )
            except Exception:
                print(" WARNING: API not responding, skipping benchmark")
                result.errors += 1
                return result

            print(f" Processing {len(TEST_QUERIES)} queries...")

            for i, q in enumerate(TEST_QUERIES):
                # perf_counter is monotonic; time.time() can jump with clock changes.
                start = time.perf_counter()
                try:
                    response = await client.post(
                        f"{base_url}/api/rag/dspy/query",
                        json={
                            "question": q["query"],
                            "language": q["lang"],
                            "skip_cache": False,  # exercise the cache path
                        },
                    )
                    elapsed_ms = (time.perf_counter() - start) * 1000
                    result.latencies_ms.append(elapsed_ms)

                    if response.status_code == 200:
                        data = response.json()
                        cache_hit = data.get("cache_hit", False)
                        template_used = data.get("template_used", False)

                        if cache_hit:
                            result.cache_hits += 1
                            status = "★ cache hit"
                        elif template_used:
                            # NOTE(review): template responses count as neither
                            # cache hit nor miss — confirm that is intentional.
                            status = "✓ template"
                        else:
                            result.cache_misses += 1
                            status = "○ LLM"

                        print(f" [{i+1:2d}] {status} - {elapsed_ms:.0f}ms - {q['type']}")
                    else:
                        print(f" [{i+1:2d}] ERROR: HTTP {response.status_code}")
                        result.errors += 1

                except Exception as e:
                    print(f" [{i+1:2d}] ERROR: {e}")
                    result.errors += 1

        print("\n Results:")
        print(f" Mean latency: {result.mean_ms:.0f}ms")
        print(f" P95 latency: {result.p95_ms:.0f}ms")
        print(f" Cache hit rate: {result.cache_hit_rate:.1f}%")

    except ImportError:
        print(" ERROR: httpx not installed")
        result.errors += 1

    return result
|
|
|
|
|
|
# =============================================================================
|
|
# MAIN BENCHMARK RUNNER
|
|
# =============================================================================
|
|
|
|
async def run_all_benchmarks(quick: bool = True, api: bool = False, atomic_only: bool = False) -> dict:
    """Run the selected benchmarks, print a summary, and save JSON results.

    Args:
        quick: Run the in-process component benchmarks (no server needed).
        api: Additionally run the full API benchmark (requires a server).
        atomic_only: Run only the atomic cache benchmark (overrides quick).

    Returns:
        Dict with a timestamp, the run mode, and per-benchmark summaries.
    """

    print("\n" + "=" * 70)
    print("Heritage RAG Performance Benchmark Suite")
    print(f"Timestamp: {datetime.now().isoformat()}")
    print("=" * 70)

    results: dict[str, Any] = {
        "timestamp": datetime.now().isoformat(),
        "mode": "quick" if quick else ("api" if api else "atomic"),
        "benchmarks": {},
    }

    if atomic_only:
        # Only run the atomic cache benchmark.
        atomic_result = await benchmark_atomic_cache()
        results["benchmarks"]["atomic_cache"] = atomic_result.to_dict()

    elif quick:
        # Quick benchmarks (no server required).

        # 1. Embedding warmup
        embed_result = benchmark_embedding_warmup()
        results["benchmarks"]["embedding_warmup"] = embed_result.to_dict()

        # 2. Template embedding warmup
        template_embed_result = benchmark_template_embedding_warmup()
        results["benchmarks"]["template_embedding_warmup"] = template_embed_result.to_dict()

        # 3. Template matching
        template_match_result = benchmark_template_matching()
        results["benchmarks"]["template_matching"] = template_match_result.to_dict()

        # 4. Atomic cache
        atomic_result = await benchmark_atomic_cache()
        results["benchmarks"]["atomic_cache"] = atomic_result.to_dict()

    if api:
        # Full API benchmark (requires server).
        api_result = await benchmark_api_endpoint()
        results["benchmarks"]["api_endpoint"] = api_result.to_dict()

    # ==========================================================================
    # SUMMARY REPORT
    # ==========================================================================

    print("\n" + "=" * 70)
    print("BENCHMARK SUMMARY")
    print("=" * 70)

    print(f"\n{'Benchmark':<35} {'Mean':>10} {'P95':>10} {'Cache%':>10}")
    print("-" * 70)

    for name, data in results["benchmarks"].items():
        mean = f"{data['mean_ms']:.1f}ms"
        p95 = f"{data['p95_ms']:.1f}ms"
        cache = f"{data['cache_hit_rate']:.1f}%" if data['cache_hit_rate'] > 0 else "N/A"
        print(f"{name:<35} {mean:>10} {p95:>10} {cache:>10}")

    print("-" * 70)

    # Performance insights
    print("\n📈 Performance Insights:")

    if "embedding_warmup" in results["benchmarks"]:
        data = results["benchmarks"]["embedding_warmup"]
        # BUGFIX: to_dict() never includes raw "latencies_ms", so the old
        # `data.get("latencies_ms", [])` check could never fire and this
        # insight was silently skipped. Use the summary stats instead:
        # max is the cold start sample, min is the fastest warm sample.
        if data["max_ms"] > data["min_ms"]:
            print(f" • Embedding warmup eliminates {data['max_ms'] - data['min_ms']:.0f}ms cold start")

    if "template_embedding_warmup" in results["benchmarks"]:
        data = results["benchmarks"]["template_embedding_warmup"]
        if data["max_ms"] > 0:
            # max(min_ms, 1) avoids division by zero for sub-millisecond lookups.
            speedup = data["max_ms"] / max(data["min_ms"], 1)
            print(f" • Template pre-computation provides {speedup:.0f}x speedup")

    if "atomic_cache" in results["benchmarks"]:
        data = results["benchmarks"]["atomic_cache"]
        print(f" • Atomic cache hit rate: {data['cache_hit_rate']:.1f}% (target: 40-70%)")

    if "api_endpoint" in results["benchmarks"]:
        data = results["benchmarks"]["api_endpoint"]
        print(f" • API P95 latency: {data['p95_ms']:.0f}ms")
        print(f" • Full query cache hit rate: {data['cache_hit_rate']:.1f}%")

    # Save results; parents=True so a missing intermediate dir cannot crash the save.
    results_path = Path(__file__).parent / "benchmark_results" / f"perf_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    results_path.parent.mkdir(parents=True, exist_ok=True)

    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(f"\n💾 Results saved to: {results_path}")
    print("=" * 70)

    return results
|
|
|
|
|
|
def main():
    """CLI entry point: parse flags and run the selected benchmark suites."""
    parser = argparse.ArgumentParser(description="Heritage RAG Performance Benchmark")
    # NOTE(review): --quick defaults to True, so the quick suite also runs
    # when only --api is passed — confirm this is the intended behavior.
    parser.add_argument("--quick", action="store_true", default=True,
                        help="Run quick benchmarks (no server required)")
    parser.add_argument("--api", action="store_true",
                        help="Run API endpoint benchmark (requires server)")
    parser.add_argument("--atomic-cache", action="store_true",
                        help="Run only atomic cache benchmark")
    parser.add_argument("--all", action="store_true",
                        help="Run all benchmarks including API")

    args = parser.parse_args()

    if args.all:
        # --all forces both the quick suite and the API suite.
        args.api = True
        args.quick = True
        args.atomic_cache = False

    # --atomic-cache overrides the quick suite: run only the atomic benchmark.
    asyncio.run(run_all_benchmarks(
        quick=args.quick and not args.atomic_cache,
        api=args.api,
        atomic_only=args.atomic_cache,
    ))


if __name__ == "__main__":
    main()
|