glam/backend/rag/benchmark_optimization.py

#!/usr/bin/env python3
"""
Benchmark script comparing baseline vs optimized Heritage RAG pipeline.
This script demonstrates the quality improvement from DSPy optimization.
Usage:
python benchmark_optimization.py
Requirements:
- SSH tunnel active: ssh -f -N -L 7878:localhost:7878 root@91.98.224.44
- Environment loaded: source .venv/bin/activate && source .env
"""
import json
import logging
import sys
import time
from datetime import datetime
from pathlib import Path

import dspy

# Add the project root to sys.path so the `backend` package can be imported
# when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from backend.rag.dspy_heritage_rag import HeritageRAGPipeline
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Suppress noisy HTTP-client loggers
logging.getLogger('httpx').setLevel(logging.WARNING)
logging.getLogger('openai').setLevel(logging.WARNING)

# =============================================================================
# TEST QUERIES
# =============================================================================
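# Each entry pairs a query with the expected intent label and the keywords that
# score_prediction() looks for in the answer; `description` is display-only.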
TEST_QUERIES = [
    {
        "query": "List all libraries in Utrecht",
        "expected_intent": "geographic",
        "expected_keywords": ["library", "utrecht", "bibliotheek"],
        "description": "Geographic query - libraries in specific city",
    },
    {
        "query": "What is the history of the Rijksmuseum?",
        "expected_intent": "entity_lookup",
        "expected_keywords": ["rijksmuseum", "amsterdam", "museum", "history", "1800", "1885"],
        "description": "Entity lookup - specific institution history",
    },
    {
        "query": "How many archives are in the Netherlands?",
        "expected_intent": "statistical",
        "expected_keywords": ["archive", "netherlands", "number", "total"],
        "description": "Statistical query - count institutions",
    },
    {
        "query": "Compare museums in Amsterdam and Rotterdam",
        "expected_intent": "comparative",
        "expected_keywords": ["amsterdam", "rotterdam", "museum"],
        "description": "Comparative query - two cities",
    },
    {
        # Dutch: "Which archives are there in Groningen?" (kept in Dutch on
        # purpose to test non-English handling)
        "query": "Welke archieven zijn er in Groningen?",
        "expected_intent": "geographic",
        "expected_keywords": ["archief", "groningen"],
        "description": "Dutch language - geographic query",
    },
]


def score_prediction(query_info: dict, prediction) -> dict:
"""Score a prediction against expected values."""
    scores = {}

    # Intent match (40%)
    if hasattr(prediction, 'intent'):
        expected = query_info.get('expected_intent', '').lower()
        actual = prediction.intent.lower() if prediction.intent else ''
        intent_match = expected == actual
        # Partial credit for related intents
        related = {
            ('geographic', 'entity_lookup'): 0.5,
            ('statistical', 'comparative'): 0.5,
            ('exploration', 'entity_lookup'): 0.5,
        }
        if intent_match:
            scores['intent'] = 1.0
        else:
            pair = (expected, actual)
            reverse = (actual, expected)
            scores['intent'] = related.get(pair, related.get(reverse, 0.0))
    else:
        scores['intent'] = 0.0

    # Keyword match (40%)
    answer = prediction.answer.lower() if hasattr(prediction, 'answer') and prediction.answer else ''
    keywords = query_info.get('expected_keywords', [])
    if keywords:
        matches = sum(1 for kw in keywords if kw.lower() in answer)
        scores['keywords'] = matches / len(keywords)
    else:
        scores['keywords'] = 0.0

    # Non-empty answer (20%)
    if answer and len(answer.strip()) > 20:
        scores['non_empty'] = 1.0
    elif answer and len(answer.strip()) > 0:
        scores['non_empty'] = 0.5
    else:
        scores['non_empty'] = 0.0

    # Weighted total
    scores['total'] = (
        0.40 * scores['intent'] +
        0.40 * scores['keywords'] +
        0.20 * scores['non_empty']
    )
    return scores

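# Illustrative usage (hypothetical prediction; dspy.Prediction accepts
# arbitrary keyword fields):
#   pred = dspy.Prediction(intent="geographic", answer="Utrecht's bibliotheek network ...")
#   score_prediction(TEST_QUERIES[0], pred)["total"]  # weighted score in [0.0, 1.0]
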
def run_benchmark():
    """Run benchmark comparing baseline vs optimized pipeline."""
    print("\n" + "=" * 70)
    print("Heritage RAG Pipeline - Optimization Benchmark")
    print("=" * 70)

    # Configure DSPy
    lm = dspy.LM("openai/gpt-4o-mini", cache=True, max_tokens=1024)
    dspy.configure(lm=lm)
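    # The openai/ model route expects OPENAI_API_KEY in the environment (loaded
    # via `source .env`, per the module docstring). Note: with cache=True,
    # re-running the script can serve identical requests from DSPy's cache,
    # which deflates the reported latencies.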
    # ==========================================================================
    # BASELINE PIPELINE
    # ==========================================================================
    print("\n" + "-" * 70)
    print("BASELINE PIPELINE (no optimization)")
    print("-" * 70)

    baseline_pipeline = HeritageRAGPipeline()
    baseline_scores = []
    for i, q in enumerate(TEST_QUERIES):
        print(f"\n[{i+1}/{len(TEST_QUERIES)}] {q['description']}")
        print(f" Query: {q['query'][:60]}...")
        try:
            start = time.time()
            result = baseline_pipeline(q['query'])
            latency = time.time() - start
            scores = score_prediction(q, result)
            baseline_scores.append(scores['total'])
            print(f" Intent: {result.intent} (expected: {q['expected_intent']}) → {scores['intent']:.0%}")
            print(f" Keywords: {scores['keywords']:.0%} matched")
            print(f" Score: {scores['total']:.2f} | Latency: {latency:.1f}s")
        except Exception as e:
            print(f" ERROR: {e}")
            baseline_scores.append(0.0)

    baseline_avg = sum(baseline_scores) / len(baseline_scores)
    print(f"\n→ Baseline Average Score: {baseline_avg:.3f}")

    # ==========================================================================
    # OPTIMIZED PIPELINE
    # ==========================================================================
    print("\n" + "-" * 70)
    print("OPTIMIZED PIPELINE (BootstrapFewShot)")
    print("-" * 70)

    optimized_path = Path(__file__).parent / "optimized_models" / "heritage_rag_bootstrap_latest.json"
    if not optimized_path.exists():
        print(f"ERROR: Optimized model not found at {optimized_path}")
        return

    optimized_pipeline = HeritageRAGPipeline()
    optimized_pipeline.load(str(optimized_path))
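    # load() restores the optimizer's saved state (e.g. the bootstrapped
    # few-shot demos) into a freshly constructed program of the same structure.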
print(f" Loaded optimized model from: {optimized_path.name}")
optimized_scores = []
for i, q in enumerate(TEST_QUERIES):
print(f"\n[{i+1}/{len(TEST_QUERIES)}] {q['description']}")
print(f" Query: {q['query'][:60]}...")
try:
start = time.time()
result = optimized_pipeline(q['query'])
latency = time.time() - start
scores = score_prediction(q, result)
optimized_scores.append(scores['total'])
print(f" Intent: {result.intent} (expected: {q['expected_intent']}) → {scores['intent']:.0%}")
print(f" Keywords: {scores['keywords']:.0%} matched")
print(f" Score: {scores['total']:.2f} | Latency: {latency:.1f}s")
except Exception as e:
print(f" ERROR: {e}")
optimized_scores.append(0.0)
optimized_avg = sum(optimized_scores) / len(optimized_scores)
print(f"\n→ Optimized Average Score: {optimized_avg:.3f}")
# ==========================================================================
# SUMMARY
# ==========================================================================
print("\n" + "=" * 70)
print("BENCHMARK SUMMARY")
print("=" * 70)
improvement = optimized_avg - baseline_avg
improvement_pct = (improvement / baseline_avg * 100) if baseline_avg > 0 else 0
print(f"\n Baseline Score: {baseline_avg:.3f}")
print(f" Optimized Score: {optimized_avg:.3f}")
print(f" Improvement: {improvement:+.3f} ({improvement_pct:+.1f}%)")
print("\n Per-query comparison:")
print(f" {'Query':<45} {'Baseline':>10} {'Optimized':>10} {'Δ':>8}")
print(" " + "-" * 75)
for i, q in enumerate(TEST_QUERIES):
base = baseline_scores[i]
opt = optimized_scores[i]
delta = opt - base
desc = q['description'][:43]
print(f" {desc:<45} {base:>10.2f} {opt:>10.2f} {delta:>+8.2f}")
print("\n" + "=" * 70)
# Save results
results = {
"timestamp": datetime.now().isoformat(),
"baseline_avg": baseline_avg,
"optimized_avg": optimized_avg,
"improvement": improvement,
"improvement_pct": improvement_pct,
"per_query": [
{
"query": q["query"],
"baseline": baseline_scores[i],
"optimized": optimized_scores[i],
}
for i, q in enumerate(TEST_QUERIES)
]
}
results_path = Path(__file__).parent / "optimized_models" / "benchmark_results.json"
with open(results_path, "w") as f:
json.dump(results, f, indent=2)
print(f"\nResults saved to: {results_path.name}")
if __name__ == "__main__":
run_benchmark()