#!/usr/bin/env python3
"""Benchmark script comparing baseline vs optimized Heritage RAG pipeline.

This script demonstrates the quality improvement from DSPy optimization.

Usage:
    python benchmark_optimization.py

Requirements:
- SSH tunnel active: ssh -f -N -L 7878:localhost:7878 root@91.98.224.44
- Environment loaded: source .venv/bin/activate && source .env
"""

import json
import logging
import os
import sys
import time
from datetime import datetime
from pathlib import Path

import dspy

# Make the repository root importable so the `backend` package resolves.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from backend.rag.dspy_heritage_rag import HeritageRAGPipeline

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Suppress noisy loggers
logging.getLogger('httpx').setLevel(logging.WARNING)
logging.getLogger('openai').setLevel(logging.WARNING)
# =============================================================================
# TEST QUERIES
# =============================================================================
# Each entry pairs a query with the intent and keywords used by
# score_prediction() to grade the pipeline's output.
TEST_QUERIES = [
    {
        "query": "List all libraries in Utrecht",
        "expected_intent": "geographic",
        "expected_keywords": ["library", "utrecht", "bibliotheek"],
        "description": "Geographic query - libraries in specific city",
    },
    {
        "query": "What is the history of the Rijksmuseum?",
        "expected_intent": "entity_lookup",
        "expected_keywords": ["rijksmuseum", "amsterdam", "museum", "history", "1800", "1885"],
        "description": "Entity lookup - specific institution history",
    },
    {
        "query": "How many archives are in the Netherlands?",
        "expected_intent": "statistical",
        "expected_keywords": ["archive", "netherlands", "number", "total"],
        "description": "Statistical query - count institutions",
    },
    {
        "query": "Compare museums in Amsterdam and Rotterdam",
        "expected_intent": "comparative",
        "expected_keywords": ["amsterdam", "rotterdam", "museum"],
        "description": "Comparative query - two cities",
    },
    {
        "query": "Welke archieven zijn er in Groningen?",
        "expected_intent": "geographic",
        "expected_keywords": ["archief", "groningen"],
        "description": "Dutch language - geographic query",
    },
]
|
|
|
|
|
|
# Partial credit for near-miss intent classifications. Keys are unordered
# pairs (frozensets), so lookup is symmetric by construction; hoisted to
# module level so the table is not rebuilt on every call.
_RELATED_INTENTS = {
    frozenset(("geographic", "entity_lookup")): 0.5,
    frozenset(("statistical", "comparative")): 0.5,
    frozenset(("exploration", "entity_lookup")): 0.5,
}


def score_prediction(query_info: dict, prediction) -> dict:
    """Score a prediction against expected values.

    Args:
        query_info: Test-case dict; reads optional keys ``expected_intent``
            and ``expected_keywords``.
        prediction: Pipeline output object; ``intent`` and ``answer``
            attributes are read if present.

    Returns:
        Dict with component scores in [0, 1] — ``intent`` (weight 0.40),
        ``keywords`` (0.40), ``non_empty`` (0.20) — plus the weighted
        ``total``.
    """
    scores = {}

    # Intent match (40%): exact match scores 1.0, a related intent 0.5.
    if hasattr(prediction, 'intent'):
        expected = query_info.get('expected_intent', '').lower()
        actual = (prediction.intent or '').lower()
        if expected == actual:
            scores['intent'] = 1.0
        else:
            scores['intent'] = _RELATED_INTENTS.get(frozenset((expected, actual)), 0.0)
    else:
        scores['intent'] = 0.0

    # Keyword match (40%): fraction of expected keywords present in answer.
    answer = (getattr(prediction, 'answer', '') or '').lower()
    keywords = query_info.get('expected_keywords', [])
    if keywords:
        hits = sum(1 for kw in keywords if kw.lower() in answer)
        scores['keywords'] = hits / len(keywords)
    else:
        # No keywords recorded for this case counts as a miss, keeping the
        # weighting comparable across queries.
        scores['keywords'] = 0.0

    # Non-empty answer (20%): full credit only for substantive answers.
    stripped = answer.strip()
    if len(stripped) > 20:
        scores['non_empty'] = 1.0
    elif stripped:
        scores['non_empty'] = 0.5
    else:
        scores['non_empty'] = 0.0

    # Weighted total
    scores['total'] = (
        0.40 * scores['intent'] +
        0.40 * scores['keywords'] +
        0.20 * scores['non_empty']
    )

    return scores
|
|
|
|
|
|
def _evaluate_pipeline(pipeline) -> list:
    """Run every TEST_QUERIES entry through *pipeline*, printing per-query results.

    Returns the list of per-query total scores; a query that raises is
    reported and scored 0.0 so the benchmark can continue.
    """
    totals = []
    for i, q in enumerate(TEST_QUERIES):
        print(f"\n[{i+1}/{len(TEST_QUERIES)}] {q['description']}")
        print(f" Query: {q['query'][:60]}...")

        try:
            start = time.time()
            result = pipeline(q['query'])
            latency = time.time() - start

            scores = score_prediction(q, result)
            totals.append(scores['total'])

            print(f" Intent: {result.intent} (expected: {q['expected_intent']}) → {scores['intent']:.0%}")
            print(f" Keywords: {scores['keywords']:.0%} matched")
            print(f" Score: {scores['total']:.2f} | Latency: {latency:.1f}s")
        except Exception as e:
            print(f" ERROR: {e}")
            totals.append(0.0)
    return totals


def run_benchmark():
    """Run benchmark comparing baseline vs optimized pipeline.

    Requires the environment described in the module docstring and a saved
    optimized model at optimized_models/heritage_rag_bootstrap_latest.json.
    Prints a per-query comparison and writes benchmark_results.json next to
    the optimized model.
    """
    print("\n" + "=" * 70)
    print("Heritage RAG Pipeline - Optimization Benchmark")
    print("=" * 70)

    # Configure DSPy; caching keeps repeated benchmark runs cheap.
    lm = dspy.LM("openai/gpt-4o-mini", cache=True, max_tokens=1024)
    dspy.configure(lm=lm)

    # ==========================================================================
    # BASELINE PIPELINE
    # ==========================================================================
    print("\n" + "-" * 70)
    print("BASELINE PIPELINE (no optimization)")
    print("-" * 70)

    baseline_scores = _evaluate_pipeline(HeritageRAGPipeline())
    baseline_avg = sum(baseline_scores) / len(baseline_scores)
    print(f"\n→ Baseline Average Score: {baseline_avg:.3f}")

    # ==========================================================================
    # OPTIMIZED PIPELINE
    # ==========================================================================
    print("\n" + "-" * 70)
    print("OPTIMIZED PIPELINE (BootstrapFewShot)")
    print("-" * 70)

    optimized_path = Path(__file__).parent / "optimized_models" / "heritage_rag_bootstrap_latest.json"
    if not optimized_path.exists():
        print(f"ERROR: Optimized model not found at {optimized_path}")
        return

    optimized_pipeline = HeritageRAGPipeline()
    optimized_pipeline.load(str(optimized_path))
    print(f" Loaded optimized model from: {optimized_path.name}")

    optimized_scores = _evaluate_pipeline(optimized_pipeline)
    optimized_avg = sum(optimized_scores) / len(optimized_scores)
    print(f"\n→ Optimized Average Score: {optimized_avg:.3f}")

    # ==========================================================================
    # SUMMARY
    # ==========================================================================
    print("\n" + "=" * 70)
    print("BENCHMARK SUMMARY")
    print("=" * 70)

    improvement = optimized_avg - baseline_avg
    # Guard against division by zero when the baseline scored nothing.
    improvement_pct = (improvement / baseline_avg * 100) if baseline_avg > 0 else 0

    print(f"\n Baseline Score: {baseline_avg:.3f}")
    print(f" Optimized Score: {optimized_avg:.3f}")
    print(f" Improvement: {improvement:+.3f} ({improvement_pct:+.1f}%)")

    print("\n Per-query comparison:")
    print(f" {'Query':<45} {'Baseline':>10} {'Optimized':>10} {'Δ':>8}")
    print(" " + "-" * 75)

    for i, q in enumerate(TEST_QUERIES):
        base = baseline_scores[i]
        opt = optimized_scores[i]
        delta = opt - base
        desc = q['description'][:43]
        print(f" {desc:<45} {base:>10.2f} {opt:>10.2f} {delta:>+8.2f}")

    print("\n" + "=" * 70)

    # Persist results next to the optimized model for later inspection.
    results = {
        "timestamp": datetime.now().isoformat(),
        "baseline_avg": baseline_avg,
        "optimized_avg": optimized_avg,
        "improvement": improvement,
        "improvement_pct": improvement_pct,
        "per_query": [
            {
                "query": q["query"],
                "baseline": baseline_scores[i],
                "optimized": optimized_scores[i],
            }
            for i, q in enumerate(TEST_QUERIES)
        ]
    }

    results_path = Path(__file__).parent / "optimized_models" / "benchmark_results.json"
    with open(results_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {results_path.name}")


if __name__ == "__main__":
    run_benchmark()