#!/usr/bin/env python3
"""
Benchmark script comparing baseline vs optimized Heritage RAG pipeline.

This script demonstrates the quality improvement from DSPy optimization.

Usage:
    python benchmark_optimization.py

Requirements:
    - SSH tunnel active: ssh -f -N -L 7878:localhost:7878 root@91.98.224.44
    - Environment loaded: source .venv/bin/activate && source .env
"""

import json
import logging
import os
import sys
import time
from datetime import datetime
from pathlib import Path

import dspy

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from backend.rag.dspy_heritage_rag import HeritageRAGPipeline

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Suppress noisy loggers
logging.getLogger('httpx').setLevel(logging.WARNING)
logging.getLogger('openai').setLevel(logging.WARNING)


# =============================================================================
# TEST QUERIES
# =============================================================================

TEST_QUERIES = [
    {
        "query": "List all libraries in Utrecht",
        "expected_intent": "geographic",
        "expected_keywords": ["library", "utrecht", "bibliotheek"],
        "description": "Geographic query - libraries in specific city"
    },
    {
        "query": "What is the history of the Rijksmuseum?",
        "expected_intent": "entity_lookup",
        "expected_keywords": ["rijksmuseum", "amsterdam", "museum", "history", "1800", "1885"],
        "description": "Entity lookup - specific institution history"
    },
    {
        "query": "How many archives are in the Netherlands?",
        "expected_intent": "statistical",
        "expected_keywords": ["archive", "netherlands", "number", "total"],
        "description": "Statistical query - count institutions"
    },
    {
        "query": "Compare museums in Amsterdam and Rotterdam",
        "expected_intent": "comparative",
        "expected_keywords": ["amsterdam", "rotterdam", "museum"],
        "description": "Comparative query - two cities"
    },
    {
        "query": "Welke archieven zijn er in Groningen?",
        "expected_intent": "geographic",
        "expected_keywords": ["archief", "groningen"],
        "description": "Dutch language - geographic query"
    },
]


def score_prediction(query_info: dict, prediction) -> dict:
    """Score a prediction against expected values."""
    scores = {}

    # Intent match (40%)
    if hasattr(prediction, 'intent'):
        expected = query_info.get('expected_intent', '').lower()
        actual = prediction.intent.lower() if prediction.intent else ''
        intent_match = expected == actual

        # Partial credit for related intents
        related = {
            ('geographic', 'entity_lookup'): 0.5,
            ('statistical', 'comparative'): 0.5,
            ('exploration', 'entity_lookup'): 0.5,
        }
        if intent_match:
            scores['intent'] = 1.0
        else:
            pair = (expected, actual)
            reverse = (actual, expected)
            scores['intent'] = related.get(pair, related.get(reverse, 0.0))
    else:
        scores['intent'] = 0.0

    # Keyword match (40%)
    answer = prediction.answer.lower() if hasattr(prediction, 'answer') and prediction.answer else ''
    keywords = query_info.get('expected_keywords', [])
    if keywords:
        matches = sum(1 for kw in keywords if kw.lower() in answer)
        scores['keywords'] = matches / len(keywords)
    else:
        scores['keywords'] = 0.0

    # Non-empty answer (20%)
    if answer and len(answer.strip()) > 20:
        scores['non_empty'] = 1.0
    elif answer and len(answer.strip()) > 0:
        scores['non_empty'] = 0.5
    else:
        scores['non_empty'] = 0.0

    # Weighted total
    scores['total'] = (
        0.40 * scores['intent'] +
        0.40 * scores['keywords'] +
        0.20 * scores['non_empty']
    )

    return scores
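
# A quick way to sanity-check the metric without calling the pipeline
# (illustrative sketch; the SimpleNamespace stub below is not part of the
# benchmark). score_prediction only reads the `.intent` and `.answer`
# attributes, so any object carrying them can stand in for a dspy prediction:
#
#   from types import SimpleNamespace
#   stub = SimpleNamespace(intent="geographic",
#                          answer="The library in Utrecht is ...")
#   scores = score_prediction(TEST_QUERIES[0], stub)
#   # intent 1.0, keywords 2/3 ("library" and "utrecht" match;
#   # "bibliotheek" does not), non_empty 1.0
#   # → total = 0.40*1.0 + 0.40*(2/3) + 0.20*1.0 ≈ 0.87
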
def run_benchmark():
    """Run benchmark comparing baseline vs optimized pipeline."""
    print("\n" + "=" * 70)
    print("Heritage RAG Pipeline - Optimization Benchmark")
    print("=" * 70)

    # Configure DSPy
    lm = dspy.LM("openai/gpt-4o-mini", cache=True, max_tokens=1024)
    dspy.configure(lm=lm)

    # =========================================================================
    # BASELINE PIPELINE
    # =========================================================================
    print("\n" + "-" * 70)
    print("BASELINE PIPELINE (no optimization)")
    print("-" * 70)

    baseline_pipeline = HeritageRAGPipeline()
    baseline_scores = []

    for i, q in enumerate(TEST_QUERIES):
        print(f"\n[{i+1}/{len(TEST_QUERIES)}] {q['description']}")
        print(f" Query: {q['query'][:60]}...")
        try:
            start = time.time()
            result = baseline_pipeline(q['query'])
            latency = time.time() - start

            scores = score_prediction(q, result)
            baseline_scores.append(scores['total'])

            print(f" Intent: {result.intent} (expected: {q['expected_intent']}) → {scores['intent']:.0%}")
            print(f" Keywords: {scores['keywords']:.0%} matched")
            print(f" Score: {scores['total']:.2f} | Latency: {latency:.1f}s")
        except Exception as e:
            print(f" ERROR: {e}")
            baseline_scores.append(0.0)

    baseline_avg = sum(baseline_scores) / len(baseline_scores)
    print(f"\n→ Baseline Average Score: {baseline_avg:.3f}")

    # =========================================================================
    # OPTIMIZED PIPELINE
    # =========================================================================
    print("\n" + "-" * 70)
    print("OPTIMIZED PIPELINE (BootstrapFewShot)")
    print("-" * 70)

    optimized_path = Path(__file__).parent / "optimized_models" / "heritage_rag_bootstrap_latest.json"
    if not optimized_path.exists():
        print(f"ERROR: Optimized model not found at {optimized_path}")
        return

    optimized_pipeline = HeritageRAGPipeline()
    optimized_pipeline.load(str(optimized_path))
    print(f" Loaded optimized model from: {optimized_path.name}")

    optimized_scores = []

    for i, q in enumerate(TEST_QUERIES):
        print(f"\n[{i+1}/{len(TEST_QUERIES)}] {q['description']}")
        print(f" Query: {q['query'][:60]}...")
        try:
            start = time.time()
            result = optimized_pipeline(q['query'])
            latency = time.time() - start

            scores = score_prediction(q, result)
            optimized_scores.append(scores['total'])

            print(f" Intent: {result.intent} (expected: {q['expected_intent']}) → {scores['intent']:.0%}")
            print(f" Keywords: {scores['keywords']:.0%} matched")
            print(f" Score: {scores['total']:.2f} | Latency: {latency:.1f}s")
        except Exception as e:
            print(f" ERROR: {e}")
            optimized_scores.append(0.0)

    optimized_avg = sum(optimized_scores) / len(optimized_scores)
    print(f"\n→ Optimized Average Score: {optimized_avg:.3f}")

    # =========================================================================
    # SUMMARY
    # =========================================================================
    print("\n" + "=" * 70)
    print("BENCHMARK SUMMARY")
    print("=" * 70)

    improvement = optimized_avg - baseline_avg
    improvement_pct = (improvement / baseline_avg * 100) if baseline_avg > 0 else 0

    print(f"\n Baseline Score: {baseline_avg:.3f}")
    print(f" Optimized Score: {optimized_avg:.3f}")
    print(f" Improvement: {improvement:+.3f} ({improvement_pct:+.1f}%)")

    print("\n Per-query comparison:")
    print(f" {'Query':<45} {'Baseline':>10} {'Optimized':>10} {'Δ':>8}")
    print(" " + "-" * 75)
    for i, q in enumerate(TEST_QUERIES):
        base = baseline_scores[i]
        opt = optimized_scores[i]
        delta = opt - base
        desc = q['description'][:43]
        print(f" {desc:<45} {base:>10.2f} {opt:>10.2f} {delta:>+8.2f}")

    print("\n" + "=" * 70)
    # Save results
    results = {
        "timestamp": datetime.now().isoformat(),
        "baseline_avg": baseline_avg,
        "optimized_avg": optimized_avg,
        "improvement": improvement,
        "improvement_pct": improvement_pct,
        "per_query": [
            {
                "query": q["query"],
                "baseline": baseline_scores[i],
                "optimized": optimized_scores[i],
            }
            for i, q in enumerate(TEST_QUERIES)
        ]
    }

    results_path = Path(__file__).parent / "optimized_models" / "benchmark_results.json"
    with open(results_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {results_path.name}")


if __name__ == "__main__":
    run_benchmark()
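
# For reference, the benchmark_results.json written above has this shape
# (placeholder values shown; actual numbers come from the run):
#
#   {
#     "timestamp": "<ISO-8601 timestamp>",
#     "baseline_avg": <float>,
#     "optimized_avg": <float>,
#     "improvement": <float>,
#     "improvement_pct": <float>,
#     "per_query": [
#       {"query": "...", "baseline": <float>, "optimized": <float>},
#       ...
#     ]
#   }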