#!/usr/bin/env python3
"""
Benchmark script to measure RAG pipeline performance with fast/quality LM optimization.

This script compares:
1. All stages using quality LM with schema-aware signatures (baseline)
2. Fast LM for routing/extraction with schema-aware signatures (optimized v1)
3. Fast LM for routing/extraction with SIMPLE signatures (optimized v2 - hypothesis: faster)

Provider Selection:
- Default: OpenAI (fast, ~1-3s per call, costs money)
- Alternative: Z.AI (FREE, but slow ~20-30s per call)

Requires:
- OPENAI_API_KEY (for OpenAI models - RECOMMENDED for speed testing)
- ZAI_API_TOKEN (for Z.AI models - FREE but slow)
- SSH tunnel for Qdrant: ssh -f -N -L 6333:localhost:6333 root@91.98.224.44

Usage:
    # Use OpenAI (fast, recommended for benchmarking)
    python benchmark_lm_optimization.py

    # Use Z.AI (free but slow)
    python benchmark_lm_optimization.py --provider zai

    # Quick test with fewer queries
    python benchmark_lm_optimization.py --quick
"""

import argparse
import os
import sys
import time
from datetime import datetime
from typing import Optional

# Add project root to path
project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
sys.path.insert(0, project_root)

# Load .env file for API keys
try:
    from dotenv import load_dotenv
    load_dotenv(os.path.join(project_root, ".env"))
    print(f"[INFO] Loaded .env from {project_root}")
except ImportError:
    print("[WARN] python-dotenv not installed, using environment variables only")

import dspy

# Try multiple import paths for compatibility
try:
    from backend.rag.dspy_heritage_rag import HeritageRAGPipeline
except ImportError:
    try:
        from dspy_heritage_rag import HeritageRAGPipeline
    except ImportError:
        # Direct import for running from within backend/rag directory
        import importlib.util
        spec = importlib.util.spec_from_file_location(
            "dspy_heritage_rag",
            os.path.join(os.path.dirname(__file__), "dspy_heritage_rag.py"),
        )
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        HeritageRAGPipeline = module.HeritageRAGPipeline


def _create_lm(
    env_var: str,
    model: str,
    description: str,
    success_msg: str,
    *,
    api_base: Optional[str] = None,
    temperature: float,
    max_tokens: int,
) -> Optional[dspy.LM]:
    """Create a dspy.LM from an API key stored in ``env_var``.

    Shared factory backing the four public ``create_*_lm`` helpers below
    (they previously duplicated this logic verbatim).

    Args:
        env_var: Environment variable holding the API key.
        model: Model identifier passed to ``dspy.LM``.
        description: Short label used in the failure message.
        success_msg: Line printed when the LM is created successfully.
        api_base: Optional custom API endpoint (used for Z.AI).
        temperature: Sampling temperature for the LM.
        max_tokens: Max completion tokens for the LM.

    Returns:
        A configured ``dspy.LM``, or None if the key is missing or creation fails.
    """
    api_key = os.environ.get(env_var)
    if not api_key:
        # No key configured: silently signal "unavailable" so callers can fall back.
        return None
    try:
        kwargs = {
            "api_key": api_key,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        if api_base:
            kwargs["api_base"] = api_base
        lm = dspy.LM(model, **kwargs)
        print(success_msg)
        return lm
    except Exception as e:
        print(f"Failed to create {description} LM: {e}")
        return None


def create_openai_quality_lm() -> Optional[dspy.LM]:
    """Create OpenAI GPT-4o for quality generation."""
    return _create_lm(
        "OPENAI_API_KEY",
        "openai/gpt-4o",
        "OpenAI quality",
        "Quality LM: OpenAI GPT-4o (~$2.50/$10 per 1M tokens)",
        temperature=0.3,
        max_tokens=2000,
    )


def create_openai_fast_lm() -> Optional[dspy.LM]:
    """Create OpenAI GPT-4o-mini for fast operations."""
    return _create_lm(
        "OPENAI_API_KEY",
        "openai/gpt-4o-mini",
        "OpenAI fast",
        "Fast LM: OpenAI GPT-4o-mini (~$0.15/$0.60 per 1M tokens)",
        temperature=0.2,
        max_tokens=1000,
    )


def create_zai_quality_lm() -> Optional[dspy.LM]:
    """Create Z.AI GLM-4.6 for quality generation (FREE but slow)."""
    return _create_lm(
        "ZAI_API_TOKEN",
        "openai/glm-4.6",
        "Z.AI quality",
        "Quality LM: Z.AI GLM-4.6 (FREE, ~30-40s per call)",
        api_base="https://api.z.ai/api/coding/paas/v4",
        temperature=0.3,
        max_tokens=2000,
    )


def create_zai_fast_lm() -> Optional[dspy.LM]:
    """Create Z.AI GLM-4.5-flash for fast operations (FREE but slow)."""
    return _create_lm(
        "ZAI_API_TOKEN",
        "openai/glm-4.5-flash",
        "Z.AI fast",
        "Fast LM: Z.AI GLM-4.5-flash (FREE, ~20-30s per call)",
        api_base="https://api.z.ai/api/coding/paas/v4",
        temperature=0.2,
        max_tokens=1000,
    )


def create_lms(provider: str = "openai") -> tuple[Optional[dspy.LM], Optional[dspy.LM]]:
    """Create quality and fast LMs based on provider preference.

    Tries the preferred provider first, then falls back to the other one
    for whichever LM (quality and/or fast) could not be created.

    Args:
        provider: "openai" (default, fast) or anything else for Z.AI (free, slow).

    Returns:
        Tuple of (quality_lm, fast_lm); either element may be None.
    """
    if provider == "openai":
        # Try OpenAI first (faster)
        quality_lm = create_openai_quality_lm()
        fast_lm = create_openai_fast_lm()
        # Fallback to Z.AI if OpenAI not available
        if not quality_lm:
            print("OpenAI not available, falling back to Z.AI...")
            quality_lm = create_zai_quality_lm()
        if not fast_lm:
            fast_lm = create_zai_fast_lm()
    else:
        # Use Z.AI (free but slow)
        quality_lm = create_zai_quality_lm()
        fast_lm = create_zai_fast_lm()
        # Fallback to OpenAI if Z.AI not available
        if not quality_lm:
            print("Z.AI not available, falling back to OpenAI...")
            quality_lm = create_openai_quality_lm()
        if not fast_lm:
            fast_lm = create_openai_fast_lm()
    return quality_lm, fast_lm


def benchmark_query(pipeline: HeritageRAGPipeline, question: str, language: str = "nl") -> dict:
    """Run a single query through the pipeline and measure wall-clock timing.

    Args:
        pipeline: The pipeline under test.
        question: Natural-language query to run.
        language: Query language code (default "nl").

    Returns:
        Dict with "success", "elapsed_seconds", and either result metadata
        (intent, answer_length, confidence, timing info) or "error".
    """
    start = time.perf_counter()
    try:
        result = pipeline.forward(
            question=question,
            language=language,
            include_viz=False,
            skip_cache=True,  # Always skip cache for accurate benchmarking
        )
        elapsed = time.perf_counter() - start
        return {
            "success": True,
            "elapsed_seconds": elapsed,
            "intent": result.intent,
            "answer_length": len(result.answer) if result.answer else 0,
            "confidence": getattr(result, "confidence", None),
            "timing_breakdown": getattr(result, "timing_breakdown", None),
            "timing_ms": getattr(result, "timing_ms", None),
        }
    except Exception as e:
        # Broad catch is intentional: a benchmark run should record failures,
        # not abort on the first bad query.
        elapsed = time.perf_counter() - start
        return {
            "success": False,
            "elapsed_seconds": elapsed,
            "error": str(e),
        }


def format_timing_breakdown(timing_breakdown) -> str:
    """Format a per-stage timing breakdown dict (keys like "stage_ms") for display."""
    if not timing_breakdown:
        return " (no detailed timing available)"
    lines = []
    for key, value_ms in timing_breakdown.items():
        stage_name = key.replace("_ms", "").replace("_", " ").title()
        lines.append(f" {stage_name}: {value_ms:.0f}ms")
    return "\n".join(lines)


def _build_questions(suffix: str) -> list[tuple[str, str]]:
    """Build the standard test-question set tagged with a unique cache-busting suffix.

    The suffix makes each configuration's queries unique so DSPy's internal
    query-string cache cannot skew the comparison.
    """
    base = [
        ("Hoeveel musea zijn er in Amsterdam?", "nl"),  # statistical
        ("What archives are in The Hague?", "en"),  # geographic
        ("Welke bibliotheken hebben digitale collecties?", "nl"),  # descriptive
        ("Show me museums with Wikidata IDs", "en"),  # exploratory
        ("Vergelijk archieven in Utrecht en Rotterdam", "nl"),  # comparative
    ]
    return [
        (f"{question} [{suffix}{letter}]", lang)
        for letter, (question, lang) in zip("abcde", base)
    ]


def _warmup(label: str, pipeline, question: str) -> None:
    """Run one throwaway query against ``pipeline`` to eliminate cold-start effects."""
    print(f" Warming up {label}...")
    try:
        _ = pipeline.forward(question=question, language="en")
        print(" OK")
    except Exception as e:
        print(f" Failed: {e}")


def _run_config(pipeline, questions, display_questions) -> tuple[list[dict], float]:
    """Benchmark one pipeline configuration over ``questions``, printing per-query results.

    Args:
        pipeline: Pipeline configuration under test.
        questions: List of (question-with-suffix, language) tuples to actually run.
        display_questions: Parallel list of (clean question, language) used for display.

    Returns:
        Tuple of (per-query result dicts, average elapsed seconds).
    """
    results = []
    for i, (question, lang) in enumerate(questions):
        display_q = display_questions[i][0]
        print(f"\n Query: {display_q[:50]}...")
        result = benchmark_query(pipeline, question, lang)
        results.append(result)
        if result["success"]:
            print(f" Time: {result['elapsed_seconds']:.2f}s | Intent: {result['intent']} | Answer: {result['answer_length']} chars")
            if result.get("timing_breakdown"):
                print(format_timing_breakdown(result["timing_breakdown"]))
        else:
            print(f" FAILED: {result['error'][:80]}...")
    avg = sum(r["elapsed_seconds"] for r in results) / len(results)
    return results, avg


def run_benchmark(provider: str = "openai", quick: bool = False):
    """Run the full benchmark comparing baseline vs optimized configurations.

    Compares 3 configurations:
    1. Baseline: GPT-4o + schema-aware signatures (current production)
    2. Optimized v1: GPT-4o-mini + schema-aware signatures (current optimization)
    3. Optimized v2: GPT-4o-mini + SIMPLE signatures (hypothesis: faster)

    Args:
        provider: LLM provider to use ("openai" or "zai")
        quick: If True, only run 2 test queries
    """
    import random
    import string

    print("=" * 70)
    print("RAG Pipeline Performance Benchmark")
    print(f"Started: {datetime.now().isoformat()}")
    print(f"Provider: {provider.upper()}")
    print("=" * 70)

    # Generate random suffixes to bypass DSPy internal cache
    # (DSPy caches by query string, so we need unique queries for fair comparison)
    rand1 = ''.join(random.choices(string.ascii_lowercase, k=4))
    rand2 = ''.join(random.choices(string.ascii_lowercase, k=4))
    rand3 = ''.join(random.choices(string.ascii_lowercase, k=4))

    # One uniquely-suffixed question set per configuration, so no two
    # configurations ever submit an identical query string.
    baseline_questions = _build_questions(rand1)
    optimized_v1_questions = _build_questions(rand2)
    optimized_v2_questions = _build_questions(rand3)

    # Display questions (without random suffix for clarity)
    display_questions = [
        ("Hoeveel musea zijn er in Amsterdam?", "nl"),
        ("What archives are in The Hague?", "en"),
        ("Welke bibliotheken hebben digitale collecties?", "nl"),
        ("Show me museums with Wikidata IDs", "en"),
        ("Vergelijk archieven in Utrecht en Rotterdam", "nl"),
    ]

    if quick:
        baseline_questions = baseline_questions[:2]
        optimized_v1_questions = optimized_v1_questions[:2]
        optimized_v2_questions = optimized_v2_questions[:2]
        display_questions = display_questions[:2]
        print(f"\n[QUICK MODE: Testing with {len(display_questions)} queries]")

    # Create LMs
    print("\n--- Creating Language Models ---")
    quality_lm, fast_lm = create_lms(provider)
    if not quality_lm:
        print("ERROR: Could not create quality LM.")
        print("Set OPENAI_API_KEY or ZAI_API_TOKEN environment variable.")
        sys.exit(1)

    # Configure default LM
    dspy.configure(lm=quality_lm)

    # =========================================================================
    # Create all 3 pipelines
    # =========================================================================
    print("\n--- Creating Pipelines ---")

    # Baseline: Quality LM + schema-aware signatures
    print(" Creating baseline pipeline (GPT-4o + schema-aware)...")
    baseline_pipeline = HeritageRAGPipeline(
        fast_lm=None,  # No fast LM - all stages use quality LM
        quality_lm=None,  # Use global default
        use_schema_aware=True,  # Schema-aware signatures
    )

    # Optimized v1: Fast LM + schema-aware signatures (current approach - SLOW!)
    optimized_v1_pipeline = None
    if fast_lm:
        print(" Creating optimized v1 pipeline (GPT-4o-mini + schema-aware)...")
        optimized_v1_pipeline = HeritageRAGPipeline(
            fast_lm=fast_lm,
            quality_lm=quality_lm,
            use_schema_aware=True,  # Schema-aware (large prompt)
        )

    # Optimized v2: Fast LM + SIMPLE signatures (hypothesis: faster!)
    optimized_v2_pipeline = None
    if fast_lm:
        print(" Creating optimized v2 pipeline (GPT-4o-mini + SIMPLE signatures)...")
        optimized_v2_pipeline = HeritageRAGPipeline(
            fast_lm=fast_lm,
            quality_lm=quality_lm,
            use_schema_aware=False,  # SIMPLE signatures (smaller prompt)
        )

    # =========================================================================
    # Warmup: Eliminate cold-start effects for all pipelines
    # =========================================================================
    print("\n--- Warming up pipelines ---")
    warmup_q = "What is a museum? [warmup]"
    _warmup("baseline", baseline_pipeline, warmup_q)
    if optimized_v1_pipeline:
        _warmup("optimized v1 (schema-aware)", optimized_v1_pipeline, warmup_q + " [v1]")
    if optimized_v2_pipeline:
        _warmup("optimized v2 (simple)", optimized_v2_pipeline, warmup_q + " [v2]")

    # =========================================================================
    # Benchmark 1: Baseline (GPT-4o + schema-aware)
    # =========================================================================
    print("\n" + "=" * 70)
    print("BENCHMARK 1: Baseline (GPT-4o + schema-aware signatures)")
    print("=" * 70)
    baseline_results, baseline_avg = _run_config(
        baseline_pipeline, baseline_questions, display_questions
    )
    baseline_success = sum(1 for r in baseline_results if r["success"])
    print(f"\n Baseline Average: {baseline_avg:.2f}s ({baseline_success}/{len(baseline_results)} successful)")

    # =========================================================================
    # Benchmark 2: Optimized v1 (GPT-4o-mini + schema-aware) - CURRENT APPROACH
    # =========================================================================
    print("\n" + "=" * 70)
    print("BENCHMARK 2: Optimized v1 (GPT-4o-mini + schema-aware signatures)")
    print("=" * 70)
    if not optimized_v1_pipeline:
        print(" SKIPPED: No fast LM available")
        # Reuse baseline numbers so the summary math below stays valid.
        optimized_v1_avg = baseline_avg
        optimized_v1_results = baseline_results
    else:
        optimized_v1_results, optimized_v1_avg = _run_config(
            optimized_v1_pipeline, optimized_v1_questions, display_questions
        )
        optimized_v1_success = sum(1 for r in optimized_v1_results if r["success"])
        print(f"\n Optimized v1 Average: {optimized_v1_avg:.2f}s ({optimized_v1_success}/{len(optimized_v1_results)} successful)")

    # =========================================================================
    # Benchmark 3: Optimized v2 (GPT-4o-mini + SIMPLE signatures) - HYPOTHESIS
    # =========================================================================
    print("\n" + "=" * 70)
    print("BENCHMARK 3: Optimized v2 (GPT-4o-mini + SIMPLE signatures)")
    print("=" * 70)
    if not optimized_v2_pipeline:
        print(" SKIPPED: No fast LM available")
        optimized_v2_avg = baseline_avg
        optimized_v2_results = baseline_results
    else:
        optimized_v2_results, optimized_v2_avg = _run_config(
            optimized_v2_pipeline, optimized_v2_questions, display_questions
        )
        optimized_v2_success = sum(1 for r in optimized_v2_results if r["success"])
        print(f"\n Optimized v2 Average: {optimized_v2_avg:.2f}s ({optimized_v2_success}/{len(optimized_v2_results)} successful)")

    # =========================================================================
    # Summary
    # =========================================================================
    print("\n" + "=" * 70)
    print("SUMMARY - Three Configuration Comparison")
    print("=" * 70)
    print(f"\n {'Configuration':<50} {'Avg Time':>10} {'vs Baseline':>12}")
    print(" " + "-" * 72)

    # Baseline
    print(f" {'1. Baseline (GPT-4o + schema-aware)':<50} {baseline_avg:>9.2f}s {'(baseline)':>12}")

    # Optimized v1 (schema-aware)
    if optimized_v1_pipeline:
        v1_diff = (optimized_v1_avg - baseline_avg) / baseline_avg * 100
        v1_sign = "+" if v1_diff > 0 else ""
        print(f" {'2. Optimized v1 (GPT-4o-mini + schema-aware)':<50} {optimized_v1_avg:>9.2f}s {v1_sign}{v1_diff:>10.0f}%")

    # Optimized v2 (simple)
    if optimized_v2_pipeline:
        v2_diff = (optimized_v2_avg - baseline_avg) / baseline_avg * 100
        v2_sign = "+" if v2_diff > 0 else ""
        print(f" {'3. Optimized v2 (GPT-4o-mini + SIMPLE)':<50} {optimized_v2_avg:>9.2f}s {v2_sign}{v2_diff:>10.0f}%")

    # Winner determination
    print("\n ANALYSIS:")
    if optimized_v2_pipeline and optimized_v1_pipeline:
        if optimized_v2_avg < optimized_v1_avg:
            improvement = (optimized_v1_avg - optimized_v2_avg) / optimized_v1_avg * 100
            print(f" -> SIMPLE signatures are {improvement:.0f}% faster than schema-aware with GPT-4o-mini")
            if optimized_v2_avg < baseline_avg:
                print(f" -> Optimized v2 (SIMPLE) is the WINNER - faster than baseline!")
            else:
                print(f" -> But still slower than GPT-4o baseline (schema complexity not the only issue)")
        else:
            print(f" -> Schema-aware signatures are NOT the bottleneck")
            print(f" -> The issue may be in DSPy context switching or GPT-4o-mini itself")

    # Per-query comparison table
    print("\n Per-Query Comparison:")
    print(f" {'Query':<35} {'Baseline':>9} {'v1(SA)':>9} {'v2(Simple)':>9}")
    print(" " + "-" * 62)
    for i, (question, _) in enumerate(display_questions):
        b = baseline_results[i]
        v1 = optimized_v1_results[i] if optimized_v1_pipeline else {"success": False}
        v2 = optimized_v2_results[i] if optimized_v2_pipeline else {"success": False}
        b_str = f"{b['elapsed_seconds']:.2f}s" if b["success"] else "FAIL"
        v1_str = f"{v1['elapsed_seconds']:.2f}s" if v1.get("success") else "FAIL"
        v2_str = f"{v2['elapsed_seconds']:.2f}s" if v2.get("success") else "FAIL"
        print(f" {question[:33]:<35} {b_str:>9} {v1_str:>9} {v2_str:>9}")

    # Cost estimate
    print("\n Estimated Cost (per query):")
    if provider == "openai":
        # Rough estimates based on typical token usage
        # Baseline: ~5 LLM calls × ~2000 tokens each × GPT-4o pricing
        baseline_cost = 5 * 2000 * (2.50 + 10.00) / 2 / 1_000_000
        # Optimized: ~4 LLM calls × mini + 1 call × 4o
        optimized_cost = 4 * 1000 * (0.15 + 0.60) / 2 / 1_000_000 + 1 * 2000 * (2.50 + 10.00) / 2 / 1_000_000
        print(f" Baseline (all GPT-4o): ~${baseline_cost:.4f}")
        print(f" Optimized (mini + 4o): ~${optimized_cost:.4f}")
        print(f" Cost savings: ~{(1 - optimized_cost/baseline_cost) * 100:.0f}%")
    else:
        print(f" Z.AI models are FREE (but slow)")

    print("\n" + "=" * 70)
    print("Benchmark Complete!")
    print("=" * 70)


def main():
    """Parse CLI arguments and run the benchmark."""
    parser = argparse.ArgumentParser(description="Benchmark RAG pipeline LM optimization")
    parser.add_argument(
        "--provider",
        choices=["openai", "zai"],
        default="openai",
        help="LLM provider to use (default: openai for speed)",
    )
    parser.add_argument(
        "--quick",
        action="store_true",
        help="Quick test with only 2 queries",
    )
    args = parser.parse_args()
    run_benchmark(provider=args.provider, quick=args.quick)


if __name__ == "__main__":
    main()