#!/usr/bin/env python3
"""
Benchmark script to measure RAG pipeline performance with fast/quality LM optimization.

This script compares:
1. All stages using quality LM with schema-aware signatures (baseline)
2. Fast LM for routing/extraction with schema-aware signatures (optimized v1)
3. Fast LM for routing/extraction with SIMPLE signatures (optimized v2 - hypothesis: faster)

Provider Selection:
- Default: OpenAI (fast, ~1-3s per call, costs money)
- Alternative: Z.AI (FREE, but slow ~20-30s per call)

Requires:
- OPENAI_API_KEY (for OpenAI models - RECOMMENDED for speed testing)
- ZAI_API_TOKEN (for Z.AI models - FREE but slow)
- SSH tunnel for Qdrant: ssh -f -N -L 6333:localhost:6333 root@91.98.224.44

Usage:
    # Use OpenAI (fast, recommended for benchmarking)
    python benchmark_lm_optimization.py

    # Use Z.AI (free but slow)
    python benchmark_lm_optimization.py --provider zai

    # Quick test with fewer queries
    python benchmark_lm_optimization.py --quick
"""

import argparse
import os
import sys
import time
from datetime import datetime
from typing import Optional

# Add project root to path so `backend.rag.*` imports resolve when run as a script.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
sys.path.insert(0, project_root)

# Load .env file for API keys; fall back to plain environment variables if
# python-dotenv is not installed (best-effort, never fatal).
try:
    from dotenv import load_dotenv

    load_dotenv(os.path.join(project_root, ".env"))
    print(f"[INFO] Loaded .env from {project_root}")
except ImportError:
    print("[WARN] python-dotenv not installed, using environment variables only")

import dspy

# Try multiple import paths for compatibility: installed package, flat module,
# then a direct file load for running from within backend/rag itself.
try:
    from backend.rag.dspy_heritage_rag import HeritageRAGPipeline
except ImportError:
    try:
        from dspy_heritage_rag import HeritageRAGPipeline
    except ImportError:
        # Direct import for running from within backend/rag directory
        import importlib.util

        spec = importlib.util.spec_from_file_location(
            "dspy_heritage_rag",
            os.path.join(os.path.dirname(__file__), "dspy_heritage_rag.py"),
        )
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        HeritageRAGPipeline = module.HeritageRAGPipeline
def _build_lm(
    env_var: str,
    model: str,
    success_msg: str,
    label: str,
    *,
    api_base: "str | None" = None,
    temperature: float,
    max_tokens: int,
) -> "Optional[dspy.LM]":
    """Shared factory for the four provider-specific LM constructors.

    Args:
        env_var: Environment variable holding the API credential.
        model: Model identifier passed to dspy.LM.
        success_msg: Line printed when the LM is created.
        label: Human-readable name used in the failure message.
        api_base: Optional non-default API endpoint (used for Z.AI).
        temperature: Sampling temperature for the LM.
        max_tokens: Generation cap for the LM.

    Returns:
        A configured dspy.LM, or None when the credential is missing or
        construction raises (failure is printed, never re-raised).
    """
    api_key = os.environ.get(env_var)
    if not api_key:
        # No credential: silently unavailable, caller decides on fallback.
        return None
    try:
        kwargs = {
            "api_key": api_key,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        if api_base is not None:
            kwargs["api_base"] = api_base
        lm = dspy.LM(model, **kwargs)
        print(success_msg)
        return lm
    except Exception as e:
        print(f"Failed to create {label}: {e}")
        return None


def create_openai_quality_lm() -> "Optional[dspy.LM]":
    """Create OpenAI GPT-4o for quality generation."""
    return _build_lm(
        "OPENAI_API_KEY",
        "openai/gpt-4o",
        "Quality LM: OpenAI GPT-4o (~$2.50/$10 per 1M tokens)",
        "OpenAI quality LM",
        temperature=0.3,
        max_tokens=2000,
    )


def create_openai_fast_lm() -> "Optional[dspy.LM]":
    """Create OpenAI GPT-4o-mini for fast operations."""
    return _build_lm(
        "OPENAI_API_KEY",
        "openai/gpt-4o-mini",
        "Fast LM: OpenAI GPT-4o-mini (~$0.15/$0.60 per 1M tokens)",
        "OpenAI fast LM",
        temperature=0.2,
        max_tokens=1000,
    )


def create_zai_quality_lm() -> "Optional[dspy.LM]":
    """Create Z.AI GLM-4.6 for quality generation (FREE but slow)."""
    return _build_lm(
        "ZAI_API_TOKEN",
        "openai/glm-4.6",
        "Quality LM: Z.AI GLM-4.6 (FREE, ~30-40s per call)",
        "Z.AI quality LM",
        api_base="https://api.z.ai/api/coding/paas/v4",
        temperature=0.3,
        max_tokens=2000,
    )


def create_zai_fast_lm() -> "Optional[dspy.LM]":
    """Create Z.AI GLM-4.5-flash for fast operations (FREE but slow)."""
    return _build_lm(
        "ZAI_API_TOKEN",
        "openai/glm-4.5-flash",
        "Fast LM: Z.AI GLM-4.5-flash (FREE, ~20-30s per call)",
        "Z.AI fast LM",
        api_base="https://api.z.ai/api/coding/paas/v4",
        temperature=0.2,
        max_tokens=1000,
    )
def create_lms(provider: str = "openai") -> tuple[Optional[dspy.LM], Optional[dspy.LM]]:
    """Create quality and fast LMs based on provider preference.

    Tries the preferred provider first; any slot (quality or fast) that
    could not be created falls back to the other provider.

    Returns:
        Tuple of (quality_lm, fast_lm)
    """
    if provider == "openai":
        # Prefer OpenAI (faster), fall back to Z.AI.
        primary_quality, primary_fast = create_openai_quality_lm, create_openai_fast_lm
        backup_quality, backup_fast = create_zai_quality_lm, create_zai_fast_lm
        fallback_note = "OpenAI not available, falling back to Z.AI..."
    else:
        # Prefer Z.AI (free but slow), fall back to OpenAI.
        primary_quality, primary_fast = create_zai_quality_lm, create_zai_fast_lm
        backup_quality, backup_fast = create_openai_quality_lm, create_openai_fast_lm
        fallback_note = "Z.AI not available, falling back to OpenAI..."

    quality_lm = primary_quality()
    fast_lm = primary_fast()

    # The note is only printed when the quality slot needs a fallback,
    # matching the original behavior; the fast slot falls back silently.
    if not quality_lm:
        print(fallback_note)
        quality_lm = backup_quality()
    if not fast_lm:
        fast_lm = backup_fast()

    return quality_lm, fast_lm
def benchmark_query(pipeline: HeritageRAGPipeline, question: str, language: str = "nl") -> dict:
    """Run a single query and measure timing.

    Returns a dict with ``success``, ``elapsed_seconds`` and, on success,
    the intent, answer length, confidence and any timing detail the
    pipeline attached; on failure, the stringified error instead.
    """
    started = time.perf_counter()

    try:
        outcome = pipeline.forward(
            question=question,
            language=language,
            include_viz=False,
            skip_cache=True,  # Always skip cache for accurate benchmarking
        )
        elapsed = time.perf_counter() - started
        # Result-dict construction stays inside the try so that an oddly
        # shaped pipeline result is also reported as a failure.
        return {
            "success": True,
            "elapsed_seconds": elapsed,
            "intent": outcome.intent,
            "answer_length": len(outcome.answer) if outcome.answer else 0,
            "confidence": getattr(outcome, "confidence", None),
            "timing_breakdown": getattr(outcome, "timing_breakdown", None),
            "timing_ms": getattr(outcome, "timing_ms", None),
        }
    except Exception as exc:
        return {
            "success": False,
            "elapsed_seconds": time.perf_counter() - started,
            "error": str(exc),
        }
def format_timing_breakdown(timing_breakdown) -> str:
    """Render a per-stage timing dict as indented display lines.

    Keys like ``"answer_gen_ms"`` become stage names like ``"Answer Gen"``;
    a falsy/empty mapping yields a placeholder message.
    """
    if not timing_breakdown:
        return " (no detailed timing available)"

    rendered = [
        f" {key.replace('_ms', '').replace('_', ' ').title()}: {value_ms:.0f}ms"
        for key, value_ms in timing_breakdown.items()
    ]
    return "\n".join(rendered)
# Canonical test queries (question, language); covers one query per intent class.
_TEST_QUERIES = [
    ("Hoeveel musea zijn er in Amsterdam?", "nl"),  # statistical
    ("What archives are in The Hague?", "en"),  # geographic
    ("Welke bibliotheken hebben digitale collecties?", "nl"),  # descriptive
    ("Show me museums with Wikidata IDs", "en"),  # exploratory
    ("Vergelijk archieven in Utrecht en Rotterdam", "nl"),  # comparative
]


def _random_suffix(k: int = 4) -> str:
    """Return k random lowercase letters used to defeat DSPy's per-query cache."""
    import random
    import string

    return "".join(random.choices(string.ascii_lowercase, k=k))


def _tagged_questions(rand: str) -> list:
    """Return the test queries with a unique cache-busting tag appended to each."""
    letters = "abcde"
    return [
        (f"{q} [{rand}{letters[i]}]", lang)
        for i, (q, lang) in enumerate(_TEST_QUERIES)
    ]


def _warmup(pipeline, question: str) -> None:
    """Run one throwaway query to eliminate cold-start effects; never raises."""
    try:
        pipeline.forward(question=question, language="en")
        print(" OK")
    except Exception as e:
        print(f" Failed: {e}")


def _run_suite(pipeline, questions, display_questions) -> list:
    """Run every question through *pipeline*, printing per-query results.

    Returns the list of result dicts produced by benchmark_query.
    """
    results = []
    for i, (question, lang) in enumerate(questions):
        display_q = display_questions[i][0]
        print(f"\n Query: {display_q[:50]}...")
        result = benchmark_query(pipeline, question, lang)
        results.append(result)

        if result["success"]:
            print(f" Time: {result['elapsed_seconds']:.2f}s | Intent: {result['intent']} | Answer: {result['answer_length']} chars")
            if result.get("timing_breakdown"):
                print(format_timing_breakdown(result["timing_breakdown"]))
        else:
            print(f" FAILED: {result['error'][:80]}...")
    return results


def _summarize(results) -> tuple:
    """Return (average elapsed seconds, number of successful queries)."""
    avg = sum(r["elapsed_seconds"] for r in results) / len(results)
    ok = sum(1 for r in results if r["success"])
    return avg, ok


def run_benchmark(provider: str = "openai", quick: bool = False):
    """Run the full benchmark comparing baseline vs optimized configurations.

    Compares 3 configurations:
    1. Baseline: GPT-4o + schema-aware signatures (current production)
    2. Optimized v1: GPT-4o-mini + schema-aware signatures (current optimization)
    3. Optimized v2: GPT-4o-mini + SIMPLE signatures (hypothesis: faster)

    Args:
        provider: LLM provider to use ("openai" or "zai")
        quick: If True, only run 2 test queries
    """
    print("=" * 70)
    print("RAG Pipeline Performance Benchmark")
    print(f"Started: {datetime.now().isoformat()}")
    print(f"Provider: {provider.upper()}")
    print("=" * 70)

    # Generate random suffixes to bypass DSPy internal cache
    # (DSPy caches by query string, so we need unique queries for fair comparison)
    baseline_questions = _tagged_questions(_random_suffix())
    optimized_v1_questions = _tagged_questions(_random_suffix())
    optimized_v2_questions = _tagged_questions(_random_suffix())
    # Display questions (without random suffix for clarity)
    display_questions = list(_TEST_QUERIES)

    if quick:
        baseline_questions = baseline_questions[:2]
        optimized_v1_questions = optimized_v1_questions[:2]
        optimized_v2_questions = optimized_v2_questions[:2]
        display_questions = display_questions[:2]
        print(f"\n[QUICK MODE: Testing with {len(display_questions)} queries]")

    # Create LMs
    print("\n--- Creating Language Models ---")
    quality_lm, fast_lm = create_lms(provider)

    if not quality_lm:
        print("ERROR: Could not create quality LM.")
        print("Set OPENAI_API_KEY or ZAI_API_TOKEN environment variable.")
        sys.exit(1)

    # Configure default LM
    dspy.configure(lm=quality_lm)

    # =========================================================================
    # Create all 3 pipelines
    # =========================================================================
    print("\n--- Creating Pipelines ---")

    # Baseline: Quality LM + schema-aware signatures
    print(" Creating baseline pipeline (GPT-4o + schema-aware)...")
    baseline_pipeline = HeritageRAGPipeline(
        fast_lm=None,  # No fast LM - all stages use quality LM
        quality_lm=None,  # Use global default
        use_schema_aware=True,  # Schema-aware signatures
    )

    optimized_v1_pipeline = None
    optimized_v2_pipeline = None
    if fast_lm:
        # Optimized v1: Fast LM + schema-aware signatures (current approach - SLOW!)
        print(" Creating optimized v1 pipeline (GPT-4o-mini + schema-aware)...")
        optimized_v1_pipeline = HeritageRAGPipeline(
            fast_lm=fast_lm,
            quality_lm=quality_lm,
            use_schema_aware=True,  # Schema-aware (large prompt)
        )
        # Optimized v2: Fast LM + SIMPLE signatures (hypothesis: faster!)
        print(" Creating optimized v2 pipeline (GPT-4o-mini + SIMPLE signatures)...")
        optimized_v2_pipeline = HeritageRAGPipeline(
            fast_lm=fast_lm,
            quality_lm=quality_lm,
            use_schema_aware=False,  # SIMPLE signatures (smaller prompt)
        )

    # =========================================================================
    # Warmup: Eliminate cold-start effects for all pipelines
    # =========================================================================
    print("\n--- Warming up pipelines ---")
    warmup_q = "What is a museum? [warmup]"

    print(" Warming up baseline...")
    _warmup(baseline_pipeline, warmup_q)
    if optimized_v1_pipeline:
        print(" Warming up optimized v1 (schema-aware)...")
        _warmup(optimized_v1_pipeline, warmup_q + " [v1]")
    if optimized_v2_pipeline:
        print(" Warming up optimized v2 (simple)...")
        _warmup(optimized_v2_pipeline, warmup_q + " [v2]")

    # =========================================================================
    # Benchmark 1: Baseline (GPT-4o + schema-aware)
    # =========================================================================
    print("\n" + "=" * 70)
    print("BENCHMARK 1: Baseline (GPT-4o + schema-aware signatures)")
    print("=" * 70)

    baseline_results = _run_suite(baseline_pipeline, baseline_questions, display_questions)
    baseline_avg, baseline_success = _summarize(baseline_results)
    print(f"\n Baseline Average: {baseline_avg:.2f}s ({baseline_success}/{len(baseline_results)} successful)")

    # =========================================================================
    # Benchmark 2: Optimized v1 (GPT-4o-mini + schema-aware) - CURRENT APPROACH
    # =========================================================================
    print("\n" + "=" * 70)
    print("BENCHMARK 2: Optimized v1 (GPT-4o-mini + schema-aware signatures)")
    print("=" * 70)

    if not optimized_v1_pipeline:
        print(" SKIPPED: No fast LM available")
        optimized_v1_avg = baseline_avg
        optimized_v1_results = baseline_results
    else:
        optimized_v1_results = _run_suite(optimized_v1_pipeline, optimized_v1_questions, display_questions)
        optimized_v1_avg, optimized_v1_success = _summarize(optimized_v1_results)
        print(f"\n Optimized v1 Average: {optimized_v1_avg:.2f}s ({optimized_v1_success}/{len(optimized_v1_results)} successful)")

    # =========================================================================
    # Benchmark 3: Optimized v2 (GPT-4o-mini + SIMPLE signatures) - HYPOTHESIS
    # =========================================================================
    print("\n" + "=" * 70)
    print("BENCHMARK 3: Optimized v2 (GPT-4o-mini + SIMPLE signatures)")
    print("=" * 70)

    if not optimized_v2_pipeline:
        print(" SKIPPED: No fast LM available")
        optimized_v2_avg = baseline_avg
        optimized_v2_results = baseline_results
    else:
        optimized_v2_results = _run_suite(optimized_v2_pipeline, optimized_v2_questions, display_questions)
        optimized_v2_avg, optimized_v2_success = _summarize(optimized_v2_results)
        print(f"\n Optimized v2 Average: {optimized_v2_avg:.2f}s ({optimized_v2_success}/{len(optimized_v2_results)} successful)")

    # =========================================================================
    # Summary
    # =========================================================================
    print("\n" + "=" * 70)
    print("SUMMARY - Three Configuration Comparison")
    print("=" * 70)

    print(f"\n {'Configuration':<50} {'Avg Time':>10} {'vs Baseline':>12}")
    print(" " + "-" * 72)

    # Baseline
    print(f" {'1. Baseline (GPT-4o + schema-aware)':<50} {baseline_avg:>9.2f}s {'(baseline)':>12}")

    # Optimized v1 (schema-aware)
    if optimized_v1_pipeline:
        v1_diff = (optimized_v1_avg - baseline_avg) / baseline_avg * 100
        v1_sign = "+" if v1_diff > 0 else ""
        print(f" {'2. Optimized v1 (GPT-4o-mini + schema-aware)':<50} {optimized_v1_avg:>9.2f}s {v1_sign}{v1_diff:>10.0f}%")

    # Optimized v2 (simple)
    if optimized_v2_pipeline:
        v2_diff = (optimized_v2_avg - baseline_avg) / baseline_avg * 100
        v2_sign = "+" if v2_diff > 0 else ""
        print(f" {'3. Optimized v2 (GPT-4o-mini + SIMPLE)':<50} {optimized_v2_avg:>9.2f}s {v2_sign}{v2_diff:>10.0f}%")

    # Winner determination
    print("\n ANALYSIS:")
    if optimized_v2_pipeline and optimized_v1_pipeline:
        if optimized_v2_avg < optimized_v1_avg:
            improvement = (optimized_v1_avg - optimized_v2_avg) / optimized_v1_avg * 100
            print(f" -> SIMPLE signatures are {improvement:.0f}% faster than schema-aware with GPT-4o-mini")
            if optimized_v2_avg < baseline_avg:
                print(" -> Optimized v2 (SIMPLE) is the WINNER - faster than baseline!")
            else:
                print(" -> But still slower than GPT-4o baseline (schema complexity not the only issue)")
        else:
            print(" -> Schema-aware signatures are NOT the bottleneck")
            print(" -> The issue may be in DSPy context switching or GPT-4o-mini itself")

    # Per-query comparison table
    print("\n Per-Query Comparison:")
    print(f" {'Query':<35} {'Baseline':>9} {'v1(SA)':>9} {'v2(Simple)':>9}")
    print(" " + "-" * 62)
    for i, (question, _) in enumerate(display_questions):
        b = baseline_results[i]
        v1 = optimized_v1_results[i] if optimized_v1_pipeline else {"success": False}
        v2 = optimized_v2_results[i] if optimized_v2_pipeline else {"success": False}

        b_str = f"{b['elapsed_seconds']:.2f}s" if b["success"] else "FAIL"
        v1_str = f"{v1['elapsed_seconds']:.2f}s" if v1.get("success") else "FAIL"
        v2_str = f"{v2['elapsed_seconds']:.2f}s" if v2.get("success") else "FAIL"

        print(f" {question[:33]:<35} {b_str:>9} {v1_str:>9} {v2_str:>9}")

    # Cost estimate
    print("\n Estimated Cost (per query):")
    if provider == "openai":
        # Rough estimates based on typical token usage
        # Baseline: ~5 LLM calls × ~2000 tokens each × GPT-4o pricing
        baseline_cost = 5 * 2000 * (2.50 + 10.00) / 2 / 1_000_000
        # Optimized: ~4 LLM calls × mini + 1 call × 4o
        optimized_cost = 4 * 1000 * (0.15 + 0.60) / 2 / 1_000_000 + 1 * 2000 * (2.50 + 10.00) / 2 / 1_000_000
        print(f" Baseline (all GPT-4o): ~${baseline_cost:.4f}")
        print(f" Optimized (mini + 4o): ~${optimized_cost:.4f}")
        print(f" Cost savings: ~{(1 - optimized_cost/baseline_cost) * 100:.0f}%")
    else:
        print(" Z.AI models are FREE (but slow)")

    print("\n" + "=" * 70)
    print("Benchmark Complete!")
    print("=" * 70)
def main():
    """CLI entry point: parse command-line options and launch the benchmark."""
    parser = argparse.ArgumentParser(description="Benchmark RAG pipeline LM optimization")
    parser.add_argument(
        "--provider",
        choices=["openai", "zai"],
        default="openai",
        help="LLM provider to use (default: openai for speed)",
    )
    parser.add_argument(
        "--quick",
        action="store_true",
        help="Quick test with only 2 queries",
    )
    opts = parser.parse_args()

    run_benchmark(provider=opts.provider, quick=opts.quick)


if __name__ == "__main__":
    main()