#!/usr/bin/env python3
|
|
"""
|
|
Benchmark script comparing baseline vs optimized Heritage RAG pipeline.
|
|
|
|
This version runs each query in a separate subprocess to eliminate
|
|
in-memory caching artifacts that caused false "0% improvement" results.
|
|
|
|
Usage:
|
|
python benchmark_optimization_v2.py
|
|
|
|
Requirements:
|
|
- SSH tunnel active: ssh -f -N -L 7878:localhost:7878 root@91.98.224.44
|
|
- Environment loaded: source .venv/bin/activate && source .env
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# =============================================================================
|
|
# TEST QUERIES
|
|
# =============================================================================
|
|
|
|
# Benchmark test set.  Each entry pairs a natural-language query (English or
# Dutch) with the expected intent class and the answer keywords that
# score_result() uses for grading.  "accepted_intents", when present, lists
# alternative intents that also count as a full match.
TEST_QUERIES = [
    {
        "query": "List all libraries in Utrecht",
        "expected_intent": "geographic",
        # Include both English AND Dutch keywords for bilingual scoring
        "expected_keywords": ["library", "bibliotheek", "utrecht"],
        "description": "Geographic query - libraries in specific city"
    },
    {
        "query": "What is the history of the Rijksmuseum?",
        # Accept temporal, entity_lookup, or relational as valid intents
        "expected_intent": "temporal",
        "accepted_intents": ["temporal", "entity_lookup", "relational", "exploration"],
        "expected_keywords": ["rijksmuseum", "museum", "1800", "opgericht", "founded", "geschiedenis", "history"],
        "description": "Entity lookup - specific institution history"
    },
    {
        "query": "How many archives are in the Netherlands?",
        "expected_intent": "statistical",
        # Bilingual keywords
        "expected_keywords": ["archief", "archive", "nederland", "netherlands", "aantal", "number"],
        "description": "Statistical query - count institutions"
    },
    {
        "query": "Compare museums in Amsterdam and Rotterdam",
        "expected_intent": "comparative",
        "expected_keywords": ["amsterdam", "rotterdam", "museum", "musea"],
        "description": "Comparative query - two cities"
    },
    {
        "query": "Welke archieven zijn er in Groningen?",
        "expected_intent": "geographic",
        "expected_keywords": ["archief", "groningen", "groninger"],
        "description": "Dutch language - geographic query"
    },
    {
        "query": "What heritage institutions are in Drenthe?",
        "expected_intent": "geographic",
        # Bilingual heritage keywords
        "expected_keywords": ["drenthe", "museum", "archief", "heritage", "erfgoed", "drents"],
        "description": "Geographic query - heritage in province"
    },
    {
        "query": "Tell me about archives in Friesland",
        "expected_intent": "geographic",
        # Include various forms of Frisian/Friesland
        "expected_keywords": ["friesland", "fryslân", "archief", "fries", "tresoar", "frysk"],
        "description": "Geographic query - archives in province"
    },
]
|
|
|
|
|
|
def run_query_subprocess(query: str, use_optimized: bool, timeout: int = 120) -> dict:
    """
    Run a single query in a separate Python subprocess.

    This eliminates any in-memory caching between baseline and optimized runs.

    Args:
        query: Natural-language query to send through the pipeline.
        use_optimized: If True, the child loads the bootstrap-optimized model
            before answering; otherwise it runs the bare pipeline.
        timeout: Maximum seconds to wait for the subprocess.

    Returns:
        dict with keys "success", "latency", "demos" plus either
        "intent"/"answer" (on success) or "error" (on failure).
    """
    if use_optimized:
        load_code = """
pipeline.load('backend/rag/optimized_models/heritage_rag_bootstrap_latest.json')
demos = len(pipeline.answer_gen.predict.demos)
"""
    else:
        load_code = """
demos = 0
"""

    # FIX: json.dumps produces a valid Python string literal for ANY query
    # text (quotes, backslashes, newlines, non-ASCII).  The previous manual
    # replace()-based escaping broke on backslashes and embedded newlines.
    query_literal = json.dumps(query)

    code = f'''
import sys
import os
import time
import json

sys.path.insert(0, '.')
os.environ['LITELLM_CACHE'] = 'False'

import dspy
from backend.rag.dspy_heritage_rag import HeritageRAGPipeline

# Configure DSPy with no caching
lm = dspy.LM('openai/gpt-4o-mini', cache=False, max_tokens=1024)
dspy.configure(lm=lm)

# Create pipeline
pipeline = HeritageRAGPipeline()
{load_code}

# Run query
query = {query_literal}
start = time.time()
try:
    result = pipeline(query)
    elapsed = time.time() - start
    output = {{
        "success": True,
        "intent": result.intent,
        "answer": result.answer,
        "latency": elapsed,
        "demos": demos
    }}
except Exception as e:
    output = {{
        "success": False,
        "error": str(e),
        "latency": time.time() - start,
        "demos": demos
    }}

print("RESULT_JSON:" + json.dumps(output))
'''

    try:
        # FIX: sys.executable guarantees the child runs under the same
        # (venv) interpreter as the parent; a bare 'python3' could resolve
        # to a different interpreter without the project's dependencies.
        result = subprocess.run(
            [sys.executable, '-c', code],
            capture_output=True,
            text=True,
            env={**os.environ, 'PYTHONPATH': '.'},
            cwd=str(Path(__file__).parent.parent.parent),
            timeout=timeout
        )

        # The child may emit warnings/log noise on either stream, so scan
        # the combined output for the sentinel-tagged JSON payload.
        output = result.stdout + result.stderr
        match = re.search(r'RESULT_JSON:(\{.*\})', output, re.DOTALL)

        if match:
            return json.loads(match.group(1))
        return {
            "success": False,
            "error": f"Could not parse output: {output[:500]}",
            "latency": 0,
            "demos": 0
        }

    except subprocess.TimeoutExpired:
        return {
            "success": False,
            "error": "Timeout",
            "latency": timeout,
            "demos": 0
        }
    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "latency": 0,
            "demos": 0
        }
|
|
|
|
|
|
def score_result(query_info: dict, result: dict) -> dict:
    """Score a result against expected values.

    Weighting: intent match 40%, keyword coverage 40%, non-empty answer 20%.

    Args:
        query_info: Entry from TEST_QUERIES ("expected_intent", optional
            "accepted_intents", "expected_keywords").
        result: Output dict from run_query_subprocess().

    Returns:
        dict with "intent", "keywords", "non_empty" component scores and a
        weighted "total", each in [0.0, 1.0].
    """
    scores = {}

    # Failed runs score zero across the board.
    if not result.get("success"):
        return {"intent": 0.0, "keywords": 0.0, "non_empty": 0.0, "total": 0.0}

    # Intent match (40%).  `or ''` guards against keys that are present but
    # explicitly None (e.g. null in the subprocess JSON), which would
    # otherwise raise AttributeError on .lower().
    expected = (query_info.get('expected_intent') or '').lower()
    actual = (result.get('intent') or '').lower()

    # Check if intent matches primary or accepted alternatives
    accepted = query_info.get('accepted_intents', [expected])
    accepted = [i.lower() for i in accepted]

    intent_match = actual in accepted

    # Partial credit for related intents
    related = {
        ('geographic', 'entity_lookup'): 0.5,
        ('geographic', 'exploration'): 0.7,
        ('statistical', 'comparative'): 0.5,
        ('exploration', 'entity_lookup'): 0.7,
        ('exploration', 'geographic'): 0.7,
        ('temporal', 'entity_lookup'): 0.8,
        ('temporal', 'relational'): 0.7,
        ('relational', 'entity_lookup'): 0.7,
    }

    if intent_match:
        scores['intent'] = 1.0
    else:
        # Look the pair up in both orders so the table stays symmetric.
        pair = (expected, actual)
        reverse = (actual, expected)
        scores['intent'] = related.get(pair, related.get(reverse, 0.0))

    # Keyword match (40%) — fraction of expected keywords found in the
    # (lowercased) answer text.  None-safe, same as above.
    answer = (result.get('answer') or '').lower()
    keywords = query_info.get('expected_keywords', [])
    if keywords:
        matches = sum(1 for kw in keywords if kw.lower() in answer)
        scores['keywords'] = matches / len(keywords)
    else:
        scores['keywords'] = 0.0

    # Non-empty answer (20%) — graded by answer length thresholds.
    if answer and len(answer.strip()) > 100:
        scores['non_empty'] = 1.0
    elif answer and len(answer.strip()) > 20:
        scores['non_empty'] = 0.7
    elif answer and len(answer.strip()) > 0:
        scores['non_empty'] = 0.3
    else:
        scores['non_empty'] = 0.0

    # Weighted total
    scores['total'] = (
        0.40 * scores['intent'] +
        0.40 * scores['keywords'] +
        0.20 * scores['non_empty']
    )

    return scores
|
|
|
|
|
|
def run_benchmark():
    """Run benchmark comparing baseline vs optimized pipeline.

    For every entry in TEST_QUERIES the query is executed twice in isolated
    subprocesses (baseline first, then optimized), both answers are scored
    with score_result(), a per-query and aggregate report is printed, and
    the results are written to optimized_models/benchmark_results_v2.json.

    Returns:
        The results dict that was written to disk.
    """

    print("\n" + "=" * 70)
    print("Heritage RAG Pipeline - Optimization Benchmark v2")
    print("(Subprocess isolation to prevent caching artifacts)")
    print("=" * 70)

    baseline_results = []
    optimized_results = []

    # ==========================================================================
    # RUN ALL QUERIES
    # ==========================================================================

    for i, q in enumerate(TEST_QUERIES):
        print(f"\n{'='*70}")
        print(f"[{i+1}/{len(TEST_QUERIES)}] {q['description']}")
        print(f"Query: {q['query']}")
        print("=" * 70)

        # BASELINE
        print("\n[BASELINE] Running in subprocess...")
        base_result = run_query_subprocess(q['query'], use_optimized=False)
        base_scores = score_result(q, base_result)
        baseline_results.append({
            "query": q,
            "result": base_result,
            "scores": base_scores
        })

        if base_result.get("success"):
            print(f" Intent: {base_result['intent']} (expected: {q['expected_intent']}) → {base_scores['intent']:.0%}")
            print(f" Keywords: {base_scores['keywords']:.0%} matched")
            print(f" Score: {base_scores['total']:.2f} | Latency: {base_result['latency']:.1f}s")
            print(f" Answer: {base_result['answer'][:100]}...")
        else:
            print(f" ERROR: {base_result.get('error', 'Unknown')}")

        # OPTIMIZED
        print("\n[OPTIMIZED] Running in subprocess...")
        opt_result = run_query_subprocess(q['query'], use_optimized=True)
        opt_scores = score_result(q, opt_result)
        optimized_results.append({
            "query": q,
            "result": opt_result,
            "scores": opt_scores
        })

        if opt_result.get("success"):
            print(f" Demos loaded: {opt_result.get('demos', 0)}")
            print(f" Intent: {opt_result['intent']} (expected: {q['expected_intent']}) → {opt_scores['intent']:.0%}")
            print(f" Keywords: {opt_scores['keywords']:.0%} matched")
            print(f" Score: {opt_scores['total']:.2f} | Latency: {opt_result['latency']:.1f}s")
            print(f" Answer: {opt_result['answer'][:100]}...")
        else:
            print(f" ERROR: {opt_result.get('error', 'Unknown')}")

        # Per-query comparison
        delta = opt_scores['total'] - base_scores['total']
        print(f"\n → Delta: {delta:+.2f} ({'improved' if delta > 0 else 'same' if delta == 0 else 'worse'})")

    # ==========================================================================
    # SUMMARY
    # ==========================================================================
    print("\n" + "=" * 70)
    print("BENCHMARK SUMMARY")
    print("=" * 70)

    baseline_scores = [r['scores']['total'] for r in baseline_results]
    optimized_scores = [r['scores']['total'] for r in optimized_results]

    baseline_avg = sum(baseline_scores) / len(baseline_scores)
    optimized_avg = sum(optimized_scores) / len(optimized_scores)

    improvement = optimized_avg - baseline_avg
    improvement_pct = (improvement / baseline_avg * 100) if baseline_avg > 0 else 0

    # Latency comparison — failed runs are excluded so timeouts/errors do
    # not skew the averages.
    baseline_latencies = [r['result']['latency'] for r in baseline_results if r['result'].get('success')]
    optimized_latencies = [r['result']['latency'] for r in optimized_results if r['result'].get('success')]

    baseline_lat_avg = sum(baseline_latencies) / len(baseline_latencies) if baseline_latencies else 0
    optimized_lat_avg = sum(optimized_latencies) / len(optimized_latencies) if optimized_latencies else 0

    print(f"\n Quality Scores:")
    print(f" Baseline Score: {baseline_avg:.3f}")
    print(f" Optimized Score: {optimized_avg:.3f}")
    print(f" Improvement: {improvement:+.3f} ({improvement_pct:+.1f}%)")

    print(f"\n Latency (avg):")
    print(f" Baseline: {baseline_lat_avg:.1f}s")
    print(f" Optimized: {optimized_lat_avg:.1f}s")
    print(f" Speedup: {((baseline_lat_avg - optimized_lat_avg) / baseline_lat_avg * 100) if baseline_lat_avg > 0 else 0:.1f}%")

    print("\n Per-query comparison:")
    print(f" {'Query':<40} {'Base':>8} {'Opt':>8} {'Δ':>8} {'Base(s)':>8} {'Opt(s)':>8}")
    print(" " + "-" * 82)

    for i, q in enumerate(TEST_QUERIES):
        base = baseline_results[i]['scores']['total']
        opt = optimized_results[i]['scores']['total']
        delta = opt - base
        base_lat = baseline_results[i]['result']['latency']
        opt_lat = optimized_results[i]['result']['latency']
        desc = q['description'][:38]
        print(f" {desc:<40} {base:>8.2f} {opt:>8.2f} {delta:>+8.2f} {base_lat:>8.1f} {opt_lat:>8.1f}")

    print("\n" + "=" * 70)

    # Save results
    results = {
        "timestamp": datetime.now().isoformat(),
        "version": "v2_subprocess_isolation",
        "baseline_avg": baseline_avg,
        "optimized_avg": optimized_avg,
        "improvement": improvement,
        "improvement_pct": improvement_pct,
        "baseline_latency_avg": baseline_lat_avg,
        "optimized_latency_avg": optimized_lat_avg,
        "per_query": [
            {
                "query": q["query"],
                "description": q["description"],
                "baseline_score": baseline_results[i]['scores']['total'],
                "optimized_score": optimized_results[i]['scores']['total'],
                "baseline_latency": baseline_results[i]['result']['latency'],
                "optimized_latency": optimized_results[i]['result']['latency'],
                "baseline_intent": baseline_results[i]['result'].get('intent'),
                "optimized_intent": optimized_results[i]['result'].get('intent'),
            }
            for i, q in enumerate(TEST_QUERIES)
        ]
    }

    results_path = Path(__file__).parent / "optimized_models" / "benchmark_results_v2.json"
    # FIX: create the output directory if it is missing, so the dump cannot
    # fail with FileNotFoundError on a fresh checkout.
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with open(results_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {results_path.name}")

    return results
|
|
|
|
|
|
# Script entry point: run the full baseline-vs-optimized benchmark.
if __name__ == "__main__":
    run_benchmark()
|