# glam/backend/rag/benchmark_optimization_v2.py
# Snapshot metadata: 2025-12-21 22:12:34 +01:00 — 394 lines, 14 KiB, Python
#!/usr/bin/env python3
"""
Benchmark script comparing baseline vs optimized Heritage RAG pipeline.
This version runs each query in a separate subprocess to eliminate
in-memory caching artifacts that caused false "0% improvement" results.
Usage:
python benchmark_optimization_v2.py
Requirements:
- SSH tunnel active: ssh -f -N -L 7878:localhost:7878 root@91.98.224.44
- Environment loaded: source .venv/bin/activate && source .env
"""
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
# =============================================================================
# TEST QUERIES
# =============================================================================
# Benchmark query set: each entry pairs a natural-language query with the
# intent and (bilingual EN/NL) keywords a good answer is expected to contain.
TEST_QUERIES = [
    {
        "query": "List all libraries in Utrecht",
        "expected_intent": "geographic",
        # Bilingual keywords so either an English or Dutch answer can score.
        "expected_keywords": ["library", "bibliotheek", "utrecht"],
        "description": "Geographic query - libraries in specific city",
    },
    {
        "query": "What is the history of the Rijksmuseum?",
        "expected_intent": "temporal",
        # Several intent labels are reasonable for an institution-history query.
        "accepted_intents": ["temporal", "entity_lookup", "relational", "exploration"],
        "expected_keywords": ["rijksmuseum", "museum", "1800", "opgericht", "founded", "geschiedenis", "history"],
        "description": "Entity lookup - specific institution history",
    },
    {
        "query": "How many archives are in the Netherlands?",
        "expected_intent": "statistical",
        "expected_keywords": ["archief", "archive", "nederland", "netherlands", "aantal", "number"],
        "description": "Statistical query - count institutions",
    },
    {
        "query": "Compare museums in Amsterdam and Rotterdam",
        "expected_intent": "comparative",
        "expected_keywords": ["amsterdam", "rotterdam", "museum", "musea"],
        "description": "Comparative query - two cities",
    },
    {
        "query": "Welke archieven zijn er in Groningen?",
        "expected_intent": "geographic",
        "expected_keywords": ["archief", "groningen", "groninger"],
        "description": "Dutch language - geographic query",
    },
    {
        "query": "What heritage institutions are in Drenthe?",
        "expected_intent": "geographic",
        "expected_keywords": ["drenthe", "museum", "archief", "heritage", "erfgoed", "drents"],
        "description": "Geographic query - heritage in province",
    },
    {
        "query": "Tell me about archives in Friesland",
        "expected_intent": "geographic",
        # Accept Dutch, Frisian, and institution-name variants.
        "expected_keywords": ["friesland", "fryslân", "archief", "fries", "tresoar", "frysk"],
        "description": "Geographic query - archives in province",
    },
]
def run_query_subprocess(query: str, use_optimized: bool, timeout: int = 120) -> dict:
    """
    Run a single query through the Heritage RAG pipeline in a fresh subprocess.

    Subprocess isolation guarantees no in-memory (DSPy/LiteLLM) cache leaks
    between the baseline and optimized runs — the artifact that previously
    produced false "0% improvement" results.

    Args:
        query: Natural-language query to send through the pipeline.
        use_optimized: When True, load the bootstrap-optimized demos before
            answering; otherwise run the bare baseline pipeline.
        timeout: Maximum seconds to wait for the subprocess.

    Returns:
        dict with keys ``success`` (bool), ``latency`` (float), ``demos``
        (int), plus ``intent``/``answer`` on success or ``error`` on failure.
    """
    if use_optimized:
        load_code = (
            "pipeline.load('backend/rag/optimized_models/heritage_rag_bootstrap_latest.json')\n"
            "demos = len(pipeline.answer_gen.predict.demos)"
        )
    else:
        load_code = "demos = 0"
    # json.dumps yields a fully escaped, quoted literal that is also valid
    # Python — unlike the previous manual quote replacement, this survives
    # backslashes, embedded quotes, and newlines in the query text.
    query_literal = json.dumps(query)
    code = f'''
import sys
import os
import time
import json
sys.path.insert(0, '.')
os.environ['LITELLM_CACHE'] = 'False'
import dspy
from backend.rag.dspy_heritage_rag import HeritageRAGPipeline
# Configure DSPy with no caching
lm = dspy.LM('openai/gpt-4o-mini', cache=False, max_tokens=1024)
dspy.configure(lm=lm)
# Create pipeline
pipeline = HeritageRAGPipeline()
{load_code}
# Run query
query = {query_literal}
start = time.time()
try:
    result = pipeline(query)
    elapsed = time.time() - start
    output = {{
        "success": True,
        "intent": result.intent,
        "answer": result.answer,
        "latency": elapsed,
        "demos": demos
    }}
except Exception as e:
    output = {{
        "success": False,
        "error": str(e),
        "latency": time.time() - start,
        "demos": demos
    }}
print("RESULT_JSON:" + json.dumps(output))
'''
    try:
        # sys.executable keeps the child on the same interpreter/venv as the
        # parent instead of whatever 'python3' resolves to on PATH.
        result = subprocess.run(
            [sys.executable, '-c', code],
            capture_output=True,
            text=True,
            env={**os.environ, 'PYTHONPATH': '.'},
            cwd=str(Path(__file__).parent.parent.parent),
            timeout=timeout,
        )
        # The marker line lets us find the JSON payload amid any library
        # logging on stdout/stderr.
        combined = result.stdout + result.stderr
        match = re.search(r'RESULT_JSON:(\{.*\})', combined, re.DOTALL)
        if match:
            return json.loads(match.group(1))
        return {
            "success": False,
            "error": f"Could not parse output: {combined[:500]}",
            "latency": 0,
            "demos": 0,
        }
    except subprocess.TimeoutExpired:
        return {
            "success": False,
            "error": "Timeout",
            "latency": timeout,
            "demos": 0,
        }
    except Exception as e:
        # Defensive: any spawn failure is reported as a scored error, not a crash.
        return {
            "success": False,
            "error": str(e),
            "latency": 0,
            "demos": 0,
        }
def score_result(query_info: dict, result: dict) -> dict:
    """
    Score a pipeline result against the expectations for its query.

    Weighting: intent match 40%, keyword coverage 40%, answer substance 20%.
    Failed runs score zero across the board.
    """
    if not result.get("success"):
        return {"intent": 0.0, "keywords": 0.0, "non_empty": 0.0, "total": 0.0}

    # --- Intent (40%) -------------------------------------------------------
    expected_intent = query_info.get('expected_intent', '').lower()
    actual_intent = result.get('intent', '').lower()
    # A query may accept several intent labels; default to the primary one.
    allowed = [name.lower() for name in query_info.get('accepted_intents', [expected_intent])]
    # Partial credit for semantically related (but not accepted) intents.
    related_credit = {
        ('geographic', 'entity_lookup'): 0.5,
        ('geographic', 'exploration'): 0.7,
        ('statistical', 'comparative'): 0.5,
        ('exploration', 'entity_lookup'): 0.7,
        ('exploration', 'geographic'): 0.7,
        ('temporal', 'entity_lookup'): 0.8,
        ('temporal', 'relational'): 0.7,
        ('relational', 'entity_lookup'): 0.7,
    }
    if actual_intent in allowed:
        intent_score = 1.0
    else:
        forward = (expected_intent, actual_intent)
        backward = (actual_intent, expected_intent)
        intent_score = related_credit.get(forward, related_credit.get(backward, 0.0))

    # --- Keywords (40%): fraction of expected keywords in the answer --------
    answer_text = result.get('answer', '').lower()
    expected_keywords = query_info.get('expected_keywords', [])
    if expected_keywords:
        hits = sum(1 for word in expected_keywords if word.lower() in answer_text)
        keyword_score = hits / len(expected_keywords)
    else:
        keyword_score = 0.0

    # --- Substance (20%): longer (stripped) answers earn more credit --------
    stripped_len = len(answer_text.strip())
    if stripped_len > 100:
        substance_score = 1.0
    elif stripped_len > 20:
        substance_score = 0.7
    elif stripped_len > 0:
        substance_score = 0.3
    else:
        substance_score = 0.0

    total = 0.40 * intent_score + 0.40 * keyword_score + 0.20 * substance_score
    return {
        "intent": intent_score,
        "keywords": keyword_score,
        "non_empty": substance_score,
        "total": total,
    }
def run_benchmark():
    """
    Run the full benchmark: every test query through the baseline and the
    optimized pipeline (each in its own subprocess), score both, print a
    per-query and aggregate comparison, and persist the results as JSON.

    Returns:
        dict: the summary that was written to benchmark_results_v2.json.
    """
    print("\n" + "=" * 70)
    print("Heritage RAG Pipeline - Optimization Benchmark v2")
    print("(Subprocess isolation to prevent caching artifacts)")
    print("=" * 70)
    baseline_results = []
    optimized_results = []

    # ==========================================================================
    # RUN ALL QUERIES
    # ==========================================================================
    for i, q in enumerate(TEST_QUERIES):
        print(f"\n{'='*70}")
        print(f"[{i+1}/{len(TEST_QUERIES)}] {q['description']}")
        print(f"Query: {q['query']}")
        print("=" * 70)

        # BASELINE: bare pipeline, no optimized demos loaded.
        print("\n[BASELINE] Running in subprocess...")
        base_result = run_query_subprocess(q['query'], use_optimized=False)
        base_scores = score_result(q, base_result)
        baseline_results.append({
            "query": q,
            "result": base_result,
            "scores": base_scores
        })
        if base_result.get("success"):
            print(f" Intent: {base_result['intent']} (expected: {q['expected_intent']}) → {base_scores['intent']:.0%}")
            print(f" Keywords: {base_scores['keywords']:.0%} matched")
            print(f" Score: {base_scores['total']:.2f} | Latency: {base_result['latency']:.1f}s")
            print(f" Answer: {base_result['answer'][:100]}...")
        else:
            print(f" ERROR: {base_result.get('error', 'Unknown')}")

        # OPTIMIZED: same query with bootstrap demos loaded.
        print("\n[OPTIMIZED] Running in subprocess...")
        opt_result = run_query_subprocess(q['query'], use_optimized=True)
        opt_scores = score_result(q, opt_result)
        optimized_results.append({
            "query": q,
            "result": opt_result,
            "scores": opt_scores
        })
        if opt_result.get("success"):
            print(f" Demos loaded: {opt_result.get('demos', 0)}")
            print(f" Intent: {opt_result['intent']} (expected: {q['expected_intent']}) → {opt_scores['intent']:.0%}")
            print(f" Keywords: {opt_scores['keywords']:.0%} matched")
            print(f" Score: {opt_scores['total']:.2f} | Latency: {opt_result['latency']:.1f}s")
            print(f" Answer: {opt_result['answer'][:100]}...")
        else:
            print(f" ERROR: {opt_result.get('error', 'Unknown')}")

        # Per-query comparison
        delta = opt_scores['total'] - base_scores['total']
        print(f"\n → Delta: {delta:+.2f} ({'improved' if delta > 0 else 'same' if delta == 0 else 'worse'})")

    # ==========================================================================
    # SUMMARY
    # ==========================================================================
    print("\n" + "=" * 70)
    print("BENCHMARK SUMMARY")
    print("=" * 70)
    baseline_scores = [r['scores']['total'] for r in baseline_results]
    optimized_scores = [r['scores']['total'] for r in optimized_results]
    baseline_avg = sum(baseline_scores) / len(baseline_scores)
    optimized_avg = sum(optimized_scores) / len(optimized_scores)
    improvement = optimized_avg - baseline_avg
    # Guard against division by zero when the baseline scored 0 everywhere.
    improvement_pct = (improvement / baseline_avg * 100) if baseline_avg > 0 else 0

    # Latency averages over successful runs only (failures report fake latencies).
    baseline_latencies = [r['result']['latency'] for r in baseline_results if r['result'].get('success')]
    optimized_latencies = [r['result']['latency'] for r in optimized_results if r['result'].get('success')]
    baseline_lat_avg = sum(baseline_latencies) / len(baseline_latencies) if baseline_latencies else 0
    optimized_lat_avg = sum(optimized_latencies) / len(optimized_latencies) if optimized_latencies else 0

    print("\n Quality Scores:")
    print(f" Baseline Score: {baseline_avg:.3f}")
    print(f" Optimized Score: {optimized_avg:.3f}")
    print(f" Improvement: {improvement:+.3f} ({improvement_pct:+.1f}%)")
    print("\n Latency (avg):")
    print(f" Baseline: {baseline_lat_avg:.1f}s")
    print(f" Optimized: {optimized_lat_avg:.1f}s")
    print(f" Speedup: {((baseline_lat_avg - optimized_lat_avg) / baseline_lat_avg * 100) if baseline_lat_avg > 0 else 0:.1f}%")
    print("\n Per-query comparison:")
    print(f" {'Query':<40} {'Base':>8} {'Opt':>8} {'Δ':>8} {'Base(s)':>8} {'Opt(s)':>8}")
    print(" " + "-" * 82)
    for i, q in enumerate(TEST_QUERIES):
        base = baseline_results[i]['scores']['total']
        opt = optimized_results[i]['scores']['total']
        delta = opt - base
        base_lat = baseline_results[i]['result']['latency']
        opt_lat = optimized_results[i]['result']['latency']
        desc = q['description'][:38]
        print(f" {desc:<40} {base:>8.2f} {opt:>8.2f} {delta:>+8.2f} {base_lat:>8.1f} {opt_lat:>8.1f}")
    print("\n" + "=" * 70)

    # Persist the summary so later runs can be compared.
    results = {
        "timestamp": datetime.now().isoformat(),
        "version": "v2_subprocess_isolation",
        "baseline_avg": baseline_avg,
        "optimized_avg": optimized_avg,
        "improvement": improvement,
        "improvement_pct": improvement_pct,
        "baseline_latency_avg": baseline_lat_avg,
        "optimized_latency_avg": optimized_lat_avg,
        "per_query": [
            {
                "query": q["query"],
                "description": q["description"],
                "baseline_score": baseline_results[i]['scores']['total'],
                "optimized_score": optimized_results[i]['scores']['total'],
                "baseline_latency": baseline_results[i]['result']['latency'],
                "optimized_latency": optimized_results[i]['result']['latency'],
                "baseline_intent": baseline_results[i]['result'].get('intent'),
                "optimized_intent": optimized_results[i]['result'].get('intent'),
            }
            for i, q in enumerate(TEST_QUERIES)
        ]
    }
    results_path = Path(__file__).parent / "optimized_models" / "benchmark_results_v2.json"
    # Ensure the output directory exists — a baseline-only checkout may not
    # have optimized_models/, and failing here would discard the whole run.
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with open(results_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {results_path.name}")
    return results
# Script entry point: run the full baseline-vs-optimized benchmark.
if __name__ == "__main__":
    run_benchmark()