#!/usr/bin/env python3
"""
RAG Evaluation Harness
CLI tool for running evaluation of the Heritage RAG system against
the golden dataset. Calls the RAG API and computes metrics.
Usage:
python -m backend.rag.evaluation.run_evaluation --help
python -m backend.rag.evaluation.run_evaluation --api-url https://archief.support
python -m backend.rag.evaluation.run_evaluation --max-examples 10 --verbose
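    python -m backend.rag.evaluation.run_evaluation --output results.json --json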
"""
import argparse
import json
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Any

import httpx

from .dataset_loader import load_golden_dataset, GoldenExample
from .metrics import (
    count_accuracy,
    slot_extraction_accuracy,
    heritage_rag_metric,
    extract_count_from_answer,
    format_evaluation_report,
)

def call_rag_api(
    question: str,
    api_url: str,
    timeout: float = 30.0,
    language: str = "nl",
) -> dict[str, Any]:
    """
    Call the RAG API with a question and return the response.

    Args:
        question: The question to ask
        api_url: Base URL of the RAG API
        timeout: Request timeout in seconds
        language: Language code (default "nl")

    Returns:
        API response as dictionary
    """
    endpoint = f"{api_url.rstrip('/')}/api/rag/dspy/query"
    payload = {
        "question": question,
        "language": language,
        "include_visualization": True,
        "use_agent": False,
    }
    with httpx.Client(timeout=timeout) as client:
        response = client.post(endpoint, json=payload)
        response.raise_for_status()
        return response.json()
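
# A minimal usage sketch for call_rag_api (the question is illustrative, not
# taken from the golden dataset; assumes the API is reachable):
#
#     resp = call_rag_api("Hoeveel rijksmonumenten zijn er?", "https://archief.support")
#     print(resp.get("answer"))
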
def evaluate_example(
    example: GoldenExample,
    api_url: str,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Evaluate a single example against the RAG API.

    Args:
        example: The golden example to test
        api_url: Base URL of the RAG API
        verbose: Whether to print progress

    Returns:
        Evaluation result dictionary
    """
    result = {
        "id": example.id,
        "category": example.category,
        "subcategory": example.subcategory,
        "question": example.question,
        "expected_count": example.expected_count,
        "expected_slots": example.expected_slots,
    }
    try:
        start_time = time.time()
        response = call_rag_api(example.question, api_url)
        elapsed = time.time() - start_time
        result["response_time_ms"] = int(elapsed * 1000)
        result["answer"] = response.get("answer", "")
        result["visualization"] = response.get("visualization")
        # Extract the actual count from the response; fall back to
        # parsing it out of the answer text if the API omits it
        actual_count = response.get("count")
        if actual_count is None:
            actual_count = extract_count_from_answer(result["answer"])
        result["actual_count"] = actual_count
        # Calculate metrics
        result["count_score"] = count_accuracy(example, response)
        result["slot_score"] = slot_extraction_accuracy(example, response)
        result["score"] = heritage_rag_metric(example, response)
        result["success"] = True
    except httpx.HTTPError as e:
        result["success"] = False
        result["error"] = f"HTTP error: {e}"
        result["score"] = 0.0
    except Exception as e:
        result["success"] = False
        result["error"] = f"Error: {e}"
        result["score"] = 0.0
    if verbose:
        status = "✓" if result.get("score", 0) >= 0.8 else "✗"
        print(
            f"  {status} [{example.id}] {example.question[:40]}... "
            f"-> {result.get('actual_count')} (expected {example.expected_count})"
        )
    return result
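
# A standalone-use sketch (assumes the golden dataset is present; the URL is
# illustrative and nothing here is a recorded result):
#
#     ex = load_golden_dataset(max_examples=1)[0]
#     print(evaluate_example(ex, "https://archief.support")["score"])
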
def run_evaluation(
    api_url: str = "https://archief.support",
    dataset_path: str | Path | None = None,
    category_filter: str | None = None,
    max_examples: int | None = None,
    output_file: str | Path | None = None,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Run evaluation of the RAG system against the golden dataset.

    Args:
        api_url: Base URL of the RAG API
        dataset_path: Path to the golden dataset JSON
        category_filter: Only evaluate examples of this category
        max_examples: Maximum number of examples to evaluate
        output_file: Path to write detailed results JSON
        verbose: Whether to print progress

    Returns:
        Summary of evaluation results
    """
    if verbose:
        print("Loading golden dataset...")
    examples = load_golden_dataset(
        dataset_path=dataset_path,
        category_filter=category_filter,
        max_examples=max_examples,
    )
    if verbose:
        print(f"Loaded {len(examples)} examples")
        print(f"Evaluating against {api_url}...")
        print()
    results = []
    for i, example in enumerate(examples):
        if verbose:
            print(f"[{i + 1}/{len(examples)}]", end=" ")
        result = evaluate_example(example, api_url, verbose=verbose)
        results.append(result)
    # Calculate summary statistics; max(..., 1) guards against division
    # by zero when no examples ran or no calls succeeded
    total = len(results)
    successful = sum(1 for r in results if r.get("success"))
    passed = sum(1 for r in results if r.get("score", 0) >= 0.8)
    avg_score = sum(r.get("score", 0) for r in results) / max(total, 1)
    avg_response_time = sum(r.get("response_time_ms", 0) for r in results) / max(successful, 1)
    summary = {
        "timestamp": datetime.now().isoformat(),
        "api_url": api_url,
        "total_examples": total,
        "successful_calls": successful,
        "passed_examples": passed,
        "pass_rate": passed / max(total, 1),
        "average_score": avg_score,
        "average_response_time_ms": int(avg_response_time),
        "category_filter": category_filter,
        "results": results,
    }
    # Write detailed results to file
    if output_file:
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)
        if verbose:
            print(f"\nDetailed results written to: {output_path}")
    # Print summary report
    if verbose:
        print()
        print(format_evaluation_report(results))
    return summary
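
# A programmatic-use sketch, e.g. from a CI script, mirroring the CLI path in
# main() below (the example count and output path are illustrative):
#
#     summary = run_evaluation(max_examples=5, output_file="eval_results.json")
#     if summary["pass_rate"] < 0.8:
#         raise SystemExit(1)
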
def main():
"""CLI entry point."""
parser = argparse.ArgumentParser(
description="Evaluate Heritage RAG system against golden dataset"
)
parser.add_argument(
"--api-url",
default="https://archief.support",
help="Base URL of the RAG API (default: https://archief.support)"
)
parser.add_argument(
"--dataset",
type=str,
default=None,
help="Path to golden dataset JSON file"
)
parser.add_argument(
"--category",
type=str,
default=None,
help="Only evaluate examples of this category (e.g., 'count')"
)
parser.add_argument(
"--max-examples",
type=int,
default=None,
help="Maximum number of examples to evaluate"
)
parser.add_argument(
"--output",
type=str,
default=None,
help="Path to write detailed results JSON"
)
parser.add_argument(
"--verbose", "-v",
action="store_true",
help="Print progress and detailed output"
)
parser.add_argument(
"--json",
action="store_true",
help="Output summary as JSON (for CI integration)"
)
args = parser.parse_args()
try:
summary = run_evaluation(
api_url=args.api_url,
dataset_path=args.dataset,
category_filter=args.category,
max_examples=args.max_examples,
output_file=args.output,
verbose=args.verbose,
)
if args.json:
# Output just the summary (without full results) as JSON
json_summary = {k: v for k, v in summary.items() if k != "results"}
print(json.dumps(json_summary, indent=2))
# Exit with error if pass rate is below threshold
if summary["pass_rate"] < 0.8:
print(f"\n⚠️ Pass rate {summary['pass_rate']:.1%} is below 80% threshold", file=sys.stderr)
sys.exit(1)
except FileNotFoundError as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
except httpx.HTTPError as e:
print(f"API error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()