#!/usr/bin/env python3
"""
RAG Evaluation Harness

CLI tool for running evaluation of the Heritage RAG system against
the golden dataset. Calls the RAG API and computes metrics.

Usage:
    python -m backend.rag.evaluation.run_evaluation --help
    python -m backend.rag.evaluation.run_evaluation --api-url https://archief.support
    python -m backend.rag.evaluation.run_evaluation --max-examples 10 --verbose
"""

import argparse
import json
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Any

import httpx

from .dataset_loader import load_golden_dataset, GoldenExample
from .metrics import (
    count_accuracy,
    slot_extraction_accuracy,
    heritage_rag_metric,
    extract_count_from_answer,
    format_evaluation_report,
)

def call_rag_api(
    question: str,
    api_url: str,
    timeout: float = 30.0,
    language: str = "nl",
) -> dict[str, Any]:
    """
    Send one question to the RAG query endpoint and return the JSON body.

    Args:
        question: Question text to submit
        api_url: Base URL of the RAG API (trailing slashes tolerated)
        timeout: Per-request timeout in seconds
        language: ISO language code for the question (default "nl")

    Returns:
        Decoded JSON response as a dictionary

    Raises:
        httpx.HTTPError: on transport failure or non-2xx status
    """
    base = api_url.rstrip('/')

    with httpx.Client(timeout=timeout) as client:
        response = client.post(
            f"{base}/api/rag/dspy/query",
            json={
                "question": question,
                "language": language,
                "include_visualization": True,
                "use_agent": False,
            },
        )
        response.raise_for_status()
        return response.json()

def evaluate_example(
    example: GoldenExample,
    api_url: str,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Run one golden example through the live API and score the reply.

    Args:
        example: The golden example holding the question and expectations
        api_url: Base URL of the RAG API
        verbose: Whether to print a one-line pass/fail summary

    Returns:
        Result dictionary with example metadata, the API answer, timing,
        per-metric scores, and a ``success`` flag (False on any error)
    """
    result: dict[str, Any] = {
        "id": example.id,
        "category": example.category,
        "subcategory": example.subcategory,
        "question": example.question,
        "expected_count": example.expected_count,
        "expected_slots": example.expected_slots,
    }

    try:
        started = time.time()
        response = call_rag_api(example.question, api_url)
        result["response_time_ms"] = int((time.time() - started) * 1000)

        result["answer"] = response.get("answer", "")
        result["visualization"] = response.get("visualization")

        # Prefer the structured count field; fall back to parsing the answer text.
        count = response.get("count")
        if count is None:
            count = extract_count_from_answer(result["answer"])
        result["actual_count"] = count

        result["count_score"] = count_accuracy(example, response)
        result["slot_score"] = slot_extraction_accuracy(example, response)
        result["score"] = heritage_rag_metric(example, response)

        result["success"] = True

    except httpx.HTTPError as e:
        # Network/HTTP failures are recorded on the row, not propagated.
        result.update(success=False, error=f"HTTP error: {e}", score=0.0)
    except Exception as e:
        # Catch-all boundary: a bad example must not abort the whole run.
        result.update(success=False, error=f"Error: {e}", score=0.0)

    if verbose:
        mark = "✓" if result.get("score", 0) >= 0.8 else "✗"
        print(
            f" {mark} [{example.id}] {example.question[:40]}..."
            f" -> {result.get('actual_count')} (expected {example.expected_count})"
        )

    return result

def run_evaluation(
    api_url: str = "https://archief.support",
    dataset_path: str | Path | None = None,
    category_filter: str | None = None,
    max_examples: int | None = None,
    output_file: str | Path | None = None,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Run evaluation of the RAG system against the golden dataset.

    Args:
        api_url: Base URL of the RAG API
        dataset_path: Path to the golden dataset JSON
        category_filter: Only evaluate examples of this category
        max_examples: Maximum number of examples to evaluate
        output_file: Path to write detailed results JSON
        verbose: Whether to print progress

    Returns:
        Summary dictionary (timestamp, pass rate, averages, per-example
        ``results``); also written to ``output_file`` when given
    """
    if verbose:
        # Fixed message — no f-string needed (was an F541 lint hit).
        print("Loading golden dataset...")

    examples = load_golden_dataset(
        dataset_path=dataset_path,
        category_filter=category_filter,
        max_examples=max_examples,
    )

    if verbose:
        print(f"Loaded {len(examples)} examples")
        print(f"Evaluating against {api_url}...")
        print()

    results = []
    for i, example in enumerate(examples, start=1):
        if verbose:
            print(f"[{i}/{len(examples)}]", end=" ")
        results.append(evaluate_example(example, api_url, verbose=verbose))

    # Summary statistics; max(..., 1) guards division on an empty run.
    total = len(results)
    successful = sum(1 for r in results if r.get("success"))
    passed = sum(1 for r in results if r.get("score", 0) >= 0.8)
    avg_score = sum(r.get("score", 0) for r in results) / max(total, 1)
    # response_time_ms is only present on successful calls, so average over those.
    avg_response_time = sum(r.get("response_time_ms", 0) for r in results) / max(successful, 1)

    summary = {
        "timestamp": datetime.now().isoformat(),
        "api_url": api_url,
        "total_examples": total,
        "successful_calls": successful,
        "passed_examples": passed,
        "pass_rate": passed / max(total, 1),
        "average_score": avg_score,
        "average_response_time_ms": int(avg_response_time),
        "category_filter": category_filter,
        "results": results,
    }

    # Write detailed results to file
    if output_file:
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)
        if verbose:
            print(f"\nDetailed results written to: {output_path}")

    # Print summary report
    if verbose:
        print()
        print(format_evaluation_report(results))

    return summary

def main():
    """CLI entry point: parse arguments, run the evaluation, set exit status.

    Exits non-zero when the pass rate falls below the configured threshold,
    the dataset file is missing, or the API is unreachable.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate Heritage RAG system against golden dataset"
    )
    parser.add_argument(
        "--api-url",
        default="https://archief.support",
        help="Base URL of the RAG API (default: https://archief.support)"
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="Path to golden dataset JSON file"
    )
    parser.add_argument(
        "--category",
        type=str,
        default=None,
        help="Only evaluate examples of this category (e.g., 'count')"
    )
    parser.add_argument(
        "--max-examples",
        type=int,
        default=None,
        help="Maximum number of examples to evaluate"
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Path to write detailed results JSON"
    )
    # Previously hard-coded to 0.8 in both the check and the message;
    # exposed as a flag so CI can tune the gate. Default keeps old behavior.
    parser.add_argument(
        "--pass-threshold",
        type=float,
        default=0.8,
        help="Minimum pass rate before exiting non-zero (default: 0.8)"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Print progress and detailed output"
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output summary as JSON (for CI integration)"
    )

    args = parser.parse_args()

    try:
        summary = run_evaluation(
            api_url=args.api_url,
            dataset_path=args.dataset,
            category_filter=args.category,
            max_examples=args.max_examples,
            output_file=args.output,
            verbose=args.verbose,
        )

        if args.json:
            # Output just the summary (without full results) as JSON
            json_summary = {k: v for k, v in summary.items() if k != "results"}
            print(json.dumps(json_summary, indent=2))

        # Exit with error if pass rate is below threshold (CI gate).
        if summary["pass_rate"] < args.pass_threshold:
            print(
                f"\n⚠️ Pass rate {summary['pass_rate']:.1%} is below "
                f"{args.pass_threshold:.0%} threshold",
                file=sys.stderr,
            )
            sys.exit(1)

    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except httpx.HTTPError as e:
        print(f"API error: {e}", file=sys.stderr)
        sys.exit(1)

# Allow running directly as a script as well as via ``python -m``.
if __name__ == "__main__":
    main()