#!/usr/bin/env python3
"""
RAG Evaluation Harness

CLI tool for running evaluation of the Heritage RAG system against the
golden dataset. Calls the RAG API and computes metrics.

Usage:
    python -m backend.rag.evaluation.run_evaluation --help
    python -m backend.rag.evaluation.run_evaluation --api-url https://archief.support
    python -m backend.rag.evaluation.run_evaluation --max-examples 10 --verbose
"""

import argparse
import json
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Any

import httpx

from .dataset_loader import load_golden_dataset, GoldenExample
from .metrics import (
    count_accuracy,
    slot_extraction_accuracy,
    heritage_rag_metric,
    extract_count_from_answer,
    format_evaluation_report,
)


def call_rag_api(
    question: str,
    api_url: str,
    timeout: float = 30.0,
    language: str = "nl",
) -> dict[str, Any]:
    """
    Call the RAG API with a question and return the response.

    Args:
        question: The question to ask
        api_url: Base URL of the RAG API
        timeout: Request timeout in seconds
        language: Language code (default "nl")

    Returns:
        API response as dictionary
    """
    endpoint = f"{api_url.rstrip('/')}/api/rag/dspy/query"
    payload = {
        "question": question,
        "language": language,
        "include_visualization": True,
        "use_agent": False,
    }

    with httpx.Client(timeout=timeout) as client:
        response = client.post(endpoint, json=payload)
        response.raise_for_status()
        return response.json()
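
# The response schema is assumed rather than pinned down here: evaluate_example()
# below only reads the keys "answer", "count", and "visualization". An
# illustrative response consistent with that usage (all values made up; the real
# API may return additional fields):
#
#   {
#       "answer": "Er zijn 42 molens in Noord-Holland.",
#       "count": 42,
#       "visualization": {...}
#   }
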

def evaluate_example(
    example: GoldenExample,
    api_url: str,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Evaluate a single example against the RAG API.

    Args:
        example: The golden example to test
        api_url: Base URL of the RAG API
        verbose: Whether to print progress

    Returns:
        Evaluation result dictionary
    """
    result = {
        "id": example.id,
        "category": example.category,
        "subcategory": example.subcategory,
        "question": example.question,
        "expected_count": example.expected_count,
        "expected_slots": example.expected_slots,
    }

    try:
        start_time = time.time()
        response = call_rag_api(example.question, api_url)
        elapsed = time.time() - start_time

        result["response_time_ms"] = int(elapsed * 1000)
        result["answer"] = response.get("answer", "")
        result["visualization"] = response.get("visualization")

        # Extract actual count from response, falling back to parsing the answer text
        actual_count = response.get("count")
        if actual_count is None:
            actual_count = extract_count_from_answer(result["answer"])
        result["actual_count"] = actual_count

        # Calculate metrics
        result["count_score"] = count_accuracy(example, response)
        result["slot_score"] = slot_extraction_accuracy(example, response)
        result["score"] = heritage_rag_metric(example, response)
        result["success"] = True

    except httpx.HTTPError as e:
        result["success"] = False
        result["error"] = f"HTTP error: {e}"
        result["score"] = 0.0
    except Exception as e:
        result["success"] = False
        result["error"] = f"Error: {e}"
        result["score"] = 0.0

    if verbose:
        status = "✓" if result.get("score", 0) >= 0.8 else "✗"
        print(f" {status} [{example.id}] {example.question[:40]}... -> {result.get('actual_count')} (expected {example.expected_count})")

    return result


def run_evaluation(
    api_url: str = "https://archief.support",
    dataset_path: str | Path | None = None,
    category_filter: str | None = None,
    max_examples: int | None = None,
    output_file: str | Path | None = None,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Run evaluation of the RAG system against the golden dataset.

    Args:
        api_url: Base URL of the RAG API
        dataset_path: Path to the golden dataset JSON
        category_filter: Only evaluate examples of this category
        max_examples: Maximum number of examples to evaluate
        output_file: Path to write detailed results JSON
        verbose: Whether to print progress

    Returns:
        Summary of evaluation results
    """
    if verbose:
        print("Loading golden dataset...")

    examples = load_golden_dataset(
        dataset_path=dataset_path,
        category_filter=category_filter,
        max_examples=max_examples,
    )

    if verbose:
        print(f"Loaded {len(examples)} examples")
        print(f"Evaluating against {api_url}...")
        print()

    results = []
    for i, example in enumerate(examples):
        if verbose:
            print(f"[{i + 1}/{len(examples)}]", end=" ")
        result = evaluate_example(example, api_url, verbose=verbose)
        results.append(result)

    # Calculate summary statistics; an example "passes" when its score is >= 0.8
    total = len(results)
    successful = sum(1 for r in results if r.get("success"))
    passed = sum(1 for r in results if r.get("score", 0) >= 0.8)
    avg_score = sum(r.get("score", 0) for r in results) / max(total, 1)
    avg_response_time = sum(r.get("response_time_ms", 0) for r in results) / max(successful, 1)

    summary = {
        "timestamp": datetime.now().isoformat(),
        "api_url": api_url,
        "total_examples": total,
        "successful_calls": successful,
        "passed_examples": passed,
        "pass_rate": passed / max(total, 1),
        "average_score": avg_score,
        "average_response_time_ms": int(avg_response_time),
        "category_filter": category_filter,
        "results": results,
    }

    # Write detailed results to file
    if output_file:
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)
        if verbose:
            print(f"\nDetailed results written to: {output_path}")

    # Print summary report
    if verbose:
        print()
        print(format_evaluation_report(results))

    return summary


def main():
    """CLI entry point."""
    parser = argparse.ArgumentParser(
        description="Evaluate Heritage RAG system against golden dataset"
    )
    parser.add_argument(
        "--api-url",
        default="https://archief.support",
        help="Base URL of the RAG API (default: https://archief.support)",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="Path to golden dataset JSON file",
    )
    parser.add_argument(
        "--category",
        type=str,
        default=None,
        help="Only evaluate examples of this category (e.g., 'count')",
    )
    parser.add_argument(
        "--max-examples",
        type=int,
        default=None,
        help="Maximum number of examples to evaluate",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Path to write detailed results JSON",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print progress and detailed output",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output summary as JSON (for CI integration)",
    )

    args = parser.parse_args()

    try:
        summary = run_evaluation(
            api_url=args.api_url,
            dataset_path=args.dataset,
            category_filter=args.category,
            max_examples=args.max_examples,
            output_file=args.output,
            verbose=args.verbose,
        )

        if args.json:
            # Output just the summary (without full results) as JSON
            json_summary = {k: v for k, v in summary.items() if k != "results"}
            print(json.dumps(json_summary, indent=2))

        # Exit with error if pass rate is below threshold
        if summary["pass_rate"] < 0.8:
            print(
                f"\n⚠️ Pass rate {summary['pass_rate']:.1%} is below 80% threshold",
                file=sys.stderr,
            )
            sys.exit(1)

    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except httpx.HTTPError as e:
        print(f"API error: {e}", file=sys.stderr)
        sys.exit(1)
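
# Illustrative --json output for CI consumption. The keys mirror the summary
# dict built in run_evaluation() minus "results"; every value below is made up:
#
#   {
#       "timestamp": "2025-01-15T10:30:00.000000",
#       "api_url": "https://archief.support",
#       "total_examples": 50,
#       "successful_calls": 49,
#       "passed_examples": 43,
#       "pass_rate": 0.86,
#       "average_score": 0.91,
#       "average_response_time_ms": 1240,
#       "category_filter": null
#   }
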
if __name__ == "__main__":
    main()
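
# Sketch of programmatic use, e.g. as a CI gate in a hypothetical test file
# (the 0.8 threshold matches the one enforced in main() above):
#
#   from backend.rag.evaluation.run_evaluation import run_evaluation
#
#   def test_rag_eval_pass_rate():
#       summary = run_evaluation(max_examples=20)
#       assert summary["pass_rate"] >= 0.8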