12 KiB
12 KiB
Evaluation Harness Specification
Overview
The evaluation harness is a command-line tool that runs DSPy evaluations against the RAG system, generating detailed reports.
Architecture
┌─────────────────────────────────────────────────────────────────┐
│ Evaluation Harness │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Dataset │───▶│ Pipeline │───▶│ Metrics │ │
│ │ Loader │ │ Executor │ │ Scorer │ │
│ └──────────────┘ └──────────────┘ └──────────────┘ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Golden │ │ RAG API │ │ Score │ │
│ │ Dataset │ │ (HTTP) │ │ Aggregator │ │
│ └──────────────┘ └──────────────┘ └──────────────┘ │
│ │ │
│ ▼ │
│ ┌──────────────┐ │
│ │ Report │ │
│ │ Generator │ │
│ └──────────────┘ │
└─────────────────────────────────────────────────────────────────┘
Implementation
Main Evaluation Script
#!/usr/bin/env python
"""
backend/rag/evaluation/run_evaluation.py
Run DSPy evaluation against the Heritage RAG system.
"""
import argparse
import asyncio
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import dspy
from dspy import Evaluate, Example
from .metrics import heritage_rag_metric, count_accuracy, slot_extraction_accuracy
from .dataset_loader import load_golden_dataset, split_by_category
class RAGEvaluator:
    """Evaluation harness for the Heritage RAG system.

    Loads the golden dataset, sends each example to the RAG HTTP API,
    scores the responses with the DSPy metric functions, and writes a
    timestamped JSON report to ``output_dir``.
    """

    def __init__(
        self,
        dataset_path: str = "data/rag_eval/golden_dataset.json",
        api_url: str = "http://localhost:8010",
        output_dir: str = "reports/evaluation",
        pass_threshold: float = 0.8,
    ):
        """Configure dataset location, API endpoint, and report directory.

        Args:
            dataset_path: Path to the golden dataset JSON file.
            api_url: Base URL of the running RAG API.
            output_dir: Directory for JSON reports (created if missing).
            pass_threshold: Minimum composite score for a single example to
                count as passed. Defaults to 0.8, the value that was
                previously hard-coded in ``evaluate_single``.
        """
        self.dataset_path = Path(dataset_path)
        self.api_url = api_url
        self.output_dir = Path(output_dir)
        self.pass_threshold = pass_threshold
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def load_dataset(self, split: str = "dev") -> list[Example]:
        """Load the golden dataset as DSPy Examples.

        NOTE(review): ``split`` is currently unused — the full dataset is
        returned regardless. Confirm whether the golden dataset carries
        split information before relying on this argument.
        """
        data = load_golden_dataset(self.dataset_path)
        examples = []
        for ex in data['examples']:
            example = Example(**ex).with_inputs('question', 'language')
            examples.append(example)
        return examples

    async def evaluate_single(self, example: Example) -> dict:
        """Run one example through the RAG API and score the response.

        Returns:
            A result dict with the example id, its category, expected vs.
            actual values, per-metric scores, and a pass/fail flag.

        Raises:
            httpx.HTTPStatusError: if the API responds with an error status.
        """
        import httpx  # imported lazily so report-only code paths don't need it

        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                f"{self.api_url}/api/rag/dspy/query",
                json={
                    "question": example.question,
                    "language": example.get('language', 'nl'),
                }
            )
            # Fail fast on HTTP errors instead of scoring an error payload.
            response.raise_for_status()
            result = response.json()

        # Wrap the API response as a DSPy prediction for the metric functions.
        pred = dspy.Prediction(
            answer=result.get('answer'),
            sparql=result.get('sparql'),
            sparql_results=result.get('institutions', []),
            slots=result.get('slots', {}),
        )

        scores = {
            'count_accuracy': count_accuracy(example, pred),
            'slot_extraction': slot_extraction_accuracy(example, pred),
            'composite': heritage_rag_metric(example, pred),
        }

        return {
            'example_id': example.id,
            'question': example.question,
            # Propagate the category so generate_report() can build the
            # per-category breakdown (previously results carried no category
            # and every one landed in the 'unknown' bucket).
            'category': example.get('category', 'unknown'),
            'expected': {
                'count': example.get('expected_count'),
                'slots': example.get('expected_slots'),
            },
            'actual': {
                'answer': result.get('answer'),
                'sparql': result.get('sparql'),
            },
            'scores': scores,
            'passed': scores['composite'] >= self.pass_threshold,
        }

    async def run_evaluation(
        self,
        split: str = "dev",
        categories: Optional[list[str]] = None,
        max_examples: Optional[int] = None,
    ) -> dict:
        """Evaluate the dataset and write a timestamped JSON report.

        Args:
            split: Dataset split name (forwarded to ``load_dataset``).
            categories: If given, only evaluate examples whose category is
                in this list.
            max_examples: If given, cap the number of evaluated examples.

        Returns:
            The report dict produced by ``generate_report``.
        """
        examples = self.load_dataset(split)
        if categories:
            examples = [e for e in examples if e.get('category') in categories]
        if max_examples:
            examples = examples[:max_examples]

        print(f"Evaluating {len(examples)} examples...")
        results = []
        for i, example in enumerate(examples):
            print(f" [{i+1}/{len(examples)}] {example.question[:50]}...")
            result = await self.evaluate_single(example)
            results.append(result)

        # Aggregate scores into the report structure.
        report = self.generate_report(results)

        # Persist the report under a timestamped filename.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_path = self.output_dir / f"eval_{split}_{timestamp}.json"
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=2)

        print(f"\nReport saved to: {report_path}")
        return report

    def generate_report(self, results: list[dict]) -> dict:
        """Aggregate per-example results into a summary report dict.

        Produces overall pass/fail counts, per-category pass rates and
        average scores, the failed examples, and the raw results.
        """
        passed = sum(1 for r in results if r['passed'])
        total = len(results)

        # Per-category breakdown; 'unknown' catches results without a category.
        by_category = {}
        for r in results:
            cat = r.get('category') or 'unknown'
            bucket = by_category.setdefault(
                cat, {'passed': 0, 'total': 0, 'scores': []}
            )
            bucket['total'] += 1
            bucket['scores'].append(r['scores']['composite'])
            if r['passed']:
                bucket['passed'] += 1

        return {
            'timestamp': datetime.now().isoformat(),
            'summary': {
                'total_examples': total,
                'passed': passed,
                'failed': total - passed,
                # Guard divisions so an empty run reports 0 instead of raising.
                'pass_rate': passed / total if total > 0 else 0,
                'average_score': sum(r['scores']['composite'] for r in results) / total if total > 0 else 0,
            },
            'by_category': {
                cat: {
                    'pass_rate': d['passed'] / d['total'] if d['total'] > 0 else 0,
                    'average_score': sum(d['scores']) / len(d['scores']) if d['scores'] else 0,
                }
                for cat, d in by_category.items()
            },
            'failed_examples': [r for r in results if not r['passed']],
            'all_results': results,
        }
def main():
    """CLI entry point: run the evaluation, print a summary, exit 0/1.

    Exits with status 1 when the overall pass rate falls below the
    ``--threshold`` value, so the command can gate CI pipelines.
    """
    cli = argparse.ArgumentParser(description="Run RAG evaluation")
    cli.add_argument('--split', default='dev', choices=['train', 'dev', 'test'])
    cli.add_argument('--categories', nargs='+', help='Filter by category')
    cli.add_argument('--max-examples', type=int, help='Max examples to evaluate')
    cli.add_argument('--api-url', default='http://localhost:8010')
    cli.add_argument('--output-dir', default='reports/evaluation')
    cli.add_argument('--threshold', type=float, default=0.8, help='Pass threshold')
    opts = cli.parse_args()

    harness = RAGEvaluator(
        api_url=opts.api_url,
        output_dir=opts.output_dir,
    )
    report = asyncio.run(
        harness.run_evaluation(
            split=opts.split,
            categories=opts.categories,
            max_examples=opts.max_examples,
        )
    )

    # Human-readable summary on stdout.
    summary = report['summary']
    banner = "=" * 60
    print("\n" + banner)
    print("EVALUATION SUMMARY")
    print(banner)
    print(f"Pass Rate: {summary['pass_rate']:.1%}")
    print(f"Average Score: {summary['average_score']:.2f}")
    print(f"Passed: {summary['passed']}/{summary['total_examples']}")

    # Gate on the threshold: exit code drives CI pass/fail.
    if summary['pass_rate'] >= opts.threshold:
        print(f"\n✅ PASSED: Pass rate meets threshold ({opts.threshold:.0%})")
        sys.exit(0)
    print(f"\n❌ FAILED: Pass rate below threshold ({opts.threshold:.0%})")
    sys.exit(1)


if __name__ == '__main__':
    main()
CLI Usage
# Run full evaluation on dev set
python -m backend.rag.evaluation.run_evaluation --split dev
# Run only COUNT queries
python -m backend.rag.evaluation.run_evaluation --categories count
# Quick smoke test (10 examples)
python -m backend.rag.evaluation.run_evaluation --max-examples 10
# Run against production
python -m backend.rag.evaluation.run_evaluation --api-url https://archief.support
# Set custom threshold
python -m backend.rag.evaluation.run_evaluation --threshold 0.85
Report Format
{
"timestamp": "2025-01-09T15:30:00",
"summary": {
"total_examples": 200,
"passed": 180,
"failed": 20,
"pass_rate": 0.90,
"average_score": 0.87
},
"by_category": {
"count": {"pass_rate": 0.95, "average_score": 0.92},
"list": {"pass_rate": 0.88, "average_score": 0.85},
"detail": {"pass_rate": 0.85, "average_score": 0.82}
},
"failed_examples": [
{
"example_id": "count_042",
"question": "Hoeveel bibliotheken zijn er in Zeeland?",
"expected": {"count": 15},
"actual": {"answer": "Er zijn 12 bibliotheken..."},
"scores": {"composite": 0.65}
}
]
}
Integration with pytest
# tests/evaluation/test_rag_quality.py
import pytest
from backend.rag.evaluation.run_evaluation import RAGEvaluator
@pytest.fixture
def evaluator():
    """Provide a RAGEvaluator wired to the local dev API."""
    harness = RAGEvaluator(api_url="http://localhost:8010")
    return harness
@pytest.mark.asyncio
async def test_count_queries(evaluator):
    """COUNT queries should have >90% accuracy."""
    report = await evaluator.run_evaluation(split='test', categories=['count'])
    pass_rate = report['summary']['pass_rate']
    assert pass_rate >= 0.90
@pytest.mark.asyncio
async def test_list_queries(evaluator):
    """LIST queries should have >85% accuracy."""
    report = await evaluator.run_evaluation(split='test', categories=['list'])
    pass_rate = report['summary']['pass_rate']
    assert pass_rate >= 0.85
@pytest.mark.asyncio
async def test_overall_quality(evaluator):
    """Overall pass rate should be >80%."""
    result = await evaluator.run_evaluation(split='test')
    assert result['summary']['pass_rate'] >= 0.80