glam/docs/plan/dspy_rag_automation/03-evaluation-harness.md
2026-01-09 20:35:19 +01:00

12 KiB

Evaluation Harness Specification

Overview

The evaluation harness is a command-line tool that runs DSPy evaluations against the RAG system, generating detailed reports.

Architecture

┌─────────────────────────────────────────────────────────────────┐
│                    Evaluation Harness                            │
├─────────────────────────────────────────────────────────────────┤
│                                                                  │
│  ┌──────────────┐    ┌──────────────┐    ┌──────────────┐       │
│  │   Dataset    │───▶│   Pipeline   │───▶│   Metrics    │       │
│  │   Loader     │    │   Executor   │    │   Scorer     │       │
│  └──────────────┘    └──────────────┘    └──────────────┘       │
│          │                   │                   │               │
│          ▼                   ▼                   ▼               │
│  ┌──────────────┐    ┌──────────────┐    ┌──────────────┐       │
│  │   Golden     │    │   RAG API    │    │   Score      │       │
│  │   Dataset    │    │   (HTTP)     │    │   Aggregator │       │
│  └──────────────┘    └──────────────┘    └──────────────┘       │
│                                                  │               │
│                                                  ▼               │
│                                         ┌──────────────┐        │
│                                         │   Report     │        │
│                                         │   Generator  │        │
│                                         └──────────────┘        │
└─────────────────────────────────────────────────────────────────┘

Implementation

Main Evaluation Script

#!/usr/bin/env python
"""
backend/rag/evaluation/run_evaluation.py

Run DSPy evaluation against the Heritage RAG system.
"""

import argparse
import asyncio
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

import dspy
from dspy import Evaluate, Example

from .metrics import heritage_rag_metric, count_accuracy, slot_extraction_accuracy
from .dataset_loader import load_golden_dataset, split_by_category


class RAGEvaluator:
    """Evaluation harness for the Heritage RAG system.

    Loads the golden dataset, queries the RAG HTTP API for every example,
    scores each response with the DSPy metric functions, and writes a
    timestamped JSON report to ``output_dir``.
    """

    def __init__(
        self,
        dataset_path: str = "data/rag_eval/golden_dataset.json",
        api_url: str = "http://localhost:8010",
        output_dir: str = "reports/evaluation",
        pass_threshold: float = 0.8,
    ):
        """
        Args:
            dataset_path: Path to the golden dataset JSON file.
            api_url: Base URL of the RAG API under evaluation.
            output_dir: Directory for JSON reports; created if missing.
            pass_threshold: Minimum composite score for a single example to
                count as passed. Defaults to 0.8, which was previously
                hard-coded inside evaluate_single().
        """
        self.dataset_path = Path(dataset_path)
        self.api_url = api_url
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.pass_threshold = pass_threshold

    def load_dataset(self, split: str = "dev") -> list[Example]:
        """Load the golden dataset as DSPy Examples.

        Examples that carry an explicit ``split`` field are filtered to the
        requested split; examples without one are always included.
        NOTE(review): the original ignored ``split`` entirely — confirm
        whether the dataset schema tags examples with a split.
        """
        data = load_golden_dataset(self.dataset_path)

        examples = []
        for ex in data['examples']:
            # Skip examples explicitly tagged with a different split.
            if ex.get('split', split) != split:
                continue
            examples.append(Example(**ex).with_inputs('question', 'language'))

        return examples

    async def evaluate_single(self, example: Example) -> dict:
        """Evaluate a single example against the RAG API.

        Returns a dict with identifying fields (including ``category``,
        which generate_report() needs for its per-category breakdown),
        expected vs. actual values, per-metric scores, and a pass flag.
        """
        import httpx

        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                f"{self.api_url}/api/rag/dspy/query",
                json={
                    "question": example.question,
                    "language": example.get('language', 'nl'),
                }
            )
            result = response.json()

        # Wrap the raw API response so the DSPy metric functions can score it.
        pred = dspy.Prediction(
            answer=result.get('answer'),
            sparql=result.get('sparql'),
            sparql_results=result.get('institutions', []),
            slots=result.get('slots', {}),
        )

        # Compute metrics
        scores = {
            'count_accuracy': count_accuracy(example, pred),
            'slot_extraction': slot_extraction_accuracy(example, pred),
            'composite': heritage_rag_metric(example, pred),
        }

        return {
            'example_id': example.id,
            'question': example.question,
            # Carried through so generate_report() can break results down by
            # category; this key was previously missing, so every result
            # landed in the 'unknown' bucket.
            'category': example.get('category'),
            'expected': {
                'count': example.get('expected_count'),
                'slots': example.get('expected_slots'),
            },
            'actual': {
                'answer': result.get('answer'),
                'sparql': result.get('sparql'),
            },
            'scores': scores,
            'passed': scores['composite'] >= self.pass_threshold,
        }

    async def run_evaluation(
        self,
        split: str = "dev",
        categories: Optional[list[str]] = None,
        max_examples: Optional[int] = None,
    ) -> dict:
        """Run the full evaluation on the dataset.

        Args:
            split: Dataset split to evaluate ('train', 'dev' or 'test').
            categories: Optional whitelist of example categories.
            max_examples: Optional cap on the number of examples evaluated.

        Returns:
            The aggregated report dict (also written to ``output_dir``).
        """
        examples = self.load_dataset(split)

        if categories:
            examples = [e for e in examples if e.get('category') in categories]

        if max_examples:
            examples = examples[:max_examples]

        print(f"Evaluating {len(examples)} examples...")

        results = []
        for i, example in enumerate(examples):
            print(f"  [{i+1}/{len(examples)}] {example.question[:50]}...")
            results.append(await self.evaluate_single(example))

        # Aggregate scores
        report = self.generate_report(results)

        # Save report; UTF-8 + ensure_ascii=False so Dutch question text
        # survives round-tripping readably.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_path = self.output_dir / f"eval_{split}_{timestamp}.json"
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        print(f"\nReport saved to: {report_path}")
        return report

    def generate_report(self, results: list[dict]) -> dict:
        """Aggregate per-example results into a summary report.

        Produces overall pass/fail counts, a per-category breakdown, the
        list of failed examples, and the raw per-example results.
        """
        passed = sum(1 for r in results if r['passed'])
        total = len(results)

        # Category breakdown; results lacking a category fall back to
        # 'unknown'.
        by_category = {}
        for r in results:
            cat = r.get('category') or 'unknown'
            bucket = by_category.setdefault(
                cat, {'passed': 0, 'total': 0, 'scores': []}
            )
            bucket['total'] += 1
            bucket['scores'].append(r['scores']['composite'])
            if r['passed']:
                bucket['passed'] += 1

        return {
            'timestamp': datetime.now().isoformat(),
            'summary': {
                'total_examples': total,
                'passed': passed,
                'failed': total - passed,
                'pass_rate': passed / total if total > 0 else 0,
                'average_score': sum(r['scores']['composite'] for r in results) / total if total > 0 else 0,
            },
            'by_category': {
                cat: {
                    'pass_rate': d['passed'] / d['total'] if d['total'] > 0 else 0,
                    'average_score': sum(d['scores']) / len(d['scores']) if d['scores'] else 0,
                }
                for cat, d in by_category.items()
            },
            'failed_examples': [r for r in results if not r['passed']],
            'all_results': results,
        }


def main():
    """Command-line entry point: run the evaluation, print a summary, and
    exit with status 0 when the pass rate meets the threshold, 1 otherwise."""
    parser = argparse.ArgumentParser(description="Run RAG evaluation")
    parser.add_argument('--split', default='dev', choices=['train', 'dev', 'test'])
    parser.add_argument('--categories', nargs='+', help='Filter by category')
    parser.add_argument('--max-examples', type=int, help='Max examples to evaluate')
    parser.add_argument('--api-url', default='http://localhost:8010')
    parser.add_argument('--output-dir', default='reports/evaluation')
    parser.add_argument('--threshold', type=float, default=0.8, help='Pass threshold')
    args = parser.parse_args()

    evaluator = RAGEvaluator(api_url=args.api_url, output_dir=args.output_dir)

    report = asyncio.run(
        evaluator.run_evaluation(
            split=args.split,
            categories=args.categories,
            max_examples=args.max_examples,
        )
    )

    # Human-readable summary on stdout.
    summary = report['summary']
    banner = "=" * 60
    print("\n" + banner)
    print("EVALUATION SUMMARY")
    print(banner)
    print(f"Pass Rate: {summary['pass_rate']:.1%}")
    print(f"Average Score: {summary['average_score']:.2f}")
    print(f"Passed: {summary['passed']}/{summary['total_examples']}")

    # Gate on the aggregate pass rate for CI use.
    if summary['pass_rate'] >= args.threshold:
        print(f"\n✅ PASSED: Pass rate meets threshold ({args.threshold:.0%})")
        sys.exit(0)
    print(f"\n❌ FAILED: Pass rate below threshold ({args.threshold:.0%})")
    sys.exit(1)


if __name__ == '__main__':
    main()

CLI Usage

# Run full evaluation on dev set
python -m backend.rag.evaluation.run_evaluation --split dev

# Run only COUNT queries
python -m backend.rag.evaluation.run_evaluation --categories count

# Quick smoke test (10 examples)
python -m backend.rag.evaluation.run_evaluation --max-examples 10

# Run against production
python -m backend.rag.evaluation.run_evaluation --api-url https://archief.support

# Set custom threshold
python -m backend.rag.evaluation.run_evaluation --threshold 0.85

Report Format

{
  "timestamp": "2025-01-09T15:30:00",
  "summary": {
    "total_examples": 200,
    "passed": 180,
    "failed": 20,
    "pass_rate": 0.90,
    "average_score": 0.87
  },
  "by_category": {
    "count": {"pass_rate": 0.95, "average_score": 0.92},
    "list": {"pass_rate": 0.88, "average_score": 0.85},
    "detail": {"pass_rate": 0.85, "average_score": 0.82}
  },
  "failed_examples": [
    {
      "example_id": "count_042",
      "question": "Hoeveel bibliotheken zijn er in Zeeland?",
      "expected": {"count": 15},
      "actual": {"answer": "Er zijn 12 bibliotheken..."},
      "scores": {"composite": 0.65}
    }
  ]
}

Integration with pytest

# tests/evaluation/test_rag_quality.py

import pytest
from backend.rag.evaluation.run_evaluation import RAGEvaluator

@pytest.fixture
def evaluator():
    """Provide an evaluator wired to the locally running RAG API."""
    harness = RAGEvaluator(api_url="http://localhost:8010")
    return harness

@pytest.mark.asyncio
async def test_count_queries(evaluator):
    """COUNT queries should pass at a rate of at least 90%."""
    outcome = await evaluator.run_evaluation(split='test', categories=['count'])
    assert outcome['summary']['pass_rate'] >= 0.90

@pytest.mark.asyncio
async def test_list_queries(evaluator):
    """LIST queries should pass at a rate of at least 85%."""
    outcome = await evaluator.run_evaluation(split='test', categories=['list'])
    assert outcome['summary']['pass_rate'] >= 0.85

@pytest.mark.asyncio
async def test_overall_quality(evaluator):
    """Overall pass rate across all categories should be at least 80%."""
    outcome = await evaluator.run_evaluation(split='test')
    assert outcome['summary']['pass_rate'] >= 0.80