# CI/CD Integration for RAG Evaluation
## Overview
This document describes how to integrate DSPy evaluation and Playwright E2E tests into the CI/CD pipeline to create automated quality gates.
## Pipeline Architecture
┌─────────────────────────────────────────────────────────────────┐
│ CI/CD Pipeline │
├─────────────────────────────────────────────────────────────────┤
│ │
│ PR Created │
│ │ │
│ ▼ │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ Parallel Jobs │ │
│ ├──────────────────────────────────────────────────────────┤ │
│ │ │ │
│ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │
│ │ │ Lint │ │ Unit │ │ Type │ │ │
│ │ │ Check │ │ Tests │ │ Check │ │ │
│ │ └────────────┘ └────────────┘ └────────────┘ │ │
│ │ │ │
│ └──────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ (if passed) │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ Deploy to Staging │ │
│ └──────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ RAG Quality Gates │ │
│ ├──────────────────────────────────────────────────────────┤ │
│ │ │ │
│ │ ┌────────────────┐ ┌────────────────┐ │ │
│ │ │ DSPy Evaluation │ │ Playwright │ │ │
│ │ │ (API Tests) │ │ E2E Tests │ │ │
│ │ └────────────────┘ └────────────────┘ │ │
│ │ │ │
│ └──────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ (if quality >= threshold) │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ Merge to Main │ │
│ └──────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ Deploy to Production │ │
│ └──────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
## GitHub Actions Workflow

### Main Quality Gate Workflow
# .github/workflows/rag-quality-gate.yml
---
name: RAG Quality Gate

on:
  pull_request:
    paths:
      - 'backend/rag/**'
      - 'apps/archief-assistent/**'
      - 'data/sparql_templates.yaml'
  push:
    branches: [main]

env:
  STAGING_URL: https://staging.archief.support
  PRODUCTION_URL: https://archief.support

jobs:
  # ============================================
  # Stage 1: Basic checks (parallel)
  # ============================================
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install ruff mypy
      - name: Run linter
        run: ruff check backend/rag/

  typecheck:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install -r requirements.txt mypy
      - name: Run type check
        run: mypy backend/rag/ --ignore-missing-imports

  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install -r requirements.txt pytest
      - name: Run unit tests
        run: pytest tests/unit/ -v

  # ============================================
  # Stage 2: Deploy to staging
  # ============================================
  deploy-staging:
    needs: [lint, typecheck, unit-tests]
    runs-on: ubuntu-latest
    # Staging-based quality gates only run for PRs. On push to main this job
    # (and both quality jobs) is skipped; the quality-gate job below must
    # therefore tolerate 'skipped' results or production deploys would be
    # permanently blocked.
    if: github.event_name == 'pull_request'
    steps:
      - uses: actions/checkout@v4
      - name: Deploy RAG to staging
        run: |
          rsync -avz --exclude='__pycache__' \
            backend/rag/ \
            ${{ secrets.STAGING_USER }}@${{ secrets.STAGING_HOST }}:/opt/glam-backend/rag/
          ssh ${{ secrets.STAGING_USER }}@${{ secrets.STAGING_HOST }} \
            "systemctl restart glam-rag-api"
      - name: Wait for service to be ready
        # Poll the health endpoint for up to 60 s (30 tries x 2 s).
        run: |
          for i in {1..30}; do
            if curl -s ${{ env.STAGING_URL }}/api/health | grep -q "ok"; then
              echo "Service is ready"
              exit 0
            fi
            sleep 2
          done
          echo "Service failed to start"
          exit 1

  # ============================================
  # Stage 3: RAG Quality Tests (parallel)
  # ============================================
  dspy-evaluation:
    needs: deploy-staging
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install -r requirements.txt
      - name: Run DSPy evaluation
        id: evaluation
        run: |
          python -m backend.rag.evaluation.run_evaluation \
            --split test \
            --api-url ${{ env.STAGING_URL }} \
            --output-dir reports/dspy \
            --threshold 0.80
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      - name: Upload evaluation report
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: dspy-evaluation-report
          path: reports/dspy/
      - name: Comment on PR with results
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const reports = fs.readdirSync('reports/dspy/').filter(f => f.endsWith('.json'));
            if (reports.length === 0) return;
            const report = JSON.parse(fs.readFileSync(`reports/dspy/${reports[0]}`));
            const body = `## DSPy RAG Evaluation Results

            | Metric | Value |
            |--------|-------|
            | **Pass Rate** | ${(report.summary.pass_rate * 100).toFixed(1)}% |
            | **Average Score** | ${report.summary.average_score.toFixed(2)} |
            | **Passed** | ${report.summary.passed}/${report.summary.total_examples} |

            ### Category Breakdown

            | Category | Pass Rate |
            |----------|-----------|
            ${Object.entries(report.by_category).map(([cat, data]) =>
              `| ${cat} | ${(data.pass_rate * 100).toFixed(1)}% |`
            ).join('\n')}

            ${report.summary.pass_rate >= 0.80 ? '✅ **PASSED**' : '❌ **FAILED**'}
            `;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body
            });

  playwright-e2e:
    needs: deploy-staging
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          pip install -r requirements-test.txt
          playwright install chromium --with-deps
      - name: Run Playwright tests
        run: |
          pytest tests/e2e/ \
            -v \
            --html=reports/playwright/report.html \
            --self-contained-html
        env:
          TEST_BASE_URL: ${{ env.STAGING_URL }}
          TEST_USERNAME: ${{ secrets.TEST_USERNAME }}
          TEST_PASSWORD: ${{ secrets.TEST_PASSWORD }}
      - name: Upload Playwright report
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: playwright-report
          path: reports/playwright/

  # ============================================
  # Stage 4: Quality Gate Decision
  # ============================================
  quality-gate:
    needs: [dspy-evaluation, playwright-e2e]
    runs-on: ubuntu-latest
    if: always()
    steps:
      - name: Check quality gate
        # 'skipped' is allowed: on push to main the staging pipeline does
        # not run by design (gates already ran on the PR). Requiring
        # strictly 'success' here would block every production deploy.
        # 'failure' and 'cancelled' always block.
        run: |
          check() {
            case "$2" in
              success|skipped) echo "✅ $1: $2" ;;
              *) echo "❌ $1: $2"; exit 1 ;;
            esac
          }
          check "DSPy evaluation" "${{ needs.dspy-evaluation.result }}"
          check "Playwright E2E tests" "${{ needs.playwright-e2e.result }}"
          echo "✅ All quality gates passed"

  # ============================================
  # Stage 5: Deploy to production (main branch only)
  # ============================================
  deploy-production:
    needs: quality-gate
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
    steps:
      - uses: actions/checkout@v4
      - name: Deploy RAG to production
        run: |
          rsync -avz --exclude='__pycache__' \
            backend/rag/ \
            root@${{ secrets.PROD_HOST }}:/opt/glam-backend/rag/
          ssh root@${{ secrets.PROD_HOST }} \
            "systemctl restart glam-rag-api"
      - name: Verify production health
        run: |
          sleep 10
          curl -f ${{ env.PRODUCTION_URL }}/api/health
      - name: Run smoke test on production
        run: |
          response=$(curl -s "${{ env.PRODUCTION_URL }}/api/rag/dspy/query" \
            -H "Content-Type: application/json" \
            -d '{"question": "Hoeveel archieven zijn er in Utrecht?"}')
          if echo "$response" | grep -q "archieven"; then
            echo "✅ Production smoke test passed"
          else
            echo "❌ Production smoke test failed"
            exit 1
          fi
### Scheduled Regression Testing
# .github/workflows/scheduled-regression.yml
---
name: Scheduled Regression Tests

on:
  schedule:
    - cron: '0 6 * * *'  # Daily at 6 AM
  workflow_dispatch:

jobs:
  full-regression:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install -r requirements.txt playwright pytest
      - name: Install Playwright browsers
        run: playwright install chromium --with-deps
      - name: Run full DSPy evaluation
        run: |
          python -m backend.rag.evaluation.run_evaluation \
            --split test \
            --api-url https://archief.support \
            --output-dir reports/daily \
            --threshold 0.80
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      - name: Run full E2E suite
        # Run even when the DSPy evaluation failed, so a single nightly run
        # reports on both suites instead of stopping at the first failure.
        if: always()
        run: |
          pytest tests/e2e/ \
            -v \
            --html=reports/playwright/daily_report.html
        env:
          TEST_BASE_URL: https://archief.support
      - name: Upload reports
        # Upload on failure too — failing runs are exactly when the reports
        # are needed for diagnosis.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: daily-regression-reports
          path: reports/
      - name: Notify on failure
        if: failure()
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: '🚨 Daily Regression Test Failed',
              body: `Daily regression tests failed. Check the [workflow run](${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) for details.`,
              labels: ['bug', 'regression']
            });
## Quality Thresholds

### Threshold Configuration
# config/quality_thresholds.yaml
---
thresholds:
  # Overall pass rate
  overall:
    blocking: 0.80  # Below this, PR is blocked
    warning: 0.85  # Below this, warning is issued
    target: 0.90  # Goal for production

  # Category-specific thresholds
  categories:
    count:
      blocking: 0.90  # COUNT queries should be highly accurate
      target: 0.95
    list:
      blocking: 0.80
      target: 0.90
    detail:
      blocking: 0.75
      target: 0.85
    person:
      blocking: 0.70  # Person queries are harder
      target: 0.80

  # E2E test pass rate
  e2e:
    blocking: 1.0  # All E2E tests must pass
## Reporting and Dashboards

### Metrics to Track
# backend/rag/evaluation/metrics_reporter.py
from datetime import datetime

# Metric name -> human-readable description. The keys double as the list of
# metric names passed to compute_metric() when building a report.
TRACKED_METRICS = {
    "pass_rate": "Overall percentage of passing examples",
    "average_score": "Mean composite metric score",
    "count_accuracy": "Accuracy of COUNT queries",
    "slot_extraction": "Accuracy of slot extraction",
    "answer_relevance": "LLM-judged answer relevance",
    "faithfulness": "Grounding in retrieved context",
    "latency_p50": "Median response time (ms)",
    "latency_p95": "95th percentile response time (ms)",
}


def generate_metrics_report(results: list[dict]) -> dict:
    """Generate comprehensive metrics report.

    Args:
        results: Per-example evaluation result dicts.

    Returns:
        Report dict with an ISO-8601 ``timestamp``, a ``metrics`` mapping
        (one entry per key in TRACKED_METRICS), a ``trend`` summary and any
        detected ``regressions``.

    Note:
        Relies on compute_metric / compute_trend / detect_regressions
        defined elsewhere in this module.
    """
    return {
        "timestamp": datetime.now().isoformat(),
        "metrics": {
            name: compute_metric(results, name)
            for name in TRACKED_METRICS
        },
        "trend": compute_trend(results),
        "regressions": detect_regressions(results),
    }
### Dashboard Integration
# Push metrics to monitoring system (e.g., Datadog, Grafana)
def push_metrics_to_datadog(report: dict) -> None:
    """Push evaluation metrics to Datadog.

    Args:
        report: Report as produced by generate_metrics_report(); reads the
            "metrics" mapping and the "regressions" list.
    """
    # Imported lazily so the module does not hard-require the datadog
    # package when this reporter is unused.
    from datadog import statsd

    for metric, value in report["metrics"].items():
        statsd.gauge(f"rag.evaluation.{metric}", value)

    # Track regressions as events
    for regression in report["regressions"]:
        statsd.event(
            title=f"RAG Regression Detected: {regression['category']}",
            text=regression["description"],
            alert_type="warning",
        )
## Local Development Testing

### Pre-push Hook
#!/bin/bash
# .git/hooks/pre-push — quick RAG smoke evaluation before every push.

echo "Running quick RAG evaluation..."

# Quick smoke test (5 examples) against the locally running API.
# Test the command directly instead of inspecting $? afterwards (SC2181).
if ! python -m backend.rag.evaluation.run_evaluation \
    --max-examples 5 \
    --api-url http://localhost:8010 \
    --threshold 0.70; then
  echo "❌ RAG evaluation failed. Push blocked."
  exit 1
fi

echo "✅ Quick evaluation passed"
### Makefile Targets
# Makefile
.PHONY: test-rag eval-quick eval-full e2e-test e2e-test-headless

# Run the quick evaluation plus the headed E2E suite.
test-rag: eval-quick e2e-test
	@echo "All RAG tests passed"

# Fast smoke evaluation (10 examples) against a locally running API.
eval-quick:
	python -m backend.rag.evaluation.run_evaluation \
		--max-examples 10 \
		--api-url http://localhost:8010

# Full evaluation over the held-out test split.
eval-full:
	python -m backend.rag.evaluation.run_evaluation \
		--split test \
		--api-url http://localhost:8010

# Interactive (headed browser) E2E run for local debugging.
e2e-test:
	pytest tests/e2e/ -v --headed

# Headless E2E run, matching CI behaviour.
e2e-test-headless:
	pytest tests/e2e/ -v