glam/.forgejo/workflows/dspy-eval.yml
kempersc 44061eb736
Some checks failed
DSPy RAG Evaluation / Layer 1 - Unit Tests (push) Failing after 10m35s
DSPy RAG Evaluation / Layer 3 - Integration Tests (push) Has been skipped
DSPy RAG Evaluation / Layer 2 - DSPy Module Tests (push) Has been skipped
DSPy RAG Evaluation / Layer 4 - Comprehensive Evaluation (push) Has been skipped
DSPy RAG Evaluation / Quality Gate (push) Failing after 1s
fix(ci): add pytest-cov to resolve coverage flag error
pyproject.toml has --cov flags in addopts which require pytest-cov.
Added pytest-cov to all jobs that run pytest.
2026-01-13 13:13:56 +01:00

332 lines
12 KiB
YAML

# DSPy RAG Evaluation Workflow
# Automated testing and evaluation for Heritage RAG system
#
# Layers:
# - Layer 1: Fast unit tests (no LLM)
# - Layer 2: DSPy module tests with LLM
# - Layer 3: Integration tests (requires SSH tunnel to Oxigraph)
# - Layer 4: Comprehensive evaluation (nightly)
name: DSPy RAG Evaluation

on:
  push:
    branches: [master]
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'
      - '.forgejo/workflows/dspy-eval.yml'
  pull_request:
    branches: [master]
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'
  workflow_dispatch:
    inputs:
      evaluation_level:
        description: 'Evaluation depth'
        required: true
        default: 'standard'
        type: choice
        options:
          - smoke
          - standard
          - comprehensive
  schedule:
    # Nightly comprehensive evaluation at 2 AM UTC
    - cron: '0 2 * * *'

# Shared connection settings for the SSH tunnel jobs (Layers 3 and 4).
env:
  SERVER_IP: '91.98.224.44'
  SERVER_USER: 'root'
  VENV_PATH: '/opt/venv'
jobs:
  # ==========================================================================
  # Layer 1: Fast Unit Tests (no LLM calls)
  # ==========================================================================
  unit-tests:
    name: Layer 1 - Unit Tests
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - uses: https://github.com/actions/checkout@v4

      # Runner image is bare; install Python and create an isolated venv
      # (python3-full is required for venv on Debian-based images).
      - name: Setup Python with virtual environment
        run: |
          apt-get update
          apt-get install -y python3 python3-pip python3-venv python3-full
          python3 -m venv ${{ env.VENV_PATH }}
          echo "Python version: $(python3 --version)"
          echo "Venv created at ${{ env.VENV_PATH }}"

      # pytest-cov is needed because pyproject.toml passes --cov via addopts.
      - name: Install dependencies
        run: |
          source ${{ env.VENV_PATH }}/bin/activate
          pip install --upgrade pip
          pip install -e ".[dev]"
          pip install rapidfuzz pytest-cov

      - name: Run Layer 1 unit tests
        run: |
          source ${{ env.VENV_PATH }}/bin/activate
          python -m pytest tests/dspy_gitops/test_layer1_unit.py \
            -v --tb=short \
            --junit-xml=layer1-results.xml

      # Always upload so failing runs still leave inspectable results.
      - name: Upload test results
        uses: https://github.com/actions/upload-artifact@v4
        if: always()
        with:
          name: layer1-test-results
          path: layer1-results.xml
# ==========================================================================
# Layer 2: DSPy Module Tests (with LLM)
# ==========================================================================
dspy-module-tests:
name: Layer 2 - DSPy Module Tests
runs-on: ubuntu-latest
timeout-minutes: 20
needs: unit-tests
# Run on PRs, scheduled runs, or manual triggers
if: github.event_name == 'pull_request' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
steps:
- uses: https://github.com/actions/checkout@v4
- name: Setup Python with virtual environment
run: |
apt-get update
apt-get install -y python3 python3-pip python3-venv python3-full
python3 -m venv ${{ env.VENV_PATH }}
- name: Install dependencies
run: |
source ${{ env.VENV_PATH }}/bin/activate
pip install --upgrade pip
pip install -e ".[dev]"
pip install dspy-ai httpx rapidfuzz litellm pytest-cov
- name: Run Layer 2 DSPy tests
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
source ${{ env.VENV_PATH }}/bin/activate
python -m pytest tests/dspy_gitops/test_layer2_dspy.py \
-v --tb=short \
--junit-xml=layer2-results.xml
- name: Upload test results
uses: https://github.com/actions/upload-artifact@v4
if: always()
with:
name: layer2-test-results
path: layer2-results.xml
# ==========================================================================
# Layer 3: Integration Tests (requires SSH tunnel to Oxigraph)
# ==========================================================================
integration-tests:
name: Layer 3 - Integration Tests
runs-on: ubuntu-latest
timeout-minutes: 15
needs: unit-tests
steps:
- uses: https://github.com/actions/checkout@v4
- name: Setup Python with virtual environment
run: |
apt-get update
apt-get install -y openssh-client curl python3 python3-pip python3-venv python3-full
python3 -m venv ${{ env.VENV_PATH }}
- name: Install Python dependencies
run: |
source ${{ env.VENV_PATH }}/bin/activate
pip install --upgrade pip
pip install -e ".[dev]"
pip install httpx pytest-asyncio pytest-cov
- name: Setup SSH for tunnel
run: |
mkdir -p ~/.ssh
echo "${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true
- name: Create SSH tunnel to Oxigraph
run: |
# Create SSH tunnel: local port 7878 -> server localhost:7878
ssh -f -N -L 7878:127.0.0.1:7878 \
-i ~/.ssh/deploy_key \
-o StrictHostKeyChecking=no \
${{ env.SERVER_USER }}@${{ env.SERVER_IP }}
# Wait for tunnel to establish
sleep 3
# Verify tunnel is working
curl -sf "http://127.0.0.1:7878/query" \
-H "Accept: application/sparql-results+json" \
--data-urlencode "query=SELECT (1 AS ?test) WHERE {}" \
|| (echo "SSH tunnel failed" && exit 1)
echo "SSH tunnel established successfully"
- name: Run Layer 3 integration tests
env:
OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
source ${{ env.VENV_PATH }}/bin/activate
python -m pytest tests/dspy_gitops/test_layer3_integration.py \
-v --tb=short \
--junit-xml=layer3-results.xml
- name: Upload test results
uses: https://github.com/actions/upload-artifact@v4
if: always()
with:
name: layer3-test-results
path: layer3-results.xml
# ==========================================================================
# Layer 4: Comprehensive Evaluation (nightly only)
# ==========================================================================
comprehensive-eval:
name: Layer 4 - Comprehensive Evaluation
runs-on: ubuntu-latest
timeout-minutes: 60
needs: [unit-tests, dspy-module-tests, integration-tests]
# Only run on schedule or manual trigger with 'comprehensive'
if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.evaluation_level == 'comprehensive')
steps:
- uses: https://github.com/actions/checkout@v4
- name: Setup Python with virtual environment
run: |
apt-get update
apt-get install -y openssh-client curl python3 python3-pip python3-venv python3-full
python3 -m venv ${{ env.VENV_PATH }}
- name: Install Python dependencies
run: |
source ${{ env.VENV_PATH }}/bin/activate
pip install --upgrade pip
pip install -e ".[dev]"
pip install dspy-ai httpx rapidfuzz pandas pytest-json-report litellm pytest-cov
- name: Setup SSH for tunnel
run: |
mkdir -p ~/.ssh
echo "${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true
- name: Create SSH tunnel to Oxigraph
run: |
ssh -f -N -L 7878:127.0.0.1:7878 \
-i ~/.ssh/deploy_key \
-o StrictHostKeyChecking=no \
${{ env.SERVER_USER }}@${{ env.SERVER_IP }}
sleep 3
- name: Run comprehensive evaluation
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
run: |
source ${{ env.VENV_PATH }}/bin/activate
python -m pytest tests/dspy_gitops/test_layer4_comprehensive.py \
-v --tb=short \
--junit-xml=layer4-results.xml \
--json-report \
--json-report-file=eval-report.json
- name: Generate metrics summary
run: |
source ${{ env.VENV_PATH }}/bin/activate
python -c "
import json
from datetime import datetime
try:
with open('eval-report.json') as f:
report = json.load(f)
metrics = {
'timestamp': datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ'),
'commit': '${{ github.sha }}',
'total_tests': report.get('summary', {}).get('total', 0),
'passed': report.get('summary', {}).get('passed', 0),
'failed': report.get('summary', {}).get('failed', 0),
'duration': report.get('duration', 0),
}
with open('metrics.json', 'w') as f:
json.dump(metrics, f, indent=2)
print('Metrics saved to metrics.json')
print(json.dumps(metrics, indent=2))
except Exception as e:
print(f'Error generating metrics: {e}')
"
- name: Upload evaluation artifacts
uses: https://github.com/actions/upload-artifact@v4
with:
name: comprehensive-eval-results
path: |
layer4-results.xml
eval-report.json
metrics.json
# ==========================================================================
# Quality Gate Check
# ==========================================================================
quality-gate:
name: Quality Gate
runs-on: ubuntu-latest
needs: [unit-tests, dspy-module-tests, integration-tests]
if: always()
steps:
- name: Check all required tests passed
run: |
echo "Checking quality gates..."
# Layer 1 (unit tests) is always required
if [[ "${{ needs.unit-tests.result }}" != "success" ]]; then
echo "Layer 1 (Unit Tests) failed"
exit 1
fi
echo "Layer 1 (Unit Tests) passed"
# Layer 2 (DSPy module tests) required for PRs
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
if [[ "${{ needs.dspy-module-tests.result }}" != "success" ]]; then
echo "Layer 2 (DSPy Module Tests) failed - required for PRs"
exit 1
fi
echo "Layer 2 (DSPy Module Tests) passed"
fi
# Layer 3 (integration tests) is warning-only for now
if [[ "${{ needs.integration-tests.result }}" != "success" ]]; then
echo "Warning: Layer 3 (Integration Tests) failed - non-blocking"
else
echo "Layer 3 (Integration Tests) passed"
fi
echo ""
echo "============================================"
echo " All required quality gates passed!"
echo "============================================"