---
# DSPy RAG Evaluation Workflow
# Automated testing and evaluation for Heritage RAG system
#
# Layers:
#   - Layer 1: Fast unit tests (no LLM)
#   - Layer 2: DSPy module tests with LLM
#   - Layer 3: Integration tests (requires SSH tunnel to Oxigraph)
#   - Layer 4: Comprehensive evaluation (nightly)

name: DSPy RAG Evaluation

on:
  push:
    branches: [master]
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'
      - '.forgejo/workflows/dspy-eval.yml'
  pull_request:
    branches: [master]
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'
      # Kept consistent with the push trigger: changes to this workflow
      # itself must also be exercised on pull requests.
      - '.forgejo/workflows/dspy-eval.yml'
  workflow_dispatch:
    inputs:
      evaluation_level:
        description: 'Evaluation depth'
        required: true
        default: 'standard'
        type: choice
        options:
          - smoke
          - standard
          - comprehensive
  schedule:
    # Nightly comprehensive evaluation at 2 AM UTC
    - cron: '0 2 * * *'

env:
  # Quoted so '3.11' stays a string (unquoted it would parse as float 3.11).
  PYTHON_VERSION: '3.11'
  # NOTE(review): hardcoded server address and root login; prefer repository
  # variables/secrets and a dedicated non-root CI user for the tunnel.
  SERVER_IP: '91.98.224.44'
  SERVER_USER: 'root'

jobs:
========================================================================== + # Layer 1: Fast Unit Tests (no LLM calls) + # ========================================================================== + unit-tests: + name: Layer 1 - Unit Tests + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + - uses: https://github.com/actions/checkout@v4 + + - name: Set up Python + uses: https://github.com/actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + pip install -e ".[dev]" + pip install rapidfuzz + + - name: Run Layer 1 unit tests + run: | + pytest tests/dspy_gitops/test_layer1_unit.py \ + -v --tb=short \ + --junit-xml=layer1-results.xml + + - name: Upload test results + uses: https://github.com/actions/upload-artifact@v4 + if: always() + with: + name: layer1-test-results + path: layer1-results.xml + + # ========================================================================== + # Layer 2: DSPy Module Tests (with LLM) + # ========================================================================== + dspy-module-tests: + name: Layer 2 - DSPy Module Tests + runs-on: ubuntu-latest + timeout-minutes: 20 + needs: unit-tests + + # Run on PRs, scheduled runs, or manual triggers + if: github.event_name == 'pull_request' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + + steps: + - uses: https://github.com/actions/checkout@v4 + + - name: Set up Python + uses: https://github.com/actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + pip install -e ".[dev]" + pip install dspy-ai httpx rapidfuzz litellm + + - name: Run Layer 2 DSPy tests + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: | + pytest tests/dspy_gitops/test_layer2_dspy.py \ + -v --tb=short \ + --junit-xml=layer2-results.xml + + - name: Upload test results + uses: https://github.com/actions/upload-artifact@v4 + if: always() + with: + 
name: layer2-test-results + path: layer2-results.xml + + # ========================================================================== + # Layer 3: Integration Tests (requires SSH tunnel to Oxigraph) + # ========================================================================== + integration-tests: + name: Layer 3 - Integration Tests + runs-on: ubuntu-latest + timeout-minutes: 15 + needs: unit-tests + + steps: + - uses: https://github.com/actions/checkout@v4 + + - name: Set up Python + uses: https://github.com/actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + pip install -e ".[dev]" + pip install httpx pytest-asyncio + + - name: Setup SSH for tunnel + run: | + mkdir -p ~/.ssh + echo "${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}" > ~/.ssh/deploy_key + chmod 600 ~/.ssh/deploy_key + ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true + + - name: Create SSH tunnel to Oxigraph + run: | + # Create SSH tunnel: local port 7878 -> server localhost:7878 + ssh -f -N -L 7878:127.0.0.1:7878 \ + -i ~/.ssh/deploy_key \ + -o StrictHostKeyChecking=no \ + ${{ env.SERVER_USER }}@${{ env.SERVER_IP }} + + # Wait for tunnel to establish + sleep 3 + + # Verify tunnel is working + curl -sf "http://127.0.0.1:7878/query" \ + -H "Accept: application/sparql-results+json" \ + --data-urlencode "query=SELECT (1 AS ?test) WHERE {}" \ + || (echo "SSH tunnel failed" && exit 1) + + echo "SSH tunnel established successfully" + + - name: Run Layer 3 integration tests + env: + OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878" + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: | + pytest tests/dspy_gitops/test_layer3_integration.py \ + -v --tb=short \ + --junit-xml=layer3-results.xml + + - name: Upload test results + uses: https://github.com/actions/upload-artifact@v4 + if: always() + with: + name: layer3-test-results + path: layer3-results.xml + + # 
========================================================================== + # Layer 4: Comprehensive Evaluation (nightly only) + # ========================================================================== + comprehensive-eval: + name: Layer 4 - Comprehensive Evaluation + runs-on: ubuntu-latest + timeout-minutes: 60 + needs: [unit-tests, dspy-module-tests, integration-tests] + + # Only run on schedule or manual trigger with 'comprehensive' + if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.evaluation_level == 'comprehensive') + + steps: + - uses: https://github.com/actions/checkout@v4 + + - name: Set up Python + uses: https://github.com/actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + pip install -e ".[dev]" + pip install dspy-ai httpx rapidfuzz pandas pytest-json-report litellm + + - name: Setup SSH for tunnel + run: | + mkdir -p ~/.ssh + echo "${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}" > ~/.ssh/deploy_key + chmod 600 ~/.ssh/deploy_key + ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true + + - name: Create SSH tunnel to Oxigraph + run: | + ssh -f -N -L 7878:127.0.0.1:7878 \ + -i ~/.ssh/deploy_key \ + -o StrictHostKeyChecking=no \ + ${{ env.SERVER_USER }}@${{ env.SERVER_IP }} + sleep 3 + + - name: Run comprehensive evaluation + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878" + run: | + pytest tests/dspy_gitops/test_layer4_comprehensive.py \ + -v --tb=short \ + --junit-xml=layer4-results.xml \ + --json-report \ + --json-report-file=eval-report.json + + - name: Generate metrics summary + run: | + python -c " + import json + from datetime import datetime + + try: + with open('eval-report.json') as f: + report = json.load(f) + + metrics = { + 'timestamp': datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ'), + 'commit': '${{ github.sha }}', + 'total_tests': 
report.get('summary', {}).get('total', 0), + 'passed': report.get('summary', {}).get('passed', 0), + 'failed': report.get('summary', {}).get('failed', 0), + 'duration': report.get('duration', 0), + } + + with open('metrics.json', 'w') as f: + json.dump(metrics, f, indent=2) + + print('Metrics saved to metrics.json') + print(json.dumps(metrics, indent=2)) + except Exception as e: + print(f'Error generating metrics: {e}') + " + + - name: Upload evaluation artifacts + uses: https://github.com/actions/upload-artifact@v4 + with: + name: comprehensive-eval-results + path: | + layer4-results.xml + eval-report.json + metrics.json + + # ========================================================================== + # Quality Gate Check + # ========================================================================== + quality-gate: + name: Quality Gate + runs-on: ubuntu-latest + needs: [unit-tests, dspy-module-tests, integration-tests] + if: always() + + steps: + - name: Check all required tests passed + run: | + echo "Checking quality gates..." + + # Layer 1 (unit tests) is always required + if [[ "${{ needs.unit-tests.result }}" != "success" ]]; then + echo "Layer 1 (Unit Tests) failed" + exit 1 + fi + echo "Layer 1 (Unit Tests) passed" + + # Layer 2 (DSPy module tests) required for PRs + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + if [[ "${{ needs.dspy-module-tests.result }}" != "success" ]]; then + echo "Layer 2 (DSPy Module Tests) failed - required for PRs" + exit 1 + fi + echo "Layer 2 (DSPy Module Tests) passed" + fi + + # Layer 3 (integration tests) is warning-only for now + if [[ "${{ needs.integration-tests.result }}" != "success" ]]; then + echo "Warning: Layer 3 (Integration Tests) failed - non-blocking" + else + echo "Layer 3 (Integration Tests) passed" + fi + + echo "" + echo "============================================" + echo " All required quality gates passed!" + echo "============================================"