feat(ci): add DSPy RAG evaluation workflow for Forgejo
Some checks failed
DSPy RAG Evaluation / Layer 1 - Unit Tests (push) Failing after 6m24s
DSPy RAG Evaluation / Layer 3 - Integration Tests (push) Has been skipped
DSPy RAG Evaluation / Layer 2 - DSPy Module Tests (push) Has been skipped
DSPy RAG Evaluation / Layer 4 - Comprehensive Evaluation (push) Has been skipped
DSPy RAG Evaluation / Quality Gate (push) Failing after 1s
Implements 4-layer testing pyramid:
- Layer 1: Fast unit tests (no LLM, ~5 min)
- Layer 2: DSPy module tests with LLM (~20 min)
- Layer 3: Integration tests via SSH tunnel to Oxigraph
- Layer 4: Comprehensive evaluation (nightly)

Includes:
- SSH tunnel setup for Oxigraph access
- Quality gate checks
- JUnit XML output for test results
- Scheduled nightly runs at 2 AM UTC
- Manual trigger with evaluation level selection
This commit is contained in:
parent
95d79d0078
commit
8470bf5860
1 changed file with 313 additions and 0 deletions
313
.forgejo/workflows/dspy-eval.yml
Normal file
313
.forgejo/workflows/dspy-eval.yml
Normal file
|
|
@ -0,0 +1,313 @@
|
|||
# DSPy RAG Evaluation Workflow
|
||||
# Automated testing and evaluation for Heritage RAG system
|
||||
#
|
||||
# Layers:
|
||||
# - Layer 1: Fast unit tests (no LLM)
|
||||
# - Layer 2: DSPy module tests with LLM
|
||||
# - Layer 3: Integration tests (requires SSH tunnel to Oxigraph)
|
||||
# - Layer 4: Comprehensive evaluation (nightly)
|
||||
|
||||
name: DSPy RAG Evaluation
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
paths:
|
||||
- 'backend/rag/**'
|
||||
- 'tests/dspy_gitops/**'
|
||||
- 'src/glam_extractor/api/**'
|
||||
- '.forgejo/workflows/dspy-eval.yml'
|
||||
pull_request:
|
||||
branches: [master]
|
||||
paths:
|
||||
- 'backend/rag/**'
|
||||
- 'tests/dspy_gitops/**'
|
||||
- 'src/glam_extractor/api/**'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
evaluation_level:
|
||||
description: 'Evaluation depth'
|
||||
required: true
|
||||
default: 'standard'
|
||||
type: choice
|
||||
options:
|
||||
- smoke
|
||||
- standard
|
||||
- comprehensive
|
||||
schedule:
|
||||
# Nightly comprehensive evaluation at 2 AM UTC
|
||||
- cron: '0 2 * * *'
|
||||
|
||||
env:
|
||||
PYTHON_VERSION: '3.11'
|
||||
SERVER_IP: '91.98.224.44'
|
||||
SERVER_USER: 'root'
|
||||
|
||||
jobs:
|
||||
# ==========================================================================
|
||||
# Layer 1: Fast Unit Tests (no LLM calls)
|
||||
# ==========================================================================
|
||||
unit-tests:
|
||||
name: Layer 1 - Unit Tests
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 5
|
||||
|
||||
steps:
|
||||
- uses: https://github.com/actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: https://github.com/actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e ".[dev]"
|
||||
pip install rapidfuzz
|
||||
|
||||
- name: Run Layer 1 unit tests
|
||||
run: |
|
||||
pytest tests/dspy_gitops/test_layer1_unit.py \
|
||||
-v --tb=short \
|
||||
--junit-xml=layer1-results.xml
|
||||
|
||||
- name: Upload test results
|
||||
uses: https://github.com/actions/upload-artifact@v4
|
||||
if: always()
|
||||
with:
|
||||
name: layer1-test-results
|
||||
path: layer1-results.xml
|
||||
|
||||
# ==========================================================================
|
||||
# Layer 2: DSPy Module Tests (with LLM)
|
||||
# ==========================================================================
|
||||
dspy-module-tests:
|
||||
name: Layer 2 - DSPy Module Tests
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 20
|
||||
needs: unit-tests
|
||||
|
||||
# Run on PRs, scheduled runs, or manual triggers
|
||||
if: github.event_name == 'pull_request' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
|
||||
|
||||
steps:
|
||||
- uses: https://github.com/actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: https://github.com/actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e ".[dev]"
|
||||
pip install dspy-ai httpx rapidfuzz litellm
|
||||
|
||||
- name: Run Layer 2 DSPy tests
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
run: |
|
||||
pytest tests/dspy_gitops/test_layer2_dspy.py \
|
||||
-v --tb=short \
|
||||
--junit-xml=layer2-results.xml
|
||||
|
||||
- name: Upload test results
|
||||
uses: https://github.com/actions/upload-artifact@v4
|
||||
if: always()
|
||||
with:
|
||||
name: layer2-test-results
|
||||
path: layer2-results.xml
|
||||
|
||||
# ==========================================================================
|
||||
# Layer 3: Integration Tests (requires SSH tunnel to Oxigraph)
|
||||
# ==========================================================================
|
||||
integration-tests:
|
||||
name: Layer 3 - Integration Tests
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
needs: unit-tests
|
||||
|
||||
steps:
|
||||
- uses: https://github.com/actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: https://github.com/actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e ".[dev]"
|
||||
pip install httpx pytest-asyncio
|
||||
|
||||
- name: Setup SSH for tunnel
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}" > ~/.ssh/deploy_key
|
||||
chmod 600 ~/.ssh/deploy_key
|
||||
ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true
|
||||
|
||||
- name: Create SSH tunnel to Oxigraph
|
||||
run: |
|
||||
# Create SSH tunnel: local port 7878 -> server localhost:7878
|
||||
ssh -f -N -L 7878:127.0.0.1:7878 \
|
||||
-i ~/.ssh/deploy_key \
|
||||
-o StrictHostKeyChecking=no \
|
||||
${{ env.SERVER_USER }}@${{ env.SERVER_IP }}
|
||||
|
||||
# Wait for tunnel to establish
|
||||
sleep 3
|
||||
|
||||
# Verify tunnel is working
|
||||
curl -sf "http://127.0.0.1:7878/query" \
|
||||
-H "Accept: application/sparql-results+json" \
|
||||
--data-urlencode "query=SELECT (1 AS ?test) WHERE {}" \
|
||||
|| (echo "SSH tunnel failed" && exit 1)
|
||||
|
||||
echo "SSH tunnel established successfully"
|
||||
|
||||
- name: Run Layer 3 integration tests
|
||||
env:
|
||||
OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
run: |
|
||||
pytest tests/dspy_gitops/test_layer3_integration.py \
|
||||
-v --tb=short \
|
||||
--junit-xml=layer3-results.xml
|
||||
|
||||
- name: Upload test results
|
||||
uses: https://github.com/actions/upload-artifact@v4
|
||||
if: always()
|
||||
with:
|
||||
name: layer3-test-results
|
||||
path: layer3-results.xml
|
||||
|
||||
# ==========================================================================
|
||||
# Layer 4: Comprehensive Evaluation (nightly only)
|
||||
# ==========================================================================
|
||||
comprehensive-eval:
|
||||
name: Layer 4 - Comprehensive Evaluation
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 60
|
||||
needs: [unit-tests, dspy-module-tests, integration-tests]
|
||||
|
||||
# Only run on schedule or manual trigger with 'comprehensive'
|
||||
if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.evaluation_level == 'comprehensive')
|
||||
|
||||
steps:
|
||||
- uses: https://github.com/actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: https://github.com/actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e ".[dev]"
|
||||
pip install dspy-ai httpx rapidfuzz pandas pytest-json-report litellm
|
||||
|
||||
- name: Setup SSH for tunnel
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}" > ~/.ssh/deploy_key
|
||||
chmod 600 ~/.ssh/deploy_key
|
||||
ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true
|
||||
|
||||
- name: Create SSH tunnel to Oxigraph
|
||||
run: |
|
||||
ssh -f -N -L 7878:127.0.0.1:7878 \
|
||||
-i ~/.ssh/deploy_key \
|
||||
-o StrictHostKeyChecking=no \
|
||||
${{ env.SERVER_USER }}@${{ env.SERVER_IP }}
|
||||
sleep 3
|
||||
|
||||
- name: Run comprehensive evaluation
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
|
||||
run: |
|
||||
pytest tests/dspy_gitops/test_layer4_comprehensive.py \
|
||||
-v --tb=short \
|
||||
--junit-xml=layer4-results.xml \
|
||||
--json-report \
|
||||
--json-report-file=eval-report.json
|
||||
|
||||
- name: Generate metrics summary
|
||||
run: |
|
||||
python -c "
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
with open('eval-report.json') as f:
|
||||
report = json.load(f)
|
||||
|
||||
metrics = {
|
||||
'timestamp': datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ'),
|
||||
'commit': '${{ github.sha }}',
|
||||
'total_tests': report.get('summary', {}).get('total', 0),
|
||||
'passed': report.get('summary', {}).get('passed', 0),
|
||||
'failed': report.get('summary', {}).get('failed', 0),
|
||||
'duration': report.get('duration', 0),
|
||||
}
|
||||
|
||||
with open('metrics.json', 'w') as f:
|
||||
json.dump(metrics, f, indent=2)
|
||||
|
||||
print('Metrics saved to metrics.json')
|
||||
print(json.dumps(metrics, indent=2))
|
||||
except Exception as e:
|
||||
print(f'Error generating metrics: {e}')
|
||||
"
|
||||
|
||||
- name: Upload evaluation artifacts
|
||||
uses: https://github.com/actions/upload-artifact@v4
|
||||
with:
|
||||
name: comprehensive-eval-results
|
||||
path: |
|
||||
layer4-results.xml
|
||||
eval-report.json
|
||||
metrics.json
|
||||
|
||||
# ==========================================================================
|
||||
# Quality Gate Check
|
||||
# ==========================================================================
|
||||
quality-gate:
|
||||
name: Quality Gate
|
||||
runs-on: ubuntu-latest
|
||||
needs: [unit-tests, dspy-module-tests, integration-tests]
|
||||
if: always()
|
||||
|
||||
steps:
|
||||
- name: Check all required tests passed
|
||||
run: |
|
||||
echo "Checking quality gates..."
|
||||
|
||||
# Layer 1 (unit tests) is always required
|
||||
if [[ "${{ needs.unit-tests.result }}" != "success" ]]; then
|
||||
echo "Layer 1 (Unit Tests) failed"
|
||||
exit 1
|
||||
fi
|
||||
echo "Layer 1 (Unit Tests) passed"
|
||||
|
||||
# Layer 2 (DSPy module tests) required for PRs
|
||||
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
|
||||
if [[ "${{ needs.dspy-module-tests.result }}" != "success" ]]; then
|
||||
echo "Layer 2 (DSPy Module Tests) failed - required for PRs"
|
||||
exit 1
|
||||
fi
|
||||
echo "Layer 2 (DSPy Module Tests) passed"
|
||||
fi
|
||||
|
||||
# Layer 3 (integration tests) is warning-only for now
|
||||
if [[ "${{ needs.integration-tests.result }}" != "success" ]]; then
|
||||
echo "Warning: Layer 3 (Integration Tests) failed - non-blocking"
|
||||
else
|
||||
echo "Layer 3 (Integration Tests) passed"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "============================================"
|
||||
echo " All required quality gates passed!"
|
||||
echo "============================================"
|
||||
Loading…
Reference in a new issue