# DSPy RAG Evaluation Workflow
# Automated testing and evaluation for Heritage RAG system
#
# Layers:
# - Layer 1: Fast unit tests (no LLM)
# - Layer 2: DSPy module tests with LLM
# - Layer 3: Integration tests (requires SSH tunnel to Oxigraph)
# - Layer 4: Comprehensive evaluation (nightly)

name: DSPy RAG Evaluation

on:
  push:
    branches: [main]
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'
  pull_request:
    branches: [main]
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'
  workflow_dispatch:
    inputs:
      evaluation_level:
        description: 'Evaluation depth'
        required: true
        default: 'standard'
        type: choice
        options:
          - smoke
          - standard
          - comprehensive
  schedule:
    # Nightly comprehensive evaluation at 2 AM UTC
    - cron: '0 2 * * *'

env:
  PYTHON_VERSION: '3.11'
  SERVER_IP: '91.98.224.44'
  SERVER_USER: 'root'

jobs:
  # ==========================================================================
  # Layer 1: Fast Unit Tests (no LLM calls)
  # ==========================================================================
  unit-tests:
    name: Layer 1 - Unit Tests
    runs-on: ubuntu-latest
    timeout-minutes: 5

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -e ".[dev]"
          pip install rapidfuzz

      - name: Run Layer 1 unit tests
        run: |
          pytest tests/dspy_gitops/test_layer1_unit.py \
            -v --tb=short \
            -m "layer1 or not (layer2 or layer3 or layer4)" \
            --junit-xml=layer1-results.xml

      - name: Upload test results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: layer1-test-results
          path: layer1-results.xml

  # ==========================================================================
  # Layer 2: DSPy Module Tests (with LLM)
  # ==========================================================================
  dspy-module-tests:
    name: Layer 2 - DSPy Module Tests
    runs-on: ubuntu-latest
    timeout-minutes: 20
    needs: unit-tests
    # Run on PRs, scheduled runs, or manual triggers
    if: github.event_name == 'pull_request' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -e ".[dev]"
          pip install dspy-ai httpx rapidfuzz litellm

      - name: Run Layer 2 DSPy tests
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          pytest tests/dspy_gitops/test_layer2_dspy.py \
            -v --tb=short \
            -m "layer2 or not (layer1 or layer3 or layer4)" \
            --junit-xml=layer2-results.xml

      - name: Upload test results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: layer2-test-results
          path: layer2-results.xml

      - name: Comment PR with Layer 2 results
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            try {
              const results = fs.readFileSync('layer2-results.xml', 'utf8');
              const testsMatch = results.match(/tests="(\d+)"/);
              const failuresMatch = results.match(/failures="(\d+)"/);
              const errorsMatch = results.match(/errors="(\d+)"/);
              const tests = testsMatch ? testsMatch[1] : '0';
              const failures = failuresMatch ? failuresMatch[1] : '0';
              const errors = errorsMatch ? errorsMatch[1] : '0';
              const passed = parseInt(tests) - parseInt(failures) - parseInt(errors);
              const body = '## DSPy Layer 2 Evaluation Results\n\n' +
                '| Metric | Value |\n' +
                '|--------|-------|\n' +
                '| Tests Passed | ' + passed + '/' + tests + ' |\n' +
                '| Failures | ' + failures + ' |\n' +
                '| Errors | ' + errors + ' |\n' +
                '| Status | ' + ((parseInt(failures) + parseInt(errors)) > 0 ? '❌ FAILED' : '✅ PASSED') + ' |\n';
              github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: body
              });
            } catch (e) {
              console.log('Could not parse results:', e);
            }

  # ==========================================================================
  # Layer 3: Integration Tests (requires SSH tunnel to Oxigraph)
  # ==========================================================================
  integration-tests:
    name: Layer 3 - Integration Tests
    runs-on: ubuntu-latest
    timeout-minutes: 15
    needs: unit-tests

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -e ".[dev]"
          pip install httpx pytest-asyncio

      - name: Setup SSH for tunnel
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}" > ~/.ssh/deploy_key
          chmod 600 ~/.ssh/deploy_key
          ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true

      - name: Create SSH tunnel to Oxigraph
        run: |
          # Create SSH tunnel: local port 7878 -> server localhost:7878
          ssh -f -N -L 7878:127.0.0.1:7878 \
            -i ~/.ssh/deploy_key \
            -o StrictHostKeyChecking=no \
            ${{ env.SERVER_USER }}@${{ env.SERVER_IP }}

          # Wait for tunnel to establish
          sleep 3

          # Verify tunnel is working
          curl -sf "http://127.0.0.1:7878/query" \
            -H "Accept: application/sparql-results+json" \
            --data-urlencode "query=SELECT (1 AS ?test) WHERE {}" \
            || (echo "SSH tunnel failed" && exit 1)

          echo "SSH tunnel established successfully"

      - name: Run Layer 3 integration tests
        env:
          OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          pytest tests/dspy_gitops/test_layer3_integration.py \
            -v --tb=short \
            -m "layer3 or not (layer1 or layer2 or layer4)" \
            --junit-xml=layer3-results.xml

      - name: Upload test results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: layer3-test-results
          path: layer3-results.xml

  # ==========================================================================
  # Layer 4: Comprehensive Evaluation (nightly only)
  # ==========================================================================
  comprehensive-eval:
    name: Layer 4 - Comprehensive Evaluation
    runs-on: ubuntu-latest
    timeout-minutes: 60
    needs: [unit-tests, dspy-module-tests, integration-tests]
    # Only run on schedule or manual trigger with 'comprehensive'
    if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.evaluation_level == 'comprehensive')

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -e ".[dev]"
          pip install dspy-ai httpx rapidfuzz pandas pytest-json-report litellm

      - name: Setup SSH for tunnel
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}" > ~/.ssh/deploy_key
          chmod 600 ~/.ssh/deploy_key
          ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true

      - name: Create SSH tunnel to Oxigraph
        run: |
          ssh -f -N -L 7878:127.0.0.1:7878 \
            -i ~/.ssh/deploy_key \
            -o StrictHostKeyChecking=no \
            ${{ env.SERVER_USER }}@${{ env.SERVER_IP }}
          sleep 3

      - name: Run comprehensive evaluation
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
        run: |
          pytest tests/dspy_gitops/test_layer4_comprehensive.py \
            -v --tb=short \
            -m "layer4 or not (layer1 or layer2 or layer3)" \
            --junit-xml=layer4-results.xml \
            --json-report \
            --json-report-file=eval-report.json

      - name: Generate metrics summary
        run: |
          python -c "
          import json
          from datetime import datetime

          try:
              with open('eval-report.json') as f:
                  report = json.load(f)
              metrics = {
                  'timestamp': datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ'),
                  'commit': '${{ github.sha }}',
                  'total_tests': report.get('summary', {}).get('total', 0),
                  'passed': report.get('summary', {}).get('passed', 0),
                  'failed': report.get('summary', {}).get('failed', 0),
                  'duration': report.get('duration', 0),
              }
              with open('metrics.json', 'w') as f:
                  json.dump(metrics, f, indent=2)
              print('Metrics saved to metrics.json')
              print(json.dumps(metrics, indent=2))
          except Exception as e:
              print(f'Error generating metrics: {e}')
          "

      - name: Upload evaluation artifacts
        uses: actions/upload-artifact@v4
        with:
          name: comprehensive-eval-results
          path: |
            layer4-results.xml
            eval-report.json
            metrics.json

  # ==========================================================================
  # Quality Gate Check
  # ==========================================================================
  quality-gate:
    name: Quality Gate
    runs-on: ubuntu-latest
    needs: [unit-tests, dspy-module-tests, integration-tests]
    if: always()

    steps:
      - name: Check all required tests passed
        run: |
          echo "Checking quality gates..."

          # Layer 1 (unit tests) is always required
          if [[ "${{ needs.unit-tests.result }}" != "success" ]]; then
            echo "❌ Layer 1 (Unit Tests) failed"
            exit 1
          fi
          echo "✅ Layer 1 (Unit Tests) passed"

          # Layer 2 (DSPy module tests) required for PRs
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            if [[ "${{ needs.dspy-module-tests.result }}" != "success" ]]; then
              echo "❌ Layer 2 (DSPy Module Tests) failed - required for PRs"
              exit 1
            fi
            echo "✅ Layer 2 (DSPy Module Tests) passed"
          fi

          # Layer 3 (integration tests) is warning-only for now
          if [[ "${{ needs.integration-tests.result }}" != "success" ]]; then
            echo "⚠️ Layer 3 (Integration Tests) failed - non-blocking"
          else
            echo "✅ Layer 3 (Integration Tests) passed"
          fi

          echo ""
          echo "============================================"
          echo " All required quality gates passed!"
          echo "============================================"