# DSPy RAG Evaluation Workflow
# Automated testing and evaluation for Heritage RAG system
#
# All layers are triggered on every matching push/PR:
# - Layer 1: Fast unit tests (no LLM)
# - Layer 2: DSPy module tests with LLM
# - Layer 3: Integration tests (requires SSH tunnel to Oxigraph)
# - Layer 4: Comprehensive evaluation
#
# Job ordering: Layers 2 and 3 wait for Layer 1, Layer 4 waits for Layers 1-3,
# and the quality gate always runs and fails unless every layer succeeded.

name: DSPy RAG Evaluation

on:
  push:
    branches: [master]
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'
      - '.forgejo/workflows/dspy-eval.yml'
  pull_request:
    branches: [master]
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'
  workflow_dispatch:
    inputs:
      # NOTE(review): no job in this file reads this input, so every manual
      # run currently executes all layers regardless of the selected level.
      evaluation_level:
        description: 'Evaluation depth'
        required: true
        default: 'standard'
        type: choice
        options:
          - smoke
          - standard
          - comprehensive
  schedule:
    # Nightly comprehensive evaluation at 2 AM UTC
    - cron: '0 2 * * *'

env:
  SERVER_IP: '91.98.224.44'
  SERVER_USER: 'root'
  VENV_PATH: '/opt/venv'

jobs:
  # ==========================================================================
  # Layer 1: Fast Unit Tests (no LLM calls)
  # ==========================================================================
  unit-tests:
    name: Layer 1 - Unit Tests
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - uses: https://github.com/actions/checkout@v4

      - name: Setup Python with virtual environment
        run: |
          apt-get update
          apt-get install -y python3 python3-pip python3-venv python3-full
          python3 -m venv ${{ env.VENV_PATH }}
          echo "Python version: $(python3 --version)"
          echo "Venv created at ${{ env.VENV_PATH }}"

      - name: Install dependencies
        run: |
          source ${{ env.VENV_PATH }}/bin/activate
          pip install --upgrade pip
          pip install -e ".[dev]"
          pip install rapidfuzz pytest-cov

      - name: Run Layer 1 unit tests
        run: |
          source ${{ env.VENV_PATH }}/bin/activate
          python -m pytest tests/dspy_gitops/test_layer1_unit.py \
            -v --tb=short \
            --junit-xml=layer1-results.xml

      # Upload even when the tests fail so the JUnit report can be inspected.
      - name: Upload test results
        uses: https://github.com/actions/upload-artifact@v3
        if: always()
        with:
          name: layer1-test-results
          path: layer1-results.xml

  # ==========================================================================
  # Layer 2: DSPy Module Tests (with LLM)
  # ==========================================================================
  dspy-module-tests:
    name: Layer 2 - DSPy Module Tests
    runs-on: ubuntu-latest
    timeout-minutes: 20
    needs: unit-tests
    steps:
      - uses: https://github.com/actions/checkout@v4

      - name: Setup Python with virtual environment
        run: |
          apt-get update
          apt-get install -y python3 python3-pip python3-venv python3-full
          python3 -m venv ${{ env.VENV_PATH }}

      - name: Install dependencies
        run: |
          source ${{ env.VENV_PATH }}/bin/activate
          pip install --upgrade pip
          pip install -e ".[dev]"
          pip install dspy-ai httpx rapidfuzz litellm pytest-cov

      - name: Run Layer 2 DSPy tests
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          source ${{ env.VENV_PATH }}/bin/activate
          python -m pytest tests/dspy_gitops/test_layer2_dspy.py \
            -v --tb=short \
            --junit-xml=layer2-results.xml

      # Upload even when the tests fail so the JUnit report can be inspected.
      - name: Upload test results
        uses: https://github.com/actions/upload-artifact@v3
        if: always()
        with:
          name: layer2-test-results
          path: layer2-results.xml

  # ==========================================================================
  # Layer 3: Integration Tests (requires SSH tunnel to Oxigraph)
  # ==========================================================================
  integration-tests:
    name: Layer 3 - Integration Tests
    runs-on: ubuntu-latest
    timeout-minutes: 15
    needs: unit-tests
    steps:
      - uses: https://github.com/actions/checkout@v4

      - name: Setup Python with virtual environment
        run: |
          apt-get update
          apt-get install -y openssh-client curl python3 python3-pip python3-venv python3-full
          python3 -m venv ${{ env.VENV_PATH }}

      - name: Install Python dependencies
        run: |
          source ${{ env.VENV_PATH }}/bin/activate
          pip install --upgrade pip
          pip install -e ".[dev]"
          pip install httpx pytest-asyncio pytest-cov

      - name: Setup SSH for tunnel
        env:
          # Passed through the environment so the key is never interpolated
          # into the script text itself.
          DEPLOY_SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}
        run: |
          mkdir -p ~/.ssh
          # umask keeps the key file private from the moment it is created.
          (umask 077 && printf '%s\n' "$DEPLOY_SSH_PRIVATE_KEY" > ~/.ssh/deploy_key)
          chmod 600 ~/.ssh/deploy_key
          ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true

      - name: Create SSH tunnel to Oxigraph
        run: |
          # Create SSH tunnel: local port 7878 -> server localhost:7878
          # NOTE(review): StrictHostKeyChecking=no disables host key
          # verification, so the known_hosts entry written above is not
          # enforced - confirm this is intended.
          ssh -f -N -L 7878:127.0.0.1:7878 \
            -i ~/.ssh/deploy_key \
            -o StrictHostKeyChecking=no \
            ${{ env.SERVER_USER }}@${{ env.SERVER_IP }}

          # Wait for tunnel to establish
          sleep 3

          # Verify tunnel is working
          curl -sf "http://127.0.0.1:7878/query" \
            -H "Accept: application/sparql-results+json" \
            --data-urlencode "query=SELECT (1 AS ?test) WHERE {}" \
            || (echo "SSH tunnel failed" && exit 1)

          echo "SSH tunnel established successfully"

      - name: Run Layer 3 integration tests
        env:
          OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          source ${{ env.VENV_PATH }}/bin/activate
          python -m pytest tests/dspy_gitops/test_layer3_integration.py \
            -v --tb=short \
            --junit-xml=layer3-results.xml

      # Upload even when the tests fail so the JUnit report can be inspected.
      - name: Upload test results
        uses: https://github.com/actions/upload-artifact@v3
        if: always()
        with:
          name: layer3-test-results
          path: layer3-results.xml

  # ==========================================================================
  # Layer 4: Comprehensive Evaluation
  # ==========================================================================
  comprehensive-eval:
    name: Layer 4 - Comprehensive Evaluation
    runs-on: ubuntu-latest
    timeout-minutes: 60
    needs: [unit-tests, dspy-module-tests, integration-tests]
    steps:
      - uses: https://github.com/actions/checkout@v4

      - name: Setup Python with virtual environment
        run: |
          apt-get update
          apt-get install -y openssh-client curl python3 python3-pip python3-venv python3-full
          python3 -m venv ${{ env.VENV_PATH }}

      - name: Install Python dependencies
        run: |
          source ${{ env.VENV_PATH }}/bin/activate
          pip install --upgrade pip
          pip install -e ".[dev]"
          pip install dspy-ai httpx rapidfuzz pandas pytest-json-report litellm pytest-cov

      - name: Setup SSH for tunnel
        env:
          # Passed through the environment so the key is never interpolated
          # into the script text itself.
          DEPLOY_SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}
        run: |
          mkdir -p ~/.ssh
          # umask keeps the key file private from the moment it is created.
          (umask 077 && printf '%s\n' "$DEPLOY_SSH_PRIVATE_KEY" > ~/.ssh/deploy_key)
          chmod 600 ~/.ssh/deploy_key
          ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true

      - name: Create SSH tunnel to Oxigraph
        run: |
          # Create SSH tunnel: local port 7878 -> server localhost:7878
          # NOTE(review): StrictHostKeyChecking=no disables host key
          # verification, so the known_hosts entry written above is not
          # enforced - confirm this is intended.
          ssh -f -N -L 7878:127.0.0.1:7878 \
            -i ~/.ssh/deploy_key \
            -o StrictHostKeyChecking=no \
            ${{ env.SERVER_USER }}@${{ env.SERVER_IP }}

          # Wait for tunnel to establish
          sleep 3

          # Verify the tunnel with the same probe Layer 3 uses, so a dead
          # tunnel fails here instead of surfacing as evaluation failures.
          curl -sf "http://127.0.0.1:7878/query" \
            -H "Accept: application/sparql-results+json" \
            --data-urlencode "query=SELECT (1 AS ?test) WHERE {}" \
            || (echo "SSH tunnel failed" && exit 1)

          echo "SSH tunnel established successfully"

      - name: Run comprehensive evaluation
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
        run: |
          source ${{ env.VENV_PATH }}/bin/activate
          python -m pytest tests/dspy_gitops/test_layer4_comprehensive.py \
            -v --tb=short \
            --junit-xml=layer4-results.xml \
            --json-report \
            --json-report-file=eval-report.json

      # Runs after a failed evaluation too, so the summary reflects the
      # failing run. A missing or unreadable report is logged, not fatal.
      - name: Generate metrics summary
        if: always()
        run: |
          source ${{ env.VENV_PATH }}/bin/activate
          # Quoted heredoc: the shell performs no expansion inside the script.
          python - <<'PY'
          import json
          from datetime import datetime, timezone

          try:
              with open('eval-report.json') as f:
                  report = json.load(f)

              summary = report.get('summary', {})
              metrics = {
                  'timestamp': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
                  'commit': '${{ github.sha }}',
                  'total_tests': summary.get('total', 0),
                  'passed': summary.get('passed', 0),
                  'failed': summary.get('failed', 0),
                  'duration': report.get('duration', 0),
              }

              with open('metrics.json', 'w') as f:
                  json.dump(metrics, f, indent=2)

              print('Metrics saved to metrics.json')
              print(json.dumps(metrics, indent=2))

          except Exception as e:
              # Best effort: the summary must never fail the job on its own.
              print(f'Error generating metrics: {e}')
          PY

      # Upload even when the evaluation fails - that is when the reports
      # are most needed (matches the upload steps of Layers 1-3).
      - name: Upload evaluation artifacts
        uses: https://github.com/actions/upload-artifact@v3
        if: always()
        with:
          name: comprehensive-eval-results
          path: |
            layer4-results.xml
            eval-report.json
            metrics.json

  # ==========================================================================
  # Quality Gate Check
  # ==========================================================================
  quality-gate:
    name: Quality Gate
    runs-on: ubuntu-latest
    needs: [unit-tests, dspy-module-tests, integration-tests, comprehensive-eval]
    # Always run so that failed, skipped, or cancelled layers are reported.
    if: always()
    steps:
      - name: Check all required tests passed
        env:
          LAYER1_RESULT: ${{ needs.unit-tests.result }}
          LAYER2_RESULT: ${{ needs.dspy-module-tests.result }}
          LAYER3_RESULT: ${{ needs.integration-tests.result }}
          LAYER4_RESULT: ${{ needs.comprehensive-eval.result }}
        run: |
          echo "Checking quality gates..."

          gate_failed=0

          # Report one layer. Any result other than "success" (failure,
          # skipped, cancelled) fails the gate, and the actual result is
          # printed so the cause is visible in the log.
          check_layer() {
            local label="$1"
            local result="$2"
            if [[ "$result" != "success" ]]; then
              echo "❌ $label failed (result: $result)"
              gate_failed=1
            else
              echo "✅ $label passed"
            fi
          }

          # All four layers are required.
          check_layer "Layer 1 (Unit Tests)" "$LAYER1_RESULT"
          check_layer "Layer 2 (DSPy Module Tests)" "$LAYER2_RESULT"
          check_layer "Layer 3 (Integration Tests)" "$LAYER3_RESULT"
          check_layer "Layer 4 (Comprehensive Evaluation)" "$LAYER4_RESULT"

          # Fail once, after every layer has been reported.
          if [[ "$gate_failed" -ne 0 ]]; then
            exit 1
          fi

          echo ""
          echo "============================================"
          echo "  All quality gates passed!"
          echo "============================================"