glam/.github/workflows/dspy-eval.yml

# DSPy RAG Evaluation Workflow
# Automated testing and evaluation for Heritage RAG system
#
# Layers:
# - Layer 1: Fast unit tests (no LLM)
# - Layer 2: DSPy module tests with LLM
# - Layer 3: Integration tests (requires SSH tunnel to Oxigraph)
# - Layer 4: Comprehensive evaluation (nightly)

name: DSPy RAG Evaluation

on:
  push:
    branches: [main]
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'
  pull_request:
    branches: [main]
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'
  workflow_dispatch:
    inputs:
      evaluation_level:
        description: 'Evaluation depth'
        required: true
        default: 'standard'
        type: choice
        options:
          - smoke
          - standard
          - comprehensive
  schedule:
    # Nightly comprehensive evaluation at 2 AM UTC
    - cron: '0 2 * * *'

env:
  PYTHON_VERSION: '3.11'
  SERVER_IP: '91.98.224.44'
  SERVER_USER: 'root'

jobs:
  # ==========================================================================
  # Layer 1: Fast Unit Tests (no LLM calls)
  # ==========================================================================
  unit-tests:
    name: Layer 1 - Unit Tests
    runs-on: ubuntu-latest
    timeout-minutes: 5

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -e ".[dev]"
          pip install rapidfuzz

      - name: Run Layer 1 unit tests
        run: |
          pytest tests/dspy_gitops/test_layer1_unit.py \
            -v --tb=short \
            -m "layer1 or not (layer2 or layer3 or layer4)" \
            --junit-xml=layer1-results.xml

      - name: Upload test results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: layer1-test-results
          path: layer1-results.xml

  # ==========================================================================
  # Layer 2: DSPy Module Tests (with LLM)
  # ==========================================================================
  dspy-module-tests:
    name: Layer 2 - DSPy Module Tests
    runs-on: ubuntu-latest
    timeout-minutes: 20
    needs: unit-tests

    # Run on PRs, scheduled runs, or manual triggers
    if: github.event_name == 'pull_request' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -e ".[dev]"
          pip install dspy-ai httpx rapidfuzz litellm

      - name: Run Layer 2 DSPy tests
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          pytest tests/dspy_gitops/test_layer2_dspy.py \
            -v --tb=short \
            -m "layer2 or not (layer1 or layer3 or layer4)" \
            --junit-xml=layer2-results.xml

      - name: Upload test results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: layer2-test-results
          path: layer2-results.xml

      - name: Comment PR with Layer 2 results
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            try {
              const results = fs.readFileSync('layer2-results.xml', 'utf8');
              const testsMatch = results.match(/tests="(\d+)"/);
              const failuresMatch = results.match(/failures="(\d+)"/);
              const errorsMatch = results.match(/errors="(\d+)"/);

              const tests = testsMatch ? testsMatch[1] : '0';
              const failures = failuresMatch ? failuresMatch[1] : '0';
              const errors = errorsMatch ? errorsMatch[1] : '0';
              const passed = parseInt(tests) - parseInt(failures) - parseInt(errors);

              const body = '## DSPy Layer 2 Evaluation Results\n\n' +
                '| Metric | Value |\n' +
                '|--------|-------|\n' +
                '| Tests Passed | ' + passed + '/' + tests + ' |\n' +
                '| Failures | ' + failures + ' |\n' +
                '| Errors | ' + errors + ' |\n' +
                '| Status | ' + ((parseInt(failures) + parseInt(errors)) > 0 ? '❌ FAILED' : '✅ PASSED') + ' |\n';

              github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: body
              });
            } catch (e) {
              console.log('Could not parse results:', e);
            }

  # ==========================================================================
  # Layer 3: Integration Tests (requires SSH tunnel to Oxigraph)
  # ==========================================================================
  integration-tests:
    name: Layer 3 - Integration Tests
    runs-on: ubuntu-latest
    timeout-minutes: 15
    needs: unit-tests

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -e ".[dev]"
          pip install httpx pytest-asyncio

      - name: Setup SSH for tunnel
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}" > ~/.ssh/deploy_key
          chmod 600 ~/.ssh/deploy_key
          ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true

      - name: Create SSH tunnel to Oxigraph
        run: |
          # Create SSH tunnel: local port 7878 -> server localhost:7878
          ssh -f -N -L 7878:127.0.0.1:7878 \
            -i ~/.ssh/deploy_key \
            -o StrictHostKeyChecking=no \
            ${{ env.SERVER_USER }}@${{ env.SERVER_IP }}

          # Wait for tunnel to establish
          sleep 3

          # Verify tunnel is working
          curl -sf "http://127.0.0.1:7878/query" \
            -H "Accept: application/sparql-results+json" \
            --data-urlencode "query=SELECT (1 AS ?test) WHERE {}" \
            || (echo "SSH tunnel failed" && exit 1)

          echo "SSH tunnel established successfully"

      - name: Run Layer 3 integration tests
        env:
          OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          pytest tests/dspy_gitops/test_layer3_integration.py \
            -v --tb=short \
            -m "layer3 or not (layer1 or layer2 or layer4)" \
            --junit-xml=layer3-results.xml

      - name: Upload test results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: layer3-test-results
          path: layer3-results.xml

  # ==========================================================================
  # Layer 4: Comprehensive Evaluation (nightly only)
  # ==========================================================================
  comprehensive-eval:
    name: Layer 4 - Comprehensive Evaluation
    runs-on: ubuntu-latest
    timeout-minutes: 60
    needs: [unit-tests, dspy-module-tests, integration-tests]

    # Only run on schedule or manual trigger with 'comprehensive'
    if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.evaluation_level == 'comprehensive')

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -e ".[dev]"
          pip install dspy-ai httpx rapidfuzz pandas pytest-json-report litellm

      - name: Setup SSH for tunnel
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}" > ~/.ssh/deploy_key
          chmod 600 ~/.ssh/deploy_key
          ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true

      - name: Create SSH tunnel to Oxigraph
        run: |
          ssh -f -N -L 7878:127.0.0.1:7878 \
            -i ~/.ssh/deploy_key \
            -o StrictHostKeyChecking=no \
            ${{ env.SERVER_USER }}@${{ env.SERVER_IP }}
          sleep 3

      - name: Run comprehensive evaluation
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
        run: |
          pytest tests/dspy_gitops/test_layer4_comprehensive.py \
            -v --tb=short \
            -m "layer4 or not (layer1 or layer2 or layer3)" \
            --junit-xml=layer4-results.xml \
            --json-report \
            --json-report-file=eval-report.json

      - name: Generate metrics summary
        run: |
          python -c "
          import json
          from datetime import datetime

          try:
              with open('eval-report.json') as f:
                  report = json.load(f)

              metrics = {
                  'timestamp': datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ'),
                  'commit': '${{ github.sha }}',
                  'total_tests': report.get('summary', {}).get('total', 0),
                  'passed': report.get('summary', {}).get('passed', 0),
                  'failed': report.get('summary', {}).get('failed', 0),
                  'duration': report.get('duration', 0),
              }

              with open('metrics.json', 'w') as f:
                  json.dump(metrics, f, indent=2)

              print('Metrics saved to metrics.json')
              print(json.dumps(metrics, indent=2))
          except Exception as e:
              print(f'Error generating metrics: {e}')
          "

      - name: Upload evaluation artifacts
        uses: actions/upload-artifact@v4
        with:
          name: comprehensive-eval-results
          path: |
            layer4-results.xml
            eval-report.json
            metrics.json

  # ==========================================================================
  # Quality Gate Check
  # ==========================================================================
  quality-gate:
    name: Quality Gate
    runs-on: ubuntu-latest
    needs: [unit-tests, dspy-module-tests, integration-tests]
    if: always()

    steps:
      - name: Check all required tests passed
        run: |
          echo "Checking quality gates..."

          # Layer 1 (unit tests) is always required
          if [[ "${{ needs.unit-tests.result }}" != "success" ]]; then
            echo "❌ Layer 1 (Unit Tests) failed"
            exit 1
          fi
          echo "✅ Layer 1 (Unit Tests) passed"

          # Layer 2 (DSPy module tests) required for PRs
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            if [[ "${{ needs.dspy-module-tests.result }}" != "success" ]]; then
              echo "❌ Layer 2 (DSPy Module Tests) failed - required for PRs"
              exit 1
            fi
            echo "✅ Layer 2 (DSPy Module Tests) passed"
          fi

          # Layer 3 (integration tests) is warning-only for now
          if [[ "${{ needs.integration-tests.result }}" != "success" ]]; then
            echo "⚠️ Layer 3 (Integration Tests) failed - non-blocking"
          else
            echo "✅ Layer 3 (Integration Tests) passed"
          fi

          echo ""
          echo "============================================"
          echo "  All required quality gates passed!"
          echo "============================================"