chore(ci): remove GitHub dspy-eval workflow (replaced by Forgejo workflow)
This commit is contained in:
parent
8470bf5860
commit
7e9df1d600
1 changed files with 0 additions and 355 deletions
355
.github/workflows/dspy-eval.yml
vendored
355
.github/workflows/dspy-eval.yml
vendored
|
|
@ -1,355 +0,0 @@
|
||||||
# DSPy RAG Evaluation Workflow
|
|
||||||
# Automated testing and evaluation for Heritage RAG system
|
|
||||||
#
|
|
||||||
# Layers:
|
|
||||||
# - Layer 1: Fast unit tests (no LLM)
|
|
||||||
# - Layer 2: DSPy module tests with LLM
|
|
||||||
# - Layer 3: Integration tests (requires SSH tunnel to Oxigraph)
|
|
||||||
# - Layer 4: Comprehensive evaluation (nightly)
|
|
||||||
|
|
||||||
# Display name of the workflow.
name: DSPy RAG Evaluation

# Triggers:
#   - push / pull_request against main, limited to the RAG backend, the
#     dspy_gitops tests and the extractor API paths
#   - manual dispatch with a selectable evaluation depth
#   - a nightly schedule
on:
  push:
    branches:
      - main
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'

  pull_request:
    branches:
      - main
    paths:
      - 'backend/rag/**'
      - 'tests/dspy_gitops/**'
      - 'src/glam_extractor/api/**'

  workflow_dispatch:
    inputs:
      evaluation_level:
        type: choice
        description: 'Evaluation depth'
        required: true
        default: 'standard'
        options:
          - smoke
          - standard
          - comprehensive

  schedule:
    # Nightly comprehensive evaluation at 2 AM UTC
    - cron: '0 2 * * *'
|
|
||||||
|
|
||||||
# Workflow-wide environment, visible to every job below.
env:
  # Interpreter version handed to actions/setup-python.
  PYTHON_VERSION: "3.11"
  # Host and login used by the SSH tunnel steps of the Oxigraph-backed jobs.
  SERVER_IP: "91.98.224.44"
  SERVER_USER: "root"
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
# ==========================================================================
|
|
||||||
# Layer 1: Fast Unit Tests (no LLM calls)
|
|
||||||
# ==========================================================================
|
|
||||||
# Layer 1 job: installs the package with its dev extras, runs the unit test
# module under the layer1 marker expression, and always publishes the JUnit
# report as an artifact.
unit-tests:
  name: Layer 1 - Unit Tests
  runs-on: ubuntu-latest
  timeout-minutes: 5

  steps:
    - uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        cache: 'pip'
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Install dependencies
      run: |
        pip install -e ".[dev]"
        pip install rapidfuzz

    - name: Run Layer 1 unit tests
      # Folded scalar: the lines below are joined into one pytest command line.
      run: >-
        pytest tests/dspy_gitops/test_layer1_unit.py
        -v --tb=short
        -m "layer1 or not (layer2 or layer3 or layer4)"
        --junit-xml=layer1-results.xml

    - name: Upload test results
      # always(): keep the report even when the test step failed.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: layer1-test-results
        path: layer1-results.xml
|
|
||||||
|
|
||||||
# ==========================================================================
|
|
||||||
# Layer 2: DSPy Module Tests (with LLM)
|
|
||||||
# ==========================================================================
|
|
||||||
# Layer 2 job: runs the DSPy module tests (needs ANTHROPIC_API_KEY), uploads
# the JUnit report, and on pull requests posts a summary comment built from
# that report.
dspy-module-tests:
  name: Layer 2 - DSPy Module Tests
  runs-on: ubuntu-latest
  timeout-minutes: 20
  needs: unit-tests

  # Run on PRs, scheduled runs, or manual triggers
  if: github.event_name == 'pull_request' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'

  # Least-privilege token: read the repository for checkout, write PR comments
  # for the summary step. Without an explicit grant the comment call fails on
  # repositories whose default GITHUB_TOKEN is read-only.
  permissions:
    contents: read
    pull-requests: write

  steps:
    - uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}
        cache: 'pip'

    - name: Install dependencies
      run: |
        pip install -e ".[dev]"
        pip install dspy-ai httpx rapidfuzz litellm

    - name: Run Layer 2 DSPy tests
      env:
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      run: |
        pytest tests/dspy_gitops/test_layer2_dspy.py \
          -v --tb=short \
          -m "layer2 or not (layer1 or layer3 or layer4)" \
          --junit-xml=layer2-results.xml

    - name: Upload test results
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: layer2-test-results
        path: layer2-results.xml

    - name: Comment PR with Layer 2 results
      # always(): a bare condition implies success(), which skipped this step
      # whenever the tests failed - exactly when the summary matters most.
      if: always() && github.event_name == 'pull_request'
      uses: actions/github-script@v7
      with:
        script: |
          const fs = require('fs');

          // Best-effort: a missing/unparseable report or a failed API call is
          // logged and must not fail the job on its own.
          try {
            const results = fs.readFileSync('layer2-results.xml', 'utf8');

            // Returns the first captured counter for `pattern`, or 0 if absent.
            const count = (pattern) => {
              const match = results.match(pattern);
              return match ? Number.parseInt(match[1], 10) : 0;
            };

            const tests = count(/tests="(\d+)"/);
            const failures = count(/failures="(\d+)"/);
            const errors = count(/errors="(\d+)"/);
            const skipped = count(/skipped="(\d+)"/);
            // Skipped tests did not pass; keep them out of the passed total.
            const passed = tests - failures - errors - skipped;
            const status = failures + errors > 0 ? '❌ FAILED' : '✅ PASSED';

            const body = [
              '## DSPy Layer 2 Evaluation Results',
              '',
              '| Metric | Value |',
              '|--------|-------|',
              '| Tests Passed | ' + passed + '/' + tests + ' |',
              '| Failures | ' + failures + ' |',
              '| Errors | ' + errors + ' |',
              '| Skipped | ' + skipped + ' |',
              '| Status | ' + status + ' |',
              '',
            ].join('\n');

            // Awaited so that a rejected request is caught below instead of
            // surfacing as an unhandled rejection after the try block exits.
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body,
            });
          } catch (e) {
            console.log('Could not parse results:', e);
          }
|
|
||||||
|
|
||||||
# ==========================================================================
|
|
||||||
# Layer 3: Integration Tests (requires SSH tunnel to Oxigraph)
|
|
||||||
# ==========================================================================
|
|
||||||
# Layer 3 job: opens an SSH tunnel from the runner's port 7878 to port 7878 on
# the server, checks that the SPARQL endpoint answers through it, runs the
# integration tests against that endpoint and always uploads the JUnit report.
integration-tests:
  name: Layer 3 - Integration Tests
  runs-on: ubuntu-latest
  timeout-minutes: 15
  needs: unit-tests

  steps:
    - uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}
        cache: 'pip'

    - name: Install dependencies
      run: |
        pip install -e ".[dev]"
        pip install httpx pytest-asyncio

    - name: Setup SSH for tunnel
      env:
        # Handed over via the environment so the key material is never
        # spliced into the generated shell script.
        DEPLOY_SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}
      run: |
        # Restrictive umask: ~/.ssh and the key file are private from creation.
        umask 077
        mkdir -p ~/.ssh
        printf '%s\n' "$DEPLOY_SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
        chmod 600 ~/.ssh/deploy_key
        # Record the server's host keys; the tunnel step verifies against them.
        # A scan failure is tolerated here because the strict check in the
        # next step then refuses to connect.
        # NOTE(review): ssh-keyscan trusts whoever answers; pinning the known
        # host key in repository configuration would be stronger.
        ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true

    - name: Create SSH tunnel to Oxigraph
      run: |
        # Create SSH tunnel: local port 7878 -> server localhost:7878
        # StrictHostKeyChecking=yes: refuse hosts missing from known_hosts.
        # ExitOnForwardFailure=yes: fail here if the forward cannot be set up.
        ssh -f -N -L 7878:127.0.0.1:7878 \
          -i ~/.ssh/deploy_key \
          -o StrictHostKeyChecking=yes \
          -o ExitOnForwardFailure=yes \
          ${{ env.SERVER_USER }}@${{ env.SERVER_IP }}

        # Poll the endpoint instead of sleeping a fixed time: returns as soon
        # as the tunnel answers, tolerates a slow start for up to 5 attempts.
        tunnel_ready=false
        for attempt in 1 2 3 4 5; do
          if curl -sf --max-time 10 "http://127.0.0.1:7878/query" \
              -H "Accept: application/sparql-results+json" \
              --data-urlencode "query=SELECT (1 AS ?test) WHERE {}"; then
            tunnel_ready=true
            break
          fi
          sleep 1
        done

        if [ "$tunnel_ready" != "true" ]; then
          echo "SSH tunnel failed"
          exit 1
        fi

        echo "SSH tunnel established successfully"

    - name: Run Layer 3 integration tests
      env:
        OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      run: |
        pytest tests/dspy_gitops/test_layer3_integration.py \
          -v --tb=short \
          -m "layer3 or not (layer1 or layer2 or layer4)" \
          --junit-xml=layer3-results.xml

    - name: Upload test results
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: layer3-test-results
        path: layer3-results.xml
|
|
||||||
|
|
||||||
# ==========================================================================
|
|
||||||
# Layer 4: Comprehensive Evaluation (nightly only)
|
|
||||||
# ==========================================================================
|
|
||||||
# Layer 4 job: runs the comprehensive evaluation through an SSH tunnel to the
# server's port 7878, condenses the pytest JSON report into metrics.json and
# uploads the reports. Runs only on the schedule or a manual 'comprehensive'
# dispatch.
comprehensive-eval:
  name: Layer 4 - Comprehensive Evaluation
  runs-on: ubuntu-latest
  timeout-minutes: 60
  needs: [unit-tests, dspy-module-tests, integration-tests]

  # Only run on schedule or manual trigger with 'comprehensive'
  if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.evaluation_level == 'comprehensive')

  steps:
    - uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}
        cache: 'pip'

    - name: Install dependencies
      run: |
        pip install -e ".[dev]"
        pip install dspy-ai httpx rapidfuzz pandas pytest-json-report litellm

    - name: Setup SSH for tunnel
      env:
        # Handed over via the environment so the key material is never
        # spliced into the generated shell script.
        DEPLOY_SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}
      run: |
        # Restrictive umask: ~/.ssh and the key file are private from creation.
        umask 077
        mkdir -p ~/.ssh
        printf '%s\n' "$DEPLOY_SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
        chmod 600 ~/.ssh/deploy_key
        # Record the server's host keys; the tunnel step verifies against them.
        # A scan failure is tolerated here because the strict check in the
        # next step then refuses to connect.
        # NOTE(review): ssh-keyscan trusts whoever answers; pinning the known
        # host key in repository configuration would be stronger.
        ssh-keyscan -H ${{ env.SERVER_IP }} >> ~/.ssh/known_hosts 2>/dev/null || true

    - name: Create SSH tunnel to Oxigraph
      run: |
        # StrictHostKeyChecking=yes: refuse hosts missing from known_hosts.
        # ExitOnForwardFailure=yes: fail here if the forward cannot be set up.
        ssh -f -N -L 7878:127.0.0.1:7878 \
          -i ~/.ssh/deploy_key \
          -o StrictHostKeyChecking=yes \
          -o ExitOnForwardFailure=yes \
          ${{ env.SERVER_USER }}@${{ env.SERVER_IP }}

        # Verify the tunnel before spending up to an hour on the evaluation;
        # poll rather than sleep a fixed time.
        tunnel_ready=false
        for attempt in 1 2 3 4 5; do
          if curl -sf --max-time 10 "http://127.0.0.1:7878/query" \
              -H "Accept: application/sparql-results+json" \
              --data-urlencode "query=SELECT (1 AS ?test) WHERE {}"; then
            tunnel_ready=true
            break
          fi
          sleep 1
        done

        if [ "$tunnel_ready" != "true" ]; then
          echo "SSH tunnel failed"
          exit 1
        fi

    - name: Run comprehensive evaluation
      env:
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        OXIGRAPH_ENDPOINT: "http://127.0.0.1:7878"
      run: |
        pytest tests/dspy_gitops/test_layer4_comprehensive.py \
          -v --tb=short \
          -m "layer4 or not (layer1 or layer2 or layer3)" \
          --junit-xml=layer4-results.xml \
          --json-report \
          --json-report-file=eval-report.json

    - name: Generate metrics summary
      # always(): summarise failing evaluations too; a bare step is skipped
      # as soon as pytest exits non-zero.
      if: always()
      run: |
        # Quoted heredoc: the shell leaves the Python source untouched.
        python - <<'PY'
        import json
        from datetime import datetime, timezone

        # Best-effort: a missing or malformed report is logged and must not
        # fail the job on its own.
        try:
            with open('eval-report.json') as f:
                report = json.load(f)

            summary = report.get('summary', {})
            metrics = {
                # Timezone-aware replacement for the deprecated utcnow().
                'timestamp': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
                'commit': '${{ github.sha }}',
                'total_tests': summary.get('total', 0),
                'passed': summary.get('passed', 0),
                'failed': summary.get('failed', 0),
                'duration': report.get('duration', 0),
            }

            with open('metrics.json', 'w') as f:
                json.dump(metrics, f, indent=2)

            print('Metrics saved to metrics.json')
            print(json.dumps(metrics, indent=2))
        except Exception as e:
            print(f'Error generating metrics: {e}')
        PY

    - name: Upload evaluation artifacts
      uses: actions/upload-artifact@v4
      # always(): matches the other layers' upload steps, so the reports of a
      # failed evaluation are kept.
      if: always()
      with:
        name: comprehensive-eval-results
        path: |
          layer4-results.xml
          eval-report.json
          metrics.json
|
|
||||||
|
|
||||||
# ==========================================================================
|
|
||||||
# Quality Gate Check
|
|
||||||
# ==========================================================================
|
|
||||||
# Final gate: evaluates the results of the three upstream jobs. Runs even when
# they failed or were skipped (always()) so that the verdict is always printed.
quality-gate:
  name: Quality Gate
  runs-on: ubuntu-latest
  needs:
    - unit-tests
    - dspy-module-tests
    - integration-tests
  if: always()

  steps:
    - name: Check all required tests passed
      env:
        # Job results and the triggering event, exposed as plain variables.
        EVENT_NAME: ${{ github.event_name }}
        LAYER1_RESULT: ${{ needs.unit-tests.result }}
        LAYER2_RESULT: ${{ needs.dspy-module-tests.result }}
        LAYER3_RESULT: ${{ needs.integration-tests.result }}
      run: |
        echo "Checking quality gates..."

        # Layer 1 (unit tests) is always required
        if [ "$LAYER1_RESULT" != "success" ]; then
          echo "❌ Layer 1 (Unit Tests) failed"
          exit 1
        fi
        echo "✅ Layer 1 (Unit Tests) passed"

        # Layer 2 (DSPy module tests) required for PRs
        if [ "$EVENT_NAME" = "pull_request" ]; then
          if [ "$LAYER2_RESULT" != "success" ]; then
            echo "❌ Layer 2 (DSPy Module Tests) failed - required for PRs"
            exit 1
          fi
          echo "✅ Layer 2 (DSPy Module Tests) passed"
        fi

        # Layer 3 (integration tests) is warning-only for now
        if [ "$LAYER3_RESULT" = "success" ]; then
          echo "✅ Layer 3 (Integration Tests) passed"
        else
          echo "⚠️ Layer 3 (Integration Tests) failed - non-blocking"
        fi

        echo ""
        echo "============================================"
        echo " All required quality gates passed!"
        echo "============================================"
|
|
||||||
Loading…
Reference in a new issue