# DSPy Optimizer Configuration

## Overview

DSPy optimizers automatically tune prompts and few-shot examples to improve metric scores. This document covers:

1. MIPROv2 - Joint instruction + few-shot optimization
2. BootstrapFewShot - Rapid few-shot example generation
3. GEPA - Reflective prompt optimization (already partially implemented)

## MIPROv2 Configuration

MIPROv2 (Multiprompt Instruction PRoposal Optimizer) uses Bayesian Optimization to find the best combination of instructions and few-shot examples.

### Basic Setup

```python
# backend/rag/evaluation/optimizer.py

import dspy
from dspy.teleprompt import MIPROv2, BootstrapFewShot

from .metrics import heritage_rag_metric
from .dataset_loader import load_golden_dataset

class HeritageRAGOptimizer:
    """Optimizer for Heritage RAG prompts and few-shot examples.

    Thin wrapper around DSPy teleprompters that fixes the metric and demo
    budgets once, then exposes two strategies: MIPROv2 (thorough) and
    BootstrapFewShot (fast).
    """

    # Budgets accepted by MIPROv2's "auto" mode.
    VALID_AUTO_MODES = ("light", "medium", "heavy")

    def __init__(
        self,
        metric=heritage_rag_metric,
        auto: str = "light",  # "light", "medium", or "heavy"
        max_bootstrapped_demos: int = 4,
        max_labeled_demos: int = 4,
    ):
        # Fail fast on a bad budget instead of erroring deep inside MIPROv2.
        if auto not in self.VALID_AUTO_MODES:
            raise ValueError(
                f"auto must be one of {self.VALID_AUTO_MODES}, got {auto!r}"
            )
        self.metric = metric
        self.auto = auto
        self.max_bootstrapped_demos = max_bootstrapped_demos
        self.max_labeled_demos = max_labeled_demos

    def optimize_with_miprov2(
        self,
        program: dspy.Module,
        trainset: list[dspy.Example],
        devset: list[dspy.Example],
    ) -> dspy.Module:
        """
        Optimize program using MIPROv2.

        Args:
            program: DSPy program to optimize
            trainset: Training examples (for few-shot bootstrapping)
            devset: Dev examples (for validation during optimization)

        Returns:
            Optimized program with improved prompts/demos
        """
        optimizer = MIPROv2(
            metric=self.metric,
            auto=self.auto,
            max_bootstrapped_demos=self.max_bootstrapped_demos,
            max_labeled_demos=self.max_labeled_demos,
            num_threads=4,
            verbose=True,
            log_dir="logs/miprov2",
            track_stats=True,
        )

        # Run optimization.
        # NOTE(review): recent DSPy releases accept valset= on compile()
        # rather than eval_kwargs — confirm against the pinned dspy version.
        optimized_program = optimizer.compile(
            program,
            trainset=trainset,
            eval_kwargs={"devset": devset},
        )

        return optimized_program

    def optimize_with_bootstrap(
        self,
        program: dspy.Module,
        trainset: list[dspy.Example],
        teacher_settings: dict | None = None,
    ) -> dspy.Module:
        """
        Quick optimization using BootstrapFewShot.

        Faster than MIPROv2, good for initial few-shot collection.

        Args:
            program: DSPy program to optimize
            trainset: Training examples to bootstrap demos from
            teacher_settings: Optional LM settings for the teacher model

        Returns:
            Program compiled with bootstrapped few-shot demos
        """
        optimizer = BootstrapFewShot(
            metric=self.metric,
            max_bootstrapped_demos=self.max_bootstrapped_demos,
            max_labeled_demos=self.max_labeled_demos,
            max_rounds=1,
            max_errors=5,
            # Pass teacher settings through the constructor; mutating the
            # attribute after construction (the previous approach) relies
            # on an implementation detail rather than the public API.
            teacher_settings=teacher_settings or {},
        )

        return optimizer.compile(program, trainset=trainset)
```

### Optimization Presets

```yaml
# config/optimizer_presets.yaml

presets:
  light:
    description: "Quick optimization (~5 min)"
    auto: light
    max_bootstrapped_demos: 2
    max_labeled_demos: 2
    num_candidates: 5
    use_cases:
      - "Quick iteration during development"
      - "Testing new prompt ideas"

  medium:
    description: "Balanced optimization (~30 min)"
    auto: medium
    max_bootstrapped_demos: 4
    max_labeled_demos: 4
    num_candidates: 10
    use_cases:
      - "Weekly optimization runs"
      - "Before major releases"

  heavy:
    description: "Thorough optimization (~2 hours)"
    auto: heavy
    max_bootstrapped_demos: 8
    max_labeled_demos: 8
    num_candidates: 20
    use_cases:
      - "Major version updates"
      - "Comprehensive prompt overhaul"

category_specific:
  count_queries:
    description: "Optimize COUNT query handling"
    target_signatures:
      - HeritageQueryIntent
      - GenerateSPARQL
    focus_metric: count_accuracy

  person_queries:
    description: "Optimize PERSON query handling"
    target_signatures:
      - PersonQueryRouter
      - PersonSPARQL
    focus_metric: person_extraction_accuracy
```
### Running Optimization

```python
#!/usr/bin/env python
"""
scripts/optimize_rag.py

Run DSPy optimization on Heritage RAG prompts.
"""

import argparse
import json
from pathlib import Path
from datetime import datetime

import dspy
from backend.rag.dspy_heritage_rag import HeritageRAGPipeline
from backend.rag.evaluation.optimizer import HeritageRAGOptimizer
from backend.rag.evaluation.dataset_loader import load_golden_dataset, split_dataset

def relative_improvement(baseline: float, optimized: float) -> float:
    """Return the relative gain of *optimized* over *baseline*.

    Returns 0.0 when the baseline is 0 — the original expression
    ``(optimized - baseline) / baseline`` raised ZeroDivisionError in
    that case, and a relative comparison against 0 is meaningless.
    """
    if baseline == 0:
        return 0.0
    return (optimized - baseline) / baseline


def main():
    """Run DSPy optimization on the Heritage RAG pipeline and save results."""
    parser = argparse.ArgumentParser(description="Optimize RAG prompts")
    parser.add_argument('--preset', default='light', choices=['light', 'medium', 'heavy'])
    parser.add_argument('--categories', nargs='+', help='Focus on specific categories')
    parser.add_argument('--output-dir', default='optimized_programs')
    parser.add_argument('--model', default='gpt-4o-mini', help='Optimization model')

    args = parser.parse_args()

    # Configure DSPy with the requested optimization model.
    lm = dspy.LM(f"openai/{args.model}")
    dspy.configure(lm=lm)

    # Load dataset and split 70/15/15; the test split is held out here.
    data = load_golden_dataset('data/rag_eval/golden_dataset.json')
    trainset, devset, _ = split_dataset(data, train=0.7, dev=0.15, test=0.15)

    # Optionally narrow both splits to the requested categories.
    if args.categories:
        trainset = [e for e in trainset if e.get('category') in args.categories]
        devset = [e for e in devset if e.get('category') in args.categories]

    print(f"Training set: {len(trainset)} examples")
    print(f"Dev set: {len(devset)} examples")

    # Create program
    program = HeritageRAGPipeline()

    # Baseline evaluation
    print("\n--- Baseline Evaluation ---")
    baseline_score = evaluate_program(program, devset)
    print(f"Baseline score: {baseline_score:.2%}")

    # Optimize
    print(f"\n--- Running {args.preset.upper()} optimization ---")
    optimizer = HeritageRAGOptimizer(auto=args.preset)
    optimized_program = optimizer.optimize_with_miprov2(
        program=program,
        trainset=trainset,
        devset=devset,
    )

    # Optimized evaluation
    print("\n--- Optimized Evaluation ---")
    optimized_score = evaluate_program(optimized_program, devset)
    improvement = relative_improvement(baseline_score, optimized_score)
    print(f"Optimized score: {optimized_score:.2%}")
    print(f"Improvement: {improvement:.1%}")

    # Save optimized program. parents=True so nested paths such as
    # "optimized_programs/v2" (see CLI usage examples) work.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = output_dir / f"heritage_rag_{args.preset}_{timestamp}.json"

    optimized_program.save(str(output_path))
    print(f"\nSaved optimized program to: {output_path}")

    # Save the optimization report under reports/, matching the documented
    # optimized_programs/ directory layout.
    report = {
        "timestamp": timestamp,
        "preset": args.preset,
        "baseline_score": baseline_score,
        "optimized_score": optimized_score,
        "improvement": improvement,
        "trainset_size": len(trainset),
        "devset_size": len(devset),
        "categories": args.categories,
    }

    reports_dir = output_dir / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)
    report_path = reports_dir / f"report_{args.preset}_{timestamp}.json"
    with open(report_path, 'w') as f:
        json.dump(report, f, indent=2)
def evaluate_program(program, devset):
    """Score *program* on *devset* with the heritage RAG metric.

    Returns the aggregate score reported by ``dspy.Evaluate``.
    """
    from dspy import Evaluate
    from backend.rag.evaluation.metrics import heritage_rag_metric

    run_eval = Evaluate(
        devset=devset,
        metric=heritage_rag_metric,
        num_threads=4,
        display_progress=True,
    )
    return run_eval(program)
# Script entry point: parse CLI args and run the optimization workflow.
if __name__ == '__main__':
    main()
```

### CLI Usage

```bash
# Quick optimization (light preset)
python scripts/optimize_rag.py --preset light

# Focus on COUNT queries
python scripts/optimize_rag.py --preset medium --categories count

# Heavy optimization for production
python scripts/optimize_rag.py --preset heavy --model gpt-4o

# Output to specific directory
python scripts/optimize_rag.py --output-dir optimized_programs/v2
```

## Loading Optimized Programs

```python
# In production code

from backend.rag.dspy_heritage_rag import HeritageRAGPipeline

def get_optimized_pipeline():
    """Load the latest optimized pipeline.

    Falls back to a freshly constructed (unoptimized) pipeline when no
    saved optimization exists.
    """
    from pathlib import Path

    optimized_dir = Path("optimized_programs")
    if not optimized_dir.exists():
        return HeritageRAGPipeline()  # Fallback to default

    files = list(optimized_dir.glob("heritage_rag_*.json"))
    if not files:
        return HeritageRAGPipeline()

    # Filenames look like heritage_rag_{preset}_{YYYYmmdd_HHMMSS}.json.
    # Sorting whole names lexicographically (the previous approach)
    # compares the preset segment before the timestamp, so e.g. any
    # "light" file always outranked a newer "medium" one. Compare only
    # the trailing timestamp (last 15 characters of the stem) instead.
    latest = max(files, key=lambda p: p.stem[-15:])
    print(f"Loading optimized program: {latest}")

    program = HeritageRAGPipeline()
    program.load(str(latest))
    return program
```

## Optimization Schedule

### Automated Weekly Optimization

```yaml
# .github/workflows/weekly-optimization.yml
name: Weekly RAG Optimization

on:
  schedule:
    - cron: '0 2 * * 0'  # Sunday 2 AM
  workflow_dispatch:

jobs:
  optimize:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Run optimization
        run: python scripts/optimize_rag.py --preset medium
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Upload optimized program
        uses: actions/upload-artifact@v4
        with:
          name: optimized-program
          path: optimized_programs/

      - name: Create PR with optimized prompts
        uses: peter-evans/create-pull-request@v5
        with:
          title: "chore: Weekly RAG prompt optimization"
          body: |
            Automated weekly optimization of RAG prompts.

            See optimization report in artifacts.
          branch: auto/weekly-optimization
```

## Monitoring Optimization Progress

### Metrics Tracking

```python
# Track optimization metrics over time

import wandb

def log_optimization_metrics(report: dict):
    """Log optimization metrics to W&B."""
    # Pull out just the score fields; the full report goes in as config.
    scores = {
        key: report[key]
        for key in ("baseline_score", "optimized_score", "improvement")
    }

    wandb.init(project="heritage-rag-optimization", config=report)
    wandb.log(scores)
    wandb.finish()
```

### Version Control for Prompts

```
optimized_programs/
├── heritage_rag_light_20250109_150000.json
├── heritage_rag_medium_20250112_020000.json
├── heritage_rag_heavy_20250115_020000.json
├── reports/
│   ├── report_light_20250109_150000.json
│   ├── report_medium_20250112_020000.json
│   └── report_heavy_20250115_020000.json
└── current -> heritage_rag_medium_20250112_020000.json  # symlink
```