DSPy Optimizer Configuration
Overview
DSPy optimizers automatically tune prompts and few-shot examples to improve metric scores. This document covers:
- MIPROv2 - Joint instruction + few-shot optimization
- BootstrapFewShot - Rapid few-shot example generation
- GEPA - Reflective prompt optimization (already partially implemented)
MIPROv2 Configuration
MIPROv2 (Multiprompt Instruction PRoposal Optimizer) uses Bayesian Optimization to find the best combination of instructions and few-shot examples.
Basic Setup
# backend/rag/evaluation/optimizer.py
import dspy
from dspy.teleprompt import MIPROv2, BootstrapFewShot

from .metrics import heritage_rag_metric
from .dataset_loader import load_golden_dataset


class HeritageRAGOptimizer:
    """Optimizer for Heritage RAG prompts and few-shot examples."""

    def __init__(
        self,
        metric=heritage_rag_metric,
        auto: str = "light",  # "light", "medium", or "heavy"
        max_bootstrapped_demos: int = 4,
        max_labeled_demos: int = 4,
    ):
        self.metric = metric
        self.auto = auto
        self.max_bootstrapped_demos = max_bootstrapped_demos
        self.max_labeled_demos = max_labeled_demos
    def optimize_with_miprov2(
        self,
        program: dspy.Module,
        trainset: list[dspy.Example],
        devset: list[dspy.Example],
    ) -> dspy.Module:
        """
        Optimize program using MIPROv2.

        Args:
            program: DSPy program to optimize
            trainset: Training examples (for few-shot bootstrapping)
            devset: Dev examples (for validation during optimization)

        Returns:
            Optimized program with improved prompts/demos
        """
        optimizer = MIPROv2(
            metric=self.metric,
            auto=self.auto,
            max_bootstrapped_demos=self.max_bootstrapped_demos,
            max_labeled_demos=self.max_labeled_demos,
            num_threads=4,
            verbose=True,
            log_dir="logs/miprov2",
            track_stats=True,
        )

        # Run optimization; candidates are validated against the dev set
        optimized_program = optimizer.compile(
            program,
            trainset=trainset,
            valset=devset,
        )
        return optimized_program
    def optimize_with_bootstrap(
        self,
        program: dspy.Module,
        trainset: list[dspy.Example],
        teacher_settings: dict | None = None,
    ) -> dspy.Module:
        """
        Quick optimization using BootstrapFewShot.

        Faster than MIPROv2; good for initial few-shot collection.
        """
        optimizer = BootstrapFewShot(
            metric=self.metric,
            max_bootstrapped_demos=self.max_bootstrapped_demos,
            max_labeled_demos=self.max_labeled_demos,
            max_rounds=1,
            max_errors=5,
            # Teacher settings (e.g. a stronger LM) are passed at construction time
            teacher_settings=teacher_settings or {},
        )
        return optimizer.compile(program, trainset=trainset)
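For example, bootstrapping demos with a stronger teacher model might look like the following. This is a minimal sketch, not the canonical invocation: the teacher model name and the `{"lm": ...}` teacher-settings shape are assumptions layered on top of the class above.

# Sketch: bootstrap few-shot demos using a stronger teacher LM (assumed setup).
import dspy

from backend.rag.dspy_heritage_rag import HeritageRAGPipeline
from backend.rag.evaluation.dataset_loader import load_golden_dataset, split_dataset
from backend.rag.evaluation.optimizer import HeritageRAGOptimizer

# Student runs on the cheap model; the teacher generates candidate demos.
dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))
teacher_lm = dspy.LM("openai/gpt-4o")

data = load_golden_dataset("data/rag_eval/golden_dataset.json")
trainset, _, _ = split_dataset(data, train=0.7, dev=0.15, test=0.15)

optimizer = HeritageRAGOptimizer(auto="light", max_bootstrapped_demos=2)
optimized = optimizer.optimize_with_bootstrap(
    program=HeritageRAGPipeline(),
    trainset=trainset,
    teacher_settings={"lm": teacher_lm},  # assumed: applied as the teacher's LM context
)
optimized.save("optimized_programs/heritage_rag_bootstrap.json")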
Optimization Presets
# config/optimizer_presets.yaml
presets:
  light:
    description: "Quick optimization (~5 min)"
    auto: light
    max_bootstrapped_demos: 2
    max_labeled_demos: 2
    num_candidates: 5
    use_cases:
      - "Quick iteration during development"
      - "Testing new prompt ideas"

  medium:
    description: "Balanced optimization (~30 min)"
    auto: medium
    max_bootstrapped_demos: 4
    max_labeled_demos: 4
    num_candidates: 10
    use_cases:
      - "Weekly optimization runs"
      - "Before major releases"

  heavy:
    description: "Thorough optimization (~2 hours)"
    auto: heavy
    max_bootstrapped_demos: 8
    max_labeled_demos: 8
    num_candidates: 20
    use_cases:
      - "Major version updates"
      - "Comprehensive prompt overhaul"

category_specific:
  count_queries:
    description: "Optimize COUNT query handling"
    target_signatures:
      - HeritageQueryIntent
      - GenerateSPARQL
    focus_metric: count_accuracy

  person_queries:
    description: "Optimize PERSON query handling"
    target_signatures:
      - PersonQueryRouter
      - PersonSPARQL
    focus_metric: person_extraction_accuracy
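A small loader can map a preset from this file onto the optimizer's constructor. The sketch below assumes PyYAML is installed; the `load_preset` helper is illustrative and not part of the existing codebase.

# Sketch: build a HeritageRAGOptimizer from a named preset in the YAML above.
from pathlib import Path

import yaml

from backend.rag.evaluation.optimizer import HeritageRAGOptimizer


def load_preset(name: str, path: str = "config/optimizer_presets.yaml") -> HeritageRAGOptimizer:
    config = yaml.safe_load(Path(path).read_text())
    preset = config["presets"][name]
    return HeritageRAGOptimizer(
        auto=preset["auto"],
        max_bootstrapped_demos=preset["max_bootstrapped_demos"],
        max_labeled_demos=preset["max_labeled_demos"],
    )


optimizer = load_preset("medium")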
Running Optimization
#!/usr/bin/env python
"""
scripts/optimize_rag.py

Run DSPy optimization on Heritage RAG prompts.
"""
import argparse
import json
from pathlib import Path
from datetime import datetime

import dspy

from backend.rag.dspy_heritage_rag import HeritageRAGPipeline
from backend.rag.evaluation.optimizer import HeritageRAGOptimizer
from backend.rag.evaluation.dataset_loader import load_golden_dataset, split_dataset


def main():
    parser = argparse.ArgumentParser(description="Optimize RAG prompts")
    parser.add_argument('--preset', default='light', choices=['light', 'medium', 'heavy'])
    parser.add_argument('--categories', nargs='+', help='Focus on specific categories')
    parser.add_argument('--output-dir', default='optimized_programs')
    parser.add_argument('--model', default='gpt-4o-mini', help='Optimization model')
    args = parser.parse_args()

    # Configure DSPy
    lm = dspy.LM(f"openai/{args.model}")
    dspy.configure(lm=lm)

    # Load dataset
    data = load_golden_dataset('data/rag_eval/golden_dataset.json')
    trainset, devset, _ = split_dataset(data, train=0.7, dev=0.15, test=0.15)

    if args.categories:
        trainset = [e for e in trainset if e.get('category') in args.categories]
        devset = [e for e in devset if e.get('category') in args.categories]

    print(f"Training set: {len(trainset)} examples")
    print(f"Dev set: {len(devset)} examples")

    # Create program
    program = HeritageRAGPipeline()

    # Baseline evaluation
    print("\n--- Baseline Evaluation ---")
    baseline_score = evaluate_program(program, devset)
    print(f"Baseline score: {baseline_score:.2%}")

    # Optimize
    print(f"\n--- Running {args.preset.upper()} optimization ---")
    optimizer = HeritageRAGOptimizer(auto=args.preset)
    optimized_program = optimizer.optimize_with_miprov2(
        program=program,
        trainset=trainset,
        devset=devset,
    )

    # Optimized evaluation
    print("\n--- Optimized Evaluation ---")
    optimized_score = evaluate_program(optimized_program, devset)
    print(f"Optimized score: {optimized_score:.2%}")
    print(f"Improvement: {(optimized_score - baseline_score) / baseline_score:.1%}")
    # Save optimized program
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = output_dir / f"heritage_rag_{args.preset}_{timestamp}.json"
    optimized_program.save(str(output_path))
    print(f"\nSaved optimized program to: {output_path}")
    # Save optimization report
    report = {
        "timestamp": timestamp,
        "preset": args.preset,
        "baseline_score": baseline_score,
        "optimized_score": optimized_score,
        "improvement": (optimized_score - baseline_score) / baseline_score,
        "trainset_size": len(trainset),
        "devset_size": len(devset),
        "categories": args.categories,
    }
    report_path = output_dir / f"report_{args.preset}_{timestamp}.json"
    with open(report_path, 'w') as f:
        json.dump(report, f, indent=2)
def evaluate_program(program, devset):
    """Evaluate program on devset."""
    from dspy import Evaluate
    from backend.rag.evaluation.metrics import heritage_rag_metric

    evaluator = Evaluate(
        devset=devset,
        metric=heritage_rag_metric,
        num_threads=4,
        display_progress=True,
    )
    return evaluator(program)


if __name__ == '__main__':
    main()
CLI Usage
# Quick optimization (light preset)
python scripts/optimize_rag.py --preset light
# Focus on COUNT queries
python scripts/optimize_rag.py --preset medium --categories count
# Heavy optimization for production
python scripts/optimize_rag.py --preset heavy --model gpt-4o
# Output to specific directory
python scripts/optimize_rag.py --output-dir optimized_programs/v2
Loading Optimized Programs
# In production code
from backend.rag.dspy_heritage_rag import HeritageRAGPipeline


def get_optimized_pipeline():
    """Load the latest optimized pipeline."""
    from pathlib import Path

    optimized_dir = Path("optimized_programs")
    if not optimized_dir.exists():
        return HeritageRAGPipeline()  # Fallback to default

    # Find latest optimization (timestamped filenames sort chronologically)
    files = sorted(optimized_dir.glob("heritage_rag_*.json"), reverse=True)
    if not files:
        return HeritageRAGPipeline()

    latest = files[0]
    print(f"Loading optimized program: {latest}")

    program = HeritageRAGPipeline()
    program.load(str(latest))
    return program
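In a serving layer the load only needs to happen once per process. The sketch below caches the loaded pipeline; the `answer_query` wrapper and the pipeline's `question`/`answer` call signature are assumptions, not part of the loader above.

# Sketch: load the optimized pipeline once and reuse it across requests.
from functools import lru_cache


@lru_cache(maxsize=1)
def get_pipeline_cached():
    # get_optimized_pipeline() is the loader defined above
    return get_optimized_pipeline()


def answer_query(question: str) -> str:
    pipeline = get_pipeline_cached()
    # Assumed call signature: the pipeline takes the user question as `question`
    # and returns a prediction with an `answer` field.
    prediction = pipeline(question=question)
    return prediction.answer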
Optimization Schedule
Automated Weekly Optimization
# .github/workflows/weekly-optimization.yml
name: Weekly RAG Optimization

on:
  schedule:
    - cron: '0 2 * * 0'  # Sunday 2 AM
  workflow_dispatch:

jobs:
  optimize:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Run optimization
        run: python scripts/optimize_rag.py --preset medium
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Upload optimized program
        uses: actions/upload-artifact@v4
        with:
          name: optimized-program
          path: optimized_programs/

      - name: Create PR with optimized prompts
        uses: peter-evans/create-pull-request@v5
        with:
          title: "chore: Weekly RAG prompt optimization"
          body: |
            Automated weekly optimization of RAG prompts.
            See optimization report in artifacts.
          branch: auto/weekly-optimization
Monitoring Optimization Progress
Metrics Tracking
# Track optimization metrics over time
import wandb


def log_optimization_metrics(report: dict):
    """Log optimization metrics to W&B."""
    wandb.init(
        project="heritage-rag-optimization",
        config=report,
    )
    wandb.log({
        "baseline_score": report["baseline_score"],
        "optimized_score": report["optimized_score"],
        "improvement": report["improvement"],
    })
    wandb.finish()
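The report JSON files written by scripts/optimize_rag.py can be replayed through this logger after each run. A minimal sketch, assuming the reports live under optimized_programs/ as produced by the script:

# Sketch: push every saved optimization report to W&B.
import json
from pathlib import Path

for report_path in sorted(Path("optimized_programs").rglob("report_*.json")):
    with open(report_path) as f:
        report = json.load(f)
    log_optimization_metrics(report)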
Version Control for Prompts
optimized_programs/
├── heritage_rag_light_20250109_150000.json
├── heritage_rag_medium_20250112_020000.json
├── heritage_rag_heavy_20250115_020000.json
├── reports/
│   ├── report_light_20250109_150000.json
│   ├── report_medium_20250112_020000.json
│   └── report_heavy_20250115_020000.json
└── current -> heritage_rag_medium_20250112_020000.json  # symlink
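With this layout, the loader can prefer the pinned current symlink and only fall back to the newest timestamped file when no symlink is present. The variant below is a sketch built on the directory structure above; the symlink handling itself is an assumption about how deployments pin a version.

# Sketch: prefer the pinned "current" symlink over the newest timestamped file.
from pathlib import Path

from backend.rag.dspy_heritage_rag import HeritageRAGPipeline


def get_pinned_pipeline(optimized_dir: str = "optimized_programs"):
    current = Path(optimized_dir) / "current"
    if current.exists():
        program = HeritageRAGPipeline()
        # resolve() follows the symlink to the timestamped JSON it points at
        program.load(str(current.resolve()))
        return program
    return get_optimized_pipeline()  # loader defined earlier in this document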