glam/backend/rag/optimize_templates.py
2026-01-02 02:11:04 +01:00

1136 lines
48 KiB
Python

"""
DSPy Optimizer for Heritage RAG Template System
Optimizes the DSPy modules in template_sparql.py using DSPy's built-in
optimizers (MIPRO, BootstrapFewShot, COPRO) to improve:
- Template classification accuracy
- Slot extraction precision/recall
- Follow-up question resolution
- Fyke filter relevance detection
Based on:
- DSPy 2.6+ optimization API
- Formica et al. (2023) - Template SPARQL achieves 65% precision
- data/sparql_templates.yaml template definitions
Usage:
# Generate training data from templates
python -m backend.rag.optimize_templates generate-data
# Run optimization
python -m backend.rag.optimize_templates optimize --optimizer mipro
# Evaluate current model
python -m backend.rag.optimize_templates evaluate
# Export optimized prompts
python -m backend.rag.optimize_templates export
Author: OpenCode
Created: 2025-01-06
"""
from __future__ import annotations
import json
import logging
import random
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Literal, Optional
import yaml
logger = logging.getLogger(__name__)
# =============================================================================
# PATHS
# =============================================================================
# YAML file defining the SPARQL templates, slot types, fyke-filter config and
# follow-up patterns (repo-relative: <repo>/data/sparql_templates.yaml).
TEMPLATES_PATH = Path(__file__).parent.parent.parent / "data" / "sparql_templates.yaml"
# Where `generate-data` writes (and `evaluate`/`optimize` read) the JSON dataset.
TRAINING_DATA_PATH = Path(__file__).parent.parent.parent / "data" / "training" / "template_training.json"
# Default output path for the (not yet implemented) `export` command.
OPTIMIZED_PROMPTS_PATH = Path(__file__).parent / "optimized_prompts.json"
# =============================================================================
# TRAINING DATA STRUCTURES
# =============================================================================
@dataclass
class TemplateExample:
    """A training example for template classification.

    Also reused for follow-up resolution examples, in which case the
    previous turn's question/slots carry the conversational context.
    """
    question: str                # natural-language user question
    template_id: str             # gold SPARQL template id
    slots: dict[str, str]        # gold slot values for that template
    language: str = "nl"         # "nl" or "en"
    is_follow_up: bool = False   # True when this turn depends on a prior turn
    previous_question: Optional[str] = None          # prior turn's question (follow-ups only)
    previous_slots: Optional[dict[str, str]] = None  # prior turn's slots (follow-ups only)
@dataclass
class FykeExample:
    """A training example for Fyke filter (heritage-relevance gate)."""
    question: str      # user question to classify
    is_relevant: bool  # True if the question is in scope for heritage RAG
    reasoning: str     # short human rationale for the label
@dataclass
class SlotExample:
    """A training example for slot extraction."""
    question: str                    # natural-language user question
    template_id: str                 # template whose slots must be filled
    expected_slots: dict[str, str]   # gold slot name -> value mapping
    language: str = "nl"             # "nl" or "en"
@dataclass
class TrainingDataset:
    """Complete training dataset for all DSPy modules.

    Serializes to/from a single JSON document via :meth:`to_dict` and
    :meth:`from_dict`; `metadata` is enriched with counts and a UTC
    timestamp on serialization.
    """
    template_examples: list[TemplateExample] = field(default_factory=list)
    fyke_examples: list[FykeExample] = field(default_factory=list)
    slot_examples: list[SlotExample] = field(default_factory=list)
    follow_up_examples: list[TemplateExample] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @staticmethod
    def _template_example_to_dict(ex: "TemplateExample") -> dict[str, Any]:
        """Serialize one TemplateExample.

        Shared by `template_examples` and `follow_up_examples` (both hold
        TemplateExample instances) so the two JSON shapes cannot drift apart.
        """
        return {
            "question": ex.question,
            "template_id": ex.template_id,
            "slots": ex.slots,
            "language": ex.language,
            "is_follow_up": ex.is_follow_up,
            "previous_question": ex.previous_question,
            "previous_slots": ex.previous_slots,
        }

    def to_dict(self) -> dict[str, Any]:
        """Convert to JSON-serializable dict (adds counts + UTC timestamp)."""
        return {
            "metadata": {
                **self.metadata,
                "generated_at": datetime.now(timezone.utc).isoformat(),
                "template_count": len(self.template_examples),
                "fyke_count": len(self.fyke_examples),
                "slot_count": len(self.slot_examples),
                "follow_up_count": len(self.follow_up_examples),
            },
            "template_examples": [
                self._template_example_to_dict(ex) for ex in self.template_examples
            ],
            "fyke_examples": [
                {
                    "question": ex.question,
                    "is_relevant": ex.is_relevant,
                    "reasoning": ex.reasoning,
                }
                for ex in self.fyke_examples
            ],
            "slot_examples": [
                {
                    "question": ex.question,
                    "template_id": ex.template_id,
                    "expected_slots": ex.expected_slots,
                    "language": ex.language,
                }
                for ex in self.slot_examples
            ],
            "follow_up_examples": [
                self._template_example_to_dict(ex) for ex in self.follow_up_examples
            ],
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "TrainingDataset":
        """Load from JSON dict (inverse of :meth:`to_dict`).

        NOTE(review): relies on the JSON keys matching the dataclass field
        names exactly; unknown keys would raise TypeError.
        """
        return cls(
            metadata=data.get("metadata", {}),
            template_examples=[
                TemplateExample(**ex) for ex in data.get("template_examples", [])
            ],
            fyke_examples=[
                FykeExample(**ex) for ex in data.get("fyke_examples", [])
            ],
            slot_examples=[
                SlotExample(**ex) for ex in data.get("slot_examples", [])
            ],
            follow_up_examples=[
                TemplateExample(**ex) for ex in data.get("follow_up_examples", [])
            ],
        )
# =============================================================================
# DATA GENERATION
# =============================================================================
class TrainingDataGenerator:
    """Generates training data from SPARQL templates YAML.

    Expands the question patterns in TEMPLATES_PATH with concrete
    cities/regions/countries/institution types, and adds hand-written
    examples for the Fyke filter, slot extraction and follow-up resolution.

    NOTE(review): `random.sample` is used without seeding, so the generated
    dataset differs between runs — confirm whether reproducibility matters.
    """

    def __init__(self):
        # All four attributes are populated by load_templates().
        self.templates: dict[str, Any] = {}
        self.slot_types: dict[str, Any] = {}
        self.fyke_config: dict[str, Any] = {}
        self.follow_up_patterns: dict[str, Any] = {}

    def load_templates(self) -> None:
        """Load templates from YAML file.

        Raises:
            FileNotFoundError: if TEMPLATES_PATH does not exist.
        """
        if not TEMPLATES_PATH.exists():
            raise FileNotFoundError(f"Templates not found: {TEMPLATES_PATH}")
        with open(TEMPLATES_PATH) as f:
            data = yaml.safe_load(f)
        self.templates = data.get("templates", {})
        self.slot_types = data.get("_slot_types", {})
        self.fyke_config = data.get("fyke_filter", {})
        self.follow_up_patterns = data.get("follow_up_patterns", {})
        logger.info(f"Loaded {len(self.templates)} templates")

    def generate_template_examples(self) -> list[TemplateExample]:
        """Generate template classification examples from patterns.

        Two sources per template: (1) explicit `examples` entries copied
        verbatim, and (2) `question_patterns` with placeholders substituted
        by the literal value pools below.
        """
        examples = []
        # City and region substitution values
        dutch_cities = [
            "Amsterdam", "Rotterdam", "Den Haag", "Utrecht", "Eindhoven",
            "Groningen", "Tilburg", "Almere", "Breda", "Nijmegen",
            "Enschede", "Haarlem", "Arnhem", "Zaanstad", "Amersfoort",
            "Apeldoorn", "Maastricht", "Leiden", "Dordrecht", "Zwolle",
        ]
        # (display name, ISO 3166-2 code) pairs — the code is the gold slot value.
        regions = [
            ("Noord-Holland", "NL-NH"),
            ("Zuid-Holland", "NL-ZH"),
            ("Noord-Brabant", "NL-NB"),
            ("Gelderland", "NL-GE"),
            ("Utrecht", "NL-UT"),
            ("Overijssel", "NL-OV"),
            ("Limburg", "NL-LI"),
            ("Friesland", "NL-FR"),
            ("Groningen", "NL-GR"),
            ("Drenthe", "NL-DR"),
            ("Flevoland", "NL-FL"),
            ("Zeeland", "NL-ZE"),
        ]
        # (display name, Wikidata QID) pairs.
        countries = [
            ("Nederland", "Q55"),
            ("Belgium", "Q31"),
            ("Germany", "Q183"),
            ("France", "Q142"),
        ]
        # (display name, GLAM type code) — M=museum, A=archive, L=library, G=gallery.
        institution_types = [
            ("musea", "M"),
            ("archieven", "A"),
            ("bibliotheken", "L"),
            ("galerijen", "G"),
            ("museums", "M"),
            ("archives", "A"),
            ("libraries", "L"),
        ]
        for template_id, template_data in self.templates.items():
            patterns = template_data.get("question_patterns", [])
            template_examples = template_data.get("examples", [])
            # Use explicit examples from template
            for ex in template_examples:
                examples.append(TemplateExample(
                    question=ex.get("question", ""),
                    template_id=template_id,
                    slots=ex.get("slots", {}),
                    # NOTE(review): accented-character check looks like a French/
                    # Romance heuristic, yet it labels the question "nl" — verify
                    # this is intentional and not an inverted condition.
                    language="nl" if any(c in ex.get("question", "") for c in "éèêëîïôùûüçàâäœæ") or any(w in ex.get("question", "").lower() for w in ["welke", "hoeveel", "wat"]) else "en",
                ))
            # Generate examples from patterns
            for pattern in patterns[:3]:  # Limit per pattern
                # Substitute placeholders
                if "{institution_type_nl}" in pattern or "{institution_type_en}" in pattern:
                    # NOTE(review): both NL and EN placeholders receive the same
                    # name, so an EN pattern may get a Dutch type word.
                    for inst_name, inst_code in institution_types[:3]:
                        question = pattern.replace("{institution_type_nl}", inst_name)
                        question = question.replace("{institution_type_en}", inst_name)
                        if "{city}" in question:
                            for city in random.sample(dutch_cities, min(3, len(dutch_cities))):
                                q = question.replace("{city}", city)
                                examples.append(TemplateExample(
                                    question=q,
                                    template_id=template_id,
                                    slots={"institution_type": inst_code, "city": city},
                                    language="nl" if "Welke" in q or "Hoeveel" in q else "en",
                                ))
                        elif "{region}" in question:
                            for region_name, region_code in random.sample(regions, min(2, len(regions))):
                                q = question.replace("{region}", region_name)
                                examples.append(TemplateExample(
                                    question=q,
                                    template_id=template_id,
                                    slots={"institution_type": inst_code, "region": region_code},
                                    language="nl" if "Welke" in q or "Hoeveel" in q else "en",
                                ))
                        elif "{country}" in question:
                            for country_name, country_code in random.sample(countries, min(2, len(countries))):
                                q = question.replace("{country}", country_name)
                                examples.append(TemplateExample(
                                    question=q,
                                    template_id=template_id,
                                    slots={"institution_type": inst_code, "country": country_code},
                                    language="nl" if "Welke" in q or "Hoeveel" in q else "en",
                                ))
                        elif "{location}" in question:
                            for city in random.sample(dutch_cities, min(2, len(dutch_cities))):
                                q = question.replace("{location}", city)
                                examples.append(TemplateExample(
                                    question=q,
                                    template_id=template_id,
                                    slots={"institution_type": inst_code, "location": city},
                                    language="nl" if "Welke" in q or "Hoeveel" in q else "en",
                                ))
                elif "{city}" in pattern:
                    for city in random.sample(dutch_cities, min(3, len(dutch_cities))):
                        q = pattern.replace("{city}", city)
                        examples.append(TemplateExample(
                            question=q,
                            template_id=template_id,
                            slots={"city": city},
                            language="nl" if "Welke" in q or "Wat" in q else "en",
                        ))
                elif "{institution_name}" in pattern:
                    institution_names = ["Rijksmuseum", "Nationaal Archief", "Koninklijke Bibliotheek", "Van Gogh Museum"]
                    for name in institution_names[:2]:
                        q = pattern.replace("{institution_name}", name)
                        examples.append(TemplateExample(
                            question=q,
                            template_id=template_id,
                            slots={"institution_name": name},
                            language="nl" if "Waar" in q or "Informatie" in q else "en",
                        ))
                elif "{identifier}" in pattern:
                    # ISIL codes (ISO 15511) used as concrete identifier values.
                    identifiers = ["NL-AmRMA", "NL-HaNA", "NL-DhHSA"]
                    for ident in identifiers[:2]:
                        q = pattern.replace("{identifier}", ident)
                        examples.append(TemplateExample(
                            question=q,
                            template_id=template_id,
                            slots={"identifier": ident, "identifier_type": "isil"},
                            language="nl" if "Welke" in q else "en",
                        ))
                elif "{location1}" in pattern and "{location2}" in pattern:
                    pairs = [("Amsterdam", "Rotterdam"), ("Den Haag", "Utrecht"), ("Groningen", "Maastricht")]
                    for loc1, loc2 in pairs[:2]:
                        q = pattern.replace("{location1}", loc1).replace("{location2}", loc2)
                        examples.append(TemplateExample(
                            question=q,
                            template_id=template_id,
                            slots={"location1": loc1, "location2": loc2},
                            language="nl" if "Vergelijk" in q else "en",
                        ))
        logger.info(f"Generated {len(examples)} template classification examples")
        return examples

    def generate_fyke_examples(self) -> list[FykeExample]:
        """Generate Fyke filter training examples.

        Hand-curated positives (heritage questions), negatives (clearly out
        of scope) and ambiguous edge cases, each with a labelling rationale.
        """
        examples = []
        # Relevant examples (heritage-related)
        relevant_questions = [
            ("Welke musea zijn er in Amsterdam?", "Contains heritage keyword 'musea' and asks about institutions"),
            ("Hoeveel archieven heeft Noord-Holland?", "Asks about archive count in a region"),
            ("Wat is het Rijksmuseum?", "Asks about a specific museum"),
            ("Welke bibliotheken zijn er in Den Haag?", "Contains heritage keyword 'bibliotheken'"),
            ("Erfgoedinstellingen in Utrecht", "Contains heritage keyword 'erfgoedinstellingen'"),
            ("What museums are in Rotterdam?", "English query about museums"),
            ("How many archives are there in the Netherlands?", "English query about archives"),
            ("Information about the National Archive", "Asks about heritage institution"),
            ("Cultural institutions in Maastricht", "Contains heritage keyword 'cultural institutions'"),
            ("Galerijen in Groningen", "Contains heritage keyword 'galerijen'"),
            ("Welke collecties heeft het Rijksmuseum?", "Asks about museum collections"),
            ("Tentoonstellingen in Amsterdam", "Asks about exhibitions"),
            ("GLAM instellingen in Nederland", "Contains GLAM acronym"),
            ("Heritage institutions in the Netherlands", "Contains heritage keyword"),
        ]
        for question, reasoning in relevant_questions:
            examples.append(FykeExample(
                question=question,
                is_relevant=True,
                reasoning=reasoning,
            ))
        # Irrelevant examples (out of scope)
        irrelevant_questions = [
            ("Waar kan ik tandpasta kopen?", "Shopping query unrelated to heritage"),
            ("Wat is het weer morgen in Amsterdam?", "Weather query"),
            ("Beste restaurants in Rotterdam", "Restaurant/dining query"),
            ("Voetbalwedstrijd Ajax vandaag", "Sports query"),
            ("How do I book a hotel in The Hague?", "Travel booking query"),
            ("Bitcoin price today", "Cryptocurrency query"),
            ("Recipe for apple pie", "Cooking query"),
            ("Where can I find a supermarket?", "Shopping query"),
            ("What is the capital of France?", "General knowledge, not heritage-specific"),
            ("How do I fix my computer?", "Technical support query"),
            ("Dating tips", "Personal advice query"),
            ("Best flight deals to Spain", "Travel booking query"),
        ]
        for question, reasoning in irrelevant_questions:
            examples.append(FykeExample(
                question=question,
                is_relevant=False,
                reasoning=reasoning,
            ))
        # Edge cases (could be ambiguous)
        edge_cases = [
            ("Buildings in Amsterdam", True, "Could refer to historic/heritage buildings"),
            ("History of Rotterdam", True, "Historical topic often relates to heritage"),
            ("Art in The Hague", True, "Art is heritage-related"),
            ("Books about Dutch history", True, "Library/archive related topic"),
            ("Where is the library?", True, "Asks about library location"),
            ("Opening hours", False, "Too vague without heritage context"),
            ("How old is it?", False, "Pronoun reference without context"),
        ]
        for question, is_relevant, reasoning in edge_cases:
            examples.append(FykeExample(
                question=question,
                is_relevant=is_relevant,
                reasoning=reasoning,
            ))
        logger.info(f"Generated {len(examples)} Fyke filter examples")
        return examples

    def generate_slot_examples(self) -> list[SlotExample]:
        """Generate slot extraction training examples.

        Hand-written (question, template_id, gold slots) triples grouped by
        slot category: institution type, name, identifier, and comparison.
        """
        examples = []
        # Institution type extraction
        type_questions = [
            ("Welke musea zijn er in Amsterdam?", "list_institutions_by_type_city", {"institution_type": "M", "city": "Amsterdam"}),
            ("Archieven in Den Haag", "list_institutions_by_type_city", {"institution_type": "A", "city": "Den Haag"}),
            ("What libraries are in Rotterdam?", "list_institutions_by_type_city", {"institution_type": "L", "city": "Rotterdam"}),
            ("Galerijen in Utrecht", "list_institutions_by_type_city", {"institution_type": "G", "city": "Utrecht"}),
            ("Museums in Noord-Holland", "list_institutions_by_type_region", {"institution_type": "M", "region": "NL-NH"}),
            ("Archives in Gelderland", "list_institutions_by_type_region", {"institution_type": "A", "region": "NL-GE"}),
            ("Bibliotheken in Nederland", "list_institutions_by_type_country", {"institution_type": "L", "country": "Q55"}),
            ("Hoeveel musea zijn er in Amsterdam?", "count_institutions_by_type_location", {"institution_type": "M", "location": "Amsterdam"}),
            ("Hoeveel archieven heeft Zuid-Holland?", "count_institutions_by_type_location", {"institution_type": "A", "location": "NL-ZH"}),
        ]
        for question, template_id, expected_slots in type_questions:
            examples.append(SlotExample(
                question=question,
                template_id=template_id,
                expected_slots=expected_slots,
                language="nl" if any(w in question.lower() for w in ["welke", "hoeveel", "wat"]) else "en",
            ))
        # Name extraction
        name_questions = [
            ("Waar is het Rijksmuseum?", "find_institution_by_name", {"institution_name": "Rijksmuseum"}),
            ("Informatie over het Nationaal Archief", "find_institution_by_name", {"institution_name": "Nationaal Archief"}),
            ("What is the Van Gogh Museum?", "find_institution_by_name", {"institution_name": "Van Gogh Museum"}),
            ("Tell me about Koninklijke Bibliotheek", "find_institution_by_name", {"institution_name": "Koninklijke Bibliotheek"}),
        ]
        for question, template_id, expected_slots in name_questions:
            examples.append(SlotExample(
                question=question,
                template_id=template_id,
                expected_slots=expected_slots,
                language="nl" if "Waar" in question or "Informatie" in question else "en",
            ))
        # Identifier extraction
        identifier_questions = [
            ("Welke instelling heeft ISIL NL-AmRMA?", "find_institution_by_identifier", {"identifier": "NL-AmRMA", "identifier_type": "isil"}),
            ("Zoek ISIL NL-HaNA", "find_institution_by_identifier", {"identifier": "NL-HaNA", "identifier_type": "isil"}),
            ("Find institution with ISIL DE-1", "find_institution_by_identifier", {"identifier": "DE-1", "identifier_type": "isil"}),
        ]
        for question, template_id, expected_slots in identifier_questions:
            examples.append(SlotExample(
                question=question,
                template_id=template_id,
                expected_slots=expected_slots,
                language="nl" if "Welke" in question or "Zoek" in question else "en",
            ))
        # Comparison extraction
        comparison_questions = [
            ("Vergelijk Amsterdam en Rotterdam", "compare_locations", {"location1": "Amsterdam", "location2": "Rotterdam"}),
            ("Amsterdam versus Utrecht", "compare_locations", {"location1": "Amsterdam", "location2": "Utrecht"}),
            # Note: gold slot uses the Dutch city name even for the English question.
            ("Compare The Hague and Leiden", "compare_locations", {"location1": "Den Haag", "location2": "Leiden"}),
        ]
        for question, template_id, expected_slots in comparison_questions:
            examples.append(SlotExample(
                question=question,
                template_id=template_id,
                expected_slots=expected_slots,
                language="nl" if "Vergelijk" in question else "en",
            ))
        logger.info(f"Generated {len(examples)} slot extraction examples")
        return examples

    def generate_follow_up_examples(self) -> list[TemplateExample]:
        """Generate follow-up question resolution examples.

        Each tuple carries (follow-up, resolved form, previous question,
        inherited slots, final slots). The resolved form is documentation
        only — the loops below do not consume it.
        """
        examples = []
        # Location swap follow-ups
        location_swaps = [
            # (follow-up, resolved, previous_question, inherited_slots, new_slots)
            ("En in Rotterdam?", "Welke musea zijn er in Rotterdam?",
             "Welke musea zijn er in Amsterdam?", {"institution_type": "M"}, {"institution_type": "M", "city": "Rotterdam"}),
            ("En in Utrecht?", "Welke archieven zijn er in Utrecht?",
             "Welke archieven zijn er in Den Haag?", {"institution_type": "A"}, {"institution_type": "A", "city": "Utrecht"}),
            ("What about Groningen?", "What libraries are in Groningen?",
             "What libraries are in Amsterdam?", {"institution_type": "L"}, {"institution_type": "L", "city": "Groningen"}),
            ("En Enschede?", "Welke bibliotheken zijn er in Enschede?",
             "Welke bibliotheken zijn er in Zwolle?", {"institution_type": "L"}, {"institution_type": "L", "city": "Enschede"}),
        ]
        for follow_up, resolved, prev_q, prev_slots, new_slots in location_swaps:
            # `resolved` is intentionally unused here (kept for readability).
            examples.append(TemplateExample(
                question=follow_up,
                template_id="list_institutions_by_type_city",
                slots=new_slots,
                language="nl" if "En" in follow_up else "en",
                is_follow_up=True,
                previous_question=prev_q,
                previous_slots=prev_slots,
            ))
        # Type swap follow-ups
        type_swaps = [
            ("En de archieven?", "Welke archieven zijn er in Amsterdam?",
             "Welke musea zijn er in Amsterdam?", {"city": "Amsterdam"}, {"institution_type": "A", "city": "Amsterdam"}),
            ("What about libraries?", "What libraries are in Rotterdam?",
             "What museums are in Rotterdam?", {"city": "Rotterdam"}, {"institution_type": "L", "city": "Rotterdam"}),
            ("En bibliotheken?", "Welke bibliotheken zijn er in Utrecht?",
             "Welke galerijen zijn er in Utrecht?", {"city": "Utrecht"}, {"institution_type": "L", "city": "Utrecht"}),
        ]
        for follow_up, resolved, prev_q, prev_slots, new_slots in type_swaps:
            examples.append(TemplateExample(
                question=follow_up,
                template_id="list_institutions_by_type_city",
                slots=new_slots,
                language="nl" if "En" in follow_up else "en",
                is_follow_up=True,
                previous_question=prev_q,
                previous_slots=prev_slots,
            ))
        # Count from list follow-ups
        count_follow_ups = [
            ("Hoeveel zijn dat?", "Hoeveel musea zijn er in Amsterdam?",
             "Welke musea zijn er in Amsterdam?", {"institution_type": "M", "city": "Amsterdam"}, {"institution_type": "M", "location": "Amsterdam"}),
            ("How many?", "How many archives are in Rotterdam?",
             "What archives are in Rotterdam?", {"institution_type": "A", "city": "Rotterdam"}, {"institution_type": "A", "location": "Rotterdam"}),
            ("Tel ze", "Hoeveel bibliotheken zijn er in Den Haag?",
             "Welke bibliotheken zijn er in Den Haag?", {"institution_type": "L", "city": "Den Haag"}, {"institution_type": "L", "location": "Den Haag"}),
        ]
        for follow_up, resolved, prev_q, prev_slots, new_slots in count_follow_ups:
            examples.append(TemplateExample(
                question=follow_up,
                template_id="count_institutions_by_type_location",
                slots=new_slots,
                language="nl" if "Hoeveel" in follow_up or "Tel" in follow_up else "en",
                is_follow_up=True,
                previous_question=prev_q,
                previous_slots=prev_slots,
            ))
        logger.info(f"Generated {len(examples)} follow-up examples")
        return examples

    def generate_dataset(self) -> TrainingDataset:
        """Generate complete training dataset (loads the YAML first)."""
        self.load_templates()
        return TrainingDataset(
            template_examples=self.generate_template_examples(),
            fyke_examples=self.generate_fyke_examples(),
            slot_examples=self.generate_slot_examples(),
            follow_up_examples=self.generate_follow_up_examples(),
            metadata={
                "source": str(TEMPLATES_PATH),
                "template_count": len(self.templates),
            },
        )
# =============================================================================
# EVALUATION METRICS
# =============================================================================
@dataclass
class EvaluationResult:
    """Results from evaluating a DSPy module.

    `errors` holds per-example failure records; `to_dict` truncates them to
    the first ten while still reporting the full error count.
    """
    module_name: str
    accuracy: float
    precision: float
    recall: float
    f1_score: float
    total_examples: int
    correct: int
    errors: list[dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict (error samples capped at 10)."""
        summary: dict[str, Any] = {
            "module_name": self.module_name,
            "accuracy": self.accuracy,
            "precision": self.precision,
            "recall": self.recall,
            "f1_score": self.f1_score,
            "total_examples": self.total_examples,
            "correct": self.correct,
        }
        summary["error_count"] = len(self.errors)
        # Keep the payload small: only a sample of the recorded errors.
        summary["errors"] = self.errors[:10]
        return summary
class TemplateEvaluator:
    """Evaluates template classification and slot extraction.

    Wraps the project's TemplateSPARQLPipeline (loaded lazily so importing
    this module does not pull in DSPy/LLM machinery).
    """

    def __init__(self):
        # Created on first use by _get_pipeline().
        self.pipeline = None

    def _get_pipeline(self):
        """Lazy load pipeline (relative import deferred to first call)."""
        if self.pipeline is None:
            from .template_sparql import TemplateSPARQLPipeline
            self.pipeline = TemplateSPARQLPipeline()
        return self.pipeline

    def evaluate_template_classification(
        self,
        examples: list[TemplateExample],
        sample_size: Optional[int] = None,
    ) -> EvaluationResult:
        """Evaluate template classification accuracy.

        Args:
            examples: gold (question, template_id) examples.
            sample_size: if set and smaller than the pool, evaluate a random
                subsample (unseeded).
        """
        pipeline = self._get_pipeline()
        if sample_size and sample_size < len(examples):
            examples = random.sample(examples, sample_size)
        correct = 0
        errors = []
        for ex in examples:
            try:
                result = pipeline.template_classifier.forward(
                    question=ex.question,
                    language=ex.language,
                )
                predicted = result.template_id
                expected = ex.template_id
                if predicted == expected:
                    correct += 1
                else:
                    errors.append({
                        "question": ex.question,
                        "expected": expected,
                        "predicted": predicted,
                        "confidence": result.confidence,
                    })
            except Exception as e:
                # A classifier crash counts as a miss, recorded with the error text.
                errors.append({
                    "question": ex.question,
                    "expected": ex.template_id,
                    "error": str(e),
                })
        total = len(examples)
        accuracy = correct / total if total > 0 else 0.0
        return EvaluationResult(
            module_name="TemplateClassifier",
            accuracy=accuracy,
            # Micro-averaged single-label classification: precision/recall/F1
            # all collapse to accuracy.
            precision=accuracy,  # For classification, precision = accuracy
            recall=accuracy,
            f1_score=accuracy,
            total_examples=total,
            correct=correct,
            errors=errors,
        )

    def evaluate_slot_extraction(
        self,
        examples: list[SlotExample],
        sample_size: Optional[int] = None,
    ) -> EvaluationResult:
        """Evaluate slot extraction precision and recall.

        Counts per-slot matches (not per-example). A predicted value matches
        if it equals the gold value case-insensitively or either string
        contains the other (fuzzy substring match).
        """
        pipeline = self._get_pipeline()
        if sample_size and sample_size < len(examples):
            examples = random.sample(examples, sample_size)
        total_expected = 0
        total_predicted = 0
        total_correct = 0
        errors = []
        for ex in examples:
            try:
                predicted_slots, _ = pipeline.slot_extractor.forward(
                    question=ex.question,
                    template_id=ex.template_id,
                    inherited_slots=None,
                )
                expected_slots = ex.expected_slots
                # Count slot matches
                # NOTE(review): extra predicted slots (keys not in the gold set)
                # are never counted, so precision is computed only over gold keys.
                for key, expected_value in expected_slots.items():
                    total_expected += 1
                    if key in predicted_slots:
                        total_predicted += 1
                        # Normalize for comparison
                        pred_val = str(predicted_slots[key]).strip().lower()
                        exp_val = str(expected_value).strip().lower()
                        if pred_val == exp_val or pred_val in exp_val or exp_val in pred_val:
                            total_correct += 1
                        else:
                            errors.append({
                                "question": ex.question,
                                "slot": key,
                                "expected": expected_value,
                                "predicted": predicted_slots[key],
                            })
                    else:
                        # Missing slot: hurts recall only.
                        errors.append({
                            "question": ex.question,
                            "slot": key,
                            "expected": expected_value,
                            "predicted": None,
                        })
            except Exception as e:
                errors.append({
                    "question": ex.question,
                    "error": str(e),
                })
        precision = total_correct / total_predicted if total_predicted > 0 else 0.0
        recall = total_correct / total_expected if total_expected > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        return EvaluationResult(
            module_name="SlotExtractor",
            # No single-label accuracy exists here; F1 is reported in its place.
            accuracy=f1,
            precision=precision,
            recall=recall,
            f1_score=f1,
            total_examples=len(examples),
            correct=total_correct,
            errors=errors,
        )

    def evaluate_fyke_filter(
        self,
        examples: list[FykeExample],
        sample_size: Optional[int] = None,
    ) -> EvaluationResult:
        """Evaluate Fyke filter accuracy.

        Binary relevance classification with "relevant" as the positive
        class; reports accuracy plus precision/recall/F1 from TP/FP/FN.
        """
        pipeline = self._get_pipeline()
        if sample_size and sample_size < len(examples):
            examples = random.sample(examples, sample_size)
        correct = 0
        errors = []
        true_positives = 0
        false_positives = 0
        false_negatives = 0
        for ex in examples:
            try:
                result = pipeline.fyke_filter.forward(
                    resolved_question=ex.question,
                    # NOTE(review): topic and language are fixed for every example,
                    # including the English ones — confirm that is intended.
                    conversation_topic="heritage institutions",
                    language="nl",
                )
                predicted = result.is_relevant
                expected = ex.is_relevant
                if predicted == expected:
                    correct += 1
                    if expected:
                        true_positives += 1
                else:
                    errors.append({
                        "question": ex.question,
                        "expected": expected,
                        "predicted": predicted,
                        "reasoning": result.reasoning,
                    })
                    if predicted and not expected:
                        false_positives += 1
                    elif not predicted and expected:
                        false_negatives += 1
            except Exception as e:
                errors.append({
                    "question": ex.question,
                    "expected": ex.is_relevant,
                    "error": str(e),
                })
        total = len(examples)
        accuracy = correct / total if total > 0 else 0.0
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        return EvaluationResult(
            module_name="FykeFilter",
            accuracy=accuracy,
            precision=precision,
            recall=recall,
            f1_score=f1,
            total_examples=total,
            correct=correct,
            errors=errors,
        )
# =============================================================================
# DSPy OPTIMIZATION
# =============================================================================
class TemplateOptimizer:
    """Optimizes DSPy modules using MIPRO or BootstrapFewShot.

    Optimizer selection degrades gracefully: MIPROv2 -> MIPRO ->
    BootstrapFewShot depending on the installed DSPy version.
    """

    def __init__(self, optimizer_type: Literal["mipro", "bootstrap", "copro"] = "bootstrap"):
        self.optimizer_type = optimizer_type
        # Created on first use by _get_pipeline().
        self.pipeline = None

    def _get_pipeline(self):
        """Lazy load pipeline (relative import deferred to first call)."""
        if self.pipeline is None:
            from .template_sparql import TemplateSPARQLPipeline
            self.pipeline = TemplateSPARQLPipeline()
        return self.pipeline

    def _template_classification_metric(self, example, prediction, trace=None) -> float:
        """Metric for template classification optimization (exact-match 0/1)."""
        expected = example.template_id
        predicted = prediction.template_id if hasattr(prediction, 'template_id') else None
        return 1.0 if predicted == expected else 0.0

    def _slot_extraction_metric(self, example, prediction, trace=None) -> float:
        """Metric for slot extraction optimization.

        Fraction of gold slots whose predicted value matches (case-insensitive
        equality or substring either way). NOTE(review): not referenced by
        optimize_template_classifier — presumably reserved for a future
        slot-extractor optimization path.
        """
        expected_slots = example.expected_slots
        predicted_slots = prediction if isinstance(prediction, dict) else {}
        if not expected_slots:
            return 1.0
        correct = 0
        for key, expected_value in expected_slots.items():
            if key in predicted_slots:
                pred_val = str(predicted_slots[key]).strip().lower()
                exp_val = str(expected_value).strip().lower()
                if pred_val == exp_val or pred_val in exp_val or exp_val in pred_val:
                    correct += 1
        return correct / len(expected_slots)

    def optimize_template_classifier(
        self,
        examples: list[TemplateExample],
        num_trials: int = 10,
    ) -> dict[str, Any]:
        """Optimize template classifier using DSPy optimizer.

        Args:
            examples: labelled classification examples (80/20 train/dev split,
                shuffled in place without a fixed seed).
            num_trials: passed as num_candidates to MIPRO(v2); ignored by the
                bootstrap/copro paths.

        Returns:
            On success: dict with sizes, dev accuracy and the optimized module.
            On failure: {"success": False, "error": ...} or {"error": ...} if
            DSPy itself is not installed.
        """
        try:
            import dspy
        except ImportError:
            return {"error": "DSPy not installed"}
        pipeline = self._get_pipeline()
        # Convert examples to DSPy format
        trainset = []
        for ex in examples:
            trainset.append(dspy.Example(
                question=ex.question,
                language=ex.language,
                template_id=ex.template_id,
            ).with_inputs("question", "language"))
        # Split into train/dev
        random.shuffle(trainset)
        split = int(len(trainset) * 0.8)
        train = trainset[:split]
        dev = trainset[split:]
        # Select optimizer
        if self.optimizer_type == "mipro":
            try:
                # DSPy 2.6+ uses MIPROv2
                from dspy.teleprompt import MIPROv2
                optimizer = MIPROv2(
                    metric=self._template_classification_metric,
                    num_candidates=num_trials,
                    auto="medium",  # light, medium, or heavy
                )
            except ImportError:
                try:
                    # Fallback to MIPRO for older DSPy versions
                    from dspy.teleprompt import MIPRO
                    optimizer = MIPRO(
                        metric=self._template_classification_metric,
                        num_candidates=num_trials,
                    )
                except ImportError:
                    from dspy.teleprompt import BootstrapFewShot
                    optimizer = BootstrapFewShot(
                        metric=self._template_classification_metric,
                        max_bootstrapped_demos=4,
                        max_labeled_demos=4,
                    )
        elif self.optimizer_type == "copro":
            try:
                from dspy.teleprompt import COPRO
                optimizer = COPRO(
                    metric=self._template_classification_metric,
                    depth=3,
                    breadth=3,
                )
            except ImportError:
                from dspy.teleprompt import BootstrapFewShot
                optimizer = BootstrapFewShot(
                    metric=self._template_classification_metric,
                    max_bootstrapped_demos=4,
                )
        else:
            from dspy.teleprompt import BootstrapFewShot
            optimizer = BootstrapFewShot(
                metric=self._template_classification_metric,
                max_bootstrapped_demos=4,
                max_labeled_demos=4,
            )
        # Run optimization
        logger.info(f"Running {self.optimizer_type} optimization with {len(train)} training examples...")
        try:
            # DSPy compile() signature varies by version and optimizer
            # BootstrapFewShot: compile(student, trainset=..., teacher=None)
            # MIPROv2: compile(student, trainset=..., num_trials=..., eval_kwargs=...)
            if self.optimizer_type == "mipro" and hasattr(optimizer, 'compile'):
                # MIPROv2 uses eval_kwargs for validation set
                # NOTE(review): recent DSPy MIPROv2 takes valset= directly;
                # verify the eval_kwargs={"devset": ...} form against the
                # pinned DSPy version.
                optimized_classifier = optimizer.compile(
                    pipeline.template_classifier,
                    trainset=train,
                    eval_kwargs={"devset": dev} if dev else {},
                )
            else:
                # BootstrapFewShot and COPRO use simpler signature
                optimized_classifier = optimizer.compile(
                    pipeline.template_classifier,
                    trainset=train,
                )
            # Evaluate optimized model
            correct = 0
            for ex in dev:
                result = optimized_classifier.forward(
                    question=ex.question,
                    language=ex.language,
                )
                if result.template_id == ex.template_id:
                    correct += 1
            accuracy = correct / len(dev) if dev else 0.0
            return {
                "success": True,
                "optimizer": self.optimizer_type,
                "train_size": len(train),
                "dev_size": len(dev),
                "dev_accuracy": accuracy,
                "optimized_module": optimized_classifier,
            }
        except Exception as e:
            logger.error(f"Optimization failed: {e}")
            return {"success": False, "error": str(e)}
# =============================================================================
# CLI
# =============================================================================
def _load_or_generate_dataset() -> TrainingDataset:
    """Return the cached training dataset, generating it fresh if absent.

    Shared by the `evaluate` and `optimize` commands so the load-or-generate
    logic exists in exactly one place.
    """
    if TRAINING_DATA_PATH.exists():
        # utf-8 explicitly: the file is written with ensure_ascii=False, so
        # relying on the platform default encoding would break on Windows.
        with open(TRAINING_DATA_PATH, encoding="utf-8") as f:
            return TrainingDataset.from_dict(json.load(f))
    print("Training data not found, generating...")
    generator = TrainingDataGenerator()
    return generator.generate_dataset()


def main():
    """CLI entry point.

    Subcommands: generate-data, evaluate, optimize, export (see module
    docstring for usage examples). Prints a usage message when no command
    is given.
    """
    import argparse
    parser = argparse.ArgumentParser(description="DSPy Template Optimizer")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")
    # generate-data command
    gen_parser = subparsers.add_parser("generate-data", help="Generate training data from templates")
    gen_parser.add_argument("--output", "-o", type=str, default=str(TRAINING_DATA_PATH),
                            help="Output path for training data JSON")
    # evaluate command
    eval_parser = subparsers.add_parser("evaluate", help="Evaluate current model")
    eval_parser.add_argument("--module", "-m", type=str, default="all",
                             choices=["all", "template", "slot", "fyke"],
                             help="Which module to evaluate")
    eval_parser.add_argument("--sample-size", "-n", type=int, default=50,
                             help="Number of examples to evaluate")
    # optimize command
    opt_parser = subparsers.add_parser("optimize", help="Run DSPy optimization")
    opt_parser.add_argument("--optimizer", "-O", type=str, default="bootstrap",
                            choices=["mipro", "bootstrap", "copro"],
                            help="Optimizer to use")
    opt_parser.add_argument("--trials", "-t", type=int, default=10,
                            help="Number of optimization trials")
    opt_parser.add_argument("--module", "-m", type=str, default="template",
                            choices=["template", "slot"],
                            help="Module to optimize")
    # export command
    export_parser = subparsers.add_parser("export", help="Export optimized prompts")
    export_parser.add_argument("--output", "-o", type=str, default=str(OPTIMIZED_PROMPTS_PATH),
                               help="Output path for optimized prompts")
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    if args.command == "generate-data":
        print("Generating training data from templates...")
        generator = TrainingDataGenerator()
        dataset = generator.generate_dataset()
        # Ensure output directory exists
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # utf-8 required: ensure_ascii=False emits raw non-ASCII characters.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(dataset.to_dict(), f, indent=2, ensure_ascii=False)
        print("Generated training data:")
        print(f"  - Template examples: {len(dataset.template_examples)}")
        print(f"  - Fyke examples: {len(dataset.fyke_examples)}")
        print(f"  - Slot examples: {len(dataset.slot_examples)}")
        print(f"  - Follow-up examples: {len(dataset.follow_up_examples)}")
        print(f"  - Saved to: {output_path}")
    elif args.command == "evaluate":
        print("Loading training data...")
        dataset = _load_or_generate_dataset()
        evaluator = TemplateEvaluator()
        if args.module in ["all", "template"]:
            print("\nEvaluating Template Classifier...")
            result = evaluator.evaluate_template_classification(
                dataset.template_examples,
                sample_size=args.sample_size,
            )
            print(f"  Accuracy: {result.accuracy:.2%}")
            print(f"  Correct: {result.correct}/{result.total_examples}")
            if result.errors:
                print("  Sample errors:")
                for err in result.errors[:3]:
                    print(f"    - Q: {err.get('question', '')[:50]}...")
                    print(f"      Expected: {err.get('expected')}, Got: {err.get('predicted')}")
        if args.module in ["all", "slot"]:
            print("\nEvaluating Slot Extractor...")
            result = evaluator.evaluate_slot_extraction(
                dataset.slot_examples,
                sample_size=args.sample_size,
            )
            print(f"  Precision: {result.precision:.2%}")
            print(f"  Recall: {result.recall:.2%}")
            print(f"  F1 Score: {result.f1_score:.2%}")
        if args.module in ["all", "fyke"]:
            print("\nEvaluating Fyke Filter...")
            result = evaluator.evaluate_fyke_filter(
                dataset.fyke_examples,
                sample_size=args.sample_size,
            )
            print(f"  Accuracy: {result.accuracy:.2%}")
            print(f"  Precision: {result.precision:.2%}")
            print(f"  Recall: {result.recall:.2%}")
            print(f"  F1 Score: {result.f1_score:.2%}")
    elif args.command == "optimize":
        print(f"Running {args.optimizer} optimization for {args.module}...")
        dataset = _load_or_generate_dataset()
        optimizer = TemplateOptimizer(optimizer_type=args.optimizer)
        if args.module == "template":
            result = optimizer.optimize_template_classifier(
                dataset.template_examples,
                num_trials=args.trials,
            )
            if result.get("success"):
                print("Optimization complete!")
                print(f"  Dev accuracy: {result['dev_accuracy']:.2%}")
            else:
                print(f"Optimization failed: {result.get('error')}")
    elif args.command == "export":
        print("Export not yet implemented - optimized modules are saved during optimization")
    else:
        parser.print_help()


if __name__ == "__main__":
    main()