"""
DSPy Optimizer for Heritage RAG Template System

Optimizes the DSPy modules in template_sparql.py using DSPy's built-in
optimizers (MIPRO, BootstrapFewShot, COPRO) to improve:
- Template classification accuracy
- Slot extraction precision/recall
- Follow-up question resolution
- Fyke filter relevance detection

Based on:
- DSPy 2.6+ optimization API
- Formica et al. (2023) - Template SPARQL achieves 65% precision
- data/sparql_templates.yaml template definitions

Usage:
    # Generate training data from templates
    python -m backend.rag.optimize_templates generate-data

    # Run optimization
    python -m backend.rag.optimize_templates optimize --optimizer mipro

    # Evaluate current model
    python -m backend.rag.optimize_templates evaluate

    # Export optimized prompts
    python -m backend.rag.optimize_templates export

Author: OpenCode
Created: 2025-01-06
"""

from __future__ import annotations

import json
import logging
import random
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Literal, Optional

try:
    import yaml
except ImportError:
    # PyYAML is only needed to read the template definitions; evaluation and
    # optimization can run from a previously generated JSON dataset.
    yaml = None

logger = logging.getLogger(__name__)


# =============================================================================
# PATHS
# =============================================================================

TEMPLATES_PATH = Path(__file__).parent.parent.parent / "data" / "sparql_templates.yaml"
TRAINING_DATA_PATH = Path(__file__).parent.parent.parent / "data" / "training" / "template_training.json"
OPTIMIZED_PROMPTS_PATH = Path(__file__).parent / "optimized_prompts.json"


# =============================================================================
# SUBSTITUTION VALUES FOR PATTERN EXPANSION
# =============================================================================

_DUTCH_CITIES = [
    "Amsterdam", "Rotterdam", "Den Haag", "Utrecht", "Eindhoven",
    "Groningen", "Tilburg", "Almere", "Breda", "Nijmegen",
    "Enschede", "Haarlem", "Arnhem", "Zaanstad", "Amersfoort",
    "Apeldoorn", "Maastricht", "Leiden", "Dordrecht", "Zwolle",
]

_REGIONS = [
    ("Noord-Holland", "NL-NH"), ("Zuid-Holland", "NL-ZH"),
    ("Noord-Brabant", "NL-NB"), ("Gelderland", "NL-GE"),
    ("Utrecht", "NL-UT"), ("Overijssel", "NL-OV"),
    ("Limburg", "NL-LI"), ("Friesland", "NL-FR"),
    ("Groningen", "NL-GR"), ("Drenthe", "NL-DR"),
    ("Flevoland", "NL-FL"), ("Zeeland", "NL-ZE"),
]

_COUNTRIES = [
    ("Nederland", "Q55"), ("Belgium", "Q31"),
    ("Germany", "Q183"), ("France", "Q142"),
]

# Dutch and English institution names are kept in parallel order so that the
# type code (M, A, L) lines up when both lists are zipped.
_INSTITUTION_TYPES_NL = [("musea", "M"), ("archieven", "A"), ("bibliotheken", "L"), ("galerijen", "G")]
_INSTITUTION_TYPES_EN = [("museums", "M"), ("archives", "A"), ("libraries", "L")]

_INSTITUTION_NAMES = ["Rijksmuseum", "Nationaal Archief", "Koninklijke Bibliotheek", "Van Gogh Museum"]
_IDENTIFIERS = ["NL-AmRMA", "NL-HaNA", "NL-DhHSA"]
_LOCATION_PAIRS = [("Amsterdam", "Rotterdam"), ("Den Haag", "Utrecht"), ("Groningen", "Maastricht")]

_MAX_PATTERNS_PER_TEMPLATE = 3
_MAX_TYPES_PER_PATTERN = 3


def _detect_language(text: str, nl_markers: tuple[str, ...]) -> str:
    """Return "nl" when any case-sensitive marker occurs in *text*, else "en"."""
    return "nl" if any(marker in text for marker in nl_markers) else "en"


def _normalize_slot_value(value: Any) -> str:
    """Lower-case, stripped string form of a slot value; None becomes ""."""
    return "" if value is None else str(value).strip().lower()


def _slot_values_match(predicted: Any, expected: Any) -> bool:
    """Compare a predicted slot value with the expected one.

    Values match when equal after normalization or when one contains the
    other. Empty values only match other empty values: a plain substring test
    would otherwise accept an empty prediction for every expected value,
    because "" is a substring of any string.
    """
    pred_val = _normalize_slot_value(predicted)
    exp_val = _normalize_slot_value(expected)
    if not pred_val or not exp_val:
        return pred_val == exp_val
    return pred_val == exp_val or pred_val in exp_val or exp_val in pred_val


# =============================================================================
# TRAINING DATA STRUCTURES
# =============================================================================

@dataclass
class TemplateExample:
    """A training example for template classification."""
    question: str
    template_id: str
    slots: dict[str, str]
    language: str = "nl"
    is_follow_up: bool = False
    previous_question: Optional[str] = None
    previous_slots: Optional[dict[str, str]] = None


@dataclass
class FykeExample:
    """A training example for Fyke filter."""
    question: str
    is_relevant: bool
    reasoning: str


@dataclass
class SlotExample:
    """A training example for slot extraction."""
    question: str
    template_id: str
    expected_slots: dict[str, str]
    language: str = "nl"


@dataclass
class TrainingDataset:
    """Complete training dataset for all DSPy modules."""
    template_examples: list[TemplateExample] = field(default_factory=list)
    fyke_examples: list[FykeExample] = field(default_factory=list)
    slot_examples: list[SlotExample] = field(default_factory=list)
    follow_up_examples: list[TemplateExample] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Convert to JSON-serializable dict.

        The count keys and ``generated_at`` are always recomputed and override
        any same-named keys already present in ``metadata``.
        """
        return {
            "metadata": {
                **self.metadata,
                "generated_at": datetime.now(timezone.utc).isoformat(),
                "template_count": len(self.template_examples),
                "fyke_count": len(self.fyke_examples),
                "slot_count": len(self.slot_examples),
                "follow_up_count": len(self.follow_up_examples),
            },
            # asdict() emits fields in declaration order, which is the key
            # order the JSON file has always used.
            "template_examples": [asdict(ex) for ex in self.template_examples],
            "fyke_examples": [asdict(ex) for ex in self.fyke_examples],
            "slot_examples": [asdict(ex) for ex in self.slot_examples],
            "follow_up_examples": [asdict(ex) for ex in self.follow_up_examples],
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "TrainingDataset":
        """Load from JSON dict (the structure produced by ``to_dict``)."""
        return cls(
            metadata=data.get("metadata", {}),
            template_examples=[
                TemplateExample(**ex) for ex in data.get("template_examples", [])
            ],
            fyke_examples=[
                FykeExample(**ex) for ex in data.get("fyke_examples", [])
            ],
            slot_examples=[
                SlotExample(**ex) for ex in data.get("slot_examples", [])
            ],
            follow_up_examples=[
                TemplateExample(**ex) for ex in data.get("follow_up_examples", [])
            ],
        )


# =============================================================================
# DATA GENERATION
# =============================================================================

class TrainingDataGenerator:
    """Generates training data from SPARQL templates YAML."""

    def __init__(self):
        self.templates: dict[str, Any] = {}
        self.slot_types: dict[str, Any] = {}
        self.fyke_config: dict[str, Any] = {}
        self.follow_up_patterns: dict[str, Any] = {}

    def load_templates(self) -> None:
        """Load templates from YAML file.

        Raises:
            FileNotFoundError: if ``TEMPLATES_PATH`` does not exist.
            ImportError: if PyYAML is not installed.
            ValueError: if the YAML document is not a mapping.
        """
        if not TEMPLATES_PATH.exists():
            raise FileNotFoundError(f"Templates not found: {TEMPLATES_PATH}")
        if yaml is None:
            raise ImportError("PyYAML is required to load SPARQL templates")

        # Explicit UTF-8: the templates contain Dutch text and the platform
        # default encoding is not guaranteed to be UTF-8.
        with open(TEMPLATES_PATH, encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}  # safe_load returns None for an empty file

        if not isinstance(data, dict):
            raise ValueError(f"Expected a mapping at the top of {TEMPLATES_PATH}")

        self.templates = data.get("templates") or {}
        self.slot_types = data.get("_slot_types") or {}
        self.fyke_config = data.get("fyke_filter") or {}
        self.follow_up_patterns = data.get("follow_up_patterns") or {}

        logger.info("Loaded %d templates", len(self.templates))

    def generate_template_examples(self) -> list[TemplateExample]:
        """Generate template classification examples from patterns.

        For every template this emits its explicit examples first, followed by
        examples expanded from up to three of its question patterns.
        """
        examples: list[TemplateExample] = []

        for template_id, template_data in self.templates.items():
            # Use explicit examples from template
            for ex in template_data.get("examples") or []:
                question = ex.get("question", "")
                is_dutch = (
                    any(c in question for c in "éèêëîïôùûüçàâäœæ")
                    or any(w in question.lower() for w in ("welke", "hoeveel", "wat"))
                )
                examples.append(TemplateExample(
                    question=question,
                    template_id=template_id,
                    slots=ex.get("slots", {}),
                    language="nl" if is_dutch else "en",
                ))

            # Generate examples from patterns (limit patterns per template)
            patterns = template_data.get("question_patterns") or []
            for pattern in patterns[:_MAX_PATTERNS_PER_TEMPLATE]:
                examples.extend(self._expand_pattern(template_id, pattern))

        logger.info("Generated %d template classification examples", len(examples))
        return examples

    def _expand_pattern(self, template_id: str, pattern: str) -> list[TemplateExample]:
        """Substitute the placeholders of one question pattern with sample values."""
        expanded: list[TemplateExample] = []

        def add(question: str, slots: dict[str, str], nl_markers: tuple[str, ...]) -> None:
            expanded.append(TemplateExample(
                question=question,
                template_id=template_id,
                slots=slots,
                language=_detect_language(question, nl_markers),
            ))

        if "{institution_type_nl}" in pattern or "{institution_type_en}" in pattern:
            type_pairs = zip(
                _INSTITUTION_TYPES_NL[:_MAX_TYPES_PER_PATTERN],
                _INSTITUTION_TYPES_EN[:_MAX_TYPES_PER_PATTERN],
            )
            for (name_nl, inst_code), (name_en, _) in type_pairs:
                # Fill each placeholder with the name in its own language so
                # English patterns do not end up containing Dutch nouns.
                question = pattern.replace("{institution_type_nl}", name_nl)
                question = question.replace("{institution_type_en}", name_en)
                markers = ("Welke", "Hoeveel")

                if "{city}" in question:
                    for city in random.sample(_DUTCH_CITIES, min(3, len(_DUTCH_CITIES))):
                        add(question.replace("{city}", city),
                            {"institution_type": inst_code, "city": city}, markers)
                elif "{region}" in question:
                    for region_name, region_code in random.sample(_REGIONS, min(2, len(_REGIONS))):
                        add(question.replace("{region}", region_name),
                            {"institution_type": inst_code, "region": region_code}, markers)
                elif "{country}" in question:
                    for country_name, country_code in random.sample(_COUNTRIES, min(2, len(_COUNTRIES))):
                        add(question.replace("{country}", country_name),
                            {"institution_type": inst_code, "country": country_code}, markers)
                elif "{location}" in question:
                    for city in random.sample(_DUTCH_CITIES, min(2, len(_DUTCH_CITIES))):
                        add(question.replace("{location}", city),
                            {"institution_type": inst_code, "location": city}, markers)
                # NOTE(review): patterns with an institution type but no
                # location placeholder produce no examples - confirm intended.

        elif "{city}" in pattern:
            for city in random.sample(_DUTCH_CITIES, min(3, len(_DUTCH_CITIES))):
                add(pattern.replace("{city}", city), {"city": city}, ("Welke", "Wat"))

        elif "{institution_name}" in pattern:
            for name in _INSTITUTION_NAMES[:2]:
                add(pattern.replace("{institution_name}", name),
                    {"institution_name": name}, ("Waar", "Informatie"))

        elif "{identifier}" in pattern:
            for ident in _IDENTIFIERS[:2]:
                add(pattern.replace("{identifier}", ident),
                    {"identifier": ident, "identifier_type": "isil"}, ("Welke",))

        elif "{location1}" in pattern and "{location2}" in pattern:
            for loc1, loc2 in _LOCATION_PAIRS[:2]:
                add(pattern.replace("{location1}", loc1).replace("{location2}", loc2),
                    {"location1": loc1, "location2": loc2}, ("Vergelijk",))

        return expanded

    def generate_fyke_examples(self) -> list[FykeExample]:
        """Generate Fyke filter training examples."""
        # Relevant examples (heritage-related)
        relevant_questions = [
            ("Welke musea zijn er in Amsterdam?", "Contains heritage keyword 'musea' and asks about institutions"),
            ("Hoeveel archieven heeft Noord-Holland?", "Asks about archive count in a region"),
            ("Wat is het Rijksmuseum?", "Asks about a specific museum"),
            ("Welke bibliotheken zijn er in Den Haag?", "Contains heritage keyword 'bibliotheken'"),
            ("Erfgoedinstellingen in Utrecht", "Contains heritage keyword 'erfgoedinstellingen'"),
            ("What museums are in Rotterdam?", "English query about museums"),
            ("How many archives are there in the Netherlands?", "English query about archives"),
            ("Information about the National Archive", "Asks about heritage institution"),
            ("Cultural institutions in Maastricht", "Contains heritage keyword 'cultural institutions'"),
            ("Galerijen in Groningen", "Contains heritage keyword 'galerijen'"),
            ("Welke collecties heeft het Rijksmuseum?", "Asks about museum collections"),
            ("Tentoonstellingen in Amsterdam", "Asks about exhibitions"),
            ("GLAM instellingen in Nederland", "Contains GLAM acronym"),
            ("Heritage institutions in the Netherlands", "Contains heritage keyword"),
        ]

        # Irrelevant examples (out of scope)
        irrelevant_questions = [
            ("Waar kan ik tandpasta kopen?", "Shopping query unrelated to heritage"),
            ("Wat is het weer morgen in Amsterdam?", "Weather query"),
            ("Beste restaurants in Rotterdam", "Restaurant/dining query"),
            ("Voetbalwedstrijd Ajax vandaag", "Sports query"),
            ("How do I book a hotel in The Hague?", "Travel booking query"),
            ("Bitcoin price today", "Cryptocurrency query"),
            ("Recipe for apple pie", "Cooking query"),
            ("Where can I find a supermarket?", "Shopping query"),
            ("What is the capital of France?", "General knowledge, not heritage-specific"),
            ("How do I fix my computer?", "Technical support query"),
            ("Dating tips", "Personal advice query"),
            ("Best flight deals to Spain", "Travel booking query"),
        ]

        # Edge cases (could be ambiguous)
        edge_cases = [
            ("Buildings in Amsterdam", True, "Could refer to historic/heritage buildings"),
            ("History of Rotterdam", True, "Historical topic often relates to heritage"),
            ("Art in The Hague", True, "Art is heritage-related"),
            ("Books about Dutch history", True, "Library/archive related topic"),
            ("Where is the library?", True, "Asks about library location"),
            ("Opening hours", False, "Too vague without heritage context"),
            ("How old is it?", False, "Pronoun reference without context"),
        ]

        labelled = (
            [(q, True, why) for q, why in relevant_questions]
            + [(q, False, why) for q, why in irrelevant_questions]
            + edge_cases
        )
        examples = [
            FykeExample(question=q, is_relevant=relevant, reasoning=why)
            for q, relevant, why in labelled
        ]

        logger.info("Generated %d Fyke filter examples", len(examples))
        return examples

    def generate_slot_examples(self) -> list[SlotExample]:
        """Generate slot extraction training examples.

        The language of each question is listed explicitly; keyword sniffing
        labelled Dutch noun phrases such as "Archieven in Den Haag" as English.
        """
        # (question, template_id, expected_slots, language)
        slot_questions = [
            # Institution type extraction
            ("Welke musea zijn er in Amsterdam?", "list_institutions_by_type_city",
             {"institution_type": "M", "city": "Amsterdam"}, "nl"),
            ("Archieven in Den Haag", "list_institutions_by_type_city",
             {"institution_type": "A", "city": "Den Haag"}, "nl"),
            ("What libraries are in Rotterdam?", "list_institutions_by_type_city",
             {"institution_type": "L", "city": "Rotterdam"}, "en"),
            ("Galerijen in Utrecht", "list_institutions_by_type_city",
             {"institution_type": "G", "city": "Utrecht"}, "nl"),
            ("Museums in Noord-Holland", "list_institutions_by_type_region",
             {"institution_type": "M", "region": "NL-NH"}, "en"),
            ("Archives in Gelderland", "list_institutions_by_type_region",
             {"institution_type": "A", "region": "NL-GE"}, "en"),
            ("Bibliotheken in Nederland", "list_institutions_by_type_country",
             {"institution_type": "L", "country": "Q55"}, "nl"),
            ("Hoeveel musea zijn er in Amsterdam?", "count_institutions_by_type_location",
             {"institution_type": "M", "location": "Amsterdam"}, "nl"),
            ("Hoeveel archieven heeft Zuid-Holland?", "count_institutions_by_type_location",
             {"institution_type": "A", "location": "NL-ZH"}, "nl"),
            # Name extraction
            ("Waar is het Rijksmuseum?", "find_institution_by_name",
             {"institution_name": "Rijksmuseum"}, "nl"),
            ("Informatie over het Nationaal Archief", "find_institution_by_name",
             {"institution_name": "Nationaal Archief"}, "nl"),
            ("What is the Van Gogh Museum?", "find_institution_by_name",
             {"institution_name": "Van Gogh Museum"}, "en"),
            ("Tell me about Koninklijke Bibliotheek", "find_institution_by_name",
             {"institution_name": "Koninklijke Bibliotheek"}, "en"),
            # Identifier extraction
            ("Welke instelling heeft ISIL NL-AmRMA?", "find_institution_by_identifier",
             {"identifier": "NL-AmRMA", "identifier_type": "isil"}, "nl"),
            ("Zoek ISIL NL-HaNA", "find_institution_by_identifier",
             {"identifier": "NL-HaNA", "identifier_type": "isil"}, "nl"),
            ("Find institution with ISIL DE-1", "find_institution_by_identifier",
             {"identifier": "DE-1", "identifier_type": "isil"}, "en"),
            # Comparison extraction
            ("Vergelijk Amsterdam en Rotterdam", "compare_locations",
             {"location1": "Amsterdam", "location2": "Rotterdam"}, "nl"),
            ("Amsterdam versus Utrecht", "compare_locations",
             {"location1": "Amsterdam", "location2": "Utrecht"}, "en"),
            ("Compare The Hague and Leiden", "compare_locations",
             {"location1": "Den Haag", "location2": "Leiden"}, "en"),
        ]

        examples = [
            SlotExample(
                question=question,
                template_id=template_id,
                expected_slots=expected_slots,
                language=language,
            )
            for question, template_id, expected_slots, language in slot_questions
        ]

        logger.info("Generated %d slot extraction examples", len(examples))
        return examples

    def generate_follow_up_examples(self) -> list[TemplateExample]:
        """Generate follow-up question resolution examples."""
        examples: list[TemplateExample] = []

        # Each row: (follow-up, resolved question, previous question,
        #            previous slots, slots after resolution).
        # The resolved question is kept for readability only; it is not stored.

        # Location swap follow-ups
        location_swaps = [
            ("En in Rotterdam?", "Welke musea zijn er in Rotterdam?",
             "Welke musea zijn er in Amsterdam?",
             {"institution_type": "M"}, {"institution_type": "M", "city": "Rotterdam"}),
            ("En in Utrecht?", "Welke archieven zijn er in Utrecht?",
             "Welke archieven zijn er in Den Haag?",
             {"institution_type": "A"}, {"institution_type": "A", "city": "Utrecht"}),
            ("What about Groningen?", "What libraries are in Groningen?",
             "What libraries are in Amsterdam?",
             {"institution_type": "L"}, {"institution_type": "L", "city": "Groningen"}),
            ("En Enschede?", "Welke bibliotheken zijn er in Enschede?",
             "Welke bibliotheken zijn er in Zwolle?",
             {"institution_type": "L"}, {"institution_type": "L", "city": "Enschede"}),
        ]

        # Type swap follow-ups
        type_swaps = [
            ("En de archieven?", "Welke archieven zijn er in Amsterdam?",
             "Welke musea zijn er in Amsterdam?",
             {"city": "Amsterdam"}, {"institution_type": "A", "city": "Amsterdam"}),
            ("What about libraries?", "What libraries are in Rotterdam?",
             "What museums are in Rotterdam?",
             {"city": "Rotterdam"}, {"institution_type": "L", "city": "Rotterdam"}),
            ("En bibliotheken?", "Welke bibliotheken zijn er in Utrecht?",
             "Welke galerijen zijn er in Utrecht?",
             {"city": "Utrecht"}, {"institution_type": "L", "city": "Utrecht"}),
        ]

        # Count from list follow-ups
        count_follow_ups = [
            ("Hoeveel zijn dat?", "Hoeveel musea zijn er in Amsterdam?",
             "Welke musea zijn er in Amsterdam?",
             {"institution_type": "M", "city": "Amsterdam"},
             {"institution_type": "M", "location": "Amsterdam"}),
            ("How many?", "How many archives are in Rotterdam?",
             "What archives are in Rotterdam?",
             {"institution_type": "A", "city": "Rotterdam"},
             {"institution_type": "A", "location": "Rotterdam"}),
            ("Tel ze", "Hoeveel bibliotheken zijn er in Den Haag?",
             "Welke bibliotheken zijn er in Den Haag?",
             {"institution_type": "L", "city": "Den Haag"},
             {"institution_type": "L", "location": "Den Haag"}),
        ]

        groups = [
            (location_swaps, "list_institutions_by_type_city", ("En",)),
            (type_swaps, "list_institutions_by_type_city", ("En",)),
            (count_follow_ups, "count_institutions_by_type_location", ("Hoeveel", "Tel")),
        ]
        for rows, template_id, nl_markers in groups:
            for follow_up, _resolved, prev_q, prev_slots, new_slots in rows:
                examples.append(TemplateExample(
                    question=follow_up,
                    template_id=template_id,
                    slots=new_slots,
                    language=_detect_language(follow_up, nl_markers),
                    is_follow_up=True,
                    previous_question=prev_q,
                    previous_slots=prev_slots,
                ))

        logger.info("Generated %d follow-up examples", len(examples))
        return examples

    def generate_dataset(self) -> TrainingDataset:
        """Generate complete training dataset (loads the templates first)."""
        self.load_templates()

        return TrainingDataset(
            template_examples=self.generate_template_examples(),
            fyke_examples=self.generate_fyke_examples(),
            slot_examples=self.generate_slot_examples(),
            follow_up_examples=self.generate_follow_up_examples(),
            metadata={
                "source": str(TEMPLATES_PATH),
                "template_count": len(self.templates),
                # TrainingDataset.to_dict() overwrites "template_count" with
                # the number of examples, so keep the number of source
                # templates under a key that survives serialization.
                "source_template_count": len(self.templates),
            },
        )


# =============================================================================
# EVALUATION METRICS
# =============================================================================

@dataclass
class EvaluationResult:
    """Results from evaluating a DSPy module."""
    module_name: str
    accuracy: float
    precision: float
    recall: float
    f1_score: float
    total_examples: int
    correct: int
    errors: list[dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dict; only the first 10 errors are included."""
        return {
            "module_name": self.module_name,
            "accuracy": self.accuracy,
            "precision": self.precision,
            "recall": self.recall,
            "f1_score": self.f1_score,
            "total_examples": self.total_examples,
            "correct": self.correct,
            "error_count": len(self.errors),
            "errors": self.errors[:10],  # Limit error samples
        }


def _f1(precision: float, recall: float) -> float:
    """Harmonic mean of precision and recall, 0.0 when both are zero."""
    total = precision + recall
    return 2 * precision * recall / total if total > 0 else 0.0


class TemplateEvaluator:
    """Evaluates template classification and slot extraction."""

    def __init__(self):
        self.pipeline = None

    def _get_pipeline(self):
        """Lazy load pipeline."""
        if self.pipeline is None:
            from .template_sparql import TemplateSPARQLPipeline
            self.pipeline = TemplateSPARQLPipeline()
        return self.pipeline

    def evaluate_template_classification(
        self,
        examples: list[TemplateExample],
        sample_size: Optional[int] = None,
    ) -> EvaluationResult:
        """Evaluate template classification accuracy.

        Examples whose classification raises are recorded as errors and count
        as incorrect.
        """
        pipeline = self._get_pipeline()

        if sample_size and sample_size < len(examples):
            examples = random.sample(examples, sample_size)

        correct = 0
        errors: list[dict[str, Any]] = []

        for ex in examples:
            try:
                result = pipeline.template_classifier.forward(
                    question=ex.question,
                    language=ex.language,
                )
                predicted = result.template_id
                if predicted == ex.template_id:
                    correct += 1
                else:
                    errors.append({
                        "question": ex.question,
                        "expected": ex.template_id,
                        "predicted": predicted,
                        "confidence": result.confidence,
                    })
            except Exception as e:  # evaluation boundary: record and continue
                errors.append({
                    "question": ex.question,
                    "expected": ex.template_id,
                    "error": str(e),
                })

        total = len(examples)
        accuracy = correct / total if total > 0 else 0.0

        return EvaluationResult(
            module_name="TemplateClassifier",
            accuracy=accuracy,
            precision=accuracy,  # For classification, precision = accuracy
            recall=accuracy,
            f1_score=accuracy,
            total_examples=total,
            correct=correct,
            errors=errors,
        )

    def evaluate_slot_extraction(
        self,
        examples: list[SlotExample],
        sample_size: Optional[int] = None,
    ) -> EvaluationResult:
        """Evaluate slot extraction precision and recall.

        Precision is correct slots over all non-empty predicted slots (so
        spurious slots are penalized); recall is correct slots over all
        expected slots, including those of examples whose extraction raised.
        """
        pipeline = self._get_pipeline()

        if sample_size and sample_size < len(examples):
            examples = random.sample(examples, sample_size)

        total_expected = 0
        total_predicted = 0
        total_correct = 0
        errors: list[dict[str, Any]] = []

        for ex in examples:
            expected_slots = ex.expected_slots
            # Counted before extraction so a crashing extractor lowers recall
            # instead of silently dropping the example.
            total_expected += len(expected_slots)

            try:
                predicted_slots, _ = pipeline.slot_extractor.forward(
                    question=ex.question,
                    template_id=ex.template_id,
                    inherited_slots=None,
                )
                predicted_slots = dict(predicted_slots or {})
            except Exception as e:  # evaluation boundary: record and continue
                errors.append({
                    "question": ex.question,
                    "error": str(e),
                })
                continue

            total_predicted += sum(
                1 for value in predicted_slots.values() if _normalize_slot_value(value)
            )

            for key, expected_value in expected_slots.items():
                if key in predicted_slots and _slot_values_match(predicted_slots[key], expected_value):
                    total_correct += 1
                else:
                    errors.append({
                        "question": ex.question,
                        "slot": key,
                        "expected": expected_value,
                        "predicted": predicted_slots.get(key),
                    })

        precision = total_correct / total_predicted if total_predicted > 0 else 0.0
        recall = total_correct / total_expected if total_expected > 0 else 0.0
        f1 = _f1(precision, recall)

        return EvaluationResult(
            module_name="SlotExtractor",
            accuracy=f1,
            precision=precision,
            recall=recall,
            f1_score=f1,
            total_examples=len(examples),
            correct=total_correct,
            errors=errors,
        )

    def evaluate_fyke_filter(
        self,
        examples: list[FykeExample],
        sample_size: Optional[int] = None,
    ) -> EvaluationResult:
        """Evaluate Fyke filter accuracy, precision, recall and F1.

        "Relevant" is the positive class. A filter call that raises on a
        relevant example counts as a false negative.
        """
        pipeline = self._get_pipeline()

        if sample_size and sample_size < len(examples):
            examples = random.sample(examples, sample_size)

        correct = 0
        errors: list[dict[str, Any]] = []
        true_positives = 0
        false_positives = 0
        false_negatives = 0

        for ex in examples:
            expected = ex.is_relevant
            try:
                result = pipeline.fyke_filter.forward(
                    resolved_question=ex.question,
                    conversation_topic="heritage institutions",
                    language="nl",
                )
                predicted = result.is_relevant

                if predicted == expected:
                    correct += 1
                    if expected:
                        true_positives += 1
                else:
                    errors.append({
                        "question": ex.question,
                        "expected": expected,
                        "predicted": predicted,
                        "reasoning": result.reasoning,
                    })
                    if predicted and not expected:
                        false_positives += 1
                    elif not predicted and expected:
                        false_negatives += 1
            except Exception as e:  # evaluation boundary: record and continue
                if expected:
                    false_negatives += 1
                errors.append({
                    "question": ex.question,
                    "expected": expected,
                    "error": str(e),
                })

        total = len(examples)
        accuracy = correct / total if total > 0 else 0.0
        predicted_positive = true_positives + false_positives
        actual_positive = true_positives + false_negatives
        precision = true_positives / predicted_positive if predicted_positive > 0 else 0.0
        recall = true_positives / actual_positive if actual_positive > 0 else 0.0

        return EvaluationResult(
            module_name="FykeFilter",
            accuracy=accuracy,
            precision=precision,
            recall=recall,
            f1_score=_f1(precision, recall),
            total_examples=total,
            correct=correct,
            errors=errors,
        )


# =============================================================================
# DSPy OPTIMIZATION
# =============================================================================

class TemplateOptimizer:
    """Optimizes DSPy modules using MIPRO, COPRO or BootstrapFewShot."""

    def __init__(self, optimizer_type: Literal["mipro", "bootstrap", "copro"] = "bootstrap"):
        self.optimizer_type = optimizer_type
        self.pipeline = None

    def _get_pipeline(self):
        """Lazy load pipeline."""
        if self.pipeline is None:
            from .template_sparql import TemplateSPARQLPipeline
            self.pipeline = TemplateSPARQLPipeline()
        return self.pipeline

    def _template_classification_metric(self, example, prediction, trace=None) -> float:
        """Metric for template classification optimization (exact match)."""
        predicted = getattr(prediction, "template_id", None)
        return 1.0 if predicted == example.template_id else 0.0

    def _slot_extraction_metric(self, example, prediction, trace=None) -> float:
        """Metric for slot extraction optimization.

        Returns the fraction of expected slots whose predicted value matches;
        1.0 when nothing is expected. A non-dict prediction scores 0.0.
        """
        expected_slots = example.expected_slots
        predicted_slots = prediction if isinstance(prediction, dict) else {}

        if not expected_slots:
            return 1.0

        correct = sum(
            1
            for key, expected_value in expected_slots.items()
            if key in predicted_slots and _slot_values_match(predicted_slots[key], expected_value)
        )
        return correct / len(expected_slots)

    def _build_optimizer(self, num_trials: int) -> tuple[Any, str]:
        """Create the DSPy optimizer for ``self.optimizer_type``.

        Returns the optimizer and the family actually built ("mipro_v2",
        "mipro", "copro" or "bootstrap"), which may differ from the requested
        type when the installed DSPy lacks it.
        """
        metric = self._template_classification_metric

        if self.optimizer_type == "mipro":
            try:
                # DSPy 2.6+ uses MIPROv2
                from dspy.teleprompt import MIPROv2
                # NOTE(review): newer DSPy releases may reject num_candidates
                # when auto is set - confirm against the installed version.
                return MIPROv2(
                    metric=metric,
                    num_candidates=num_trials,
                    auto="medium",  # light, medium, or heavy
                ), "mipro_v2"
            except ImportError:
                pass
            try:
                # Fallback to MIPRO for older DSPy versions
                from dspy.teleprompt import MIPRO
                return MIPRO(metric=metric, num_candidates=num_trials), "mipro"
            except ImportError:
                pass
        elif self.optimizer_type == "copro":
            try:
                from dspy.teleprompt import COPRO
                return COPRO(metric=metric, depth=3, breadth=3), "copro"
            except ImportError:
                from dspy.teleprompt import BootstrapFewShot
                return BootstrapFewShot(metric=metric, max_bootstrapped_demos=4), "bootstrap"

        from dspy.teleprompt import BootstrapFewShot
        return BootstrapFewShot(
            metric=metric,
            max_bootstrapped_demos=4,
            max_labeled_demos=4,
        ), "bootstrap"

    def optimize_template_classifier(
        self,
        examples: list[TemplateExample],
        num_trials: int = 10,
    ) -> dict[str, Any]:
        """Optimize template classifier using DSPy optimizer.

        Returns a dict with ``success``; on success also ``optimizer``,
        ``train_size``, ``dev_size``, ``dev_accuracy`` and
        ``optimized_module``, otherwise ``error``.
        """
        if not examples:
            return {"success": False, "error": "No training examples provided"}

        try:
            import dspy
        except ImportError:
            return {"success": False, "error": "DSPy not installed"}

        pipeline = self._get_pipeline()

        # Convert examples to DSPy format
        trainset = [
            dspy.Example(
                question=ex.question,
                language=ex.language,
                template_id=ex.template_id,
            ).with_inputs("question", "language")
            for ex in examples
        ]

        # Split into train/dev, always keeping at least one training example
        random.shuffle(trainset)
        split = max(1, int(len(trainset) * 0.8))
        train = trainset[:split]
        dev = trainset[split:]

        optimizer, family = self._build_optimizer(num_trials)

        # Run optimization
        logger.info(
            "Running %s optimization with %d training examples...",
            self.optimizer_type, len(train),
        )

        try:
            # compile() keyword arguments differ per optimizer, so choose them
            # by the optimizer that was actually built (a fallback to
            # BootstrapFewShot must not receive MIPRO-only arguments).
            student = pipeline.template_classifier
            if family == "mipro_v2":
                # MIPROv2 takes the validation set as ``valset``.
                optimized_classifier = optimizer.compile(
                    student, trainset=train, valset=dev or None,
                )
            elif family == "mipro":
                # NOTE(review): legacy MIPRO may also require num_trials and
                # demo limits - confirm if old DSPy versions must be supported.
                optimized_classifier = optimizer.compile(
                    student, trainset=train,
                    eval_kwargs={"devset": dev} if dev else {},
                )
            elif family == "copro":
                # COPRO.compile requires eval_kwargs (forwarded to Evaluate).
                optimized_classifier = optimizer.compile(
                    student, trainset=train, eval_kwargs={},
                )
            else:
                optimized_classifier = optimizer.compile(student, trainset=train)

            # Evaluate optimized model
            correct = 0
            for ex in dev:
                result = optimized_classifier.forward(
                    question=ex.question,
                    language=ex.language,
                )
                if result.template_id == ex.template_id:
                    correct += 1

            accuracy = correct / len(dev) if dev else 0.0

            return {
                "success": True,
                "optimizer": self.optimizer_type,
                "train_size": len(train),
                "dev_size": len(dev),
                "dev_accuracy": accuracy,
                "optimized_module": optimized_classifier,
            }

        except Exception as e:  # CLI boundary: report instead of crashing
            logger.exception("Optimization failed: %s", e)
            return {"success": False, "error": str(e)}


# =============================================================================
# CLI
# =============================================================================

def _load_or_generate_dataset() -> TrainingDataset:
    """Load the dataset from TRAINING_DATA_PATH, generating it if absent."""
    if TRAINING_DATA_PATH.exists():
        with open(TRAINING_DATA_PATH, encoding="utf-8") as f:
            return TrainingDataset.from_dict(json.load(f))

    print("Training data not found, generating...")
    return TrainingDataGenerator().generate_dataset()


def main():
    """CLI entry point."""
    import argparse

    parser = argparse.ArgumentParser(description="DSPy Template Optimizer")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # generate-data command
    gen_parser = subparsers.add_parser("generate-data", help="Generate training data from templates")
    gen_parser.add_argument("--output", "-o", type=str, default=str(TRAINING_DATA_PATH),
                            help="Output path for training data JSON")

    # evaluate command
    eval_parser = subparsers.add_parser("evaluate", help="Evaluate current model")
    eval_parser.add_argument("--module", "-m", type=str, default="all",
                             choices=["all", "template", "slot", "fyke"],
                             help="Which module to evaluate")
    eval_parser.add_argument("--sample-size", "-n", type=int, default=50,
                             help="Number of examples to evaluate")

    # optimize command
    opt_parser = subparsers.add_parser("optimize", help="Run DSPy optimization")
    opt_parser.add_argument("--optimizer", "-O", type=str, default="bootstrap",
                            choices=["mipro", "bootstrap", "copro"],
                            help="Optimizer to use")
    opt_parser.add_argument("--trials", "-t", type=int, default=10,
                            help="Number of optimization trials")
    opt_parser.add_argument("--module", "-m", type=str, default="template",
                            choices=["template", "slot"],
                            help="Module to optimize")

    # export command
    export_parser = subparsers.add_parser("export", help="Export optimized prompts")
    export_parser.add_argument("--output", "-o", type=str, default=str(OPTIMIZED_PROMPTS_PATH),
                               help="Output path for optimized prompts")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    if args.command == "generate-data":
        print("Generating training data from templates...")
        generator = TrainingDataGenerator()
        dataset = generator.generate_dataset()

        # Ensure output directory exists
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # ensure_ascii=False writes raw non-ASCII characters, so the file
        # must be opened as UTF-8 regardless of the platform default.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(dataset.to_dict(), f, indent=2, ensure_ascii=False)

        print("Generated training data:")
        print(f" - Template examples: {len(dataset.template_examples)}")
        print(f" - Fyke examples: {len(dataset.fyke_examples)}")
        print(f" - Slot examples: {len(dataset.slot_examples)}")
        print(f" - Follow-up examples: {len(dataset.follow_up_examples)}")
        print(f" - Saved to: {output_path}")

    elif args.command == "evaluate":
        print("Loading training data...")
        dataset = _load_or_generate_dataset()
        evaluator = TemplateEvaluator()

        if args.module in ["all", "template"]:
            print("\nEvaluating Template Classifier...")
            result = evaluator.evaluate_template_classification(
                dataset.template_examples,
                sample_size=args.sample_size,
            )
            print(f" Accuracy: {result.accuracy:.2%}")
            print(f" Correct: {result.correct}/{result.total_examples}")
            if result.errors:
                print(" Sample errors:")
                for err in result.errors[:3]:
                    print(f" - Q: {err.get('question', '')[:50]}...")
                    print(f" Expected: {err.get('expected')}, Got: {err.get('predicted')}")

        if args.module in ["all", "slot"]:
            print("\nEvaluating Slot Extractor...")
            result = evaluator.evaluate_slot_extraction(
                dataset.slot_examples,
                sample_size=args.sample_size,
            )
            print(f" Precision: {result.precision:.2%}")
            print(f" Recall: {result.recall:.2%}")
            print(f" F1 Score: {result.f1_score:.2%}")

        if args.module in ["all", "fyke"]:
            print("\nEvaluating Fyke Filter...")
            result = evaluator.evaluate_fyke_filter(
                dataset.fyke_examples,
                sample_size=args.sample_size,
            )
            print(f" Accuracy: {result.accuracy:.2%}")
            print(f" Precision: {result.precision:.2%}")
            print(f" Recall: {result.recall:.2%}")
            print(f" F1 Score: {result.f1_score:.2%}")

    elif args.command == "optimize":
        print(f"Running {args.optimizer} optimization for {args.module}...")
        dataset = _load_or_generate_dataset()
        optimizer = TemplateOptimizer(optimizer_type=args.optimizer)

        if args.module == "template":
            result = optimizer.optimize_template_classifier(
                dataset.template_examples,
                num_trials=args.trials,
            )

            if result.get("success"):
                print("Optimization complete!")
                print(f" Dev accuracy: {result['dev_accuracy']:.2%}")
            else:
                print(f"Optimization failed: {result.get('error')}")
        else:
            # Previously this branch exited silently without doing anything.
            print(f"Optimization for module '{args.module}' is not yet implemented")

    elif args.command == "export":
        print("Export not yet implemented - optimized modules are saved during optimization")

    else:
        parser.print_help()


if __name__ == "__main__":
    main()