"""
|
|
DSPy Optimizer for Heritage RAG Template System
|
|
|
|
Optimizes the DSPy modules in template_sparql.py using DSPy's built-in
|
|
optimizers (MIPRO, BootstrapFewShot, COPRO) to improve:
|
|
- Template classification accuracy
|
|
- Slot extraction precision/recall
|
|
- Follow-up question resolution
|
|
- Fyke filter relevance detection
|
|
|
|
Based on:
|
|
- DSPy 2.6+ optimization API
|
|
- Formica et al. (2023) - Template SPARQL achieves 65% precision
|
|
- data/sparql_templates.yaml template definitions
|
|
|
|
Usage:
|
|
# Generate training data from templates
|
|
python -m backend.rag.optimize_templates generate-data
|
|
|
|
# Run optimization
|
|
python -m backend.rag.optimize_templates optimize --optimizer mipro
|
|
|
|
# Evaluate current model
|
|
python -m backend.rag.optimize_templates evaluate
|
|
|
|
# Export optimized prompts
|
|
python -m backend.rag.optimize_templates export
|
|
|
|
Author: OpenCode
|
|
Created: 2025-01-06
|
|
"""
|
|
|
|
from __future__ import annotations

import json
import logging
import random
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Literal, Optional

import yaml
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# =============================================================================
|
|
# PATHS
|
|
# =============================================================================
|
|
|
|
TEMPLATES_PATH = Path(__file__).parent.parent.parent / "data" / "sparql_templates.yaml"
|
|
TRAINING_DATA_PATH = Path(__file__).parent.parent.parent / "data" / "training" / "template_training.json"
|
|
OPTIMIZED_PROMPTS_PATH = Path(__file__).parent / "optimized_prompts.json"
|
|
|
|
|
|
# =============================================================================
|
|
# TRAINING DATA STRUCTURES
|
|
# =============================================================================
|
|
|
|
@dataclass
|
|
class TemplateExample:
|
|
"""A training example for template classification."""
|
|
question: str
|
|
template_id: str
|
|
slots: dict[str, str]
|
|
language: str = "nl"
|
|
is_follow_up: bool = False
|
|
previous_question: Optional[str] = None
|
|
previous_slots: Optional[dict[str, str]] = None
|
|
|
|
|
|
@dataclass
|
|
class FykeExample:
|
|
"""A training example for Fyke filter."""
|
|
question: str
|
|
is_relevant: bool
|
|
reasoning: str
|
|
|
|
|
|
@dataclass
|
|
class SlotExample:
|
|
"""A training example for slot extraction."""
|
|
question: str
|
|
template_id: str
|
|
expected_slots: dict[str, str]
|
|
language: str = "nl"
|
|
|
|
|
|
@dataclass
|
|
class TrainingDataset:
|
|
"""Complete training dataset for all DSPy modules."""
|
|
template_examples: list[TemplateExample] = field(default_factory=list)
|
|
fyke_examples: list[FykeExample] = field(default_factory=list)
|
|
slot_examples: list[SlotExample] = field(default_factory=list)
|
|
follow_up_examples: list[TemplateExample] = field(default_factory=list)
|
|
metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
def to_dict(self) -> dict[str, Any]:
|
|
"""Convert to JSON-serializable dict."""
|
|
return {
|
|
"metadata": {
|
|
**self.metadata,
|
|
"generated_at": datetime.now(timezone.utc).isoformat(),
|
|
"template_count": len(self.template_examples),
|
|
"fyke_count": len(self.fyke_examples),
|
|
"slot_count": len(self.slot_examples),
|
|
"follow_up_count": len(self.follow_up_examples),
|
|
},
|
|
"template_examples": [
|
|
{
|
|
"question": ex.question,
|
|
"template_id": ex.template_id,
|
|
"slots": ex.slots,
|
|
"language": ex.language,
|
|
"is_follow_up": ex.is_follow_up,
|
|
"previous_question": ex.previous_question,
|
|
"previous_slots": ex.previous_slots,
|
|
}
|
|
for ex in self.template_examples
|
|
],
|
|
"fyke_examples": [
|
|
{
|
|
"question": ex.question,
|
|
"is_relevant": ex.is_relevant,
|
|
"reasoning": ex.reasoning,
|
|
}
|
|
for ex in self.fyke_examples
|
|
],
|
|
"slot_examples": [
|
|
{
|
|
"question": ex.question,
|
|
"template_id": ex.template_id,
|
|
"expected_slots": ex.expected_slots,
|
|
"language": ex.language,
|
|
}
|
|
for ex in self.slot_examples
|
|
],
|
|
"follow_up_examples": [
|
|
{
|
|
"question": ex.question,
|
|
"template_id": ex.template_id,
|
|
"slots": ex.slots,
|
|
"language": ex.language,
|
|
"is_follow_up": ex.is_follow_up,
|
|
"previous_question": ex.previous_question,
|
|
"previous_slots": ex.previous_slots,
|
|
}
|
|
for ex in self.follow_up_examples
|
|
],
|
|
}
|
|
|
|
@classmethod
|
|
def from_dict(cls, data: dict[str, Any]) -> "TrainingDataset":
|
|
"""Load from JSON dict."""
|
|
return cls(
|
|
metadata=data.get("metadata", {}),
|
|
template_examples=[
|
|
TemplateExample(**ex) for ex in data.get("template_examples", [])
|
|
],
|
|
fyke_examples=[
|
|
FykeExample(**ex) for ex in data.get("fyke_examples", [])
|
|
],
|
|
slot_examples=[
|
|
SlotExample(**ex) for ex in data.get("slot_examples", [])
|
|
],
|
|
follow_up_examples=[
|
|
TemplateExample(**ex) for ex in data.get("follow_up_examples", [])
|
|
],
|
|
)
|
|
|
|
|
|
# =============================================================================
|
|
# DATA GENERATION
|
|
# =============================================================================
|
|
|
|
class TrainingDataGenerator:
|
|
"""Generates training data from SPARQL templates YAML."""
|
|
|
|
def __init__(self):
|
|
self.templates: dict[str, Any] = {}
|
|
self.slot_types: dict[str, Any] = {}
|
|
self.fyke_config: dict[str, Any] = {}
|
|
self.follow_up_patterns: dict[str, Any] = {}
|
|
|
|
def load_templates(self) -> None:
|
|
"""Load templates from YAML file."""
|
|
if not TEMPLATES_PATH.exists():
|
|
raise FileNotFoundError(f"Templates not found: {TEMPLATES_PATH}")
|
|
|
|
with open(TEMPLATES_PATH) as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
self.templates = data.get("templates", {})
|
|
self.slot_types = data.get("_slot_types", {})
|
|
self.fyke_config = data.get("fyke_filter", {})
|
|
self.follow_up_patterns = data.get("follow_up_patterns", {})
|
|
|
|
logger.info(f"Loaded {len(self.templates)} templates")
|
|
|
|
def generate_template_examples(self) -> list[TemplateExample]:
|
|
"""Generate template classification examples from patterns."""
|
|
examples = []
|
|
|
|
# City and region substitution values
|
|
dutch_cities = [
|
|
"Amsterdam", "Rotterdam", "Den Haag", "Utrecht", "Eindhoven",
|
|
"Groningen", "Tilburg", "Almere", "Breda", "Nijmegen",
|
|
"Enschede", "Haarlem", "Arnhem", "Zaanstad", "Amersfoort",
|
|
"Apeldoorn", "Maastricht", "Leiden", "Dordrecht", "Zwolle",
|
|
]
|
|
|
|
regions = [
|
|
("Noord-Holland", "NL-NH"),
|
|
("Zuid-Holland", "NL-ZH"),
|
|
("Noord-Brabant", "NL-NB"),
|
|
("Gelderland", "NL-GE"),
|
|
("Utrecht", "NL-UT"),
|
|
("Overijssel", "NL-OV"),
|
|
("Limburg", "NL-LI"),
|
|
("Friesland", "NL-FR"),
|
|
("Groningen", "NL-GR"),
|
|
("Drenthe", "NL-DR"),
|
|
("Flevoland", "NL-FL"),
|
|
("Zeeland", "NL-ZE"),
|
|
]
|
|
|
|
countries = [
|
|
("Nederland", "Q55"),
|
|
("Belgium", "Q31"),
|
|
("Germany", "Q183"),
|
|
("France", "Q142"),
|
|
]
|
|
|
|
institution_types = [
|
|
("musea", "M"),
|
|
("archieven", "A"),
|
|
("bibliotheken", "L"),
|
|
("galerijen", "G"),
|
|
("museums", "M"),
|
|
("archives", "A"),
|
|
("libraries", "L"),
|
|
]
|
|
|
|
for template_id, template_data in self.templates.items():
|
|
patterns = template_data.get("question_patterns", [])
|
|
template_examples = template_data.get("examples", [])
|
|
|
|
# Use explicit examples from template
|
|
for ex in template_examples:
|
|
examples.append(TemplateExample(
|
|
question=ex.get("question", ""),
|
|
template_id=template_id,
|
|
slots=ex.get("slots", {}),
|
|
language="nl" if any(c in ex.get("question", "") for c in "éèêëîïôùûüçàâäœæ") or any(w in ex.get("question", "").lower() for w in ["welke", "hoeveel", "wat"]) else "en",
|
|
))
|
|
|
|
# Generate examples from patterns
|
|
for pattern in patterns[:3]: # Limit per pattern
|
|
# Substitute placeholders
|
|
if "{institution_type_nl}" in pattern or "{institution_type_en}" in pattern:
|
|
for inst_name, inst_code in institution_types[:3]:
|
|
question = pattern.replace("{institution_type_nl}", inst_name)
|
|
question = question.replace("{institution_type_en}", inst_name)
|
|
|
|
if "{city}" in question:
|
|
for city in random.sample(dutch_cities, min(3, len(dutch_cities))):
|
|
q = question.replace("{city}", city)
|
|
examples.append(TemplateExample(
|
|
question=q,
|
|
template_id=template_id,
|
|
slots={"institution_type": inst_code, "city": city},
|
|
language="nl" if "Welke" in q or "Hoeveel" in q else "en",
|
|
))
|
|
|
|
elif "{region}" in question:
|
|
for region_name, region_code in random.sample(regions, min(2, len(regions))):
|
|
q = question.replace("{region}", region_name)
|
|
examples.append(TemplateExample(
|
|
question=q,
|
|
template_id=template_id,
|
|
slots={"institution_type": inst_code, "region": region_code},
|
|
language="nl" if "Welke" in q or "Hoeveel" in q else "en",
|
|
))
|
|
|
|
elif "{country}" in question:
|
|
for country_name, country_code in random.sample(countries, min(2, len(countries))):
|
|
q = question.replace("{country}", country_name)
|
|
examples.append(TemplateExample(
|
|
question=q,
|
|
template_id=template_id,
|
|
slots={"institution_type": inst_code, "country": country_code},
|
|
language="nl" if "Welke" in q or "Hoeveel" in q else "en",
|
|
))
|
|
|
|
elif "{location}" in question:
|
|
for city in random.sample(dutch_cities, min(2, len(dutch_cities))):
|
|
q = question.replace("{location}", city)
|
|
examples.append(TemplateExample(
|
|
question=q,
|
|
template_id=template_id,
|
|
slots={"institution_type": inst_code, "location": city},
|
|
language="nl" if "Welke" in q or "Hoeveel" in q else "en",
|
|
))
|
|
|
|
elif "{city}" in pattern:
|
|
for city in random.sample(dutch_cities, min(3, len(dutch_cities))):
|
|
q = pattern.replace("{city}", city)
|
|
examples.append(TemplateExample(
|
|
question=q,
|
|
template_id=template_id,
|
|
slots={"city": city},
|
|
language="nl" if "Welke" in q or "Wat" in q else "en",
|
|
))
|
|
|
|
elif "{institution_name}" in pattern:
|
|
institution_names = ["Rijksmuseum", "Nationaal Archief", "Koninklijke Bibliotheek", "Van Gogh Museum"]
|
|
for name in institution_names[:2]:
|
|
q = pattern.replace("{institution_name}", name)
|
|
examples.append(TemplateExample(
|
|
question=q,
|
|
template_id=template_id,
|
|
slots={"institution_name": name},
|
|
language="nl" if "Waar" in q or "Informatie" in q else "en",
|
|
))
|
|
|
|
elif "{identifier}" in pattern:
|
|
identifiers = ["NL-AmRMA", "NL-HaNA", "NL-DhHSA"]
|
|
for ident in identifiers[:2]:
|
|
q = pattern.replace("{identifier}", ident)
|
|
examples.append(TemplateExample(
|
|
question=q,
|
|
template_id=template_id,
|
|
slots={"identifier": ident, "identifier_type": "isil"},
|
|
language="nl" if "Welke" in q else "en",
|
|
))
|
|
|
|
elif "{location1}" in pattern and "{location2}" in pattern:
|
|
pairs = [("Amsterdam", "Rotterdam"), ("Den Haag", "Utrecht"), ("Groningen", "Maastricht")]
|
|
for loc1, loc2 in pairs[:2]:
|
|
q = pattern.replace("{location1}", loc1).replace("{location2}", loc2)
|
|
examples.append(TemplateExample(
|
|
question=q,
|
|
template_id=template_id,
|
|
slots={"location1": loc1, "location2": loc2},
|
|
language="nl" if "Vergelijk" in q else "en",
|
|
))
|
|
|
|
logger.info(f"Generated {len(examples)} template classification examples")
|
|
return examples
|
|
|
|
def generate_fyke_examples(self) -> list[FykeExample]:
|
|
"""Generate Fyke filter training examples."""
|
|
examples = []
|
|
|
|
# Relevant examples (heritage-related)
|
|
relevant_questions = [
|
|
("Welke musea zijn er in Amsterdam?", "Contains heritage keyword 'musea' and asks about institutions"),
|
|
("Hoeveel archieven heeft Noord-Holland?", "Asks about archive count in a region"),
|
|
("Wat is het Rijksmuseum?", "Asks about a specific museum"),
|
|
("Welke bibliotheken zijn er in Den Haag?", "Contains heritage keyword 'bibliotheken'"),
|
|
("Erfgoedinstellingen in Utrecht", "Contains heritage keyword 'erfgoedinstellingen'"),
|
|
("What museums are in Rotterdam?", "English query about museums"),
|
|
("How many archives are there in the Netherlands?", "English query about archives"),
|
|
("Information about the National Archive", "Asks about heritage institution"),
|
|
("Cultural institutions in Maastricht", "Contains heritage keyword 'cultural institutions'"),
|
|
("Galerijen in Groningen", "Contains heritage keyword 'galerijen'"),
|
|
("Welke collecties heeft het Rijksmuseum?", "Asks about museum collections"),
|
|
("Tentoonstellingen in Amsterdam", "Asks about exhibitions"),
|
|
("GLAM instellingen in Nederland", "Contains GLAM acronym"),
|
|
("Heritage institutions in the Netherlands", "Contains heritage keyword"),
|
|
]
|
|
|
|
for question, reasoning in relevant_questions:
|
|
examples.append(FykeExample(
|
|
question=question,
|
|
is_relevant=True,
|
|
reasoning=reasoning,
|
|
))
|
|
|
|
# Irrelevant examples (out of scope)
|
|
irrelevant_questions = [
|
|
("Waar kan ik tandpasta kopen?", "Shopping query unrelated to heritage"),
|
|
("Wat is het weer morgen in Amsterdam?", "Weather query"),
|
|
("Beste restaurants in Rotterdam", "Restaurant/dining query"),
|
|
("Voetbalwedstrijd Ajax vandaag", "Sports query"),
|
|
("How do I book a hotel in The Hague?", "Travel booking query"),
|
|
("Bitcoin price today", "Cryptocurrency query"),
|
|
("Recipe for apple pie", "Cooking query"),
|
|
("Where can I find a supermarket?", "Shopping query"),
|
|
("What is the capital of France?", "General knowledge, not heritage-specific"),
|
|
("How do I fix my computer?", "Technical support query"),
|
|
("Dating tips", "Personal advice query"),
|
|
("Best flight deals to Spain", "Travel booking query"),
|
|
]
|
|
|
|
for question, reasoning in irrelevant_questions:
|
|
examples.append(FykeExample(
|
|
question=question,
|
|
is_relevant=False,
|
|
reasoning=reasoning,
|
|
))
|
|
|
|
# Edge cases (could be ambiguous)
|
|
edge_cases = [
|
|
("Buildings in Amsterdam", True, "Could refer to historic/heritage buildings"),
|
|
("History of Rotterdam", True, "Historical topic often relates to heritage"),
|
|
("Art in The Hague", True, "Art is heritage-related"),
|
|
("Books about Dutch history", True, "Library/archive related topic"),
|
|
("Where is the library?", True, "Asks about library location"),
|
|
("Opening hours", False, "Too vague without heritage context"),
|
|
("How old is it?", False, "Pronoun reference without context"),
|
|
]
|
|
|
|
for question, is_relevant, reasoning in edge_cases:
|
|
examples.append(FykeExample(
|
|
question=question,
|
|
is_relevant=is_relevant,
|
|
reasoning=reasoning,
|
|
))
|
|
|
|
logger.info(f"Generated {len(examples)} Fyke filter examples")
|
|
return examples
|
|
|
|
def generate_slot_examples(self) -> list[SlotExample]:
|
|
"""Generate slot extraction training examples."""
|
|
examples = []
|
|
|
|
# Institution type extraction
|
|
type_questions = [
|
|
("Welke musea zijn er in Amsterdam?", "list_institutions_by_type_city", {"institution_type": "M", "city": "Amsterdam"}),
|
|
("Archieven in Den Haag", "list_institutions_by_type_city", {"institution_type": "A", "city": "Den Haag"}),
|
|
("What libraries are in Rotterdam?", "list_institutions_by_type_city", {"institution_type": "L", "city": "Rotterdam"}),
|
|
("Galerijen in Utrecht", "list_institutions_by_type_city", {"institution_type": "G", "city": "Utrecht"}),
|
|
("Museums in Noord-Holland", "list_institutions_by_type_region", {"institution_type": "M", "region": "NL-NH"}),
|
|
("Archives in Gelderland", "list_institutions_by_type_region", {"institution_type": "A", "region": "NL-GE"}),
|
|
("Bibliotheken in Nederland", "list_institutions_by_type_country", {"institution_type": "L", "country": "Q55"}),
|
|
("Hoeveel musea zijn er in Amsterdam?", "count_institutions_by_type_location", {"institution_type": "M", "location": "Amsterdam"}),
|
|
("Hoeveel archieven heeft Zuid-Holland?", "count_institutions_by_type_location", {"institution_type": "A", "location": "NL-ZH"}),
|
|
]
|
|
|
|
for question, template_id, expected_slots in type_questions:
|
|
examples.append(SlotExample(
|
|
question=question,
|
|
template_id=template_id,
|
|
expected_slots=expected_slots,
|
|
language="nl" if any(w in question.lower() for w in ["welke", "hoeveel", "wat"]) else "en",
|
|
))
|
|
|
|
# Name extraction
|
|
name_questions = [
|
|
("Waar is het Rijksmuseum?", "find_institution_by_name", {"institution_name": "Rijksmuseum"}),
|
|
("Informatie over het Nationaal Archief", "find_institution_by_name", {"institution_name": "Nationaal Archief"}),
|
|
("What is the Van Gogh Museum?", "find_institution_by_name", {"institution_name": "Van Gogh Museum"}),
|
|
("Tell me about Koninklijke Bibliotheek", "find_institution_by_name", {"institution_name": "Koninklijke Bibliotheek"}),
|
|
]
|
|
|
|
for question, template_id, expected_slots in name_questions:
|
|
examples.append(SlotExample(
|
|
question=question,
|
|
template_id=template_id,
|
|
expected_slots=expected_slots,
|
|
language="nl" if "Waar" in question or "Informatie" in question else "en",
|
|
))
|
|
|
|
# Identifier extraction
|
|
identifier_questions = [
|
|
("Welke instelling heeft ISIL NL-AmRMA?", "find_institution_by_identifier", {"identifier": "NL-AmRMA", "identifier_type": "isil"}),
|
|
("Zoek ISIL NL-HaNA", "find_institution_by_identifier", {"identifier": "NL-HaNA", "identifier_type": "isil"}),
|
|
("Find institution with ISIL DE-1", "find_institution_by_identifier", {"identifier": "DE-1", "identifier_type": "isil"}),
|
|
]
|
|
|
|
for question, template_id, expected_slots in identifier_questions:
|
|
examples.append(SlotExample(
|
|
question=question,
|
|
template_id=template_id,
|
|
expected_slots=expected_slots,
|
|
language="nl" if "Welke" in question or "Zoek" in question else "en",
|
|
))
|
|
|
|
# Comparison extraction
|
|
comparison_questions = [
|
|
("Vergelijk Amsterdam en Rotterdam", "compare_locations", {"location1": "Amsterdam", "location2": "Rotterdam"}),
|
|
("Amsterdam versus Utrecht", "compare_locations", {"location1": "Amsterdam", "location2": "Utrecht"}),
|
|
("Compare The Hague and Leiden", "compare_locations", {"location1": "Den Haag", "location2": "Leiden"}),
|
|
]
|
|
|
|
for question, template_id, expected_slots in comparison_questions:
|
|
examples.append(SlotExample(
|
|
question=question,
|
|
template_id=template_id,
|
|
expected_slots=expected_slots,
|
|
language="nl" if "Vergelijk" in question else "en",
|
|
))
|
|
|
|
logger.info(f"Generated {len(examples)} slot extraction examples")
|
|
return examples
|
|
|
|
def generate_follow_up_examples(self) -> list[TemplateExample]:
|
|
"""Generate follow-up question resolution examples."""
|
|
examples = []
|
|
|
|
# Location swap follow-ups
|
|
location_swaps = [
|
|
# (follow-up, resolved, previous_question, inherited_slots, new_slots)
|
|
("En in Rotterdam?", "Welke musea zijn er in Rotterdam?",
|
|
"Welke musea zijn er in Amsterdam?", {"institution_type": "M"}, {"institution_type": "M", "city": "Rotterdam"}),
|
|
("En in Utrecht?", "Welke archieven zijn er in Utrecht?",
|
|
"Welke archieven zijn er in Den Haag?", {"institution_type": "A"}, {"institution_type": "A", "city": "Utrecht"}),
|
|
("What about Groningen?", "What libraries are in Groningen?",
|
|
"What libraries are in Amsterdam?", {"institution_type": "L"}, {"institution_type": "L", "city": "Groningen"}),
|
|
("En Enschede?", "Welke bibliotheken zijn er in Enschede?",
|
|
"Welke bibliotheken zijn er in Zwolle?", {"institution_type": "L"}, {"institution_type": "L", "city": "Enschede"}),
|
|
]
|
|
|
|
for follow_up, resolved, prev_q, prev_slots, new_slots in location_swaps:
|
|
examples.append(TemplateExample(
|
|
question=follow_up,
|
|
template_id="list_institutions_by_type_city",
|
|
slots=new_slots,
|
|
language="nl" if "En" in follow_up else "en",
|
|
is_follow_up=True,
|
|
previous_question=prev_q,
|
|
previous_slots=prev_slots,
|
|
))
|
|
|
|
# Type swap follow-ups
|
|
type_swaps = [
|
|
("En de archieven?", "Welke archieven zijn er in Amsterdam?",
|
|
"Welke musea zijn er in Amsterdam?", {"city": "Amsterdam"}, {"institution_type": "A", "city": "Amsterdam"}),
|
|
("What about libraries?", "What libraries are in Rotterdam?",
|
|
"What museums are in Rotterdam?", {"city": "Rotterdam"}, {"institution_type": "L", "city": "Rotterdam"}),
|
|
("En bibliotheken?", "Welke bibliotheken zijn er in Utrecht?",
|
|
"Welke galerijen zijn er in Utrecht?", {"city": "Utrecht"}, {"institution_type": "L", "city": "Utrecht"}),
|
|
]
|
|
|
|
for follow_up, resolved, prev_q, prev_slots, new_slots in type_swaps:
|
|
examples.append(TemplateExample(
|
|
question=follow_up,
|
|
template_id="list_institutions_by_type_city",
|
|
slots=new_slots,
|
|
language="nl" if "En" in follow_up else "en",
|
|
is_follow_up=True,
|
|
previous_question=prev_q,
|
|
previous_slots=prev_slots,
|
|
))
|
|
|
|
# Count from list follow-ups
|
|
count_follow_ups = [
|
|
("Hoeveel zijn dat?", "Hoeveel musea zijn er in Amsterdam?",
|
|
"Welke musea zijn er in Amsterdam?", {"institution_type": "M", "city": "Amsterdam"}, {"institution_type": "M", "location": "Amsterdam"}),
|
|
("How many?", "How many archives are in Rotterdam?",
|
|
"What archives are in Rotterdam?", {"institution_type": "A", "city": "Rotterdam"}, {"institution_type": "A", "location": "Rotterdam"}),
|
|
("Tel ze", "Hoeveel bibliotheken zijn er in Den Haag?",
|
|
"Welke bibliotheken zijn er in Den Haag?", {"institution_type": "L", "city": "Den Haag"}, {"institution_type": "L", "location": "Den Haag"}),
|
|
]
|
|
|
|
for follow_up, resolved, prev_q, prev_slots, new_slots in count_follow_ups:
|
|
examples.append(TemplateExample(
|
|
question=follow_up,
|
|
template_id="count_institutions_by_type_location",
|
|
slots=new_slots,
|
|
language="nl" if "Hoeveel" in follow_up or "Tel" in follow_up else "en",
|
|
is_follow_up=True,
|
|
previous_question=prev_q,
|
|
previous_slots=prev_slots,
|
|
))
|
|
|
|
logger.info(f"Generated {len(examples)} follow-up examples")
|
|
return examples
|
|
|
|
def generate_dataset(self) -> TrainingDataset:
|
|
"""Generate complete training dataset."""
|
|
self.load_templates()
|
|
|
|
return TrainingDataset(
|
|
template_examples=self.generate_template_examples(),
|
|
fyke_examples=self.generate_fyke_examples(),
|
|
slot_examples=self.generate_slot_examples(),
|
|
follow_up_examples=self.generate_follow_up_examples(),
|
|
metadata={
|
|
"source": str(TEMPLATES_PATH),
|
|
"template_count": len(self.templates),
|
|
},
|
|
)
|
|
|
|
|
|
# =============================================================================
|
|
# EVALUATION METRICS
|
|
# =============================================================================
|
|
|
|
@dataclass
|
|
class EvaluationResult:
|
|
"""Results from evaluating a DSPy module."""
|
|
module_name: str
|
|
accuracy: float
|
|
precision: float
|
|
recall: float
|
|
f1_score: float
|
|
total_examples: int
|
|
correct: int
|
|
errors: list[dict[str, Any]] = field(default_factory=list)
|
|
|
|
def to_dict(self) -> dict[str, Any]:
|
|
return {
|
|
"module_name": self.module_name,
|
|
"accuracy": self.accuracy,
|
|
"precision": self.precision,
|
|
"recall": self.recall,
|
|
"f1_score": self.f1_score,
|
|
"total_examples": self.total_examples,
|
|
"correct": self.correct,
|
|
"error_count": len(self.errors),
|
|
"errors": self.errors[:10], # Limit error samples
|
|
}
|
|
|
|
|
|
class TemplateEvaluator:
|
|
"""Evaluates template classification and slot extraction."""
|
|
|
|
def __init__(self):
|
|
self.pipeline = None
|
|
|
|
def _get_pipeline(self):
|
|
"""Lazy load pipeline."""
|
|
if self.pipeline is None:
|
|
from .template_sparql import TemplateSPARQLPipeline
|
|
self.pipeline = TemplateSPARQLPipeline()
|
|
return self.pipeline
|
|
|
|
def evaluate_template_classification(
|
|
self,
|
|
examples: list[TemplateExample],
|
|
sample_size: Optional[int] = None,
|
|
) -> EvaluationResult:
|
|
"""Evaluate template classification accuracy."""
|
|
pipeline = self._get_pipeline()
|
|
|
|
if sample_size and sample_size < len(examples):
|
|
examples = random.sample(examples, sample_size)
|
|
|
|
correct = 0
|
|
errors = []
|
|
|
|
for ex in examples:
|
|
try:
|
|
result = pipeline.template_classifier.forward(
|
|
question=ex.question,
|
|
language=ex.language,
|
|
)
|
|
|
|
predicted = result.template_id
|
|
expected = ex.template_id
|
|
|
|
if predicted == expected:
|
|
correct += 1
|
|
else:
|
|
errors.append({
|
|
"question": ex.question,
|
|
"expected": expected,
|
|
"predicted": predicted,
|
|
"confidence": result.confidence,
|
|
})
|
|
|
|
except Exception as e:
|
|
errors.append({
|
|
"question": ex.question,
|
|
"expected": ex.template_id,
|
|
"error": str(e),
|
|
})
|
|
|
|
total = len(examples)
|
|
accuracy = correct / total if total > 0 else 0.0
|
|
|
|
return EvaluationResult(
|
|
module_name="TemplateClassifier",
|
|
accuracy=accuracy,
|
|
precision=accuracy, # For classification, precision = accuracy
|
|
recall=accuracy,
|
|
f1_score=accuracy,
|
|
total_examples=total,
|
|
correct=correct,
|
|
errors=errors,
|
|
)
|
|
|
|
def evaluate_slot_extraction(
|
|
self,
|
|
examples: list[SlotExample],
|
|
sample_size: Optional[int] = None,
|
|
) -> EvaluationResult:
|
|
"""Evaluate slot extraction precision and recall."""
|
|
pipeline = self._get_pipeline()
|
|
|
|
if sample_size and sample_size < len(examples):
|
|
examples = random.sample(examples, sample_size)
|
|
|
|
total_expected = 0
|
|
total_predicted = 0
|
|
total_correct = 0
|
|
errors = []
|
|
|
|
for ex in examples:
|
|
try:
|
|
predicted_slots, _ = pipeline.slot_extractor.forward(
|
|
question=ex.question,
|
|
template_id=ex.template_id,
|
|
inherited_slots=None,
|
|
)
|
|
|
|
expected_slots = ex.expected_slots
|
|
|
|
# Count slot matches
|
|
for key, expected_value in expected_slots.items():
|
|
total_expected += 1
|
|
if key in predicted_slots:
|
|
total_predicted += 1
|
|
# Normalize for comparison
|
|
pred_val = str(predicted_slots[key]).strip().lower()
|
|
exp_val = str(expected_value).strip().lower()
|
|
if pred_val == exp_val or pred_val in exp_val or exp_val in pred_val:
|
|
total_correct += 1
|
|
else:
|
|
errors.append({
|
|
"question": ex.question,
|
|
"slot": key,
|
|
"expected": expected_value,
|
|
"predicted": predicted_slots[key],
|
|
})
|
|
else:
|
|
errors.append({
|
|
"question": ex.question,
|
|
"slot": key,
|
|
"expected": expected_value,
|
|
"predicted": None,
|
|
})
|
|
|
|
except Exception as e:
|
|
errors.append({
|
|
"question": ex.question,
|
|
"error": str(e),
|
|
})
|
|
|
|
precision = total_correct / total_predicted if total_predicted > 0 else 0.0
|
|
recall = total_correct / total_expected if total_expected > 0 else 0.0
|
|
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
|
|
|
|
return EvaluationResult(
|
|
module_name="SlotExtractor",
|
|
accuracy=f1,
|
|
precision=precision,
|
|
recall=recall,
|
|
f1_score=f1,
|
|
total_examples=len(examples),
|
|
correct=total_correct,
|
|
errors=errors,
|
|
)
|
|
|
|
def evaluate_fyke_filter(
|
|
self,
|
|
examples: list[FykeExample],
|
|
sample_size: Optional[int] = None,
|
|
) -> EvaluationResult:
|
|
"""Evaluate Fyke filter accuracy."""
|
|
pipeline = self._get_pipeline()
|
|
|
|
if sample_size and sample_size < len(examples):
|
|
examples = random.sample(examples, sample_size)
|
|
|
|
correct = 0
|
|
errors = []
|
|
true_positives = 0
|
|
false_positives = 0
|
|
false_negatives = 0
|
|
|
|
for ex in examples:
|
|
try:
|
|
result = pipeline.fyke_filter.forward(
|
|
resolved_question=ex.question,
|
|
conversation_topic="heritage institutions",
|
|
language="nl",
|
|
)
|
|
|
|
predicted = result.is_relevant
|
|
expected = ex.is_relevant
|
|
|
|
if predicted == expected:
|
|
correct += 1
|
|
if expected:
|
|
true_positives += 1
|
|
else:
|
|
errors.append({
|
|
"question": ex.question,
|
|
"expected": expected,
|
|
"predicted": predicted,
|
|
"reasoning": result.reasoning,
|
|
})
|
|
if predicted and not expected:
|
|
false_positives += 1
|
|
elif not predicted and expected:
|
|
false_negatives += 1
|
|
|
|
except Exception as e:
|
|
errors.append({
|
|
"question": ex.question,
|
|
"expected": ex.is_relevant,
|
|
"error": str(e),
|
|
})
|
|
|
|
total = len(examples)
|
|
accuracy = correct / total if total > 0 else 0.0
|
|
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
|
|
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0
|
|
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
|
|
|
|
return EvaluationResult(
|
|
module_name="FykeFilter",
|
|
accuracy=accuracy,
|
|
precision=precision,
|
|
recall=recall,
|
|
f1_score=f1,
|
|
total_examples=total,
|
|
correct=correct,
|
|
errors=errors,
|
|
)
|
|
|
|
|
|
# =============================================================================
|
|
# DSPy OPTIMIZATION
|
|
# =============================================================================
|
|
|
|
class TemplateOptimizer:
|
|
"""Optimizes DSPy modules using MIPRO or BootstrapFewShot."""
|
|
|
|
def __init__(self, optimizer_type: Literal["mipro", "bootstrap", "copro"] = "bootstrap"):
|
|
self.optimizer_type = optimizer_type
|
|
self.pipeline = None
|
|
|
|
def _get_pipeline(self):
|
|
"""Lazy load pipeline."""
|
|
if self.pipeline is None:
|
|
from .template_sparql import TemplateSPARQLPipeline
|
|
self.pipeline = TemplateSPARQLPipeline()
|
|
return self.pipeline
|
|
|
|
def _template_classification_metric(self, example, prediction, trace=None) -> float:
|
|
"""Metric for template classification optimization."""
|
|
expected = example.template_id
|
|
predicted = prediction.template_id if hasattr(prediction, 'template_id') else None
|
|
return 1.0 if predicted == expected else 0.0
|
|
|
|
def _slot_extraction_metric(self, example, prediction, trace=None) -> float:
|
|
"""Metric for slot extraction optimization."""
|
|
expected_slots = example.expected_slots
|
|
predicted_slots = prediction if isinstance(prediction, dict) else {}
|
|
|
|
if not expected_slots:
|
|
return 1.0
|
|
|
|
correct = 0
|
|
for key, expected_value in expected_slots.items():
|
|
if key in predicted_slots:
|
|
pred_val = str(predicted_slots[key]).strip().lower()
|
|
exp_val = str(expected_value).strip().lower()
|
|
if pred_val == exp_val or pred_val in exp_val or exp_val in pred_val:
|
|
correct += 1
|
|
|
|
return correct / len(expected_slots)
|
|
|
|
def optimize_template_classifier(
    self,
    examples: list[TemplateExample],
    num_trials: int = 10,
) -> dict[str, Any]:
    """Optimize template classifier using a DSPy optimizer.

    Converts *examples* to ``dspy.Example`` objects, shuffles and splits
    them 80/20 into train/dev, compiles the pipeline's classifier with the
    optimizer selected by ``self.optimizer_type`` ("mipro", "copro", or
    the BootstrapFewShot default — each falling back to BootstrapFewShot
    when the requested teleprompter is not available in the installed
    DSPy version), then reports accuracy on the held-out dev split.

    Args:
        examples: Labelled (question, language, template_id) examples.
        num_trials: Candidate count passed to MIPRO / MIPROv2.

    Returns:
        On success: dict with ``success``, ``optimizer``, ``train_size``,
        ``dev_size``, ``dev_accuracy``, and the ``optimized_module``.
        On failure: ``{"success": False, "error": <message>}``.
    """
    try:
        import dspy
    except ImportError:
        # Keep the same shape as the failure path below so callers can
        # uniformly check result.get("success") / result.get("error").
        return {"success": False, "error": "DSPy not installed"}

    pipeline = self._get_pipeline()

    # Convert examples to DSPy format.
    trainset = [
        dspy.Example(
            question=ex.question,
            language=ex.language,
            template_id=ex.template_id,
        ).with_inputs("question", "language")
        for ex in examples
    ]

    # Split into train/dev (80/20).
    random.shuffle(trainset)
    split = int(len(trainset) * 0.8)
    train = trainset[:split]
    dev = trainset[split:]

    def _bootstrap_fallback():
        # Single shared fallback for every branch where the requested
        # optimizer is missing from the installed DSPy version.
        from dspy.teleprompt import BootstrapFewShot
        return BootstrapFewShot(
            metric=self._template_classification_metric,
            max_bootstrapped_demos=4,
            max_labeled_demos=4,
        )

    # Select optimizer.
    if self.optimizer_type == "mipro":
        try:
            # DSPy 2.6+ uses MIPROv2.
            from dspy.teleprompt import MIPROv2
            optimizer = MIPROv2(
                metric=self._template_classification_metric,
                num_candidates=num_trials,
                auto="medium",  # light, medium, or heavy
            )
        except ImportError:
            try:
                # Fallback to MIPRO for older DSPy versions.
                from dspy.teleprompt import MIPRO
                optimizer = MIPRO(
                    metric=self._template_classification_metric,
                    num_candidates=num_trials,
                )
            except ImportError:
                optimizer = _bootstrap_fallback()
    elif self.optimizer_type == "copro":
        try:
            from dspy.teleprompt import COPRO
            optimizer = COPRO(
                metric=self._template_classification_metric,
                depth=3,
                breadth=3,
            )
        except ImportError:
            optimizer = _bootstrap_fallback()
    else:
        optimizer = _bootstrap_fallback()

    # Run optimization.
    logger.info(f"Running {self.optimizer_type} optimization with {len(train)} training examples...")

    try:
        # DSPy compile() signature varies by version and optimizer:
        #   BootstrapFewShot: compile(student, trainset=..., teacher=None)
        #   MIPROv2: compile(student, trainset=..., num_trials=..., eval_kwargs=...)
        if self.optimizer_type == "mipro" and hasattr(optimizer, 'compile'):
            # MIPROv2 uses eval_kwargs for the validation set.
            optimized_classifier = optimizer.compile(
                pipeline.template_classifier,
                trainset=train,
                eval_kwargs={"devset": dev} if dev else {},
            )
        else:
            # BootstrapFewShot and COPRO use the simpler signature.
            optimized_classifier = optimizer.compile(
                pipeline.template_classifier,
                trainset=train,
            )

        # Evaluate the optimized model on the held-out dev split.
        correct = 0
        for ex in dev:
            result = optimized_classifier.forward(
                question=ex.question,
                language=ex.language,
            )
            if result.template_id == ex.template_id:
                correct += 1

        accuracy = correct / len(dev) if dev else 0.0

        return {
            "success": True,
            "optimizer": self.optimizer_type,
            "train_size": len(train),
            "dev_size": len(dev),
            "dev_accuracy": accuracy,
            "optimized_module": optimized_classifier,
        }

    except Exception as e:
        logger.error(f"Optimization failed: {e}")
        return {"success": False, "error": str(e)}
|
|
# =============================================================================
|
|
# CLI
|
|
# =============================================================================
|
|
|
|
def _load_or_generate_dataset():
    """Load the cached training dataset from disk, generating it if absent."""
    if TRAINING_DATA_PATH.exists():
        # Explicit utf-8: the dataset is written with ensure_ascii=False.
        with open(TRAINING_DATA_PATH, encoding="utf-8") as f:
            return TrainingDataset.from_dict(json.load(f))
    print("Training data not found, generating...")
    return TrainingDataGenerator().generate_dataset()


def main():
    """CLI entry point.

    Subcommands:
        generate-data  Build the training dataset from the template YAML.
        evaluate       Score the current modules (template / slot / fyke).
        optimize       Run a DSPy optimizer over the chosen module.
        export         Placeholder; optimized modules are saved during
                       optimization.
    """
    import argparse

    parser = argparse.ArgumentParser(description="DSPy Template Optimizer")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # generate-data command
    gen_parser = subparsers.add_parser("generate-data", help="Generate training data from templates")
    gen_parser.add_argument("--output", "-o", type=str, default=str(TRAINING_DATA_PATH),
                            help="Output path for training data JSON")

    # evaluate command
    eval_parser = subparsers.add_parser("evaluate", help="Evaluate current model")
    eval_parser.add_argument("--module", "-m", type=str, default="all",
                             choices=["all", "template", "slot", "fyke"],
                             help="Which module to evaluate")
    eval_parser.add_argument("--sample-size", "-n", type=int, default=50,
                             help="Number of examples to evaluate")

    # optimize command
    opt_parser = subparsers.add_parser("optimize", help="Run DSPy optimization")
    opt_parser.add_argument("--optimizer", "-O", type=str, default="bootstrap",
                            choices=["mipro", "bootstrap", "copro"],
                            help="Optimizer to use")
    opt_parser.add_argument("--trials", "-t", type=int, default=10,
                            help="Number of optimization trials")
    opt_parser.add_argument("--module", "-m", type=str, default="template",
                            choices=["template", "slot"],
                            help="Module to optimize")

    # export command
    export_parser = subparsers.add_parser("export", help="Export optimized prompts")
    export_parser.add_argument("--output", "-o", type=str, default=str(OPTIMIZED_PROMPTS_PATH),
                               help="Output path for optimized prompts")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    if args.command == "generate-data":
        print("Generating training data from templates...")
        generator = TrainingDataGenerator()
        dataset = generator.generate_dataset()

        # Ensure output directory exists
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Explicit utf-8: ensure_ascii=False emits raw non-ASCII characters,
        # which would fail under a cp1252 default encoding on Windows.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(dataset.to_dict(), f, indent=2, ensure_ascii=False)

        print("Generated training data:")
        print(f" - Template examples: {len(dataset.template_examples)}")
        print(f" - Fyke examples: {len(dataset.fyke_examples)}")
        print(f" - Slot examples: {len(dataset.slot_examples)}")
        print(f" - Follow-up examples: {len(dataset.follow_up_examples)}")
        print(f" - Saved to: {output_path}")

    elif args.command == "evaluate":
        print("Loading training data...")
        dataset = _load_or_generate_dataset()

        evaluator = TemplateEvaluator()

        if args.module in ["all", "template"]:
            print("\nEvaluating Template Classifier...")
            result = evaluator.evaluate_template_classification(
                dataset.template_examples,
                sample_size=args.sample_size,
            )
            print(f" Accuracy: {result.accuracy:.2%}")
            print(f" Correct: {result.correct}/{result.total_examples}")
            if result.errors:
                print(" Sample errors:")
                for err in result.errors[:3]:
                    print(f" - Q: {err.get('question', '')[:50]}...")
                    print(f" Expected: {err.get('expected')}, Got: {err.get('predicted')}")

        if args.module in ["all", "slot"]:
            print("\nEvaluating Slot Extractor...")
            result = evaluator.evaluate_slot_extraction(
                dataset.slot_examples,
                sample_size=args.sample_size,
            )
            print(f" Precision: {result.precision:.2%}")
            print(f" Recall: {result.recall:.2%}")
            print(f" F1 Score: {result.f1_score:.2%}")

        if args.module in ["all", "fyke"]:
            print("\nEvaluating Fyke Filter...")
            result = evaluator.evaluate_fyke_filter(
                dataset.fyke_examples,
                sample_size=args.sample_size,
            )
            print(f" Accuracy: {result.accuracy:.2%}")
            print(f" Precision: {result.precision:.2%}")
            print(f" Recall: {result.recall:.2%}")
            print(f" F1 Score: {result.f1_score:.2%}")

    elif args.command == "optimize":
        print(f"Running {args.optimizer} optimization for {args.module}...")
        dataset = _load_or_generate_dataset()

        optimizer = TemplateOptimizer(optimizer_type=args.optimizer)

        if args.module == "template":
            result = optimizer.optimize_template_classifier(
                dataset.template_examples,
                num_trials=args.trials,
            )

            if result.get("success"):
                print("Optimization complete!")
                print(f" Dev accuracy: {result['dev_accuracy']:.2%}")
            else:
                print(f"Optimization failed: {result.get('error')}")

    elif args.command == "export":
        print("Export not yet implemented - optimized modules are saved during optimization")

    else:
        parser.print_help()
|
|
# Script entry: supports `python -m backend.rag.optimize_templates <command>`.
if __name__ == "__main__":
    main()
|