glam/backend/rag/gepa_training_extended.py
2025-12-11 22:32:09 +01:00

569 lines
23 KiB
Python

"""
Extended GEPA Training Examples for Heritage RAG

This module provides additional training and validation examples for
GEPA optimization, based on real heritage institution query patterns.

Usage:
    from gepa_training_extended import get_extended_training_data
    trainset, valset = get_extended_training_data()
"""
from collections import Counter

from dspy import Example
# Retrieval-source presets shared by many examples. Stored as tuples and
# copied into a fresh list per example so no two examples alias one list.
_POSTGIS_QDRANT_SPARQL = ("postgis", "qdrant", "sparql")
_SPARQL_QDRANT = ("sparql", "qdrant")
_TYPEDB_SPARQL = ("typedb", "sparql")
_QDRANT_SPARQL = ("qdrant", "sparql")


def _example(
    question: str,
    language: str,
    intent: str,
    entities: list[str],
    sources: tuple[str, ...],
    contains: list[str],
) -> Example:
    """Build one labelled Example with `question` and `language` as inputs."""
    return Example(
        question=question,
        language=language,
        expected_intent=intent,
        expected_entities=entities,
        expected_sources=list(sources),
        answer_contains=contains,
    ).with_inputs("question", "language")


def get_extended_training_data() -> tuple[list[Example], list[Example]]:
    """Get extended training and validation data for GEPA optimization.

    The examples cover:
    - All 7 query intents (geographic, statistical, relational, temporal,
      entity_lookup, comparative, exploration)
    - Dutch, English and German questions (no French examples are present)
    - A selection of GLAMORCUBESFIXPHDNT institution types (museums,
      archives, libraries, galleries, research centers, botanical gardens,
      education, societies, holy sites, digital platforms)

    Every example carries the fields `question`, `language`,
    `expected_intent`, `expected_entities`, `expected_sources` and
    `answer_contains`; only `question` and `language` are marked as inputs.
    Each call builds fresh Example objects.

    Returns:
        Tuple of (trainset, valset) with Example objects: 40 training and
        10 validation examples.
    """
    # ======================================================================
    # TRAINING EXAMPLES (40 examples, 80% of data)
    # Row layout: question, language, intent, entities, sources, contains
    # ======================================================================
    trainset = [
        # ------------------------------------------------------------------
        # GEOGRAPHIC INTENT
        # ------------------------------------------------------------------
        _example(
            "Hoeveel musea zijn er in Amsterdam?", "nl", "geographic",
            ["amsterdam", "musea"], _POSTGIS_QDRANT_SPARQL,
            ["Amsterdam", "museum"],
        ),
        _example(
            "Show me archives within 10km of Utrecht Centraal", "en", "geographic",
            ["utrecht centraal", "archives"], _POSTGIS_QDRANT_SPARQL,
            ["Utrecht", "archive", "km"],
        ),
        _example(
            "Welke bibliotheken zijn er in Limburg?", "nl", "geographic",
            ["limburg", "bibliotheken"], _POSTGIS_QDRANT_SPARQL,
            ["Limburg", "bibliotheek"],
        ),
        _example(
            "Find heritage institutions in the Randstad region", "en", "geographic",
            ["randstad", "heritage institutions"], _POSTGIS_QDRANT_SPARQL,
            ["Randstad", "Amsterdam", "Rotterdam"],
        ),
        _example(
            "Zeig mir Museen in der Nähe von Maastricht", "de", "geographic",
            ["maastricht", "museen"], _POSTGIS_QDRANT_SPARQL,
            ["Maastricht", "Museum"],
        ),
        # ------------------------------------------------------------------
        # STATISTICAL INTENT
        # ------------------------------------------------------------------
        _example(
            "How many libraries are there in the Netherlands?", "en", "statistical",
            ["libraries", "netherlands"], _SPARQL_QDRANT,
            ["librar", "Netherlands", "number"],
        ),
        _example(
            "Hoeveel erfgoedinstellingen heeft Noord-Holland?", "nl", "statistical",
            ["noord-holland", "erfgoedinstellingen"], _SPARQL_QDRANT,
            ["Noord-Holland", "aantal"],
        ),
        _example(
            "What percentage of Dutch museums have a Wikidata entry?", "en", "statistical",
            ["dutch museums", "wikidata"], _SPARQL_QDRANT,
            ["percent", "Wikidata", "museum"],
        ),
        _example(
            "Count archives by province in the Netherlands", "en", "statistical",
            ["archives", "province", "netherlands"], _SPARQL_QDRANT,
            ["archive", "province"],
        ),
        _example(
            "Wat is de verdeling van erfgoedinstellingen per type?", "nl", "statistical",
            ["erfgoedinstellingen", "type", "verdeling"], _SPARQL_QDRANT,
            ["verdeling", "type"],
        ),
        # ------------------------------------------------------------------
        # RELATIONAL INTENT
        # ------------------------------------------------------------------
        _example(
            "What institutions are part of the Erfgoed Leiden network?", "en", "relational",
            ["erfgoed leiden"], _TYPEDB_SPARQL,
            ["Erfgoed", "Leiden", "member"],
        ),
        _example(
            "Welke archieven zijn onderdeel van het Nationaal Archief?", "nl", "relational",
            ["nationaal archief"], _TYPEDB_SPARQL,
            ["Nationaal Archief", "onderdeel"],
        ),
        _example(
            "Show relationships between museums in Amsterdam", "en", "relational",
            ["museums", "amsterdam"], _TYPEDB_SPARQL,
            ["Amsterdam", "museum", "relation"],
        ),
        _example(
            "Which archives merged to form Noord-Hollands Archief?", "en", "relational",
            ["noord-hollands archief"], _TYPEDB_SPARQL,
            ["Noord-Hollands Archief", "merge"],
        ),
        # ------------------------------------------------------------------
        # TEMPORAL INTENT
        # ------------------------------------------------------------------
        _example(
            "When was the Rijksmuseum founded?", "en", "temporal",
            ["rijksmuseum"], _SPARQL_QDRANT,
            ["Rijksmuseum", "founded", "1800"],
        ),
        _example(
            "Welke bibliotheken zijn gefuseerd sinds 2000?", "nl", "temporal",
            ["bibliotheken", "fusie", "2000"], _TYPEDB_SPARQL,
            ["bibliotheek", "fusie"],
        ),
        _example(
            "Show the history of organizational changes for Stadsarchief Amsterdam",
            "en", "temporal",
            ["stadsarchief amsterdam"], _TYPEDB_SPARQL,
            ["Stadsarchief", "Amsterdam", "change"],
        ),
        _example(
            "Which museums closed in the last 10 years?", "en", "temporal",
            ["museums", "closed"], _TYPEDB_SPARQL,
            ["museum", "closed", "year"],
        ),
        _example(
            "Wanneer is het Nationaal Archief opgericht?", "nl", "temporal",
            ["nationaal archief"], _SPARQL_QDRANT,
            ["Nationaal Archief", "opgericht"],
        ),
        # ------------------------------------------------------------------
        # ENTITY LOOKUP INTENT
        # ------------------------------------------------------------------
        _example(
            "Where is the Rijksmuseum located?", "en", "entity_lookup",
            ["rijksmuseum"], _SPARQL_QDRANT,
            ["Rijksmuseum", "Amsterdam", "Museumplein"],
        ),
        _example(
            "What is the ISIL code for Koninklijke Bibliotheek?", "en", "entity_lookup",
            ["koninklijke bibliotheek", "isil"], _SPARQL_QDRANT,
            ["Koninklijke Bibliotheek", "ISIL", "NL-"],
        ),
        _example(
            "Geef informatie over het Van Gogh Museum", "nl", "entity_lookup",
            ["van gogh museum"], _SPARQL_QDRANT,
            ["Van Gogh", "Museum", "Amsterdam"],
        ),
        _example(
            "What type of institution is Naturalis?", "en", "entity_lookup",
            ["naturalis"], _SPARQL_QDRANT,
            ["Naturalis", "museum", "natural history"],
        ),
        _example(
            "Show me details about Regionaal Archief Tilburg", "en", "entity_lookup",
            ["regionaal archief tilburg"], _SPARQL_QDRANT,
            ["Tilburg", "archief"],
        ),
        # ------------------------------------------------------------------
        # COMPARATIVE INTENT
        # ------------------------------------------------------------------
        _example(
            "Compare the collections of Rijksmuseum and Van Gogh Museum", "en", "comparative",
            ["rijksmuseum", "van gogh museum"], _SPARQL_QDRANT,
            ["Rijksmuseum", "Van Gogh", "collection"],
        ),
        _example(
            "Vergelijk het Nationaal Archief met het Stadsarchief Amsterdam", "nl", "comparative",
            ["nationaal archief", "stadsarchief amsterdam"], _SPARQL_QDRANT,
            ["Nationaal Archief", "Stadsarchief"],
        ),
        _example(
            "What are the differences between provincial and municipal archives?",
            "en", "comparative",
            ["provincial archives", "municipal archives"], _SPARQL_QDRANT,
            ["provincial", "municipal", "archive"],
        ),
        _example(
            "How do Dutch university libraries compare in size?", "en", "comparative",
            ["university libraries", "dutch"], _SPARQL_QDRANT,
            ["university", "library", "size"],
        ),
        # ------------------------------------------------------------------
        # EXPLORATION INTENT
        # ------------------------------------------------------------------
        _example(
            "Show me archives related to World War II", "en", "exploration",
            ["archives", "world war ii"], _QDRANT_SPARQL,
            ["archive", "war", "collection"],
        ),
        _example(
            "Find museums about maritime history", "en", "exploration",
            ["museums", "maritime history"], _QDRANT_SPARQL,
            ["museum", "maritime", "ship"],
        ),
        _example(
            "Zoek bibliotheken met middeleeuwse manuscripten", "nl", "exploration",
            ["bibliotheken", "middeleeuwse manuscripten"], _QDRANT_SPARQL,
            ["bibliotheek", "manuscript"],
        ),
        _example(
            "Discover heritage institutions with photo collections", "en", "exploration",
            ["heritage institutions", "photo collections"], _QDRANT_SPARQL,
            ["photo", "collection"],
        ),
        _example(
            "Browse archives with colonial history documents", "en", "exploration",
            ["archives", "colonial history"], _QDRANT_SPARQL,
            ["archive", "colonial", "document"],
        ),
        # ------------------------------------------------------------------
        # INSTITUTION TYPE SPECIFIC (GLAMORCUBESFIXPHDNT)
        # ------------------------------------------------------------------
        # Galleries (G)
        _example(
            "Find art galleries in Rotterdam", "en", "geographic",
            ["art galleries", "rotterdam"], _POSTGIS_QDRANT_SPARQL,
            ["Rotterdam", "gallery"],
        ),
        # Research Centers (R)
        _example(
            "What research centers focus on heritage conservation?", "en", "exploration",
            ["research centers", "heritage conservation"], _QDRANT_SPARQL,
            ["research", "conservation"],
        ),
        # Botanical/Zoo (B)
        _example(
            "List botanical gardens in the Netherlands", "en", "statistical",
            ["botanical gardens", "netherlands"], _SPARQL_QDRANT,
            ["botanical", "garden"],
        ),
        # Education (E)
        _example(
            "Which universities have heritage collections?", "en", "exploration",
            ["universities", "heritage collections"], _QDRANT_SPARQL,
            ["university", "collection"],
        ),
        # Societies (S)
        _example(
            "Find historical societies in Gelderland", "en", "geographic",
            ["historical societies", "gelderland"], _POSTGIS_QDRANT_SPARQL,
            ["Gelderland", "society"],
        ),
        # Holy Sites (H)
        _example(
            "Which churches have historical archives?", "en", "exploration",
            ["churches", "historical archives"], _QDRANT_SPARQL,
            ["church", "archive"],
        ),
        # Digital Platforms (D)
        _example(
            "What digital heritage portals exist in the Netherlands?", "en", "exploration",
            ["digital heritage portals", "netherlands"], _QDRANT_SPARQL,
            ["digital", "portal"],
        ),
    ]

    # ======================================================================
    # VALIDATION EXAMPLES (10 examples, 20% of data, held out)
    # ======================================================================
    valset = [
        _example(
            "Hoeveel archieven heeft Noord-Holland?", "nl", "statistical",
            ["noord-holland", "archieven"], _SPARQL_QDRANT,
            ["Noord-Holland", "archief", "aantal"],
        ),
        _example(
            "When was the Nationaal Archief founded?", "en", "temporal",
            ["nationaal archief"], _SPARQL_QDRANT,
            ["Nationaal Archief", "founded", "year"],
        ),
        _example(
            "Show galleries in Utrecht", "en", "geographic",
            ["galleries", "utrecht"], _POSTGIS_QDRANT_SPARQL,
            ["gallery", "Utrecht"],
        ),
        _example(
            "Compare archives in Amsterdam and Rotterdam", "en", "comparative",
            ["archives", "amsterdam", "rotterdam"], _SPARQL_QDRANT,
            ["archive", "Amsterdam", "Rotterdam"],
        ),
        _example(
            "Welke musea behoren tot het Museumkaart netwerk?", "nl", "relational",
            ["musea", "museumkaart"], _TYPEDB_SPARQL,
            ["museum", "Museumkaart"],
        ),
        _example(
            "Find libraries with special collections", "en", "exploration",
            ["libraries", "special collections"], _QDRANT_SPARQL,
            ["library", "special", "collection"],
        ),
        _example(
            "What is the address of Het Scheepvaartmuseum?", "en", "entity_lookup",
            ["scheepvaartmuseum"], _SPARQL_QDRANT,
            ["Scheepvaartmuseum", "Amsterdam", "address"],
        ),
        _example(
            "Toon erfgoedinstellingen bij Eindhoven", "nl", "geographic",
            ["eindhoven", "erfgoedinstellingen"], _POSTGIS_QDRANT_SPARQL,
            ["Eindhoven"],
        ),
        _example(
            "How many heritage institutions have ISIL codes?", "en", "statistical",
            ["heritage institutions", "isil"], _SPARQL_QDRANT,
            ["ISIL", "number", "institution"],
        ),
        _example(
            "Which museums relocated after 2010?", "en", "temporal",
            ["museums", "relocated", "2010"], _TYPEDB_SPARQL,
            ["museum", "relocat", "2010"],
        ),
    ]

    return trainset, valset
def get_intent_distribution() -> dict[str, int]:
    """Count examples per expected intent across training and validation data.

    Returns:
        Mapping of `expected_intent` label to the number of examples with
        that label, keyed in order of first appearance (training examples
        first, then validation examples).
    """
    trainset, valset = get_extended_training_data()
    # Counter keeps first-seen insertion order; dict(...) returns the plain
    # dict type promised by the signature.
    return dict(Counter(ex.expected_intent for ex in trainset + valset))
def get_language_distribution() -> dict[str, int]:
    """Count examples per language code across training and validation data.

    Returns:
        Mapping of `language` code to the number of examples in that
        language, keyed in order of first appearance (training examples
        first, then validation examples).
    """
    trainset, valset = get_extended_training_data()
    # Counter keeps first-seen insertion order; dict(...) returns the plain
    # dict type promised by the signature.
    return dict(Counter(ex.language for ex in trainset + valset))
if __name__ == "__main__":
    # Script entry point: summarise dataset sizes and label distributions.
    train_examples, val_examples = get_extended_training_data()
    n_train = len(train_examples)
    n_val = len(val_examples)

    print("GEPA Extended Training Data Statistics")
    print("=" * 50)
    print(f"Training examples: {n_train}")
    print(f"Validation examples: {n_val}")
    print(f"Total examples: {n_train + n_val}")

    # Each report is a heading plus the function producing its counts; the
    # function is called only after its heading has been printed.
    reports = (
        ("\nIntent distribution:", get_intent_distribution),
        ("\nLanguage distribution:", get_language_distribution),
    )
    for heading, compute_counts in reports:
        print(heading)
        for label, total in sorted(compute_counts().items()):
            print(f" {label}: {total}")