569 lines
23 KiB
Python
569 lines
23 KiB
Python
"""
Extended GEPA Training Examples for Heritage RAG

This module provides additional training and validation examples for
GEPA optimization, based on real heritage institution query patterns.

Usage:
    from gepa_training_extended import get_extended_training_data
    trainset, valset = get_extended_training_data()
"""
|
|
|
|
from dspy import Example
|
|
|
|
|
|
def get_extended_training_data() -> tuple[list[Example], list[Example]]:
    """Get extended training and validation data for GEPA optimization.

    The data set holds 40 training and 10 held-out validation examples
    (an 80/20 split) covering:

    - All 7 query intents (geographic, statistical, relational, temporal,
      entity_lookup, comparative, exploration), in both sets
    - Questions in Dutch (nl), English (en) and German (de)
    - Dedicated training examples for seven GLAMORCUBESFIXPHDNT institution
      types (G, R, B, E, S, H, D), next to the museum, archive and library
      questions used throughout

    Every example marks ``question`` and ``language`` as its inputs; the
    remaining fields (``expected_intent``, ``expected_entities``,
    ``expected_sources``, ``answer_contains``) are labels.

    Returns:
        Tuple of (trainset, valset) with Example objects. Fresh objects are
        built on every call.
    """

    def _example(
        question: str,
        language: str,
        intent: str,
        *,
        entities: list[str],
        sources: list[str],
        contains: list[str],
    ) -> Example:
        """Build one labelled example whose inputs are question and language."""
        return Example(
            question=question,
            language=language,
            expected_intent=intent,
            expected_entities=entities,
            expected_sources=sources,
            answer_contains=contains,
        ).with_inputs("question", "language")

    # ==========================================================================
    # TRAINING EXAMPLES (40 examples, 80% of data)
    # ==========================================================================

    trainset = [
        # ---------------------------------------------------------------------
        # GEOGRAPHIC INTENT
        # ---------------------------------------------------------------------
        _example(
            "Hoeveel musea zijn er in Amsterdam?",
            "nl",
            "geographic",
            entities=["amsterdam", "musea"],
            sources=["postgis", "qdrant", "sparql"],
            contains=["Amsterdam", "museum"],
        ),
        _example(
            "Show me archives within 10km of Utrecht Centraal",
            "en",
            "geographic",
            entities=["utrecht centraal", "archives"],
            sources=["postgis", "qdrant", "sparql"],
            contains=["Utrecht", "archive", "km"],
        ),
        _example(
            "Welke bibliotheken zijn er in Limburg?",
            "nl",
            "geographic",
            entities=["limburg", "bibliotheken"],
            sources=["postgis", "qdrant", "sparql"],
            contains=["Limburg", "bibliotheek"],
        ),
        _example(
            "Find heritage institutions in the Randstad region",
            "en",
            "geographic",
            entities=["randstad", "heritage institutions"],
            sources=["postgis", "qdrant", "sparql"],
            contains=["Randstad", "Amsterdam", "Rotterdam"],
        ),
        _example(
            "Zeig mir Museen in der Nähe von Maastricht",
            "de",
            "geographic",
            entities=["maastricht", "museen"],
            sources=["postgis", "qdrant", "sparql"],
            contains=["Maastricht", "Museum"],
        ),
        # ---------------------------------------------------------------------
        # STATISTICAL INTENT
        # ---------------------------------------------------------------------
        _example(
            "How many libraries are there in the Netherlands?",
            "en",
            "statistical",
            entities=["libraries", "netherlands"],
            sources=["sparql", "qdrant"],
            contains=["librar", "Netherlands", "number"],
        ),
        _example(
            "Hoeveel erfgoedinstellingen heeft Noord-Holland?",
            "nl",
            "statistical",
            entities=["noord-holland", "erfgoedinstellingen"],
            sources=["sparql", "qdrant"],
            contains=["Noord-Holland", "aantal"],
        ),
        _example(
            "What percentage of Dutch museums have a Wikidata entry?",
            "en",
            "statistical",
            entities=["dutch museums", "wikidata"],
            sources=["sparql", "qdrant"],
            contains=["percent", "Wikidata", "museum"],
        ),
        _example(
            "Count archives by province in the Netherlands",
            "en",
            "statistical",
            entities=["archives", "province", "netherlands"],
            sources=["sparql", "qdrant"],
            contains=["archive", "province"],
        ),
        _example(
            "Wat is de verdeling van erfgoedinstellingen per type?",
            "nl",
            "statistical",
            entities=["erfgoedinstellingen", "type", "verdeling"],
            sources=["sparql", "qdrant"],
            contains=["verdeling", "type"],
        ),
        # ---------------------------------------------------------------------
        # RELATIONAL INTENT
        # ---------------------------------------------------------------------
        _example(
            "What institutions are part of the Erfgoed Leiden network?",
            "en",
            "relational",
            entities=["erfgoed leiden"],
            sources=["typedb", "sparql"],
            contains=["Erfgoed", "Leiden", "member"],
        ),
        _example(
            "Welke archieven zijn onderdeel van het Nationaal Archief?",
            "nl",
            "relational",
            entities=["nationaal archief"],
            sources=["typedb", "sparql"],
            contains=["Nationaal Archief", "onderdeel"],
        ),
        _example(
            "Show relationships between museums in Amsterdam",
            "en",
            "relational",
            entities=["museums", "amsterdam"],
            sources=["typedb", "sparql"],
            contains=["Amsterdam", "museum", "relation"],
        ),
        _example(
            "Which archives merged to form Noord-Hollands Archief?",
            "en",
            "relational",
            entities=["noord-hollands archief"],
            sources=["typedb", "sparql"],
            contains=["Noord-Hollands Archief", "merge"],
        ),
        # ---------------------------------------------------------------------
        # TEMPORAL INTENT
        # ---------------------------------------------------------------------
        _example(
            "When was the Rijksmuseum founded?",
            "en",
            "temporal",
            entities=["rijksmuseum"],
            sources=["sparql", "qdrant"],
            contains=["Rijksmuseum", "founded", "1800"],
        ),
        _example(
            "Welke bibliotheken zijn gefuseerd sinds 2000?",
            "nl",
            "temporal",
            entities=["bibliotheken", "fusie", "2000"],
            sources=["typedb", "sparql"],
            contains=["bibliotheek", "fusie"],
        ),
        _example(
            "Show the history of organizational changes for Stadsarchief Amsterdam",
            "en",
            "temporal",
            entities=["stadsarchief amsterdam"],
            sources=["typedb", "sparql"],
            contains=["Stadsarchief", "Amsterdam", "change"],
        ),
        _example(
            "Which museums closed in the last 10 years?",
            "en",
            "temporal",
            entities=["museums", "closed"],
            sources=["typedb", "sparql"],
            contains=["museum", "closed", "year"],
        ),
        _example(
            "Wanneer is het Nationaal Archief opgericht?",
            "nl",
            "temporal",
            entities=["nationaal archief"],
            sources=["sparql", "qdrant"],
            contains=["Nationaal Archief", "opgericht"],
        ),
        # ---------------------------------------------------------------------
        # ENTITY LOOKUP INTENT
        # ---------------------------------------------------------------------
        _example(
            "Where is the Rijksmuseum located?",
            "en",
            "entity_lookup",
            entities=["rijksmuseum"],
            sources=["sparql", "qdrant"],
            contains=["Rijksmuseum", "Amsterdam", "Museumplein"],
        ),
        _example(
            "What is the ISIL code for Koninklijke Bibliotheek?",
            "en",
            "entity_lookup",
            entities=["koninklijke bibliotheek", "isil"],
            sources=["sparql", "qdrant"],
            contains=["Koninklijke Bibliotheek", "ISIL", "NL-"],
        ),
        _example(
            "Geef informatie over het Van Gogh Museum",
            "nl",
            "entity_lookup",
            entities=["van gogh museum"],
            sources=["sparql", "qdrant"],
            contains=["Van Gogh", "Museum", "Amsterdam"],
        ),
        _example(
            "What type of institution is Naturalis?",
            "en",
            "entity_lookup",
            entities=["naturalis"],
            sources=["sparql", "qdrant"],
            contains=["Naturalis", "museum", "natural history"],
        ),
        _example(
            "Show me details about Regionaal Archief Tilburg",
            "en",
            "entity_lookup",
            entities=["regionaal archief tilburg"],
            sources=["sparql", "qdrant"],
            contains=["Tilburg", "archief"],
        ),
        # ---------------------------------------------------------------------
        # COMPARATIVE INTENT
        # ---------------------------------------------------------------------
        _example(
            "Compare the collections of Rijksmuseum and Van Gogh Museum",
            "en",
            "comparative",
            entities=["rijksmuseum", "van gogh museum"],
            sources=["sparql", "qdrant"],
            contains=["Rijksmuseum", "Van Gogh", "collection"],
        ),
        _example(
            "Vergelijk het Nationaal Archief met het Stadsarchief Amsterdam",
            "nl",
            "comparative",
            entities=["nationaal archief", "stadsarchief amsterdam"],
            sources=["sparql", "qdrant"],
            contains=["Nationaal Archief", "Stadsarchief"],
        ),
        _example(
            "What are the differences between provincial and municipal archives?",
            "en",
            "comparative",
            entities=["provincial archives", "municipal archives"],
            sources=["sparql", "qdrant"],
            contains=["provincial", "municipal", "archive"],
        ),
        _example(
            "How do Dutch university libraries compare in size?",
            "en",
            "comparative",
            entities=["university libraries", "dutch"],
            sources=["sparql", "qdrant"],
            contains=["university", "library", "size"],
        ),
        # ---------------------------------------------------------------------
        # EXPLORATION INTENT
        # ---------------------------------------------------------------------
        _example(
            "Show me archives related to World War II",
            "en",
            "exploration",
            entities=["archives", "world war ii"],
            sources=["qdrant", "sparql"],
            contains=["archive", "war", "collection"],
        ),
        _example(
            "Find museums about maritime history",
            "en",
            "exploration",
            entities=["museums", "maritime history"],
            sources=["qdrant", "sparql"],
            contains=["museum", "maritime", "ship"],
        ),
        _example(
            "Zoek bibliotheken met middeleeuwse manuscripten",
            "nl",
            "exploration",
            entities=["bibliotheken", "middeleeuwse manuscripten"],
            sources=["qdrant", "sparql"],
            contains=["bibliotheek", "manuscript"],
        ),
        _example(
            "Discover heritage institutions with photo collections",
            "en",
            "exploration",
            entities=["heritage institutions", "photo collections"],
            sources=["qdrant", "sparql"],
            contains=["photo", "collection"],
        ),
        _example(
            "Browse archives with colonial history documents",
            "en",
            "exploration",
            entities=["archives", "colonial history"],
            sources=["qdrant", "sparql"],
            contains=["archive", "colonial", "document"],
        ),
        # ---------------------------------------------------------------------
        # INSTITUTION TYPE SPECIFIC (GLAMORCUBESFIXPHDNT)
        # ---------------------------------------------------------------------
        # Galleries (G)
        _example(
            "Find art galleries in Rotterdam",
            "en",
            "geographic",
            entities=["art galleries", "rotterdam"],
            sources=["postgis", "qdrant", "sparql"],
            contains=["Rotterdam", "gallery"],
        ),
        # Research Centers (R)
        _example(
            "What research centers focus on heritage conservation?",
            "en",
            "exploration",
            entities=["research centers", "heritage conservation"],
            sources=["qdrant", "sparql"],
            contains=["research", "conservation"],
        ),
        # Botanical/Zoo (B)
        _example(
            "List botanical gardens in the Netherlands",
            "en",
            "statistical",
            entities=["botanical gardens", "netherlands"],
            sources=["sparql", "qdrant"],
            contains=["botanical", "garden"],
        ),
        # Education (E)
        _example(
            "Which universities have heritage collections?",
            "en",
            "exploration",
            entities=["universities", "heritage collections"],
            sources=["qdrant", "sparql"],
            contains=["university", "collection"],
        ),
        # Societies (S)
        _example(
            "Find historical societies in Gelderland",
            "en",
            "geographic",
            entities=["historical societies", "gelderland"],
            sources=["postgis", "qdrant", "sparql"],
            contains=["Gelderland", "society"],
        ),
        # Holy Sites (H)
        _example(
            "Which churches have historical archives?",
            "en",
            "exploration",
            entities=["churches", "historical archives"],
            sources=["qdrant", "sparql"],
            contains=["church", "archive"],
        ),
        # Digital Platforms (D)
        _example(
            "What digital heritage portals exist in the Netherlands?",
            "en",
            "exploration",
            entities=["digital heritage portals", "netherlands"],
            sources=["qdrant", "sparql"],
            contains=["digital", "portal"],
        ),
    ]

    # ==========================================================================
    # VALIDATION EXAMPLES (10 examples, 20% of data, held out)
    # ==========================================================================

    valset = [
        _example(
            "Hoeveel archieven heeft Noord-Holland?",
            "nl",
            "statistical",
            entities=["noord-holland", "archieven"],
            sources=["sparql", "qdrant"],
            contains=["Noord-Holland", "archief", "aantal"],
        ),
        _example(
            "When was the Nationaal Archief founded?",
            "en",
            "temporal",
            entities=["nationaal archief"],
            sources=["sparql", "qdrant"],
            contains=["Nationaal Archief", "founded", "year"],
        ),
        _example(
            "Show galleries in Utrecht",
            "en",
            "geographic",
            entities=["galleries", "utrecht"],
            sources=["postgis", "qdrant", "sparql"],
            contains=["gallery", "Utrecht"],
        ),
        _example(
            "Compare archives in Amsterdam and Rotterdam",
            "en",
            "comparative",
            entities=["archives", "amsterdam", "rotterdam"],
            sources=["sparql", "qdrant"],
            contains=["archive", "Amsterdam", "Rotterdam"],
        ),
        _example(
            "Welke musea behoren tot het Museumkaart netwerk?",
            "nl",
            "relational",
            entities=["musea", "museumkaart"],
            sources=["typedb", "sparql"],
            contains=["museum", "Museumkaart"],
        ),
        _example(
            "Find libraries with special collections",
            "en",
            "exploration",
            entities=["libraries", "special collections"],
            sources=["qdrant", "sparql"],
            contains=["library", "special", "collection"],
        ),
        _example(
            "What is the address of Het Scheepvaartmuseum?",
            "en",
            "entity_lookup",
            entities=["scheepvaartmuseum"],
            sources=["sparql", "qdrant"],
            contains=["Scheepvaartmuseum", "Amsterdam", "address"],
        ),
        _example(
            "Toon erfgoedinstellingen bij Eindhoven",
            "nl",
            "geographic",
            entities=["eindhoven", "erfgoedinstellingen"],
            sources=["postgis", "qdrant", "sparql"],
            contains=["Eindhoven"],
        ),
        _example(
            "How many heritage institutions have ISIL codes?",
            "en",
            "statistical",
            entities=["heritage institutions", "isil"],
            sources=["sparql", "qdrant"],
            contains=["ISIL", "number", "institution"],
        ),
        _example(
            "Which museums relocated after 2010?",
            "en",
            "temporal",
            entities=["museums", "relocated", "2010"],
            sources=["typedb", "sparql"],
            contains=["museum", "relocat", "2010"],
        ),
    ]

    return trainset, valset
|
|
|
|
|
|
def get_intent_distribution(examples: "list[Example] | None" = None) -> dict[str, int]:
    """Get distribution of intents in training data.

    Args:
        examples: Optional examples to tally; each must expose an
            ``expected_intent`` attribute. When omitted (``None``), the
            combined training and validation sets returned by
            ``get_extended_training_data()`` are used. An empty list is
            honoured as-is and yields an empty distribution.

    Returns:
        Mapping of intent label to number of examples, with keys in order of
        first appearance.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    # Compare against None explicitly so an empty list is not replaced by the
    # full data set.
    if examples is None:
        trainset, valset = get_extended_training_data()
        examples = trainset + valset

    # Counter preserves first-seen order; convert to a plain dict to match the
    # declared return type.
    return dict(Counter(ex.expected_intent for ex in examples))
|
|
|
|
|
|
def get_language_distribution(examples: "list[Example] | None" = None) -> dict[str, int]:
    """Get distribution of languages in training data.

    Args:
        examples: Optional examples to tally; each must expose a ``language``
            attribute. When omitted (``None``), the combined training and
            validation sets returned by ``get_extended_training_data()`` are
            used. An empty list is honoured as-is and yields an empty
            distribution.

    Returns:
        Mapping of language code to number of examples, with keys in order of
        first appearance.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    # Compare against None explicitly so an empty list is not replaced by the
    # full data set.
    if examples is None:
        trainset, valset = get_extended_training_data()
        examples = trainset + valset

    # Counter preserves first-seen order; convert to a plain dict to match the
    # declared return type.
    return dict(Counter(ex.language for ex in examples))
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: report the size of each split, then how the
    # examples spread over intents and languages.
    train_examples, val_examples = get_extended_training_data()
    n_train = len(train_examples)
    n_val = len(val_examples)

    print("GEPA Extended Training Data Statistics")
    print("=" * 50)
    print(f"Training examples: {n_train}")
    print(f"Validation examples: {n_val}")
    print(f"Total examples: {n_train + n_val}")

    # Each report is a heading plus the function that produces its counts.
    # The function is called only when its section is printed, so the call
    # order matches the headings.
    reports = (
        ("\nIntent distribution:", get_intent_distribution),
        ("\nLanguage distribution:", get_language_distribution),
    )
    for heading, compute_counts in reports:
        print(heading)
        for label, count in sorted(compute_counts().items()):
            print(f" {label}: {count}")
|