"""Extended GEPA training examples for Heritage RAG.

This module provides additional training and validation examples for GEPA
optimization, based on real heritage institution query patterns.

Usage:
    from gepa_training_extended import get_extended_training_data
    trainset, valset = get_extended_training_data()
"""

from collections import Counter
from collections.abc import Iterable

from dspy import Example

# Fields that every example marks as inputs via ``Example.with_inputs``.
_INPUT_FIELDS = ("question", "language")


def _example(
    *,
    question: str,
    language: str,
    expected_intent: str,
    expected_entities: list[str],
    expected_sources: list[str],
    answer_contains: list[str],
) -> Example:
    """Build one example with ``question`` and ``language`` marked as inputs.

    Keyword-only so that every call site names each field explicitly, and so
    that no example can be added without its ``with_inputs`` call.
    """
    return Example(
        question=question,
        language=language,
        expected_intent=expected_intent,
        expected_entities=expected_entities,
        expected_sources=expected_sources,
        answer_contains=answer_contains,
    ).with_inputs(*_INPUT_FIELDS)


def _count_attribute(examples: Iterable[Example], attribute: str) -> dict[str, int]:
    """Count how often each value of ``attribute`` occurs across ``examples``.

    Returns a plain dict (not a ``Counter``) so that looking up a missing key
    still raises ``KeyError``; keys appear in first-seen order.
    """
    return dict(Counter(getattr(ex, attribute) for ex in examples))


def get_extended_training_data() -> tuple[list[Example], list[Example]]:
    """Get extended training and validation data for GEPA optimization.

    The examples cover:
    - 7 query intents (geographic, statistical, relational, temporal,
      entity_lookup, comparative, exploration)
    - Questions in Dutch (nl), English (en) and German (de); there are
      currently no French (fr) examples
    - A selection of GLAMORCUBESFIXPHDNT institution types (museums,
      libraries, archives, galleries, research centers, botanical gardens,
      education, societies, holy sites, digital platforms)

    Each example has ``question`` and ``language`` marked as inputs; the
    remaining fields (``expected_intent``, ``expected_entities``,
    ``expected_sources``, ``answer_contains``) carry the expected values.

    Returns:
        Tuple of (trainset, valset) with Example objects. Fresh lists are
        built on every call.
    """
    # ==========================================================================
    # TRAINING EXAMPLES (currently 40 examples, ~80% of data)
    # ==========================================================================
    trainset = [
        # ---------------------------------------------------------------------
        # GEOGRAPHIC INTENT
        # ---------------------------------------------------------------------
        _example(
            question="Hoeveel musea zijn er in Amsterdam?",
            language="nl",
            expected_intent="geographic",
            expected_entities=["amsterdam", "musea"],
            expected_sources=["postgis", "qdrant", "sparql"],
            answer_contains=["Amsterdam", "museum"],
        ),
        _example(
            question="Show me archives within 10km of Utrecht Centraal",
            language="en",
            expected_intent="geographic",
            expected_entities=["utrecht centraal", "archives"],
            expected_sources=["postgis", "qdrant", "sparql"],
            answer_contains=["Utrecht", "archive", "km"],
        ),
        _example(
            question="Welke bibliotheken zijn er in Limburg?",
            language="nl",
            expected_intent="geographic",
            expected_entities=["limburg", "bibliotheken"],
            expected_sources=["postgis", "qdrant", "sparql"],
            answer_contains=["Limburg", "bibliotheek"],
        ),
        _example(
            question="Find heritage institutions in the Randstad region",
            language="en",
            expected_intent="geographic",
            expected_entities=["randstad", "heritage institutions"],
            expected_sources=["postgis", "qdrant", "sparql"],
            answer_contains=["Randstad", "Amsterdam", "Rotterdam"],
        ),
        _example(
            question="Zeig mir Museen in der Nähe von Maastricht",
            language="de",
            expected_intent="geographic",
            expected_entities=["maastricht", "museen"],
            expected_sources=["postgis", "qdrant", "sparql"],
            answer_contains=["Maastricht", "Museum"],
        ),
        # ---------------------------------------------------------------------
        # STATISTICAL INTENT
        # ---------------------------------------------------------------------
        _example(
            question="How many libraries are there in the Netherlands?",
            language="en",
            expected_intent="statistical",
            expected_entities=["libraries", "netherlands"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["librar", "Netherlands", "number"],
        ),
        _example(
            question="Hoeveel erfgoedinstellingen heeft Noord-Holland?",
            language="nl",
            expected_intent="statistical",
            expected_entities=["noord-holland", "erfgoedinstellingen"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Noord-Holland", "aantal"],
        ),
        _example(
            question="What percentage of Dutch museums have a Wikidata entry?",
            language="en",
            expected_intent="statistical",
            expected_entities=["dutch museums", "wikidata"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["percent", "Wikidata", "museum"],
        ),
        _example(
            question="Count archives by province in the Netherlands",
            language="en",
            expected_intent="statistical",
            expected_entities=["archives", "province", "netherlands"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["archive", "province"],
        ),
        _example(
            question="Wat is de verdeling van erfgoedinstellingen per type?",
            language="nl",
            expected_intent="statistical",
            expected_entities=["erfgoedinstellingen", "type", "verdeling"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["verdeling", "type"],
        ),
        # ---------------------------------------------------------------------
        # RELATIONAL INTENT
        # ---------------------------------------------------------------------
        _example(
            question="What institutions are part of the Erfgoed Leiden network?",
            language="en",
            expected_intent="relational",
            expected_entities=["erfgoed leiden"],
            expected_sources=["typedb", "sparql"],
            answer_contains=["Erfgoed", "Leiden", "member"],
        ),
        _example(
            question="Welke archieven zijn onderdeel van het Nationaal Archief?",
            language="nl",
            expected_intent="relational",
            expected_entities=["nationaal archief"],
            expected_sources=["typedb", "sparql"],
            answer_contains=["Nationaal Archief", "onderdeel"],
        ),
        _example(
            question="Show relationships between museums in Amsterdam",
            language="en",
            expected_intent="relational",
            expected_entities=["museums", "amsterdam"],
            expected_sources=["typedb", "sparql"],
            answer_contains=["Amsterdam", "museum", "relation"],
        ),
        _example(
            question="Which archives merged to form Noord-Hollands Archief?",
            language="en",
            expected_intent="relational",
            expected_entities=["noord-hollands archief"],
            expected_sources=["typedb", "sparql"],
            answer_contains=["Noord-Hollands Archief", "merge"],
        ),
        # ---------------------------------------------------------------------
        # TEMPORAL INTENT
        # ---------------------------------------------------------------------
        _example(
            question="When was the Rijksmuseum founded?",
            language="en",
            expected_intent="temporal",
            expected_entities=["rijksmuseum"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Rijksmuseum", "founded", "1800"],
        ),
        _example(
            question="Welke bibliotheken zijn gefuseerd sinds 2000?",
            language="nl",
            expected_intent="temporal",
            expected_entities=["bibliotheken", "fusie", "2000"],
            expected_sources=["typedb", "sparql"],
            answer_contains=["bibliotheek", "fusie"],
        ),
        _example(
            question="Show the history of organizational changes for Stadsarchief Amsterdam",
            language="en",
            expected_intent="temporal",
            expected_entities=["stadsarchief amsterdam"],
            expected_sources=["typedb", "sparql"],
            answer_contains=["Stadsarchief", "Amsterdam", "change"],
        ),
        _example(
            question="Which museums closed in the last 10 years?",
            language="en",
            expected_intent="temporal",
            expected_entities=["museums", "closed"],
            expected_sources=["typedb", "sparql"],
            answer_contains=["museum", "closed", "year"],
        ),
        _example(
            question="Wanneer is het Nationaal Archief opgericht?",
            language="nl",
            expected_intent="temporal",
            expected_entities=["nationaal archief"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Nationaal Archief", "opgericht"],
        ),
        # ---------------------------------------------------------------------
        # ENTITY LOOKUP INTENT
        # ---------------------------------------------------------------------
        _example(
            question="Where is the Rijksmuseum located?",
            language="en",
            expected_intent="entity_lookup",
            expected_entities=["rijksmuseum"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Rijksmuseum", "Amsterdam", "Museumplein"],
        ),
        _example(
            question="What is the ISIL code for Koninklijke Bibliotheek?",
            language="en",
            expected_intent="entity_lookup",
            expected_entities=["koninklijke bibliotheek", "isil"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Koninklijke Bibliotheek", "ISIL", "NL-"],
        ),
        _example(
            question="Geef informatie over het Van Gogh Museum",
            language="nl",
            expected_intent="entity_lookup",
            expected_entities=["van gogh museum"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Van Gogh", "Museum", "Amsterdam"],
        ),
        _example(
            question="What type of institution is Naturalis?",
            language="en",
            expected_intent="entity_lookup",
            expected_entities=["naturalis"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Naturalis", "museum", "natural history"],
        ),
        _example(
            question="Show me details about Regionaal Archief Tilburg",
            language="en",
            expected_intent="entity_lookup",
            expected_entities=["regionaal archief tilburg"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Tilburg", "archief"],
        ),
        # ---------------------------------------------------------------------
        # COMPARATIVE INTENT
        # ---------------------------------------------------------------------
        _example(
            question="Compare the collections of Rijksmuseum and Van Gogh Museum",
            language="en",
            expected_intent="comparative",
            expected_entities=["rijksmuseum", "van gogh museum"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Rijksmuseum", "Van Gogh", "collection"],
        ),
        _example(
            question="Vergelijk het Nationaal Archief met het Stadsarchief Amsterdam",
            language="nl",
            expected_intent="comparative",
            expected_entities=["nationaal archief", "stadsarchief amsterdam"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Nationaal Archief", "Stadsarchief"],
        ),
        _example(
            question="What are the differences between provincial and municipal archives?",
            language="en",
            expected_intent="comparative",
            expected_entities=["provincial archives", "municipal archives"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["provincial", "municipal", "archive"],
        ),
        _example(
            question="How do Dutch university libraries compare in size?",
            language="en",
            expected_intent="comparative",
            expected_entities=["university libraries", "dutch"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["university", "library", "size"],
        ),
        # ---------------------------------------------------------------------
        # EXPLORATION INTENT
        # ---------------------------------------------------------------------
        _example(
            question="Show me archives related to World War II",
            language="en",
            expected_intent="exploration",
            expected_entities=["archives", "world war ii"],
            expected_sources=["qdrant", "sparql"],
            answer_contains=["archive", "war", "collection"],
        ),
        _example(
            question="Find museums about maritime history",
            language="en",
            expected_intent="exploration",
            expected_entities=["museums", "maritime history"],
            expected_sources=["qdrant", "sparql"],
            answer_contains=["museum", "maritime", "ship"],
        ),
        _example(
            question="Zoek bibliotheken met middeleeuwse manuscripten",
            language="nl",
            expected_intent="exploration",
            expected_entities=["bibliotheken", "middeleeuwse manuscripten"],
            expected_sources=["qdrant", "sparql"],
            answer_contains=["bibliotheek", "manuscript"],
        ),
        _example(
            question="Discover heritage institutions with photo collections",
            language="en",
            expected_intent="exploration",
            expected_entities=["heritage institutions", "photo collections"],
            expected_sources=["qdrant", "sparql"],
            answer_contains=["photo", "collection"],
        ),
        _example(
            question="Browse archives with colonial history documents",
            language="en",
            expected_intent="exploration",
            expected_entities=["archives", "colonial history"],
            expected_sources=["qdrant", "sparql"],
            answer_contains=["archive", "colonial", "document"],
        ),
        # ---------------------------------------------------------------------
        # INSTITUTION TYPE SPECIFIC (GLAMORCUBESFIXPHDNT)
        # ---------------------------------------------------------------------
        # Galleries (G)
        _example(
            question="Find art galleries in Rotterdam",
            language="en",
            expected_intent="geographic",
            expected_entities=["art galleries", "rotterdam"],
            expected_sources=["postgis", "qdrant", "sparql"],
            answer_contains=["Rotterdam", "gallery"],
        ),
        # Research Centers (R)
        _example(
            question="What research centers focus on heritage conservation?",
            language="en",
            expected_intent="exploration",
            expected_entities=["research centers", "heritage conservation"],
            expected_sources=["qdrant", "sparql"],
            answer_contains=["research", "conservation"],
        ),
        # Botanical/Zoo (B)
        _example(
            question="List botanical gardens in the Netherlands",
            language="en",
            expected_intent="statistical",
            expected_entities=["botanical gardens", "netherlands"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["botanical", "garden"],
        ),
        # Education (E)
        _example(
            question="Which universities have heritage collections?",
            language="en",
            expected_intent="exploration",
            expected_entities=["universities", "heritage collections"],
            expected_sources=["qdrant", "sparql"],
            answer_contains=["university", "collection"],
        ),
        # Societies (S)
        _example(
            question="Find historical societies in Gelderland",
            language="en",
            expected_intent="geographic",
            expected_entities=["historical societies", "gelderland"],
            expected_sources=["postgis", "qdrant", "sparql"],
            answer_contains=["Gelderland", "society"],
        ),
        # Holy Sites (H)
        _example(
            question="Which churches have historical archives?",
            language="en",
            expected_intent="exploration",
            expected_entities=["churches", "historical archives"],
            expected_sources=["qdrant", "sparql"],
            answer_contains=["church", "archive"],
        ),
        # Digital Platforms (D)
        _example(
            question="What digital heritage portals exist in the Netherlands?",
            language="en",
            expected_intent="exploration",
            expected_entities=["digital heritage portals", "netherlands"],
            expected_sources=["qdrant", "sparql"],
            answer_contains=["digital", "portal"],
        ),
    ]

    # ==========================================================================
    # VALIDATION EXAMPLES (currently 10 examples, ~20% of data, held out)
    # ==========================================================================
    valset = [
        _example(
            question="Hoeveel archieven heeft Noord-Holland?",
            language="nl",
            expected_intent="statistical",
            expected_entities=["noord-holland", "archieven"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Noord-Holland", "archief", "aantal"],
        ),
        _example(
            question="When was the Nationaal Archief founded?",
            language="en",
            expected_intent="temporal",
            expected_entities=["nationaal archief"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Nationaal Archief", "founded", "year"],
        ),
        _example(
            question="Show galleries in Utrecht",
            language="en",
            expected_intent="geographic",
            expected_entities=["galleries", "utrecht"],
            expected_sources=["postgis", "qdrant", "sparql"],
            answer_contains=["gallery", "Utrecht"],
        ),
        _example(
            question="Compare archives in Amsterdam and Rotterdam",
            language="en",
            expected_intent="comparative",
            expected_entities=["archives", "amsterdam", "rotterdam"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["archive", "Amsterdam", "Rotterdam"],
        ),
        _example(
            question="Welke musea behoren tot het Museumkaart netwerk?",
            language="nl",
            expected_intent="relational",
            expected_entities=["musea", "museumkaart"],
            expected_sources=["typedb", "sparql"],
            answer_contains=["museum", "Museumkaart"],
        ),
        _example(
            question="Find libraries with special collections",
            language="en",
            expected_intent="exploration",
            expected_entities=["libraries", "special collections"],
            expected_sources=["qdrant", "sparql"],
            answer_contains=["library", "special", "collection"],
        ),
        _example(
            question="What is the address of Het Scheepvaartmuseum?",
            language="en",
            expected_intent="entity_lookup",
            expected_entities=["scheepvaartmuseum"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["Scheepvaartmuseum", "Amsterdam", "address"],
        ),
        _example(
            question="Toon erfgoedinstellingen bij Eindhoven",
            language="nl",
            expected_intent="geographic",
            expected_entities=["eindhoven", "erfgoedinstellingen"],
            expected_sources=["postgis", "qdrant", "sparql"],
            answer_contains=["Eindhoven"],
        ),
        _example(
            question="How many heritage institutions have ISIL codes?",
            language="en",
            expected_intent="statistical",
            expected_entities=["heritage institutions", "isil"],
            expected_sources=["sparql", "qdrant"],
            answer_contains=["ISIL", "number", "institution"],
        ),
        _example(
            question="Which museums relocated after 2010?",
            language="en",
            expected_intent="temporal",
            expected_entities=["museums", "relocated", "2010"],
            expected_sources=["typedb", "sparql"],
            answer_contains=["museum", "relocat", "2010"],
        ),
    ]

    return trainset, valset


def get_intent_distribution() -> dict[str, int]:
    """Count examples per ``expected_intent`` across training AND validation sets."""
    trainset, valset = get_extended_training_data()
    return _count_attribute(trainset + valset, "expected_intent")


def get_language_distribution() -> dict[str, int]:
    """Count examples per ``language`` across training AND validation sets."""
    trainset, valset = get_extended_training_data()
    return _count_attribute(trainset + valset, "language")


def main() -> None:
    """Print dataset sizes and the intent/language distributions to stdout."""
    trainset, valset = get_extended_training_data()
    # Reuse the sets built above instead of rebuilding them per distribution.
    all_examples = trainset + valset

    print("GEPA Extended Training Data Statistics")
    print("=" * 50)
    print(f"Training examples: {len(trainset)}")
    print(f"Validation examples: {len(valset)}")
    print(f"Total examples: {len(trainset) + len(valset)}")

    print("\nIntent distribution:")
    for intent, count in sorted(_count_attribute(all_examples, "expected_intent").items()):
        print(f" {intent}: {count}")

    print("\nLanguage distribution:")
    for lang, count in sorted(_count_attribute(all_examples, "language").items()):
        print(f" {lang}: {count}")


if __name__ == "__main__":
    main()