diff --git a/docs/plan/external_design_patterns/01_graphrag_design_patterns.md b/docs/plan/external_design_patterns/01_graphrag_design_patterns.md index 84521381ff..4ba74b2ab9 100644 --- a/docs/plan/external_design_patterns/01_graphrag_design_patterns.md +++ b/docs/plan/external_design_patterns/01_graphrag_design_patterns.md @@ -1678,8 +1678,8 @@ class SemanticDecisionRouter: # Institution + aggregation (statistical) { "condition": lambda s: s.entity_type == "institution" and s.requires_aggregation, - "primary_backend": "ducklake", - "secondary_backend": "sparql", + "primary_backend": "sparql", # SPARQL COUNT/SUM aggregations + "secondary_backend": "qdrant", }, # Default institution query { @@ -1730,9 +1730,9 @@ Query → ConversationContextResolver → FykeFilter → SemanticSignalExtractor ↓ ┌─────────────────┼─────────────────┐ ↓ ↓ ↓ - TemplateClassifier PersonRetriever DuckLakeQuery + TemplateClassifier PersonRetriever SPARQLAggregation ↓ ↓ ↓ - SPARQL Qdrant SQL + SPARQL Qdrant SPARQL ``` **Implementation in dspy_heritage_rag.py**: @@ -2032,7 +2032,7 @@ class EmbeddingIntentClassifier: 1. **Immediate**: Add `entity_type` routing to distinguish person vs institution queries 2. **Short-term**: Implement embedding-based intent classification as pre-filter -3. **Medium-term**: Add DuckLake routing for statistical queries +3. 
**Medium-term**: Add SPARQL aggregation templates for statistical queries (COUNT, SUM, AVG) --- diff --git a/docs/plan/external_design_patterns/04_temporal_semantic_hypergraph.md b/docs/plan/external_design_patterns/04_temporal_semantic_hypergraph.md index 325b36b194..a024bde8de 100644 --- a/docs/plan/external_design_patterns/04_temporal_semantic_hypergraph.md +++ b/docs/plan/external_design_patterns/04_temporal_semantic_hypergraph.md @@ -25,7 +25,8 @@ This guide provides **production-ready code patterns** for implementing the adva - **Oxigraph** → RDF/SPARQL storage and queries - **Qdrant** → Vector embeddings and semantic search - **DSPy** → LLM orchestration and optimization -- **DuckLake** → SQL analytics + +**Note**: DuckLake is available for offline analytics but NOT used in real-time RAG retrieval. **No new frameworks required.** @@ -768,11 +769,13 @@ class SemanticDecisionRouter: return config - # Statistical queries → DuckLake + # Statistical queries → SPARQL aggregations (COUNT, SUM, AVG, GROUP BY) + # NOTE: DuckLake is for offline analytics only, not real-time RAG retrieval if signals.requires_aggregation: return RouteConfig( - primary_backend="ducklake", - secondary_backend="sparql", + primary_backend="sparql", + secondary_backend="qdrant", + qdrant_collection="heritage_custodians", ) # Temporal queries → Temporal SPARQL templates @@ -1370,7 +1373,7 @@ class TestSemanticRouting: assert signals.requires_aggregation == True route = router.decision_router.route(signals) - assert route.primary_backend == "ducklake" + assert route.primary_backend == "sparql" # SPARQL aggregations def test_temporal_query_routing(self, router): """Test that temporal queries use temporal templates.""" diff --git a/docs/plan/prompt-query_template_mapping/SOTA_analysis.md b/docs/plan/prompt-query_template_mapping/SOTA_analysis.md new file mode 100644 index 0000000000..bbe0761f10 --- /dev/null +++ b/docs/plan/prompt-query_template_mapping/SOTA_analysis.md @@ -0,0 +1,516 @@ +# SOTA 
Analysis: Template-Based SPARQL Generation + +**Date**: 2025-01-07 +**Status**: Active Research +**Author**: OpenCode + +## Executive Summary + +Based on comprehensive research of 2024-2025 academic papers and industry practices, this document compares our current implementation against state-of-the-art (SOTA) approaches and recommends improvements. + +**Key Finding**: Our 3-tier architecture (regex → embedding → LLM) is well-aligned with SOTA hybrid approaches. The primary improvement opportunities are: +1. Add RAG-enhanced tier between embedding and LLM +2. Implement SPARQL validation feedback loop +3. Schema-aware slot filling +4. GEPA optimization for DSPy modules + +--- + +## 1. Research Survey + +### 1.1 SPARQL-LLM (arXiv 2512.14277, Dec 2025) + +**Key Innovation**: Real-time SPARQL generation with 24% F1 improvement over TEXT2SPARQL winners. + +**Architecture**: +``` +User Question + ↓ +┌─────────────────────────────────────────┐ +│ Metadata Indexer │ +│ - Schema classes/properties indexed │ +│ - Example Q&A pairs vectorized │ +└─────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────┐ +│ Prompt Builder (RAG) │ +│ - Retrieve similar examples │ +│ - Retrieve relevant schema fragments │ +│ - Compose context-rich prompt │ +└─────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────┐ +│ SPARQL Generator │ +│ - LLM generates SPARQL │ +│ - Validation against schema │ +│ - Iterative correction loop │ +└─────────────────────────────────────────┘ + ↓ +Validated SPARQL +``` + +**Relevance to GLAM**: +- ✅ We have schema (LinkML) but don't use it in prompts +- ✅ We have example Q&A in templates but don't retrieve semantically +- ❌ Missing: Schema-aware validation loop + +### 1.2 COT-SPARQL (SEMANTICS 2024) + +**Key Innovation**: Chain-of-Thought prompting with context injection. 
+ +**Two Context Types**: +- **Context A**: Entity and relation extraction from question +- **Context B**: Most semantically similar example from training set + +**Performance**: 4.4% F1 improvement on QALD-10, 3.0% on QALD-9 + +**Relevance to GLAM**: +- ✅ Our embedding matcher finds similar patterns (partial Context B) +- ❌ Missing: Entity/relation extraction step (Context A) +- ❌ Missing: CoT prompting in LLM tier + +### 1.3 KGQuest (arXiv 2511.11258, Nov 2025) + +**Key Innovation**: Deterministic template generation + LLM refinement. + +**Architecture**: +``` +KG Triplets + ↓ +Cluster by relation type + ↓ +Generate rule-based templates (deterministic) + ↓ +LLM refinement for fluency (lightweight, controlled) +``` + +**Relevance to GLAM**: +- ✅ Validates our template-first approach +- ✅ We use deterministic templates with LLM fallback +- 💡 Insight: Use LLM only for refinement, not generation + +### 1.4 Hybrid Template + LLM Fallback (LinkedIn, May 2024) + +**Key Innovation**: Explicit tiered architecture with fallback. + +**Recommended Pattern**: +```python +def process_query(question): + # Tier 1: Template matching (deterministic, high accuracy) + match = template_matcher.match(question) + if match and match.confidence >= 0.85: + return render_template(match) + + # Tier 2: LLM generation (fallback) + return llm_generate_sparql(question, schema_context) +``` + +**Relevance to GLAM**: +- ✅ We already implement this pattern +- 💡 Our threshold is 0.75, could consider raising for higher precision + +### 1.5 GEPA Optimization (DSPy, 2024-2025) + +**Key Innovation**: Genetic-Pareto optimization for prompt evolution. 
+ +**Approach**: +- Dual-model: Cheap student LM + Smart reflection LM +- Iterate: Run → Analyze failures → Generate improved prompts +- Results: 10-20% accuracy improvements typical + +**Relevance to GLAM**: +- ❌ We use static DSPy signatures without optimization +- 💡 Could apply GEPA to TemplateClassifier and SlotExtractor + +### 1.6 Intent-Driven Hybrid Architecture (2024) + +**Key Pattern**: Intent classification → Template selection → Slot filling → LLM fallback + +``` +User Query + ↓ +┌─────────────────────────────────────────┐ +│ Intent Classifier │ +│ - Embedding-based classification │ +│ - Hierarchical intent taxonomy │ +└─────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────┐ +│ Template Selector │ +│ - Map intent → available templates │ +│ - FAISS/vector retrieval for similar │ +└─────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────┐ +│ Slot Filler │ +│ - Schema-aware extraction │ +│ - Validation against ontology │ +└─────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────┐ +│ LLM Fallback │ +│ - Only when template fails │ +│ - Constrained generation │ +└─────────────────────────────────────────┘ +``` + +**Relevance to GLAM**: +- ✅ We have semantic router for intent +- ✅ We have template classification +- ❌ Missing: Hierarchical intent taxonomy +- ❌ Missing: Schema-aware slot validation + +--- + +## 2. 
Current GLAM Architecture + +### 2.1 Current 3-Tier System + +``` +User Question + ↓ +┌─────────────────────────────────────────┐ +│ TIER 1: Pattern Matching │ +│ - Regex-based template matching │ +│ - Slot type validation │ +│ - Confidence ≥ 0.75 required │ +│ - ~1ms latency │ +└─────────────────────────────────────────┘ + ↓ (if no match) +┌─────────────────────────────────────────┐ +│ TIER 2: Embedding Matching │ +│ - Sentence-transformer embeddings │ +│ - Cosine similarity ≥ 0.70 │ +│ - ~50ms latency (cached) │ +└─────────────────────────────────────────┘ + ↓ (if no match) +┌─────────────────────────────────────────┐ +│ TIER 3: LLM Classification │ +│ - DSPy ChainOfThought │ +│ - Template ID classification │ +│ - ~500-2000ms latency │ +└─────────────────────────────────────────┘ + ↓ +Slot Extraction (DSPy) + ↓ +Template Instantiation (Jinja2) + ↓ +SPARQL Query +``` + +### 2.2 Strengths + +| Aspect | Current Implementation | Rating | +|--------|----------------------|--------| +| Deterministic first | Regex before embeddings before LLM | ⭐⭐⭐⭐⭐ | +| Semantic similarity | Sentence-transformer embeddings | ⭐⭐⭐⭐ | +| Multilingual | Dutch/English/German patterns | ⭐⭐⭐⭐ | +| Conversation context | Context resolver for follow-ups | ⭐⭐⭐⭐ | +| Relevance filtering | Fyke filter for out-of-scope | ⭐⭐⭐⭐ | +| Slot resolution | Synonym resolver with fuzzy match | ⭐⭐⭐⭐ | +| Template variants | Region/country/ISIL variants | ⭐⭐⭐⭐ | + +### 2.3 Gaps vs SOTA + +| Gap | SOTA Reference | Impact | Priority | +|-----|---------------|--------|----------| +| No RAG-enhanced tier | SPARQL-LLM, FIRESPARQL | Medium | High | +| No SPARQL validation loop | SPARQL-LLM | High | High | +| No schema-aware slot filling | Auto-KGQA, LLM-based NL2SPARQL | Medium | Medium | +| No GEPA optimization | DSPy GEPA tutorials | Medium | Medium | +| No hierarchical intents | Intent classification patterns | Low | Low | +| Limited metrics | SPARQL-LLM | Low | Low | + +--- + +## 3. 
Proposed Improvements + +### 3.1 Add Tier 2.5: RAG-Enhanced Matching + +Insert a RAG tier between embedding matching and LLM fallback: + +```python +# After embedding match fails: +@dataclass +class RAGEnhancedMatch: + """Context-enriched matching using similar examples.""" + + def match(self, question: str, templates: dict) -> Optional[TemplateMatchResult]: + # Retrieve top-3 most similar Q&A examples from YAML + similar_examples = self._retrieve_similar_examples(question, k=3) + + # Check if examples strongly suggest a template + template_votes = Counter(ex.template_id for ex in similar_examples) + top_template, count = template_votes.most_common(1)[0] + + if count >= 2: # 2 of 3 examples agree + return TemplateMatchResult( + matched=True, + template_id=top_template, + confidence=0.75 + (count / 3) * 0.15, # 0.85-0.90 + reasoning=f"RAG: {count}/3 similar examples use {top_template}" + ) + return None +``` + +**Benefits**: +- Handles paraphrases that embeddings miss +- Uses existing example data in templates +- Cheaper than LLM fallback + +### 3.2 Add SPARQL Validation Feedback Loop + +After template instantiation, validate SPARQL against schema: + +```python +class SPARQLValidator: + """Validates generated SPARQL against ontology schema.""" + + def __init__(self, schema_path: Path): + self.valid_predicates = self._load_predicates(schema_path) + self.valid_classes = self._load_classes(schema_path) + + def validate(self, sparql: str) -> ValidationResult: + errors = [] + + # Extract predicates used in query + predicates = re.findall(r'(hc:\w+|schema:\w+)', sparql) + for pred in predicates: + if pred not in self.valid_predicates: + errors.append(f"Unknown predicate: {pred}") + + # Extract classes + classes = re.findall(r'a\s+(hcc:\w+)', sparql) + for cls in classes: + if cls not in self.valid_classes: + errors.append(f"Unknown class: {cls}") + + return ValidationResult( + valid=len(errors) == 0, + errors=errors, + suggestions=self._suggest_fixes(errors) + ) + + def 
correct_with_llm(self, sparql: str, errors: list[str]) -> str: + """Use LLM to correct validation errors.""" + prompt = f""" + The following SPARQL query has errors: + + ```sparql + {sparql} + ``` + + Errors found: + {chr(10).join(f'- {e}' for e in errors)} + + Correct the query. Return only the corrected SPARQL. + """ + # Call LLM for correction + return self._call_llm(prompt) +``` + +**Benefits**: +- Catches schema mismatches before execution +- Enables iterative correction (SPARQL-LLM pattern) +- Reduces runtime errors + +### 3.3 Schema-Aware Slot Filling + +Use ontology to validate extracted slot values: + +```python +class SchemaAwareSlotExtractor(dspy.Module): + """Slot extraction with ontology validation.""" + + def __init__(self, ontology_path: Path): + super().__init__() + self.extract = dspy.ChainOfThought(SlotExtractorSignature) + self.ontology = self._load_ontology(ontology_path) + + def forward(self, question: str, template_id: str, ...) -> dict[str, str]: + # Standard DSPy extraction + raw_slots = self.extract(question=question, ...) 
+ + # Validate against ontology + validated_slots = {} + for slot_name, value in raw_slots.items(): + if slot_name == "institution_type": + # Check if value maps to valid hc:institutionType + if value in self.ontology.institution_types: + validated_slots[slot_name] = value + else: + # Try fuzzy match against ontology + match = self._fuzzy_match_ontology(value, "institution_types") + if match: + validated_slots[slot_name] = match + logger.info(f"Corrected slot: {value} → {match}") + + return validated_slots +``` + +**Benefits**: +- Ensures slot values are ontology-compliant +- Auto-corrects minor extraction errors +- Reduces downstream SPARQL errors + +### 3.4 GEPA Optimization for DSPy Modules + +Add GEPA optimization training for key modules: + +```python +# backend/rag/optimization/gepa_training.py + +import dspy +from dspy import GEPA + +def optimize_template_classifier(): + """Optimize TemplateClassifier using GEPA.""" + + # Load training data from template examples + training_data = load_training_examples() + + # Define metric + def classification_metric(example, prediction): + return 1.0 if prediction.template_id == example.expected_template else 0.0 + + # Initialize GEPA optimizer + optimizer = GEPA( + metric=classification_metric, + num_candidates=10, + num_threads=4, + ) + + # Optimize + classifier = TemplateClassifier() + optimized = optimizer.compile( + classifier, + trainset=training_data, + max_rounds=5, + ) + + # Save optimized module + optimized.save("optimized_template_classifier.json") + + return optimized +``` + +**Benefits**: +- 10-20% accuracy improvement typical +- Automated prompt refinement +- Domain-specific optimization + +### 3.5 Hierarchical Intent Classification + +Structure intents hierarchically for scalability: + +```yaml +# Intent taxonomy for 50+ intents +intent_hierarchy: + geographic: + - list_by_location + - count_by_location + - compare_locations + temporal: + - point_in_time + - timeline + - events_in_period + - founding_date + 
entity: + - find_by_name + - find_by_identifier + statistical: + - count_by_type + - distribution + - aggregation + financial: + - budget_threshold + - expense_comparison +``` + +```python +class HierarchicalIntentClassifier: + """Two-stage intent classification for scalability.""" + + def classify(self, question: str) -> IntentResult: + # Stage 1: Classify into top-level category (5 options) + top_level = self._classify_top_level(question) # geographic, temporal, etc. + + # Stage 2: Classify into specific intent within category + specific = self._classify_specific(question, top_level) + + return IntentResult( + top_level=top_level, + specific=specific, + confidence=min(top_level.confidence, specific.confidence) + ) +``` + +**Benefits**: +- Scales to 50+ templates without accuracy loss +- Faster classification (fewer options per stage) +- Better organized codebase + +--- + +## 4. Implementation Priority + +### Phase 1: High Impact (1-2 days) + +1. **SPARQL Validation Loop** (3.2) + - Load schema from LinkML + - Validate predicates/classes + - Add LLM correction step + +2. **Metrics Enhancement** (addresses the "Limited metrics" gap in 2.3; not yet detailed in Section 3) + - Track tier usage distribution + - Track latency per tier + - Track validation error rates + +### Phase 2: Medium Impact (2-3 days) + +3. **RAG-Enhanced Tier** (3.1) + - Index template examples + - Implement retrieval + - Add as Tier 2.5 + +4. **Schema-Aware Slot Filling** (3.3) + - Load ontology + - Validate extracted values + - Auto-correct mismatches + +### Phase 3: Optimization (3-5 days) + +5. **GEPA Training** (3.4) + - Create training dataset + - Define metrics + - Run optimization + - Deploy optimized modules + +6. **Hierarchical Intents** (3.5) + - Design taxonomy + - Implement two-stage classifier + - Migrate existing templates + +--- + +## 5. 
Expected Outcomes + +| Improvement | Expected Impact | Measurement | +|-------------|-----------------|-------------| +| SPARQL Validation | -50% runtime errors | Error rate tracking | +| RAG-Enhanced Tier | +5-10% template match rate | Tier 2.5 success rate | +| Schema-Aware Slots | -30% slot errors | Validation error logs | +| GEPA Optimization | +10-20% LLM tier accuracy | Template classification F1 | +| Hierarchical Intents | Ready for 50+ templates | Intent classification latency | + +--- + +## 6. References + +1. SPARQL-LLM (arXiv:2512.14277) - Real-time SPARQL generation +2. COT-SPARQL (SEMANTICS 2024) - Chain-of-Thought prompting +3. KGQuest (arXiv:2511.11258) - Deterministic template + LLM refinement +4. FIRESPARQL (arXiv:2508.10467) - Modular framework with fine-tuning +5. Auto-KGQA (ESWC 2024) - Autonomous KG subgraph selection +6. DSPy GEPA - Reflective prompt evolution +7. Hybrid NLQ→SPARQL (LinkedIn 2024) - Template-first with LLM fallback