feat(rag): add factual query fast path - skip LLM for count/list queries
- Add ontology cache warming at startup in the lifespan() function
- Add is_factual_query() detection in template_sparql.py (12 templates)
- Add factual_result and sparql_query fields to DSPyQueryResponse
- Skip LLM generation for factual templates (count, list, compare)
- Execute SPARQL directly and return results as a table (~15s → ~2s latency)
- Update ConversationPanel.tsx to render the factual results table
- Add CSS styling for factual results with a green theme

For queries like 'hoeveel archieven zijn er in Den Haag', the SPARQL results ARE the answer - no need for expensive LLM prose generation.
This commit is contained in:
parent
85d9cee82f
commit
0b0ea75070
4 changed files with 1458 additions and 59 deletions
|
|
@@ -683,6 +683,10 @@ class DSPyQueryResponse(BaseModel):
|
|||
template_used: bool = False # Whether template-based SPARQL was used (vs LLM generation)
|
||||
template_id: str | None = None # Which template was used (e.g., "institution_by_city", "person_by_name")
|
||||
|
||||
# Factual query mode - skip LLM generation for count/list queries
|
||||
factual_result: bool = False # True if this is a direct SPARQL result (no LLM prose generation)
|
||||
sparql_query: str | None = None # The SPARQL query that was executed (for transparency)
|
||||
|
||||
|
||||
def extract_llm_response_metadata(
|
||||
lm: Any,
|
||||
|
|
@@ -1833,6 +1837,28 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
|
|||
except Exception as e:
|
||||
logger.warning(f"Failed to initialize AtomicCacheManager: {e}")
|
||||
|
||||
# === ONTOLOGY CACHE WARMUP: Pre-load KG values to avoid cold-start latency ===
|
||||
# The OntologyLoader queries the Knowledge Graph for valid slot values (cities, regions, types).
|
||||
# These queries can take 1-3 seconds each on first access.
|
||||
# By pre-loading at startup, we eliminate this delay for users.
|
||||
ontology_warmup_start = time.perf_counter()
|
||||
try:
|
||||
from template_sparql import get_ontology_loader
|
||||
|
||||
logger.info("Warming up ontology cache (pre-loading KG values)...")
|
||||
ontology = get_ontology_loader()
|
||||
ontology.load() # Triggers KG queries for institution_types, subregions, cities, etc.
|
||||
|
||||
ontology_warmup_duration = time.perf_counter() - ontology_warmup_start
|
||||
cache_stats = ontology.get_kg_cache_stats()
|
||||
logger.info(
|
||||
f"✅ Ontology cache warmed up in {ontology_warmup_duration:.2f}s "
|
||||
f"({cache_stats['cache_size']} KG queries cached, TTL={cache_stats['ttl_seconds']}s)"
|
||||
)
|
||||
except Exception as e:
|
||||
ontology_warmup_duration = time.perf_counter() - ontology_warmup_start
|
||||
logger.warning(f"Failed to warm up ontology cache: {e}")
|
||||
|
||||
logger.info("Heritage RAG API started")
|
||||
|
||||
yield
|
||||
|
|
@@ -2967,6 +2993,158 @@ async def dspy_query(request: DSPyQueryRequest) -> DSPyQueryResponse:
|
|||
except Exception as e:
|
||||
logger.warning(f"Atomic decomposition failed: {e}")
|
||||
|
||||
# ==========================================================================
|
||||
# FACTUAL QUERY FAST PATH: Skip LLM for count/list queries
|
||||
# ==========================================================================
|
||||
# For factual queries (counts, lists, comparisons), the SPARQL results ARE
|
||||
# the answer. No need for expensive LLM prose generation - just return the
|
||||
# table directly. This can reduce latency from ~15s to ~2s.
|
||||
# ==========================================================================
|
||||
try:
|
||||
from template_sparql import get_template_pipeline, is_factual_query
|
||||
|
||||
template_pipeline = get_template_pipeline()
|
||||
|
||||
# Build conversation history for template context resolution
|
||||
history_for_template = []
|
||||
for turn in request.context:
|
||||
if turn.get("question") and turn.get("answer"):
|
||||
history_for_template.append({
|
||||
"question": turn["question"],
|
||||
"answer": turn["answer"]
|
||||
})
|
||||
|
||||
# Try template matching (this handles follow-up resolution internally)
|
||||
template_result = template_pipeline(
|
||||
question=request.question,
|
||||
language=request.language,
|
||||
history=history_for_template,
|
||||
conversation_state=conversation_state,
|
||||
)
|
||||
|
||||
# Check if this is a factual query that can skip LLM
|
||||
if template_result.matched and is_factual_query(template_result.template_id):
|
||||
logger.info(
|
||||
f"[FACTUAL-QUERY] Template '{template_result.template_id}' is factual - "
|
||||
f"skipping LLM generation (confidence={template_result.confidence:.2f})"
|
||||
)
|
||||
|
||||
# Execute SPARQL directly
|
||||
sparql_query = template_result.sparql
|
||||
sparql_results: list[dict[str, Any]] = []
|
||||
sparql_error: str | None = None
|
||||
|
||||
try:
|
||||
if retriever:
|
||||
client = await retriever._get_sparql_client()
|
||||
response = await client.post(
|
||||
settings.sparql_endpoint,
|
||||
data={"query": sparql_query},
|
||||
headers={"Accept": "application/sparql-results+json"},
|
||||
timeout=30.0,
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
bindings = data.get("results", {}).get("bindings", [])
|
||||
sparql_results = [
|
||||
{k: v.get("value") for k, v in binding.items()}
|
||||
for binding in bindings
|
||||
]
|
||||
else:
|
||||
sparql_error = f"SPARQL returned {response.status_code}"
|
||||
else:
|
||||
sparql_error = "Retriever not available"
|
||||
|
||||
except Exception as e:
|
||||
sparql_error = str(e)
|
||||
logger.warning(f"[FACTUAL-QUERY] SPARQL execution failed: {e}")
|
||||
|
||||
elapsed_ms = (time.time() - start_time) * 1000
|
||||
|
||||
# Generate a simple summary answer based on result type
|
||||
if sparql_error:
|
||||
answer = f"Er is een fout opgetreden bij het uitvoeren van de query: {sparql_error}"
|
||||
elif not sparql_results:
|
||||
answer = "Geen resultaten gevonden."
|
||||
elif template_result.template_id and "count" in template_result.template_id:
|
||||
# Count query - format as count
|
||||
count_value = sparql_results[0].get("count", len(sparql_results))
|
||||
answer = f"Aantal: {count_value}"
|
||||
else:
|
||||
# List query - just indicate result count
|
||||
answer = f"Gevonden: {len(sparql_results)} resultaten. Zie de tabel hieronder."
|
||||
|
||||
# Build response with factual_result=True
|
||||
factual_response = DSPyQueryResponse(
|
||||
question=request.question,
|
||||
resolved_question=getattr(template_result, "resolved_question", None),
|
||||
answer=answer,
|
||||
sources_used=["SPARQL Knowledge Graph"],
|
||||
visualization={"type": "table", "sparql_query": sparql_query},
|
||||
retrieved_results=sparql_results,
|
||||
query_type="factual",
|
||||
query_time_ms=round(elapsed_ms, 2),
|
||||
conversation_turn=len(request.context),
|
||||
cache_hit=False,
|
||||
session_id=session_id,
|
||||
template_used=True,
|
||||
template_id=template_result.template_id,
|
||||
factual_result=True,
|
||||
sparql_query=sparql_query,
|
||||
)
|
||||
|
||||
# Update session with this turn
|
||||
if session_mgr and session_id:
|
||||
try:
|
||||
await session_mgr.add_turn_to_session(
|
||||
session_id=session_id,
|
||||
question=request.question,
|
||||
answer=answer,
|
||||
resolved_question=getattr(template_result, "resolved_question", None),
|
||||
template_id=template_result.template_id,
|
||||
slots=template_result.slots or {},
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to update session: {e}")
|
||||
|
||||
# Record metrics
|
||||
if METRICS_AVAILABLE and record_query:
|
||||
try:
|
||||
record_query(
|
||||
endpoint="dspy_query",
|
||||
template_used=True,
|
||||
template_id=template_result.template_id,
|
||||
cache_hit=False,
|
||||
status="success",
|
||||
duration_seconds=elapsed_ms / 1000,
|
||||
intent="factual",
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to record metrics: {e}")
|
||||
|
||||
# Cache the response
|
||||
if retriever:
|
||||
await retriever.cache.set_dspy(
|
||||
question=request.question,
|
||||
language=request.language,
|
||||
llm_provider="none", # No LLM used
|
||||
embedding_model=request.embedding_model,
|
||||
response=factual_response.model_dump(),
|
||||
context=request.context if request.context else None,
|
||||
)
|
||||
|
||||
logger.info(f"[FACTUAL-QUERY] Returned {len(sparql_results)} results in {elapsed_ms:.2f}ms (LLM skipped)")
|
||||
return factual_response
|
||||
|
||||
except ImportError as e:
|
||||
logger.debug(f"Template SPARQL not available for factual query detection: {e}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Factual query detection failed (continuing with full pipeline): {e}")
|
||||
|
||||
# ==========================================================================
|
||||
# FULL RAG PIPELINE: For non-factual queries or when factual detection fails
|
||||
# ==========================================================================
|
||||
try:
|
||||
# Import DSPy pipeline and History
|
||||
import dspy
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@@ -1033,3 +1033,107 @@
|
|||
font-size: 0.625rem;
|
||||
}
|
||||
}
|
||||
|
||||
/* =============================================================================
|
||||
FACTUAL QUERY RESULTS TABLE
|
||||
For direct SPARQL results when LLM generation is skipped (count/list queries)
|
||||
============================================================================= */
|
||||
|
||||
.conversation-panel__factual-results {
|
||||
margin-top: 0.75rem;
|
||||
border: 1px solid #10b981;
|
||||
border-radius: 8px;
|
||||
overflow: hidden;
|
||||
background: white;
|
||||
}
|
||||
|
||||
.conversation-panel__factual-badge {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
padding: 0.5rem 0.75rem;
|
||||
background: linear-gradient(135deg, #dcfce7 0%, #d1fae5 100%);
|
||||
border-bottom: 1px solid #10b981;
|
||||
font-size: 0.75rem;
|
||||
font-weight: 600;
|
||||
color: #166534;
|
||||
}
|
||||
|
||||
.conversation-panel__factual-badge svg {
|
||||
color: #10b981;
|
||||
}
|
||||
|
||||
.conversation-panel__results-table-wrapper {
|
||||
overflow-x: auto;
|
||||
max-height: 400px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.conversation-panel__results-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
font-size: 0.8125rem;
|
||||
}
|
||||
|
||||
.conversation-panel__results-table th {
|
||||
position: sticky;
|
||||
top: 0;
|
||||
background: #f0fdf4;
|
||||
padding: 0.5rem 0.75rem;
|
||||
text-align: left;
|
||||
font-weight: 600;
|
||||
color: #166534;
|
||||
border-bottom: 2px solid #10b981;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.conversation-panel__results-table td {
|
||||
padding: 0.5rem 0.75rem;
|
||||
border-bottom: 1px solid #e5e7eb;
|
||||
vertical-align: top;
|
||||
max-width: 300px;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
}
|
||||
|
||||
.conversation-panel__results-table tr:hover td {
|
||||
background: #f0fdf4;
|
||||
}
|
||||
|
||||
.conversation-panel__results-table a {
|
||||
color: #10b981;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.conversation-panel__results-table a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.conversation-panel__results-truncated {
|
||||
padding: 0.5rem 0.75rem;
|
||||
background: #fef3c7;
|
||||
color: #92400e;
|
||||
font-size: 0.75rem;
|
||||
text-align: center;
|
||||
border-top: 1px solid #fbbf24;
|
||||
}
|
||||
|
||||
/* Responsive: Factual Results Table */
|
||||
@media (max-width: 768px) {
|
||||
.conversation-panel__results-table-wrapper {
|
||||
max-height: 250px;
|
||||
}
|
||||
|
||||
.conversation-panel__results-table {
|
||||
font-size: 0.75rem;
|
||||
}
|
||||
|
||||
.conversation-panel__results-table th,
|
||||
.conversation-panel__results-table td {
|
||||
padding: 0.375rem 0.5rem;
|
||||
}
|
||||
|
||||
.conversation-panel__results-table td {
|
||||
max-width: 150px;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -213,6 +213,7 @@ interface Message {
|
|||
errorCode?: string;
|
||||
llmProviderUsed?: string; // Which LLM provider generated this response
|
||||
llmResponse?: LLMResponseMetadata; // Full LLM response metadata including chain-of-thought
|
||||
factualResult?: boolean; // True if this is a direct SPARQL result (no LLM prose generation)
|
||||
}
|
||||
|
||||
interface HistoryItem {
|
||||
|
|
@@ -374,6 +375,7 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
|||
sourcesUsed: string[];
|
||||
llmProviderUsed?: string;
|
||||
llmResponse?: LLMResponseMetadata; // Full LLM response with reasoning_content
|
||||
factualResult?: boolean; // True if LLM was skipped for factual query
|
||||
}> => {
|
||||
// Determine API endpoint based on environment
|
||||
const hostname = window.location.hostname;
|
||||
|
|
@@ -414,12 +416,13 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
|||
|
||||
const data = await response.json();
|
||||
return {
|
||||
sparql: data.visualization?.sparql_query || data.sparql, // Get SPARQL from visualization if available
|
||||
sparql: data.visualization?.sparql_query || data.sparql_query || data.sparql, // Get SPARQL from visualization or direct field
|
||||
sparqlResults: data.retrieved_results, // Raw results for debug display
|
||||
answer: data.answer || data.explanation || '',
|
||||
sourcesUsed: data.sources_used || selectedSources,
|
||||
llmProviderUsed: data.llm_provider_used,
|
||||
llmResponse: data.llm_response, // Pass through chain-of-thought metadata
|
||||
factualResult: data.factual_result, // True if LLM was skipped for factual query
|
||||
};
|
||||
};
|
||||
|
||||
|
|
@@ -472,6 +475,7 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
|||
sourcesUsed: result.sourcesUsed,
|
||||
llmProviderUsed: result.llmProviderUsed,
|
||||
llmResponse: result.llmResponse,
|
||||
factualResult: result.factualResult, // Direct SPARQL result flag
|
||||
isLoading: false,
|
||||
}
|
||||
: msg
|
||||
|
|
@@ -965,6 +969,51 @@ export const ConversationPanel: React.FC<ConversationPanelProps> = ({ onQueryGen
|
|||
<>
|
||||
<p>{message.content}</p>
|
||||
|
||||
{/* Factual Query Results Table - shown when LLM was skipped */}
|
||||
{message.factualResult && message.sparqlResults && message.sparqlResults.length > 0 && (
|
||||
<div className="conversation-panel__factual-results">
|
||||
<div className="conversation-panel__factual-badge">
|
||||
<Database size={14} />
|
||||
<span>{language === 'nl' ? 'Direct uit kennisgraaf' : 'Direct from knowledge graph'}</span>
|
||||
</div>
|
||||
<div className="conversation-panel__results-table-wrapper">
|
||||
<table className="conversation-panel__results-table">
|
||||
<thead>
|
||||
<tr>
|
||||
{Object.keys(message.sparqlResults[0]).map(key => (
|
||||
<th key={key}>{key}</th>
|
||||
))}
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{message.sparqlResults.slice(0, 50).map((row, idx) => (
|
||||
<tr key={idx}>
|
||||
{Object.entries(row).map(([key, value]) => (
|
||||
<td key={key}>
|
||||
{typeof value === 'string' && value.startsWith('http') ? (
|
||||
<a href={value} target="_blank" rel="noopener noreferrer" title={value}>
|
||||
{value.split('/').pop() || value}
|
||||
</a>
|
||||
) : (
|
||||
String(value ?? '')
|
||||
)}
|
||||
</td>
|
||||
))}
|
||||
</tr>
|
||||
))}
|
||||
</tbody>
|
||||
</table>
|
||||
{message.sparqlResults.length > 50 && (
|
||||
<div className="conversation-panel__results-truncated">
|
||||
{language === 'nl'
|
||||
? `Toont 50 van ${message.sparqlResults.length} resultaten`
|
||||
: `Showing 50 of ${message.sparqlResults.length} results`}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Chain-of-Thought Reasoning (GLM 4.7 Interleaved Thinking) */}
|
||||
{message.llmResponse?.reasoning_content && (
|
||||
<details className="conversation-panel__reasoning">
|
||||
|
|
|
|||
Loading…
Reference in a new issue