fix(rag): correct SPARQL ontology prefixes for LinkML schema

- Update HeritageSPARQLGenerator docstring with correct prefixes
- Change main class from hc:Custodian to crm:E39_Actor
- Change type property from hcp:institutionType to org:classification
- Update type values from single letters to full names (MUSEUM, ARCHIVE, etc.)
- Add rate limit handling with exponential backoff for 429 errors
- Fix test_live_rag.py sample queries to use correct ontology
- Update optimized_models instructions with correct prefixes
This commit is contained in:
kempersc 2025-12-22 21:31:08 +01:00
parent 7a056fa746
commit 8e97a7beca
4 changed files with 584 additions and 86 deletions

View file

@ -20,6 +20,7 @@ from __future__ import annotations
import asyncio import asyncio
import json import json
import logging import logging
import random
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime, timezone from datetime import datetime, timezone
from enum import Enum from enum import Enum
@ -32,6 +33,104 @@ from dspy.streaming import StatusMessage, StreamListener, StatusMessageProvider
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# =============================================================================
# RATE LIMIT HANDLING
# =============================================================================
def is_rate_limit_error(error: Exception) -> bool:
    """Return True when *error* looks like a rate-limit (HTTP 429) failure.

    Matches on the error text itself, on any sub-exception of an
    ExceptionGroup (as raised by asyncio.TaskGroup), and on the explicit
    ``__cause__`` chain.
    """
    text = str(error).lower()
    # Direct textual indicators: HTTP 429, "rate", or provider code 1305.
    if any(marker in text for marker in ('429', 'rate', '1305')):
        return True
    # ExceptionGroups (from asyncio.TaskGroup) expose `.exceptions`.
    nested = getattr(error, 'exceptions', ())
    if any(is_rate_limit_error(inner) for inner in nested):
        return True
    # Follow the explicit cause chain, if present.
    cause = error.__cause__
    return cause is not None and is_rate_limit_error(cause)
def extract_actual_error(error: Exception) -> Exception:
    """Unwrap *error*, returning the most relevant nested exception.

    For anything exposing ``.exceptions`` (e.g. an ExceptionGroup from
    asyncio.TaskGroup) this prefers a nested rate-limit error; failing
    that, it returns the first sub-exception that itself unwraps to
    something different. Plain exceptions are returned unchanged.
    """
    for inner in getattr(error, 'exceptions', ()):
        # A rate-limit error is the most informative thing to surface.
        if is_rate_limit_error(inner):
            return inner
        # Recurse into nested groups; a change of identity means a hit.
        unwrapped = extract_actual_error(inner)
        if unwrapped is not inner:
            return unwrapped
    return error
async def call_with_rate_limit_retry(
    func: Callable,
    *args,
    max_retries: int = 3,
    base_delay: float = 2.0,
    max_delay: float = 30.0,
    **kwargs
) -> Any:
    """Call a function with exponential backoff retry on rate limit errors.

    Args:
        func: The function to call. May be sync or async — anything whose
            return value is awaitable (coroutine, Task, Future) is awaited.
        *args: Positional arguments forwarded to ``func``.
        max_retries: Maximum number of retry attempts after the first call.
        base_delay: Initial backoff delay in seconds.
        max_delay: Upper bound on a single backoff delay in seconds.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The function's (awaited) return value.

    Raises:
        Exception: The original exception if max retries are exceeded or
            the error is not a rate-limit error.
    """
    import inspect  # local import: avoids touching the module import block

    last_exception = None
    for attempt in range(max_retries + 1):
        try:
            # Call the function (handle both sync and async).
            result = func(*args, **kwargs)
            # `inspect.isawaitable` also covers Tasks/Futures and custom
            # awaitables, which `asyncio.iscoroutine` would miss.
            if inspect.isawaitable(result):
                result = await result
            return result
        except Exception as e:
            last_exception = e
            actual_error = extract_actual_error(e)
            if is_rate_limit_error(e) and attempt < max_retries:
                # Exponential backoff with jitter, capped at max_delay.
                delay = min(base_delay * (2 ** attempt) + random.uniform(0, 1), max_delay)
                logger.warning(
                    f"Rate limited (attempt {attempt + 1}/{max_retries + 1}), "
                    f"waiting {delay:.1f}s before retry. Error: {actual_error}"
                )
                await asyncio.sleep(delay)
            else:
                # Not a rate limit error, or max retries exceeded.
                if is_rate_limit_error(e):
                    logger.error(f"Max retries ({max_retries}) exceeded for rate limit")
                raise
    # Defensive guard: the loop always returns or raises before this point.
    raise last_exception if last_exception else RuntimeError("Unexpected retry loop exit")
# Semantic cache imports (graceful degradation if not available) # Semantic cache imports (graceful degradation if not available)
SEMANTIC_CACHE_AVAILABLE = False SEMANTIC_CACHE_AVAILABLE = False
get_cache: Optional[Callable[[], Any]] = None get_cache: Optional[Callable[[], Any]] = None
@ -229,71 +328,95 @@ class HeritageQueryIntent(dspy.Signature):
class HeritageSPARQLGenerator(dspy.Signature): class HeritageSPARQLGenerator(dspy.Signature):
"""Generate SPARQL queries for heritage custodian knowledge graph. """Generate SPARQL queries for heritage custodian knowledge graph.
You are an expert in SPARQL and the Heritage Custodian Ontology. You are an expert in SPARQL and the Heritage Custodian Ontology (based on LinkML schema).
Generate valid SPARQL queries that work with our Oxigraph endpoint. Generate valid SPARQL queries that work with our Oxigraph endpoint.
Key prefixes (MUST USE THESE EXACT URIs): REQUIRED PREFIXES (MUST USE THESE EXACT URIs):
- PREFIX hc: <https://nde.nl/ontology/hc/class/> - PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
- PREFIX hcp: <https://w3id.org/heritage/custodian/> - PREFIX org: <http://www.w3.org/ns/org#>
- PREFIX ghcid: <https://w3id.org/heritage/custodian/>
- PREFIX skos: <http://www.w3.org/2004/02/skos/core#> - PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
- PREFIX schema: <http://schema.org/> - PREFIX schema: <http://schema.org/>
- PREFIX foaf: <http://xmlns.com/foaf/0.1/> - PREFIX foaf: <http://xmlns.com/foaf/0.1/>
- PREFIX dct: <http://purl.org/dc/terms/> - PREFIX dcterms: <http://purl.org/dc/terms/>
- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/> - PREFIX hc: <https://nde.nl/ontology/hc/>
- PREFIX wdt: <http://www.wikidata.org/prop/direct/> - PREFIX rico: <https://www.ica.org/standards/RiC/ontology#>
- PREFIX prov: <http://www.w3.org/ns/prov#>
- PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> - PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
Key classes: MAIN CLASS (from LinkML Custodian.yaml):
- hc:Custodian - Heritage custodian institution - crm:E39_Actor - Heritage custodian institution (this is THE class for all custodians)
- schema:Place - Geographic location
- foaf:OnlineAccount - Social media profile
Key properties: KEY PROPERTIES:
- skos:prefLabel - Institution name (literal) - dcterms:identifier - Unique identifier (hc_id)
- hcp:institutionType - Type code (M, L, A, G, etc.) - skos:prefLabel - Preferred/display name (literal)
- schema:addressCountry - Country (Wikidata entity) - org:classification - Custodian type (MUSEUM, LIBRARY, ARCHIVE, etc.)
- foaf:homepage - Website URL
- crm:P53_has_former_or_current_location - Location link - crm:P53_has_former_or_current_location - Location link
- foaf:homepage - Website URL
- org:subOrganizationOf - Parent organization
- crm:P46_is_composed_of - Collection links
- schema:foundingDate - Founding date (xsd:date)
TEMPORAL PROPERTIES (for founding/oldest queries): CUSTODIAN TYPE VALUES (use FULL names, not single letters):
- schema:foundingDate - Institution founding date (xsd:date, e.g., "1800-01-01") MUSEUM, LIBRARY, ARCHIVE, GALLERY, OFFICIAL_INSTITUTION, RESEARCH_CENTER,
- hcp:foundingYear - Founding year as integer (xsd:integer, e.g., 1800) COMMERCIAL, UNSPECIFIED, BIO_CUSTODIAN, EDUCATION_PROVIDER, HERITAGE_SOCIETY,
- wdt:P571 - Wikidata inception date (same as schema:foundingDate) FEATURE_CUSTODIAN, INTANGIBLE_HERITAGE_GROUP, MIXED, PERSONAL_COLLECTION,
HOLY_SACRED_SITE, DIGITAL_PLATFORM, NON_PROFIT, TASTE_SCENT_HERITAGE
Example - Find oldest archives: Example - Find all museums:
```sparql ```sparql
PREFIX schema: <http://schema.org/> PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX hcp: <https://w3id.org/heritage/custodian/>
SELECT ?inst ?name ?founded WHERE { SELECT ?custodian ?name WHERE {
?inst a <https://nde.nl/ontology/hc/class/Custodian> ; ?custodian a crm:E39_Actor ;
skos:prefLabel ?name ; org:classification "MUSEUM" ;
hcp:institutionType "A" ; skos:prefLabel ?name .
schema:foundingDate ?founded . }
LIMIT 100
```
Example - Find custodian by name (case-insensitive search):
```sparql
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT ?custodian ?name WHERE {
?custodian a crm:E39_Actor ;
skos:prefLabel ?name .
FILTER(CONTAINS(LCASE(STR(?name)), "rijksmuseum"))
}
```
Example - Count custodians by type:
```sparql
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX org: <http://www.w3.org/ns/org#>
SELECT ?type (COUNT(?custodian) AS ?count) WHERE {
?custodian a crm:E39_Actor ;
org:classification ?type .
}
GROUP BY ?type
ORDER BY DESC(?count)
```
Example - Find oldest archives by founding date:
```sparql
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX schema: <http://schema.org/>
SELECT ?custodian ?name ?founded WHERE {
?custodian a crm:E39_Actor ;
org:classification "ARCHIVE" ;
skos:prefLabel ?name ;
schema:foundingDate ?founded .
} }
ORDER BY ?founded ORDER BY ?founded
LIMIT 10 LIMIT 10
``` ```
Example - Find museums founded before 1900:
```sparql
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX hcp: <https://w3id.org/heritage/custodian/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT ?inst ?name ?year WHERE {
?inst a <https://nde.nl/ontology/hc/class/Custodian> ;
skos:prefLabel ?name ;
hcp:institutionType "M" ;
hcp:foundingYear ?year .
FILTER(?year < 1900)
}
ORDER BY ?year
LIMIT 20
```
""" """
question: str = dspy.InputField(desc="Natural language question") question: str = dspy.InputField(desc="Natural language question")
@ -1530,14 +1653,16 @@ def create_heritage_tools(
JSON string of nearby institutions with distances JSON string of nearby institutions with distances
""" """
sparql = f""" sparql = f"""
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX geo: <http://www.opengis.net/ont/geosparql#>
PREFIX hco: <https://w3id.org/hco/>
PREFIX geof: <http://www.opengis.net/def/function/geosparql/> PREFIX geof: <http://www.opengis.net/def/function/geosparql/>
SELECT ?inst ?name ?type ?distance WHERE {{ SELECT ?inst ?name ?type ?distance WHERE {{
?inst a hco:HeritageCustodian ; ?inst a crm:E39_Actor ;
hco:name ?name ; skos:prefLabel ?name ;
hco:institutionType ?type ; org:classification ?type ;
geo:hasGeometry/geo:asWKT ?wkt . geo:hasGeometry/geo:asWKT ?wkt .
BIND(geof:distance(?wkt, "POINT({longitude} {latitude})"^^geo:wktLiteral, <http://www.opengis.net/def/uom/OGC/1.0/kilometre>) AS ?distance) BIND(geof:distance(?wkt, "POINT({longitude} {latitude})"^^geo:wktLiteral, <http://www.opengis.net/def/uom/OGC/1.0/kilometre>) AS ?distance)
@ -1571,27 +1696,30 @@ def create_heritage_tools(
JSON string with full institution details JSON string with full institution details
""" """
sparql = f""" sparql = f"""
PREFIX hco: <https://w3id.org/hco/> PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX dct: <http://purl.org/dc/terms/> PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX schema: <http://schema.org/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT * WHERE {{ SELECT * WHERE {{
?inst a hco:HeritageCustodian . ?inst a crm:E39_Actor .
{{ {{
?inst hco:ghcid "{identifier}" ?inst dcterms:identifier "{identifier}"
}} UNION {{ }} UNION {{
?inst hco:isil "{identifier}" ?inst dcterms:identifier "{identifier}"
}} UNION {{ }} UNION {{
?inst dct:identifier <http://www.wikidata.org/entity/{identifier}> ?inst dcterms:identifier <http://www.wikidata.org/entity/{identifier}>
}} }}
?inst skos:prefLabel ?name . ?inst skos:prefLabel ?name .
OPTIONAL {{ ?inst hco:institutionType ?type }} OPTIONAL {{ ?inst org:classification ?type }}
OPTIONAL {{ ?inst hco:city ?city }} OPTIONAL {{ ?inst crm:P53_has_former_or_current_location ?location }}
OPTIONAL {{ ?inst hco:country ?country }} OPTIONAL {{ ?inst schema:foundingDate ?founded }}
OPTIONAL {{ ?inst hco:foundingDate ?founded }} OPTIONAL {{ ?inst dcterms:description ?desc }}
OPTIONAL {{ ?inst hco:description ?desc }} OPTIONAL {{ ?inst foaf:homepage ?website }}
}} }}
""" """
return query_knowledge_graph(sparql) return query_knowledge_graph(sparql)
@ -1622,15 +1750,15 @@ def create_heritage_tools(
type_filter = f'FILTER(?type = "{institution_type}")' type_filter = f'FILTER(?type = "{institution_type}")'
sparql = f""" sparql = f"""
PREFIX hco: <https://w3id.org/hco/> PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX org: <http://www.w3.org/ns/org#>
SELECT ?{group_by} (COUNT(?inst) AS ?count) WHERE {{ SELECT ?type (COUNT(?inst) AS ?count) WHERE {{
?inst a hco:HeritageCustodian ; ?inst a crm:E39_Actor ;
hco:{group_by} ?{group_by} ; org:classification ?type .
hco:institutionType ?type .
{type_filter} {type_filter}
}} }}
GROUP BY ?{group_by} GROUP BY ?type
ORDER BY DESC(?count) ORDER BY DESC(?count)
LIMIT 50 LIMIT 50
""" """
@ -3264,6 +3392,376 @@ class HeritageRAGPipeline(dspy.Module):
return prediction return prediction
async def forward_streaming(
    self,
    question: str,
    language: str = "nl",
    history: History | None = None,
    include_viz: bool = True,
    skip_cache: bool = False,
    embedding_model: str | None = None,
) -> AsyncIterator[dict]:
    """Execute RAG pipeline with streaming answer generation.

    Yields dictionaries with different types:
    - {"type": "retrieval_complete", "context": ..., "routing": ..., "retrieved_results": ...}
    - {"type": "token", "content": "..."} (streaming answer tokens)
    - {"type": "answer_complete", "prediction": ...} (final result)
    May also yield {"type": "cache_hit", ...} and {"type": "status", ...} events.

    Args:
        question: User's natural language question
        language: Response language (nl, en)
        history: Previous conversation turns for multi-turn context
        include_viz: Whether to include visualization config
        skip_cache: Force bypass cache lookup
        embedding_model: Optional embedding model (minilm_384, openai_1536, bge_768)

    Yields:
        Dictionaries with streaming progress and tokens
    """
    import asyncio

    # Initialize empty history if not provided
    if history is None:
        history = History(messages=[])

    # =================================================================
    # Cache Check - Look for cached response before expensive LLM calls
    # =================================================================
    if SEMANTIC_CACHE_AVAILABLE and not skip_cache:
        try:
            if should_bypass_cache is not None and get_cache is not None:
                if not should_bypass_cache(question):
                    cache = get_cache()
                    cached_response = cache.get_sync(question, language=language)
                    # Warm-up entries are placeholders, not real answers.
                    if cached_response and not cached_response.get("_warmup_entry"):
                        logger.info(f"Cache HIT for streaming query: {question[:50]}...")
                        # Return cached response immediately
                        yield {
                            "type": "cache_hit",
                            "prediction": Prediction(
                                answer=cached_response.get("answer", ""),
                                intent=cached_response.get("intent", "exploration"),
                                entities=cached_response.get("entities", []),
                                sparql=cached_response.get("sparql"),
                                sources_used=cached_response.get("sources_used", []),
                                confidence=cached_response.get("confidence", 0.9),
                                citations=cached_response.get("citations", []),
                                follow_up=cached_response.get("follow_up", []),
                                visualization=cached_response.get("visualization"),
                                cache_hit=True,
                            )
                        }
                        return
        except Exception as e:
            # Cache failures must never break streaming; fall through to full pipeline.
            logger.warning(f"Cache lookup failed in streaming mode: {e}")

    # =================================================================
    # RETRIEVAL PHASE - Run synchronously in thread pool to not block
    # =================================================================
    detected_query_type = "institution"
    retrieved_results = []
    context_parts = [f"Query: {question}"]
    sparql = None
    entities = None

    # Step 1: Route query (with rate limit retry)
    try:
        routing = await call_with_rate_limit_retry(
            self.router,
            question=question,
            language=language,
            history=history,
            max_retries=3,
            base_delay=2.0,
        )
    except Exception as e:
        # Routing is required; without it we cannot proceed.
        actual_error = extract_actual_error(e)
        logger.error(f"Router failed after retries: {actual_error}")
        raise

    resolved_question = getattr(routing, 'resolved_question', question)

    # Small delay between LLM calls to reduce rate limit pressure
    await asyncio.sleep(0.5)

    # Step 2: Extract entities (optional, for context - with rate limit retry)
    try:
        entities = await call_with_rate_limit_retry(
            self.entity_extractor,
            question=question,
            language=language,
            max_retries=2,  # Fewer retries since this is optional
            base_delay=1.5,
        )
    except Exception as e:
        # Entity extraction is best-effort; continue without entities.
        actual_error = extract_actual_error(e)
        logger.warning(f"Entity extraction failed: {actual_error}")
        entities = None

    # Small delay before streaming answer generation
    await asyncio.sleep(0.5)

    # Step 3: Retrieval from databases
    if self.retriever:
        try:
            # Detect if this is a person query (Dutch + English keywords)
            question_lower = question.lower()
            person_indicators = ['wie ', 'who ', 'medewerker', 'staff', 'curator', 'director', 'werkt', 'works',
                                 'employee', 'team', 'directeur', 'conservator', 'archivaris', 'archivist',
                                 'bibliothecaris', 'librarian', 'contactpersoon', 'contact person']
            is_person_query = any(indicator in question_lower for indicator in person_indicators)

            if is_person_query:
                detected_query_type = "person"
                logger.info(f"Detected PERSON query for streaming: {resolved_question[:50]}...")
                # Search for persons
                if hasattr(self.retriever, 'search_persons'):
                    person_results = self.retriever.search_persons(query=resolved_question, k=10, using=embedding_model)
                    if person_results:
                        context_parts.append("\n[RETRIEVED STAFF/PEOPLE - Real data from heritage database]:")
                        for p in person_results:
                            name = getattr(p, 'name', 'Unknown')
                            headline = getattr(p, 'headline', '')
                            custodian = getattr(p, 'custodian_name', '')
                            entry = f"- {name}"
                            if headline:
                                entry += f" ({headline})"
                            if custodian:
                                entry += f" at {custodian}"
                            context_parts.append(entry)
                            retrieved_results.append({
                                "type": "person",
                                "name": name,
                                "headline": headline,
                                "custodian_name": custodian,
                                "score": getattr(p, 'combined_score', 0),
                            })
            else:
                # Institution search
                logger.info(f"Performing INSTITUTION retrieval for streaming: {resolved_question[:50]}...")
                inst_results = self.retriever.search(query=resolved_question, k=10, auto_route=False, using=embedding_model)
                if inst_results:
                    context_parts.append("\n[RETRIEVED INSTITUTIONS - Real data from heritage database]:")
                    for inst in inst_results:
                        # Results may be rich objects (to_dict) or plain attribute bags.
                        if hasattr(inst, 'to_dict'):
                            inst_dict = inst.to_dict()
                            name = inst_dict.get('name', 'Unknown')
                            inst_type = inst_dict.get('metadata', {}).get('type', '')
                            city = inst_dict.get('metadata', {}).get('city', '')
                            inst_dict['type'] = 'institution'
                            retrieved_results.append(inst_dict)
                        else:
                            name = getattr(inst, 'name', 'Unknown')
                            inst_type = getattr(inst, 'type', '')
                            city = getattr(inst, 'city', '')
                            retrieved_results.append({
                                "type": "institution",
                                "name": name,
                                "institution_type": inst_type,
                                "city": city,
                            })
                        entry = f"- {name}"
                        if inst_type:
                            entry += f" ({inst_type})"
                        if city:
                            entry += f" in {city}"
                        context_parts.append(entry)
        except Exception as e:
            # Retrieval failure degrades the context but does not abort the answer.
            logger.warning(f"Retrieval failed in streaming mode: {e}")
            context_parts.append(f"\n[Retrieval error: {str(e)}]")

    context = "\n".join(context_parts)

    # Yield retrieval complete event
    yield {
        "type": "retrieval_complete",
        "context": context,
        "routing": {
            "intent": routing.intent,
            "sources": routing.sources,
            "resolved_question": resolved_question,
        },
        "retrieved_results": retrieved_results,
        "query_type": detected_query_type,
    }

    # =================================================================
    # ANSWER GENERATION PHASE - Stream tokens using dspy.streamify
    # =================================================================
    answer_text = ""
    confidence = 0.8
    citations = []
    follow_up = []
    streaming_succeeded = False
    retry_count = 0
    max_stream_retries = 2

    while not streaming_succeeded and retry_count <= max_stream_retries:
        try:
            # Create streamified version of the answer generator
            streamified_answer_gen = dspy.streamify(self.answer_gen)

            # Use quality_lm context if available
            lm_context = dspy.settings.context(lm=self.quality_lm) if self.quality_lm else dspy.settings.context()

            with lm_context:
                async for value in streamified_answer_gen(
                    question=resolved_question,
                    context=context,
                    history=history,
                    sources=routing.sources,
                    language=language,
                ):
                    if isinstance(value, dspy.Prediction):
                        # Final prediction - extract all fields
                        answer_text = value.answer
                        confidence = getattr(value, 'confidence', 0.8)
                        citations = getattr(value, 'citations', [])
                        follow_up = getattr(value, 'follow_up', [])
                        streaming_succeeded = True
                    elif isinstance(value, str):
                        # Streaming token
                        yield {"type": "token", "content": value}
                    else:
                        # Handle ModelResponseStream from litellm/DSPy
                        # Token text is in choices[0].delta.content or .reasoning_content
                        token_text = None

                        # Try to extract content from streaming response
                        if hasattr(value, 'choices') and value.choices:
                            delta = getattr(value.choices[0], 'delta', None)
                            if delta:
                                # Check both content and reasoning_content (for GLM models)
                                token_text = getattr(delta, 'content', None) or getattr(delta, 'reasoning_content', None)

                        # Fallback: check for message attribute (StatusMessage)
                        if token_text is None and hasattr(value, 'message'):
                            yield {"type": "status", "message": value.message}
                            continue

                        # Yield extracted token if we got text
                        if token_text:
                            yield {"type": "token", "content": token_text}

            # If we get here, streaming completed
            streaming_succeeded = True
        except Exception as e:
            actual_error = extract_actual_error(e)
            retry_count += 1

            # Check if rate limited and can retry
            if is_rate_limit_error(e) and retry_count <= max_stream_retries:
                delay = 2.0 * (2 ** (retry_count - 1)) + random.uniform(0, 1)
                logger.warning(
                    f"Streaming rate limited (attempt {retry_count}/{max_stream_retries + 1}), "
                    f"waiting {delay:.1f}s before retry. Error: {actual_error}"
                )
                await asyncio.sleep(delay)
                continue

            # Not rate limited or max retries exceeded - fall back to sync
            logger.warning(
                f"Streaming answer generation failed after {retry_count} attempts, "
                f"falling back to sync. Error: {actual_error}"
            )
            break

    # Fallback to synchronous generation if streaming failed
    if not streaming_succeeded:
        try:
            # Use rate limit retry for sync fallback too
            lm_context = dspy.settings.context(lm=self.quality_lm) if self.quality_lm else dspy.settings.context()
            with lm_context:
                answer_result = await call_with_rate_limit_retry(
                    self.answer_gen,
                    question=resolved_question,
                    context=context,
                    history=history,
                    sources=routing.sources,
                    language=language,
                    max_retries=3,
                    base_delay=2.0,
                )
            answer_text = answer_result.answer
            confidence = answer_result.confidence
            citations = answer_result.citations
            follow_up = answer_result.follow_up

            # Yield the full answer as one token (fallback behavior)
            logger.info("Sync fallback succeeded - yielding full answer as single token")
            yield {"type": "token", "content": answer_text}
        except Exception as fallback_e:
            # Last resort: emit a generic Dutch error message as the answer.
            actual_error = extract_actual_error(fallback_e)
            logger.exception(f"Fallback answer generation also failed: {actual_error}")
            answer_text = "Er is een fout opgetreden bij het genereren van het antwoord."
            yield {"type": "token", "content": answer_text}

    # Step 4: Visualization selection (if needed)
    viz_config = None
    if include_viz:
        try:
            viz_result = self.viz_selector(
                question=question,
                intent=routing.intent,
                schema_fields=["name", "type", "city", "country", "lat", "lon"],
                result_count=len(retrieved_results),
            )
            viz_config = {
                "type": viz_result.viz_type,
                "config": viz_result.config,
                "reasoning": viz_result.reasoning,
            }
        except Exception as e:
            # Visualization is optional; omit it on failure.
            logger.warning(f"Visualization selection failed: {e}")

    # Build final prediction
    prediction = Prediction(
        answer=answer_text,
        intent=routing.intent,
        entities=entities,
        sparql=sparql,
        sources_used=routing.sources,
        confidence=confidence,
        citations=citations,
        follow_up=follow_up,
        visualization=viz_config,
        cache_hit=False,
        resolved_question=resolved_question,
        retrieved_results=retrieved_results,
        query_type=detected_query_type,
        embedding_model_used=embedding_model,
    )

    # Cache the response (fire and forget) - only confident answers are cached
    if SEMANTIC_CACHE_AVAILABLE and not skip_cache and confidence >= 0.7:
        try:
            if get_cache is not None:
                cache = get_cache()
                response_dict = {
                    "answer": answer_text,
                    "intent": routing.intent,
                    "entities": entities.institutions if hasattr(entities, 'institutions') else [],
                    "sparql": sparql,
                    "sources_used": routing.sources,
                    "confidence": confidence,
                    "citations": citations,
                    "follow_up": follow_up,
                    "visualization": viz_config,
                }
                cache.set_sync(question, response_dict, intent=routing.intent, language=language)
        except Exception as e:
            logger.warning(f"Failed to cache streaming response: {e}")

    # Yield final prediction
    yield {"type": "answer_complete", "prediction": prediction}
# ============================================================================= # =============================================================================
# 7. FACTORY FUNCTIONS # 7. FACTORY FUNCTIONS

View file

@ -369,7 +369,7 @@
} }
], ],
"signature": { "signature": {
"instructions": "Generate SPARQL queries for heritage custodian knowledge graph.\n\nYou are an expert in SPARQL and the Heritage Custodian Ontology.\nGenerate valid SPARQL queries that work with our Oxigraph endpoint.\n\nKey prefixes (MUST USE THESE EXACT URIs):\n- PREFIX hc: <https://nde.nl/ontology/hc/class/>\n- PREFIX hcp: <https://nde.nl/ontology/hc/>\n- PREFIX ghcid: <https://w3id.org/heritage/custodian/>\n- PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n- PREFIX schema: <http://schema.org/>\n- PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n- PREFIX dct: <http://purl.org/dc/terms/>\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n\nKey classes:\n- hc:Custodian - Heritage custodian institution\n- schema:Place - Geographic location\n- foaf:OnlineAccount - Social media profile\n\nKey properties:\n- skos:prefLabel - Institution name\n- hcp:custodian_type - Type (MUSEUM, LIBRARY, ARCHIVE, etc.)\n- schema:addressCountry - Country code\n- foaf:homepage - Website\n- crm:P53_has_former_or_current_location - Location link", "instructions": "Generate SPARQL queries for heritage custodian knowledge graph.\n\nYou are an expert in SPARQL and the Heritage Custodian Ontology.\nGenerate valid SPARQL queries that work with our Oxigraph endpoint.\n\nKey prefixes (MUST USE THESE EXACT URIs):\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX org: <http://www.w3.org/ns/org#>\n- PREFIX ghcid: <https://w3id.org/heritage/custodian/>\n- PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n- PREFIX schema: <http://schema.org/>\n- PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n- PREFIX dct: <http://purl.org/dc/terms/>\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n\nKey classes:\n- crm:E39_Actor - Heritage custodian institution\n- schema:Place - Geographic location\n- foaf:OnlineAccount - Social media profile\n\nKey properties:\n- skos:prefLabel - Institution 
name\n- org:classification - Type (MUSEUM, LIBRARY, ARCHIVE, etc.)\n- schema:addressCountry - Country code\n- foaf:homepage - Website\n- crm:P53_has_former_or_current_location - Location link",
"fields": [ "fields": [
{ {
"prefix": "Question:", "prefix": "Question:",
@ -547,7 +547,7 @@
} }
], ],
"signature": { "signature": {
"instructions": "Generate SPARQL queries for heritage custodian knowledge graph.\n\nYou are an expert in SPARQL and the Heritage Custodian Ontology.\nGenerate valid SPARQL queries that work with our Oxigraph endpoint.\n\nKey prefixes (MUST USE THESE EXACT URIs):\n- PREFIX hc: <https://nde.nl/ontology/hc/class/>\n- PREFIX hcp: <https://nde.nl/ontology/hc/>\n- PREFIX ghcid: <https://w3id.org/heritage/custodian/>\n- PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n- PREFIX schema: <http://schema.org/>\n- PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n- PREFIX dct: <http://purl.org/dc/terms/>\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n\nKey classes:\n- hc:Custodian - Heritage custodian institution\n- schema:Place - Geographic location\n- foaf:OnlineAccount - Social media profile\n\nKey properties:\n- skos:prefLabel - Institution name\n- hcp:custodian_type - Type (MUSEUM, LIBRARY, ARCHIVE, etc.)\n- schema:addressCountry - Country code\n- foaf:homepage - Website\n- crm:P53_has_former_or_current_location - Location link", "instructions": "Generate SPARQL queries for heritage custodian knowledge graph.\n\nYou are an expert in SPARQL and the Heritage Custodian Ontology.\nGenerate valid SPARQL queries that work with our Oxigraph endpoint.\n\nKey prefixes (MUST USE THESE EXACT URIs):\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX org: <http://www.w3.org/ns/org#>\n- PREFIX ghcid: <https://w3id.org/heritage/custodian/>\n- PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n- PREFIX schema: <http://schema.org/>\n- PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n- PREFIX dct: <http://purl.org/dc/terms/>\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n\nKey classes:\n- crm:E39_Actor - Heritage custodian institution\n- schema:Place - Geographic location\n- foaf:OnlineAccount - Social media profile\n\nKey properties:\n- skos:prefLabel - Institution 
name\n- org:classification - Type (MUSEUM, LIBRARY, ARCHIVE, etc.)\n- schema:addressCountry - Country code\n- foaf:homepage - Website\n- crm:P53_has_former_or_current_location - Location link",
"fields": [ "fields": [
{ {
"prefix": "Question:", "prefix": "Question:",
@ -1072,4 +1072,4 @@
"cloudpickle": "3.1" "cloudpickle": "3.1"
} }
} }
} }

View file

@ -369,7 +369,7 @@
} }
], ],
"signature": { "signature": {
"instructions": "Generate SPARQL queries for heritage custodian knowledge graph.\n\nYou are an expert in SPARQL and the Heritage Custodian Ontology.\nGenerate valid SPARQL queries that work with our Oxigraph endpoint.\n\nKey prefixes (MUST USE THESE EXACT URIs):\n- PREFIX hc: <https://nde.nl/ontology/hc/class/>\n- PREFIX hcp: <https://nde.nl/ontology/hc/>\n- PREFIX ghcid: <https://w3id.org/heritage/custodian/>\n- PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n- PREFIX schema: <http://schema.org/>\n- PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n- PREFIX dct: <http://purl.org/dc/terms/>\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n\nKey classes:\n- hc:Custodian - Heritage custodian institution\n- schema:Place - Geographic location\n- foaf:OnlineAccount - Social media profile\n\nKey properties:\n- skos:prefLabel - Institution name\n- hcp:custodian_type - Type (MUSEUM, LIBRARY, ARCHIVE, etc.)\n- schema:addressCountry - Country code\n- foaf:homepage - Website\n- crm:P53_has_former_or_current_location - Location link", "instructions": "Generate SPARQL queries for heritage custodian knowledge graph.\n\nYou are an expert in SPARQL and the Heritage Custodian Ontology.\nGenerate valid SPARQL queries that work with our Oxigraph endpoint.\n\nKey prefixes (MUST USE THESE EXACT URIs):\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX org: <http://www.w3.org/ns/org#>\n- PREFIX ghcid: <https://w3id.org/heritage/custodian/>\n- PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n- PREFIX schema: <http://schema.org/>\n- PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n- PREFIX dct: <http://purl.org/dc/terms/>\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n\nKey classes:\n- crm:E39_Actor - Heritage custodian institution\n- schema:Place - Geographic location\n- foaf:OnlineAccount - Social media profile\n\nKey properties:\n- skos:prefLabel - Institution 
name\n- org:classification - Type (MUSEUM, LIBRARY, ARCHIVE, etc.)\n- schema:addressCountry - Country code\n- foaf:homepage - Website\n- crm:P53_has_former_or_current_location - Location link",
"fields": [ "fields": [
{ {
"prefix": "Question:", "prefix": "Question:",
@ -547,7 +547,7 @@
} }
], ],
"signature": { "signature": {
"instructions": "Generate SPARQL queries for heritage custodian knowledge graph.\n\nYou are an expert in SPARQL and the Heritage Custodian Ontology.\nGenerate valid SPARQL queries that work with our Oxigraph endpoint.\n\nKey prefixes (MUST USE THESE EXACT URIs):\n- PREFIX hc: <https://nde.nl/ontology/hc/class/>\n- PREFIX hcp: <https://nde.nl/ontology/hc/>\n- PREFIX ghcid: <https://w3id.org/heritage/custodian/>\n- PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n- PREFIX schema: <http://schema.org/>\n- PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n- PREFIX dct: <http://purl.org/dc/terms/>\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n\nKey classes:\n- hc:Custodian - Heritage custodian institution\n- schema:Place - Geographic location\n- foaf:OnlineAccount - Social media profile\n\nKey properties:\n- skos:prefLabel - Institution name\n- hcp:custodian_type - Type (MUSEUM, LIBRARY, ARCHIVE, etc.)\n- schema:addressCountry - Country code\n- foaf:homepage - Website\n- crm:P53_has_former_or_current_location - Location link", "instructions": "Generate SPARQL queries for heritage custodian knowledge graph.\n\nYou are an expert in SPARQL and the Heritage Custodian Ontology.\nGenerate valid SPARQL queries that work with our Oxigraph endpoint.\n\nKey prefixes (MUST USE THESE EXACT URIs):\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX org: <http://www.w3.org/ns/org#>\n- PREFIX ghcid: <https://w3id.org/heritage/custodian/>\n- PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n- PREFIX schema: <http://schema.org/>\n- PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n- PREFIX dct: <http://purl.org/dc/terms/>\n- PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n- PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n\nKey classes:\n- crm:E39_Actor - Heritage custodian institution\n- schema:Place - Geographic location\n- foaf:OnlineAccount - Social media profile\n\nKey properties:\n- skos:prefLabel - Institution 
name\n- org:classification - Type (MUSEUM, LIBRARY, ARCHIVE, etc.)\n- schema:addressCountry - Country code\n- foaf:homepage - Website\n- crm:P53_has_former_or_current_location - Location link",
"fields": [ "fields": [
{ {
"prefix": "Question:", "prefix": "Question:",
@ -1072,4 +1072,4 @@
"cloudpickle": "3.1" "cloudpickle": "3.1"
} }
} }
} }

View file

@ -49,8 +49,8 @@ def test_sparql_endpoint():
# Count custodians # Count custodians
query = """ query = """
PREFIX hc: <https://nde.nl/ontology/hc/class/> PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
SELECT (COUNT(*) as ?count) WHERE { ?s a hc:Custodian } SELECT (COUNT(*) as ?count) WHERE { ?s a crm:E39_Actor }
""" """
response = httpx.post( response = httpx.post(
@ -200,35 +200,35 @@ def run_sample_queries():
queries = [ queries = [
("Museums by country", """ ("Museums by country", """
PREFIX hc: <https://nde.nl/ontology/hc/class/> PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX hcp: <https://nde.nl/ontology/hc/> PREFIX org: <http://www.w3.org/ns/org#>
PREFIX schema: <http://schema.org/> PREFIX schema: <http://schema.org/>
SELECT ?country (COUNT(?s) as ?count) WHERE { SELECT ?country (COUNT(?s) as ?count) WHERE {
?s a hc:Custodian ; ?s a crm:E39_Actor ;
hcp:custodian_type "MUSEUM" ; org:classification "MUSEUM" ;
schema:addressCountry ?country . schema:addressCountry ?country .
} GROUP BY ?country ORDER BY DESC(?count) LIMIT 10 } GROUP BY ?country ORDER BY DESC(?count) LIMIT 10
"""), """),
("Dutch archives with websites", """ ("Dutch archives with websites", """
PREFIX hc: <https://nde.nl/ontology/hc/class/> PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX hcp: <https://nde.nl/ontology/hc/> PREFIX org: <http://www.w3.org/ns/org#>
PREFIX schema: <http://schema.org/> PREFIX schema: <http://schema.org/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/> PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT ?name ?homepage WHERE { SELECT ?name ?homepage WHERE {
?s a hc:Custodian ; ?s a crm:E39_Actor ;
hcp:custodian_type "ARCHIVE" ; org:classification "ARCHIVE" ;
schema:addressCountry "NL" ; schema:addressCountry "NL" ;
skos:prefLabel ?name ; skos:prefLabel ?name ;
foaf:homepage ?homepage . foaf:homepage ?homepage .
} LIMIT 10 } LIMIT 10
"""), """),
("Heritage institutions with social media", """ ("Heritage institutions with social media", """
PREFIX hc: <https://nde.nl/ontology/hc/class/> PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/> PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT ?name (COUNT(?account) as ?social_count) WHERE { SELECT ?name (COUNT(?account) as ?social_count) WHERE {
?s a hc:Custodian ; ?s a crm:E39_Actor ;
skos:prefLabel ?name ; skos:prefLabel ?name ;
foaf:account ?account . foaf:account ?account .
} GROUP BY ?s ?name ORDER BY DESC(?social_count) LIMIT 10 } GROUP BY ?s ?name ORDER BY DESC(?social_count) LIMIT 10