feat(rag): add Rule 46 epistemic provenance tracking

Track full lineage of RAG responses: WHERE data comes from, WHEN it was
retrieved, HOW it was processed (SPARQL/vector/LLM).

Backend changes:
- Add provenance.py with EpistemicProvenance, DataTier, SourceAttribution
- Integrate provenance into MultiSourceRetriever.merge_results()
- Return epistemic_provenance in DSPyQueryResponse

Frontend changes:
- Pass EpistemicProvenance through useMultiDatabaseRAG hook
- Display provenance in ConversationPage (for cache transparency)

Schema fixes:
- Fix truncated example in has_observation.yaml slot definition

References:
- Pavlyshyn's Context Graphs and Data Traces paper
- LinkML ProvenanceBlock schema pattern
This commit is contained in:
kempersc 2026-01-10 18:42:43 +01:00
parent 54dd4a9803
commit 6c19ef8661
7 changed files with 602 additions and 28 deletions

View file

@ -65,6 +65,18 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
# Rule 46: Epistemic Provenance Tracking
from .provenance import (
EpistemicProvenance,
EpistemicDataSource,
DataTier,
RetrievalSource,
SourceAttribution,
infer_data_tier,
build_derivation_chain,
aggregate_data_tier,
)
# Configure logging
logging.basicConfig(
level=logging.INFO,
@ -686,6 +698,9 @@ class DSPyQueryResponse(BaseModel):
# Factual query mode - skip LLM generation for count/list queries
factual_result: bool = False # True if this is a direct SPARQL result (no LLM prose generation)
sparql_query: str | None = None # The SPARQL query that was executed (for transparency)
# Rule 46: Epistemic Provenance Tracking
epistemic_provenance: dict[str, Any] | None = None # Full provenance chain for transparency
def extract_llm_response_metadata(
@ -1525,15 +1540,37 @@ class MultiSourceRetriever:
self,
results: list[RetrievalResult],
max_results: int = 20,
) -> list[dict[str, Any]]:
template_used: bool = False,
template_id: str | None = None,
) -> tuple[list[dict[str, Any]], EpistemicProvenance]:
"""Merge and deduplicate results from multiple sources.
Uses reciprocal rank fusion for score combination.
Returns merged items AND epistemic provenance tracking.
Rule 46: Epistemic Provenance Tracking
"""
from datetime import datetime, timezone
# Track items by GHCID for deduplication
merged: dict[str, dict[str, Any]] = {}
# Initialize provenance tracking
tier_counts: dict[DataTier, int] = {}
sources_queried = [r.source.value for r in results]
total_retrieved = sum(len(r.items) for r in results)
for result in results:
# Map DataSource to RetrievalSource
source_map = {
DataSource.QDRANT: RetrievalSource.QDRANT,
DataSource.SPARQL: RetrievalSource.SPARQL,
DataSource.TYPEDB: RetrievalSource.TYPEDB,
DataSource.POSTGIS: RetrievalSource.POSTGIS,
DataSource.CACHE: RetrievalSource.CACHE,
}
retrieval_source = source_map.get(result.source, RetrievalSource.LLM_SYNTHESIS)
for rank, item in enumerate(result.items):
ghcid = item.get("ghcid", item.get("id", f"unknown_{rank}"))
@ -1541,6 +1578,17 @@ class MultiSourceRetriever:
merged[ghcid] = item.copy()
merged[ghcid]["_sources"] = []
merged[ghcid]["_rrf_score"] = 0.0
merged[ghcid]["_data_tier"] = None
# Infer data tier for this item
item_tier = infer_data_tier(item, retrieval_source)
tier_counts[item_tier] = tier_counts.get(item_tier, 0) + 1
# Track best (lowest) tier for each item
if merged[ghcid]["_data_tier"] is None:
merged[ghcid]["_data_tier"] = item_tier.value
else:
merged[ghcid]["_data_tier"] = min(merged[ghcid]["_data_tier"], item_tier.value)
# Reciprocal Rank Fusion
rrf_score = 1.0 / (60 + rank) # k=60 is standard
@ -1564,7 +1612,31 @@ class MultiSourceRetriever:
reverse=True,
)
return sorted_items[:max_results]
final_items = sorted_items[:max_results]
# Build epistemic provenance
provenance = EpistemicProvenance(
dataSource=EpistemicDataSource.RAG_PIPELINE,
dataTier=aggregate_data_tier(tier_counts),
sourceTimestamp=datetime.now(timezone.utc).isoformat(),
derivationChain=build_derivation_chain(
sources_used=sources_queried,
template_used=template_used,
template_id=template_id,
),
revalidationPolicy="weekly",
sourcesQueried=sources_queried,
totalRetrieved=total_retrieved,
totalAfterFusion=len(final_items),
dataTierBreakdown={
f"tier_{tier.value}": count
for tier, count in tier_counts.items()
},
templateUsed=template_used,
templateId=template_id,
)
return final_items, provenance
async def close(self) -> None:
"""Clean up resources."""
@ -2262,8 +2334,8 @@ async def query_rag(request: QueryRequest) -> QueryResponse:
institution_types=geo_filters["institution_types"],
)
# Merge results
merged_items = retriever.merge_results(results, max_results=request.k * 2)
# Merge results with provenance tracking
merged_items, retrieval_provenance = retriever.merge_results(results, max_results=request.k * 2)
# Generate visualization config if requested
visualization = None
@ -2906,24 +2978,64 @@ async def dspy_query(request: DSPyQueryRequest) -> DSPyQueryResponse:
# Fall back to dict if LLMResponseMetadata fails
llm_response_obj = llm_response_cached # type: ignore[assignment]
# Rule 46: Build provenance for cache hit responses
cached_sources = cached.get("sources", [])
cached_template_used = cached_context.get("template_used", False)
cached_template_id = cached_context.get("template_id")
cached_llm_provider = cached_context.get("llm_provider")
cached_llm_model = cached_context.get("llm_model")
# Infer data tier - prioritize cached provenance if present
cached_provenance = cached_context.get("epistemic_provenance")
if cached_provenance:
# Use the cached provenance, but mark it as coming from cache
cache_provenance = cached_provenance.copy()
if "CACHE" not in cache_provenance.get("derivationChain", []):
cache_provenance.setdefault("derivationChain", []).insert(0, "CACHE:hit")
else:
# Build fresh provenance for older cache entries
cache_tier = DataTier.TIER_3_CROWD_SOURCED.value
if cached_template_used:
cache_tier = DataTier.TIER_1_AUTHORITATIVE.value
elif any(s.lower() in ["sparql", "typedb"] for s in cached_sources):
cache_tier = DataTier.TIER_1_AUTHORITATIVE.value
cache_provenance = EpistemicProvenance(
dataSource=EpistemicDataSource.CACHE_AGGREGATION,
dataTier=cache_tier,
derivationChain=["CACHE:hit"] + build_derivation_chain(
sources_used=cached_sources,
template_used=cached_template_used,
template_id=cached_template_id,
llm_provider=cached_llm_provider,
),
sourcesQueried=cached_sources,
templateUsed=cached_template_used,
templateId=cached_template_id,
llmProvider=cached_llm_provider,
llmModel=cached_llm_model,
).model_dump()
response_data = {
"question": request.question,
"answer": cached.get("answer", ""),
"sources_used": cached.get("sources", []),
"sources_used": cached_sources,
"visualization": visualization,
"resolved_question": cached_context.get("resolved_question"),
"retrieved_results": cached_context.get("retrieved_results"),
"query_type": cached_context.get("query_type"),
"embedding_model_used": cached_context.get("embedding_model"),
"llm_model_used": cached_context.get("llm_model"),
"llm_model_used": cached_llm_model,
"query_time_ms": round(elapsed_ms, 2),
"cache_hit": True,
"llm_response": llm_response_obj, # GLM 4.7 reasoning_content from cache
# Session management - return session_id for follow-up queries
"session_id": session_id,
# Template tracking from cache
"template_used": cached_context.get("template_used", False),
"template_id": cached_context.get("template_id"),
"template_used": cached_template_used,
"template_id": cached_template_id,
# Rule 46: Epistemic provenance for transparency
"epistemic_provenance": cache_provenance,
}
# Record cache hit metrics
@ -3074,6 +3186,70 @@ async def dspy_query(request: DSPyQueryRequest) -> DSPyQueryResponse:
"scores": {"combined": 1.0},
})
logger.debug(f"[FACTUAL-QUERY] COUNT query result: {sparql_results[0].get('count') if sparql_results else 0}")
# Execute companion query if available to get entity results for map/list
# This fetches the actual institution records that were counted
companion_query = getattr(template_result, 'companion_query', None)
if companion_query:
try:
companion_response = await client.post(
settings.sparql_endpoint,
data={"query": companion_query},
headers={"Accept": "application/sparql-results+json"},
timeout=30.0,
)
if companion_response.status_code == 200:
companion_data = companion_response.json()
companion_bindings = companion_data.get("results", {}).get("bindings", [])
companion_raw = [
{k: v.get("value") for k, v in binding.items()}
for binding in companion_bindings
]
# Transform companion results to frontend format
companion_results = []
for row in companion_raw:
lat = None
lon = None
if row.get("lat"):
try:
lat = float(row["lat"])
except (ValueError, TypeError):
pass
if row.get("lon"):
try:
lon = float(row["lon"])
except (ValueError, TypeError):
pass
companion_results.append({
"name": row.get("name"),
"institution_uri": row.get("institution"),
"metadata": {
"latitude": lat,
"longitude": lon,
"city": row.get("city") or template_result.slots.get("city"),
"institution_type": template_result.slots.get("institution_type"),
},
"scores": {"combined": 1.0},
})
# Store companion results - these will be used for map/list display
# while sparql_results contains the count for the answer text
if companion_results:
logger.info(f"[COMPANION-QUERY] Fetched {len(companion_results)} entities for display, {sum(1 for r in companion_results if r['metadata'].get('latitude'))} with coordinates")
# Replace sparql_results with companion results for display
# but preserve the count value for answer rendering
count_value = sparql_results[0].get("count", 0) if sparql_results else 0
sparql_results = companion_results
# Add count to first result so it's available for ui_template
if sparql_results:
sparql_results[0]["count"] = count_value
else:
logger.warning(f"[COMPANION-QUERY] Failed with status {companion_response.status_code}")
except Exception as ce:
logger.warning(f"[COMPANION-QUERY] Execution failed: {ce}")
else:
# Transform SPARQL results to match frontend expected format
# Frontend expects: {name, website, metadata: {latitude, longitude, city, ...}}
@ -3579,12 +3755,45 @@ async def dspy_query(request: DSPyQueryRequest) -> DSPyQueryResponse:
template_used = getattr(result, "template_used", False)
template_id = getattr(result, "template_id", None)
# Rule 46: Build epistemic provenance for transparency
# This tracks WHERE, WHEN, and HOW the response data originated
sources_used_list = getattr(result, "sources_used", [])
# Infer data tier from sources - SPARQL/TypeDB are authoritative, Qdrant may include scraped data
inferred_tier = DataTier.TIER_3_CROWD_SOURCED.value # Default
if template_used:
# Template-based SPARQL uses curated Oxigraph data
inferred_tier = DataTier.TIER_1_AUTHORITATIVE.value
elif any(s.lower() in ["sparql", "typedb"] for s in sources_used_list):
inferred_tier = DataTier.TIER_1_AUTHORITATIVE.value
elif any(s.lower() == "qdrant" for s in sources_used_list):
inferred_tier = DataTier.TIER_3_CROWD_SOURCED.value
# Build provenance object
response_provenance = EpistemicProvenance(
dataSource=EpistemicDataSource.RAG_PIPELINE,
dataTier=inferred_tier,
derivationChain=build_derivation_chain(
sources_used=sources_used_list,
template_used=template_used,
template_id=template_id,
llm_provider=llm_provider_used,
),
sourcesQueried=sources_used_list,
totalRetrieved=len(retrieved_results) if retrieved_results else 0,
totalAfterFusion=len(retrieved_results) if retrieved_results else 0,
templateUsed=template_used,
templateId=template_id,
llmProvider=llm_provider_used,
llmModel=llm_model_used,
)
# Build response object
response = DSPyQueryResponse(
question=request.question,
resolved_question=getattr(result, "resolved_question", None),
answer=getattr(result, "answer", "Geen antwoord gevonden."),
sources_used=getattr(result, "sources_used", []),
sources_used=sources_used_list,
visualization=visualization,
retrieved_results=retrieved_results, # Raw data for frontend visualization
query_type=query_type, # "person" or "institution"
@ -3606,6 +3815,8 @@ async def dspy_query(request: DSPyQueryRequest) -> DSPyQueryResponse:
# Template SPARQL tracking
template_used=template_used,
template_id=template_id,
# Rule 46: Epistemic provenance for transparency
epistemic_provenance=response_provenance.model_dump(),
)
# Update session with this turn for multi-turn conversation support
@ -3880,23 +4091,63 @@ async def stream_dspy_query_response(
"data": cached.get("visualization_data"),
}
# Rule 46: Build provenance for streaming cache hit responses
stream_cached_sources = cached.get("sources", [])
stream_cached_template_used = cached_context.get("template_used", False)
stream_cached_template_id = cached_context.get("template_id")
stream_cached_llm_provider = cached_context.get("llm_provider")
stream_cached_llm_model = cached_context.get("llm_model")
# Infer data tier - prioritize cached provenance if present
stream_cached_prov = cached_context.get("epistemic_provenance")
if stream_cached_prov:
# Use the cached provenance, but mark it as coming from cache
stream_cache_provenance = stream_cached_prov.copy()
if "CACHE" not in stream_cache_provenance.get("derivationChain", []):
stream_cache_provenance.setdefault("derivationChain", []).insert(0, "CACHE:hit")
else:
# Build fresh provenance for older cache entries
stream_cache_tier = DataTier.TIER_3_CROWD_SOURCED.value
if stream_cached_template_used:
stream_cache_tier = DataTier.TIER_1_AUTHORITATIVE.value
elif any(s.lower() in ["sparql", "typedb"] for s in stream_cached_sources):
stream_cache_tier = DataTier.TIER_1_AUTHORITATIVE.value
stream_cache_provenance = EpistemicProvenance(
dataSource=EpistemicDataSource.CACHE_AGGREGATION,
dataTier=stream_cache_tier,
derivationChain=["CACHE:hit"] + build_derivation_chain(
sources_used=stream_cached_sources,
template_used=stream_cached_template_used,
template_id=stream_cached_template_id,
llm_provider=stream_cached_llm_provider,
),
sourcesQueried=stream_cached_sources,
templateUsed=stream_cached_template_used,
templateId=stream_cached_template_id,
llmProvider=stream_cached_llm_provider,
llmModel=stream_cached_llm_model,
).model_dump()
response_data = {
"question": request.question,
"answer": cached.get("answer", ""),
"sources_used": cached.get("sources", []),
"sources_used": stream_cached_sources,
"visualization": visualization,
"resolved_question": cached_context.get("resolved_question"),
"retrieved_results": cached_context.get("retrieved_results"),
"query_type": cached_context.get("query_type"),
"embedding_model_used": cached_context.get("embedding_model"),
"llm_model_used": cached_context.get("llm_model"),
"llm_model_used": stream_cached_llm_model,
"query_time_ms": round(elapsed_ms, 2),
"cache_hit": True,
# Session management
"session_id": session_id,
# Template tracking from cache
"template_used": cached_context.get("template_used", False),
"template_id": cached_context.get("template_id"),
"template_used": stream_cached_template_used,
"template_id": stream_cached_template_id,
# Rule 46: Epistemic provenance for transparency
"epistemic_provenance": stream_cache_provenance,
}
# Record cache hit metrics for streaming endpoint
@ -4191,11 +4442,41 @@ async def stream_dspy_query_response(
latency_ms=int(elapsed_ms),
)
# Rule 46: Build epistemic provenance for streaming endpoint
stream_sources_used = getattr(result, "sources_used", [])
stream_template_used = getattr(result, "template_used", False)
stream_template_id = getattr(result, "template_id", None)
# Infer data tier from sources
stream_tier = DataTier.TIER_3_CROWD_SOURCED.value
if stream_template_used:
stream_tier = DataTier.TIER_1_AUTHORITATIVE.value
elif any(s.lower() in ["sparql", "typedb"] for s in stream_sources_used):
stream_tier = DataTier.TIER_1_AUTHORITATIVE.value
stream_provenance = EpistemicProvenance(
dataSource=EpistemicDataSource.RAG_PIPELINE,
dataTier=stream_tier,
derivationChain=build_derivation_chain(
sources_used=stream_sources_used,
template_used=stream_template_used,
template_id=stream_template_id,
llm_provider=llm_provider_used,
),
sourcesQueried=stream_sources_used,
totalRetrieved=len(retrieved_results) if retrieved_results else 0,
totalAfterFusion=len(retrieved_results) if retrieved_results else 0,
templateUsed=stream_template_used,
templateId=stream_template_id,
llmProvider=llm_provider_used,
llmModel=llm_model_used,
)
response = DSPyQueryResponse(
question=request.question,
resolved_question=getattr(result, "resolved_question", None),
answer=getattr(result, "answer", "Geen antwoord gevonden."),
sources_used=getattr(result, "sources_used", []),
sources_used=stream_sources_used,
visualization=visualization,
retrieved_results=retrieved_results,
query_type=query_type,
@ -4212,8 +4493,10 @@ async def stream_dspy_query_response(
llm_response=llm_response_metadata,
# Session management fields for multi-turn conversations
session_id=session_id,
template_used=getattr(result, "template_used", False),
template_id=getattr(result, "template_id", None),
template_used=stream_template_used,
template_id=stream_template_id,
# Rule 46: Epistemic provenance for transparency
epistemic_provenance=stream_provenance.model_dump(),
)
# Update session with this turn (before caching)
@ -4339,8 +4622,8 @@ async def stream_query_response(
"count": len(source_results[0].items) if source_results else 0,
}) + "\n"
# Merge and finalize
merged = retriever.merge_results(results)
# Merge and finalize with provenance
merged, stream_provenance = retriever.merge_results(results)
elapsed_ms = (asyncio.get_event_loop().time() - start_time) * 1000
yield json.dumps({
@ -4348,6 +4631,7 @@ async def stream_query_response(
"results": merged,
"query_time_ms": round(elapsed_ms, 2),
"result_count": len(merged),
"epistemic_provenance": stream_provenance.model_dump() if stream_provenance else None,
}) + "\n"

280
backend/rag/provenance.py Normal file
View file

@ -0,0 +1,280 @@
"""
Epistemic Provenance Tracking for RAG Pipeline
Rule 46: Ontology-Driven Cache Segmentation
This module tracks the full lineage of how RAG responses are derived, including:
- Which data sources contributed each piece of information
- Data quality tiers (1=authoritative, 4=inferred)
- Derivation chain showing processing steps
- Confidence scores and temporal validity
The provenance data flows through the pipeline:
1. Retriever adds source attribution
2. merge_results() aggregates across sources
3. DSPy generator adds LLM inference step
4. Response includes full EpistemicProvenance
Frontend displays this in ProvenanceTooltip when hovering over cache badges.
References:
- Pavlyshyn's "Context Graphs and Data Traces: Building Epistemology Layers for Agentic Memory"
- LinkML ProvenanceBlock in schemas/20251121/linkml/modules/classes/
"""
from __future__ import annotations
from datetime import datetime, timezone
from enum import Enum
from typing import Any
from pydantic import BaseModel, Field
class DataTier(int, Enum):
    """Data quality tier, mirrored by the LinkML DataTierEnum and the
    frontend DataTier type.

    Smaller numbers mean more authoritative data, so tiers can be
    compared and min()-reduced directly as ints.
    """

    TIER_1_AUTHORITATIVE = 1  # ISIL Registry, Nationaal Archief, official government data
    TIER_2_VERIFIED = 2  # Wikidata, Google Maps, verified institutional websites
    TIER_3_CROWD_SOURCED = 3  # user reviews, community edits, unverified web scrapes
    TIER_4_INFERRED = 4  # LLM extraction, inference, aggregation
class EpistemicDataSource(str, Enum):
    """Origin category for a piece of data; mirrors the frontend
    EpistemicDataSource type, so values must stay in sync with it.
    """

    # Primary/authoritative registries and curated inputs
    ISIL_REGISTRY = "ISIL_REGISTRY"
    WIKIDATA = "WIKIDATA"
    CUSTODIAN_YAML = "CUSTODIAN_YAML"
    GOOGLE_MAPS = "GOOGLE_MAPS"
    # Lower-trust or derived inputs
    WEB_SCRAPE = "WEB_SCRAPE"
    LLM_INFERENCE = "LLM_INFERENCE"
    SPARQL_QUERY = "SPARQL_QUERY"
    RAG_PIPELINE = "RAG_PIPELINE"
    USER_PROVIDED = "USER_PROVIDED"
    CACHE_AGGREGATION = "CACHE_AGGREGATION"
class RetrievalSource(str, Enum):
    """Retrieval backend that produced a result.

    Values map one-to-one onto the pipeline's DataSource enum values so
    the two can be converted via a simple lookup table.
    """

    QDRANT = "qdrant"
    SPARQL = "sparql"
    TYPEDB = "typedb"
    POSTGIS = "postgis"
    CACHE = "cache"
    LLM_SYNTHESIS = "llm_synthesis"
class SourceAttribution(BaseModel):
    """Per-item attribution to one retrieval backend.

    Records how a single retrieved item contributed to the final merged
    result set (rank, RRF contribution, timing) plus optional details
    specific to the backend that returned it.
    """

    # Core attribution
    source: RetrievalSource = Field(
        description="Which retriever returned this item",
    )
    data_tier: DataTier = Field(
        description="Quality tier of this source",
    )
    retrieval_rank: int = Field(
        description="Position in source's result list",
    )
    rrf_contribution: float = Field(
        default=0.0,
        description="RRF score contribution from this source",
    )
    # Timestamp is taken at model construction time, in UTC.
    retrieved_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
    )
    query_time_ms: float = Field(
        default=0.0,
        description="Time taken by this retriever",
    )

    # Optional source-specific details
    sparql_query: str | None = Field(
        default=None,
        description="SPARQL query if source is SPARQL",
    )
    vector_similarity: float | None = Field(
        default=None,
        description="Cosine similarity if source is Qdrant",
    )
    collection_name: str | None = Field(
        default=None,
        description="Collection name for vector search",
    )
    template_id: str | None = Field(
        default=None,
        description="Template ID if template-based SPARQL",
    )
class EpistemicProvenance(BaseModel):
    """Complete epistemic provenance for one RAG response.

    JSON-serializable and shaped to match the frontend's
    EpistemicProvenance TypeScript interface
    (frontend/src/lib/storage/semantic-cache.ts:103-121): field names are
    deliberately camelCase so ``model_dump()`` output needs no renaming.
    """

    # --- Core provenance (required for frontend compatibility) ---
    dataSource: EpistemicDataSource = Field(default=EpistemicDataSource.RAG_PIPELINE, description="Primary data source for this response")
    dataTier: int = Field(default=3, ge=1, le=4, description="Data quality tier (1=authoritative, 4=inferred)")
    # Defaults to "now" in UTC ISO-8601 when not supplied by the caller.
    sourceTimestamp: str = Field(default_factory=lambda: datetime.now(timezone.utc).isoformat(), description="When the source data was retrieved")
    derivationChain: list[str] = Field(default_factory=list, description="Chain of processing steps, e.g. ['SPARQL:Oxigraph', 'RAG:retrieve', 'LLM:groq']")
    revalidationPolicy: str = Field(default="weekly", description="How often to revalidate: static, daily, weekly, on_access")
    confidenceScore: float | None = Field(default=None, description="Confidence score 0-1, if applicable")

    # --- Extended provenance (for detailed analysis) ---
    sourcesQueried: list[str] = Field(default_factory=list, description="List of sources that were queried")
    totalRetrieved: int = Field(default=0, description="Total items retrieved before fusion")
    totalAfterFusion: int = Field(default=0, description="Items after RRF fusion and deduplication")
    dataTierBreakdown: dict[str, int] = Field(default_factory=dict, description="Count of results per data tier")
    templateUsed: bool = Field(default=False, description="Whether template-based SPARQL was used")
    templateId: str | None = Field(default=None, description="Which template was used, if any")
    llmProvider: str | None = Field(default=None, description="LLM provider used for generation")
    llmModel: str | None = Field(default=None, description="Specific LLM model used")

    class Config:
        """Pydantic configuration."""

        # Serialize enum fields (e.g. dataSource) to their plain values
        # so model_dump() output is directly JSON/frontend compatible.
        use_enum_values = True
def infer_data_tier(item: dict[str, Any], source: RetrievalSource) -> DataTier:
    """Infer the data quality tier for a retrieved item.

    Resolution order:
    1. An explicit ``data_tier`` field on the item (int 1-4 or tier name).
    2. The item's ``provenance`` block (known ``data_source`` values).
    3. A per-source default based on which retriever returned the item.

    Args:
        item: Retrieved item with potential provenance metadata
        source: Which retriever returned this item

    Returns:
        Appropriate DataTier based on item provenance
    """
    # 1. Explicit data_tier on the item wins.
    if "data_tier" in item:
        tier = item["data_tier"]
        # Exclude bool explicitly: True/False are ints in Python and
        # True would otherwise silently map to TIER_1_AUTHORITATIVE.
        if isinstance(tier, int) and not isinstance(tier, bool) and 1 <= tier <= 4:
            return DataTier(tier)
        if isinstance(tier, str):
            tier_map = {
                "TIER_1_AUTHORITATIVE": DataTier.TIER_1_AUTHORITATIVE,
                "TIER_2_VERIFIED": DataTier.TIER_2_VERIFIED,
                "TIER_3_CROWD_SOURCED": DataTier.TIER_3_CROWD_SOURCED,
                "TIER_4_INFERRED": DataTier.TIER_4_INFERRED,
            }
            if tier in tier_map:
                return tier_map[tier]

    # 2. Provenance block, if present and well-formed.
    # ``or {}`` guards against an explicit ``"provenance": None`` entry,
    # which dict.get's default argument would not catch; the isinstance
    # check guards against non-dict provenance payloads.
    provenance = item.get("provenance") or {}
    if isinstance(provenance, dict):
        if provenance.get("data_source") == "CSV_REGISTRY":
            return DataTier.TIER_1_AUTHORITATIVE
        if provenance.get("data_source") == "WIKIDATA":
            return DataTier.TIER_2_VERIFIED

    # 3. Fall back to a default tier per retrieval backend.
    source_tiers = {
        RetrievalSource.SPARQL: DataTier.TIER_1_AUTHORITATIVE,  # Oxigraph has curated data
        RetrievalSource.TYPEDB: DataTier.TIER_1_AUTHORITATIVE,  # TypeDB has curated data
        RetrievalSource.QDRANT: DataTier.TIER_3_CROWD_SOURCED,  # Vector search may include scraped data
        RetrievalSource.POSTGIS: DataTier.TIER_2_VERIFIED,  # GeoNames data
        RetrievalSource.CACHE: DataTier.TIER_3_CROWD_SOURCED,  # Cached responses
        RetrievalSource.LLM_SYNTHESIS: DataTier.TIER_4_INFERRED,  # LLM-generated
    }
    return source_tiers.get(source, DataTier.TIER_4_INFERRED)
def build_derivation_chain(
    sources_used: list[str],
    template_used: bool = False,
    template_id: str | None = None,
    llm_provider: str | None = None,
) -> list[str]:
    """Assemble the ordered list of processing steps for a response.

    Args:
        sources_used: List of data sources queried
        template_used: Whether template-based SPARQL was used
        template_id: Template ID if used
        llm_provider: LLM provider for generation

    Returns:
        List of derivation steps, e.g. ['SPARQL:Oxigraph', 'RAG:retrieve', 'LLM:groq']
    """
    labels = {
        "qdrant": "Vector:Qdrant",
        "sparql": "SPARQL:Oxigraph",
        "typedb": "Graph:TypeDB",
        "postgis": "Geo:PostGIS",
    }
    # Retrieval steps in query order; sources without a label are skipped.
    steps = [labels[s.lower()] for s in sources_used if s.lower() in labels]
    # Either the template-based SPARQL step or the generic RAG retrieval step.
    if template_used and template_id:
        steps.append(f"Template:{template_id}")
    else:
        steps.append("RAG:retrieve")
    # Final LLM generation step, when a provider took part.
    if llm_provider:
        steps.append(f"LLM:{llm_provider}")
    return steps
def aggregate_data_tier(tier_counts: dict[DataTier, int]) -> int:
    """Collapse per-tier result counts into one representative tier.

    Pessimistic aggregation: walk tiers from worst (4) to best (1) and
    return the first one contributing more than 20% of all results.

    Args:
        tier_counts: Count of results per tier

    Returns:
        Aggregated tier number (1-4)
    """
    total = sum(tier_counts.values())
    if total == 0:
        # No data at all (empty mapping or all-zero counts): treat as inferred.
        return 4

    worst_to_best = (
        DataTier.TIER_4_INFERRED,
        DataTier.TIER_3_CROWD_SOURCED,
        DataTier.TIER_2_VERIFIED,
        DataTier.TIER_1_AUTHORITATIVE,
    )
    for tier in worst_to_best:
        count = tier_counts.get(tier, 0)
        if count > 0 and (count / total) > 0.2:
            return tier.value

    # Defensive fallback: with four tiers at least one share is >= 25%,
    # so this line is unreachable in practice. Return the mode anyway.
    return max(tier_counts, key=tier_counts.get).value  # type: ignore[arg-type]

View file

@ -1,5 +1,5 @@
{
"generated": "2026-01-10T15:49:44.857Z",
"generated": "2026-01-10T17:17:56.765Z",
"version": "1.0.0",
"categories": [
{

View file

@ -33,6 +33,6 @@ slots:
Custodian:
hc_id: "https://nde.nl/ontology/hc/nl-nh-ams-m-rm"
has_observation:
- "https://nde.nl/ontology/hc/observation/isil-registry-2024"\
- "https://nde.nl/ontology/hc/observation/wikid...
description: Usage example
- "https://nde.nl/ontology/hc/observation/isil-registry-2024"
- "https://nde.nl/ontology/hc/observation/wikidata-q190804"
description: Usage example showing a Custodian hub linked to multiple observations from different sources

View file

@ -20,7 +20,7 @@
import { useState, useCallback, useRef, useEffect } from 'react';
import type { QdrantSearchResult } from './useQdrant';
import { semanticCache, type CachedResponse, type CacheStats, type CacheLookupResult } from '../lib/storage/semantic-cache';
import { semanticCache, type CachedResponse, type CacheStats, type CacheLookupResult, type EpistemicProvenance } from '../lib/storage/semantic-cache';
import type { LLMProviderType } from '../lib/storage/ui-state';
// Configuration - all services use Caddy proxy paths
@ -138,6 +138,8 @@ export interface RAGResponse {
// Secondary reply type for composite visualizations (e.g., factual_count + map_points)
secondaryReplyType?: ReplyType;
secondaryReplyContent?: ReplyContent;
// Rule 46: Epistemic provenance for transparency (WHERE, WHEN, HOW data originated)
epistemicProvenance?: EpistemicProvenance;
}
export interface RAGSource {
@ -1128,6 +1130,8 @@ async function callDSPy(
// Secondary reply type for composite visualizations (e.g., factual_count + map_points)
secondaryReplyType?: string;
secondaryReplyContent?: ReplyContent;
// Rule 46: Epistemic provenance for transparency (WHERE, WHEN, HOW data originated)
epistemicProvenance?: EpistemicProvenance;
}> {
// Format conversation history for DSPy backend
// Backend expects: context = [{question: "...", answer: "..."}, ...]
@ -1307,6 +1311,8 @@ async function callDSPy(
// Secondary reply type for composite visualizations (e.g., factual_count + map_points)
secondaryReplyType: data.secondary_reply_type as ReplyType | undefined,
secondaryReplyContent: data.secondary_reply_content as ReplyContent | undefined,
// Rule 46: Epistemic provenance from backend
epistemicProvenance: data.epistemic_provenance as EpistemicProvenance | undefined,
};
}
@ -1846,6 +1852,8 @@ export function useMultiDatabaseRAG(): UseMultiDatabaseRAGReturn {
// Reply type classification from backend classify_and_format()
replyType: dspyResponse.replyType as ReplyType | undefined,
replyContent: dspyResponse.replyContent,
// Rule 46: Epistemic provenance from backend
epistemicProvenance: dspyResponse.epistemicProvenance,
};
// Update pagination state with correct queryType from DSPy response

View file

@ -1914,7 +1914,9 @@ const ConversationPage: React.FC = () => {
const cacheSimilarity = lastCacheLookup?.similarity;
const cacheMethod = lastCacheLookup?.method;
const cacheTier = lastCacheLookup?.tier;
const cacheProvenance = lastCacheLookup?.entry?.epistemicProvenance;
// Rule 46: Prioritize backend epistemic provenance over cache-only provenance
// Backend provenance includes full derivation chain; cache provenance is only for cache hits
const epistemicProvenance = response.epistemicProvenance || lastCacheLookup?.entry?.epistemicProvenance;
const cacheLookupTimeMs = lastCacheLookup?.lookupTimeMs;
// Replace loading message with response
@ -1929,7 +1931,7 @@ const ConversationPage: React.FC = () => {
cacheSimilarity: cacheSimilarity,
cacheMethod: cacheMethod,
cacheTier: cacheTier,
epistemicProvenance: cacheProvenance,
epistemicProvenance: epistemicProvenance,
cacheLookupTimeMs: cacheLookupTimeMs,
}
: msg

View file

@ -33,6 +33,6 @@ slots:
Custodian:
hc_id: "https://nde.nl/ontology/hc/nl-nh-ams-m-rm"
has_observation:
- "https://nde.nl/ontology/hc/observation/isil-registry-2024"\
- "https://nde.nl/ontology/hc/observation/wikid...
description: Usage example
- "https://nde.nl/ontology/hc/observation/isil-registry-2024"
- "https://nde.nl/ontology/hc/observation/wikidata-q190804"
description: Usage example showing a Custodian hub linked to multiple observations from different sources