# Track full lineage of RAG responses: WHERE data comes from, WHEN it was
# retrieved, HOW it was processed (SPARQL/vector/LLM).
#
# Backend changes:
# - Add provenance.py with EpistemicProvenance, DataTier, SourceAttribution
# - Integrate provenance into MultiSourceRetriever.merge_results()
# - Return epistemic_provenance in DSPyQueryResponse
# Frontend changes:
# - Pass EpistemicProvenance through useMultiDatabaseRAG hook
# - Display provenance in ConversationPage (for cache transparency)
# Schema fixes:
# - Fix truncated example in has_observation.yaml slot definition
# References:
# - Pavlyshyn's Context Graphs and Data Traces paper
# - LinkML ProvenanceBlock schema pattern
"""
|
|
Epistemic Provenance Tracking for RAG Pipeline
|
|
|
|
Rule 46: Ontology-Driven Cache Segmentation
|
|
|
|
This module tracks the full lineage of how RAG responses are derived, including:
|
|
- Which data sources contributed each piece of information
|
|
- Data quality tiers (1=authoritative, 4=inferred)
|
|
- Derivation chain showing processing steps
|
|
- Confidence scores and temporal validity
|
|
|
|
The provenance data flows through the pipeline:
|
|
1. Retriever → adds source attribution
|
|
2. merge_results() → aggregates across sources
|
|
3. DSPy generator → adds LLM inference step
|
|
4. Response → includes full EpistemicProvenance
|
|
|
|
Frontend displays this in ProvenanceTooltip when hovering over cache badges.
|
|
|
|
References:
|
|
- Pavlyshyn's "Context Graphs and Data Traces: Building Epistemology Layers for Agentic Memory"
|
|
- LinkML ProvenanceBlock in schemas/20251121/linkml/modules/classes/
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime, timezone
|
|
from enum import Enum
|
|
from typing import Any
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
class DataTier(int, Enum):
    """Data quality tier for a retrieved item.

    Aligned with the LinkML DataTierEnum and the frontend DataTier type.
    Lower numbers mean higher authority (1 = most trusted, 4 = least).
    """

    # Official/authoritative records: ISIL Registry, Nationaal Archief,
    # government data.
    TIER_1_AUTHORITATIVE = 1
    # Verified secondary sources: Wikidata, Google Maps, verified
    # institutional websites.
    TIER_2_VERIFIED = 2
    # Crowd-sourced / unverified: user reviews, community edits,
    # unverified web scrapes.
    TIER_3_CROWD_SOURCED = 3
    # Machine-derived: LLM extraction, inference, aggregation.
    TIER_4_INFERRED = 4
class EpistemicDataSource(str, Enum):
    """Kinds of data source feeding the pipeline.

    Values mirror the frontend EpistemicDataSource type, so serialized
    responses round-trip unchanged.
    """

    # Primary/registry sources.
    ISIL_REGISTRY = "ISIL_REGISTRY"
    WIKIDATA = "WIKIDATA"
    CUSTODIAN_YAML = "CUSTODIAN_YAML"
    GOOGLE_MAPS = "GOOGLE_MAPS"
    WEB_SCRAPE = "WEB_SCRAPE"
    # Derived/processing sources.
    LLM_INFERENCE = "LLM_INFERENCE"
    SPARQL_QUERY = "SPARQL_QUERY"
    RAG_PIPELINE = "RAG_PIPELINE"
    USER_PROVIDED = "USER_PROVIDED"
    CACHE_AGGREGATION = "CACHE_AGGREGATION"
class RetrievalSource(str, Enum):
    """Source system a result was retrieved from (maps to the DataSource enum)."""

    QDRANT = "qdrant"  # vector search
    SPARQL = "sparql"  # triple store (Oxigraph)
    TYPEDB = "typedb"  # typed graph database
    POSTGIS = "postgis"  # geospatial queries
    CACHE = "cache"  # previously cached responses
    LLM_SYNTHESIS = "llm_synthesis"  # LLM-generated content
class SourceAttribution(BaseModel):
    """Attribution to a specific data source for a single result item.

    Tracks how each retrieved item contributed to the final result.
    One instance is produced per (item, retriever) pair during merging.
    """
    source: RetrievalSource = Field(description="Which retriever returned this item")
    data_tier: DataTier = Field(description="Quality tier of this source")
    retrieval_rank: int = Field(description="Position in source's result list")
    rrf_contribution: float = Field(default=0.0, description="RRF score contribution from this source")
    # Timezone-aware UTC timestamp, captured at model construction time.
    retrieved_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    query_time_ms: float = Field(default=0.0, description="Time taken by this retriever")

    # Optional source-specific details — each is populated only when the
    # corresponding retriever produced the item, otherwise left as None.
    sparql_query: str | None = Field(default=None, description="SPARQL query if source is SPARQL")
    vector_similarity: float | None = Field(default=None, description="Cosine similarity if source is Qdrant")
    collection_name: str | None = Field(default=None, description="Collection name for vector search")
    template_id: str | None = Field(default=None, description="Template ID if template-based SPARQL")
class EpistemicProvenance(BaseModel):
    """Complete epistemic provenance for a RAG response.

    This model is designed to be JSON-serializable and compatible with the
    frontend's EpistemicProvenance TypeScript interface.

    Fields align with frontend/src/lib/storage/semantic-cache.ts:103-121
    """

    # Core provenance fields (required for frontend compatibility).
    # NOTE: camelCase field names are deliberate — they must match the
    # frontend TypeScript interface verbatim when serialized to JSON.
    dataSource: EpistemicDataSource = Field(
        default=EpistemicDataSource.RAG_PIPELINE,
        description="Primary data source for this response"
    )
    dataTier: int = Field(
        default=3,
        ge=1,
        le=4,
        description="Data quality tier (1=authoritative, 4=inferred)"
    )
    sourceTimestamp: str = Field(
        # Stored as an ISO-8601 UTC string (not a datetime) so it passes
        # through JSON serialization unchanged.
        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
        description="When the source data was retrieved"
    )
    derivationChain: list[str] = Field(
        default_factory=list,
        description="Chain of processing steps, e.g. ['SPARQL:Oxigraph', 'RAG:retrieve', 'LLM:groq']"
    )
    revalidationPolicy: str = Field(
        default="weekly",
        description="How often to revalidate: static, daily, weekly, on_access"
    )
    confidenceScore: float | None = Field(
        default=None,
        description="Confidence score 0-1, if applicable"
    )

    # Extended provenance (for detailed analysis)
    sourcesQueried: list[str] = Field(
        default_factory=list,
        description="List of sources that were queried"
    )
    totalRetrieved: int = Field(
        default=0,
        description="Total items retrieved before fusion"
    )
    totalAfterFusion: int = Field(
        default=0,
        description="Items after RRF fusion and deduplication"
    )
    dataTierBreakdown: dict[str, int] = Field(
        default_factory=dict,
        description="Count of results per data tier"
    )
    templateUsed: bool = Field(
        default=False,
        description="Whether template-based SPARQL was used"
    )
    templateId: str | None = Field(
        default=None,
        description="Which template was used, if any"
    )
    llmProvider: str | None = Field(
        default=None,
        description="LLM provider used for generation"
    )
    llmModel: str | None = Field(
        default=None,
        description="Specific LLM model used"
    )

    class Config:
        """Pydantic configuration."""
        # NOTE(review): pydantic v1-style inner Config class; under pydantic v2
        # this is deprecated in favor of model_config = ConfigDict(...) —
        # confirm which pydantic version the project pins.
        use_enum_values = True  # Serialize enums to their values
def infer_data_tier(item: dict[str, Any], source: RetrievalSource) -> DataTier:
    """Infer data tier from item metadata and source.

    Precedence: an explicit ``data_tier`` on the item (int 1-4 or a
    DataTier member name) wins, then the item's ``provenance.data_source``
    marker, then a per-retriever default.

    Args:
        item: Retrieved item with potential provenance metadata
        source: Which retriever returned this item

    Returns:
        Appropriate DataTier based on item provenance
    """
    # Check for explicit data_tier in item.
    tier = item.get("data_tier")
    # Exclude bool explicitly: bool is a subclass of int, so a stray
    # data_tier=True would otherwise silently become TIER_1_AUTHORITATIVE.
    if isinstance(tier, int) and not isinstance(tier, bool) and 1 <= tier <= 4:
        return DataTier(tier)
    if isinstance(tier, str):
        # String tiers use DataTier member names, e.g. "TIER_2_VERIFIED",
        # so enum name lookup replaces a hand-maintained mapping.
        try:
            return DataTier[tier]
        except KeyError:
            pass  # Unrecognized name: fall through to provenance/source inference.

    # Check provenance block if present. Guard against "provenance" being
    # present but None (or any non-dict), which would crash .get().
    provenance = item.get("provenance")
    if isinstance(provenance, dict):
        data_source = provenance.get("data_source")
        if data_source == "CSV_REGISTRY":
            return DataTier.TIER_1_AUTHORITATIVE
        if data_source == "WIKIDATA":
            return DataTier.TIER_2_VERIFIED

    # Infer from the retrieval source as a last resort.
    source_tiers = {
        RetrievalSource.SPARQL: DataTier.TIER_1_AUTHORITATIVE,  # Oxigraph has curated data
        RetrievalSource.TYPEDB: DataTier.TIER_1_AUTHORITATIVE,  # TypeDB has curated data
        RetrievalSource.QDRANT: DataTier.TIER_3_CROWD_SOURCED,  # Vector search may include scraped data
        RetrievalSource.POSTGIS: DataTier.TIER_2_VERIFIED,  # GeoNames data
        RetrievalSource.CACHE: DataTier.TIER_3_CROWD_SOURCED,  # Cached responses
        RetrievalSource.LLM_SYNTHESIS: DataTier.TIER_4_INFERRED,  # LLM-generated
    }

    return source_tiers.get(source, DataTier.TIER_4_INFERRED)
def build_derivation_chain(
|
|
sources_used: list[str],
|
|
template_used: bool = False,
|
|
template_id: str | None = None,
|
|
llm_provider: str | None = None,
|
|
) -> list[str]:
|
|
"""Build the derivation chain from processing steps.
|
|
|
|
Args:
|
|
sources_used: List of data sources queried
|
|
template_used: Whether template-based SPARQL was used
|
|
template_id: Template ID if used
|
|
llm_provider: LLM provider for generation
|
|
|
|
Returns:
|
|
List of derivation steps, e.g. ['SPARQL:Oxigraph', 'RAG:retrieve', 'LLM:groq']
|
|
"""
|
|
chain = []
|
|
|
|
# Add retrieval sources
|
|
source_map = {
|
|
"qdrant": "Vector:Qdrant",
|
|
"sparql": "SPARQL:Oxigraph",
|
|
"typedb": "Graph:TypeDB",
|
|
"postgis": "Geo:PostGIS",
|
|
}
|
|
for source in sources_used:
|
|
if source.lower() in source_map:
|
|
chain.append(source_map[source.lower()])
|
|
|
|
# Add template or RAG step
|
|
if template_used and template_id:
|
|
chain.append(f"Template:{template_id}")
|
|
else:
|
|
chain.append("RAG:retrieve")
|
|
|
|
# Add LLM generation step
|
|
if llm_provider:
|
|
chain.append(f"LLM:{llm_provider}")
|
|
|
|
return chain
|
|
|
|
|
|
def aggregate_data_tier(tier_counts: dict[DataTier, int]) -> int:
|
|
"""Aggregate multiple data tiers to a single representative tier.
|
|
|
|
Uses pessimistic aggregation: the overall tier is the worst (highest number)
|
|
tier with significant contribution (>20% of results).
|
|
|
|
Args:
|
|
tier_counts: Count of results per tier
|
|
|
|
Returns:
|
|
Aggregated tier number (1-4)
|
|
"""
|
|
if not tier_counts:
|
|
return 4 # Default to inferred if no data
|
|
|
|
total = sum(tier_counts.values())
|
|
if total == 0:
|
|
return 4
|
|
|
|
# Check from worst to best, return first tier with >20% contribution
|
|
for tier in [DataTier.TIER_4_INFERRED, DataTier.TIER_3_CROWD_SOURCED,
|
|
DataTier.TIER_2_VERIFIED, DataTier.TIER_1_AUTHORITATIVE]:
|
|
count = tier_counts.get(tier, 0)
|
|
if count > 0 and (count / total) > 0.2:
|
|
return tier.value
|
|
|
|
# If all tiers have <20% contribution, return the mode
|
|
most_common = max(tier_counts, key=tier_counts.get) # type: ignore[arg-type]
|
|
return most_common.value
|