glam/backend/rag/provenance.py
kempersc 6c19ef8661 feat(rag): add Rule 46 epistemic provenance tracking
Track full lineage of RAG responses: WHERE data comes from, WHEN it was
retrieved, HOW it was processed (SPARQL/vector/LLM).

Backend changes:
- Add provenance.py with EpistemicProvenance, DataTier, SourceAttribution
- Integrate provenance into MultiSourceRetriever.merge_results()
- Return epistemic_provenance in DSPyQueryResponse

Frontend changes:
- Pass EpistemicProvenance through useMultiDatabaseRAG hook
- Display provenance in ConversationPage (for cache transparency)

Schema fixes:
- Fix truncated example in has_observation.yaml slot definition

References:
- Pavlyshyn's Context Graphs and Data Traces paper
- LinkML ProvenanceBlock schema pattern
2026-01-10 18:42:43 +01:00

280 lines
9.8 KiB
Python

"""
Epistemic Provenance Tracking for RAG Pipeline
Rule 46: Ontology-Driven Cache Segmentation
This module tracks the full lineage of how RAG responses are derived, including:
- Which data sources contributed each piece of information
- Data quality tiers (1=authoritative, 4=inferred)
- Derivation chain showing processing steps
- Confidence scores and temporal validity
The provenance data flows through the pipeline:
1. Retriever → adds source attribution
2. merge_results() → aggregates across sources
3. DSPy generator → adds LLM inference step
4. Response → includes full EpistemicProvenance
Frontend displays this in ProvenanceTooltip when hovering over cache badges.
References:
- Pavlyshyn's "Context Graphs and Data Traces: Building Epistemology Layers for Agentic Memory"
- LinkML ProvenanceBlock in schemas/20251121/linkml/modules/classes/
"""
from __future__ import annotations
from datetime import datetime, timezone
from enum import Enum
from typing import Any
from pydantic import BaseModel, Field
class DataTier(int, Enum):
    """Quality tier of a data source.

    Mirrored by the LinkML DataTierEnum and the frontend DataTier type.
    Ordering is inverted relative to authority: tier 1 is the most
    trustworthy, tier 4 the least.
    """

    TIER_1_AUTHORITATIVE = 1  # official registries: ISIL, Nationaal Archief, government data
    TIER_2_VERIFIED = 2  # Wikidata, Google Maps, vetted institutional websites
    TIER_3_CROWD_SOURCED = 3  # user reviews, community edits, unverified web scrapes
    TIER_4_INFERRED = 4  # produced by LLM extraction, inference, or aggregation
class EpistemicDataSource(str, Enum):
    """Origin category for a piece of data.

    Kept in sync with the frontend EpistemicDataSource type; values are
    serialized as-is into responses, so member values must not change.
    """

    ISIL_REGISTRY = "ISIL_REGISTRY"  # authoritative institution registry
    WIKIDATA = "WIKIDATA"
    CUSTODIAN_YAML = "CUSTODIAN_YAML"  # curated YAML source files
    GOOGLE_MAPS = "GOOGLE_MAPS"
    WEB_SCRAPE = "WEB_SCRAPE"
    LLM_INFERENCE = "LLM_INFERENCE"
    SPARQL_QUERY = "SPARQL_QUERY"
    RAG_PIPELINE = "RAG_PIPELINE"  # default for merged RAG responses
    USER_PROVIDED = "USER_PROVIDED"
    CACHE_AGGREGATION = "CACHE_AGGREGATION"
class RetrievalSource(str, Enum):
    """Backend system a result item was retrieved from.

    Maps onto the pipeline's DataSource enum; values are lowercase
    identifiers used in serialized provenance.
    """

    QDRANT = "qdrant"  # vector similarity search
    SPARQL = "sparql"  # triple-store queries
    TYPEDB = "typedb"  # typed knowledge graph
    POSTGIS = "postgis"  # geospatial queries
    CACHE = "cache"  # previously cached responses
    LLM_SYNTHESIS = "llm_synthesis"  # generated by the LLM itself
class SourceAttribution(BaseModel):
    """Attribution to a specific data source for a single result item.

    Tracks how each retrieved item contributed to the final result: which
    retriever produced it, its rank in that retriever's result list, its
    contribution to the fused (RRF) score, and retrieval timing. The
    trailing optional fields carry retriever-specific detail and are
    populated only by the matching retriever.
    """
    source: RetrievalSource = Field(description="Which retriever returned this item")
    data_tier: DataTier = Field(description="Quality tier of this source")
    retrieval_rank: int = Field(description="Position in source's result list")
    rrf_contribution: float = Field(default=0.0, description="RRF score contribution from this source")
    # Timezone-aware UTC timestamp; default_factory so each instance gets its own value.
    retrieved_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    query_time_ms: float = Field(default=0.0, description="Time taken by this retriever")
    # Optional source-specific details
    sparql_query: str | None = Field(default=None, description="SPARQL query if source is SPARQL")
    vector_similarity: float | None = Field(default=None, description="Cosine similarity if source is Qdrant")
    collection_name: str | None = Field(default=None, description="Collection name for vector search")
    template_id: str | None = Field(default=None, description="Template ID if template-based SPARQL")
class EpistemicProvenance(BaseModel):
    """Complete epistemic provenance for a RAG response.

    This model is designed to be JSON-serializable and compatible with the
    frontend's EpistemicProvenance TypeScript interface — which is why the
    field names are camelCase rather than Python's usual snake_case.
    Fields align with frontend/src/lib/storage/semantic-cache.ts:103-121
    """
    # Core provenance fields (required for frontend compatibility)
    dataSource: EpistemicDataSource = Field(
        default=EpistemicDataSource.RAG_PIPELINE,
        description="Primary data source for this response"
    )
    dataTier: int = Field(
        default=3,
        ge=1,
        le=4,
        description="Data quality tier (1=authoritative, 4=inferred)"
    )
    # ISO-8601 string (not datetime) so it round-trips through JSON unchanged.
    sourceTimestamp: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
        description="When the source data was retrieved"
    )
    derivationChain: list[str] = Field(
        default_factory=list,
        description="Chain of processing steps, e.g. ['SPARQL:Oxigraph', 'RAG:retrieve', 'LLM:groq']"
    )
    revalidationPolicy: str = Field(
        default="weekly",
        description="How often to revalidate: static, daily, weekly, on_access"
    )
    confidenceScore: float | None = Field(
        default=None,
        description="Confidence score 0-1, if applicable"
    )
    # Extended provenance (for detailed analysis)
    sourcesQueried: list[str] = Field(
        default_factory=list,
        description="List of sources that were queried"
    )
    totalRetrieved: int = Field(
        default=0,
        description="Total items retrieved before fusion"
    )
    totalAfterFusion: int = Field(
        default=0,
        description="Items after RRF fusion and deduplication"
    )
    # Keys are stringified tier numbers; values are result counts per tier.
    dataTierBreakdown: dict[str, int] = Field(
        default_factory=dict,
        description="Count of results per data tier"
    )
    templateUsed: bool = Field(
        default=False,
        description="Whether template-based SPARQL was used"
    )
    templateId: str | None = Field(
        default=None,
        description="Which template was used, if any"
    )
    llmProvider: str | None = Field(
        default=None,
        description="LLM provider used for generation"
    )
    llmModel: str | None = Field(
        default=None,
        description="Specific LLM model used"
    )
    class Config:
        """Pydantic configuration.

        NOTE(review): this is the pydantic v1-style config; it still works
        under v2 but is deprecated there in favor of
        ``model_config = ConfigDict(use_enum_values=True)`` — confirm the
        project's pydantic version before migrating.
        """
        use_enum_values = True  # Serialize enums to their values
def infer_data_tier(item: dict[str, Any], source: RetrievalSource) -> DataTier:
    """Infer the data quality tier for a retrieved item.

    Resolution order:
    1. An explicit ``data_tier`` on the item (int 1-4 or DataTier member name).
    2. The item's ``provenance`` block (known ``data_source`` values).
    3. A default tier based on which retriever returned the item.

    Args:
        item: Retrieved item with potential provenance metadata
        source: Which retriever returned this item

    Returns:
        Appropriate DataTier based on item provenance; TIER_4_INFERRED
        when nothing more specific can be established
    """
    # 1. Explicit tier on the item itself (numeric value or member name).
    tier = item.get("data_tier")
    if isinstance(tier, int) and 1 <= tier <= 4:
        return DataTier(tier)
    if isinstance(tier, str):
        # Member-name lookup replaces the hand-rolled tier_map dict;
        # DataTier.__members__ keys are exactly the TIER_* names.
        by_name = DataTier.__members__.get(tier)
        if by_name is not None:
            return by_name
    # 2. Provenance block, if present and well-formed. Guard with
    #    isinstance: the previous `item.get("provenance", {})` crashed with
    #    AttributeError when the key existed with a None (or non-dict) value.
    provenance = item.get("provenance")
    if isinstance(provenance, dict):
        data_source = provenance.get("data_source")
        if data_source == "CSV_REGISTRY":
            return DataTier.TIER_1_AUTHORITATIVE
        if data_source == "WIKIDATA":
            return DataTier.TIER_2_VERIFIED
    # 3. Fall back to a per-retriever default.
    source_tiers = {
        RetrievalSource.SPARQL: DataTier.TIER_1_AUTHORITATIVE,  # Oxigraph has curated data
        RetrievalSource.TYPEDB: DataTier.TIER_1_AUTHORITATIVE,  # TypeDB has curated data
        RetrievalSource.QDRANT: DataTier.TIER_3_CROWD_SOURCED,  # Vector search may include scraped data
        RetrievalSource.POSTGIS: DataTier.TIER_2_VERIFIED,  # GeoNames data
        RetrievalSource.CACHE: DataTier.TIER_3_CROWD_SOURCED,  # Cached responses
        RetrievalSource.LLM_SYNTHESIS: DataTier.TIER_4_INFERRED,  # LLM-generated
    }
    return source_tiers.get(source, DataTier.TIER_4_INFERRED)
def build_derivation_chain(
sources_used: list[str],
template_used: bool = False,
template_id: str | None = None,
llm_provider: str | None = None,
) -> list[str]:
"""Build the derivation chain from processing steps.
Args:
sources_used: List of data sources queried
template_used: Whether template-based SPARQL was used
template_id: Template ID if used
llm_provider: LLM provider for generation
Returns:
List of derivation steps, e.g. ['SPARQL:Oxigraph', 'RAG:retrieve', 'LLM:groq']
"""
chain = []
# Add retrieval sources
source_map = {
"qdrant": "Vector:Qdrant",
"sparql": "SPARQL:Oxigraph",
"typedb": "Graph:TypeDB",
"postgis": "Geo:PostGIS",
}
for source in sources_used:
if source.lower() in source_map:
chain.append(source_map[source.lower()])
# Add template or RAG step
if template_used and template_id:
chain.append(f"Template:{template_id}")
else:
chain.append("RAG:retrieve")
# Add LLM generation step
if llm_provider:
chain.append(f"LLM:{llm_provider}")
return chain
def aggregate_data_tier(tier_counts: dict[DataTier, int]) -> int:
    """Collapse per-tier result counts into one representative tier.

    Pessimistic aggregation: walk from the worst tier (4) to the best (1)
    and return the first tier contributing more than 20% of all results.

    Args:
        tier_counts: Count of results per tier

    Returns:
        Aggregated tier number (1-4); 4 when there is no data
    """
    total = sum(tier_counts.values())
    if total == 0:
        # Covers both an empty mapping and all-zero counts.
        return 4
    worst_to_best = (
        DataTier.TIER_4_INFERRED,
        DataTier.TIER_3_CROWD_SOURCED,
        DataTier.TIER_2_VERIFIED,
        DataTier.TIER_1_AUTHORITATIVE,
    )
    for tier in worst_to_best:
        # A share > 0.2 implies a nonzero count, so no separate count check.
        if tier_counts.get(tier, 0) / total > 0.2:
            return tier.value
    # Defensive fallback: with only four tiers at least one always exceeds
    # 20%, so in practice this mode computation should be unreachable.
    most_common = max(tier_counts, key=tier_counts.get)  # type: ignore[arg-type]
    return most_common.value