""" Epistemic Provenance Tracking for RAG Pipeline Rule 46: Ontology-Driven Cache Segmentation This module tracks the full lineage of how RAG responses are derived, including: - Which data sources contributed each piece of information - Data quality tiers (1=authoritative, 4=inferred) - Derivation chain showing processing steps - Confidence scores and temporal validity The provenance data flows through the pipeline: 1. Retriever → adds source attribution 2. merge_results() → aggregates across sources 3. DSPy generator → adds LLM inference step 4. Response → includes full EpistemicProvenance Frontend displays this in ProvenanceTooltip when hovering over cache badges. References: - Pavlyshyn's "Context Graphs and Data Traces: Building Epistemology Layers for Agentic Memory" - LinkML ProvenanceBlock in schemas/20251121/linkml/modules/classes/ """ from __future__ import annotations from datetime import datetime, timezone from enum import Enum from typing import Any from pydantic import BaseModel, Field class DataTier(int, Enum): """Data quality tier - aligned with LinkML DataTierEnum and frontend DataTier type. Lower numbers = higher authority. """ TIER_1_AUTHORITATIVE = 1 # ISIL Registry, Nationaal Archief, official government data TIER_2_VERIFIED = 2 # Wikidata, Google Maps, verified institutional websites TIER_3_CROWD_SOURCED = 3 # User reviews, community edits, unverified web scrapes TIER_4_INFERRED = 4 # LLM extraction, inference, aggregation class EpistemicDataSource(str, Enum): """Data source types - aligned with frontend EpistemicDataSource type.""" ISIL_REGISTRY = "ISIL_REGISTRY" WIKIDATA = "WIKIDATA" CUSTODIAN_YAML = "CUSTODIAN_YAML" GOOGLE_MAPS = "GOOGLE_MAPS" WEB_SCRAPE = "WEB_SCRAPE" LLM_INFERENCE = "LLM_INFERENCE" SPARQL_QUERY = "SPARQL_QUERY" RAG_PIPELINE = "RAG_PIPELINE" USER_PROVIDED = "USER_PROVIDED" CACHE_AGGREGATION = "CACHE_AGGREGATION" class RetrievalSource(str, Enum): """Source system for retrieval - maps to DataSource enum.""" QDRANT = "qdrant" SPARQL = "sparql" TYPEDB = "typedb" POSTGIS = "postgis" CACHE = "cache" LLM_SYNTHESIS = "llm_synthesis" class SourceAttribution(BaseModel): """Attribution to a specific data source for a single result item. Tracks how each retrieved item contributed to the final result. """ source: RetrievalSource = Field(description="Which retriever returned this item") data_tier: DataTier = Field(description="Quality tier of this source") retrieval_rank: int = Field(description="Position in source's result list") rrf_contribution: float = Field(default=0.0, description="RRF score contribution from this source") retrieved_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) query_time_ms: float = Field(default=0.0, description="Time taken by this retriever") # Optional source-specific details sparql_query: str | None = Field(default=None, description="SPARQL query if source is SPARQL") vector_similarity: float | None = Field(default=None, description="Cosine similarity if source is Qdrant") collection_name: str | None = Field(default=None, description="Collection name for vector search") template_id: str | None = Field(default=None, description="Template ID if template-based SPARQL") class EpistemicProvenance(BaseModel): """Complete epistemic provenance for a RAG response. This model is designed to be JSON-serializable and compatible with the frontend's EpistemicProvenance TypeScript interface. Fields align with frontend/src/lib/storage/semantic-cache.ts:103-121 """ # Core provenance fields (required for frontend compatibility) dataSource: EpistemicDataSource = Field( default=EpistemicDataSource.RAG_PIPELINE, description="Primary data source for this response" ) dataTier: int = Field( default=3, ge=1, le=4, description="Data quality tier (1=authoritative, 4=inferred)" ) sourceTimestamp: str = Field( default_factory=lambda: datetime.now(timezone.utc).isoformat(), description="When the source data was retrieved" ) derivationChain: list[str] = Field( default_factory=list, description="Chain of processing steps, e.g. ['SPARQL:Oxigraph', 'RAG:retrieve', 'LLM:groq']" ) revalidationPolicy: str = Field( default="weekly", description="How often to revalidate: static, daily, weekly, on_access" ) confidenceScore: float | None = Field( default=None, description="Confidence score 0-1, if applicable" ) # Extended provenance (for detailed analysis) sourcesQueried: list[str] = Field( default_factory=list, description="List of sources that were queried" ) totalRetrieved: int = Field( default=0, description="Total items retrieved before fusion" ) totalAfterFusion: int = Field( default=0, description="Items after RRF fusion and deduplication" ) dataTierBreakdown: dict[str, int] = Field( default_factory=dict, description="Count of results per data tier" ) templateUsed: bool = Field( default=False, description="Whether template-based SPARQL was used" ) templateId: str | None = Field( default=None, description="Which template was used, if any" ) llmProvider: str | None = Field( default=None, description="LLM provider used for generation" ) llmModel: str | None = Field( default=None, description="Specific LLM model used" ) class Config: """Pydantic configuration.""" use_enum_values = True # Serialize enums to their values def infer_data_tier(item: dict[str, Any], source: RetrievalSource) -> DataTier: """Infer data tier from item metadata and source. Args: item: Retrieved item with potential provenance metadata source: Which retriever returned this item Returns: Appropriate DataTier based on item provenance """ # Check for explicit data_tier in item if "data_tier" in item: tier = item["data_tier"] if isinstance(tier, int) and 1 <= tier <= 4: return DataTier(tier) if isinstance(tier, str): tier_map = { "TIER_1_AUTHORITATIVE": DataTier.TIER_1_AUTHORITATIVE, "TIER_2_VERIFIED": DataTier.TIER_2_VERIFIED, "TIER_3_CROWD_SOURCED": DataTier.TIER_3_CROWD_SOURCED, "TIER_4_INFERRED": DataTier.TIER_4_INFERRED, } if tier in tier_map: return tier_map[tier] # Check provenance block if present provenance = item.get("provenance", {}) if provenance.get("data_source") == "CSV_REGISTRY": return DataTier.TIER_1_AUTHORITATIVE if provenance.get("data_source") == "WIKIDATA": return DataTier.TIER_2_VERIFIED # Infer from source source_tiers = { RetrievalSource.SPARQL: DataTier.TIER_1_AUTHORITATIVE, # Oxigraph has curated data RetrievalSource.TYPEDB: DataTier.TIER_1_AUTHORITATIVE, # TypeDB has curated data RetrievalSource.QDRANT: DataTier.TIER_3_CROWD_SOURCED, # Vector search may include scraped data RetrievalSource.POSTGIS: DataTier.TIER_2_VERIFIED, # GeoNames data RetrievalSource.CACHE: DataTier.TIER_3_CROWD_SOURCED, # Cached responses RetrievalSource.LLM_SYNTHESIS: DataTier.TIER_4_INFERRED, # LLM-generated } return source_tiers.get(source, DataTier.TIER_4_INFERRED) def build_derivation_chain( sources_used: list[str], template_used: bool = False, template_id: str | None = None, llm_provider: str | None = None, ) -> list[str]: """Build the derivation chain from processing steps. Args: sources_used: List of data sources queried template_used: Whether template-based SPARQL was used template_id: Template ID if used llm_provider: LLM provider for generation Returns: List of derivation steps, e.g. ['SPARQL:Oxigraph', 'RAG:retrieve', 'LLM:groq'] """ chain = [] # Add retrieval sources source_map = { "qdrant": "Vector:Qdrant", "sparql": "SPARQL:Oxigraph", "typedb": "Graph:TypeDB", "postgis": "Geo:PostGIS", } for source in sources_used: if source.lower() in source_map: chain.append(source_map[source.lower()]) # Add template or RAG step if template_used and template_id: chain.append(f"Template:{template_id}") else: chain.append("RAG:retrieve") # Add LLM generation step if llm_provider: chain.append(f"LLM:{llm_provider}") return chain def aggregate_data_tier(tier_counts: dict[DataTier, int]) -> int: """Aggregate multiple data tiers to a single representative tier. Uses pessimistic aggregation: the overall tier is the worst (highest number) tier with significant contribution (>20% of results). Args: tier_counts: Count of results per tier Returns: Aggregated tier number (1-4) """ if not tier_counts: return 4 # Default to inferred if no data total = sum(tier_counts.values()) if total == 0: return 4 # Check from worst to best, return first tier with >20% contribution for tier in [DataTier.TIER_4_INFERRED, DataTier.TIER_3_CROWD_SOURCED, DataTier.TIER_2_VERIFIED, DataTier.TIER_1_AUTHORITATIVE]: count = tier_counts.get(tier, 0) if count > 0 and (count / total) > 0.2: return tier.value # If all tiers have <20% contribution, return the mode most_common = max(tier_counts, key=tier_counts.get) # type: ignore[arg-type] return most_common.value