glam/backend/rag/provenance.py
kempersc 6c19ef8661 feat(rag): add Rule 46 epistemic provenance tracking
Track full lineage of RAG responses: WHERE data comes from, WHEN it was
retrieved, HOW it was processed (SPARQL/vector/LLM).

Backend changes:
- Add provenance.py with EpistemicProvenance, DataTier, SourceAttribution
- Integrate provenance into MultiSourceRetriever.merge_results()
- Return epistemic_provenance in DSPyQueryResponse

Frontend changes:
- Pass EpistemicProvenance through useMultiDatabaseRAG hook
- Display provenance in ConversationPage (for cache transparency)

Schema fixes:
- Fix truncated example in has_observation.yaml slot definition

References:
- Pavlyshyn's Context Graphs and Data Traces paper
- LinkML ProvenanceBlock schema pattern
2026-01-10 18:42:43 +01:00

280 lines
9.8 KiB
Python

"""
Epistemic Provenance Tracking for RAG Pipeline
Rule 46: Ontology-Driven Cache Segmentation
This module tracks the full lineage of how RAG responses are derived, including:
- Which data sources contributed each piece of information
- Data quality tiers (1=authoritative, 4=inferred)
- Derivation chain showing processing steps
- Confidence scores and temporal validity
The provenance data flows through the pipeline:
1. Retriever → adds source attribution
2. merge_results() → aggregates across sources
3. DSPy generator → adds LLM inference step
4. Response → includes full EpistemicProvenance
Frontend displays this in ProvenanceTooltip when hovering over cache badges.
References:
- Pavlyshyn's "Context Graphs and Data Traces: Building Epistemology Layers for Agentic Memory"
- LinkML ProvenanceBlock in schemas/20251121/linkml/modules/classes/
"""
from __future__ import annotations
from datetime import datetime, timezone
from enum import Enum
from typing import Any
from pydantic import BaseModel, Field
class DataTier(int, Enum):
    """Quality tier of a data source.

    Mirrored by the LinkML DataTierEnum and the frontend DataTier type.
    Ordering is inverted relative to authority: tier 1 is the most
    trustworthy, tier 4 the least.
    """

    TIER_1_AUTHORITATIVE = 1  # official registries: ISIL, Nationaal Archief, government data
    TIER_2_VERIFIED = 2  # Wikidata, Google Maps, vetted institutional websites
    TIER_3_CROWD_SOURCED = 3  # user reviews, community edits, unverified web scrapes
    TIER_4_INFERRED = 4  # produced by LLM extraction, inference, or aggregation
class EpistemicDataSource(str, Enum):
    """Origin category for a piece of data.

    Kept in sync with the frontend EpistemicDataSource type; values are
    serialized as-is into responses, so member values must not change.
    """

    ISIL_REGISTRY = "ISIL_REGISTRY"  # authoritative institution registry
    WIKIDATA = "WIKIDATA"
    CUSTODIAN_YAML = "CUSTODIAN_YAML"  # curated YAML source files
    GOOGLE_MAPS = "GOOGLE_MAPS"
    WEB_SCRAPE = "WEB_SCRAPE"
    LLM_INFERENCE = "LLM_INFERENCE"
    SPARQL_QUERY = "SPARQL_QUERY"
    RAG_PIPELINE = "RAG_PIPELINE"  # default for merged RAG responses
    USER_PROVIDED = "USER_PROVIDED"
    CACHE_AGGREGATION = "CACHE_AGGREGATION"
class RetrievalSource(str, Enum):
    """Backend system a result item was retrieved from.

    Maps onto the pipeline's DataSource enum; values are lowercase
    identifiers used in serialized provenance.
    """

    QDRANT = "qdrant"  # vector similarity search
    SPARQL = "sparql"  # triple-store queries
    TYPEDB = "typedb"  # typed knowledge graph
    POSTGIS = "postgis"  # geospatial queries
    CACHE = "cache"  # previously cached responses
    LLM_SYNTHESIS = "llm_synthesis"  # generated by the LLM itself
class SourceAttribution(BaseModel):
    """Attribution to a specific data source for a single result item.

    Tracks how each retrieved item contributed to the final result: which
    retriever produced it, its rank in that retriever's result list, its
    contribution to the fused (RRF) score, and retrieval timing. The
    trailing optional fields carry retriever-specific detail and are
    populated only by the matching retriever.
    """
    source: RetrievalSource = Field(description="Which retriever returned this item")
    data_tier: DataTier = Field(description="Quality tier of this source")
    retrieval_rank: int = Field(description="Position in source's result list")
    rrf_contribution: float = Field(default=0.0, description="RRF score contribution from this source")
    # Timezone-aware UTC timestamp; default_factory so each instance gets its own value.
    retrieved_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    query_time_ms: float = Field(default=0.0, description="Time taken by this retriever")
    # Optional source-specific details
    sparql_query: str | None = Field(default=None, description="SPARQL query if source is SPARQL")
    vector_similarity: float | None = Field(default=None, description="Cosine similarity if source is Qdrant")
    collection_name: str | None = Field(default=None, description="Collection name for vector search")
    template_id: str | None = Field(default=None, description="Template ID if template-based SPARQL")
class EpistemicProvenance(BaseModel):
    """Complete epistemic provenance for a RAG response.

    This model is designed to be JSON-serializable and compatible with the
    frontend's EpistemicProvenance TypeScript interface — which is why the
    field names are camelCase rather than Python's usual snake_case.
    Fields align with frontend/src/lib/storage/semantic-cache.ts:103-121
    """
    # Core provenance fields (required for frontend compatibility)
    dataSource: EpistemicDataSource = Field(
        default=EpistemicDataSource.RAG_PIPELINE,
        description="Primary data source for this response"
    )
    dataTier: int = Field(
        default=3,
        ge=1,
        le=4,
        description="Data quality tier (1=authoritative, 4=inferred)"
    )
    # ISO-8601 string (not datetime) so it round-trips through JSON unchanged.
    sourceTimestamp: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
        description="When the source data was retrieved"
    )
    derivationChain: list[str] = Field(
        default_factory=list,
        description="Chain of processing steps, e.g. ['SPARQL:Oxigraph', 'RAG:retrieve', 'LLM:groq']"
    )
    revalidationPolicy: str = Field(
        default="weekly",
        description="How often to revalidate: static, daily, weekly, on_access"
    )
    confidenceScore: float | None = Field(
        default=None,
        description="Confidence score 0-1, if applicable"
    )
    # Extended provenance (for detailed analysis)
    sourcesQueried: list[str] = Field(
        default_factory=list,
        description="List of sources that were queried"
    )
    totalRetrieved: int = Field(
        default=0,
        description="Total items retrieved before fusion"
    )
    totalAfterFusion: int = Field(
        default=0,
        description="Items after RRF fusion and deduplication"
    )
    # Keys are stringified tier numbers; values are result counts per tier.
    dataTierBreakdown: dict[str, int] = Field(
        default_factory=dict,
        description="Count of results per data tier"
    )
    templateUsed: bool = Field(
        default=False,
        description="Whether template-based SPARQL was used"
    )
    templateId: str | None = Field(
        default=None,
        description="Which template was used, if any"
    )
    llmProvider: str | None = Field(
        default=None,
        description="LLM provider used for generation"
    )
    llmModel: str | None = Field(
        default=None,
        description="Specific LLM model used"
    )
    class Config:
        """Pydantic configuration.

        NOTE(review): this is the pydantic v1-style config; it still works
        under v2 but is deprecated there in favor of
        ``model_config = ConfigDict(use_enum_values=True)`` — confirm the
        project's pydantic version before migrating.
        """
        use_enum_values = True  # Serialize enums to their values
def infer_data_tier(item: dict[str, Any], source: RetrievalSource) -> DataTier:
    """Infer the data quality tier for a retrieved item.

    Resolution order:
    1. An explicit ``data_tier`` on the item (int 1-4 or DataTier member name).
    2. The item's ``provenance`` block (known ``data_source`` values).
    3. A default tier based on which retriever returned the item.

    Args:
        item: Retrieved item with potential provenance metadata
        source: Which retriever returned this item

    Returns:
        Appropriate DataTier based on item provenance; TIER_4_INFERRED
        when nothing more specific can be established
    """
    # 1. Explicit tier on the item itself (numeric value or member name).
    tier = item.get("data_tier")
    if isinstance(tier, int) and 1 <= tier <= 4:
        return DataTier(tier)
    if isinstance(tier, str):
        # Member-name lookup replaces the hand-rolled tier_map dict;
        # DataTier.__members__ keys are exactly the TIER_* names.
        by_name = DataTier.__members__.get(tier)
        if by_name is not None:
            return by_name
    # 2. Provenance block, if present and well-formed. Guard with
    #    isinstance: the previous `item.get("provenance", {})` crashed with
    #    AttributeError when the key existed with a None (or non-dict) value.
    provenance = item.get("provenance")
    if isinstance(provenance, dict):
        data_source = provenance.get("data_source")
        if data_source == "CSV_REGISTRY":
            return DataTier.TIER_1_AUTHORITATIVE
        if data_source == "WIKIDATA":
            return DataTier.TIER_2_VERIFIED
    # 3. Fall back to a per-retriever default.
    source_tiers = {
        RetrievalSource.SPARQL: DataTier.TIER_1_AUTHORITATIVE,  # Oxigraph has curated data
        RetrievalSource.TYPEDB: DataTier.TIER_1_AUTHORITATIVE,  # TypeDB has curated data
        RetrievalSource.QDRANT: DataTier.TIER_3_CROWD_SOURCED,  # Vector search may include scraped data
        RetrievalSource.POSTGIS: DataTier.TIER_2_VERIFIED,  # GeoNames data
        RetrievalSource.CACHE: DataTier.TIER_3_CROWD_SOURCED,  # Cached responses
        RetrievalSource.LLM_SYNTHESIS: DataTier.TIER_4_INFERRED,  # LLM-generated
    }
    return source_tiers.get(source, DataTier.TIER_4_INFERRED)
def build_derivation_chain(
sources_used: list[str],
template_used: bool = False,
template_id: str | None = None,
llm_provider: str | None = None,
) -> list[str]:
"""Build the derivation chain from processing steps.
Args:
sources_used: List of data sources queried
template_used: Whether template-based SPARQL was used
template_id: Template ID if used
llm_provider: LLM provider for generation
Returns:
List of derivation steps, e.g. ['SPARQL:Oxigraph', 'RAG:retrieve', 'LLM:groq']
"""
chain = []
# Add retrieval sources
source_map = {
"qdrant": "Vector:Qdrant",
"sparql": "SPARQL:Oxigraph",
"typedb": "Graph:TypeDB",
"postgis": "Geo:PostGIS",
}
for source in sources_used:
if source.lower() in source_map:
chain.append(source_map[source.lower()])
# Add template or RAG step
if template_used and template_id:
chain.append(f"Template:{template_id}")
else:
chain.append("RAG:retrieve")
# Add LLM generation step
if llm_provider:
chain.append(f"LLM:{llm_provider}")
return chain
def aggregate_data_tier(tier_counts: dict[DataTier, int]) -> int:
    """Collapse per-tier result counts into one representative tier.

    Pessimistic aggregation: walk from the worst tier (4) to the best (1)
    and return the first tier contributing more than 20% of all results.

    Args:
        tier_counts: Count of results per tier

    Returns:
        Aggregated tier number (1-4); 4 when there is no data
    """
    total = sum(tier_counts.values())
    if total == 0:
        # Covers both an empty mapping and all-zero counts.
        return 4
    worst_to_best = (
        DataTier.TIER_4_INFERRED,
        DataTier.TIER_3_CROWD_SOURCED,
        DataTier.TIER_2_VERIFIED,
        DataTier.TIER_1_AUTHORITATIVE,
    )
    for tier in worst_to_best:
        # A share > 0.2 implies a nonzero count, so no separate count check.
        if tier_counts.get(tier, 0) / total > 0.2:
            return tier.value
    # Defensive fallback: with only four tiers at least one always exceeds
    # 20%, so in practice this mode computation should be unreachable.
    most_common = max(tier_counts, key=tier_counts.get)  # type: ignore[arg-type]
    return most_common.value