# glam/backend/rag/temporal_resolver.py

"""
Temporal Conflict Resolution for Heritage Data

Handles cases where multiple facts exist for the same property at overlapping times.
Based on: docs/plan/external_design_patterns/04_temporal_semantic_hypergraph.md

Strategies:
1. Temporal ordering: Use fact valid at query time
2. Recency: Prefer more recent sources
3. Authority: Prefer authoritative sources (Tier 1)
4. Confidence: Use higher confidence facts
"""

from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import logging

logger = logging.getLogger(__name__)


@dataclass
class TemporalFact:
    """A fact with temporal validity."""
    # Name of the property this fact describes; the resolver groups facts by it.
    property: str
    # Asserted value of the property during the validity interval.
    value: str
    # Start of the validity interval (inclusive: compared with `<=` by the resolver).
    valid_from: datetime
    # End of the validity interval (exclusive: compared with `>` by the resolver);
    # None is treated as "still valid" (open-ended).
    valid_to: Optional[datetime]
    # Source label; the resolver looks it up in
    # TemporalConflictResolver.SOURCE_AUTHORITY (e.g. "TIER_1_AUTHORITATIVE").
    source: str
    # Multiplier applied to the source authority when scoring the fact.
    # NOTE(review): presumably expected to lie in [0, 1] - not validated here.
    confidence: float = 1.0
    # Presumably the identifier of the institution the fact belongs to; the
    # resolver in this file does not read it - TODO confirm with callers.
    ghcid: Optional[str] = None


@dataclass
class ConflictResolution:
    """Result of conflict resolution."""
    # Property whose competing facts were resolved.
    property: str
    # Value of the highest-scoring fact.
    authoritative_value: str
    # Query date the resolution was computed for.
    valid_for_date: datetime
    # Classification assigned by the resolver: "redundant", "name_change",
    # "location_change" or "data_inconsistency".
    conflict_type: str
    # Human-readable summary of why the authoritative value was chosen.
    explanation: str
    # The competing facts that lost, ordered by descending score.
    alternative_values: list[TemporalFact] = field(default_factory=list)


class TemporalConflictResolver:
    """
    Resolve conflicts between temporal facts.

    Uses a multi-factor scoring system:
    - Source authority (Tier 1-4)
    - Confidence scores
    - Temporal recency (age of ``valid_from`` relative to the query date)

    Scores are a pure function of the facts and the query date, so resolving
    the same inputs for the same date always yields the same winner.
    """

    # Base weight per source tier. Sources missing from this table fall back
    # to 0.5 in `_compute_authority_score`.
    SOURCE_AUTHORITY = {
        "TIER_1_AUTHORITATIVE": 1.0,
        "TIER_2_VERIFIED": 0.8,
        "TIER_3_CROWD_SOURCED": 0.6,
        "TIER_4_INFERRED": 0.4,
    }

    # Properties whose conflicts are classified as relocations.
    LOCATION_PROPERTIES = frozenset(
        {"city", "address", "location", "settlementName", "subregionCode"}
    )

    def resolve_conflicts(
        self,
        ghcid: str,
        facts: list[TemporalFact],
        query_date: Optional[datetime] = None
    ) -> list[ConflictResolution]:
        """
        Resolve all conflicts in a set of facts.

        Facts are grouped by property; a property is only reported when more
        than one of its facts is valid at ``query_date``.

        Args:
            ghcid: Institution identifier. Not used for filtering here; the
                caller is expected to pass only this institution's facts.
            facts: All facts about the institution
            query_date: Point in time for resolution (default: now)

        Returns:
            List of conflict resolutions with authoritative values, one per
            conflicting property, in first-seen property order.
        """
        if query_date is None:
            query_date = datetime.now()

        # Group facts by property (dict preserves first-seen order).
        by_property: dict[str, list[TemporalFact]] = {}
        for fact in facts:
            by_property.setdefault(fact.property, []).append(fact)

        resolutions = []
        for prop, prop_facts in by_property.items():
            valid_facts = self._facts_valid_at(prop_facts, query_date)
            if len(valid_facts) <= 1:
                # Zero or one valid fact: nothing to resolve.
                continue
            resolutions.append(
                self._resolve_property_conflict(prop, valid_facts, query_date)
            )

        return resolutions

    def get_authoritative_value(
        self,
        ghcid: str,
        property: str,
        facts: list[TemporalFact],
        query_date: Optional[datetime] = None
    ) -> Optional[str]:
        """
        Get the authoritative value for a single property.

        Convenience method for single-property lookups.

        Args:
            ghcid: Institution identifier (not used for filtering here).
            property: Name of the property to look up.
            facts: Candidate facts; those for other properties are ignored.
            query_date: Point in time for resolution (default: now)

        Returns:
            The value valid at ``query_date`` (the highest-scoring one when
            several are valid), or None when no fact for the property is
            valid at that date.
        """
        if query_date is None:
            query_date = datetime.now()

        valid_facts = self._facts_valid_at(
            [f for f in facts if f.property == property], query_date
        )

        if not valid_facts:
            return None
        if len(valid_facts) == 1:
            return valid_facts[0].value

        resolution = self._resolve_property_conflict(property, valid_facts, query_date)
        return resolution.authoritative_value

    @staticmethod
    def _facts_valid_at(
        facts: list[TemporalFact],
        query_date: datetime
    ) -> list[TemporalFact]:
        """Return the facts whose half-open interval [valid_from, valid_to) contains query_date.

        A ``valid_to`` of None means the fact is still valid (open-ended).
        """
        return [
            f for f in facts
            if f.valid_from <= query_date
            and (f.valid_to is None or f.valid_to > query_date)
        ]

    def _resolve_property_conflict(
        self,
        property: str,
        facts: list[TemporalFact],
        query_date: datetime
    ) -> ConflictResolution:
        """
        Resolve conflict for a single property.

        Args:
            property: Name of the conflicting property.
            facts: Competing facts (must be non-empty).
            query_date: Date the resolution is computed for; also the
                reference point for the recency component of the score.

        Returns:
            ConflictResolution whose winner is the highest-scoring fact.

        Raises:
            IndexError: If ``facts`` is empty.
        """
        # sorted() is stable (also with reverse=True), so facts with equal
        # scores keep their input order and the earliest one wins the tie.
        ranked = sorted(
            facts,
            key=lambda f: self._compute_authority_score(f, query_date),
            reverse=True,
        )
        winner = ranked[0]
        alternatives = ranked[1:]

        # Determine conflict type
        if all(f.value == winner.value for f in facts):
            conflict_type = "redundant"  # Same value from multiple sources
        elif self._is_name_change(facts):
            conflict_type = "name_change"
        elif self._is_location_change(facts, property):
            conflict_type = "location_change"
        else:
            conflict_type = "data_inconsistency"

        explanation = self._generate_explanation(
            property, winner, alternatives, conflict_type, query_date
        )

        return ConflictResolution(
            property=property,
            authoritative_value=winner.value,
            valid_for_date=query_date,
            conflict_type=conflict_type,
            explanation=explanation,
            alternative_values=alternatives
        )

    def _compute_authority_score(
        self,
        fact: TemporalFact,
        reference_date: Optional[datetime] = None
    ) -> float:
        """
        Compute authority score for a fact.

        score = tier_weight * confidence * (0.8 + 0.2 * recency), where
        recency = 1 / (1 + age_in_years) and age is measured from
        ``fact.valid_from`` to ``reference_date``.

        Args:
            fact: Fact to score.
            reference_date: Date against which recency is measured. The
                resolver passes the query date so scores are reproducible.
                Defaults to the current time (in the timezone of
                ``fact.valid_from``, if it has one).

        Returns:
            The combined score; higher means more authoritative.
        """
        # Base authority from source tier
        authority = self.SOURCE_AUTHORITY.get(fact.source, 0.5)

        # Boost for confidence
        authority *= fact.confidence

        if reference_date is None:
            # Match the fact's tz-awareness so the subtraction below cannot
            # fail on a naive/aware mismatch.
            reference_date = datetime.now(tz=fact.valid_from.tzinfo)

        # Clamp at zero: a fact starting after the reference date counts as
        # brand new. Without the clamp an age of exactly -365 days divides by
        # zero and older negative ages yield a negative recency factor.
        days_old = max(0, (reference_date - fact.valid_from).days)
        recency_factor = 1.0 / (1.0 + days_old / 365.0)  # Decay over years
        authority *= 0.8 + 0.2 * recency_factor

        return authority

    def _is_name_change(self, facts: list[TemporalFact]) -> bool:
        """
        Check if conflict represents a name change.

        Returns True when, after ordering by ``valid_from``, some fact ends
        exactly where the next one begins (back-to-back validity intervals).

        NOTE(review): facts that all passed the validity filter for one query
        date satisfy valid_to > query_date >= valid_from, so two of them can
        never be back-to-back; this only fires when called with unfiltered
        facts. The check also does not look at the property name.
        """
        ordered = sorted(facts, key=lambda f: f.valid_from)
        return any(
            earlier.valid_to == later.valid_from
            for earlier, later in zip(ordered, ordered[1:])
        )

    def _is_location_change(self, facts: list[TemporalFact], property: str) -> bool:
        """Check if conflict represents a location change.

        Decided purely by the property name; ``facts`` is not inspected.
        """
        return property in self.LOCATION_PROPERTIES

    def _generate_explanation(
        self,
        property: str,
        winner: TemporalFact,
        alternatives: list[TemporalFact],
        conflict_type: str,
        query_date: datetime
    ) -> str:
        """
        Generate human-readable explanation of resolution.

        Args:
            property: Name of the conflicting property.
            winner: Fact chosen as authoritative.
            alternatives: Facts that lost the comparison.
            conflict_type: Classification produced by the resolver; any value
                other than "name_change", "location_change" or "redundant"
                gets the generic data-conflict message.
            query_date: Date the resolution applies to.

        Returns:
            A short English sentence (or several) describing the outcome.
        """
        if conflict_type == "name_change":
            return (
                "The institution name changed over time. "
                f"At {query_date.strftime('%Y-%m-%d')}, the authoritative name was '{winner.value}'. "
                f"Previous names: {', '.join(f.value for f in alternatives)}."
            )
        if conflict_type == "location_change":
            return (
                "The institution relocated. "
                f"At {query_date.strftime('%Y-%m-%d')}, it was located at '{winner.value}'."
            )
        if conflict_type == "redundant":
            return f"Multiple sources confirm: {winner.value}"
        return (
            f"Data conflict for {property}. "
            f"Using '{winner.value}' from {winner.source} (confidence: {winner.confidence:.2f}). "
            "Alternative values exist in other sources."
        )


# Module-level cache holding the shared resolver; filled on first request.
_resolver: Optional[TemporalConflictResolver] = None


def get_temporal_resolver() -> TemporalConflictResolver:
    """Return the shared resolver, instantiating it on the first call."""
    global _resolver
    if _resolver is not None:
        # Already built by an earlier call: hand back the same object.
        return _resolver
    _resolver = TemporalConflictResolver()
    return _resolver