750 lines
26 KiB
Python
750 lines
26 KiB
Python
"""
|
|
Uncertainty Detection Module for GLAM-NER.
|
|
|
|
This module provides detection and classification of epistemic uncertainty
|
|
markers (hedging language) in heritage institution documents.
|
|
|
|
Based on linguistic research on hedging and epistemic modality:
|
|
- Hyland (1998) - Hedging in Scientific Research Articles
|
|
- Lakoff (1972) - Hedges: A Study in Meaning Criteria
|
|
- Holmes (1988) - Doubt and Certainty in ESL Textbooks
|
|
|
|
Usage:
|
|
>>> from glam_extractor.annotators.uncertainty import (
|
|
... UncertaintyDetector,
|
|
... HedgeType,
|
|
... UncertaintyLevel,
|
|
... )
|
|
>>>
|
|
>>> detector = UncertaintyDetector()
|
|
>>> result = detector.analyze("The painting is probably from the 17th century.")
|
|
>>> print(result.level) # UncertaintyLevel.MODERATE
|
|
>>> print(result.hedges) # [Hedge(text='probably', type=HedgeType.PROBABILITY)]
|
|
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
|
|
|
# =============================================================================
|
|
# HEDGE TYPES (Epistemic Markers)
|
|
# =============================================================================
|
|
|
|
class HedgeType(str, Enum):
    """Classification of epistemic uncertainty markers.

    Taxonomy follows Hyland (1998), extended with heritage-domain
    categories (attribution formulas, dating, provenance). Values are
    short dotted codes suitable for serialization.
    """

    # -- Modal auxiliaries ------------------------------------------------
    MODAL_POSSIBILITY = "MODAL.POSSIBILITY"  # may, might, could
    MODAL_PROBABILITY = "MODAL.PROBABILITY"  # should, would, will (speculative)
    MODAL_NECESSITY = "MODAL.NECESSITY"  # must (epistemic), need

    # -- Lexical verbs ----------------------------------------------------
    VERB_SPECULATIVE = "VERB.SPECULATIVE"  # suggest, indicate, appear, seem
    VERB_DEDUCTIVE = "VERB.DEDUCTIVE"  # conclude, infer, deduce
    VERB_TENTATIVE = "VERB.TENTATIVE"  # believe, think, assume, suppose

    # -- Adverbs ----------------------------------------------------------
    ADVERB_PROBABILITY = "ADV.PROBABILITY"  # probably, possibly, perhaps, maybe
    ADVERB_FREQUENCY = "ADV.FREQUENCY"  # often, sometimes, occasionally
    ADVERB_APPROXIMATION = "ADV.APPROX"  # approximately, roughly, about

    # -- Adjectives -------------------------------------------------------
    ADJ_EPISTEMIC = "ADJ.EPISTEMIC"  # possible, probable, likely, uncertain
    ADJ_APPROXIMATIVE = "ADJ.APPROX"  # approximate, rough, estimated

    # -- Quantifiers (shields) --------------------------------------------
    QUANT_APPROXIMATION = "QUANT.APPROX"  # some, several, many, few
    QUANT_VAGUE = "QUANT.VAGUE"  # a number of, various, numerous

    # -- Phrases ----------------------------------------------------------
    PHRASE_ATTRIBUTION = "PHRASE.ATTR"  # according to, based on, it is said
    PHRASE_CONDITIONAL = "PHRASE.COND"  # if, in case, assuming that
    PHRASE_LIMITATION = "PHRASE.LIMIT"  # to some extent, in some cases

    # -- Temporal uncertainty ---------------------------------------------
    TEMPORAL_VAGUE = "TMP.VAGUE"  # around, circa, approximately (dates)
    TEMPORAL_RANGE = "TMP.RANGE"  # between X and Y, X to Y

    # -- Source attribution -----------------------------------------------
    SOURCE_HEARSAY = "SRC.HEARSAY"  # reportedly, allegedly, supposedly
    SOURCE_TRADITION = "SRC.TRADITION"  # traditionally, by tradition

    # -- Heritage-specific ------------------------------------------------
    HERITAGE_ATTRIBUTION = "HER.ATTR"  # attributed to, school of, circle of
    HERITAGE_DATING = "HER.DATE"  # dated to, dating from (uncertain)
    HERITAGE_PROVENANCE = "HER.PROV"  # provenance unclear, origin unknown
|
|
|
|
|
|
class UncertaintyLevel(str, Enum):
    """Overall uncertainty level of a claim or statement.

    Ordered scale from CERTAIN (no hedging) up to HIGHLY_UNCERTAIN
    (strong hedging such as "attributed to" / "uncertain").
    """

    CERTAIN = "CERTAIN"
    NEAR_CERTAIN = "NEAR_CERTAIN"  # minor hedging (approximately, circa)
    MODERATE = "MODERATE"  # moderate hedging (probably, likely)
    UNCERTAIN = "UNCERTAIN"  # significant hedging (possibly, might)
    HIGHLY_UNCERTAIN = "HIGHLY_UNCERTAIN"  # strong hedging (attributed to)
|
|
|
|
|
|
class UncertaintyDimension(str, Enum):
    """Dimension of uncertainty: which aspect of a statement is uncertain."""

    ATTRIBUTION = "ATTRIBUTION"  # who created/did something
    DATING = "DATING"  # when something happened
    LOCATION = "LOCATION"  # where something is/was
    IDENTITY = "IDENTITY"  # what something is
    QUANTITY = "QUANTITY"  # how many/much
    PROVENANCE = "PROVENANCE"  # origin/history
    AUTHENTICITY = "AUTHENTICITY"  # whether genuine
    INTERPRETATION = "INTERPRETATION"  # meaning/significance
|
|
|
|
|
|
# =============================================================================
|
|
# HEDGE PATTERNS (Multilingual)
|
|
# =============================================================================
|
|
|
|
# English hedging patterns
|
|
# English hedging patterns, keyed by hedge type. All patterns are compiled
# with re.IGNORECASE by UncertaintyDetector._compile_patterns.
HEDGE_PATTERNS_EN: Dict[HedgeType, List[str]] = {
    # Modal auxiliaries
    HedgeType.MODAL_POSSIBILITY: [
        r"\bmay\b", r"\bmight\b", r"\bcould\b", r"\bcan\b(?!\s+not)",
    ],
    HedgeType.MODAL_PROBABILITY: [
        r"\bshould\b", r"\bwould\b", r"\bwill\b(?=\s+(?:probably|likely|perhaps))",
    ],
    HedgeType.MODAL_NECESSITY: [
        r"\bmust\b(?=\s+(?:have|be))", r"\bneed(?:s)?\s+to\b",
    ],

    # Lexical verbs
    HedgeType.VERB_SPECULATIVE: [
        r"\bsuggests?\b", r"\bindicates?\b", r"\bappears?\b", r"\bseems?\b",
        r"\bimplies?\b", r"\bpoints?\s+to\b",
    ],
    HedgeType.VERB_DEDUCTIVE: [
        r"\bconcludes?\b", r"\binfers?\b", r"\bdeduces?\b", r"\breasons?\b",
    ],
    HedgeType.VERB_TENTATIVE: [
        r"\bbelieves?\b", r"\bthinks?\b", r"\bassumes?\b", r"\bsupposes?\b",
        r"\bconsiders?\b", r"\bestimate[sd]?\b", r"\bguess(?:es)?\b",
    ],

    # Adverbs
    HedgeType.ADVERB_PROBABILITY: [
        r"\bprobably\b", r"\bpossibly\b", r"\bperhaps\b", r"\bmaybe\b",
        r"\bpresumably\b", r"\bconceivably\b", r"\blikely\b", r"\bunlikely\b",
    ],
    HedgeType.ADVERB_FREQUENCY: [
        r"\busually\b", r"\boften\b", r"\bsometimes\b", r"\boccasionally\b",
        r"\brarely\b", r"\bseldom\b", r"\bgenerally\b", r"\btypically\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        r"\bapproximately\b", r"\broughly\b", r"\babout\b", r"\baround\b",
        # BUGFIX: "c."/"ca." used to end in \b, but \b after "." only matches
        # when a word character follows — never the case before a space.
        # (?!\w) correctly matches the abbreviation at end-of-token instead.
        r"\bnearly\b", r"\balmost\b", r"\bcirca\b", r"\bc\.(?!\w)", r"\bca\.(?!\w)",
    ],

    # Adjectives
    HedgeType.ADJ_EPISTEMIC: [
        r"\bpossible\b", r"\bprobable\b", r"\blikely\b", r"\buncertain\b",
        r"\bdoubtful\b", r"\bquestionable\b", r"\bplausible\b", r"\bconceivable\b",
    ],
    HedgeType.ADJ_APPROXIMATIVE: [
        r"\bapproximate\b", r"\brough\b", r"\bestimated\b", r"\btentative\b",
    ],

    # Quantifiers
    HedgeType.QUANT_APPROXIMATION: [
        r"\bsome\b", r"\bseveral\b", r"\bmany\b", r"\bfew\b", r"\bmost\b",
    ],
    HedgeType.QUANT_VAGUE: [
        r"\ba\s+number\s+of\b", r"\bvarious\b", r"\bnumerous\b", r"\bcertain\b",
        r"\bcountless\b", r"\bhandful\s+of\b",
    ],

    # Phrases
    HedgeType.PHRASE_ATTRIBUTION: [
        r"\baccording\s+to\b", r"\bbased\s+on\b", r"\bit\s+is\s+said\b",
        r"\breportedly\b", r"\bsources\s+say\b", r"\bsources\s+suggest\b",
    ],
    HedgeType.PHRASE_CONDITIONAL: [
        r"\bif\b(?=\s+(?:true|correct|accurate))", r"\bassuming\s+that\b",
        r"\bprovided\s+that\b", r"\bin\s+case\b",
    ],
    HedgeType.PHRASE_LIMITATION: [
        r"\bto\s+some\s+extent\b", r"\bin\s+some\s+cases\b", r"\bto\s+a\s+degree\b",
        r"\bpartially\b", r"\bin\s+part\b",
    ],

    # Temporal (date-adjacent forms: the pattern requires a following digit)
    HedgeType.TEMPORAL_VAGUE: [
        r"\bcirca\b", r"\bc\.\s*\d", r"\bca\.\s*\d", r"\baround\s+\d",
        r"\babout\s+\d", r"\bapproximately\s+\d",
    ],
    HedgeType.TEMPORAL_RANGE: [
        r"\bbetween\s+\d+\s+and\s+\d+", r"\d+\s*[-–—]\s*\d+",
        r"\d+\s+to\s+\d+",
    ],

    # Source attribution
    HedgeType.SOURCE_HEARSAY: [
        r"\breportedly\b", r"\ballegedly\b", r"\bsupposedly\b", r"\bseemingly\b",
        r"\bapparently\b",
    ],
    HedgeType.SOURCE_TRADITION: [
        r"\btraditionally\b", r"\bby\s+tradition\b", r"\blegend\s+has\s+it\b",
    ],

    # Heritage-specific
    HedgeType.HERITAGE_ATTRIBUTION: [
        r"\battributed\s+to\b", r"\bschool\s+of\b", r"\bcircle\s+of\b",
        r"\bfollower\s+of\b", r"\bworkshop\s+of\b", r"\bmanner\s+of\b",
        r"\bstyle\s+of\b", r"\bafter\b(?=\s+[A-Z])",  # "after Rembrandt"
    ],
    HedgeType.HERITAGE_DATING: [
        r"\bdated\s+to\b", r"\bdating\s+from\b", r"\bdating\s+to\b",
        r"\bbelieved\s+to\s+date\b", r"\bthought\s+to\s+be\s+from\b",
    ],
    HedgeType.HERITAGE_PROVENANCE: [
        r"\bprovenance\s+unclear\b", r"\bprovenance\s+unknown\b",
        r"\borigin\s+unknown\b", r"\borigin\s+uncertain\b",
        r"\bhistory\s+unclear\b",
    ],
}
|
|
|
|
# Dutch hedging patterns
|
|
# Dutch hedging patterns (subset of the English taxonomy).
HEDGE_PATTERNS_NL: Dict[HedgeType, List[str]] = {
    HedgeType.MODAL_POSSIBILITY: [
        r"\bkan\b", r"\bkunnen\b", r"\bzou\b", r"\bzouden\b", r"\bmag\b",
    ],
    HedgeType.ADVERB_PROBABILITY: [
        r"\bwaarschijnlijk\b", r"\bmogelijk\b", r"\bmisschien\b", r"\bvermoedelijk\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        # BUGFIX: trailing \b after "." never matches before a space; use (?!\w).
        r"\bongeveer\b", r"\brond\b", r"\bcirca\b", r"\bc\.(?!\w)", r"\bca\.(?!\w)",
    ],
    HedgeType.HERITAGE_ATTRIBUTION: [
        # BUGFIX: corrected Dutch spelling "toegeschreven" (was "toegschreven",
        # which would never match real catalogue text).
        r"\btoegeschreven\s+aan\b", r"\bschool\s+van\b", r"\bkring\s+van\b",
        r"\bvolgeling\s+van\b", r"\bwerkplaats\s+van\b",
    ],
}
|
|
|
|
# German hedging patterns
|
|
# German hedging patterns (subset of the English taxonomy).
HEDGE_PATTERNS_DE: Dict[HedgeType, List[str]] = {
    HedgeType.MODAL_POSSIBILITY: [
        r"\bkann\b", r"\bkönnte\b", r"\bkönnen\b", r"\bdürfte\b", r"\bmag\b",
    ],
    HedgeType.ADVERB_PROBABILITY: [
        r"\bwahrscheinlich\b", r"\bmöglicherweise\b", r"\bvielleicht\b",
        r"\bvermutlich\b", r"\banscheinend\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        # BUGFIX: trailing \b after "." never matches before a space; use (?!\w).
        r"\bungefähr\b", r"\betwa\b", r"\bcirca\b", r"\bca\.(?!\w)", r"\brund\b",
    ],
    HedgeType.HERITAGE_ATTRIBUTION: [
        r"\bzugeschrieben\b", r"\bSchule\s+von\b", r"\bKreis\s+von\b",
        r"\bWerkstatt\s+von\b", r"\bUmkreis\s+von\b",
    ],
}
|
|
|
|
# French hedging patterns
|
|
# French hedging patterns (subset of the English taxonomy).
HEDGE_PATTERNS_FR: Dict[HedgeType, List[str]] = {
    HedgeType.MODAL_POSSIBILITY: [
        # BUGFIX: removed a duplicated r"\bpourrait\b" entry.
        r"\bpeut\b", r"\bpourrait\b", r"\bpuisse\b",
    ],
    HedgeType.ADVERB_PROBABILITY: [
        r"\bprobablement\b", r"\bpeut-être\b", r"\bsans\s+doute\b",
        r"\bapparemment\b", r"\bvraisemblablement\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        # NOTE(review): "vers" is also a common French preposition ("towards");
        # this may over-trigger outside date contexts — confirm with corpus data.
        r"\benviron\b", r"\bà\s+peu\s+près\b", r"\bcirca\b", r"\bvers\b",
    ],
    HedgeType.HERITAGE_ATTRIBUTION: [
        r"\battribué\s+à\b", r"\bécole\s+de\b", r"\bentourage\s+de\b",
        r"\batelier\s+de\b", r"\bmanière\s+de\b",
    ],
}
|
|
|
|
|
|
# =============================================================================
|
|
# HEDGE DATA STRUCTURES
|
|
# =============================================================================
|
|
|
|
@dataclass
class Hedge:
    """A single detected hedging marker within a piece of text.

    Hashable so instances can be deduplicated in sets; identity is the
    matched text, its type, and its character span.
    """

    text: str  # the hedging expression as matched
    hedge_type: HedgeType  # type classification
    start: int  # start offset in the analyzed text
    end: int  # end offset in the analyzed text
    confidence: float = 1.0  # detection confidence
    dimension: Optional[UncertaintyDimension] = None  # which aspect is uncertain

    def __hash__(self):
        identity = (self.text, self.hedge_type, self.start, self.end)
        return hash(identity)
|
|
|
|
|
|
@dataclass
class UncertaintyAnalysis:
    """Complete uncertainty analysis for a piece of text."""

    text: str  # original text
    level: UncertaintyLevel  # overall uncertainty level
    hedges: List[Hedge] = field(default_factory=list)  # detected hedges
    hedge_density: float = 0.0  # hedges per 100 words
    dimension_scores: Dict[UncertaintyDimension, float] = field(default_factory=dict)

    # Summary statistics
    hedge_count: int = 0
    word_count: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Export the analysis as a plain, JSON-serializable dictionary."""
        serialized_hedges = []
        for hedge in self.hedges:
            serialized_hedges.append({
                "text": hedge.text,
                "type": hedge.hedge_type.value,
                "start": hedge.start,
                "end": hedge.end,
                "confidence": hedge.confidence,
                "dimension": hedge.dimension.value if hedge.dimension else None,
            })

        serialized_dimensions = {}
        for dimension, score in self.dimension_scores.items():
            serialized_dimensions[dimension.value] = score

        return {
            "level": self.level.value,
            "hedges": serialized_hedges,
            "hedge_density": self.hedge_density,
            "dimension_scores": serialized_dimensions,
            "hedge_count": self.hedge_count,
            "word_count": self.word_count,
        }
|
|
|
|
|
|
# =============================================================================
|
|
# UNCERTAINTY DETECTOR
|
|
# =============================================================================
|
|
|
|
class UncertaintyDetector:
    """
    Detector for epistemic uncertainty markers (hedging language).

    Supports multilingual detection (EN, NL, DE, FR). Regex patterns for the
    requested languages are compiled once in ``__init__``; all public methods
    are read-only afterwards.

    Usage:
        >>> detector = UncertaintyDetector(languages=['en', 'nl'])
        >>> result = detector.analyze("The painting is probably from circa 1650.")
        >>> print(result.level)  # UncertaintyLevel.MODERATE
    """

    # Hedge type weights for uncertainty scoring (0.0-1.0, higher = stronger
    # uncertainty signal). Types absent from this map fall back to 0.5 at the
    # lookup sites below, so keep it in sync with HedgeType.
    HEDGE_WEIGHTS: Dict[HedgeType, float] = {
        # High uncertainty (strong hedging)
        HedgeType.HERITAGE_ATTRIBUTION: 0.9,
        HedgeType.HERITAGE_PROVENANCE: 0.9,
        HedgeType.SOURCE_HEARSAY: 0.8,
        HedgeType.ADJ_EPISTEMIC: 0.7,
        HedgeType.MODAL_POSSIBILITY: 0.7,
        HedgeType.VERB_TENTATIVE: 0.7,

        # Moderate uncertainty
        HedgeType.ADVERB_PROBABILITY: 0.6,
        HedgeType.VERB_SPECULATIVE: 0.6,
        HedgeType.PHRASE_ATTRIBUTION: 0.5,
        HedgeType.HERITAGE_DATING: 0.5,
        HedgeType.SOURCE_TRADITION: 0.5,

        # Low uncertainty (mild hedging)
        HedgeType.ADVERB_APPROXIMATION: 0.3,
        HedgeType.TEMPORAL_VAGUE: 0.3,
        HedgeType.TEMPORAL_RANGE: 0.2,
        HedgeType.QUANT_APPROXIMATION: 0.2,
        HedgeType.QUANT_VAGUE: 0.2,
        HedgeType.ADJ_APPROXIMATIVE: 0.3,

        # Minimal uncertainty
        HedgeType.ADVERB_FREQUENCY: 0.1,
        HedgeType.MODAL_PROBABILITY: 0.4,
        HedgeType.MODAL_NECESSITY: 0.3,
        HedgeType.VERB_DEDUCTIVE: 0.2,
        HedgeType.PHRASE_CONDITIONAL: 0.4,
        HedgeType.PHRASE_LIMITATION: 0.3,
    }

    # Context patterns used to guess WHICH aspect a hedge makes uncertain.
    # NOTE(review): IDENTITY and INTERPRETATION have no patterns here, so
    # _infer_dimension can never return them — confirm whether that is intended.
    DIMENSION_PATTERNS: Dict[UncertaintyDimension, List[str]] = {
        UncertaintyDimension.ATTRIBUTION: [
            r"(?:painted|created|made|authored|by|attributed)\s+(?:by|to)",
            r"(?:artist|painter|author|creator)",
        ],
        UncertaintyDimension.DATING: [
            r"(?:date|dated|dating|century|period|era|year)",
            r"\d{3,4}",  # Years
        ],
        UncertaintyDimension.LOCATION: [
            r"(?:location|located|place|where|from|origin)",
            r"(?:city|country|region|museum|gallery)",
        ],
        UncertaintyDimension.QUANTITY: [
            r"(?:number|count|amount|many|few|several)",
            r"(?:pieces|items|works|objects)",
        ],
        UncertaintyDimension.PROVENANCE: [
            r"(?:provenance|ownership|history|collection)",
            r"(?:acquired|purchased|donated|inherited)",
        ],
        UncertaintyDimension.AUTHENTICITY: [
            r"(?:authentic|genuine|original|fake|forgery|copy)",
            r"(?:attribution|school of|workshop)",
        ],
    }

    def __init__(
        self,
        languages: Optional[List[str]] = None,
        custom_patterns: Optional[Dict[HedgeType, List[str]]] = None,
    ) -> None:
        """
        Initialize detector and compile all regex patterns.

        Args:
            languages: List of language codes to detect (default: ['en']).
                Unknown codes are silently ignored by _compile_patterns.
            custom_patterns: Additional custom regex patterns by hedge type,
                merged on top of the language pattern sets.
        """
        self.languages = languages or ['en']
        # hedge type -> list of compiled, case-insensitive regexes
        self._patterns: Dict[HedgeType, List[re.Pattern]] = {}
        self._compile_patterns(custom_patterns)

    def _compile_patterns(self, custom_patterns: Optional[Dict[HedgeType, List[str]]]) -> None:
        """Merge pattern sources for the selected languages and compile them."""
        # Combine patterns from all specified languages
        all_patterns: Dict[HedgeType, List[str]] = {}

        pattern_sources = {
            'en': HEDGE_PATTERNS_EN,
            'nl': HEDGE_PATTERNS_NL,
            'de': HEDGE_PATTERNS_DE,
            'fr': HEDGE_PATTERNS_FR,
        }

        for lang in self.languages:
            # Unknown language codes are skipped without warning.
            if lang in pattern_sources:
                for hedge_type, patterns in pattern_sources[lang].items():
                    if hedge_type not in all_patterns:
                        all_patterns[hedge_type] = []
                    all_patterns[hedge_type].extend(patterns)

        # Add custom patterns (appended after language patterns)
        if custom_patterns:
            for hedge_type, patterns in custom_patterns.items():
                if hedge_type not in all_patterns:
                    all_patterns[hedge_type] = []
                all_patterns[hedge_type].extend(patterns)

        # Compile all patterns once; all matching is case-insensitive.
        for hedge_type, patterns in all_patterns.items():
            self._patterns[hedge_type] = [
                re.compile(p, re.IGNORECASE) for p in patterns
            ]

    def detect_hedges(self, text: str) -> List[Hedge]:
        """
        Detect hedging markers in text.

        Args:
            text: Text to analyze

        Returns:
            List of detected Hedge objects, sorted by start offset.
        """
        hedges: List[Hedge] = []
        seen_spans: Set[Tuple[int, int]] = set()

        for hedge_type, patterns in self._patterns.items():
            for pattern in patterns:
                for match in pattern.finditer(text):
                    span = (match.start(), match.end())

                    # Skip exact-duplicate spans only. Partially overlapping
                    # matches from different patterns are all kept, and when
                    # two types match the same span the winner is whichever
                    # hedge type comes first in dict insertion order.
                    if span not in seen_spans:
                        seen_spans.add(span)

                        # Infer dimension from context
                        dimension = self._infer_dimension(text, match.start(), match.end())

                        hedges.append(Hedge(
                            text=match.group(),
                            hedge_type=hedge_type,
                            start=match.start(),
                            end=match.end(),
                            confidence=1.0,
                            dimension=dimension,
                        ))

        # Sort by position
        hedges.sort(key=lambda h: h.start)
        return hedges

    def _infer_dimension(self, text: str, start: int, end: int) -> Optional[UncertaintyDimension]:
        """Infer what dimension is uncertain from surrounding context.

        Returns the first dimension (in DIMENSION_PATTERNS insertion order)
        whose context pattern matches, or None if nothing matches.
        """
        # Get context window (50 chars before and after the hedge span)
        context_start = max(0, start - 50)
        context_end = min(len(text), end + 50)
        context = text[context_start:context_end].lower()

        for dimension, patterns in self.DIMENSION_PATTERNS.items():
            for pattern in patterns:
                # IGNORECASE is redundant here (context is lowered) but harmless.
                if re.search(pattern, context, re.IGNORECASE):
                    return dimension

        return None

    def _calculate_uncertainty_level(
        self,
        hedges: List[Hedge],
        word_count: int,
    ) -> UncertaintyLevel:
        """
        Calculate overall uncertainty level from hedges.

        Uses weighted scoring based on hedge types, normalized per 100 words,
        combined with a count of "strong" hedges (weight >= 0.7).
        """
        if not hedges or word_count == 0:
            return UncertaintyLevel.CERTAIN

        # Calculate weighted score (unknown types default to 0.5)
        total_weight = sum(
            self.HEDGE_WEIGHTS.get(h.hedge_type, 0.5)
            for h in hedges
        )

        # Normalize by word count (per 100 words)
        normalized_score = (total_weight / word_count) * 100

        # Also consider absolute number of high-weight hedges
        high_weight_count = sum(
            1 for h in hedges
            if self.HEDGE_WEIGHTS.get(h.hedge_type, 0) >= 0.7
        )

        # Determine level. Thresholds were presumably tuned empirically —
        # treat them as a unit when changing.
        if normalized_score < 0.5 and high_weight_count == 0:
            return UncertaintyLevel.CERTAIN
        elif normalized_score < 2.0 and high_weight_count == 0:
            return UncertaintyLevel.NEAR_CERTAIN
        elif normalized_score < 5.0 and high_weight_count <= 1:
            return UncertaintyLevel.MODERATE
        # NOTE(review): because of the `or` below, HIGHLY_UNCERTAIN is only
        # reached when BOTH score >= 10 AND more than 2 strong hedges occur;
        # a very hedge-dense text with few strong hedges stays UNCERTAIN.
        # Confirm `or` (vs `and`) is the intended semantics.
        elif normalized_score < 10.0 or high_weight_count <= 2:
            return UncertaintyLevel.UNCERTAIN
        else:
            return UncertaintyLevel.HIGHLY_UNCERTAIN

    def analyze(self, text: str) -> UncertaintyAnalysis:
        """
        Perform complete uncertainty analysis on text.

        Args:
            text: Text to analyze

        Returns:
            UncertaintyAnalysis with detected hedges, hedge density
            (hedges per 100 words), per-dimension weight totals, and the
            overall uncertainty level.
        """
        # Detect hedges
        hedges = self.detect_hedges(text)

        # Count words (simple whitespace tokenization)
        word_count = len(text.split())

        # Calculate hedge density (hedges per 100 words)
        hedge_density = (len(hedges) / word_count * 100) if word_count > 0 else 0.0

        # Calculate dimension scores: sum of hedge weights per inferred
        # dimension (hedges with no inferred dimension are not counted).
        dimension_scores: Dict[UncertaintyDimension, float] = {}
        for hedge in hedges:
            if hedge.dimension:
                weight = self.HEDGE_WEIGHTS.get(hedge.hedge_type, 0.5)
                dimension_scores[hedge.dimension] = dimension_scores.get(
                    hedge.dimension, 0.0
                ) + weight

        # Calculate overall level
        level = self._calculate_uncertainty_level(hedges, word_count)

        return UncertaintyAnalysis(
            text=text,
            level=level,
            hedges=hedges,
            hedge_density=hedge_density,
            dimension_scores=dimension_scores,
            hedge_count=len(hedges),
            word_count=word_count,
        )

    def get_claim_confidence(
        self,
        claim_text: str,
        context: Optional[str] = None,
    ) -> float:
        """
        Calculate confidence score for a claim based on hedging.

        Returns a value between 0.0 (highly uncertain) and 1.0 (certain).
        Hedges in the claim and (if given) in the context are pooled, and
        their summed weights are converted into a linear penalty that
        saturates at a total weight of 2.0.

        Args:
            claim_text: The claim text to analyze
            context: Optional surrounding context

        Returns:
            Confidence score (0.0-1.0)
        """
        # Analyze claim text
        analysis = self.analyze(claim_text)

        # Also analyze context if provided
        if context:
            context_analysis = self.analyze(context)
            # Combine hedges from both; hedges appearing in both strings are
            # counted twice, which strengthens the penalty.
            all_hedges = analysis.hedges + context_analysis.hedges
        else:
            all_hedges = analysis.hedges

        if not all_hedges:
            return 1.0  # No hedging = certain

        # Calculate penalty based on hedge weights
        total_penalty = sum(
            self.HEDGE_WEIGHTS.get(h.hedge_type, 0.5)
            for h in all_hedges
        )

        # Normalize: max penalty of 1.0 (fully uncertain)
        penalty = min(1.0, total_penalty / 2.0)  # 2.0 hedge weight = 0 confidence

        return max(0.0, 1.0 - penalty)

    def annotate_entities_with_uncertainty(
        self,
        entities: List[Dict[str, Any]],
        text: str,
    ) -> List[Dict[str, Any]]:
        """
        Annotate extracted entities with uncertainty information.

        Mutates each entity dict in place, adding an 'uncertainty' key, and
        returns the same list.

        Args:
            entities: List of extracted entities (dicts with text, start, end)
            text: Full document text

        Returns:
            Entities with added uncertainty fields
        """
        analysis = self.analyze(text)

        for entity in entities:
            entity_start = entity.get('start', 0)
            entity_end = entity.get('end', len(text))
            entity_text = entity.get('text', '')

            # Find hedges within 50 characters of the entity's start or end
            nearby_hedges = [
                h for h in analysis.hedges
                if abs(h.start - entity_start) < 50 or abs(h.end - entity_end) < 50
            ]

            # Calculate entity-specific confidence
            if nearby_hedges:
                entity['uncertainty'] = {
                    'hedges': [
                        {'text': h.text, 'type': h.hedge_type.value}
                        for h in nearby_hedges
                    ],
                    # NOTE(review): the FULL document is passed as context, so
                    # hedges anywhere in the document depress this confidence,
                    # not just hedges near the entity — confirm this is intended.
                    'confidence': self.get_claim_confidence(entity_text, text),
                    # `or 1` guards against a zero word count for empty text.
                    'level': self._calculate_uncertainty_level(
                        nearby_hedges,
                        len(entity_text.split()) or 1
                    ).value,
                }
            else:
                entity['uncertainty'] = {
                    'hedges': [],
                    'confidence': 1.0,
                    'level': UncertaintyLevel.CERTAIN.value,
                }

        return entities
|
|
|
|
|
|
# =============================================================================
|
|
# CONVENIENCE FUNCTIONS
|
|
# =============================================================================
|
|
|
|
def detect_hedges(text: str, languages: Optional[List[str]] = None) -> List[Hedge]:
    """
    Detect hedging markers in *text* using a one-shot detector.

    Args:
        text: Text to analyze.
        languages: Language codes (default: ['en']).

    Returns:
        List of Hedge objects, sorted by start offset.
    """
    return UncertaintyDetector(languages=languages).detect_hedges(text)
|
|
|
|
|
|
def analyze_uncertainty(text: str, languages: Optional[List[str]] = None) -> UncertaintyAnalysis:
    """
    Perform a complete uncertainty analysis with a one-shot detector.

    Args:
        text: Text to analyze.
        languages: Language codes (default: ['en']).

    Returns:
        UncertaintyAnalysis for *text*.
    """
    return UncertaintyDetector(languages=languages).analyze(text)
|
|
|
|
|
|
def get_confidence_score(claim: str, context: Optional[str] = None) -> float:
    """
    Score a claim's confidence from its hedging language (English patterns).

    Args:
        claim: Claim text.
        context: Optional surrounding context.

    Returns:
        Confidence score between 0.0 (highly uncertain) and 1.0 (certain).
    """
    return UncertaintyDetector().get_claim_confidence(claim, context)
|
|
|
|
|
|
# Public API of this module (controls `from ... import *` and documents intent).
__all__ = [
    # Enums
    "HedgeType",
    "UncertaintyLevel",
    "UncertaintyDimension",
    # Data structures
    "Hedge",
    "UncertaintyAnalysis",
    # Main class
    "UncertaintyDetector",
    # Convenience functions
    "detect_hedges",
    "analyze_uncertainty",
    "get_confidence_score",
]
|