glam/src/glam_extractor/annotators/uncertainty.py
2025-12-05 15:30:23 +01:00

750 lines
26 KiB
Python

"""
Uncertainty Detection Module for GLAM-NER.
This module provides detection and classification of epistemic uncertainty
markers (hedging language) in heritage institution documents.
Based on linguistic research on hedging and epistemic modality:
- Hyland (1998) - Hedging in Scientific Research Articles
- Lakoff (1972) - Hedges: A Study in Meaning Criteria
- Holmes (1988) - Doubt and Certainty in ESL Textbooks
Usage:
>>> from glam_extractor.annotators.uncertainty import (
... UncertaintyDetector,
... HedgeType,
... UncertaintyLevel,
... )
>>>
>>> detector = UncertaintyDetector()
>>> result = detector.analyze("The painting is probably from the 17th century.")
>>> print(result.level) # UncertaintyLevel.MODERATE
>>> print(result.hedges) # [Hedge(text='probably', type=HedgeType.PROBABILITY)]
"""
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Tuple
# =============================================================================
# HEDGE TYPES (Epistemic Markers)
# =============================================================================
class HedgeType(str, Enum):
    """
    Classification of epistemic uncertainty markers (hedges).

    Taxonomy based on Hyland (1998), adapted for the heritage domain with
    extra categories for attribution, dating, and provenance.  The string
    values are stable codes that appear in serialized output
    (``Hedge``/``UncertaintyAnalysis.to_dict``), so they must not be renamed.
    """
    # --- Modal auxiliaries ---
    MODAL_POSSIBILITY = "MODAL.POSSIBILITY"  # may, might, could
    MODAL_PROBABILITY = "MODAL.PROBABILITY"  # should, would, will (speculative)
    MODAL_NECESSITY = "MODAL.NECESSITY"  # must (epistemic), need
    # --- Lexical verbs ---
    VERB_SPECULATIVE = "VERB.SPECULATIVE"  # suggest, indicate, appear, seem
    VERB_DEDUCTIVE = "VERB.DEDUCTIVE"  # conclude, infer, deduce
    VERB_TENTATIVE = "VERB.TENTATIVE"  # believe, think, assume, suppose
    # --- Adverbs ---
    ADVERB_PROBABILITY = "ADV.PROBABILITY"  # probably, possibly, perhaps, maybe
    ADVERB_FREQUENCY = "ADV.FREQUENCY"  # often, sometimes, occasionally
    ADVERB_APPROXIMATION = "ADV.APPROX"  # approximately, roughly, about
    # --- Adjectives ---
    ADJ_EPISTEMIC = "ADJ.EPISTEMIC"  # possible, probable, likely, uncertain
    ADJ_APPROXIMATIVE = "ADJ.APPROX"  # approximate, rough, estimated
    # --- Quantifiers (shields) ---
    QUANT_APPROXIMATION = "QUANT.APPROX"  # some, several, many, few
    QUANT_VAGUE = "QUANT.VAGUE"  # a number of, various, numerous
    # --- Phrases ---
    PHRASE_ATTRIBUTION = "PHRASE.ATTR"  # according to, based on, it is said
    PHRASE_CONDITIONAL = "PHRASE.COND"  # if, in case, assuming that
    PHRASE_LIMITATION = "PHRASE.LIMIT"  # to some extent, in some cases
    # --- Temporal uncertainty ---
    TEMPORAL_VAGUE = "TMP.VAGUE"  # around, circa, approximately (dates)
    TEMPORAL_RANGE = "TMP.RANGE"  # between X and Y, X to Y
    # --- Source attribution ---
    SOURCE_HEARSAY = "SRC.HEARSAY"  # reportedly, allegedly, supposedly
    SOURCE_TRADITION = "SRC.TRADITION"  # traditionally, by tradition
    # --- Heritage-specific ---
    HERITAGE_ATTRIBUTION = "HER.ATTR"  # attributed to, school of, circle of
    HERITAGE_DATING = "HER.DATE"  # dated to, dating from (uncertain)
    HERITAGE_PROVENANCE = "HER.PROV"  # provenance unclear, origin unknown
class UncertaintyLevel(str, Enum):
    """
    Overall uncertainty level of a claim or statement.

    Ordinal scale from CERTAIN to HIGHLY_UNCERTAIN.  The thresholds that
    map hedge scores onto these levels live in
    ``UncertaintyDetector._calculate_uncertainty_level``.
    """
    CERTAIN = "CERTAIN"  # No hedging detected
    NEAR_CERTAIN = "NEAR_CERTAIN"  # Minor hedging (approximately, circa)
    MODERATE = "MODERATE"  # Moderate hedging (probably, likely)
    UNCERTAIN = "UNCERTAIN"  # Significant hedging (possibly, might)
    HIGHLY_UNCERTAIN = "HIGHLY_UNCERTAIN"  # Strong hedging (attributed to, uncertain)
class UncertaintyDimension(str, Enum):
    """
    Dimension of uncertainty: WHICH aspect of a statement is uncertain.

    NOTE(review): only six of these eight members have context patterns in
    ``UncertaintyDetector.DIMENSION_PATTERNS`` — IDENTITY and INTERPRETATION
    are declared but never inferred by the detector as written.
    """
    ATTRIBUTION = "ATTRIBUTION"  # Who created/did something
    DATING = "DATING"  # When something happened
    LOCATION = "LOCATION"  # Where something is/was
    IDENTITY = "IDENTITY"  # What something is
    QUANTITY = "QUANTITY"  # How many/much
    PROVENANCE = "PROVENANCE"  # Origin/history
    AUTHENTICITY = "AUTHENTICITY"  # Whether genuine
    INTERPRETATION = "INTERPRETATION"  # Meaning/significance
# =============================================================================
# HEDGE PATTERNS (Multilingual)
# =============================================================================
# English hedging patterns
# English hedging patterns.
# All patterns are compiled with re.IGNORECASE by UncertaintyDetector.
HEDGE_PATTERNS_EN: Dict[HedgeType, List[str]] = {
    # Modal auxiliaries
    HedgeType.MODAL_POSSIBILITY: [
        r"\bmay\b", r"\bmight\b", r"\bcould\b", r"\bcan\b(?!\s+not)",
    ],
    HedgeType.MODAL_PROBABILITY: [
        r"\bshould\b", r"\bwould\b", r"\bwill\b(?=\s+(?:probably|likely|perhaps))",
    ],
    HedgeType.MODAL_NECESSITY: [
        r"\bmust\b(?=\s+(?:have|be))", r"\bneed(?:s)?\s+to\b",
    ],
    # Lexical verbs
    HedgeType.VERB_SPECULATIVE: [
        r"\bsuggests?\b", r"\bindicates?\b", r"\bappears?\b", r"\bseems?\b",
        r"\bimplies?\b", r"\bpoints?\s+to\b",
    ],
    HedgeType.VERB_DEDUCTIVE: [
        r"\bconcludes?\b", r"\binfers?\b", r"\bdeduces?\b", r"\breasons?\b",
    ],
    HedgeType.VERB_TENTATIVE: [
        r"\bbelieves?\b", r"\bthinks?\b", r"\bassumes?\b", r"\bsupposes?\b",
        r"\bconsiders?\b", r"\bestimate[sd]?\b", r"\bguess(?:es)?\b",
    ],
    # Adverbs
    HedgeType.ADVERB_PROBABILITY: [
        r"\bprobably\b", r"\bpossibly\b", r"\bperhaps\b", r"\bmaybe\b",
        r"\bpresumably\b", r"\bconceivably\b", r"\blikely\b", r"\bunlikely\b",
    ],
    HedgeType.ADVERB_FREQUENCY: [
        r"\busually\b", r"\boften\b", r"\bsometimes\b", r"\boccasionally\b",
        r"\brarely\b", r"\bseldom\b", r"\bgenerally\b", r"\btypically\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        r"\bapproximately\b", r"\broughly\b", r"\babout\b", r"\baround\b",
        r"\bnearly\b", r"\balmost\b", r"\bcirca\b",
        # FIX: the old r"\bc\.\b" / r"\bca\.\b" never matched "c. 1650" —
        # after a literal dot, \b only succeeds when the NEXT character is
        # a word character, so the common "abbreviation + space" form
        # failed.  Lookaheads make the intent explicit instead.
        r"\bc\.(?=\s|\d|$)", r"\bca\.(?=\s|\d|$)",
    ],
    # Adjectives
    HedgeType.ADJ_EPISTEMIC: [
        r"\bpossible\b", r"\bprobable\b", r"\blikely\b", r"\buncertain\b",
        r"\bdoubtful\b", r"\bquestionable\b", r"\bplausible\b", r"\bconceivable\b",
    ],
    HedgeType.ADJ_APPROXIMATIVE: [
        r"\bapproximate\b", r"\brough\b", r"\bestimated\b", r"\btentative\b",
    ],
    # Quantifiers
    HedgeType.QUANT_APPROXIMATION: [
        r"\bsome\b", r"\bseveral\b", r"\bmany\b", r"\bfew\b", r"\bmost\b",
    ],
    HedgeType.QUANT_VAGUE: [
        # NOTE(review): "certain" as a vague quantifier ("certain works")
        # also fires on epistemic uses ("it is certain that ..."), which is
        # the opposite of a hedge — consider a context-sensitive pattern.
        r"\ba\s+number\s+of\b", r"\bvarious\b", r"\bnumerous\b", r"\bcertain\b",
        r"\bcountless\b", r"\bhandful\s+of\b",
    ],
    # Phrases
    HedgeType.PHRASE_ATTRIBUTION: [
        r"\baccording\s+to\b", r"\bbased\s+on\b", r"\bit\s+is\s+said\b",
        r"\breportedly\b", r"\bsources\s+say\b", r"\bsources\s+suggest\b",
    ],
    HedgeType.PHRASE_CONDITIONAL: [
        r"\bif\b(?=\s+(?:true|correct|accurate))", r"\bassuming\s+that\b",
        r"\bprovided\s+that\b", r"\bin\s+case\b",
    ],
    HedgeType.PHRASE_LIMITATION: [
        r"\bto\s+some\s+extent\b", r"\bin\s+some\s+cases\b", r"\bto\s+a\s+degree\b",
        r"\bpartially\b", r"\bin\s+part\b",
    ],
    # Temporal
    HedgeType.TEMPORAL_VAGUE: [
        r"\bcirca\b", r"\bc\.\s*\d", r"\bca\.\s*\d", r"\baround\s+\d",
        r"\babout\s+\d", r"\bapproximately\s+\d",
    ],
    HedgeType.TEMPORAL_RANGE: [
        r"\bbetween\s+\d+\s+and\s+\d+", r"\d+\s*[-–—]\s*\d+",
        r"\d+\s+to\s+\d+",
    ],
    # Source attribution
    HedgeType.SOURCE_HEARSAY: [
        r"\breportedly\b", r"\ballegedly\b", r"\bsupposedly\b", r"\bseemingly\b",
        r"\bapparently\b",
    ],
    HedgeType.SOURCE_TRADITION: [
        r"\btraditionally\b", r"\bby\s+tradition\b", r"\blegend\s+has\s+it\b",
    ],
    # Heritage-specific
    HedgeType.HERITAGE_ATTRIBUTION: [
        r"\battributed\s+to\b", r"\bschool\s+of\b", r"\bcircle\s+of\b",
        r"\bfollower\s+of\b", r"\bworkshop\s+of\b", r"\bmanner\s+of\b",
        # FIX: because the detector compiles everything with re.IGNORECASE,
        # the old lookahead (?=\s+[A-Z]) also matched lowercase letters, so
        # plain "after the" was tagged as a strong attribution hedge.  The
        # scoped (?-i:...) group restores the capital-letter requirement
        # ("after Rembrandt").
        r"\bstyle\s+of\b", r"\bafter\b(?=\s+(?-i:[A-Z]))",
    ],
    HedgeType.HERITAGE_DATING: [
        r"\bdated\s+to\b", r"\bdating\s+from\b", r"\bdating\s+to\b",
        r"\bbelieved\s+to\s+date\b", r"\bthought\s+to\s+be\s+from\b",
    ],
    HedgeType.HERITAGE_PROVENANCE: [
        r"\bprovenance\s+unclear\b", r"\bprovenance\s+unknown\b",
        r"\borigin\s+unknown\b", r"\borigin\s+uncertain\b",
        r"\bhistory\s+unclear\b",
    ],
}
# Dutch hedging patterns
# Dutch hedging patterns (compiled with re.IGNORECASE by the detector).
HEDGE_PATTERNS_NL: Dict[HedgeType, List[str]] = {
    HedgeType.MODAL_POSSIBILITY: [
        r"\bkan\b", r"\bkunnen\b", r"\bzou\b", r"\bzouden\b", r"\bmag\b",
    ],
    HedgeType.ADVERB_PROBABILITY: [
        r"\bwaarschijnlijk\b", r"\bmogelijk\b", r"\bmisschien\b", r"\bvermoedelijk\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        r"\bongeveer\b", r"\brond\b", r"\bcirca\b",
        # FIX: r"\bc\.\b" / r"\bca\.\b" never matched "ca. 1650" — \b after
        # a literal dot requires the next char to be a word char, so the
        # "abbreviation + space" form failed.  Lookaheads fix this.
        r"\bc\.(?=\s|\d|$)", r"\bca\.(?=\s|\d|$)",
    ],
    HedgeType.HERITAGE_ATTRIBUTION: [
        # FIX: "toegschreven" was a typo (missing 'e'); the correct Dutch
        # past participle is "toegeschreven", so the old pattern could
        # never match real text.
        r"\btoegeschreven\s+aan\b", r"\bschool\s+van\b", r"\bkring\s+van\b",
        r"\bvolgeling\s+van\b", r"\bwerkplaats\s+van\b",
    ],
}
# German hedging patterns
# German hedging patterns (compiled with re.IGNORECASE by the detector).
HEDGE_PATTERNS_DE: Dict[HedgeType, List[str]] = {
    HedgeType.MODAL_POSSIBILITY: [
        r"\bkann\b", r"\bkönnte\b", r"\bkönnen\b", r"\bdürfte\b", r"\bmag\b",
    ],
    HedgeType.ADVERB_PROBABILITY: [
        r"\bwahrscheinlich\b", r"\bmöglicherweise\b", r"\bvielleicht\b",
        r"\bvermutlich\b", r"\banscheinend\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        r"\bungefähr\b", r"\betwa\b", r"\bcirca\b",
        # FIX: r"\bca\.\b" never matched "ca. 1650" — \b after a literal
        # dot requires the next char to be a word char, so the common
        # "abbreviation + space" form failed.  A lookahead fixes this.
        r"\bca\.(?=\s|\d|$)", r"\brund\b",
    ],
    HedgeType.HERITAGE_ATTRIBUTION: [
        r"\bzugeschrieben\b", r"\bSchule\s+von\b", r"\bKreis\s+von\b",
        r"\bWerkstatt\s+von\b", r"\bUmkreis\s+von\b",
    ],
}
# French hedging patterns
# French hedging patterns (compiled with re.IGNORECASE by the detector).
HEDGE_PATTERNS_FR: Dict[HedgeType, List[str]] = {
    HedgeType.MODAL_POSSIBILITY: [
        # FIX: removed a duplicated r"\bpourrait\b" entry (same pattern
        # listed twice produced redundant scans; span dedup hid it).
        r"\bpeut\b", r"\bpourrait\b", r"\bpuisse\b",
    ],
    HedgeType.ADVERB_PROBABILITY: [
        r"\bprobablement\b", r"\bpeut-être\b", r"\bsans\s+doute\b",
        r"\bapparemment\b", r"\bvraisemblablement\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        # FIX: r"\\s+peu\s+près\b" was mangled — it matched a literal
        # backslash followed by "s+peu", never real text.  The intended
        # expression is the French hedge "à peu près".
        r"\benviron\b", r"\bà\s+peu\s+près\b", r"\bcirca\b", r"\bvers\b",
    ],
    HedgeType.HERITAGE_ATTRIBUTION: [
        r"\battribué\s+à\b", r"\bécole\s+de\b", r"\bentourage\s+de\b",
        r"\batelier\s+de\b", r"\bmanière\s+de\b",
    ],
}
# =============================================================================
# HEDGE DATA STRUCTURES
# =============================================================================
@dataclass
class Hedge:
    """
    A single hedging marker detected in text.

    Offsets are character positions into the analyzed string (``end`` is
    exclusive, matching ``re.Match.start()``/``end()``).
    """
    text: str  # the matched hedging expression, as it appears in the text
    hedge_type: HedgeType  # taxonomy classification
    start: int  # start offset in the analyzed text
    end: int  # end offset (exclusive) in the analyzed text
    confidence: float = 1.0  # detection confidence; regex matches use 1.0
    dimension: Optional[UncertaintyDimension] = None  # uncertain aspect, if inferable

    def __hash__(self):
        # Hash only the identifying fields (span + type) so that two
        # detections of the same marker collide in a set.  NOTE(review):
        # the dataclass-generated __eq__ compares ALL fields, so equal
        # hashes can pair with unequal objects — legal, but worth knowing.
        return hash((self.text, self.hedge_type, self.start, self.end))
@dataclass
class UncertaintyAnalysis:
    """
    Result of a complete uncertainty analysis over one text.

    Carries the original text, the aggregated uncertainty level, every
    detected hedge, and summary statistics (density per 100 words,
    per-dimension weighted scores, raw counts).
    """
    text: str
    level: UncertaintyLevel
    hedges: List[Hedge] = field(default_factory=list)
    hedge_density: float = 0.0
    dimension_scores: Dict[UncertaintyDimension, float] = field(default_factory=dict)
    hedge_count: int = 0
    word_count: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the analysis into a plain, JSON-friendly dictionary."""
        def _serialize(hedge: Hedge) -> Dict[str, Any]:
            # Enum members are flattened to their string values; a missing
            # dimension becomes None.
            return {
                "text": hedge.text,
                "type": hedge.hedge_type.value,
                "start": hedge.start,
                "end": hedge.end,
                "confidence": hedge.confidence,
                "dimension": hedge.dimension.value if hedge.dimension else None,
            }

        return {
            "level": self.level.value,
            "hedges": [_serialize(h) for h in self.hedges],
            "hedge_density": self.hedge_density,
            "dimension_scores": {
                dim.value: score for dim, score in self.dimension_scores.items()
            },
            "hedge_count": self.hedge_count,
            "word_count": self.word_count,
        }
# =============================================================================
# UNCERTAINTY DETECTOR
# =============================================================================
class UncertaintyDetector:
    """
    Detector for epistemic uncertainty markers (hedging language).

    Supports multilingual detection (EN, NL, DE, FR); patterns from all
    requested languages are merged and applied together.

    Usage:
        >>> detector = UncertaintyDetector(languages=['en', 'nl'])
        >>> result = detector.analyze("The painting is probably from circa 1650.")
        >>> print(result.level)  # UncertaintyLevel.MODERATE
    """

    # Per-type weights used both for the overall uncertainty level and for
    # per-claim confidence penalties.  Types missing from this table fall
    # back to 0.5 via .get(..., 0.5) below.
    HEDGE_WEIGHTS: Dict[HedgeType, float] = {
        # High uncertainty (strong hedging)
        HedgeType.HERITAGE_ATTRIBUTION: 0.9,
        HedgeType.HERITAGE_PROVENANCE: 0.9,
        HedgeType.SOURCE_HEARSAY: 0.8,
        HedgeType.ADJ_EPISTEMIC: 0.7,
        HedgeType.MODAL_POSSIBILITY: 0.7,
        HedgeType.VERB_TENTATIVE: 0.7,
        # Moderate uncertainty
        HedgeType.ADVERB_PROBABILITY: 0.6,
        HedgeType.VERB_SPECULATIVE: 0.6,
        HedgeType.PHRASE_ATTRIBUTION: 0.5,
        HedgeType.HERITAGE_DATING: 0.5,
        HedgeType.SOURCE_TRADITION: 0.5,
        # Low uncertainty (mild hedging)
        HedgeType.ADVERB_APPROXIMATION: 0.3,
        HedgeType.TEMPORAL_VAGUE: 0.3,
        HedgeType.TEMPORAL_RANGE: 0.2,
        HedgeType.QUANT_APPROXIMATION: 0.2,
        HedgeType.QUANT_VAGUE: 0.2,
        HedgeType.ADJ_APPROXIMATIVE: 0.3,
        # Minimal uncertainty
        HedgeType.ADVERB_FREQUENCY: 0.1,
        HedgeType.MODAL_PROBABILITY: 0.4,
        HedgeType.MODAL_NECESSITY: 0.3,
        HedgeType.VERB_DEDUCTIVE: 0.2,
        HedgeType.PHRASE_CONDITIONAL: 0.4,
        HedgeType.PHRASE_LIMITATION: 0.3,
    }

    # Context patterns used to guess WHICH aspect of a statement is
    # uncertain.  NOTE(review): only six of the eight UncertaintyDimension
    # members appear here — IDENTITY and INTERPRETATION are never inferred.
    DIMENSION_PATTERNS: Dict[UncertaintyDimension, List[str]] = {
        UncertaintyDimension.ATTRIBUTION: [
            r"(?:painted|created|made|authored|by|attributed)\s+(?:by|to)",
            r"(?:artist|painter|author|creator)",
        ],
        UncertaintyDimension.DATING: [
            r"(?:date|dated|dating|century|period|era|year)",
            r"\d{3,4}",  # Years
        ],
        UncertaintyDimension.LOCATION: [
            r"(?:location|located|place|where|from|origin)",
            r"(?:city|country|region|museum|gallery)",
        ],
        UncertaintyDimension.QUANTITY: [
            r"(?:number|count|amount|many|few|several)",
            r"(?:pieces|items|works|objects)",
        ],
        UncertaintyDimension.PROVENANCE: [
            r"(?:provenance|ownership|history|collection)",
            r"(?:acquired|purchased|donated|inherited)",
        ],
        UncertaintyDimension.AUTHENTICITY: [
            r"(?:authentic|genuine|original|fake|forgery|copy)",
            r"(?:attribution|school of|workshop)",
        ],
    }

    def __init__(
        self,
        languages: Optional[List[str]] = None,
        custom_patterns: Optional[Dict[HedgeType, List[str]]] = None,
    ) -> None:
        """
        Initialize detector.

        Args:
            languages: List of language codes to detect (default: ['en']).
                Unknown codes are silently ignored by _compile_patterns.
            custom_patterns: Additional custom regex patterns by hedge type,
                merged after the built-in language patterns.
        """
        self.languages = languages or ['en']
        # hedge type -> compiled regexes, populated by _compile_patterns
        self._patterns: Dict[HedgeType, List[re.Pattern]] = {}
        self._compile_patterns(custom_patterns)

    def _compile_patterns(self, custom_patterns: Optional[Dict[HedgeType, List[str]]]) -> None:
        """Merge patterns from all requested languages (plus custom ones)
        and compile them case-insensitively into self._patterns."""
        # Combine patterns from all specified languages
        all_patterns: Dict[HedgeType, List[str]] = {}
        pattern_sources = {
            'en': HEDGE_PATTERNS_EN,
            'nl': HEDGE_PATTERNS_NL,
            'de': HEDGE_PATTERNS_DE,
            'fr': HEDGE_PATTERNS_FR,
        }
        for lang in self.languages:
            # Unsupported language codes are skipped without error.
            if lang in pattern_sources:
                for hedge_type, patterns in pattern_sources[lang].items():
                    if hedge_type not in all_patterns:
                        all_patterns[hedge_type] = []
                    all_patterns[hedge_type].extend(patterns)
        # Add custom patterns
        if custom_patterns:
            for hedge_type, patterns in custom_patterns.items():
                if hedge_type not in all_patterns:
                    all_patterns[hedge_type] = []
                all_patterns[hedge_type].extend(patterns)
        # Compile all patterns.  IGNORECASE applies to every pattern,
        # including custom ones.
        for hedge_type, patterns in all_patterns.items():
            self._patterns[hedge_type] = [
                re.compile(p, re.IGNORECASE) for p in patterns
            ]

    def detect_hedges(self, text: str) -> List[Hedge]:
        """
        Detect hedging markers in text.

        Args:
            text: Text to analyze

        Returns:
            List of detected Hedge objects, sorted by start offset.
        """
        hedges: List[Hedge] = []
        seen_spans: Set[Tuple[int, int]] = set()
        for hedge_type, patterns in self._patterns.items():
            for pattern in patterns:
                for match in pattern.finditer(text):
                    span = (match.start(), match.end())
                    # Deduplicate EXACT spans only: if two hedge types match
                    # the same span, the first in dict-iteration order wins;
                    # partially overlapping matches are all kept.
                    if span not in seen_spans:
                        seen_spans.add(span)
                        # Infer dimension from context
                        dimension = self._infer_dimension(text, match.start(), match.end())
                        hedges.append(Hedge(
                            text=match.group(),
                            hedge_type=hedge_type,
                            start=match.start(),
                            end=match.end(),
                            confidence=1.0,  # regex hits are treated as certain detections
                            dimension=dimension,
                        ))
        # Sort by position
        hedges.sort(key=lambda h: h.start)
        return hedges

    def _infer_dimension(self, text: str, start: int, end: int) -> Optional[UncertaintyDimension]:
        """Infer what dimension is uncertain from surrounding context.

        Returns the FIRST dimension (in DIMENSION_PATTERNS order) whose
        pattern appears within a +/-50 character window, or None.
        """
        # Get context window (50 chars before and after)
        context_start = max(0, start - 50)
        context_end = min(len(text), end + 50)
        context = text[context_start:context_end].lower()
        for dimension, patterns in self.DIMENSION_PATTERNS.items():
            for pattern in patterns:
                # re.search caches compiled patterns module-wide, so this
                # is not recompiling on every call.
                if re.search(pattern, context, re.IGNORECASE):
                    return dimension
        return None

    def _calculate_uncertainty_level(
        self,
        hedges: List[Hedge],
        word_count: int,
    ) -> UncertaintyLevel:
        """
        Calculate overall uncertainty level from hedges.

        Uses weighted scoring based on hedge types, normalized per 100
        words, plus a count of "strong" hedges (weight >= 0.7).
        """
        if not hedges or word_count == 0:
            return UncertaintyLevel.CERTAIN
        # Calculate weighted score
        total_weight = sum(
            self.HEDGE_WEIGHTS.get(h.hedge_type, 0.5)
            for h in hedges
        )
        # Normalize by word count (per 100 words)
        normalized_score = (total_weight / word_count) * 100
        # Also consider absolute number of high-weight hedges
        high_weight_count = sum(
            1 for h in hedges
            if self.HEDGE_WEIGHTS.get(h.hedge_type, 0) >= 0.7
        )
        # Determine level.  NOTE(review): because the fourth branch uses
        # `or`, HIGHLY_UNCERTAIN is reached only when normalized_score >= 10
        # AND high_weight_count >= 3 — confirm `or` (not `and`) is intended.
        if normalized_score < 0.5 and high_weight_count == 0:
            return UncertaintyLevel.CERTAIN
        elif normalized_score < 2.0 and high_weight_count == 0:
            return UncertaintyLevel.NEAR_CERTAIN
        elif normalized_score < 5.0 and high_weight_count <= 1:
            return UncertaintyLevel.MODERATE
        elif normalized_score < 10.0 or high_weight_count <= 2:
            return UncertaintyLevel.UNCERTAIN
        else:
            return UncertaintyLevel.HIGHLY_UNCERTAIN

    def analyze(self, text: str) -> UncertaintyAnalysis:
        """
        Perform complete uncertainty analysis on text.

        Args:
            text: Text to analyze

        Returns:
            UncertaintyAnalysis with detected hedges and overall assessment
        """
        # Detect hedges
        hedges = self.detect_hedges(text)
        # Count words (simple whitespace tokenization)
        word_count = len(text.split())
        # Hedge density = hedges per 100 words (0.0 for empty text)
        hedge_density = (len(hedges) / word_count * 100) if word_count > 0 else 0.0
        # Accumulate hedge weights per inferred dimension; hedges with no
        # inferred dimension contribute nothing here.
        dimension_scores: Dict[UncertaintyDimension, float] = {}
        for hedge in hedges:
            if hedge.dimension:
                weight = self.HEDGE_WEIGHTS.get(hedge.hedge_type, 0.5)
                dimension_scores[hedge.dimension] = dimension_scores.get(
                    hedge.dimension, 0.0
                ) + weight
        # Calculate overall level
        level = self._calculate_uncertainty_level(hedges, word_count)
        return UncertaintyAnalysis(
            text=text,
            level=level,
            hedges=hedges,
            hedge_density=hedge_density,
            dimension_scores=dimension_scores,
            hedge_count=len(hedges),
            word_count=word_count,
        )

    def get_claim_confidence(
        self,
        claim_text: str,
        context: Optional[str] = None,
    ) -> float:
        """
        Calculate confidence score for a claim based on hedging.

        Returns a value between 0.0 (highly uncertain) and 1.0 (certain).

        Args:
            claim_text: The claim text to analyze
            context: Optional surrounding context (its hedges also count
                against the claim's confidence)

        Returns:
            Confidence score (0.0-1.0)
        """
        # Analyze claim text
        analysis = self.analyze(claim_text)
        # Also analyze context if provided
        if context:
            context_analysis = self.analyze(context)
            # Combine hedges from both
            all_hedges = analysis.hedges + context_analysis.hedges
        else:
            all_hedges = analysis.hedges
        if not all_hedges:
            return 1.0  # No hedging = certain
        # Calculate penalty based on hedge weights
        total_penalty = sum(
            self.HEDGE_WEIGHTS.get(h.hedge_type, 0.5)
            for h in all_hedges
        )
        # Normalize: total hedge weight of 2.0 or more means 0 confidence.
        penalty = min(1.0, total_penalty / 2.0)  # 2.0 hedge weight = 0 confidence
        return max(0.0, 1.0 - penalty)

    def annotate_entities_with_uncertainty(
        self,
        entities: List[Dict[str, Any]],
        text: str,
    ) -> List[Dict[str, Any]]:
        """
        Annotate extracted entities with uncertainty information.

        Mutates each entity dict in place, adding an 'uncertainty' key,
        and returns the same list.

        Args:
            entities: List of extracted entities (dicts with text, start, end)
            text: Full document text

        Returns:
            Entities with added uncertainty fields
        """
        analysis = self.analyze(text)
        for entity in entities:
            entity_start = entity.get('start', 0)
            entity_end = entity.get('end', len(text))
            entity_text = entity.get('text', '')
            # Hedges whose start is within 50 chars of the entity start, or
            # whose end is within 50 chars of the entity end.
            nearby_hedges = [
                h for h in analysis.hedges
                if abs(h.start - entity_start) < 50 or abs(h.end - entity_end) < 50
            ]
            # Calculate entity-specific confidence
            if nearby_hedges:
                entity['uncertainty'] = {
                    'hedges': [
                        {'text': h.text, 'type': h.hedge_type.value}
                        for h in nearby_hedges
                    ],
                    # NOTE(review): the full document is passed as context,
                    # so hedges anywhere in `text` lower this confidence,
                    # not just the nearby ones — confirm this is intended.
                    'confidence': self.get_claim_confidence(entity_text, text),
                    'level': self._calculate_uncertainty_level(
                        nearby_hedges,
                        len(entity_text.split()) or 1  # guard against empty text
                    ).value,
                }
            else:
                entity['uncertainty'] = {
                    'hedges': [],
                    'confidence': 1.0,
                    'level': UncertaintyLevel.CERTAIN.value,
                }
        return entities
# =============================================================================
# CONVENIENCE FUNCTIONS
# =============================================================================
def detect_hedges(text: str, languages: Optional[List[str]] = None) -> List[Hedge]:
    """
    Detect hedging markers in text using a one-off detector.

    Args:
        text: Text to analyze
        languages: Language codes (default: ['en'])

    Returns:
        List of Hedge objects
    """
    # Build a throwaway detector per call; fine for occasional use, reuse
    # an UncertaintyDetector instance for bulk processing.
    return UncertaintyDetector(languages=languages).detect_hedges(text)
def analyze_uncertainty(text: str, languages: Optional[List[str]] = None) -> UncertaintyAnalysis:
    """
    Perform a complete uncertainty analysis using a one-off detector.

    Args:
        text: Text to analyze
        languages: Language codes (default: ['en'])

    Returns:
        UncertaintyAnalysis
    """
    # Build a throwaway detector per call; reuse an UncertaintyDetector
    # instance when analyzing many documents.
    return UncertaintyDetector(languages=languages).analyze(text)
def get_confidence_score(claim: str, context: Optional[str] = None) -> float:
    """
    Score a claim's confidence (1.0 certain .. 0.0 highly uncertain)
    from its hedging language, using a default English-only detector.

    Args:
        claim: Claim text
        context: Optional surrounding context

    Returns:
        Confidence score (0.0-1.0)
    """
    return UncertaintyDetector().get_claim_confidence(claim, context)
# Public API of this module.
__all__ = [
    # Enums
    "HedgeType",
    "UncertaintyLevel",
    "UncertaintyDimension",
    # Data structures
    "Hedge",
    "UncertaintyAnalysis",
    # Main class
    "UncertaintyDetector",
    # Convenience functions
    "detect_hedges",
    "analyze_uncertainty",
    "get_confidence_score",
]