750 lines
26 KiB
Python
750 lines
26 KiB
Python
"""
|
|
Uncertainty Detection Module for GLAM-NER.
|
|
|
|
This module provides detection and classification of epistemic uncertainty
|
|
markers (hedging language) in heritage institution documents.
|
|
|
|
Based on linguistic research on hedging and epistemic modality:
|
|
- Hyland (1998) - Hedging in Scientific Research Articles
|
|
- Lakoff (1972) - Hedges: A Study in Meaning Criteria
|
|
- Holmes (1988) - Doubt and Certainty in ESL Textbooks
|
|
|
|
Usage:
|
|
>>> from glam_extractor.annotators.uncertainty import (
|
|
... UncertaintyDetector,
|
|
... HedgeType,
|
|
... UncertaintyLevel,
|
|
... )
|
|
>>>
|
|
>>> detector = UncertaintyDetector()
|
|
>>> result = detector.analyze("The painting is probably from the 17th century.")
|
|
>>> print(result.level) # UncertaintyLevel.MODERATE
|
|
>>> print(result.hedges) # [Hedge(text='probably', type=HedgeType.PROBABILITY)]
|
|
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
|
|
|
# =============================================================================
|
|
# HEDGE TYPES (Epistemic Markers)
|
|
# =============================================================================
|
|
|
|
class HedgeType(str, Enum):
    """Classification of epistemic uncertainty markers.

    Taxonomy follows Hyland (1998), extended with heritage-domain
    categories (attribution formulas, dating, provenance). Values are
    short dotted codes suitable for serialization.
    """

    # -- Modal auxiliaries ------------------------------------------------
    MODAL_POSSIBILITY = "MODAL.POSSIBILITY"  # may, might, could
    MODAL_PROBABILITY = "MODAL.PROBABILITY"  # should, would, will (speculative)
    MODAL_NECESSITY = "MODAL.NECESSITY"  # must (epistemic), need

    # -- Lexical verbs ----------------------------------------------------
    VERB_SPECULATIVE = "VERB.SPECULATIVE"  # suggest, indicate, appear, seem
    VERB_DEDUCTIVE = "VERB.DEDUCTIVE"  # conclude, infer, deduce
    VERB_TENTATIVE = "VERB.TENTATIVE"  # believe, think, assume, suppose

    # -- Adverbs ----------------------------------------------------------
    ADVERB_PROBABILITY = "ADV.PROBABILITY"  # probably, possibly, perhaps, maybe
    ADVERB_FREQUENCY = "ADV.FREQUENCY"  # often, sometimes, occasionally
    ADVERB_APPROXIMATION = "ADV.APPROX"  # approximately, roughly, about

    # -- Adjectives -------------------------------------------------------
    ADJ_EPISTEMIC = "ADJ.EPISTEMIC"  # possible, probable, likely, uncertain
    ADJ_APPROXIMATIVE = "ADJ.APPROX"  # approximate, rough, estimated

    # -- Quantifiers (shields) --------------------------------------------
    QUANT_APPROXIMATION = "QUANT.APPROX"  # some, several, many, few
    QUANT_VAGUE = "QUANT.VAGUE"  # a number of, various, numerous

    # -- Phrases ----------------------------------------------------------
    PHRASE_ATTRIBUTION = "PHRASE.ATTR"  # according to, based on, it is said
    PHRASE_CONDITIONAL = "PHRASE.COND"  # if, in case, assuming that
    PHRASE_LIMITATION = "PHRASE.LIMIT"  # to some extent, in some cases

    # -- Temporal uncertainty ---------------------------------------------
    TEMPORAL_VAGUE = "TMP.VAGUE"  # around, circa, approximately (dates)
    TEMPORAL_RANGE = "TMP.RANGE"  # between X and Y, X to Y

    # -- Source attribution -----------------------------------------------
    SOURCE_HEARSAY = "SRC.HEARSAY"  # reportedly, allegedly, supposedly
    SOURCE_TRADITION = "SRC.TRADITION"  # traditionally, by tradition

    # -- Heritage-specific ------------------------------------------------
    HERITAGE_ATTRIBUTION = "HER.ATTR"  # attributed to, school of, circle of
    HERITAGE_DATING = "HER.DATE"  # dated to, dating from (uncertain)
    HERITAGE_PROVENANCE = "HER.PROV"  # provenance unclear, origin unknown
|
|
|
|
|
|
class UncertaintyLevel(str, Enum):
    """Overall uncertainty level of a claim or statement.

    Ordered scale from CERTAIN (no hedging) up to HIGHLY_UNCERTAIN
    (strong hedging such as "attributed to" / "uncertain").
    """

    CERTAIN = "CERTAIN"
    NEAR_CERTAIN = "NEAR_CERTAIN"  # minor hedging (approximately, circa)
    MODERATE = "MODERATE"  # moderate hedging (probably, likely)
    UNCERTAIN = "UNCERTAIN"  # significant hedging (possibly, might)
    HIGHLY_UNCERTAIN = "HIGHLY_UNCERTAIN"  # strong hedging (attributed to)
|
|
|
|
|
|
class UncertaintyDimension(str, Enum):
    """Dimension of uncertainty: which aspect of a statement is uncertain."""

    ATTRIBUTION = "ATTRIBUTION"  # who created/did something
    DATING = "DATING"  # when something happened
    LOCATION = "LOCATION"  # where something is/was
    IDENTITY = "IDENTITY"  # what something is
    QUANTITY = "QUANTITY"  # how many/much
    PROVENANCE = "PROVENANCE"  # origin/history
    AUTHENTICITY = "AUTHENTICITY"  # whether genuine
    INTERPRETATION = "INTERPRETATION"  # meaning/significance
|
|
|
|
|
|
# =============================================================================
|
|
# HEDGE PATTERNS (Multilingual)
|
|
# =============================================================================
|
|
|
|
# English hedging patterns
|
|
# English hedging patterns, keyed by hedge type. All patterns are compiled
# with re.IGNORECASE by UncertaintyDetector._compile_patterns.
HEDGE_PATTERNS_EN: Dict[HedgeType, List[str]] = {
    # Modal auxiliaries
    HedgeType.MODAL_POSSIBILITY: [
        r"\bmay\b", r"\bmight\b", r"\bcould\b", r"\bcan\b(?!\s+not)",
    ],
    HedgeType.MODAL_PROBABILITY: [
        r"\bshould\b", r"\bwould\b", r"\bwill\b(?=\s+(?:probably|likely|perhaps))",
    ],
    HedgeType.MODAL_NECESSITY: [
        r"\bmust\b(?=\s+(?:have|be))", r"\bneed(?:s)?\s+to\b",
    ],

    # Lexical verbs
    HedgeType.VERB_SPECULATIVE: [
        r"\bsuggests?\b", r"\bindicates?\b", r"\bappears?\b", r"\bseems?\b",
        r"\bimplies?\b", r"\bpoints?\s+to\b",
    ],
    HedgeType.VERB_DEDUCTIVE: [
        r"\bconcludes?\b", r"\binfers?\b", r"\bdeduces?\b", r"\breasons?\b",
    ],
    HedgeType.VERB_TENTATIVE: [
        r"\bbelieves?\b", r"\bthinks?\b", r"\bassumes?\b", r"\bsupposes?\b",
        r"\bconsiders?\b", r"\bestimate[sd]?\b", r"\bguess(?:es)?\b",
    ],

    # Adverbs
    HedgeType.ADVERB_PROBABILITY: [
        r"\bprobably\b", r"\bpossibly\b", r"\bperhaps\b", r"\bmaybe\b",
        r"\bpresumably\b", r"\bconceivably\b", r"\blikely\b", r"\bunlikely\b",
    ],
    HedgeType.ADVERB_FREQUENCY: [
        r"\busually\b", r"\boften\b", r"\bsometimes\b", r"\boccasionally\b",
        r"\brarely\b", r"\bseldom\b", r"\bgenerally\b", r"\btypically\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        r"\bapproximately\b", r"\broughly\b", r"\babout\b", r"\baround\b",
        # BUGFIX: "c."/"ca." used to end in \b, but \b after "." only matches
        # when a word character follows — never the case before a space.
        # (?!\w) correctly matches the abbreviation at end-of-token instead.
        r"\bnearly\b", r"\balmost\b", r"\bcirca\b", r"\bc\.(?!\w)", r"\bca\.(?!\w)",
    ],

    # Adjectives
    HedgeType.ADJ_EPISTEMIC: [
        r"\bpossible\b", r"\bprobable\b", r"\blikely\b", r"\buncertain\b",
        r"\bdoubtful\b", r"\bquestionable\b", r"\bplausible\b", r"\bconceivable\b",
    ],
    HedgeType.ADJ_APPROXIMATIVE: [
        r"\bapproximate\b", r"\brough\b", r"\bestimated\b", r"\btentative\b",
    ],

    # Quantifiers
    HedgeType.QUANT_APPROXIMATION: [
        r"\bsome\b", r"\bseveral\b", r"\bmany\b", r"\bfew\b", r"\bmost\b",
    ],
    HedgeType.QUANT_VAGUE: [
        r"\ba\s+number\s+of\b", r"\bvarious\b", r"\bnumerous\b", r"\bcertain\b",
        r"\bcountless\b", r"\bhandful\s+of\b",
    ],

    # Phrases
    HedgeType.PHRASE_ATTRIBUTION: [
        r"\baccording\s+to\b", r"\bbased\s+on\b", r"\bit\s+is\s+said\b",
        r"\breportedly\b", r"\bsources\s+say\b", r"\bsources\s+suggest\b",
    ],
    HedgeType.PHRASE_CONDITIONAL: [
        r"\bif\b(?=\s+(?:true|correct|accurate))", r"\bassuming\s+that\b",
        r"\bprovided\s+that\b", r"\bin\s+case\b",
    ],
    HedgeType.PHRASE_LIMITATION: [
        r"\bto\s+some\s+extent\b", r"\bin\s+some\s+cases\b", r"\bto\s+a\s+degree\b",
        r"\bpartially\b", r"\bin\s+part\b",
    ],

    # Temporal (date-adjacent forms: the pattern requires a following digit)
    HedgeType.TEMPORAL_VAGUE: [
        r"\bcirca\b", r"\bc\.\s*\d", r"\bca\.\s*\d", r"\baround\s+\d",
        r"\babout\s+\d", r"\bapproximately\s+\d",
    ],
    HedgeType.TEMPORAL_RANGE: [
        r"\bbetween\s+\d+\s+and\s+\d+", r"\d+\s*[-–—]\s*\d+",
        r"\d+\s+to\s+\d+",
    ],

    # Source attribution
    HedgeType.SOURCE_HEARSAY: [
        r"\breportedly\b", r"\ballegedly\b", r"\bsupposedly\b", r"\bseemingly\b",
        r"\bapparently\b",
    ],
    HedgeType.SOURCE_TRADITION: [
        r"\btraditionally\b", r"\bby\s+tradition\b", r"\blegend\s+has\s+it\b",
    ],

    # Heritage-specific
    HedgeType.HERITAGE_ATTRIBUTION: [
        r"\battributed\s+to\b", r"\bschool\s+of\b", r"\bcircle\s+of\b",
        r"\bfollower\s+of\b", r"\bworkshop\s+of\b", r"\bmanner\s+of\b",
        r"\bstyle\s+of\b", r"\bafter\b(?=\s+[A-Z])",  # "after Rembrandt"
    ],
    HedgeType.HERITAGE_DATING: [
        r"\bdated\s+to\b", r"\bdating\s+from\b", r"\bdating\s+to\b",
        r"\bbelieved\s+to\s+date\b", r"\bthought\s+to\s+be\s+from\b",
    ],
    HedgeType.HERITAGE_PROVENANCE: [
        r"\bprovenance\s+unclear\b", r"\bprovenance\s+unknown\b",
        r"\borigin\s+unknown\b", r"\borigin\s+uncertain\b",
        r"\bhistory\s+unclear\b",
    ],
}
|
|
|
|
# Dutch hedging patterns
|
|
# Dutch hedging patterns (subset of the English taxonomy).
HEDGE_PATTERNS_NL: Dict[HedgeType, List[str]] = {
    HedgeType.MODAL_POSSIBILITY: [
        r"\bkan\b", r"\bkunnen\b", r"\bzou\b", r"\bzouden\b", r"\bmag\b",
    ],
    HedgeType.ADVERB_PROBABILITY: [
        r"\bwaarschijnlijk\b", r"\bmogelijk\b", r"\bmisschien\b", r"\bvermoedelijk\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        # BUGFIX: trailing \b after "." never matches before a space; use (?!\w).
        r"\bongeveer\b", r"\brond\b", r"\bcirca\b", r"\bc\.(?!\w)", r"\bca\.(?!\w)",
    ],
    HedgeType.HERITAGE_ATTRIBUTION: [
        # BUGFIX: corrected Dutch spelling "toegeschreven" (was "toegschreven",
        # which would never match real catalogue text).
        r"\btoegeschreven\s+aan\b", r"\bschool\s+van\b", r"\bkring\s+van\b",
        r"\bvolgeling\s+van\b", r"\bwerkplaats\s+van\b",
    ],
}
|
|
|
|
# German hedging patterns
|
|
# German hedging patterns (subset of the English taxonomy).
HEDGE_PATTERNS_DE: Dict[HedgeType, List[str]] = {
    HedgeType.MODAL_POSSIBILITY: [
        r"\bkann\b", r"\bkönnte\b", r"\bkönnen\b", r"\bdürfte\b", r"\bmag\b",
    ],
    HedgeType.ADVERB_PROBABILITY: [
        r"\bwahrscheinlich\b", r"\bmöglicherweise\b", r"\bvielleicht\b",
        r"\bvermutlich\b", r"\banscheinend\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        # BUGFIX: trailing \b after "." never matches before a space; use (?!\w).
        r"\bungefähr\b", r"\betwa\b", r"\bcirca\b", r"\bca\.(?!\w)", r"\brund\b",
    ],
    HedgeType.HERITAGE_ATTRIBUTION: [
        r"\bzugeschrieben\b", r"\bSchule\s+von\b", r"\bKreis\s+von\b",
        r"\bWerkstatt\s+von\b", r"\bUmkreis\s+von\b",
    ],
}
|
|
|
|
# French hedging patterns
|
|
# French hedging patterns (subset of the English taxonomy).
HEDGE_PATTERNS_FR: Dict[HedgeType, List[str]] = {
    HedgeType.MODAL_POSSIBILITY: [
        # BUGFIX: removed a duplicated r"\bpourrait\b" entry.
        r"\bpeut\b", r"\bpourrait\b", r"\bpuisse\b",
    ],
    HedgeType.ADVERB_PROBABILITY: [
        r"\bprobablement\b", r"\bpeut-être\b", r"\bsans\s+doute\b",
        r"\bapparemment\b", r"\bvraisemblablement\b",
    ],
    HedgeType.ADVERB_APPROXIMATION: [
        # NOTE(review): "vers" is also a common French preposition ("towards");
        # this may over-trigger outside date contexts — confirm with corpus data.
        r"\benviron\b", r"\bà\s+peu\s+près\b", r"\bcirca\b", r"\bvers\b",
    ],
    HedgeType.HERITAGE_ATTRIBUTION: [
        r"\battribué\s+à\b", r"\bécole\s+de\b", r"\bentourage\s+de\b",
        r"\batelier\s+de\b", r"\bmanière\s+de\b",
    ],
}
|
|
|
|
|
|
# =============================================================================
|
|
# HEDGE DATA STRUCTURES
|
|
# =============================================================================
|
|
|
|
@dataclass
class Hedge:
    """A single detected hedging marker within a piece of text.

    Hashable so instances can be deduplicated in sets; identity is the
    matched text, its type, and its character span.
    """

    text: str  # the hedging expression as matched
    hedge_type: HedgeType  # type classification
    start: int  # start offset in the analyzed text
    end: int  # end offset in the analyzed text
    confidence: float = 1.0  # detection confidence
    dimension: Optional[UncertaintyDimension] = None  # which aspect is uncertain

    def __hash__(self):
        identity = (self.text, self.hedge_type, self.start, self.end)
        return hash(identity)
|
|
|
|
|
|
@dataclass
class UncertaintyAnalysis:
    """Complete uncertainty analysis for a piece of text."""

    text: str  # original text
    level: UncertaintyLevel  # overall uncertainty level
    hedges: List[Hedge] = field(default_factory=list)  # detected hedges
    hedge_density: float = 0.0  # hedges per 100 words
    dimension_scores: Dict[UncertaintyDimension, float] = field(default_factory=dict)

    # Summary statistics
    hedge_count: int = 0
    word_count: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Export the analysis as a plain, JSON-serializable dictionary."""
        serialized_hedges = []
        for hedge in self.hedges:
            serialized_hedges.append({
                "text": hedge.text,
                "type": hedge.hedge_type.value,
                "start": hedge.start,
                "end": hedge.end,
                "confidence": hedge.confidence,
                "dimension": hedge.dimension.value if hedge.dimension else None,
            })

        serialized_dimensions = {}
        for dimension, score in self.dimension_scores.items():
            serialized_dimensions[dimension.value] = score

        return {
            "level": self.level.value,
            "hedges": serialized_hedges,
            "hedge_density": self.hedge_density,
            "dimension_scores": serialized_dimensions,
            "hedge_count": self.hedge_count,
            "word_count": self.word_count,
        }
|
|
|
|
|
|
# =============================================================================
|
|
# UNCERTAINTY DETECTOR
|
|
# =============================================================================
|
|
|
|
class UncertaintyDetector:
    """
    Detector for epistemic uncertainty markers (hedging language).

    Supports multilingual detection (EN, NL, DE, FR). Regex patterns for the
    requested languages are compiled once in ``__init__``; all public methods
    are read-only afterwards.

    Usage:
        >>> detector = UncertaintyDetector(languages=['en', 'nl'])
        >>> result = detector.analyze("The painting is probably from circa 1650.")
        >>> print(result.level)  # UncertaintyLevel.MODERATE
    """

    # Hedge type weights for uncertainty scoring (0.0-1.0, higher = stronger
    # uncertainty signal). Types absent from this map fall back to 0.5 at the
    # lookup sites below, so keep it in sync with HedgeType.
    HEDGE_WEIGHTS: Dict[HedgeType, float] = {
        # High uncertainty (strong hedging)
        HedgeType.HERITAGE_ATTRIBUTION: 0.9,
        HedgeType.HERITAGE_PROVENANCE: 0.9,
        HedgeType.SOURCE_HEARSAY: 0.8,
        HedgeType.ADJ_EPISTEMIC: 0.7,
        HedgeType.MODAL_POSSIBILITY: 0.7,
        HedgeType.VERB_TENTATIVE: 0.7,

        # Moderate uncertainty
        HedgeType.ADVERB_PROBABILITY: 0.6,
        HedgeType.VERB_SPECULATIVE: 0.6,
        HedgeType.PHRASE_ATTRIBUTION: 0.5,
        HedgeType.HERITAGE_DATING: 0.5,
        HedgeType.SOURCE_TRADITION: 0.5,

        # Low uncertainty (mild hedging)
        HedgeType.ADVERB_APPROXIMATION: 0.3,
        HedgeType.TEMPORAL_VAGUE: 0.3,
        HedgeType.TEMPORAL_RANGE: 0.2,
        HedgeType.QUANT_APPROXIMATION: 0.2,
        HedgeType.QUANT_VAGUE: 0.2,
        HedgeType.ADJ_APPROXIMATIVE: 0.3,

        # Minimal uncertainty
        HedgeType.ADVERB_FREQUENCY: 0.1,
        HedgeType.MODAL_PROBABILITY: 0.4,
        HedgeType.MODAL_NECESSITY: 0.3,
        HedgeType.VERB_DEDUCTIVE: 0.2,
        HedgeType.PHRASE_CONDITIONAL: 0.4,
        HedgeType.PHRASE_LIMITATION: 0.3,
    }

    # Context patterns used to guess WHICH aspect a hedge makes uncertain.
    # NOTE(review): IDENTITY and INTERPRETATION have no patterns here, so
    # _infer_dimension can never return them — confirm whether that is intended.
    DIMENSION_PATTERNS: Dict[UncertaintyDimension, List[str]] = {
        UncertaintyDimension.ATTRIBUTION: [
            r"(?:painted|created|made|authored|by|attributed)\s+(?:by|to)",
            r"(?:artist|painter|author|creator)",
        ],
        UncertaintyDimension.DATING: [
            r"(?:date|dated|dating|century|period|era|year)",
            r"\d{3,4}",  # Years
        ],
        UncertaintyDimension.LOCATION: [
            r"(?:location|located|place|where|from|origin)",
            r"(?:city|country|region|museum|gallery)",
        ],
        UncertaintyDimension.QUANTITY: [
            r"(?:number|count|amount|many|few|several)",
            r"(?:pieces|items|works|objects)",
        ],
        UncertaintyDimension.PROVENANCE: [
            r"(?:provenance|ownership|history|collection)",
            r"(?:acquired|purchased|donated|inherited)",
        ],
        UncertaintyDimension.AUTHENTICITY: [
            r"(?:authentic|genuine|original|fake|forgery|copy)",
            r"(?:attribution|school of|workshop)",
        ],
    }

    def __init__(
        self,
        languages: Optional[List[str]] = None,
        custom_patterns: Optional[Dict[HedgeType, List[str]]] = None,
    ) -> None:
        """
        Initialize detector and compile all regex patterns.

        Args:
            languages: List of language codes to detect (default: ['en']).
                Unknown codes are silently ignored by _compile_patterns.
            custom_patterns: Additional custom regex patterns by hedge type,
                merged on top of the language pattern sets.
        """
        self.languages = languages or ['en']
        # hedge type -> list of compiled, case-insensitive regexes
        self._patterns: Dict[HedgeType, List[re.Pattern]] = {}
        self._compile_patterns(custom_patterns)

    def _compile_patterns(self, custom_patterns: Optional[Dict[HedgeType, List[str]]]) -> None:
        """Merge pattern sources for the selected languages and compile them."""
        # Combine patterns from all specified languages
        all_patterns: Dict[HedgeType, List[str]] = {}

        pattern_sources = {
            'en': HEDGE_PATTERNS_EN,
            'nl': HEDGE_PATTERNS_NL,
            'de': HEDGE_PATTERNS_DE,
            'fr': HEDGE_PATTERNS_FR,
        }

        for lang in self.languages:
            # Unknown language codes are skipped without warning.
            if lang in pattern_sources:
                for hedge_type, patterns in pattern_sources[lang].items():
                    if hedge_type not in all_patterns:
                        all_patterns[hedge_type] = []
                    all_patterns[hedge_type].extend(patterns)

        # Add custom patterns (appended after language patterns)
        if custom_patterns:
            for hedge_type, patterns in custom_patterns.items():
                if hedge_type not in all_patterns:
                    all_patterns[hedge_type] = []
                all_patterns[hedge_type].extend(patterns)

        # Compile all patterns once; all matching is case-insensitive.
        for hedge_type, patterns in all_patterns.items():
            self._patterns[hedge_type] = [
                re.compile(p, re.IGNORECASE) for p in patterns
            ]

    def detect_hedges(self, text: str) -> List[Hedge]:
        """
        Detect hedging markers in text.

        Args:
            text: Text to analyze

        Returns:
            List of detected Hedge objects, sorted by start offset.
        """
        hedges: List[Hedge] = []
        seen_spans: Set[Tuple[int, int]] = set()

        for hedge_type, patterns in self._patterns.items():
            for pattern in patterns:
                for match in pattern.finditer(text):
                    span = (match.start(), match.end())

                    # Skip exact-duplicate spans only. Partially overlapping
                    # matches from different patterns are all kept, and when
                    # two types match the same span the winner is whichever
                    # hedge type comes first in dict insertion order.
                    if span not in seen_spans:
                        seen_spans.add(span)

                        # Infer dimension from context
                        dimension = self._infer_dimension(text, match.start(), match.end())

                        hedges.append(Hedge(
                            text=match.group(),
                            hedge_type=hedge_type,
                            start=match.start(),
                            end=match.end(),
                            confidence=1.0,
                            dimension=dimension,
                        ))

        # Sort by position
        hedges.sort(key=lambda h: h.start)
        return hedges

    def _infer_dimension(self, text: str, start: int, end: int) -> Optional[UncertaintyDimension]:
        """Infer what dimension is uncertain from surrounding context.

        Returns the first dimension (in DIMENSION_PATTERNS insertion order)
        whose context pattern matches, or None if nothing matches.
        """
        # Get context window (50 chars before and after the hedge span)
        context_start = max(0, start - 50)
        context_end = min(len(text), end + 50)
        context = text[context_start:context_end].lower()

        for dimension, patterns in self.DIMENSION_PATTERNS.items():
            for pattern in patterns:
                # IGNORECASE is redundant here (context is lowered) but harmless.
                if re.search(pattern, context, re.IGNORECASE):
                    return dimension

        return None

    def _calculate_uncertainty_level(
        self,
        hedges: List[Hedge],
        word_count: int,
    ) -> UncertaintyLevel:
        """
        Calculate overall uncertainty level from hedges.

        Uses weighted scoring based on hedge types, normalized per 100 words,
        combined with a count of "strong" hedges (weight >= 0.7).
        """
        if not hedges or word_count == 0:
            return UncertaintyLevel.CERTAIN

        # Calculate weighted score (unknown types default to 0.5)
        total_weight = sum(
            self.HEDGE_WEIGHTS.get(h.hedge_type, 0.5)
            for h in hedges
        )

        # Normalize by word count (per 100 words)
        normalized_score = (total_weight / word_count) * 100

        # Also consider absolute number of high-weight hedges
        high_weight_count = sum(
            1 for h in hedges
            if self.HEDGE_WEIGHTS.get(h.hedge_type, 0) >= 0.7
        )

        # Determine level. Thresholds were presumably tuned empirically —
        # treat them as a unit when changing.
        if normalized_score < 0.5 and high_weight_count == 0:
            return UncertaintyLevel.CERTAIN
        elif normalized_score < 2.0 and high_weight_count == 0:
            return UncertaintyLevel.NEAR_CERTAIN
        elif normalized_score < 5.0 and high_weight_count <= 1:
            return UncertaintyLevel.MODERATE
        # NOTE(review): because of the `or` below, HIGHLY_UNCERTAIN is only
        # reached when BOTH score >= 10 AND more than 2 strong hedges occur;
        # a very hedge-dense text with few strong hedges stays UNCERTAIN.
        # Confirm `or` (vs `and`) is the intended semantics.
        elif normalized_score < 10.0 or high_weight_count <= 2:
            return UncertaintyLevel.UNCERTAIN
        else:
            return UncertaintyLevel.HIGHLY_UNCERTAIN

    def analyze(self, text: str) -> UncertaintyAnalysis:
        """
        Perform complete uncertainty analysis on text.

        Args:
            text: Text to analyze

        Returns:
            UncertaintyAnalysis with detected hedges, hedge density
            (hedges per 100 words), per-dimension weight totals, and the
            overall uncertainty level.
        """
        # Detect hedges
        hedges = self.detect_hedges(text)

        # Count words (simple whitespace tokenization)
        word_count = len(text.split())

        # Calculate hedge density (hedges per 100 words)
        hedge_density = (len(hedges) / word_count * 100) if word_count > 0 else 0.0

        # Calculate dimension scores: sum of hedge weights per inferred
        # dimension (hedges with no inferred dimension are not counted).
        dimension_scores: Dict[UncertaintyDimension, float] = {}
        for hedge in hedges:
            if hedge.dimension:
                weight = self.HEDGE_WEIGHTS.get(hedge.hedge_type, 0.5)
                dimension_scores[hedge.dimension] = dimension_scores.get(
                    hedge.dimension, 0.0
                ) + weight

        # Calculate overall level
        level = self._calculate_uncertainty_level(hedges, word_count)

        return UncertaintyAnalysis(
            text=text,
            level=level,
            hedges=hedges,
            hedge_density=hedge_density,
            dimension_scores=dimension_scores,
            hedge_count=len(hedges),
            word_count=word_count,
        )

    def get_claim_confidence(
        self,
        claim_text: str,
        context: Optional[str] = None,
    ) -> float:
        """
        Calculate confidence score for a claim based on hedging.

        Returns a value between 0.0 (highly uncertain) and 1.0 (certain).
        Hedges in the claim and (if given) in the context are pooled, and
        their summed weights are converted into a linear penalty that
        saturates at a total weight of 2.0.

        Args:
            claim_text: The claim text to analyze
            context: Optional surrounding context

        Returns:
            Confidence score (0.0-1.0)
        """
        # Analyze claim text
        analysis = self.analyze(claim_text)

        # Also analyze context if provided
        if context:
            context_analysis = self.analyze(context)
            # Combine hedges from both; hedges appearing in both strings are
            # counted twice, which strengthens the penalty.
            all_hedges = analysis.hedges + context_analysis.hedges
        else:
            all_hedges = analysis.hedges

        if not all_hedges:
            return 1.0  # No hedging = certain

        # Calculate penalty based on hedge weights
        total_penalty = sum(
            self.HEDGE_WEIGHTS.get(h.hedge_type, 0.5)
            for h in all_hedges
        )

        # Normalize: max penalty of 1.0 (fully uncertain)
        penalty = min(1.0, total_penalty / 2.0)  # 2.0 hedge weight = 0 confidence

        return max(0.0, 1.0 - penalty)

    def annotate_entities_with_uncertainty(
        self,
        entities: List[Dict[str, Any]],
        text: str,
    ) -> List[Dict[str, Any]]:
        """
        Annotate extracted entities with uncertainty information.

        Mutates each entity dict in place, adding an 'uncertainty' key, and
        returns the same list.

        Args:
            entities: List of extracted entities (dicts with text, start, end)
            text: Full document text

        Returns:
            Entities with added uncertainty fields
        """
        analysis = self.analyze(text)

        for entity in entities:
            entity_start = entity.get('start', 0)
            entity_end = entity.get('end', len(text))
            entity_text = entity.get('text', '')

            # Find hedges within 50 characters of the entity's start or end
            nearby_hedges = [
                h for h in analysis.hedges
                if abs(h.start - entity_start) < 50 or abs(h.end - entity_end) < 50
            ]

            # Calculate entity-specific confidence
            if nearby_hedges:
                entity['uncertainty'] = {
                    'hedges': [
                        {'text': h.text, 'type': h.hedge_type.value}
                        for h in nearby_hedges
                    ],
                    # NOTE(review): the FULL document is passed as context, so
                    # hedges anywhere in the document depress this confidence,
                    # not just hedges near the entity — confirm this is intended.
                    'confidence': self.get_claim_confidence(entity_text, text),
                    # `or 1` guards against a zero word count for empty text.
                    'level': self._calculate_uncertainty_level(
                        nearby_hedges,
                        len(entity_text.split()) or 1
                    ).value,
                }
            else:
                entity['uncertainty'] = {
                    'hedges': [],
                    'confidence': 1.0,
                    'level': UncertaintyLevel.CERTAIN.value,
                }

        return entities
|
|
|
|
|
|
# =============================================================================
|
|
# CONVENIENCE FUNCTIONS
|
|
# =============================================================================
|
|
|
|
def detect_hedges(text: str, languages: Optional[List[str]] = None) -> List[Hedge]:
    """
    Detect hedging markers in *text* using a one-shot detector.

    Args:
        text: Text to analyze.
        languages: Language codes (default: ['en']).

    Returns:
        List of Hedge objects, sorted by start offset.
    """
    return UncertaintyDetector(languages=languages).detect_hedges(text)
|
|
|
|
|
|
def analyze_uncertainty(text: str, languages: Optional[List[str]] = None) -> UncertaintyAnalysis:
    """
    Perform a complete uncertainty analysis with a one-shot detector.

    Args:
        text: Text to analyze.
        languages: Language codes (default: ['en']).

    Returns:
        UncertaintyAnalysis for *text*.
    """
    return UncertaintyDetector(languages=languages).analyze(text)
|
|
|
|
|
|
def get_confidence_score(claim: str, context: Optional[str] = None) -> float:
    """
    Score a claim's confidence from its hedging language (English patterns).

    Args:
        claim: Claim text.
        context: Optional surrounding context.

    Returns:
        Confidence score between 0.0 (highly uncertain) and 1.0 (certain).
    """
    return UncertaintyDetector().get_claim_confidence(claim, context)
|
|
|
|
|
|
# Public API of this module (controls `from ... import *` and documents intent).
__all__ = [
    # Enums
    "HedgeType",
    "UncertaintyLevel",
    "UncertaintyDimension",
    # Data structures
    "Hedge",
    "UncertaintyAnalysis",
    # Main class
    "UncertaintyDetector",
    # Convenience functions
    "detect_hedges",
    "analyze_uncertainty",
    "get_confidence_score",
]
|