---
# =============================================================================
# GLAM-NER Entity Annotation Convention
# Module: Uncertainty and Confidence
# Path: modules/advanced/uncertainty.yaml
# Version: 1.7.0
# =============================================================================
#
# This module defines uncertainty modeling for entity annotations, enabling
# consumers to filter, prioritize, and reason about data quality.
#
# THREE types of uncertainty are distinguished:
#
# 1. ANNOTATION UNCERTAINTY: Confidence in a specific annotation decision
#    - "How sure are we that 'Rembrandt' refers to AGT.PER?"
#
# 2. EPISTEMIC UNCERTAINTY: Uncertainty about facts in the world
#    - "Did Rembrandt actually paint this work?" (attribution uncertainty)
#
# 3. LINGUISTIC UNCERTAINTY: Ambiguity in the source text itself
#    - "The text says 'possibly by Rembrandt'" (hedged language)
#
# These are distinct and must be tracked separately.
#
# =============================================================================
# Module identity and manifest. Version is quoted so "1.7.0" stays a string.
module:
  id: uncertainty
  name: Uncertainty and Confidence
  version: "1.7.0"
  status: stable
  category: advanced

  dependencies:
    - core/convention
    - core/namespaces

  description: |
    Uncertainty modeling captures the degree of confidence in annotations,
    enabling consumers to filter, prioritize, and reason about data quality.

    This framework provides:
    - Numerical confidence scores with defined semantics
    - Calibration methods for reliable scores
    - Epistemic uncertainty classification
    - Linguistic hedging detection
    - Inter-annotator agreement metrics
    - Uncertainty propagation rules
    - Visualization guidelines
# =============================================================================
# NAMESPACES
# =============================================================================

# Prefix -> IRI bindings used by annotations in this module. Quoted because
# the values contain `#` and `:` which are YAML-significant in plain scalars.
namespaces:
  prov: "http://www.w3.org/ns/prov#"
  oa: "http://www.w3.org/ns/oa#"
  crm: "http://www.cidoc-crm.org/cidoc-crm/"
  iao: "http://purl.obolibrary.org/obo/IAO_"
  dqv: "http://www.w3.org/ns/dqv#"
  tei: "http://www.tei-c.org/ns/1.0"
  schema: "http://schema.org/"
  xsd: "http://www.w3.org/2001/XMLSchema#"
# =============================================================================
# CONFIDENCE SCORE FRAMEWORK
# =============================================================================

confidence_framework:
  description: |
    Numerical confidence scores on [0.0, 1.0] scale with defined semantics.
    Scores should be CALIBRATED: a score of 0.8 means the annotation is
    correct approximately 80% of the time across similar cases.

  # Each band carries both a human-readable interval string and machine-usable
  # numeric bounds (range_min/range_max + inclusive) so consumers need not
  # parse the interval notation.
  score_semantics:
    - range: "[0.95, 1.0]"
      range_min: 0.95
      range_max: 1.0
      inclusive: "both"
      label: "CERTAIN"
      tei_cert: "high"
      description: |
        Near-certain annotation. Human annotators would agree 95%+ of the time.
        Examples: exact string matches, unambiguous proper nouns, rule-based
        high-precision extractions.
      action: "Accept automatically; no review needed"
      color_code: "#228B22"  # Forest green

    - range: "[0.80, 0.95)"
      range_min: 0.80
      range_max: 0.95
      inclusive: "left"
      label: "HIGH_CONFIDENCE"
      tei_cert: "high"
      description: |
        High confidence annotation. Strong signals but some ambiguity possible.
        Examples: well-known entities with standard forms, clear context.
      action: "Accept with spot-checking"
      color_code: "#32CD32"  # Lime green

    - range: "[0.60, 0.80)"
      range_min: 0.60
      range_max: 0.80
      inclusive: "left"
      label: "MEDIUM_CONFIDENCE"
      tei_cert: "medium"
      description: |
        Moderate confidence. Multiple interpretations possible; context helps.
        Examples: common names with multiple referents, unclear boundaries.
      action: "Review if resource-sensitive; accept for bulk processing"
      color_code: "#FFD700"  # Gold

    - range: "[0.40, 0.60)"
      range_min: 0.40
      range_max: 0.60
      inclusive: "left"
      label: "LOW_CONFIDENCE"
      tei_cert: "low"
      description: |
        Low confidence. Significant uncertainty; near chance for binary decision.
        Examples: ambiguous pronouns, unknown entities, noisy text.
      action: "Flag for human review"
      color_code: "#FFA500"  # Orange

    - range: "[0.20, 0.40)"
      range_min: 0.20
      range_max: 0.40
      inclusive: "left"
      label: "VERY_LOW_CONFIDENCE"
      tei_cert: "low"
      description: |
        Very low confidence. Model is guessing; alternative interpretations likely.
      action: "Require human verification before use"
      color_code: "#FF4500"  # Orange-red

    - range: "[0.0, 0.20)"
      range_min: 0.0
      range_max: 0.20
      inclusive: "left"
      label: "UNCERTAIN"
      tei_cert: "unknown"
      description: |
        Near-zero confidence. Essentially no signal; included for completeness.
      action: "Do not use without manual annotation"
      color_code: "#DC143C"  # Crimson

  special_values:
    - value: null
      meaning: "Confidence not computed/available"

    - value: 1.0
      meaning: "Absolute certainty (use sparingly; only for definitional truths)"

    - value: 0.0
      meaning: "Definite negative (annotation is known to be incorrect)"

  aggregation_rules:
    description: "How to combine confidence scores"

    rules:
      - name: "Independent Conjunction"
        formula: "P(A ∧ B) = P(A) × P(B)"
        use_case: "Joint probability of independent annotations"
        example: "Entity type AND entity boundaries both correct"

      - name: "Conservative Minimum"
        formula: "min(conf_1, conf_2, ...)"
        use_case: "Weakest-link scenarios; overall quality limited by weakest component"
        example: "Relationship confidence = min(subject_conf, object_conf, predicate_conf)"

      - name: "Weighted Average"
        formula: "Σ(weight_i × conf_i) / Σ(weight_i)"
        use_case: "Combining scores from multiple sources/annotators"
        example: "Ensemble model output"

      - name: "Maximum"
        formula: "max(conf_1, conf_2, ...)"
        use_case: "Any-correct scenarios (at least one interpretation valid)"
        example: "Multiple valid entity types"
# =============================================================================
# ANNOTATION UNCERTAINTY
# =============================================================================

annotation_uncertainty:
  description: |
    Uncertainty about the correctness of annotation DECISIONS.
    This is process uncertainty, not world uncertainty.

  dimensions:
    - dimension: "boundary_confidence"
      description: "Confidence in span boundaries (start/end offsets)"
      factors:
        - "Tokenization ambiguity"
        - "Nested entity decisions"
        - "Modifier attachment"
      examples:
        - text: "the Dutch painter Rembrandt"
          issue: "Should span include 'the Dutch painter' or just 'Rembrandt'?"
          resolution: "Mark 'Rembrandt' as head; broader span has lower confidence"

    - dimension: "type_confidence"
      description: "Confidence in entity type assignment"
      factors:
        - "Ambiguous entity types (organization vs. event: 'the conference')"
        - "Metonymy (place for organization: 'Washington announced')"
        - "Type granularity (museum subtype assignment)"
      examples:
        - text: "Apple announced new products"
          issue: "AGT.PER (person)? GRP.COR (corporation)? WRK.OBJ (fruit)?"
          resolution: "Context disambiguates; assign GRP.COR with high confidence"

    - dimension: "referent_confidence"
      description: "Confidence in identity of referred entity"
      factors:
        - "Ambiguous names (multiple people named 'John Smith')"
        - "Unknown entities (not in knowledge base)"
        - "Nickname/variant resolution"
      examples:
        - text: "Dr. Williams presented the findings"
          issue: "Which Dr. Williams? No disambiguating context."
          resolution: "Create entity with low referent_confidence; flag for KB linking"

    # No examples list here: this dimension is driven by the extraction
    # process itself rather than by a specific text span.
    - dimension: "extraction_confidence"
      description: "Confidence from the extraction model/process"
      factors:
        - "Model probability/logits"
        - "Rule match specificity"
        - "OCR quality (for digitized documents)"

  combined_score:
    formula: |
      annotation_confidence = min(
        boundary_confidence,
        type_confidence,
        referent_confidence
      ) × extraction_confidence

    rationale: |
      Use minimum for boundary/type/referent (weakest link), then scale by
      extraction quality. Low OCR confidence degrades all downstream scores.
# =============================================================================
# EPISTEMIC UNCERTAINTY
# =============================================================================

epistemic_uncertainty:
  description: |
    Uncertainty about FACTS IN THE WORLD, not annotation process.
    Captures disputed, unknown, or probability claims about reality.

  uncertainty_types:
    - type: "DISPUTED"
      description: "Multiple conflicting claims in sources"
      examples:
        - "Scholars dispute whether Vermeer used a camera obscura"
        - "The painting's attribution is contested"
      annotation_fields:
        - field: "epistemic_status"
          value: "disputed"
        - field: "competing_claims"
          value: ["claim_1", "claim_2"]

    - type: "UNKNOWN"
      description: "No reliable information available"
      examples:
        - "The artist's birthdate is unknown"
        - "The work's provenance before 1900 is unrecorded"
      annotation_fields:
        - field: "epistemic_status"
          value: "unknown"

    - type: "APPROXIMATE"
      description: "Value is estimated or rounded"
      examples:
        - "The collection contains approximately 8,000 works"
        - "Painted circa 1642"
      annotation_fields:
        - field: "epistemic_status"
          value: "approximate"
        - field: "precision"
          value: "circa_year | circa_decade | order_of_magnitude"

    - type: "INFERRED"
      description: "Derived from other facts, not directly stated"
      examples:
        - "If born in 1606 and died in 1669, he lived 63 years"
        - "Based on style, attributed to the artist's late period"
      annotation_fields:
        - field: "epistemic_status"
          value: "inferred"
        - field: "inference_basis"
          value: "description of reasoning"

    - type: "HYPOTHETICAL"
      description: "Conditional or speculative claim"
      examples:
        - "If the signature is authentic, the painting is by Vermeer"
        - "The proposed identification remains unconfirmed"
      annotation_fields:
        - field: "epistemic_status"
          value: "hypothetical"
        - field: "condition"
          value: "description of condition"

  source_reliability:
    description: "Confidence based on source quality"

    tiers:
      - tier: 1
        label: "AUTHORITATIVE"
        sources:
          - "Primary sources (original documents, eyewitness accounts)"
          - "Official registries (ISIL, Wikidata with references)"
          - "Peer-reviewed scholarship"
        default_confidence: 0.95

      - tier: 2
        label: "RELIABLE"
        sources:
          - "Institutional websites"
          - "Encyclopedia entries (Britannica, Wikipedia with citations)"
          - "Expert secondary sources"
        default_confidence: 0.85

      - tier: 3
        label: "CREDIBLE"
        sources:
          - "News media"
          - "Wikipedia without citations"
          - "Aggregated databases"
        default_confidence: 0.70

      - tier: 4
        label: "UNVERIFIED"
        sources:
          - "User-generated content"
          - "Social media"
          - "NLP extraction without verification"
        default_confidence: 0.50

      - tier: 5
        label: "SUSPECT"
        sources:
          - "Known unreliable sources"
          - "Contradicted by authoritative sources"
          - "Outdated information"
        default_confidence: 0.20
# =============================================================================
# LINGUISTIC UNCERTAINTY
# =============================================================================

linguistic_uncertainty:
  description: |
    Uncertainty encoded IN THE SOURCE TEXT itself through hedging,
    modality, attribution, and evidentiality markers.

  hedging_markers:
    description: "Words/phrases indicating reduced certainty in source"

    categories:
      - category: "MODAL_VERBS"
        markers:
          - marker: "may/might"
            uncertainty_reduction: 0.4
            example: "The painting may be by Vermeer"
          - marker: "could"
            uncertainty_reduction: 0.3
            example: "This could date to the 1640s"
          - marker: "must (epistemic)"
            uncertainty_reduction: 0.1
            example: "He must have known the artist"
          - marker: "should"
            uncertainty_reduction: 0.2
            example: "The document should contain the date"

      - category: "HEDGING_ADVERBS"
        markers:
          - marker: "possibly/perhaps/maybe"
            uncertainty_reduction: 0.4
            example: "Possibly painted in Amsterdam"
          - marker: "probably/likely"
            uncertainty_reduction: 0.2
            example: "Probably by Rembrandt's workshop"
          - marker: "certainly/definitely"
            uncertainty_reduction: 0.0
            example: "Certainly authentic"
          - marker: "apparently/seemingly"
            uncertainty_reduction: 0.3
            example: "Apparently a self-portrait"

      - category: "HEDGING_ADJECTIVES"
        markers:
          - marker: "possible/potential"
            uncertainty_reduction: 0.4
            example: "A possible attribution to Hals"
          - marker: "probable/likely"
            uncertainty_reduction: 0.2
            example: "The probable author"
          - marker: "alleged/purported"
            uncertainty_reduction: 0.5
            example: "The alleged forgery"
          - marker: "so-called"
            uncertainty_reduction: 0.3
            example: "The so-called 'Night Watch'"

      - category: "ATTRIBUTION_PHRASES"
        description: "Attribution to uncertain source"
        markers:
          # NOTE(review): this entry uses a string where siblings use a float;
          # consumers must handle a non-numeric uncertainty_reduction here.
          - marker: "according to X"
            uncertainty_reduction: "depends on X reliability"
            example: "According to early sources..."
          - marker: "it is said that"
            uncertainty_reduction: 0.4
            example: "It is said that Vermeer used..."
          - marker: "traditionally attributed to"
            uncertainty_reduction: 0.3
            example: "Traditionally attributed to Leonardo"
          - marker: "some scholars believe"
            uncertainty_reduction: 0.3
            example: "Some scholars believe this is..."

      - category: "APPROXIMATION"
        markers:
          - marker: "circa/c./ca."
            uncertainty_reduction: 0.2
            example: "c. 1642"
          - marker: "approximately/about/around"
            uncertainty_reduction: 0.2
            example: "Around 1,000 works"
          - marker: "roughly/nearly"
            uncertainty_reduction: 0.2
            example: "Nearly 50 years old"

  annotation_pattern:
    description: "How to annotate linguistic uncertainty"

    schema:
      - field: "has_hedging"
        type: "boolean"
        description: "True if source text contains hedging"

      - field: "hedging_markers"
        type: "array[string]"
        description: "List of hedging markers detected"

      - field: "source_certainty"
        type: "float"
        description: "Certainty expressed in source (before extraction adjustment)"
        formula: "1.0 - max(uncertainty_reductions)"

      - field: "attributed_to"
        type: "object"
        description: "If claim is attributed, to whom"
        schema:
          source_name: "string"
          source_reliability: "float"
          is_author_endorsement: "boolean"

    examples:
      - text: "The painting is possibly by Rembrandt"
        annotation:
          has_hedging: true
          hedging_markers: ["possibly"]
          source_certainty: 0.6
          notes: "Attribution uncertain in source text"

      - text: "According to Houbraken, Rembrandt was born in Leiden"
        annotation:
          has_hedging: true
          hedging_markers: ["according to"]
          attributed_to:
            source_name: "Arnold Houbraken"
            source_reliability: 0.8
            is_author_endorsement: false
          source_certainty: 0.8
# =============================================================================
# UNCERTAINTY ANNOTATION SCHEMA
# =============================================================================

annotation_schema:
  description: "Complete schema for uncertainty annotation"

  fields:
    # Core confidence score
    - field: "confidence"
      type: "float"
      range: [0.0, 1.0]
      required: true
      description: "Overall confidence in the annotation"

    # Detailed confidence breakdown
    - field: "confidence_breakdown"
      type: "object"
      optional: true
      schema:
        boundary_confidence: "float"
        type_confidence: "float"
        referent_confidence: "float"
        extraction_confidence: "float"
      description: "Component confidence scores"

    # Confidence metadata
    - field: "confidence_method"
      type: "enum"
      values:
        - "MODEL_PROBABILITY"   # From ML model output
        - "RULE_CONFIDENCE"     # Assigned by rule
        - "HUMAN_JUDGMENT"      # Manual annotation
        - "ENSEMBLE_AGGREGATE"  # Combined from multiple sources
        - "HEURISTIC"           # Based on heuristic rules
        - "CALIBRATED"          # Post-hoc calibrated
      description: "How confidence was computed"

    - field: "calibration_status"
      type: "enum"
      values:
        - "UNCALIBRATED"  # Raw model output
        - "CALIBRATED"    # Adjusted for reliability
        - "VALIDATED"     # Verified against ground truth
      description: "Whether score has been calibrated"

    # Epistemic uncertainty
    - field: "epistemic_status"
      type: "enum"
      values:
        - "ASSERTED"      # Presented as fact
        - "DISPUTED"      # Multiple conflicting claims
        - "UNKNOWN"       # No reliable information
        - "APPROXIMATE"   # Estimated value
        - "INFERRED"      # Derived from other facts
        - "HYPOTHETICAL"  # Conditional/speculative
      optional: true
      default: "ASSERTED"
      description: "Epistemic status of the claim"

    - field: "competing_claims"
      type: "array[object]"
      optional: true
      description: "If disputed, list of competing claims"
      item_schema:
        claim: "string"
        source: "string"
        confidence: "float"

    # Linguistic uncertainty
    - field: "source_hedging"
      type: "object"
      optional: true
      schema:
        has_hedging: "boolean"
        markers: "array[string]"
        source_certainty: "float"
      description: "Hedging detected in source text"

    # Attribution
    - field: "attribution"
      type: "object"
      optional: true
      schema:
        attributed_to: "string (source/speaker)"
        attribution_type: "enum (QUOTE, PARAPHRASE, CLAIM)"
        endorsement: "boolean (does author endorse?)"
      description: "If claim is attributed to another source"

    # Review status
    - field: "review_status"
      type: "enum"
      values:
        - "UNREVIEWED"      # Not yet reviewed
        - "PENDING_REVIEW"  # Flagged for review
        - "REVIEWED"        # Reviewed, accepted
        - "DISPUTED"        # Reviewer disagrees
        - "CORRECTED"       # Original was wrong, corrected
      optional: true
      description: "Human review status"

    - field: "reviewed_by"
      type: "string"
      optional: true
      description: "Reviewer identifier"

    - field: "review_date"
      type: "string"
      format: "ISO 8601"
      optional: true
      description: "When reviewed"

    - field: "review_notes"
      type: "string"
      optional: true
      description: "Reviewer comments"
# =============================================================================
# CALIBRATION AND VALIDATION
# =============================================================================

calibration:
  description: |
    Methods for ensuring confidence scores are RELIABLE (calibrated).
    A calibrated score of 0.8 means 80% of annotations at that score are correct.

  calibration_methods:
    - method: "Temperature Scaling"
      description: |
        Post-hoc calibration using held-out validation set.
        Learn temperature T such that softmax(logits/T) is calibrated.
      suitable_for: "Neural model outputs"

    - method: "Platt Scaling"
      description: |
        Fit sigmoid function to map raw scores to calibrated probabilities.
        P(correct) = 1 / (1 + exp(A × score + B))
      suitable_for: "Binary classification outputs"

    - method: "Isotonic Regression"
      description: |
        Non-parametric calibration preserving score ordering.
        Maps scores to calibrated values via piecewise constant function.
      suitable_for: "When calibration curve is non-monotonic"

    - method: "Histogram Binning"
      description: |
        Bin predictions and assign calibrated score per bin.
      suitable_for: "Simple, interpretable calibration"

  calibration_metrics:
    - metric: "Expected Calibration Error (ECE)"
      description: "Average gap between confidence and accuracy per bin"
      formula: "Σ |B_i|/n × |accuracy(B_i) - confidence(B_i)|"
      target: "< 0.05"

    - metric: "Maximum Calibration Error (MCE)"
      description: "Largest gap across bins"
      formula: "max_i |accuracy(B_i) - confidence(B_i)|"
      target: "< 0.10"

    - metric: "Brier Score"
      description: "Mean squared error of probability estimates"
      formula: "Σ (p_i - y_i)² / n"
      target: "Lower is better"

  validation_requirements:
    description: "Requirements for validating confidence scores"

    requirements:
      - "Held-out test set with ground truth annotations"
      - "Stratified sampling across entity types and confidence ranges"
      - "Minimum 100 samples per calibration bin"
      - "Regular recalibration as model/data changes"

    # NOTE(review): nesting of `monitoring` under validation_requirements is
    # reconstructed from context (indentation was lost) — confirm intent.
    monitoring:
      - "Track calibration metrics over time"
      - "Alert on calibration drift"
      - "Retrain calibration when ECE exceeds threshold"
# =============================================================================
# INTER-ANNOTATOR AGREEMENT
# =============================================================================

inter_annotator_agreement:
  description: |
    Metrics for measuring agreement between annotators, which informs
    confidence estimation and task difficulty assessment.

  metrics:
    - metric: "Cohen's Kappa (κ)"
      description: "Agreement corrected for chance (2 annotators)"
      formula: "κ = (P_o - P_e) / (1 - P_e)"
      interpretation:
        - range: [0.81, 1.0]
          label: "Almost perfect"
        - range: [0.61, 0.80]
          label: "Substantial"
        - range: [0.41, 0.60]
          label: "Moderate"
        - range: [0.21, 0.40]
          label: "Fair"
        - range: [0.0, 0.20]
          label: "Slight"

    - metric: "Fleiss' Kappa"
      description: "Multi-annotator extension of Cohen's κ"
      use_case: "3+ annotators on same items"

    - metric: "Krippendorff's Alpha (α)"
      description: "Handles missing data, any number of annotators"
      use_case: "Production annotation with variable annotator coverage"

    - metric: "F1 Agreement"
      description: "Treat one annotator as gold, compute F1"
      use_case: "When one annotator is more senior/authoritative"

  agreement_to_confidence:
    description: |
      Use IAA to inform confidence scoring. Low agreement items
      should have lower confidence bounds.

    heuristic:
      - agreement: "All annotators agree"
        confidence_boost: 0.1

      - agreement: "Majority agrees (>66%)"
        confidence_boost: 0.0

      - agreement: "Split decision (50%)"
        confidence_reduction: 0.2
        flag: "REQUIRES_ADJUDICATION"

      - agreement: "No majority"
        confidence_reduction: 0.4
        flag: "HIGHLY_AMBIGUOUS"
# =============================================================================
# UNCERTAINTY PROPAGATION
# =============================================================================

uncertainty_propagation:
  description: |
    How uncertainty flows through annotation pipelines and affects
    downstream tasks.

  propagation_rules:
    - stage: "Entity Extraction → Relationship Extraction"
      rule: |
        Relationship confidence ≤ min(subject_confidence, object_confidence)
      rationale: "Can't have high-confidence relationship with low-confidence entities"

    - stage: "Mention Detection → Coreference"
      rule: |
        Coreference confidence ≤ min(mention_1_confidence, mention_2_confidence)
      rationale: "Coreference uncertain if mentions are uncertain"

    - stage: "Individual Annotations → Aggregate Statistics"
      rule: |
        Report confidence intervals, not just point estimates.
        E.g., "8,000 ± 500 entities extracted (95% CI)"

    - stage: "Multiple Sources → Merged Entity"
      rule: |
        merged_confidence = f(source_confidences, agreement)
        Agreement boosts; conflict reduces.

  monte_carlo_simulation:
    description: |
      For complex pipelines, use Monte Carlo simulation:
      1. Sample annotations according to confidence distributions
      2. Run downstream pipeline
      3. Aggregate to get output distribution
    use_case: "Estimating uncertainty in knowledge graph population"
# =============================================================================
# PRESENTATION AND VISUALIZATION
# =============================================================================

presentation:
  description: "How to present uncertainty to users"

  ui_guidelines:
    - guideline: "Color Coding"
      description: "Use consistent color scale from green (certain) to red (uncertain)"
      implementation:
        - confidence_range: "[0.9, 1.0]"
          confidence_min: 0.9
          confidence_max: 1.0
          color: "green"
          icon: "✓"
        - confidence_range: "[0.7, 0.9)"
          confidence_min: 0.7
          confidence_max: 0.9
          color: "light-green"
          icon: "○"
        - confidence_range: "[0.5, 0.7)"
          confidence_min: 0.5
          confidence_max: 0.7
          color: "yellow"
          icon: "?"
        - confidence_range: "[0.3, 0.5)"
          confidence_min: 0.3
          confidence_max: 0.5
          color: "orange"
          icon: "⚠"
        - confidence_range: "[0.0, 0.3)"
          confidence_min: 0.0
          confidence_max: 0.3
          color: "red"
          icon: "✗"

    - guideline: "Uncertainty Indicators"
      description: "Visual indicators scaled to confidence"
      options:
        - "Border thickness (thicker = less certain)"
        - "Opacity (more transparent = less certain)"
        - "Hatching/patterns for uncertain regions"
        - "Tooltip with detailed confidence breakdown"

    - guideline: "Sortable/Filterable"
      description: "Allow users to sort and filter by confidence"
      features:
        - "Slider to set confidence threshold"
        - "Show only items needing review (conf < 0.8)"
        - "Sort by confidence ascending (most uncertain first)"

    - guideline: "Confidence Distribution"
      description: "Show overall confidence distribution"
      implementation: "Histogram of confidence scores across annotations"

    - guideline: "Explain Uncertainty"
      description: "On hover/click, explain WHY confidence is low"
      example: |
        "Low confidence (0.45) because:
        - Ambiguous entity type (0.52)
        - Multiple possible referents in KB (0.65)
        - Source text contains hedging: 'possibly'"
# =============================================================================
# EXAMPLES
# =============================================================================

examples:
  - name: "High confidence entity"
    text: "The Rijksmuseum in Amsterdam houses Rembrandt's Night Watch."
    annotation:
      entity: "Rijksmuseum"
      entity_type: "GRP.ORG"
      confidence: 0.97
      confidence_breakdown:
        boundary_confidence: 0.99
        type_confidence: 0.98
        referent_confidence: 0.96
        extraction_confidence: 0.98
      confidence_method: "MODEL_PROBABILITY"
      epistemic_status: "ASSERTED"
      external_uri: "https://www.wikidata.org/entity/Q190804"

  - name: "Hedged attribution"
    text: "The painting is possibly by Rembrandt or his workshop."
    annotation:
      entity: "Rembrandt or his workshop"
      entity_type: "AGT.PER"
      confidence: 0.75
      confidence_breakdown:
        boundary_confidence: 0.85
        type_confidence: 0.90
        referent_confidence: 0.60
        extraction_confidence: 0.85
      source_hedging:
        has_hedging: true
        markers: ["possibly", "or"]
        source_certainty: 0.5
      epistemic_status: "DISPUTED"
      competing_claims:
        - claim: "By Rembrandt himself"
          confidence: 0.4
        - claim: "By Rembrandt's workshop"
          confidence: 0.5
        - claim: "Later copy"
          confidence: 0.1

  - name: "Approximate temporal reference"
    text: "The collection was established around 1800."
    annotation:
      entity: "around 1800"
      entity_type: "TMP.DAB"
      confidence: 0.85
      epistemic_status: "APPROXIMATE"
      source_hedging:
        has_hedging: true
        markers: ["around"]
        source_certainty: 0.8
      temporal_uncertainty:
        # Dates are quoted so they stay strings, not YAML timestamps.
        point_estimate: "1800-01-01"
        range_start: "1795-01-01"
        range_end: "1805-12-31"
        precision: "circa_decade"

  - name: "Low confidence extraction needing review"
    text: "Dr. J. van der Berg described the artifact."
    annotation:
      entity: "Dr. J. van der Berg"
      entity_type: "AGT.PER"
      confidence: 0.45
      confidence_breakdown:
        boundary_confidence: 0.90
        type_confidence: 0.85
        referent_confidence: 0.35
        extraction_confidence: 0.80
      confidence_method: "MODEL_PROBABILITY"
      review_status: "PENDING_REVIEW"
      review_notes: |
        Multiple "J. van der Berg" in knowledge base.
        Need additional context for disambiguation.
      possible_referents:
        - uri: "https://viaf.org/viaf/12345"
          name: "Johan van der Berg (1890-1960)"
          match_confidence: 0.35
        - uri: "https://viaf.org/viaf/67890"
          name: "Johannes van der Berg (1920-1995)"
          match_confidence: 0.30
        - uri: null
          name: "(Unknown person)"
          match_confidence: 0.35