# =============================================================================
# GLAM-NER Entity Annotation Convention
# Module: Uncertainty and Confidence
# Path: modules/advanced/uncertainty.yaml
# Version: 1.7.0
# =============================================================================
#
# This module defines uncertainty modeling for entity annotations, enabling
# consumers to filter, prioritize, and reason about data quality.
#
# THREE types of uncertainty are distinguished:
#
# 1. ANNOTATION UNCERTAINTY: Confidence in a specific annotation decision
#    - "How sure are we that 'Rembrandt' refers to AGT.PER?"
#
# 2. EPISTEMIC UNCERTAINTY: Uncertainty about facts in the world
#    - "Did Rembrandt actually paint this work?" (attribution uncertainty)
#
# 3. LINGUISTIC UNCERTAINTY: Ambiguity in the source text itself
#    - "The text says 'possibly by Rembrandt'" (hedged language)
#
# These are distinct and must be tracked separately.
#
# =============================================================================
---
module:
  id: uncertainty
  name: Uncertainty and Confidence
  version: "1.7.0"
  status: stable
  category: advanced
  dependencies:
    - core/convention
    - core/namespaces
  description: |
    Uncertainty modeling captures the degree of confidence in annotations,
    enabling consumers to filter, prioritize, and reason about data quality.

    This framework provides:
    - Numerical confidence scores with defined semantics
    - Calibration methods for reliable scores
    - Epistemic uncertainty classification
    - Linguistic hedging detection
    - Inter-annotator agreement metrics
    - Uncertainty propagation rules
    - Visualization guidelines

# =============================================================================
# NAMESPACES
# =============================================================================

namespaces:
  prov: "http://www.w3.org/ns/prov#"
  oa: "http://www.w3.org/ns/oa#"
  crm: "http://www.cidoc-crm.org/cidoc-crm/"
  iao: "http://purl.obolibrary.org/obo/IAO_"
  dqv: "http://www.w3.org/ns/dqv#"
  tei: "http://www.tei-c.org/ns/1.0"
  schema: "http://schema.org/"
  xsd: "http://www.w3.org/2001/XMLSchema#"

# =============================================================================
# CONFIDENCE SCORE FRAMEWORK
# =============================================================================

confidence_framework:
  description: |
    Numerical confidence scores on [0.0, 1.0] scale with defined semantics.
    Scores should be CALIBRATED: a score of 0.8 means the annotation is
    correct approximately 80% of the time across similar cases.

  score_semantics:
    # Range strings are quoted: a leading "[" would otherwise start a YAML
    # flow sequence. Machine-readable bounds live in range_min/range_max.
    - range: "[0.95, 1.0]"
      range_min: 0.95
      range_max: 1.0
      inclusive: "both"
      label: "CERTAIN"
      tei_cert: "high"
      description: |
        Near-certain annotation. Human annotators would agree 95%+ of the time.
        Examples: exact string matches, unambiguous proper nouns, rule-based
        high-precision extractions.
      action: "Accept automatically; no review needed"
      color_code: "#228B22"  # Forest green

    - range: "[0.80, 0.95)"
      range_min: 0.80
      range_max: 0.95
      inclusive: "left"
      label: "HIGH_CONFIDENCE"
      tei_cert: "high"
      description: |
        High confidence annotation. Strong signals but some ambiguity possible.
        Examples: well-known entities with standard forms, clear context.
      action: "Accept with spot-checking"
      color_code: "#32CD32"  # Lime green

    - range: "[0.60, 0.80)"
      range_min: 0.60
      range_max: 0.80
      inclusive: "left"
      label: "MEDIUM_CONFIDENCE"
      tei_cert: "medium"
      description: |
        Moderate confidence. Multiple interpretations possible; context helps.
        Examples: common names with multiple referents, unclear boundaries.
      action: "Review if resource-sensitive; accept for bulk processing"
      color_code: "#FFD700"  # Gold

    - range: "[0.40, 0.60)"
      range_min: 0.40
      range_max: 0.60
      inclusive: "left"
      label: "LOW_CONFIDENCE"
      tei_cert: "low"
      description: |
        Low confidence. Significant uncertainty; near chance for binary decision.
        Examples: ambiguous pronouns, unknown entities, noisy text.
      action: "Flag for human review"
      color_code: "#FFA500"  # Orange

    - range: "[0.20, 0.40)"
      range_min: 0.20
      range_max: 0.40
      inclusive: "left"
      label: "VERY_LOW_CONFIDENCE"
      tei_cert: "low"
      description: |
        Very low confidence. Model is guessing; alternative interpretations likely.
      action: "Require human verification before use"
      color_code: "#FF4500"  # Orange-red

    - range: "[0.0, 0.20)"
      range_min: 0.0
      range_max: 0.20
      inclusive: "left"
      label: "UNCERTAIN"
      tei_cert: "unknown"
      description: |
        Near-zero confidence. Essentially no signal; included for completeness.
      action: "Do not use without manual annotation"
      color_code: "#DC143C"  # Crimson

  special_values:
    # Intentional YAML null: distinguishes "not computed" from any score.
    - value: null
      meaning: "Confidence not computed/available"
    - value: 1.0
      meaning: "Absolute certainty (use sparingly; only for definitional truths)"
    - value: 0.0
      meaning: "Definite negative (annotation is known to be incorrect)"

  aggregation_rules:
    description: "How to combine confidence scores"
    rules:
      - name: "Independent Conjunction"
        formula: "P(A ∧ B) = P(A) × P(B)"
        use_case: "Joint probability of independent annotations"
        example: "Entity type AND entity boundaries both correct"
      - name: "Conservative Minimum"
        formula: "min(conf_1, conf_2, ...)"
        use_case: "Weakest-link scenarios; overall quality limited by weakest component"
        example: "Relationship confidence = min(subject_conf, object_conf, predicate_conf)"
      - name: "Weighted Average"
        formula: "Σ(weight_i × conf_i) / Σ(weight_i)"
        use_case: "Combining scores from multiple sources/annotators"
        example: "Ensemble model output"
      - name: "Maximum"
        formula: "max(conf_1, conf_2, ...)"
        use_case: "Any-correct scenarios (at least one interpretation valid)"
        example: "Multiple valid entity types"

# =============================================================================
# ANNOTATION UNCERTAINTY
# =============================================================================

annotation_uncertainty:
  description: |
    Uncertainty about the correctness of annotation DECISIONS.
    This is process uncertainty, not world uncertainty.

  dimensions:
    - dimension: "boundary_confidence"
      description: "Confidence in span boundaries (start/end offsets)"
      factors:
        - "Tokenization ambiguity"
        - "Nested entity decisions"
        - "Modifier attachment"
      examples:
        - text: "the Dutch painter Rembrandt"
          issue: "Should span include 'the Dutch painter' or just 'Rembrandt'?"
          resolution: "Mark 'Rembrandt' as head; broader span has lower confidence"

    - dimension: "type_confidence"
      description: "Confidence in entity type assignment"
      factors:
        - "Ambiguous entity types (organization vs. event: 'the conference')"
        - "Metonymy (place for organization: 'Washington announced')"
        - "Type granularity (museum subtype assignment)"
      examples:
        - text: "Apple announced new products"
          issue: "AGT.PER (person)? GRP.COR (corporation)? WRK.OBJ (fruit)?"
          resolution: "Context disambiguates; assign GRP.COR with high confidence"

    - dimension: "referent_confidence"
      description: "Confidence in identity of referred entity"
      factors:
        - "Ambiguous names (multiple people named 'John Smith')"
        - "Unknown entities (not in knowledge base)"
        - "Nickname/variant resolution"
      examples:
        - text: "Dr. Williams presented the findings"
          issue: "Which Dr. Williams? No disambiguating context."
          resolution: "Create entity with low referent_confidence; flag for KB linking"

    - dimension: "extraction_confidence"
      description: "Confidence from the extraction model/process"
      factors:
        - "Model probability/logits"
        - "Rule match specificity"
        - "OCR quality (for digitized documents)"

  combined_score:
    formula: |
      annotation_confidence = min(
        boundary_confidence,
        type_confidence,
        referent_confidence
      ) × extraction_confidence
    rationale: |
      Use minimum for boundary/type/referent (weakest link), then scale
      by extraction quality. Low OCR confidence degrades all downstream scores.

# =============================================================================
# EPISTEMIC UNCERTAINTY
# =============================================================================

epistemic_uncertainty:
  description: |
    Uncertainty about FACTS IN THE WORLD, not annotation process.
    Captures disputed, unknown, or probability claims about reality.

  uncertainty_types:
    - type: "DISPUTED"
      description: "Multiple conflicting claims in sources"
      examples:
        - "Scholars dispute whether Vermeer used a camera obscura"
        - "The painting's attribution is contested"
      annotation_fields:
        - field: "epistemic_status"
          value: "disputed"
        - field: "competing_claims"
          value: ["claim_1", "claim_2"]

    - type: "UNKNOWN"
      description: "No reliable information available"
      examples:
        - "The artist's birthdate is unknown"
        - "The work's provenance before 1900 is unrecorded"
      annotation_fields:
        - field: "epistemic_status"
          value: "unknown"

    - type: "APPROXIMATE"
      description: "Value is estimated or rounded"
      examples:
        - "The collection contains approximately 8,000 works"
        - "Painted circa 1642"
      annotation_fields:
        - field: "epistemic_status"
          value: "approximate"
        - field: "precision"
          value: "circa_year | circa_decade | order_of_magnitude"

    - type: "INFERRED"
      description: "Derived from other facts, not directly stated"
      examples:
        - "If born in 1606 and died in 1669, he lived 63 years"
        - "Based on style, attributed to the artist's late period"
      annotation_fields:
        - field: "epistemic_status"
          value: "inferred"
        - field: "inference_basis"
          value: "description of reasoning"

    - type: "HYPOTHETICAL"
      description: "Conditional or speculative claim"
      examples:
        - "If the signature is authentic, the painting is by Vermeer"
        - "The proposed identification remains unconfirmed"
      annotation_fields:
        - field: "epistemic_status"
          value: "hypothetical"
        - field: "condition"
          value: "description of condition"

  source_reliability:
    description: "Confidence based on source quality"
    tiers:
      - tier: 1
        label: "AUTHORITATIVE"
        sources:
          - "Primary sources (original documents, eyewitness accounts)"
          - "Official registries (ISIL, Wikidata with references)"
          - "Peer-reviewed scholarship"
        default_confidence: 0.95
      - tier: 2
        label: "RELIABLE"
        sources:
          - "Institutional websites"
          - "Encyclopedia entries (Britannica, Wikipedia with citations)"
          - "Expert secondary sources"
        default_confidence: 0.85
      - tier: 3
        label: "CREDIBLE"
        sources:
          - "News media"
          - "Wikipedia without citations"
          - "Aggregated databases"
        default_confidence: 0.70
      - tier: 4
        label: "UNVERIFIED"
        sources:
          - "User-generated content"
          - "Social media"
          - "NLP extraction without verification"
        default_confidence: 0.50
      - tier: 5
        label: "SUSPECT"
        sources:
          - "Known unreliable sources"
          - "Contradicted by authoritative sources"
          - "Outdated information"
        default_confidence: 0.20

# =============================================================================
# LINGUISTIC UNCERTAINTY
# =============================================================================

linguistic_uncertainty:
  description: |
    Uncertainty encoded IN THE SOURCE TEXT itself through hedging,
    modality, attribution, and evidentiality markers.

  hedging_markers:
    description: "Words/phrases indicating reduced certainty in source"
    categories:
      - category: "MODAL_VERBS"
        markers:
          - marker: "may/might"
            uncertainty_reduction: 0.4
            example: "The painting may be by Vermeer"
          - marker: "could"
            uncertainty_reduction: 0.3
            example: "This could date to the 1640s"
          - marker: "must (epistemic)"
            uncertainty_reduction: 0.1
            example: "He must have known the artist"
          - marker: "should"
            uncertainty_reduction: 0.2
            example: "The document should contain the date"

      - category: "HEDGING_ADVERBS"
        markers:
          - marker: "possibly/perhaps/maybe"
            uncertainty_reduction: 0.4
            example: "Possibly painted in Amsterdam"
          - marker: "probably/likely"
            uncertainty_reduction: 0.2
            example: "Probably by Rembrandt's workshop"
          - marker: "certainly/definitely"
            uncertainty_reduction: 0.0
            example: "Certainly authentic"
          - marker: "apparently/seemingly"
            uncertainty_reduction: 0.3
            example: "Apparently a self-portrait"

      - category: "HEDGING_ADJECTIVES"
        markers:
          - marker: "possible/potential"
            uncertainty_reduction: 0.4
            example: "A possible attribution to Hals"
          - marker: "probable/likely"
            uncertainty_reduction: 0.2
            example: "The probable author"
          - marker: "alleged/purported"
            uncertainty_reduction: 0.5
            example: "The alleged forgery"
          - marker: "so-called"
            uncertainty_reduction: 0.3
            example: "The so-called 'Night Watch'"

      - category: "ATTRIBUTION_PHRASES"
        description: "Attribution to uncertain source"
        markers:
          - marker: "according to X"
            # String, not a number: reduction varies with the cited source.
            uncertainty_reduction: "depends on X reliability"
            example: "According to early sources..."
          - marker: "it is said that"
            uncertainty_reduction: 0.4
            example: "It is said that Vermeer used..."
          - marker: "traditionally attributed to"
            uncertainty_reduction: 0.3
            example: "Traditionally attributed to Leonardo"
          - marker: "some scholars believe"
            uncertainty_reduction: 0.3
            example: "Some scholars believe this is..."

      - category: "APPROXIMATION"
        markers:
          - marker: "circa/c./ca."
            uncertainty_reduction: 0.2
            example: "c. 1642"
          - marker: "approximately/about/around"
            uncertainty_reduction: 0.2
            example: "Around 1,000 works"
          - marker: "roughly/nearly"
            uncertainty_reduction: 0.2
            example: "Nearly 50 years old"

  annotation_pattern:
    description: "How to annotate linguistic uncertainty"
    schema:
      - field: "has_hedging"
        type: "boolean"
        description: "True if source text contains hedging"
      - field: "hedging_markers"
        type: "array[string]"
        description: "List of hedging markers detected"
      - field: "source_certainty"
        type: "float"
        description: "Certainty expressed in source (before extraction adjustment)"
        formula: "1.0 - max(uncertainty_reductions)"
      - field: "attributed_to"
        type: "object"
        description: "If claim is attributed, to whom"
        schema:
          source_name: "string"
          source_reliability: "float"
          is_author_endorsement: "boolean"

    examples:
      - text: "The painting is possibly by Rembrandt"
        annotation:
          has_hedging: true
          hedging_markers: ["possibly"]
          source_certainty: 0.6
          notes: "Attribution uncertain in source text"

      - text: "According to Houbraken, Rembrandt was born in Leiden"
        annotation:
          has_hedging: true
          hedging_markers: ["according to"]
          attributed_to:
            source_name: "Arnold Houbraken"
            source_reliability: 0.8
            is_author_endorsement: false
          source_certainty: 0.8

# =============================================================================
# UNCERTAINTY ANNOTATION SCHEMA
# =============================================================================

annotation_schema:
  description: "Complete schema for uncertainty annotation"

  fields:
    # Core confidence score
    - field: "confidence"
      type: "float"
      range: [0.0, 1.0]
      required: true
      description: "Overall confidence in the annotation"

    # Detailed confidence breakdown
    - field: "confidence_breakdown"
      type: "object"
      optional: true
      schema:
        boundary_confidence: "float"
        type_confidence: "float"
        referent_confidence: "float"
        extraction_confidence: "float"
      description: "Component confidence scores"

    # Confidence metadata
    - field: "confidence_method"
      type: "enum"
      values:
        - "MODEL_PROBABILITY"   # From ML model output
        - "RULE_CONFIDENCE"     # Assigned by rule
        - "HUMAN_JUDGMENT"      # Manual annotation
        - "ENSEMBLE_AGGREGATE"  # Combined from multiple sources
        - "HEURISTIC"           # Based on heuristic rules
        - "CALIBRATED"          # Post-hoc calibrated
      description: "How confidence was computed"

    - field: "calibration_status"
      type: "enum"
      values:
        - "UNCALIBRATED"  # Raw model output
        - "CALIBRATED"    # Adjusted for reliability
        - "VALIDATED"     # Verified against ground truth
      description: "Whether score has been calibrated"

    # Epistemic uncertainty
    - field: "epistemic_status"
      type: "enum"
      values:
        - "ASSERTED"      # Presented as fact
        - "DISPUTED"      # Multiple conflicting claims
        - "UNKNOWN"       # No reliable information
        - "APPROXIMATE"   # Estimated value
        - "INFERRED"      # Derived from other facts
        - "HYPOTHETICAL"  # Conditional/speculative
      optional: true
      default: "ASSERTED"
      description: "Epistemic status of the claim"

    - field: "competing_claims"
      type: "array[object]"
      optional: true
      description: "If disputed, list of competing claims"
      item_schema:
        claim: "string"
        source: "string"
        confidence: "float"

    # Linguistic uncertainty
    - field: "source_hedging"
      type: "object"
      optional: true
      schema:
        has_hedging: "boolean"
        markers: "array[string]"
        source_certainty: "float"
      description: "Hedging detected in source text"

    # Attribution
    - field: "attribution"
      type: "object"
      optional: true
      schema:
        attributed_to: "string (source/speaker)"
        attribution_type: "enum (QUOTE, PARAPHRASE, CLAIM)"
        endorsement: "boolean (does author endorse?)"
      description: "If claim is attributed to another source"

    # Review status
    - field: "review_status"
      type: "enum"
      values:
        - "UNREVIEWED"      # Not yet reviewed
        - "PENDING_REVIEW"  # Flagged for review
        - "REVIEWED"        # Reviewed, accepted
        - "DISPUTED"        # Reviewer disagrees
        - "CORRECTED"       # Original was wrong, corrected
      optional: true
      description: "Human review status"

    - field: "reviewed_by"
      type: "string"
      optional: true
      description: "Reviewer identifier"

    - field: "review_date"
      type: "string"
      format: "ISO 8601"
      optional: true
      description: "When reviewed"

    - field: "review_notes"
      type: "string"
      optional: true
      description: "Reviewer comments"

# =============================================================================
# CALIBRATION AND VALIDATION
# =============================================================================

calibration:
  description: |
    Methods for ensuring confidence scores are RELIABLE (calibrated).
    A calibrated score of 0.8 means 80% of annotations at that score are correct.

  calibration_methods:
    - method: "Temperature Scaling"
      description: |
        Post-hoc calibration using held-out validation set.
        Learn temperature T such that softmax(logits/T) is calibrated.
      suitable_for: "Neural model outputs"

    - method: "Platt Scaling"
      description: |
        Fit sigmoid function to map raw scores to calibrated probabilities.
        P(correct) = 1 / (1 + exp(A × score + B))
      suitable_for: "Binary classification outputs"

    - method: "Isotonic Regression"
      description: |
        Non-parametric calibration preserving score ordering.
        Maps scores to calibrated values via piecewise constant function.
      suitable_for: "When calibration curve is non-monotonic"

    - method: "Histogram Binning"
      description: |
        Bin predictions and assign calibrated score per bin.
      suitable_for: "Simple, interpretable calibration"

  calibration_metrics:
    - metric: "Expected Calibration Error (ECE)"
      description: "Average gap between confidence and accuracy per bin"
      formula: "Σ |B_i|/n × |accuracy(B_i) - confidence(B_i)|"
      target: "< 0.05"
    - metric: "Maximum Calibration Error (MCE)"
      description: "Largest gap across bins"
      formula: "max_i |accuracy(B_i) - confidence(B_i)|"
      target: "< 0.10"
    - metric: "Brier Score"
      description: "Mean squared error of probability estimates"
      formula: "Σ (p_i - y_i)² / n"
      target: "Lower is better"

  validation_requirements:
    description: "Requirements for validating confidence scores"
    requirements:
      - "Held-out test set with ground truth annotations"
      - "Stratified sampling across entity types and confidence ranges"
      - "Minimum 100 samples per calibration bin"
      - "Regular recalibration as model/data changes"
    monitoring:
      - "Track calibration metrics over time"
      - "Alert on calibration drift"
      - "Retrain calibration when ECE exceeds threshold"

# =============================================================================
# INTER-ANNOTATOR AGREEMENT
# =============================================================================

inter_annotator_agreement:
  description: |
    Metrics for measuring agreement between annotators, which informs
    confidence estimation and task difficulty assessment.

  metrics:
    - metric: "Cohen's Kappa (κ)"
      description: "Agreement corrected for chance (2 annotators)"
      formula: "κ = (P_o - P_e) / (1 - P_e)"
      interpretation:
        - range: [0.81, 1.0]
          label: "Almost perfect"
        - range: [0.61, 0.80]
          label: "Substantial"
        - range: [0.41, 0.60]
          label: "Moderate"
        - range: [0.21, 0.40]
          label: "Fair"
        - range: [0.0, 0.20]
          label: "Slight"

    - metric: "Fleiss' Kappa"
      description: "Multi-annotator extension of Cohen's κ"
      use_case: "3+ annotators on same items"

    - metric: "Krippendorff's Alpha (α)"
      description: "Handles missing data, any number of annotators"
      use_case: "Production annotation with variable annotator coverage"

    - metric: "F1 Agreement"
      description: "Treat one annotator as gold, compute F1"
      use_case: "When one annotator is more senior/authoritative"

  agreement_to_confidence:
    description: |
      Use IAA to inform confidence scoring. Low agreement items
      should have lower confidence bounds.
    heuristic:
      - agreement: "All annotators agree"
        confidence_boost: 0.1
      - agreement: "Majority agrees (>66%)"
        confidence_boost: 0.0
      - agreement: "Split decision (50%)"
        confidence_reduction: 0.2
        flag: "REQUIRES_ADJUDICATION"
      - agreement: "No majority"
        confidence_reduction: 0.4
        flag: "HIGHLY_AMBIGUOUS"

# =============================================================================
# UNCERTAINTY PROPAGATION
# =============================================================================

uncertainty_propagation:
  description: |
    How uncertainty flows through annotation pipelines and
    affects downstream tasks.

  propagation_rules:
    - stage: "Entity Extraction → Relationship Extraction"
      rule: |
        Relationship confidence ≤ min(subject_confidence, object_confidence)
      rationale: "Can't have high-confidence relationship with low-confidence entities"

    - stage: "Mention Detection → Coreference"
      rule: |
        Coreference confidence ≤ min(mention_1_confidence, mention_2_confidence)
      rationale: "Coreference uncertain if mentions are uncertain"

    - stage: "Individual Annotations → Aggregate Statistics"
      rule: |
        Report confidence intervals, not just point estimates.
        E.g., "8,000 ± 500 entities extracted (95% CI)"

    - stage: "Multiple Sources → Merged Entity"
      rule: |
        merged_confidence = f(source_confidences, agreement)
        Agreement boosts; conflict reduces.

  monte_carlo_simulation:
    description: |
      For complex pipelines, use Monte Carlo simulation:
      1. Sample annotations according to confidence distributions
      2. Run downstream pipeline
      3. Aggregate to get output distribution
    use_case: "Estimating uncertainty in knowledge graph population"

# =============================================================================
# PRESENTATION AND VISUALIZATION
# =============================================================================

presentation:
  description: "How to present uncertainty to users"

  ui_guidelines:
    - guideline: "Color Coding"
      description: "Use consistent color scale from green (certain) to red (uncertain)"
      implementation:
        - confidence_range: "[0.9, 1.0]"
          confidence_min: 0.9
          confidence_max: 1.0
          color: "green"
          icon: "✓"
        - confidence_range: "[0.7, 0.9)"
          confidence_min: 0.7
          confidence_max: 0.9
          color: "light-green"
          icon: "○"
        - confidence_range: "[0.5, 0.7)"
          confidence_min: 0.5
          confidence_max: 0.7
          color: "yellow"
          icon: "?"
        - confidence_range: "[0.3, 0.5)"
          confidence_min: 0.3
          confidence_max: 0.5
          color: "orange"
          icon: "⚠"
        - confidence_range: "[0.0, 0.3)"
          confidence_min: 0.0
          confidence_max: 0.3
          color: "red"
          icon: "✗"

    - guideline: "Uncertainty Indicators"
      description: "Visual indicators scaled to confidence"
      options:
        - "Border thickness (thicker = less certain)"
        - "Opacity (more transparent = less certain)"
        - "Hatching/patterns for uncertain regions"
        - "Tooltip with detailed confidence breakdown"

    - guideline: "Sortable/Filterable"
      description: "Allow users to sort and filter by confidence"
      features:
        - "Slider to set confidence threshold"
        - "Show only items needing review (conf < 0.8)"
        - "Sort by confidence ascending (most uncertain first)"

    - guideline: "Confidence Distribution"
      description: "Show overall confidence distribution"
      implementation: "Histogram of confidence scores across annotations"

    - guideline: "Explain Uncertainty"
      description: "On hover/click, explain WHY confidence is low"
      example: |
        "Low confidence (0.45) because:
        - Ambiguous entity type (0.52)
        - Multiple possible referents in KB (0.65)
        - Source text contains hedging: 'possibly'"

# =============================================================================
# EXAMPLES
# =============================================================================

examples:
  - name: "High confidence entity"
    text: "The Rijksmuseum in Amsterdam houses Rembrandt's Night Watch."
    annotation:
      entity: "Rijksmuseum"
      entity_type: "GRP.ORG"
      confidence: 0.97
      confidence_breakdown:
        boundary_confidence: 0.99
        type_confidence: 0.98
        referent_confidence: 0.96
        extraction_confidence: 0.98
      confidence_method: "MODEL_PROBABILITY"
      epistemic_status: "ASSERTED"
      external_uri: "https://www.wikidata.org/entity/Q190804"

  - name: "Hedged attribution"
    text: "The painting is possibly by Rembrandt or his workshop."
    annotation:
      entity: "Rembrandt or his workshop"
      entity_type: "AGT.PER"
      confidence: 0.75
      confidence_breakdown:
        boundary_confidence: 0.85
        type_confidence: 0.90
        referent_confidence: 0.60
        extraction_confidence: 0.85
      source_hedging:
        has_hedging: true
        markers: ["possibly", "or"]
        source_certainty: 0.5
      epistemic_status: "DISPUTED"
      competing_claims:
        - claim: "By Rembrandt himself"
          confidence: 0.4
        - claim: "By Rembrandt's workshop"
          confidence: 0.5
        - claim: "Later copy"
          confidence: 0.1

  - name: "Approximate temporal reference"
    text: "The collection was established around 1800."
    annotation:
      entity: "around 1800"
      entity_type: "TMP.DAB"
      confidence: 0.85
      epistemic_status: "APPROXIMATE"
      source_hedging:
        has_hedging: true
        markers: ["around"]
        source_certainty: 0.8
      temporal_uncertainty:
        point_estimate: "1800-01-01"
        range_start: "1795-01-01"
        range_end: "1805-12-31"
        precision: "circa_decade"

  - name: "Low confidence extraction needing review"
    text: "Dr. J. van der Berg described the artifact."
    annotation:
      entity: "Dr. J. van der Berg"
      entity_type: "AGT.PER"
      confidence: 0.45
      confidence_breakdown:
        boundary_confidence: 0.90
        type_confidence: 0.85
        referent_confidence: 0.35
        extraction_confidence: 0.80
      confidence_method: "MODEL_PROBABILITY"
      review_status: "PENDING_REVIEW"
      review_notes: |
        Multiple "J. van der Berg" in knowledge base.
        Need additional context for disambiguation.
      possible_referents:
        - uri: "https://viaf.org/viaf/12345"
          name: "Johan van der Berg (1890-1960)"
          match_confidence: 0.35
        - uri: "https://viaf.org/viaf/67890"
          name: "Johannes van der Berg (1920-1995)"
          match_confidence: 0.30
        # Intentional null: no KB record exists for this candidate.
        - uri: null
          name: "(Unknown person)"
          match_confidence: 0.35