id: https://nde.nl/ontology/hc/class/VideoAnnotation
name: video_annotation_class
title: Video Annotation Class
imports:
  - linkml:types
  - ./VideoTextContent
  - ./VideoTimeSegment
  - ./AnnotationMotivationType
  - ./AnnotationMotivationTypes
  - ../slots/has_annotation_motivation
  - ../slots/has_annotation_segment
  - ../slots/has_annotation_type
  - ../slots/detection_count
  - ../slots/detection_threshold
  - ../slots/frame_sample_rate
  - ../slots/includes_bounding_box
  - ../slots/includes_segmentation_mask
  - ../slots/keyframe_extraction
  - ../slots/model_architecture
  - ../slots/model_task
  - ../slots/specificity_annotation
  - ../slots/template_specificity
  - ../slots/total_frames_analyzed
  - ./SpecificityAnnotation
  - ./TemplateSpecificityScores
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  as: https://www.w3.org/ns/activitystreams#
default_prefix: hc

classes:
  VideoAnnotation:
    is_a: VideoTextContent
    class_uri: oa:Annotation
    abstract: true
    description: |-
      Abstract base class for computer vision and multimodal video annotations.

      **DEFINITION**:

      VideoAnnotation represents structured information derived from visual
      analysis of video content. This includes:

      | Subclass | Analysis Type | Output |
      |----------|---------------|--------|
      | VideoSceneAnnotation | Shot/scene detection | Scene boundaries, types |
      | VideoObjectAnnotation | Object detection | Objects, faces, logos |
      | VideoOCRAnnotation | Text extraction | On-screen text (OCR) |

      **RELATIONSHIP TO W3C WEB ANNOTATION**:

      VideoAnnotation aligns with the W3C Web Annotation Data Model:

      ```turtle
      :annotation a oa:Annotation ;
          oa:hasBody :detection_result ;
          oa:hasTarget [
              oa:hasSource :video ;
              oa:hasSelector [
                  a oa:FragmentSelector ;
                  dcterms:conformsTo <http://www.w3.org/TR/media-frags/> ;
                  rdf:value "t=30,35"
              ]
          ] ;
          oa:motivatedBy oa:classifying .
      ```

      **FRAME-BASED ANALYSIS**:

      Unlike audio transcription (a continuous stream), video annotation is
      typically frame-based:

      - `frame_sample_rate`: Frames analyzed per second (e.g., 1 fps, 5 fps)
      - `total_frames_analyzed`: Total frames processed
      - Higher sample rates mean more detections but higher compute cost

      **DETECTION THRESHOLDS**:

      CV models output confidence scores. Thresholds filter noise:

      | Threshold | Use Case |
      |-----------|----------|
      | 0.9+ | High precision, production display |
      | 0.7-0.9 | Balanced, general use |
      | 0.5-0.7 | High recall, research/review |
      | < 0.5 | Raw output, needs filtering |

      **MODEL ARCHITECTURE TRACKING**:

      Different model architectures have different characteristics:

      | Architecture | Examples | Strengths |
      |--------------|----------|-----------|
      | CNN | ResNet, VGG | Fast inference, good for objects |
      | Transformer | ViT, Swin, CLIP | Better context, multimodal |
      | Hybrid | DETR | Balance of speed and accuracy |

      **HERITAGE INSTITUTION CONTEXT**:

      Video annotations enable:

      - **Discovery**: Find videos containing specific objects/artworks
      - **Accessibility**: Scene descriptions for visually impaired users
      - **Research**: Analyze visual content at scale
      - **Preservation**: Document visual content as text
      - **Linking**: Connect detected artworks to collection records

      **CIDOC-CRM E13_Attribute_Assignment**:

      Annotations are attribute assignments: they assert properties about
      video segments. The CV model or human annotator is the assigning agent.
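    # Non-normative sketch of instance data for a concrete subclass (the
    # class itself is abstract). Slot names are the ones declared below;
    # values are illustrative and reuse the examples given in slot_usage:
    #
    #   has_annotation_type: OBJECT_DETECTION
    #   frame_sample_rate: 1.0
    #   total_frames_analyzed: 1800
    #   detection_threshold: 0.5
    #   detection_count: 342
    #   model_architecture: Transformer
    #   model_task: detection
    #   has_annotation_segment:
    #     - start_seconds: 30.0
    #       end_seconds: 35.0
    #       segment_text: "Night Watch painting visible"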
    exact_mappings:
      - oa:Annotation
    close_mappings:
      - crm:E13_Attribute_Assignment
    related_mappings:
      - as:Activity
      - schema:ClaimReview
    slots:
      - has_annotation_motivation
      - has_annotation_segment
      - has_annotation_type
      - detection_count
      - detection_threshold
      - frame_sample_rate
      - includes_bounding_box
      - includes_segmentation_mask
      - keyframe_extraction
      - model_architecture
      - model_task
      - specificity_annotation
      - template_specificity
      - total_frames_analyzed
    slot_usage:
      has_annotation_type:
        slot_uri: dcterms:type
        description: |-
          High-level type classification for this annotation.
          Dublin Core: type for resource categorization.

          **Standard Types**:

          - SCENE_DETECTION: Shot/scene boundary detection
          - OBJECT_DETECTION: Object, face, logo detection
          - OCR: Text-in-video extraction
          - ACTION_RECOGNITION: Human action detection
          - SEMANTIC_SEGMENTATION: Pixel-level classification
          - MULTIMODAL: Combined audio+visual analysis
        range: AnnotationTypeEnum
        required: true
        examples:
          - value: OBJECT_DETECTION
            description: Object and face detection annotation
      has_annotation_segment:
        slot_uri: oa:hasBody
        description: |-
          List of temporal segments with detection results.
          Web Annotation: hasBody links an annotation to its content.

          Each segment contains:

          - Time boundaries (start/end)
          - Detection text/description
          - Per-segment confidence

          Reuses VideoTimeSegment for consistent temporal modeling.
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: "[{start_seconds: 30.0, end_seconds: 35.0, segment_text: 'Night Watch painting visible'}]"
            description: Object detection segment
      detection_threshold:
        slot_uri: hc:detectionThreshold
        description: |-
          Minimum confidence threshold used for detection filtering.
          Detections below this threshold were excluded from results.

          Range: 0.0 to 1.0

          **Common Values**:

          - 0.5: Standard threshold (balanced)
          - 0.7: High precision mode
          - 0.3: High recall mode (includes uncertain detections)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.5
            description: Standard detection threshold
      detection_count:
        slot_uri: hc:detectionCount
        description: |-
          Total number of detections across all analyzed frames.

          Useful for:

          - Understanding annotation density
          - Quality assessment
          - Performance metrics

          Note: May be higher than the number of has_annotation_segment
          entries if detections are aggregated or filtered into segments.
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 342
            description: 342 total detections found
      frame_sample_rate:
        slot_uri: hc:frameSampleRate
        description: |-
          Number of frames analyzed per second of video.

          **Common Values**:

          - 1.0: One frame per second (efficient)
          - 5.0: Five frames per second (balanced)
          - 30.0: Every frame at 30 fps (thorough but expensive)
          - 0.1: One frame every 10 seconds (overview only)

          Higher rates catch more content but increase compute cost.
        range: float
        required: false
        minimum_value: 0.0
        examples:
          - value: 1.0
            description: Analyzed 1 frame per second
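      # Worked example relating frame_sample_rate (above) to
      # total_frames_analyzed (defined next), assuming uniform sampling
      # (i.e. keyframe_extraction is false):
      #
      #   total_frames_analyzed = video_duration_seconds * frame_sample_rate
      #   30-minute video at 1.0 fps -> 1800 s * 1.0 fps = 1800 frames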
      total_frames_analyzed:
        slot_uri: hc:totalFramesAnalyzed
        description: |-
          Total number of video frames that were analyzed.

          Calculated as: video_duration_seconds × frame_sample_rate

          Useful for:

          - Understanding analysis coverage
          - Cost estimation
          - Reproducibility
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 1800
            description: Analyzed 1,800 frames (30 min video at 1 fps)
      keyframe_extraction:
        slot_uri: hc:keyframeExtraction
        description: |-
          Whether keyframe extraction was used instead of uniform sampling.

          **Keyframe extraction** selects visually distinct frames (scene
          changes, significant motion) rather than uniform intervals.

          - true: Keyframes extracted (variable frame selection)
          - false: Uniform sampling at frame_sample_rate

          Keyframe extraction is more efficient but may miss content
          between scene changes.
        range: boolean
        required: false
        examples:
          - value: true
            description: Used keyframe extraction
      model_architecture:
        slot_uri: hc:modelArchitecture
        description: |-
          Architecture type of the CV/ML model used.

          **Common Architectures**:

          - CNN: Convolutional Neural Network (ResNet, VGG, EfficientNet, ConvNeXt)
          - Transformer: Vision Transformer (ViT, Swin, CLIP)
          - Hybrid: Combined CNN/Transformer architectures (DETR)
          - RNN: Recurrent (for temporal analysis)
          - GAN: Generative (for reconstruction tasks)

          Useful for understanding model capabilities and limitations.
        range: string
        required: false
        examples:
          - value: Transformer
            description: Vision Transformer architecture
          - value: CNN
            description: Convolutional Neural Network
      model_task:
        slot_uri: hc:modelTask
        description: |-
          Specific task the model was trained for.

          **Common Tasks**:

          - classification: Image/frame classification
          - detection: Object detection with bounding boxes
          - segmentation: Pixel-level classification
          - captioning: Image/video captioning
          - embedding: Feature extraction for similarity

          A model's task determines its output format.
        range: string
        required: false
        examples:
          - value: detection
            description: Object detection task
          - value: captioning
            description: Video captioning task
      includes_bounding_box:
        slot_uri: hc:includesBoundingBoxes
        description: |-
          Whether the annotation includes spatial bounding box coordinates.

          Bounding boxes define rectangular regions in frames where
          objects/faces/text were detected.

          Format is typically [x, y, width, height] or [x1, y1, x2, y2].

          - true: Spatial coordinates available in segment data
          - false: Only temporal information (no spatial)
        range: boolean
        required: false
        examples:
          - value: true
            description: Includes bounding box coordinates
      includes_segmentation_mask:
        slot_uri: hc:includesSegmentationMasks
        description: |-
          Whether the annotation includes pixel-level segmentation masks.

          Segmentation masks provide precise object boundaries (more
          detailed than bounding boxes).

          - true: Pixel masks available (typically as separate files)
          - false: No segmentation data

          Masks are memory-intensive; they are often stored externally.
        range: boolean
        required: false
        examples:
          - value: false
            description: No segmentation masks included
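      # Illustrative (non-normative) comparison of the two box formats
      # named in includes_bounding_box above, for the same hypothetical
      # region:
      #
      #   [x, y, width, height]:  [120, 80, 200, 150]
      #   [x1, y1, x2, y2]:       [120, 80, 320, 230]   # x2 = x + width, y2 = y + height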
      has_annotation_motivation:
        slot_uri: oa:motivatedBy
        description: |-
          The motivation or purpose for creating this annotation.
          Web Annotation: motivatedBy describes why the annotation was created.

          **Standard Motivations** (from W3C Web Annotation):

          - ClassifyingMotivation: Categorizing content
          - DescribingMotivation: Adding description
          - IdentifyingMotivation: Identifying depicted things
          - TaggingMotivation: Adding tags/keywords
          - LinkingMotivation: Linking to external resources
          - CommentingMotivation: Adding commentary

          **Heritage-Specific Extensions**:

          - AccessibilityMotivation: For accessibility services
          - DiscoveryMotivation: For search/discovery
          - PreservationMotivation: For digital preservation
          - ResearchMotivation: For scholarly research
        range: AnnotationMotivationType
        required: false
        examples:
          - value: ClassifyingMotivation
            description: Annotation for classification purposes
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    comments:
      - Abstract base for all CV/multimodal video annotations
      - Extends VideoTextContent with frame-based analysis parameters
      - W3C Web Annotation compatible structure
      - Supports both temporal and spatial annotation
      - Tracks detection thresholds and model architecture
    see_also:
      - https://www.w3.org/TR/annotation-model/
      - http://www.cidoc-crm.org/cidoc-crm/E13_Attribute_Assignment
      - https://iiif.io/api/presentation/3.0/

enums:
  AnnotationTypeEnum:
    description: Types of video annotation based on analysis method.
    permissible_values:
      SCENE_DETECTION:
        description: Shot and scene boundary detection
      OBJECT_DETECTION:
        description: Object, face, and logo detection
      OCR:
        description: Optical character recognition (text-in-video)
      ACTION_RECOGNITION:
        description: Human action and activity detection
      SEMANTIC_SEGMENTATION:
        description: Pixel-level semantic classification
      POSE_ESTIMATION:
        description: Human body pose detection
      EMOTION_RECOGNITION:
        description: Facial emotion/expression analysis
      MULTIMODAL:
        description: Combined audio-visual analysis
      CAPTIONING:
        description: Automated video captioning/description
      CUSTOM:
        description: Custom annotation type
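# Non-normative note on how detection_threshold and detection_count relate
# in a producing pipeline (illustrative, not part of the schema):
#
#   detection_count = number of detections with confidence >= detection_threshold
#
# e.g. with detection_threshold: 0.5, all detections scoring below 0.5 are
# excluded before detection_count is recorded, per the slot descriptions above.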