# Video Annotation Class
# Abstract base class for computer vision and multimodal video annotations
#
# Part of Heritage Custodian Ontology v0.9.5
#
# HIERARCHY:
#   E73_Information_Object (CIDOC-CRM)
#   │
#   └── VideoTextContent (abstract base)
#       │
#       ├── VideoTranscript (audio-derived)
#       │   │
#       │   └── VideoSubtitle (time-coded captions)
#       │
#       └── VideoAnnotation (this class - ABSTRACT)
#           │
#           ├── VideoSceneAnnotation (scene/shot detection)
#           ├── VideoObjectAnnotation (object/face/logo detection)
#           └── VideoOCRAnnotation (text-in-video extraction)
#
# DESIGN RATIONALE:
# VideoAnnotation is the abstract parent for all annotations derived from
# visual analysis of video content. Unlike VideoTranscript (audio-derived),
# these annotations come from computer vision, multimodal AI, or manual
# visual analysis.
#
# Key differences from the transcript branch:
# - Frame-based rather than audio-based analysis
# - Spatial information (bounding boxes, regions)
# - Detection thresholds and frame sampling
# - Multiple detection types per segment
#
# ONTOLOGY ALIGNMENT:
# - W3C Web Annotation (oa:Annotation) for annotation structure
# - CIDOC-CRM E13_Attribute_Assignment for attribution activities
# - IIIF Presentation API for spatial/temporal selectors

id: https://nde.nl/ontology/hc/class/VideoAnnotation
name: video_annotation_class
title: Video Annotation Class

imports:
  - linkml:types
  - ./VideoTextContent
  - ./VideoTimeSegment

prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  as: https://www.w3.org/ns/activitystreams#

default_prefix: hc

classes:
  VideoAnnotation:
    is_a: VideoTextContent
    class_uri: oa:Annotation
    abstract: true
    description: |
      Abstract base class for computer vision and multimodal video annotations.
      **DEFINITION**: VideoAnnotation represents structured information derived
      from visual analysis of video content. This includes:

      | Subclass | Analysis Type | Output |
      |----------|---------------|--------|
      | VideoSceneAnnotation | Shot/scene detection | Scene boundaries, types |
      | VideoObjectAnnotation | Object detection | Objects, faces, logos |
      | VideoOCRAnnotation | Text extraction | On-screen text (OCR) |

      **RELATIONSHIP TO W3C WEB ANNOTATION**:
      VideoAnnotation aligns with the W3C Web Annotation Data Model:

      ```turtle
      :annotation a oa:Annotation ;
        oa:hasBody :detection_result ;
        oa:hasTarget [
          oa:hasSource :video ;
          oa:hasSelector [
            a oa:FragmentSelector ;
            dcterms:conformsTo <http://www.w3.org/TR/media-frags/> ;
            rdf:value "t=30,35"
          ]
        ] ;
        oa:motivatedBy oa:classifying .
      ```

      **FRAME-BASED ANALYSIS**:
      Unlike audio transcription (a continuous stream), video annotation is
      typically frame-based:

      - `frame_sample_rate`: Frames analyzed per second (e.g., 1 fps, 5 fps)
      - `total_frames_analyzed`: Total frames processed
      - Higher sample rates = more detections but higher compute cost

      **DETECTION THRESHOLDS**:
      CV models output confidence scores.
      Thresholds filter noise:

      | Threshold | Use Case |
      |-----------|----------|
      | 0.9+ | High precision, production display |
      | 0.7-0.9 | Balanced, general use |
      | 0.5-0.7 | High recall, research/review |
      | < 0.5 | Raw output, needs filtering |

      **MODEL ARCHITECTURE TRACKING**:
      Different model architectures have different characteristics:

      | Architecture | Examples | Strengths |
      |--------------|----------|-----------|
      | CNN | ResNet, VGG | Fast inference, good for objects |
      | Transformer | ViT, CLIP | Better context, multimodal |
      | Hybrid | DETR, Swin | Balance of speed and accuracy |

      **HERITAGE INSTITUTION CONTEXT**:
      Video annotations enable:

      - **Discovery**: Find videos containing specific objects/artworks
      - **Accessibility**: Scene descriptions for visually impaired users
      - **Research**: Analyze visual content at scale
      - **Preservation**: Document visual content as text
      - **Linking**: Connect detected artworks to collection records

      **CIDOC-CRM E13_Attribute_Assignment**:
      Annotations are attribute assignments: they assert properties about
      video segments. The CV model or human annotator is the assigning agent.
    exact_mappings:
      - oa:Annotation
    close_mappings:
      - crm:E13_Attribute_Assignment
    related_mappings:
      - as:Activity
      - schema:ClaimReview
    slots:
      # Annotation structure
      - annotation_type
      - annotation_segments
      # Detection parameters
      - detection_threshold
      - detection_count
      # Frame analysis
      - frame_sample_rate
      - total_frames_analyzed
      - keyframe_extraction
      # Model details
      - model_architecture
      - model_task
      # Spatial information
      - includes_bounding_boxes
      - includes_segmentation_masks
      # Annotation motivation
      - annotation_motivation
    slot_usage:
      annotation_type:
        slot_uri: dcterms:type
        description: |
          High-level type classification for this annotation.
          Dublin Core: type for resource categorization.
          **Standard Types**:
          - SCENE_DETECTION: Shot/scene boundary detection
          - OBJECT_DETECTION: Object, face, logo detection
          - OCR: Text-in-video extraction
          - ACTION_RECOGNITION: Human action detection
          - SEMANTIC_SEGMENTATION: Pixel-level classification
          - MULTIMODAL: Combined audio+visual analysis
        range: AnnotationTypeEnum
        required: true
        examples:
          - value: "OBJECT_DETECTION"
            description: "Object and face detection annotation"
      annotation_segments:
        slot_uri: oa:hasBody
        description: |
          List of temporal segments with detection results.
          Web Annotation: hasBody links the annotation to its content.

          Each segment contains:
          - Time boundaries (start/end)
          - Detection text/description
          - Per-segment confidence

          Reuses VideoTimeSegment for consistent temporal modeling.
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: "[{start_seconds: 30.0, end_seconds: 35.0, segment_text: 'Night Watch painting visible'}]"
            description: "Object detection segment"
      detection_threshold:
        slot_uri: hc:detectionThreshold
        description: |
          Minimum confidence threshold used for detection filtering.
          Detections below this threshold were excluded from results.

          Range: 0.0 to 1.0

          **Common Values**:
          - 0.5: Standard threshold (balanced)
          - 0.7: High precision mode
          - 0.3: High recall mode (includes uncertain detections)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.5
            description: "Standard detection threshold"
      detection_count:
        slot_uri: hc:detectionCount
        description: |
          Total number of detections across all analyzed frames.

          Useful for:
          - Understanding annotation density
          - Quality assessment
          - Performance metrics

          Note: May be higher than the annotation_segments count if segments
          are aggregated or filtered.
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 342
            description: "342 total detections found"
      frame_sample_rate:
        slot_uri: hc:frameSampleRate
        description: |
          Number of frames analyzed per second of video.
          **Common Values**:
          - 1.0: One frame per second (efficient)
          - 5.0: Five frames per second (balanced)
          - 30.0: Every frame at 30 fps (thorough but expensive)
          - 0.1: One frame every 10 seconds (overview only)

          Higher rates catch more content but increase compute cost.
        range: float
        required: false
        minimum_value: 0.0
        examples:
          - value: 1.0
            description: "Analyzed 1 frame per second"
      total_frames_analyzed:
        slot_uri: hc:totalFramesAnalyzed
        description: |
          Total number of video frames that were analyzed.

          Calculated as: video_duration_seconds × frame_sample_rate

          Useful for:
          - Understanding analysis coverage
          - Cost estimation
          - Reproducibility
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 1800
            description: "Analyzed 1,800 frames (30 min video at 1 fps)"
      keyframe_extraction:
        slot_uri: hc:keyframeExtraction
        description: |
          Whether keyframe extraction was used instead of uniform sampling.

          **Keyframe extraction** selects visually distinct frames (scene
          changes, significant motion) rather than uniform intervals.

          - true: Keyframes extracted (variable frame selection)
          - false: Uniform sampling at frame_sample_rate

          Keyframe extraction is more efficient but may miss content between
          scene changes.
        range: boolean
        required: false
        examples:
          - value: true
            description: "Used keyframe extraction"
      model_architecture:
        slot_uri: hc:modelArchitecture
        description: |
          Architecture type of the CV/ML model used.

          **Common Architectures**:
          - CNN: Convolutional Neural Network (ResNet, VGG, EfficientNet)
          - Transformer: Vision Transformer (ViT, Swin, CLIP)
          - Hybrid: Combined architectures (DETR, ConvNeXt)
          - RNN: Recurrent (for temporal analysis)
          - GAN: Generative (for reconstruction tasks)

          Useful for understanding model capabilities and limitations.
        range: string
        required: false
        examples:
          - value: "Transformer"
            description: "Vision Transformer architecture"
          - value: "CNN"
            description: "Convolutional Neural Network"
      model_task:
        slot_uri: hc:modelTask
        description: |
          Specific task the model was trained for.

          **Common Tasks**:
          - classification: Image/frame classification
          - detection: Object detection with bounding boxes
          - segmentation: Pixel-level classification
          - captioning: Image/video captioning
          - embedding: Feature extraction for similarity

          A model's task determines its output format.
        range: string
        required: false
        examples:
          - value: "detection"
            description: "Object detection task"
          - value: "captioning"
            description: "Video captioning task"
      includes_bounding_boxes:
        slot_uri: hc:includesBoundingBoxes
        description: |
          Whether annotation includes spatial bounding box coordinates.

          Bounding boxes define rectangular regions in frames where
          objects/faces/text were detected.
          Format typically: [x, y, width, height] or [x1, y1, x2, y2]

          - true: Spatial coordinates available in segment data
          - false: Only temporal information (no spatial)
        range: boolean
        required: false
        examples:
          - value: true
            description: "Includes bounding box coordinates"
      includes_segmentation_masks:
        slot_uri: hc:includesSegmentationMasks
        description: |
          Whether annotation includes pixel-level segmentation masks.

          Segmentation masks provide precise object boundaries (more detailed
          than bounding boxes).

          - true: Pixel masks available (typically as separate files)
          - false: No segmentation data

          Masks are memory-intensive and often stored externally.
        range: boolean
        required: false
        examples:
          - value: false
            description: "No segmentation masks included"
      annotation_motivation:
        slot_uri: oa:motivatedBy
        description: |
          The motivation or purpose for creating this annotation.
          Web Annotation: motivatedBy describes why the annotation was created.
          **Standard Motivations** (from W3C Web Annotation):
          - classifying: Categorizing content
          - describing: Adding description
          - identifying: Identifying depicted things
          - tagging: Adding tags/keywords
          - linking: Linking to external resources

          **Heritage-Specific**:
          - accessibility: For accessibility services
          - discovery: For search/discovery
          - preservation: For digital preservation
        range: AnnotationMotivationEnum
        required: false
        examples:
          - value: "CLASSIFYING"
            description: "Annotation for classification purposes"
    comments:
      - "Abstract base for all CV/multimodal video annotations"
      - "Extends VideoTextContent with frame-based analysis parameters"
      - "W3C Web Annotation compatible structure"
      - "Supports both temporal and spatial annotation"
      - "Tracks detection thresholds and model architecture"
    see_also:
      - "https://www.w3.org/TR/annotation-model/"
      - "http://www.cidoc-crm.org/cidoc-crm/E13_Attribute_Assignment"
      - "https://iiif.io/api/presentation/3.0/"

# ============================================================================
# Enumerations
# ============================================================================

enums:
  AnnotationTypeEnum:
    description: |
      Types of video annotation based on analysis method.
    permissible_values:
      SCENE_DETECTION:
        description: Shot and scene boundary detection
      OBJECT_DETECTION:
        description: Object, face, and logo detection
      OCR:
        description: Optical character recognition (text-in-video)
      ACTION_RECOGNITION:
        description: Human action and activity detection
      SEMANTIC_SEGMENTATION:
        description: Pixel-level semantic classification
      POSE_ESTIMATION:
        description: Human body pose detection
      EMOTION_RECOGNITION:
        description: Facial emotion/expression analysis
      MULTIMODAL:
        description: Combined audio-visual analysis
      CAPTIONING:
        description: Automated video captioning/description
      CUSTOM:
        description: Custom annotation type

  AnnotationMotivationEnum:
    description: |
      Motivation for creating annotation (W3C Web Annotation aligned).
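
      A motivation value serializes directly in Web Annotation Turtle; for
      example (illustrative snippet, mirroring the Turtle example in the
      class description):

      ```turtle
      :annotation oa:motivatedBy oa:classifying .
      ```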
    permissible_values:
      CLASSIFYING:
        description: Categorizing or classifying content
        meaning: oa:classifying
      DESCRIBING:
        description: Adding descriptive information
        meaning: oa:describing
      IDENTIFYING:
        description: Identifying depicted entities
        meaning: oa:identifying
      TAGGING:
        description: Adding tags or keywords
        meaning: oa:tagging
      LINKING:
        description: Linking to external resources
        meaning: oa:linking
      COMMENTING:
        description: Adding commentary
        meaning: oa:commenting
      ACCESSIBILITY:
        description: Providing accessibility support
      DISCOVERY:
        description: Enabling search and discovery
      PRESERVATION:
        description: Supporting digital preservation
      RESEARCH:
        description: Supporting research and analysis

# ============================================================================
# Slot Definitions
# ============================================================================

slots:
  annotation_type:
    description: High-level type of video annotation
    range: AnnotationTypeEnum

  annotation_segments:
    description: List of temporal segments with detection results
    range: VideoTimeSegment
    multivalued: true

  detection_threshold:
    description: Minimum confidence threshold for detection filtering
    range: float

  detection_count:
    description: Total number of detections found
    range: integer

  frame_sample_rate:
    description: Frames analyzed per second of video
    range: float

  total_frames_analyzed:
    description: Total number of frames analyzed
    range: integer

  keyframe_extraction:
    description: Whether keyframe extraction was used
    range: boolean

  model_architecture:
    description: Architecture type of CV/ML model (CNN, Transformer, etc.)
    range: string

  model_task:
    description: Specific task model was trained for
    range: string

  includes_bounding_boxes:
    description: Whether annotation includes spatial bounding boxes
    range: boolean

  includes_segmentation_masks:
    description: Whether annotation includes pixel segmentation masks
    range: boolean

  annotation_motivation:
    description: Motivation for creating annotation (W3C Web Annotation)
    range: AnnotationMotivationEnum
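
# ============================================================================
# Example Instance (illustrative)
# ============================================================================
# VideoAnnotation is abstract, so data instances are created via its
# subclasses. The commented sketch below shows how a hypothetical
# VideoObjectAnnotation record could populate the slots defined above;
# the values are taken from the examples in this file or invented for
# illustration, not drawn from a real collection.
#
#   annotation_type: OBJECT_DETECTION
#   annotation_motivation: IDENTIFYING
#   detection_threshold: 0.7
#   detection_count: 342
#   frame_sample_rate: 1.0
#   total_frames_analyzed: 1800
#   keyframe_extraction: false
#   model_architecture: "Transformer"
#   model_task: "detection"
#   includes_bounding_boxes: true
#   includes_segmentation_masks: false
#   annotation_segments:
#     - start_seconds: 30.0
#       end_seconds: 35.0
#       segment_text: "Night Watch painting visible"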