glam/schemas/20251121/linkml/modules/classes/VideoTimeSegment.yaml
kempersc 51554947a0 feat(schema): Add video content schema with comprehensive examples
Video Schema Classes (9 files):
- VideoPost, VideoComment: Social media video modeling
- VideoTextContent: Base class for text content extraction
- VideoTranscript, VideoSubtitle: Text with timing and formatting
- VideoTimeSegment: Time code handling with ISO 8601 duration
- VideoAnnotation: Base annotation with W3C Web Annotation alignment
- VideoAnnotationTypes: Scene, Object, OCR detection annotations
- VideoChapter, VideoChapterList: Navigation and chapter structure
- VideoAudioAnnotation: Speaker diarization, music, sound events

Enumerations (12 enums):
- VideoDefinitionEnum, LiveBroadcastStatusEnum
- TranscriptFormatEnum, SubtitleFormatEnum, SubtitlePositionEnum
- AnnotationTypeEnum, AnnotationMotivationEnum
- DetectionLevelEnum, SceneTypeEnum, TransitionTypeEnum, TextTypeEnum
- ChapterSourceEnum, AudioEventTypeEnum, SoundEventTypeEnum, MusicTypeEnum

Examples (904 lines, 10 comprehensive heritage-themed examples):
- Rijksmuseum virtual tour chapters (5 chapters with heritage entity refs)
- Operation Night Watch documentary chapters (5 chapters)
- VideoAudioAnnotation: curator interview, exhibition promo, museum lecture

All examples reference real heritage entities with Wikidata IDs:
Q5598 (Rembrandt), Q41264 (Vermeer), Q219831 (The Night Watch)
2025-12-16 20:03:17 +01:00


# Video Time Segment Class
# Reusable temporal segment for video content (subtitles, annotations, chapters)
#
# Part of Heritage Custodian Ontology v0.9.5
#
# STRUCTURE:
# VideoTimeSegment (this class)
# - start_time, end_time (ISO 8601 duration)
# - start_seconds, end_seconds (float for computation)
# - segment_text (text content for this segment)
# - confidence (for ASR/CV generated content)
#
# USED BY:
# - VideoSubtitle (time-coded caption entries)
# - VideoAnnotation (scene/object detection segments)
# - VideoChapter (user-defined chapters)
#
# ONTOLOGY ALIGNMENT:
# - Maps to Media Fragments URI 1.0 (W3C) for temporal addressing
# - CIDOC-CRM E52_Time-Span for temporal extent
# - Web Annotation oa:FragmentSelector for annotation targets
id: https://nde.nl/ontology/hc/class/VideoTimeSegment
name: video_time_segment_class
title: Video Time Segment Class
imports:
  - linkml:types

prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  ma: http://www.w3.org/ns/ma-ont#

default_prefix: hc

classes:
  VideoTimeSegment:
    class_uri: crm:E52_Time-Span
    abstract: false
    description: |
      A temporal segment within a video, defined by start and end times.

      **DEFINITION**:
      VideoTimeSegment represents a bounded temporal portion of video content.
      It is the foundational unit for time-coded content, including:
      - Subtitle/caption entries (text displayed at specific times)
      - Annotation segments (detected scenes, objects, faces)
      - Chapter markers (user-defined content sections)

      **DUAL TIME REPRESENTATION**:
      Times are stored in two formats for different use cases:

      | Format | Example | Use Case |
      |--------|---------|----------|
      | ISO 8601 duration | PT0M30S | Human-readable, serialization |
      | Seconds (float) | 30.0 | Computation, synchronization |

      Both representations MUST be kept in sync. The seconds format is
      primary for computation; ISO 8601 is derived for display/storage.

      **MEDIA FRAGMENTS URI (W3C)**:
      VideoTimeSegment aligns with the W3C Media Fragments URI 1.0
      specification for addressing temporal fragments of video:

      ```
      https://example.com/video.mp4#t=30,35
      ```

      The `start_seconds` and `end_seconds` values map directly to the
      `t=` parameter.

      **WEB ANNOTATION COMPATIBILITY**:
      When used as an annotation target selector:
      - Maps to `oa:FragmentSelector` with `conformsTo` Media Fragments
      - Enables interoperability with the W3C Web Annotation Data Model

      **CIDOC-CRM E52_Time-Span**:
      In cultural heritage documentation:
      - E52_Time-Span represents the temporal extent of a period or event
      - Used for temporal properties of cultural objects
      - VideoTimeSegment extends this to media-specific temporal segments

      **CONFIDENCE SCORING**:
      For segments generated by ASR (speech recognition) or CV (computer vision):
      - `confidence`: 0.0-1.0 score for segment accuracy
      - Enables filtering by quality threshold
      - Critical for AI-generated transcripts and annotations

      **HERITAGE USE CASES**:

      | Use Case | Example | Start | End |
      |----------|---------|-------|-----|
      | Subtitle entry | "Welcome to the museum" | 0:30 | 0:35 |
      | Scene annotation | "Exhibition hall panorama" | 1:00 | 1:30 |
      | Chapter marker | "Introduction" | 0:00 | 2:00 |
      | Object detection | "Painting: Night Watch" | 3:15 | 3:20 |
      | Speaker change | "Curator speaking" | 5:00 | 7:30 |
    exact_mappings:
      - crm:E52_Time-Span
      - oa:FragmentSelector
    close_mappings:
      - ma:MediaFragment
    related_mappings:
      - schema:Clip
    slots:
      # Time boundaries (ISO 8601 duration format)
      - start_time
      - end_time
      # Time boundaries (seconds for computation)
      - start_seconds
      - end_seconds
      # Content
      - segment_text
      - segment_index
      # Quality
      - confidence
      # Metadata
      - speaker_id
      - speaker_label
    slot_usage:
      start_time:
        slot_uri: ma:hasStartTime
        description: |
          Start time of segment as ISO 8601 duration from video beginning.
          Media Ontology: hasStartTime for temporal start.

          **Format**: ISO 8601 duration (e.g., "PT0M30S" = 30 seconds from start)

          **Common patterns**:
          - PT0S = start of video (0 seconds)
          - PT30S = 30 seconds
          - PT1M30S = 1 minute 30 seconds
          - PT1H15M30S = 1 hour 15 minutes 30 seconds
        range: string
        required: false
        pattern: "^PT(\\d+H)?(\\d+M)?(\\d+(\\.\\d+)?S)?$"
        examples:
          - value: "PT0M30S"
            description: "30 seconds from video start"
          - value: "PT1H15M30S"
            description: "1 hour 15 minutes 30 seconds"
      end_time:
        slot_uri: ma:hasEndTime
        description: |
          End time of segment as ISO 8601 duration from video beginning.
          Media Ontology: hasEndTime for temporal end.
          Must be greater than or equal to start_time.
        range: string
        required: false
        pattern: "^PT(\\d+H)?(\\d+M)?(\\d+(\\.\\d+)?S)?$"
        examples:
          - value: "PT0M35S"
            description: "35 seconds from video start"
      start_seconds:
        slot_uri: hc:startSeconds
        description: |
          Start time in seconds (floating point) from video beginning.

          **PRIMARY for computation**. Use for:
          - Video player synchronization
          - Duration calculations
          - Time-based sorting and filtering

          Precision to milliseconds (3 decimal places) is typical.
        range: float
        required: true
        minimum_value: 0.0
        examples:
          - value: 30.0
            description: "30 seconds from start"
          - value: 30.500
            description: "30.5 seconds (millisecond precision)"
      end_seconds:
        slot_uri: hc:endSeconds
        description: |
          End time in seconds (floating point) from video beginning.
          Must be greater than or equal to start_seconds.
          For single-frame annotations (e.g., object detection in one frame),
          end_seconds may equal start_seconds or be slightly greater.
        range: float
        required: true
        minimum_value: 0.0
        examples:
          - value: 35.0
            description: "35 seconds from start"
      segment_text:
        slot_uri: oa:bodyValue
        description: |
          Text content for this segment.
          Web Annotation: bodyValue for textual content.

          **Usage by content type**:
          - Subtitles: displayed caption text
          - Transcripts: spoken words during this segment
          - Annotations: description of detected content
          - Chapters: chapter title/description
        range: string
        required: false
        examples:
          - value: "Welkom bij het Rijksmuseum"
            description: "Dutch subtitle text"
          - value: "The curator explains the painting's history"
            description: "Transcript segment"
      segment_index:
        slot_uri: hc:segmentIndex
        description: |
          Sequential index of this segment within the parent content.
          Zero-based index for ordering segments:
          - Subtitle: order in which captions appear
          - Annotation: detection sequence

          Enables reconstruction of segment order when times overlap,
          or stable sorting.
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 0
            description: "First segment"
          - value: 42
            description: "43rd segment (zero-indexed)"
      confidence:
        slot_uri: hc:confidence
        description: |
          Confidence score for AI-generated content.
          Range: 0.0 (no confidence) to 1.0 (complete certainty).

          **Applies to**:
          - ASR-generated transcript/subtitle segments
          - CV-detected scene or object annotations
          - OCR-extracted text from video frames

          **Thresholds** (suggested):
          - > 0.9: high confidence, suitable for display
          - 0.7-0.9: medium, may need review
          - < 0.7: low, flag for human verification
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.95
            description: "High confidence ASR segment"
          - value: 0.72
            description: "Medium confidence, may contain errors"
      speaker_id:
        slot_uri: hc:speakerId
        description: |
          Identifier for the speaker during this segment.
          For transcripts with speaker diarization:
          - Links to an identified speaker (e.g., "SPEAKER_01")
          - May be resolved to an actual person identity

          Enables multi-speaker transcript navigation.
        range: string
        required: false
        examples:
          - value: "SPEAKER_01"
            description: "First identified speaker"
          - value: "curator_taco_dibbits"
            description: "Resolved speaker identity"
      speaker_label:
        slot_uri: hc:speakerLabel
        description: |
          Human-readable label for the speaker.
          Display name for the speaker during this segment:
          - May be generic ("Narrator", "Interviewer")
          - May be specific ("Dr. Taco Dibbits, Museum Director")

          Distinguished from speaker_id, which is a machine identifier.
        range: string
        required: false
        examples:
          - value: "Narrator"
            description: "Generic speaker label"
          - value: "Dr. Taco Dibbits, Museum Director"
            description: "Specific identified speaker"
    rules:
      # LinkML rules cannot express cross-slot numeric comparisons directly;
      # this rule documents the constraint for external validation.
      - description: end_seconds must be >= start_seconds
    comments:
      - "Reusable time segment for subtitles, annotations, chapters"
      - "Dual time format: ISO 8601 for serialization, seconds for computation"
      - "Aligns with W3C Media Fragments URI specification"
      - "Confidence scoring for AI-generated content"
      - "Speaker diarization support for multi-speaker transcripts"
    see_also:
      - "https://www.w3.org/TR/media-frags/"
      - "https://www.w3.org/TR/annotation-model/"
      - "https://www.w3.org/ns/ma-ont"
      - "http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span"

# ============================================================================
# Slot Definitions
# ============================================================================
slots:
  start_time:
    description: Start time as ISO 8601 duration from video beginning
    range: string
  end_time:
    description: End time as ISO 8601 duration from video beginning
    range: string
  start_seconds:
    description: Start time in seconds (float) from video beginning
    range: float
  end_seconds:
    description: End time in seconds (float) from video beginning
    range: float
  segment_text:
    description: Text content for this time segment
    range: string
  segment_index:
    description: Sequential index of segment within parent
    range: integer
  confidence:
    description: Confidence score for AI-generated content (0.0-1.0)
    range: float
  speaker_id:
    description: Identifier for speaker during this segment
    range: string
  speaker_label:
    description: Human-readable label for speaker
    range: string
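
The schema's dual time representation, Media Fragments alignment, and the `end_seconds >= start_seconds` rule (which LinkML cannot enforce) imply a small amount of consumer-side logic. The sketch below is not part of the schema; it shows how a hypothetical consumer might derive the ISO 8601 form from the primary seconds value, address a segment as a `#t=` fragment, and filter by the suggested confidence threshold. All function names are illustrative.

```python
import re

# Mirrors the schema's pattern: "^PT(\\d+H)?(\\d+M)?(\\d+(\\.\\d+)?S)?$"
DURATION_RE = re.compile(r"^PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:\.\d+)?)S)?$")


def duration_to_seconds(duration: str) -> float:
    """Convert an ISO 8601 duration (start_time/end_time) to float seconds."""
    match = DURATION_RE.match(duration)
    if match is None:
        raise ValueError(f"not a valid segment duration: {duration!r}")
    hours, minutes, secs = match.groups()
    return int(hours or 0) * 3600 + int(minutes or 0) * 60 + float(secs or 0.0)


def seconds_to_duration(total: float) -> str:
    """Derive the ISO 8601 form from the primary seconds representation."""
    if total < 0:
        raise ValueError("segment times must be >= 0")
    hours, rest = divmod(total, 3600)
    minutes, secs = divmod(rest, 60)
    parts = ["PT"]
    if hours:
        parts.append(f"{int(hours)}H")
    if minutes:
        parts.append(f"{int(minutes)}M")
    # Always emit a seconds component so PT0S round-trips at video start.
    parts.append(f"{secs:g}S")
    return "".join(parts)


def media_fragment(video_url: str, start_seconds: float, end_seconds: float) -> str:
    """Address a segment as a W3C Media Fragments URI (temporal t= dimension)."""
    if end_seconds < start_seconds:
        raise ValueError("end_seconds must be >= start_seconds")
    return f"{video_url}#t={float(start_seconds):g},{float(end_seconds):g}"


def confident(segments: list[dict], threshold: float = 0.9) -> list[dict]:
    """Keep segments at or above the schema's suggested display threshold."""
    return [s for s in segments if s.get("confidence", 0.0) >= threshold]
```

Note that `seconds_to_duration` emits the shortest valid form (e.g., "PT30S" rather than the equivalent "PT0M30S" used in the schema examples); both match the declared pattern.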