# Video Audio Annotation Class
#
# Models audio event detection in video content (speech, music, silence, diarization)
#
# Part of Heritage Custodian Ontology v0.9.10
#
# HIERARCHY:
#
#   VideoAnnotation (abstract base)
#   │
#   ├── VideoSceneAnnotation (scene/shot detection)
#   ├── VideoObjectAnnotation (object/face/logo detection)
#   ├── VideoOCRAnnotation (text-in-video extraction)
#   └── VideoAudioAnnotation (this class)
#         - Speech detection and diarization
#         - Music detection and classification
#         - Sound event detection
#         - Silence/noise detection
#
# HERITAGE INSTITUTION USE CASES:
# - Speaker identification in curator interviews
# - Music detection in promotional videos
# - Silence detection for video quality analysis
# - Language detection for multilingual content
# - Applause/audience reaction in lecture recordings
# - Sound effects in exhibition media
#
# ONTOLOGY ALIGNMENT:
# - W3C Web Annotation for annotation structure
# - CIDOC-CRM E13_Attribute_Assignment for attribution
# - W3C Media Ontology for audio properties
# - Speech-to-Text standards for diarization

id: https://nde.nl/ontology/hc/class/VideoAudioAnnotation
name: video_audio_annotation_class
title: Video Audio Annotation Class

imports:
  - linkml:types
  - ./VideoAnnotation
  - ./VideoTimeSegment

prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  ma: http://www.w3.org/ns/ma-ont#
  wd: http://www.wikidata.org/entity/

default_prefix: hc

# ============================================================================
# Classes
# ============================================================================

classes:

  VideoAudioAnnotation:
    is_a: VideoAnnotation
    class_uri: hc:VideoAudioAnnotation
    abstract: false
    description: |
      Annotation for audio events detected in video content.

      **DEFINITION**:

      VideoAudioAnnotation captures structured information derived from audio
      analysis of video content. This includes speech, music, silence, and
      various sound events.

      **AUDIO ANALYSIS TYPES**:

      | Type | Description | Use Case |
      |------|-------------|----------|
      | **Speech Detection** | Identify spoken segments | Transcript alignment |
      | **Speaker Diarization** | Who spoke when | Interview navigation |
      | **Music Detection** | Identify musical segments | Content classification |
      | **Sound Events** | Applause, laughter, etc. | Audience engagement |
      | **Silence Detection** | Find quiet segments | Quality assessment |
      | **Language Detection** | Identify spoken languages | Multilingual content |

      **SPEAKER DIARIZATION**:

      Diarization answers "who spoke when":

      ```
      0:00-0:15  Speaker 1 (Curator)
      0:15-0:45  Speaker 2 (Artist)
      0:45-1:00  Speaker 1 (Curator)
      1:00-1:30  Speaker 3 (Museum Director)
      ```

      Heritage applications:
      - Navigate to specific speakers in interviews
      - Count speaking time per person
      - Identify unnamed speakers for annotation
      - Build speaker databases for recognition
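
      A minimal sketch of the "count speaking time per person" use case,
      assuming diarization results are available as plain
      `(start_s, end_s, speaker)` tuples (names are illustrative, not part
      of the schema):

      ```python
      from collections import defaultdict

      def speaking_time(segments):
          """Total seconds spoken per speaker, most-spoken first."""
          totals = defaultdict(float)
          for start, end, speaker in segments:
              totals[speaker] += end - start
          # Most-spoken first, matching the ordering used by speaker_labels
          return dict(sorted(totals.items(), key=lambda kv: -kv[1]))

      segments = [
          (0.0, 15.0, "Curator"),
          (15.0, 45.0, "Artist"),
          (45.0, 60.0, "Curator"),
          (60.0, 90.0, "Museum Director"),
      ]
      # speaking_time(segments) → 30.0 s each for all three speakers
      ```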

      **MUSIC DETECTION**:

      Music detection classifies audio segments as containing music:

      | Category | Examples |
      |----------|----------|
      | **Background music** | Documentary soundtracks |
      | **Featured music** | Concert recordings, performances |
      | **Historical music** | Archival recordings |
      | **Licensed music** | Rights-managed content |

      Music segments may also include:
      - Genre classification (classical, jazz, folk)
      - Mood/tempo analysis
      - Fingerprinting for identification

      **SOUND EVENT DETECTION**:

      Non-speech, non-music audio events:

      | Event Type | Heritage Context |
      |------------|------------------|
      | APPLAUSE | Lecture recordings, openings |
      | LAUGHTER | Tour guides, educational content |
      | CROWD_NOISE | Event documentation |
      | DOOR/FOOTSTEPS | Ambient archive recordings |
      | NATURE_SOUNDS | Outdoor heritage site recordings |
      | MACHINERY | Industrial heritage, conservation |

      **LANGUAGE DETECTION**:

      Multilingual heritage content requires language identification:

      ```yaml
      speech_segments:
        - start: 0.0
          end: 120.0
          language: nl
          speaker_id: speaker_001
        - start: 120.0
          end: 240.0
          language: en
          speaker_id: speaker_001  # Same speaker, switched language
      ```

      **AUDIO QUALITY ANALYSIS**:

      Audio quality metrics for preservation and accessibility:

      | Metric | Description | Threshold |
      |--------|-------------|-----------|
      | SNR | Signal-to-noise ratio | > 20 dB good |
      | Clipping | Peak distortion | None ideal |
      | Noise floor | Background noise level | < -50 dB good |
      | Frequency response | Bandwidth | Full-range ideal |

      **HERITAGE INSTITUTION USE CASES**:

      | Content Type | Audio Analysis Need |
      |--------------|---------------------|
      | Oral histories | Diarization, transcription alignment |
      | Curator interviews | Speaker identification, language |
      | Virtual tours | Background music, voiceover detection |
      | Lecture recordings | Audience reactions, Q&A segments |
      | Conservation videos | Narration vs demonstration audio |
      | Archival footage | Speech recovery, noise reduction |

      **RELATIONSHIP TO VideoTranscript**:

      VideoAudioAnnotation is complementary to VideoTranscript:

      - **VideoTranscript**: The text content of speech (WHAT was said)
      - **VideoAudioAnnotation**: Audio structure (WHO spoke, music, sounds)

      Together they provide complete audio understanding:

      ```
      VideoAudioAnnotation: Speaker 1 spoke 0:00-0:15
      VideoTranscript:      "Welcome to the Rijksmuseum..." (0:00-0:15)
      → Combined:           Curator said "Welcome to the Rijksmuseum..."
      ```
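
      The combination step can be sketched as a time-overlap join
      (hypothetical in-memory tuples; real data would come from
      `diarization_segments` and the VideoTranscript segments):

      ```python
      def attribute_quotes(diarization, transcript):
          """Pair each transcript line with the speaker whose
          diarization segment overlaps it the most."""
          attributed = []
          for t_start, t_end, text in transcript:
              best, best_overlap = None, 0.0
              for d_start, d_end, speaker in diarization:
                  overlap = min(t_end, d_end) - max(t_start, d_start)
                  if overlap > best_overlap:
                      best, best_overlap = speaker, overlap
              attributed.append((best, text))
          return attributed

      diarization = [(0.0, 15.0, "Curator"), (15.0, 45.0, "Artist")]
      transcript = [(0.0, 15.0, "Welcome to the Rijksmuseum...")]
      # attribute_quotes(diarization, transcript)
      # → [("Curator", "Welcome to the Rijksmuseum...")]
      ```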

    exact_mappings:
      - hc:VideoAudioAnnotation

    close_mappings:
      - ma:AudioTrack
      - crm:E13_Attribute_Assignment

    related_mappings:
      - wd:Q11028  # Speech
      - wd:Q638  # Music

    slots:
      # Audio event detection
      - audio_event_segments
      - primary_audio_event_type

      # Speech analysis
      - speech_detected
      - speech_segments
      - speech_language
      - speech_language_confidence
      - languages_detected

      # Speaker diarization
      - diarization_enabled
      - diarization_segments
      - speaker_count
      - speaker_labels

      # Music detection
      - music_detected
      - music_segments
      - music_genres_detected
      - music_confidence

      # Sound events
      - sound_events_detected
      - sound_event_types

      # Silence/noise
      - silence_segments
      - silence_total_seconds
      - noise_floor_db

      # Audio quality
      - audio_quality_score
      - snr_db
      - has_clipping

    slot_usage:
      audio_event_segments:
        slot_uri: oa:hasBody
        description: |
          Time-coded segments with detected audio events.

          Web Annotation: hasBody links annotation to content.

          Each segment contains:
          - Start/end time boundaries
          - Event type (SPEECH, MUSIC, SILENCE, etc.)
          - Confidence score
          - Additional metadata (speaker ID, language, etc.)

          Segments may overlap (e.g., speech over background music).
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: "[{start_seconds: 0.0, end_seconds: 15.0, segment_text: 'Speech detected - Speaker 1'}]"
            description: "Speech detection segment"

      primary_audio_event_type:
        slot_uri: dcterms:type
        description: |
          The primary type of audio analysis performed.

          Dublin Core: type for categorization.

          **Types** (see AudioEventTypeEnum):
          - SPEECH: Speech detection and diarization
          - MUSIC: Music detection and classification
          - SILENCE: Silence or very low audio
          - SOUND_EVENT: Environmental sound detection
          - NOISE: Noise detection (quality assessment)
          - MIXED: Multiple analysis types combined
        range: AudioEventTypeEnum
        required: true
        examples:
          - value: "SPEECH"
            description: "Primary focus on speech analysis"

      speech_detected:
        slot_uri: hc:speechDetected
        description: |
          Whether speech was detected in the video audio.

          High-level flag for presence of speech content.

          - true: At least one speech segment detected
          - false: No speech detected (music-only, silent, etc.)
        range: boolean
        required: false
        examples:
          - value: true
            description: "Speech is present in video"

      speech_segments:
        slot_uri: hc:speechSegments
        description: |
          Detailed speech segments with speaker and language info.

          Each segment represents continuous speech from one speaker.

          Used for:
          - Transcript alignment
          - Speaker navigation
          - Language segmentation
        range: SpeechSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: "[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: 'spk_001', language: 'nl'}]"
            description: "Dutch speech from speaker 1"

      speech_language:
        slot_uri: dcterms:language
        description: |
          Primary language of speech content (ISO 639-1 code).

          Dublin Core: language for primary language.

          For multilingual content, this is the predominant language.
          See `languages_detected` for all languages.
        range: string
        required: false
        examples:
          - value: "nl"
            description: "Dutch is primary language"
          - value: "en"
            description: "English is primary language"

      speech_language_confidence:
        slot_uri: hc:languageConfidence
        description: |
          Confidence score for language detection (0.0-1.0).

          Higher confidence when:
          - Longer speech segments
          - Clear audio quality
          - Distinct language features

          Lower confidence when:
          - Short utterances
          - Background noise
          - Code-switching
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.95
            description: "High confidence language detection"

      languages_detected:
        slot_uri: hc:languagesDetected
        description: |
          All languages detected in speech (ISO 639-1 codes).

          Heritage content often includes multiple languages:
          - Exhibition videos with translations
          - Interviews with multilingual speakers
          - Historical content with period languages

          Ordered by speaking time (most spoken first).
        range: string
        multivalued: true
        required: false
        examples:
          - value: "[nl, en, de]"
            description: "Dutch, English, and German detected"

      diarization_enabled:
        slot_uri: hc:diarizationEnabled
        description: |
          Whether speaker diarization was performed.

          Diarization = identifying distinct speakers and their segments.

          - true: Speaker IDs assigned to speech segments
          - false: Speech detected but speakers not distinguished
        range: boolean
        required: false
        examples:
          - value: true
            description: "Diarization was performed"

      diarization_segments:
        slot_uri: hc:diarizationSegments
        description: |
          Detailed diarization results with speaker assignments.

          Each segment identifies:
          - Time boundaries
          - Speaker ID (anonymous: "spk_001", "spk_002")
          - Optional speaker name (if identified)
          - Confidence score

          Enables "who spoke when" analysis.
        range: DiarizationSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: "[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: 'spk_001', speaker_label: 'Curator'}]"
            description: "Curator speaking for first 15 seconds"

      speaker_count:
        slot_uri: hc:speakerCount
        description: |
          Number of distinct speakers detected.

          Useful for:
          - Interview classification (1 = monologue, 2+ = dialog)
          - Content type inference
          - Accessibility planning
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 3
            description: "Three distinct speakers detected"

      speaker_labels:
        slot_uri: hc:speakerLabels
        description: |
          Labels or names assigned to detected speakers.

          May be:
          - Anonymous: ["Speaker 1", "Speaker 2"]
          - Identified: ["Dr. Taco Dibbits", "Interviewer"]
          - Role-based: ["Curator", "Artist", "Host"]

          Ordered by speaking time (most speaking time first).
        range: string
        multivalued: true
        required: false
        examples:
          - value: "[Curator, Artist, Museum Director]"
            description: "Three identified speakers"

      music_detected:
        slot_uri: hc:musicDetected
        description: |
          Whether music was detected in the audio.

          - true: Musical content detected (any amount)
          - false: No music detected (speech-only, silence)
        range: boolean
        required: false
        examples:
          - value: true
            description: "Music present in video"

      music_segments:
        slot_uri: hc:musicSegments
        description: |
          Time segments containing music.

          Each segment includes:
          - Time boundaries
          - Music type (background, featured)
          - Genre classification (if detected)
          - Confidence score
        range: MusicSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: "[{start_seconds: 0.0, end_seconds: 30.0, music_type: 'BACKGROUND', genre: 'classical'}]"
            description: "Classical background music"

      music_genres_detected:
        slot_uri: hc:musicGenresDetected
        description: |
          Music genres detected in audio.

          **Common Heritage Genres**:
          - classical: Art music, orchestral
          - baroque: Period-specific classical
          - jazz: Jazz performances
          - folk: Traditional/folk music
          - ambient: Background/atmospheric
          - electronic: Modern electronic music
        range: string
        multivalued: true
        required: false
        examples:
          - value: "[classical, baroque]"
            description: "Classical and baroque music detected"

      music_confidence:
        slot_uri: hc:musicConfidence
        description: |
          Overall confidence of music detection (0.0-1.0).

          Average confidence across all music segments.
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.88
            description: "High confidence music detection"

      sound_events_detected:
        slot_uri: hc:soundEventsDetected
        description: |
          Whether non-speech, non-music sound events were detected.

          Sound events include applause, laughter, environmental sounds, etc.
        range: boolean
        required: false
        examples:
          - value: true
            description: "Sound events detected"

      sound_event_types:
        slot_uri: hc:soundEventTypes
        description: |
          Types of sound events detected.

          **Heritage-Relevant Events**:
          - APPLAUSE: Lecture endings, openings
          - LAUGHTER: Tour guide humor
          - CROWD_NOISE: Event atmosphere
          - FOOTSTEPS: Gallery ambiance
          - NATURE_SOUNDS: Outdoor heritage sites
          - BELLS: Church/temple recordings
        range: SoundEventTypeEnum
        multivalued: true
        required: false
        examples:
          - value: "[APPLAUSE, CROWD_NOISE]"
            description: "Applause and crowd sounds detected"

      silence_segments:
        slot_uri: hc:silenceSegments
        description: |
          Time segments containing silence or very low audio.

          Silence detection is useful for:
          - Finding pauses between segments
          - Quality assessment (unexpected silence)
          - Identifying chapter/scene boundaries

          Typical threshold: audio below -40 dB for more than 2 seconds.
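
          An illustrative sketch of that rule (hypothetical helper; assumes
          a per-frame loudness envelope in dB and a fixed frame duration):

          ```python
          def find_silence(levels_db, frame_s, threshold_db=-40.0, min_s=2.0):
              """(start, end) spans where the level stays below
              threshold_db for at least min_s seconds."""
              spans, run_start = [], None
              for i, level in enumerate(levels_db):
                  if level < threshold_db:
                      if run_start is None:
                          run_start = i
                  elif run_start is not None:
                      if (i - run_start) * frame_s >= min_s:
                          spans.append((run_start * frame_s, i * frame_s))
                      run_start = None
              n = len(levels_db)
              if run_start is not None and (n - run_start) * frame_s >= min_s:
                  spans.append((run_start * frame_s, n * frame_s))
              return spans

          # 0.5 s frames: 2 s of speech, 3 s quiet, 1 s of speech
          levels = [-20, -20, -20, -20, -55, -55, -55, -55, -55, -55, -18, -18]
          # find_silence(levels, 0.5) → [(2.0, 5.0)]
          ```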
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: "[{start_seconds: 45.0, end_seconds: 48.0}]"
            description: "3-second silence"

      silence_total_seconds:
        slot_uri: hc:silenceTotalSeconds
        description: |
          Total duration of silence in the video (seconds).

          High silence percentage may indicate:
          - Extended pauses
          - Silent segments (B-roll without audio)
          - Audio issues
        range: float
        required: false
        minimum_value: 0.0
        examples:
          - value: 15.5
            description: "15.5 seconds of total silence"

      noise_floor_db:
        slot_uri: hc:noiseFloorDb
        description: |
          Background noise floor level in decibels.

          **Quality Guidelines**:
          - < -60 dB: Excellent (studio quality)
          - -60 to -40 dB: Good (professional recording)
          - -40 to -30 dB: Acceptable (field recording)
          - > -30 dB: Poor (noisy environment)
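
          The bands above can be expressed directly (illustrative helper,
          not part of the schema):

          ```python
          def noise_floor_rating(db):
              """Map a noise floor in dB to the quality bands listed above."""
              if db < -60:
                  return "excellent"
              if db < -40:
                  return "good"
              if db < -30:
                  return "acceptable"
              return "poor"

          # noise_floor_rating(-45.0) → "good"
          ```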
        range: float
        required: false
        examples:
          - value: -45.0
            description: "Good quality, moderate noise floor"

      audio_quality_score:
        slot_uri: hc:audioQualityScore
        description: |
          Overall audio quality score (0.0-1.0).

          Composite score based on:
          - Signal-to-noise ratio
          - Clipping presence
          - Frequency response
          - Clarity of speech

          **Interpretation**:
          - > 0.8: High quality, suitable for all uses
          - 0.6-0.8: Good quality, minor issues
          - 0.4-0.6: Acceptable, some degradation
          - < 0.4: Poor quality, may need enhancement
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.85
            description: "High audio quality"

      snr_db:
        slot_uri: hc:snrDb
        description: |
          Signal-to-noise ratio in decibels.

          Higher is better:
          - > 30 dB: Excellent
          - 20-30 dB: Good
          - 10-20 dB: Acceptable
          - < 10 dB: Poor (speech intelligibility affected)
        range: float
        required: false
        examples:
          - value: 25.0
            description: "Good signal-to-noise ratio"

      has_clipping:
        slot_uri: hc:hasClipping
        description: |
          Whether audio clipping (peak distortion) was detected.

          Clipping occurs when audio exceeds the maximum level:
          - true: Clipping detected (distortion present)
          - false: No clipping (clean audio)

          Clipping is permanent quality loss.
        range: boolean
        required: false
        examples:
          - value: false
            description: "No clipping detected"

    comments:
      - "Audio event detection for video content"
      - "Supports speech, music, silence, and sound event detection"
      - "Speaker diarization for interview navigation"
      - "Language detection for multilingual heritage content"
      - "Audio quality metrics for preservation assessment"

    see_also:
      - "https://www.w3.org/TR/annotation-model/"
      - "https://arxiv.org/abs/2111.08085"  # Speaker diarization survey

# ============================================================================
# Supporting Classes
# ============================================================================

  SpeechSegment:
    class_uri: hc:SpeechSegment
    description: |
      A speech segment with speaker and language information.

      Extends VideoTimeSegment with speech-specific metadata.
    slots:
      - segment_start_seconds
      - segment_end_seconds
      - speaker_id
      - speaker_label
      - segment_language
      - segment_confidence
      - speech_text
    slot_usage:
      segment_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0

      segment_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0

      speaker_id:
        slot_uri: hc:speakerId
        description: |
          Unique identifier for the speaker.

          Format: "spk_001", "spk_002", etc. (anonymous)
          Or: "taco_dibbits" (identified)
        range: string
        required: false

      speaker_label:
        slot_uri: schema:name
        description: Human-readable speaker name or role
        range: string
        required: false

      segment_language:
        slot_uri: dcterms:language
        description: Language of speech in this segment (ISO 639-1)
        range: string
        required: false

      segment_confidence:
        slot_uri: hc:confidence
        description: Confidence score for this segment (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0

      speech_text:
        slot_uri: hc:speechText
        description: |
          Transcript text for this segment (if available).

          Links to VideoTranscript for full transcript.
        range: string
        required: false

  DiarizationSegment:
    class_uri: hc:DiarizationSegment
    description: |
      A diarization segment identifying speaker and time boundaries.

      Focused on "who spoke when" rather than transcript content.
    slots:
      - diarization_start_seconds
      - diarization_end_seconds
      - diarization_speaker_id
      - diarization_speaker_label
      - diarization_confidence
      - is_overlapping
    slot_usage:
      diarization_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0

      diarization_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0

      diarization_speaker_id:
        slot_uri: hc:speakerId
        description: Anonymous speaker identifier (spk_001, spk_002, etc.)
        range: string
        required: true

      diarization_speaker_label:
        slot_uri: schema:name
        description: Optional identified name or role
        range: string
        required: false

      diarization_confidence:
        slot_uri: hc:confidence
        description: Diarization confidence (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0

      is_overlapping:
        slot_uri: hc:isOverlapping
        description: |
          Whether this segment overlaps with another speaker.

          Overlapping speech occurs when multiple people speak simultaneously.
        range: boolean
        required: false

  MusicSegment:
    class_uri: hc:MusicSegment
    description: |
      A segment of detected music with classification.
    slots:
      - music_start_seconds
      - music_end_seconds
      - music_type
      - music_genre
      - music_segment_confidence
      - is_background
    slot_usage:
      music_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0

      music_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0

      music_type:
        slot_uri: dcterms:type
        description: Type of music (BACKGROUND, FEATURED, ARCHIVAL)
        range: MusicTypeEnum
        required: false

      music_genre:
        slot_uri: hc:genre
        description: Detected music genre
        range: string
        required: false

      music_segment_confidence:
        slot_uri: hc:confidence
        description: Music detection confidence (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0

      is_background:
        slot_uri: hc:isBackground
        description: |
          Whether music is background (under speech) vs featured.

          - true: Music is background/ambient
          - false: Music is primary audio
        range: boolean
        required: false

# ============================================================================
# Enumerations
# ============================================================================

enums:

  AudioEventTypeEnum:
    description: |
      Types of audio events detected in video.
    permissible_values:
      SPEECH:
        description: Speech/voice detection and analysis
      MUSIC:
        description: Music detection and classification
      SILENCE:
        description: Silence or very low audio
      SOUND_EVENT:
        description: Non-speech, non-music sound events
      NOISE:
        description: Noise detection (for quality assessment)
      MIXED:
        description: Multiple audio event types analyzed

  SoundEventTypeEnum:
    description: |
      Types of non-speech, non-music sound events.
    permissible_values:
      APPLAUSE:
        description: Clapping, applause
      LAUGHTER:
        description: Laughter from audience or speakers
      CROWD_NOISE:
        description: General crowd/audience noise
      FOOTSTEPS:
        description: Walking, footsteps
      DOOR:
        description: Door opening/closing sounds
      NATURE_SOUNDS:
        description: Birds, wind, water, etc.
      TRAFFIC:
        description: Vehicles, urban sounds
      BELLS:
        description: Church bells, temple bells, etc.
      MACHINERY:
        description: Industrial, mechanical sounds
      COUGHING:
        description: Coughing, clearing throat
      PAPER:
        description: Paper rustling
      TYPING:
        description: Keyboard typing
      PHONE:
        description: Phone ringing or notification
      MUSIC_INSTRUMENT:
        description: Individual instrument sounds
      OTHER:
        description: Other sound event type

  MusicTypeEnum:
    description: |
      Types of music presence in audio.
    permissible_values:
      BACKGROUND:
        description: Background/ambient music under other content
      FEATURED:
        description: Primary audio is music (performance, recording)
      ARCHIVAL:
        description: Historical/archival music recording
      INTRO_OUTRO:
        description: Opening or closing music/jingle
      TRANSITION:
        description: Music used for scene transitions
      DIEGETIC:
        description: Music from within the scene (radio, live performance)
      NON_DIEGETIC:
        description: Music added in post-production

# ============================================================================
# Slot Definitions
# ============================================================================

slots:

  # Audio event slots
  audio_event_segments:
    description: Time-coded segments with detected audio events
    range: VideoTimeSegment
    multivalued: true

  primary_audio_event_type:
    description: Primary type of audio analysis performed
    range: AudioEventTypeEnum

  # Speech slots
  speech_detected:
    description: Whether speech was detected
    range: boolean

  speech_segments:
    description: Detailed speech segments with speaker info
    range: SpeechSegment
    multivalued: true

  speech_language:
    description: Primary language of speech (ISO 639-1)
    range: string

  speech_language_confidence:
    description: Confidence of language detection
    range: float

  languages_detected:
    description: All languages detected in speech
    range: string
    multivalued: true

  # Diarization slots
  diarization_enabled:
    description: Whether speaker diarization was performed
    range: boolean

  diarization_segments:
    description: Detailed diarization results
    range: DiarizationSegment
    multivalued: true

  speaker_count:
    description: Number of distinct speakers detected
    range: integer

  speaker_labels:
    description: Labels or names for detected speakers
    range: string
    multivalued: true

  # Music slots
  music_detected:
    description: Whether music was detected
    range: boolean

  music_segments:
    description: Time segments containing music
    range: MusicSegment
    multivalued: true

  music_genres_detected:
    description: Music genres detected
    range: string
    multivalued: true

  music_confidence:
    description: Overall music detection confidence
    range: float

  # Sound event slots
  sound_events_detected:
    description: Whether sound events were detected
    range: boolean

  sound_event_types:
    description: Types of sound events detected
    range: SoundEventTypeEnum
    multivalued: true

  # Silence/noise slots
  silence_segments:
    description: Time segments with silence
    range: VideoTimeSegment
    multivalued: true

  silence_total_seconds:
    description: Total silence duration
    range: float

  noise_floor_db:
    description: Background noise floor in dB
    range: float

  # Audio quality slots
  audio_quality_score:
    description: Overall audio quality (0.0-1.0)
    range: float

  snr_db:
    description: Signal-to-noise ratio in dB
    range: float

  has_clipping:
    description: Whether audio clipping was detected
    range: boolean

  # SpeechSegment slots
  segment_start_seconds:
    description: Segment start time
    range: float

  segment_end_seconds:
    description: Segment end time
    range: float

  speaker_id:
    description: Speaker identifier
    range: string

  speaker_label:
    description: Speaker name or role
    range: string

  segment_language:
    description: Language of segment
    range: string

  segment_confidence:
    description: Segment confidence score
    range: float

  speech_text:
    description: Transcript text for segment
    range: string

  # DiarizationSegment slots
  diarization_start_seconds:
    description: Diarization segment start
    range: float

  diarization_end_seconds:
    description: Diarization segment end
    range: float

  diarization_speaker_id:
    description: Speaker ID in diarization
    range: string

  diarization_speaker_label:
    description: Speaker label in diarization
    range: string

  diarization_confidence:
    description: Diarization confidence
    range: float

  is_overlapping:
    description: Whether segment has overlapping speech
    range: boolean

  # MusicSegment slots
  music_start_seconds:
    description: Music segment start
    range: float

  music_end_seconds:
    description: Music segment end
    range: float

  music_type:
    description: Type of music presence
    range: MusicTypeEnum

  music_genre:
    description: Detected music genre
    range: string

  music_segment_confidence:
    description: Music segment confidence
    range: float

  is_background:
    description: Whether music is background
    range: boolean