---
# LinkML schema module: VideoAudioAnnotation class and supporting segment classes.
id: https://nde.nl/ontology/hc/class/VideoAudioAnnotation
name: video_audio_annotation_class
title: Video Audio Annotation Class

imports:
  - linkml:types
  - ./VideoAnnotation
  - ./VideoTimeSegment
  - ../slots/audio_event_segments
  - ../slots/has_audio_quality_score
  - ../slots/diarization_confidence
  - ../slots/diarization_enabled
  - ../slots/diarization_end_seconds
  - ../slots/has_or_had_diarization_segment
  - ../slots/diarization_speaker_id
  - ../slots/diarization_speaker_label
  - ../slots/diarization_start_seconds
  - ../slots/has_clipping
  - ../slots/is_background
  - ../slots/is_overlapping
  - ../slots/languages_detected
  - ../slots/music_confidence
  - ../slots/music_detected
  - ../slots/music_end_seconds
  - ../slots/music_genre
  - ../slots/music_genres_detected
  - ../slots/music_segment_confidence
  - ../slots/has_or_had_music_segment
  - ../slots/music_start_seconds
  - ../slots/music_type
  - ../slots/noise_floor_db
  - ../slots/primary_audio_event_type
  - ../slots/segment_confidence
  - ../slots/segment_end_seconds
  - ../slots/segment_language
  - ../slots/segment_start_seconds
  - ../slots/has_or_had_silence_segment
  - ../slots/silence_total_seconds
  - ../slots/snr_db
  - ../slots/has_or_had_sound_event_type
  - ../slots/sound_events_detected
  - ../slots/speaker_count
  - ../slots/speaker_id
  # was listed twice in the original file; duplicate entry removed
  - ../slots/speaker_label
  - ../slots/specificity_annotation
  - ../slots/speech_detected
  - ../slots/speech_language
  - ../slots/speech_language_confidence
  - ../slots/has_or_had_speech_segment
  - ../slots/speech_text
  - ../slots/template_specificity
  - ./DiarizationSegment
  - ./MusicSegment
  - ./SpecificityAnnotation
  - ./SpeechSegment
  - ./TemplateSpecificityScores
  - ../slots/has_audio_event_segment

prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  ma: http://www.w3.org/ns/ma-ont#
  wd: http://www.wikidata.org/entity/
default_prefix: hc

classes:
  VideoAudioAnnotation:
    is_a: VideoAnnotation
    class_uri: hc:VideoAudioAnnotation
    abstract: false
    description: |-
      Annotation for audio events detected in video content.

      **DEFINITION**:

      VideoAudioAnnotation captures structured information derived from audio
      analysis of video content. This includes speech, music, silence, and
      various sound events.

      **AUDIO ANALYSIS TYPES**:

      | Type | Description | Use Case |
      |------|-------------|----------|
      | **Speech Detection** | Identify spoken segments | Transcript alignment |
      | **Speaker Diarization** | Who spoke when | Interview navigation |
      | **Music Detection** | Identify musical segments | Content classification |
      | **Sound Events** | Applause, laughter, etc. | Audience engagement |
      | **Silence Detection** | Find quiet segments | Quality assessment |
      | **Language Detection** | Identify spoken languages | Multilingual content |

      **SPEAKER DIARIZATION**:

      Diarization answers "who spoke when":

      ```
      0:00-0:15 Speaker 1 (Curator)
      0:15-0:45 Speaker 2 (Artist)
      0:45-1:00 Speaker 1 (Curator)
      1:00-1:30 Speaker 3 (Museum Director)
      ```

      Heritage applications:
      - Navigate to specific speakers in interviews
      - Count speaking time per person
      - Identify unnamed speakers for annotation
      - Build speaker databases for recognition

      **MUSIC DETECTION**:

      Music detection classifies audio segments as containing music:

      | Category | Examples |
      |----------|----------|
      | **Background music** | Documentary soundtracks |
      | **Featured music** | Concert recordings, performances |
      | **Historical music** | Archival recordings |
      | **Licensed music** | Rights-managed content |

      Music segments may also include:
      - Genre classification (classical, jazz, folk)
      - Mood/tempo analysis
      - Fingerprinting for identification

      **SOUND EVENT DETECTION**:

      Non-speech, non-music audio events:

      | Event Type | Heritage Context |
      |------------|------------------|
      | APPLAUSE | Lecture recordings, openings |
      | LAUGHTER | Tour guides, educational content |
      | CROWD_NOISE | Event documentation |
      | DOOR/FOOTSTEPS | Ambient archive recordings |
      | NATURE_SOUNDS | Outdoor heritage site recordings |
      | MACHINERY | Industrial heritage, conservation |

      **LANGUAGE DETECTION**:

      Multilingual heritage content requires language identification:

      ```yaml
      has_or_had_speech_segment:
        - start: 0.0
          end: 120.0
          language: nl
          speaker_id: speaker_001
        - start: 120.0
          end: 240.0
          language: en
          speaker_id: speaker_001  # Same speaker, switched language
      ```

      **AUDIO QUALITY ANALYSIS**:

      Audio quality metrics for preservation and accessibility:

      | Metric | Description | Threshold |
      |--------|-------------|-----------|
      | SNR | Signal-to-noise ratio | > 20 dB good |
      | Clipping | Peak distortion | None ideal |
      | Noise floor | Background noise level | < -50 dB good |
      | Frequency response | Bandwidth | Full-range ideal |

      **HERITAGE INSTITUTION USE CASES**:

      | Content Type | Audio Analysis Need |
      |--------------|---------------------|
      | Oral histories | Diarization, transcription alignment |
      | Curator interviews | Speaker identification, language |
      | Virtual tours | Background music, voiceover detection |
      | Lecture recordings | Audience reactions, Q&A segments |
      | Conservation videos | Narration vs demonstration audio |
      | Archival footage | Speech recovery, noise reduction |

      **RELATIONSHIP TO VideoTranscript**:

      VideoAudioAnnotation is complementary to VideoTranscript:

      - **VideoTranscript**: The text content of speech (WHAT was said)
      - **VideoAudioAnnotation**: Audio structure (WHO spoke, music, sounds)

      Together they provide complete audio understanding:

      ```
      VideoAudioAnnotation: Speaker 1 spoke 0:00-0:15
      VideoTranscript: "Welcome to the Rijksmuseum..." (0:00-0:15)
      → Combined: Curator said "Welcome to the Rijksmuseum..."
      ```
    exact_mappings:
      - hc:VideoAudioAnnotation
    close_mappings:
      - ma:AudioTrack
      - crm:E13_Attribute_Assignment
    related_mappings:
      # fixed: original used undeclared prefix "wikidata:"; the declared
      # prefix for http://www.wikidata.org/entity/ is "wd:"
      - wd:Q11028
      - wd:Q638
    slots:
      - audio_event_segments
      # NOTE(review): added — slot_usage below customizes this imported slot;
      # confirm whether it should replace audio_event_segments entirely
      - has_audio_event_segment
      # fixed: original said "audio_quality_score", but the imported slot and
      # the slot_usage entry below are "has_audio_quality_score"
      - has_audio_quality_score
      - diarization_enabled
      - has_or_had_diarization_segment
      - has_clipping
      - languages_detected
      - music_confidence
      - music_detected
      - music_genres_detected
      - has_or_had_music_segment
      - noise_floor_db
      - primary_audio_event_type
      - has_or_had_silence_segment
      - silence_total_seconds
      - snr_db
      - has_or_had_sound_event_type
      - sound_events_detected
      - speaker_count
      - speaker_label
      - specificity_annotation
      - speech_detected
      - speech_language
      - speech_language_confidence
      - has_or_had_speech_segment
      - template_specificity
    slot_usage:
      has_audio_event_segment:
        slot_uri: oa:hasBody
        description: |-
          Time-coded segments with detected audio events.

          Web Annotation: hasBody links annotation to content.

          Each segment contains:
          - Start/end time boundaries
          - Event type (SPEECH, MUSIC, SILENCE, etc.)
          - Confidence score
          - Additional metadata (speaker ID, language, etc.)

          Segments may overlap (e.g., speech over background music).
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, segment_text: ''Speech detected - Speaker 1''}]'
            description: Speech detection segment
      primary_audio_event_type:
        slot_uri: dcterms:type
        description: |-
          The primary type of audio analysis performed.

          Dublin Core: type for categorization.

          **Types**:
          - SPEECH: Speech detection and diarization
          - MUSIC: Music detection and classification
          - SOUND_EVENTS: Environmental sound detection
          - MIXED: Multiple analysis types combined
        range: AudioEventTypeEnum
        required: true
        examples:
          - value: SPEECH
            description: Primary focus on speech analysis
      speech_detected:
        slot_uri: hc:speechDetected
        description: |-
          Whether speech was detected in the video audio.

          High-level flag for presence of speech content.
          - true: At least one speech segment detected
          - false: No speech detected (music-only, silent, etc.)
        range: boolean
        required: false
        examples:
          - value: true
            description: Speech is present in video
      has_or_had_speech_segment:
        slot_uri: hc:speechSegments
        description: |-
          Detailed speech segments with speaker and language info.

          Each segment represents continuous speech from one speaker.

          Used for:
          - Transcript alignment
          - Speaker navigation
          - Language segmentation
        range: SpeechSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: ''spk_001'', language: ''nl''}]'
            description: Dutch speech from speaker 1
      speech_language:
        slot_uri: dcterms:language
        description: |-
          Primary language of speech content (ISO 639-1 code).

          Dublin Core: language for primary language.

          For multilingual content, this is the predominant language.
          See `languages_detected` for all languages.
        range: string
        required: false
        examples:
          - value: nl
            description: Dutch is primary language
          - value: en
            description: English is primary language
      speech_language_confidence:
        slot_uri: hc:languageConfidence
        description: |-
          Confidence score for language detection (0.0-1.0).

          Higher confidence when:
          - Longer speech segments
          - Clear audio quality
          - Distinct language features

          Lower confidence when:
          - Short utterances
          - Background noise
          - Code-switching
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.95
            description: High confidence language detection
      languages_detected:
        slot_uri: hc:languagesDetected
        description: |-
          All languages detected in speech (ISO 639-1 codes).

          Heritage content often includes multiple languages:
          - Exhibition videos with translations
          - Interviews with multilingual speakers
          - Historical content with period languages

          Ordered by speaking time (most spoken first).
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[nl, en, de]'
            description: Dutch, English, and German detected
      diarization_enabled:
        slot_uri: hc:diarizationEnabled
        description: |-
          Whether speaker diarization was performed.

          Diarization = identifying distinct speakers and their segments.
          - true: Speaker IDs assigned to speech segments
          - false: Speech detected but speakers not distinguished
        range: boolean
        required: false
        examples:
          - value: true
            description: Diarization was performed
      has_or_had_diarization_segment:
        slot_uri: hc:diarizationSegments
        description: |-
          Detailed diarization results with speaker assignments.

          Each segment identifies:
          - Time boundaries
          - Speaker ID (anonymous: "spk_001", "spk_002")
          - Optional speaker name (if identified)
          - Confidence score

          Enables "who spoke when" analysis.
        range: DiarizationSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: ''spk_001'', speaker_label: ''Curator''}]'
            description: Curator speaking for first 15 seconds
      speaker_count:
        slot_uri: hc:speakerCount
        description: |-
          Number of distinct speakers detected.

          Useful for:
          - Interview classification (1 = monologue, 2+ = dialog)
          - Content type inference
          - Accessibility planning
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 3
            description: Three distinct speakers detected
      speaker_label:
        slot_uri: hc:speakerLabels
        description: |-
          Labels or names assigned to detected speakers.

          May be:
          - Anonymous: ["Speaker 1", "Speaker 2"]
          - Identified: ["Dr. Taco Dibbits", "Interviewer"]
          - Role-based: ["Curator", "Artist", "Host"]

          Ordered by speaking time (most speaking first).
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[Curator, Artist, Museum Director]'
            description: Three identified speakers
      music_detected:
        slot_uri: hc:musicDetected
        description: |-
          Whether music was detected in the audio.

          - true: Musical content detected (any amount)
          - false: No music detected (speech-only, silence)
        range: boolean
        required: false
        examples:
          - value: true
            description: Music present in video
      has_or_had_music_segment:
        slot_uri: hc:musicSegments
        description: |-
          Time segments containing music.

          Each segment includes:
          - Time boundaries
          - Music type (background, featured)
          - Genre classification (if detected)
          - Confidence score
        range: MusicSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 30.0, music_type: ''BACKGROUND'', genre: ''classical''}]'
            description: Classical background music
      music_genres_detected:
        slot_uri: hc:musicGenresDetected
        description: |-
          Music genres detected in audio.

          **Common Heritage Genres**:
          - classical: Art music, orchestral
          - baroque: Period-specific classical
          - jazz: Jazz performances
          - folk: Traditional/folk music
          - ambient: Background/atmospheric
          - electronic: Modern electronic music
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[classical, baroque]'
            description: Classical and baroque music detected
      music_confidence:
        slot_uri: hc:musicConfidence
        description: |-
          Overall confidence of music detection (0.0-1.0).

          Average confidence across all music segments.
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.88
            description: High confidence music detection
      sound_events_detected:
        slot_uri: hc:soundEventsDetected
        description: |-
          Whether non-speech, non-music sound events were detected.

          Sound events include applause, laughter, environmental sounds, etc.
        range: boolean
        required: false
        examples:
          - value: true
            description: Sound events detected
      has_or_had_sound_event_type:
        slot_uri: hc:soundEventTypes
        description: |-
          Types of sound events detected.

          **Heritage-Relevant Events**:
          - APPLAUSE: Lecture endings, openings
          - LAUGHTER: Tour guide humor
          - CROWD_NOISE: Event atmosphere
          - FOOTSTEPS: Gallery ambiance
          - NATURE_SOUNDS: Outdoor heritage sites
          - BELLS: Church/temple recordings
        range: SoundEventTypeEnum
        multivalued: true
        required: false
        examples:
          - value: '[APPLAUSE, CROWD_NOISE]'
            description: Applause and crowd sounds detected
      has_or_had_silence_segment:
        slot_uri: hc:silenceSegments
        description: |-
          Time segments containing silence or very low audio.

          Silence detection useful for:
          - Finding pauses between segments
          - Quality assessment (unexpected silence)
          - Identifying chapter/scene boundaries

          Threshold typically: audio below -40 dB for > 2 seconds.
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 45.0, end_seconds: 48.0}]'
            description: 3-second silence
      silence_total_seconds:
        slot_uri: hc:silenceTotalSeconds
        description: |-
          Total duration of silence in the video (seconds).

          High silence percentage may indicate:
          - Extended pauses
          - Silent segments (B-roll without audio)
          - Audio issues
        range: float
        required: false
        minimum_value: 0.0
        examples:
          - value: 15.5
            description: 15.5 seconds of total silence
      noise_floor_db:
        slot_uri: hc:noiseFloorDb
        description: |-
          Background noise floor level in decibels.

          **Quality Guidelines**:
          - < -60 dB: Excellent (studio quality)
          - -60 to -40 dB: Good (professional recording)
          - -40 to -30 dB: Acceptable (field recording)
          - > -30 dB: Poor (noisy environment)
        range: float
        required: false
        examples:
          - value: -45.0
            description: Good quality, moderate noise floor
      has_audio_quality_score:
        slot_uri: hc:audioQualityScore
        description: |-
          Overall audio quality score (0.0-1.0).

          Composite score based on:
          - Signal-to-noise ratio
          - Clipping presence
          - Frequency response
          - Clarity of speech

          **Interpretation**:
          - > 0.8: High quality, suitable for all uses
          - 0.6-0.8: Good quality, minor issues
          - 0.4-0.6: Acceptable, some degradation
          - < 0.4: Poor quality, may need enhancement
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.85
            description: High audio quality
      snr_db:
        slot_uri: hc:snrDb
        description: |-
          Signal-to-noise ratio in decibels.

          Higher is better:
          - > 30 dB: Excellent
          - 20-30 dB: Good
          - 10-20 dB: Acceptable
          - < 10 dB: Poor (speech intelligibility affected)
        range: float
        required: false
        examples:
          - value: 25.0
            description: Good signal-to-noise ratio
      has_clipping:
        slot_uri: hc:hasClipping
        description: |-
          Whether audio clipping (peak distortion) was detected.

          Clipping occurs when audio exceeds maximum level:
          - true: Clipping detected (distortion present)
          - false: No clipping (clean audio)

          Clipping is permanent quality loss.
        range: boolean
        required: false
        examples:
          - value: false
            description: No clipping detected
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    comments:
      - Audio event detection for video content
      - Supports speech, music, silence, and sound event detection
      - Speaker diarization for interview navigation
      - Language detection for multilingual heritage content
      - Audio quality metrics for preservation assessment
    see_also:
      - https://www.w3.org/TR/annotation-model/
      - https://arxiv.org/abs/2111.08085

  SpeechSegment:
    class_uri: hc:SpeechSegment
    description: |-
      A speech segment with speaker and language information.

      Extends VideoTimeSegment with speech-specific metadata.
    slots:
      - segment_confidence
      - segment_end_seconds
      - segment_language
      - segment_start_seconds
      - speaker_id
      - speaker_label
      - specificity_annotation
      - speech_text
      - template_specificity
    slot_usage:
      segment_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0
      segment_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0
      speaker_id:
        slot_uri: hc:speakerId
        description: |-
          Unique identifier for the speaker.

          Format: "spk_001", "spk_002", etc. (anonymous)
          Or: "taco_dibbits" (identified)
        range: string
        required: false
      speaker_label:
        slot_uri: schema:name
        description: Human-readable speaker name or role
        range: string
        required: false
      segment_language:
        slot_uri: dcterms:language
        description: Language of speech in this segment (ISO 639-1)
        range: string
        required: false
      segment_confidence:
        slot_uri: hc:confidence
        description: Confidence score for this segment (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      speech_text:
        slot_uri: hc:speechText
        description: |-
          Transcript text for this segment (if available).

          Links to VideoTranscript for full transcript.
        range: string
        required: false
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true

  DiarizationSegment:
    class_uri: hc:DiarizationSegment
    description: |-
      A diarization segment identifying speaker and time boundaries.

      Focused on "who spoke when" rather than transcript content.
    slots:
      - diarization_confidence
      - diarization_end_seconds
      - diarization_speaker_id
      - diarization_speaker_label
      - diarization_start_seconds
      - is_overlapping
      - specificity_annotation
      - template_specificity
    slot_usage:
      diarization_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0
      diarization_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0
      diarization_speaker_id:
        slot_uri: hc:speakerId
        description: Anonymous speaker identifier (spk_001, spk_002, etc.)
        range: string
        required: true
      diarization_speaker_label:
        slot_uri: schema:name
        description: Optional identified name or role
        range: string
        required: false
      diarization_confidence:
        slot_uri: hc:confidence
        description: Diarization confidence (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      is_overlapping:
        slot_uri: hc:isOverlapping
        description: |-
          Whether this segment overlaps with another speaker.

          Overlapping speech occurs when multiple people speak simultaneously.
        range: boolean
        required: false
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true

  MusicSegment:
    class_uri: hc:MusicSegment
    description: |-
      A segment of detected music with classification.
    slots:
      - is_background
      - music_end_seconds
      - music_genre
      - music_segment_confidence
      - music_start_seconds
      - music_type
      - specificity_annotation
      - template_specificity
    slot_usage:
      music_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0
      music_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0
      music_type:
        slot_uri: dcterms:type
        description: Type of music (BACKGROUND, FEATURED, ARCHIVAL)
        range: MusicTypeEnum
        required: false
      music_genre:
        slot_uri: hc:genre
        description: Detected music genre
        range: string
        required: false
      music_segment_confidence:
        slot_uri: hc:confidence
        description: Music detection confidence (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      is_background:
        slot_uri: hc:isBackground
        description: |-
          Whether music is background (under speech) vs featured.

          - true: Music is background/ambient
          - false: Music is primary audio
        range: boolean
        required: false
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true

enums:
  AudioEventTypeEnum:
    description: |-
      Types of audio events detected in video.
    permissible_values:
      SPEECH:
        description: Speech/voice detection and analysis
      MUSIC:
        description: Music detection and classification
      SILENCE:
        description: Silence or very low audio
      SOUND_EVENT:
        description: Non-speech, non-music sound events
      NOISE:
        description: Noise detection (for quality assessment)
      MIXED:
        description: Multiple audio event types analyzed

  SoundEventTypeEnum:
    description: |-
      Types of non-speech, non-music sound events.
    permissible_values:
      APPLAUSE:
        description: Clapping, applause
      LAUGHTER:
        description: Laughter from audience or speakers
      CROWD_NOISE:
        description: General crowd/audience noise
      FOOTSTEPS:
        description: Walking, footsteps
      DOOR:
        description: Door opening/closing sounds
      NATURE_SOUNDS:
        description: Birds, wind, water, etc.
      TRAFFIC:
        description: Vehicles, urban sounds
      BELLS:
        description: Church bells, temple bells, etc.
      MACHINERY:
        description: Industrial, mechanical sounds
      COUGHING:
        description: Coughing, clearing throat
      PAPER:
        description: Paper rustling
      TYPING:
        description: Keyboard typing
      PHONE:
        description: Phone ringing or notification
      MUSIC_INSTRUMENT:
        description: Individual instrument sounds
      OTHER:
        description: Other sound event type

  MusicTypeEnum:
    description: |-
      Types of music presence in audio.
    permissible_values:
      BACKGROUND:
        description: Background/ambient music under other content
      FEATURED:
        description: Primary audio is music (performance, recording)
      ARCHIVAL:
        description: Historical/archival music recording
      INTRO_OUTRO:
        description: Opening or closing music/jingle
      TRANSITION:
        description: Music used for scene transitions
      DIEGETIC:
        description: Music from within the scene (radio, live performance)
      NON_DIEGETIC:
        description: Music added in post-production