glam/schemas/20251121/linkml/modules/classes/VideoAudioAnnotation.yaml
kempersc fcd1c21c63 Add aliases and enhance slot definitions across various modules
- Added new aliases for existing slots to improve clarity and usability, including:
  - has_deadline: has_embargo_end_date
  - has_extent: has_extent_text
  - has_fonds: has_fond
  - has_laboratory: conservation_lab
  - has_language: has_iso_code639_1, has_iso_code639_3
  - has_legal_basis: legal_basis
  - has_light_exposure: max_light_lux
  - has_measurement_unit: has_unit
  - has_note: has_custodian_observation
  - has_occupation: occupation
  - has_operating_hours: has_operating_hours
  - has_position: position
  - has_quantity: has_artwork_count, link_count
  - has_roadmap: review_date
  - has_skill: skill
  - has_speaker: speaker_label
  - has_specification: specification_url
  - has_statement: rights_statement_url, rights_statement
  - has_type: custodian_only
  - has_user_category: serves_visitors_only
  - hold_record_set: record_count
  - identified_by: has_index_number
  - in_period: has_period
  - in_place: has_place
  - in_series: has_series
  - measure: has_measurement
  - measured_on: measurement_date
  - organized_by: has_organizer
  - originate_from: has_origin
  - part_of: suborganization_of
  - published_on: has_publication_date
  - receive_investment: has_investment
  - related_to: connection_heritage_type
  - require: preservation_requirement
  - safeguarded_by: current_keeper, record_holder_note
  - state: states_or_stated
  - take_comission: takes_or_took_comission
  - take_place_at: takes_or_took_place_at
  - transmit_through: transmits_or_transmitted_through
  - warrant: warrants_or_warranted

- Introduced a new slot definition for evaluated_through to capture evaluation methodologies and review statuses.
2026-02-14 14:41:49 +01:00

id: https://nde.nl/ontology/hc/class/VideoAudioAnnotation
name: video_audio_annotation_class
title: Video Audio Annotation Class
imports:
- linkml:types
- ../enums/AudioEventTypeEnum
- ../enums/MusicTypeEnum
- ../enums/SoundEventTypeEnum
- ../slots/contain
- ../slots/end_of_the_end
- ../slots/has_score
- ../slots/identified_by
- ../slots/has_label
- ../slots/has_provenance
- ../slots/has_segment
- ../slots/has_type
- ../slots/in_background
- ../slots/diarized
- ../slots/overlap_with
- ../slots/has_language
- ../slots/has_confidence_measure
- ../slots/has_music
- ../slots/has_genre
- ../slots/has_sound
- ../slots/in_language
- ../slots/begin_of_the_begin
- ../slots/has_silence
- ../slots/has_ratio
- ../slots/has_speaker
- ../slots/20260202_matang/speaker_label
- ../slots/has_spoken_words
- ../slots/temporal_extent
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  ma: http://www.w3.org/ns/ma-ont#
  wd: http://www.wikidata.org/entity/
default_prefix: hc
classes:
  VideoAudioAnnotation:
    is_a: VideoAnnotation
    class_uri: hc:VideoAudioAnnotation
    abstract: false
    description: |
      Annotation for audio events detected in video content.

      **DEFINITION**:

      VideoAudioAnnotation captures structured information derived from audio
      analysis of video content. This includes speech, music, silence, and
      various sound events.

      **AUDIO ANALYSIS TYPES**:

      | Type | Description | Use Case |
      |------|-------------|----------|
      | **Speech Detection** | Identify spoken segments | Transcript alignment |
      | **Speaker Diarization** | Who spoke when | Interview navigation |
      | **Music Detection** | Identify musical segments | Content classification |
      | **Sound Events** | Applause, laughter, etc. | Audience engagement |
      | **Silence Detection** | Find quiet segments | Quality assessment |
      | **Language Detection** | Identify spoken languages | Multilingual content |

      **SPEAKER DIARIZATION**:

      Diarization answers "who spoke when":

      ```
      0:00-0:15 Speaker 1 (Curator)
      0:15-0:45 Speaker 2 (Artist)
      0:45-1:00 Speaker 1 (Curator)
      1:00-1:30 Speaker 3 (Museum Director)
      ```

      Heritage applications:
      - Navigate to specific speakers in interviews
      - Count speaking time per person
      - Identify unnamed speakers for annotation
      - Build speaker databases for recognition

      **MUSIC DETECTION**:

      Music detection classifies audio segments as containing music:

      | Category | Examples |
      |----------|----------|
      | **Background music** | Documentary soundtracks |
      | **Featured music** | Concert recordings, performances |
      | **Historical music** | Archival recordings |
      | **Licensed music** | Rights-managed content |

      Music segments may also include:
      - Genre classification (classical, jazz, folk)
      - Mood/tempo analysis
      - Fingerprinting for identification

      **SOUND EVENT DETECTION**:

      Non-speech, non-music audio events:

      | Event Type | Heritage Context |
      |------------|------------------|
      | APPLAUSE | Lecture recordings, openings |
      | LAUGHTER | Tour guides, educational content |
      | CROWD_NOISE | Event documentation |
      | DOOR/FOOTSTEPS | Ambient archive recordings |
      | NATURE_SOUNDS | Outdoor heritage site recordings |
      | MACHINERY | Industrial heritage, conservation |

      **LANGUAGE DETECTION**:

      Multilingual heritage content requires language identification:

      ```yaml
      contain:
      - start: 0.0
        end: 120.0
        language: nl
        speaker_id: speaker_001
      - start: 120.0
        end: 240.0
        language: en
        speaker_id: speaker_001  # Same speaker, switched language
      ```

      **AUDIO QUALITY ANALYSIS**:

      Audio quality metrics for preservation and accessibility:

      | Metric | Description | Threshold |
      |--------|-------------|-----------|
      | SNR | Signal-to-noise ratio | > 20 dB good |
      | Clipping | Peak distortion | None ideal |
      | Noise floor | Background noise level | < -50 dB good |
      | Frequency response | Bandwidth | Full-range ideal |

      **HERITAGE INSTITUTION USE CASES**:

      | Content Type | Audio Analysis Need |
      |--------------|---------------------|
      | Oral histories | Diarization, transcription alignment |
      | Curator interviews | Speaker identification, language |
      | Virtual tours | Background music, voiceover detection |
      | Lecture recordings | Audience reactions, Q&A segments |
      | Conservation videos | Narration vs demonstration audio |
      | Archival footage | Speech recovery, noise reduction |

      **RELATIONSHIP TO VideoTranscript**:

      VideoAudioAnnotation is complementary to VideoTranscript:

      - **VideoTranscript**: The text content of speech (WHAT was said)
      - **VideoAudioAnnotation**: Audio structure (WHO spoke, music, sounds)

      Together they provide complete audio understanding:

      ```
      VideoAudioAnnotation: Speaker 1 spoke 0:00-0:15
      VideoTranscript: "Welcome to the Rijksmuseum..." (0:00-0:15)
      → Combined: Curator said "Welcome to the Rijksmuseum..."
      ```
    exact_mappings:
    - hc:VideoAudioAnnotation
    close_mappings:
    - ma:AudioTrack
    - crm:E13_Attribute_Assignment
    related_mappings:
    - wd:Q11028
    - wd:Q638
    slots:
    - has_segment
    - has_score
    - diarized
    - has_language
    - music_confidence
    - has_music
    - has_sound
    - has_type
    - has_silence
    - has_ratio
    - has_speaker
    - speaker_label
    - has_spoken_words
    - has_confidence_measure
    - contain
    slot_usage:
      has_segment:
        range: AudioEventSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
        - value: '[{has_type: SPEECH, start_seconds: 0.0, end_seconds: 15.0, segment_text: "Speech detected - Speaker 1", confidence: 0.95}]'
        - value: '[{has_type: MUSIC, start_seconds: 30.0, end_seconds: 60.0, segment_text: "Background classical music", confidence: 0.88}]'
      contain:
        # range: string
        multivalued: true
        required: false
        inlined_as_list: false  # Fixed invalid inline for primitive type
        examples:
        - value: '{temporal_extent: {begin_of_the_begin: 0.0, end_of_the_end: 15.0}, contain: {identified_by: spk_001, has_label: Curator}}'
      has_speaker:
        range: integer
        required: false
        minimum_value: 0
        examples:
        - value: 3
      speaker_label:
        # range: string
        multivalued: true
        required: false
        examples:
        - value: '[Curator, Artist, Museum Director]'
      has_music:
        range: boolean
        required: false
        examples:
        - value: true
      # has_music: - DUPLICATE KEY REMOVED (genre-style string list; see has_genre)
      #   multivalued: true
      #   required: false
      #   examples:
      #   - value: '[classical, baroque]'
      music_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
        - value: 0.88
      has_sound:
        range: boolean
        required: false
        examples:
        - value: true
      # contain:
      #   range: SoundEventTypeEnum
      #   multivalued: true
      #   required: false
      #   examples:
      #   - value: '[APPLAUSE, CROWD_NOISE]'
      #     description: Applause and crowd sounds detected
      # contain:
      #   range: VideoTimeSegment
      #   multivalued: true
      #   required: false
      #   inlined_as_list: true
      #   examples:
      #   - value: '[{start_seconds: 45.0, end_seconds: 48.0}]'
      #     description: 3-second silence
      has_silence:
        range: float
        required: false
        minimum_value: 0.0
        examples:
        - value: 15.5
      # has_sound: - DUPLICATE KEY REMOVED (float variant, dB-style example)
      #   range: float
      #   required: false
      #   examples:
      #   - value: -45.0
      has_score:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
        - value: 0.85
      has_ratio:
        range: float
        required: false
        examples:
        - value: 25.0
    comments:
    - Audio event detection for video content
    - Supports speech, music, silence, and sound event detection
    - Speaker diarization for interview navigation
    - Language detection for multilingual heritage content
    - Audio quality metrics for preservation assessment
    see_also:
    - https://www.w3.org/TR/annotation-model/
    - https://arxiv.org/abs/2111.08085
    annotations:
      specificity_score: 0.1
      specificity_rationale: Generic utility class/slot created during migration
      custodian_types: "['*']"
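  # A minimal, hypothetical instance sketch for VideoAudioAnnotation, kept as a
  # comment so the schema file stays valid YAML. All values are illustrative
  # only and assume the slot_usage definitions above:
  #
  #   has_speaker: 3
  #   speaker_label: [Curator, Artist, Museum Director]
  #   has_music: true
  #   music_confidence: 0.88
  #   has_sound: true
  #   has_silence: 15.5
  #   has_ratio: 25.0
  #   has_score: 0.85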
  SpeechSegment:
    class_uri: hc:SpeechSegment
    description: 'A speech segment with speaker and language information.
      Extends VideoTimeSegment with speech-specific metadata.
      '
    slots:
    - has_confidence_measure
    - end_of_the_end
    - in_language
    - begin_of_the_begin
    - has_speaker
    - speaker_label
    - has_spoken_words
    - has_score
    slot_usage:
      begin_of_the_begin:
        range: float
        required: true
        minimum_value: 0.0
      end_of_the_end:
        range: float
        required: true
        minimum_value: 0.0
      has_speaker:
        # range: string
        required: false
      speaker_label:
        # range: string
        required: false
      in_language:
        # range: string
        required: false
      has_confidence_measure:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      has_spoken_words:
        # range: string
        required: false
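  # Hypothetical SpeechSegment instance sketch (comment only; values are
  # illustrative and assume the slot_usage above):
  #
  #   begin_of_the_begin: 0.0
  #   end_of_the_end: 15.0
  #   has_speaker: spk_001
  #   speaker_label: Curator
  #   in_language: nl
  #   has_confidence_measure: 0.95
  #   has_spoken_words: "Welcome to the Rijksmuseum..."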
  DiarizationSegment:
    class_uri: hc:DiarizationSegment
    description: 'A diarization segment identifying speaker and time boundaries.
      Focused on "who spoke when" rather than transcript content.
      '
    slots:
    - has_provenance
    - temporal_extent
    - contain
    - overlap_with
    - has_score
    slot_usage:
      temporal_extent:
        range: TimeSpan
        inlined: true
        required: true
      contain:
        range: Speaker
        inlined: true
        required: true
      has_provenance:
        range: Provenance
        inlined: true
        required: false
      overlap_with:
        range: boolean
        required: false
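  # Hypothetical DiarizationSegment instance sketch (comment only). The
  # temporal_extent and contain values are inlined per the slot_usage above;
  # all values are illustrative:
  #
  #   temporal_extent:
  #     begin_of_the_begin: 15.0
  #     end_of_the_end: 45.0
  #   contain:
  #     identified_by: spk_002
  #     has_label: Artist
  #   overlap_with: false
  #   has_score: 0.91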
  MusicSegment:
    class_uri: hc:MusicSegment
    description: 'A segment of detected music with classification.
      '
    slots:
    - in_background
    - has_music
    - has_genre
    - has_confidence_measure
    - has_score
    slot_usage:
      has_music:
        range: float
        required: true
        minimum_value: 0.0
      # has_music: - DUPLICATE KEY REMOVED (verbatim repeat of the float definition above)
      #   range: float
      #   required: true
      #   minimum_value: 0.0
      # has_music: - DUPLICATE KEY REMOVED (enum variant)
      #   range: MusicTypeEnum
      #   required: false
      has_genre:
        # range: string
        required: false
      has_confidence_measure:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      in_background:
        range: boolean
        required: false
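  # Hypothetical MusicSegment instance sketch (comment only; values are
  # illustrative, and the float interpretation of has_music is an assumption
  # based on the first slot_usage definition above):
  #
  #   in_background: true
  #   has_music: 30.0
  #   has_genre: classical
  #   has_confidence_measure: 0.88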