---
id: https://nde.nl/ontology/hc/class/VideoAudioAnnotation
name: video_audio_annotation_class
title: Video Audio Annotation Class

imports:
  - linkml:types
  - ./VideoAnnotation
  - ./VideoTimeSegment
  - ./AudioEventSegment
  # REMOVED - migrated to has_or_had_segment with range AudioEventSegment (Rule 53)
  # - ../slots/audio_event_segments
  - ../slots/has_or_had_segment
  - ../slots/has_audio_quality_score
  - ../slots/diarization_confidence
  - ../slots/diarization_enabled
  - ../slots/diarization_end_seconds
  - ../slots/has_or_had_diarization_segment
  - ../slots/diarization_speaker_id
  - ../slots/diarization_speaker_label
  - ../slots/diarization_start_seconds
  - ../slots/has_clipping
  - ../slots/is_background
  - ../slots/is_overlapping
  - ../slots/languages_detected
  - ../slots/music_confidence
  - ../slots/music_detected
  - ../slots/music_end_seconds
  - ../slots/music_genre
  - ../slots/music_genres_detected
  - ../slots/music_segment_confidence
  - ../slots/has_or_had_music_segment
  - ../slots/music_start_seconds
  - ../slots/music_type
  - ../slots/noise_floor_db
  - ../slots/primary_audio_event_type
  - ../slots/segment_confidence
  - ../slots/segment_end_seconds
  - ../slots/segment_language
  - ../slots/segment_start_seconds
  - ../slots/has_or_had_silence_segment
  - ../slots/silence_total_seconds
  - ../slots/snr_db
  - ../slots/has_or_had_sound_event_type
  - ../slots/sound_events_detected
  - ../slots/speaker_count
  - ../slots/speaker_id
  - ../slots/speaker_label
  - ../slots/specificity_annotation
  - ../slots/speech_detected
  - ../slots/speech_language
  - ../slots/speech_language_confidence
  - ../slots/has_or_had_speech_segment
  - ../slots/speech_text
  - ../slots/has_or_had_score  # was: template_specificity - migrated per Rule 53 (2026-01-17)
  # NOTE(review): this file itself defines VideoAudioAnnotation; a self-import
  # is suspect — confirm it is intentional and not a leftover.
  - ./VideoAudioAnnotation
  - ./SpecificityAnnotation
  - ./TemplateSpecificityScore  # was: TemplateSpecificityScores - migrated per Rule 53 (2026-01-17)
  # NOTE(review): both singular and plural forms are imported below; the
  # Rule 53 migration note above suggests the plural may be a leftover — confirm.
  - ./TemplateSpecificityType
  - ./TemplateSpecificityTypes
  - ../slots/has_audio_event_segment
  - ../enums/AudioEventTypeEnum
  - ../enums/SoundEventTypeEnum
  - ../enums/MusicTypeEnum

prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  ma: http://www.w3.org/ns/ma-ont#
  wd: http://www.wikidata.org/entity/

default_prefix: hc

classes:
  VideoAudioAnnotation:
    is_a: VideoAnnotation
    class_uri: hc:VideoAudioAnnotation
    abstract: false
    description: |
      Annotation for audio events detected in video content.

      **DEFINITION**:

      VideoAudioAnnotation captures structured information derived from audio
      analysis of video content. This includes speech, music, silence, and
      various sound events.

      **AUDIO ANALYSIS TYPES**:

      | Type | Description | Use Case |
      |------|-------------|----------|
      | **Speech Detection** | Identify spoken segments | Transcript alignment |
      | **Speaker Diarization** | Who spoke when | Interview navigation |
      | **Music Detection** | Identify musical segments | Content classification |
      | **Sound Events** | Applause, laughter, etc. | Audience engagement |
      | **Silence Detection** | Find quiet segments | Quality assessment |
      | **Language Detection** | Identify spoken languages | Multilingual content |

      **SPEAKER DIARIZATION**:

      Diarization answers "who spoke when":

      ```
      0:00-0:15 Speaker 1 (Curator)
      0:15-0:45 Speaker 2 (Artist)
      0:45-1:00 Speaker 1 (Curator)
      1:00-1:30 Speaker 3 (Museum Director)
      ```

      Heritage applications:
      - Navigate to specific speakers in interviews
      - Count speaking time per person
      - Identify unnamed speakers for annotation
      - Build speaker databases for recognition

      **MUSIC DETECTION**:

      Music detection classifies audio segments as containing music:

      | Category | Examples |
      |----------|----------|
      | **Background music** | Documentary soundtracks |
      | **Featured music** | Concert recordings, performances |
      | **Historical music** | Archival recordings |
      | **Licensed music** | Rights-managed content |

      Music segments may also include:
      - Genre classification (classical, jazz, folk)
      - Mood/tempo analysis
      - Fingerprinting for identification

      **SOUND EVENT DETECTION**:

      Non-speech, non-music audio events:

      | Event Type | Heritage Context |
      |------------|------------------|
      | APPLAUSE | Lecture recordings, openings |
      | LAUGHTER | Tour guides, educational content |
      | CROWD_NOISE | Event documentation |
      | DOOR/FOOTSTEPS | Ambient archive recordings |
      | NATURE_SOUNDS | Outdoor heritage site recordings |
      | MACHINERY | Industrial heritage, conservation |

      **LANGUAGE DETECTION**:

      Multilingual heritage content requires language identification:

      ```yaml
      has_or_had_speech_segment:
        - start: 0.0
          end: 120.0
          language: nl
          speaker_id: speaker_001
        - start: 120.0
          end: 240.0
          language: en
          speaker_id: speaker_001  # Same speaker, switched language
      ```

      **AUDIO QUALITY ANALYSIS**:

      Audio quality metrics for preservation and accessibility:

      | Metric | Description | Threshold |
      |--------|-------------|-----------|
      | SNR | Signal-to-noise ratio | > 20 dB good |
      | Clipping | Peak distortion | None ideal |
      | Noise floor | Background noise level | < -50 dB good |
      | Frequency response | Bandwidth | Full-range ideal |

      **HERITAGE INSTITUTION USE CASES**:

      | Content Type | Audio Analysis Need |
      |--------------|---------------------|
      | Oral histories | Diarization, transcription alignment |
      | Curator interviews | Speaker identification, language |
      | Virtual tours | Background music, voiceover detection |
      | Lecture recordings | Audience reactions, Q&A segments |
      | Conservation videos | Narration vs demonstration audio |
      | Archival footage | Speech recovery, noise reduction |

      **RELATIONSHIP TO VideoTranscript**:

      VideoAudioAnnotation is complementary to VideoTranscript:

      - **VideoTranscript**: The text content of speech (WHAT was said)
      - **VideoAudioAnnotation**: Audio structure (WHO spoke, music, sounds)

      Together they provide complete audio understanding:

      ```
      VideoAudioAnnotation: Speaker 1 spoke 0:00-0:15
      VideoTranscript: "Welcome to the Rijksmuseum..." (0:00-0:15)
      → Combined: Curator said "Welcome to the Rijksmuseum..."
      ```
    exact_mappings:
      - hc:VideoAudioAnnotation
    close_mappings:
      - ma:AudioTrack
      - crm:E13_Attribute_Assignment
    related_mappings:
      # FIX: was wikidata:Q11028 / wikidata:Q638 — no 'wikidata' prefix is
      # declared in this schema; the declared Wikidata entity prefix is 'wd'.
      - wd:Q11028
      - wd:Q638
    slots:
      # MIGRATED from audio_event_segments to has_or_had_segment (Rule 53)
      # - audio_event_segments
      - has_or_had_segment
      # FIX: was 'audio_quality_score' — the imported slot and the slot_usage
      # entry below are both named has_audio_quality_score.
      - has_audio_quality_score
      - diarization_enabled
      - has_or_had_diarization_segment
      - has_clipping
      - languages_detected
      - music_confidence
      - music_detected
      - music_genres_detected
      - has_or_had_music_segment
      - noise_floor_db
      - primary_audio_event_type
      - has_or_had_silence_segment
      - silence_total_seconds
      - snr_db
      - has_or_had_sound_event_type
      - sound_events_detected
      - speaker_count
      - speaker_label
      - specificity_annotation
      - speech_detected
      - speech_language
      - speech_language_confidence
      - has_or_had_speech_segment
      - has_or_had_score  # was: template_specificity - migrated per Rule 53 (2026-01-17)
      # FIX: deprecated slot was constrained in slot_usage but missing from
      # this list; it must be listed for the slot_usage entry to apply.
      - has_audio_event_segment
    slot_usage:
      has_or_had_segment:
        description: |
          MIGRATED from audio_event_segments (Rule 53).
          Audio event segments detected in the video content.
        range: AudioEventSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{audio_event_type: SPEECH, start_seconds: 0.0, end_seconds: 15.0, segment_text: "Speech detected - Speaker 1", confidence: 0.95}]'
            description: Speech detection segment
          - value: '[{audio_event_type: MUSIC, start_seconds: 30.0, end_seconds: 60.0, segment_text: "Background classical music", confidence: 0.88}]'
            description: Music detection segment
      # NOTE: has_audio_event_segment is deprecated - use has_or_had_segment above
      has_audio_event_segment:
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, segment_text: ''Speech detected - Speaker 1''}]'
            description: Speech detection segment
      primary_audio_event_type:
        range: AudioEventTypeEnum
        required: true
        examples:
          - value: SPEECH
            description: Primary focus on speech analysis
      speech_detected:
        range: boolean
        # FIX: example values quoted — unquoted true/false parse as YAML
        # booleans, but LinkML example values are strings (applies to every
        # boolean example below).
        required: false
        examples:
          - value: 'true'
            description: Speech is present in video
      has_or_had_speech_segment:
        range: SpeechSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: ''spk_001'', language: ''nl''}]'
            description: Dutch speech from speaker 1
      speech_language:
        range: string
        required: false
        examples:
          - value: nl
            description: Dutch is primary language
          - value: en
            description: English is primary language
      speech_language_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.95
            description: High confidence language detection
      languages_detected:
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[nl, en, de]'
            description: Dutch, English, and German detected
      diarization_enabled:
        range: boolean
        required: false
        examples:
          - value: 'true'
            description: Diarization was performed
      has_or_had_diarization_segment:
        range: DiarizationSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: ''spk_001'', speaker_label: ''Curator''}]'
            description: Curator speaking for first 15 seconds
      speaker_count:
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 3
            description: Three distinct speakers detected
      speaker_label:
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[Curator, Artist, Museum Director]'
            description: Three identified speakers
      music_detected:
        range: boolean
        required: false
        examples:
          - value: 'true'
            description: Music present in video
      has_or_had_music_segment:
        range: MusicSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 30.0, music_type: ''BACKGROUND'', genre: ''classical''}]'
            description: Classical background music
      music_genres_detected:
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[classical, baroque]'
            description: Classical and baroque music detected
      music_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.88
            description: High confidence music detection
      sound_events_detected:
        range: boolean
        required: false
        examples:
          - value: 'true'
            description: Sound events detected
      has_or_had_sound_event_type:
        range: SoundEventTypeEnum
        multivalued: true
        required: false
        examples:
          - value: '[APPLAUSE, CROWD_NOISE]'
            description: Applause and crowd sounds detected
      has_or_had_silence_segment:
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 45.0, end_seconds: 48.0}]'
            description: 3-second silence
      silence_total_seconds:
        range: float
        required: false
        minimum_value: 0.0
        examples:
          - value: 15.5
            description: 15.5 seconds of total silence
      noise_floor_db:
        range: float
        required: false
        examples:
          - value: -45.0
            description: Good quality, moderate noise floor
      has_audio_quality_score:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.85
            description: High audio quality
      snr_db:
        range: float
        required: false
        examples:
          - value: 25.0
            description: Good signal-to-noise ratio
      has_clipping:
        range: boolean
        required: false
        examples:
          - value: 'false'
            description: No clipping detected
    comments:
      - Audio event detection for video content
      - Supports speech, music, silence, and sound event detection
      - Speaker diarization for interview navigation
      - Language detection for multilingual heritage content
      - Audio quality metrics for preservation assessment
    see_also:
      - https://www.w3.org/TR/annotation-model/
      - https://arxiv.org/abs/2111.08085

  SpeechSegment:
    class_uri: hc:SpeechSegment
    # NOTE(review): description says "Extends VideoTimeSegment" but no
    # is_a/mixin is declared — confirm whether an is_a: VideoTimeSegment is
    # missing or the wording is loose.
    description: |-
      A speech segment with speaker and language information.

      Extends VideoTimeSegment with speech-specific metadata.
    slots:
      - segment_confidence
      - segment_end_seconds
      - segment_language
      - segment_start_seconds
      - speaker_id
      - speaker_label
      - specificity_annotation
      - speech_text
      - has_or_had_score  # was: template_specificity - migrated per Rule 53 (2026-01-17)
    slot_usage:
      segment_start_seconds:
        range: float
        required: true
        minimum_value: 0.0
      segment_end_seconds:
        range: float
        required: true
        minimum_value: 0.0
      speaker_id:
        range: string
        required: false
      speaker_label:
        range: string
        required: false
      segment_language:
        range: string
        required: false
      segment_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      speech_text:
        range: string
        required: false

  DiarizationSegment:
    class_uri: hc:DiarizationSegment
    description: |-
      A diarization segment identifying speaker and time boundaries.

      Focused on "who spoke when" rather than transcript content.
    slots:
      - diarization_confidence
      - diarization_end_seconds
      - diarization_speaker_id
      - diarization_speaker_label
      - diarization_start_seconds
      - is_overlapping
      - specificity_annotation
      - has_or_had_score  # was: template_specificity - migrated per Rule 53 (2026-01-17)
    slot_usage:
      diarization_start_seconds:
        range: float
        required: true
        minimum_value: 0.0
      diarization_end_seconds:
        range: float
        required: true
        minimum_value: 0.0
      diarization_speaker_id:
        range: string
        required: true
      diarization_speaker_label:
        range: string
        required: false
      diarization_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      is_overlapping:
        range: boolean
        required: false

  MusicSegment:
    class_uri: hc:MusicSegment
    description: A segment of detected music with classification.
    slots:
      - is_background
      - music_end_seconds
      - music_genre
      - music_segment_confidence
      - music_start_seconds
      - music_type
      - specificity_annotation
      - has_or_had_score  # was: template_specificity - migrated per Rule 53 (2026-01-17)
    slot_usage:
      music_start_seconds:
        range: float
        required: true
        minimum_value: 0.0
      music_end_seconds:
        range: float
        required: true
        minimum_value: 0.0
      music_type:
        range: MusicTypeEnum
        required: false
      music_genre:
        range: string
        required: false
      music_segment_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      is_background:
        range: boolean
        required: false