---
# LinkML schema: audio-derived annotations for video content.
# Reconstructed from a newline-collapsed source; duplicate mapping keys and
# duplicate slot-list entries (invalid YAML / last-wins in most parsers)
# have been resolved — see NOTE(review) comments at each repair site.
id: https://nde.nl/ontology/hc/class/VideoAudioAnnotation
name: video_audio_annotation_class
title: Video Audio Annotation Class

imports:
  - linkml:types
  - ../enums/AudioEventTypeEnum
  - ../enums/MusicTypeEnum
  - ../enums/SoundEventTypeEnum
  - ../slots/contain
  - ../slots/end_of_the_end
  - ../slots/has_score
  - ../slots/identified_by
  - ../slots/has_label
  - ../slots/has_provenance
  - ../slots/has_segment
  - ../slots/has_type
  - ../slots/in_background
  - ../slots/diarized
  - ../slots/overlap_with
  - ../slots/has_language
  - ../slots/has_confidence_measure
  - ../slots/has_music
  - ../slots/has_genre
  - ../slots/has_sound
  - ../slots/in_language
  - ../slots/begin_of_the_begin
  - ../slots/has_silence
  - ../slots/has_ratio
  - ../slots/has_speaker
  - ../slots/20260202_matang/speaker_label
  - ../slots/has_spoken_words
  - ../slots/temporal_extent

prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  ma: http://www.w3.org/ns/ma-ont#
  wd: http://www.wikidata.org/entity/

default_prefix: hc

classes:
  VideoAudioAnnotation:
    is_a: VideoAnnotation
    class_uri: hc:VideoAudioAnnotation
    abstract: false
    # Description converted from an escaped double-quoted scalar to a literal
    # block scalar for readability; content is unchanged.
    description: |
      Annotation for audio events detected in video content.

      **DEFINITION**:

      VideoAudioAnnotation captures structured information derived from audio
      analysis of video content. This includes speech, music, silence, and
      various sound events.

      **AUDIO ANALYSIS TYPES**:

      | Type | Description | Use Case |
      |------|-------------|----------|
      | **Speech Detection** | Identify spoken segments | Transcript alignment |
      | **Speaker Diarization** | Who spoke when | Interview navigation |
      | **Music Detection** | Identify musical segments | Content classification |
      | **Sound Events** | Applause, laughter, etc. | Audience engagement |
      | **Silence Detection** | Find quiet segments | Quality assessment |
      | **Language Detection** | Identify spoken languages | Multilingual content |

      **SPEAKER DIARIZATION**:

      Diarization answers "who spoke when":

      ```
      0:00-0:15 Speaker 1 (Curator)
      0:15-0:45 Speaker 2 (Artist)
      0:45-1:00 Speaker 1 (Curator)
      1:00-1:30 Speaker 3 (Museum Director)
      ```

      Heritage applications:
      - Navigate to specific speakers in interviews
      - Count speaking time per person
      - Identify unnamed speakers for annotation
      - Build speaker databases for recognition

      **MUSIC DETECTION**:

      Music detection classifies audio segments as containing music:

      | Category | Examples |
      |----------|----------|
      | **Background music** | Documentary soundtracks |
      | **Featured music** | Concert recordings, performances |
      | **Historical music** | Archival recordings |
      | **Licensed music** | Rights-managed content |

      Music segments may also include:
      - Genre classification (classical, jazz, folk)
      - Mood/tempo analysis
      - Fingerprinting for identification

      **SOUND EVENT DETECTION**:

      Non-speech, non-music audio events:

      | Event Type | Heritage Context |
      |------------|------------------|
      | APPLAUSE | Lecture recordings, openings |
      | LAUGHTER | Tour guides, educational content |
      | CROWD_NOISE | Event documentation |
      | DOOR/FOOTSTEPS | Ambient archive recordings |
      | NATURE_SOUNDS | Outdoor heritage site recordings |
      | MACHINERY | Industrial heritage, conservation |

      **LANGUAGE DETECTION**:

      Multilingual heritage content requires language identification:

      ```yaml
      contain:
       - start: 0.0
         end: 120.0
         language: nl
         speaker_id: speaker_001
       - start: 120.0
         end: 240.0
         language: en
         speaker_id: speaker_001 # Same speaker, switched language
      ```

      **AUDIO QUALITY ANALYSIS**:

      Audio quality metrics for preservation and accessibility:

      | Metric | Description | Threshold |
      |--------|-------------|-----------|
      | SNR | Signal-to-noise ratio | > 20 dB good |
      | Clipping | Peak distortion | None ideal |
      | Noise floor | Background noise level | < -50 dB good |
      | Frequency response | Bandwidth | Full-range ideal |

      **HERITAGE INSTITUTION USE CASES**:

      | Content Type | Audio Analysis Need |
      |--------------|---------------------|
      | Oral histories | Diarization, transcription alignment |
      | Curator interviews | Speaker identification, language |
      | Virtual tours | Background music, voiceover detection |
      | Lecture recordings | Audience reactions, Q&A segments |
      | Conservation videos | Narration vs demonstration audio |
      | Archival footage | Speech recovery, noise reduction |

      **RELATIONSHIP TO VideoTranscript**:

      VideoAudioAnnotation is complementary to VideoTranscript:

      - **VideoTranscript**: The text content of speech (WHAT was said)
      - **VideoAudioAnnotation**: Audio structure (WHO spoke, music, sounds)

      Together they provide complete audio understanding:

      ```
      VideoAudioAnnotation: Speaker 1 spoke 0:00-0:15
      VideoTranscript: "Welcome to the Rijksmuseum..." (0:00-0:15)
      → Combined: Curator said "Welcome to the Rijksmuseum..."
      ```
    exact_mappings:
      - hc:VideoAudioAnnotation
    close_mappings:
      - ma:AudioTrack
      - crm:E13_Attribute_Assignment
    related_mappings:
      # Fixed: source used the undeclared prefix `wikidata:`; this schema
      # declares the Wikidata entity namespace as `wd:`.
      - wd:Q11028
      - wd:Q638
    # Slot list deduplicated: has_music, has_sound, has_spoken_words and
    # has_score each appeared twice in the migrated source; `contain` had
    # several commented-out duplicates. `has_genre` added because the genre
    # list below was migrated under a duplicate `has_music` key.
    slots:
      - has_segment
      - has_score
      - diarized
      - has_language
      - music_confidence
      - has_music
      - has_genre
      - has_sound
      - has_type
      - has_silence
      - has_ratio
      - has_speaker
      - speaker_label
      - has_spoken_words
      - has_confidence_measure
      - contain
    slot_usage:
      has_segment:
        range: AudioEventSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{has_type: SPEECH, start_seconds: 0.0, end_seconds: 15.0, segment_text: "Speech detected - Speaker 1", confidence: 0.95}]'
          - value: '[{has_type: MUSIC, start_seconds: 30.0, end_seconds: 60.0, segment_text: "Background classical music", confidence: 0.88}]'
      contain:
        # range: string
        multivalued: true
        required: false
        inlined_as_list: false  # Fixed invalid inline for primitive type
        # NOTE(review): nesting of this example reconstructed from the
        # collapsed source — TODO confirm intended structure.
        examples:
          - value:
              temporal_extent:
                begin_of_the_begin: 0.0
                end_of_the_end: 15.0
              contain:
                identified_by: spk_001
                has_label: Curator
      has_speaker:
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 3
      speaker_label:
        # range: string
        multivalued: true
        required: false
        examples:
          - value: '[Curator, Artist, Museum Director]'
      has_music:
        range: boolean
        required: false
        examples:
          - value: true
      # NOTE(review): this genre list was a duplicate `has_music` key in the
      # migrated source; moved to the imported `has_genre` slot, matching its
      # use on MusicSegment — TODO confirm.
      has_genre:
        # range: string
        multivalued: true
        required: false
        examples:
          - value: '[classical, baroque]'
      music_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.88
      has_sound:
        range: boolean
        required: false
        examples:
          - value: true
      # NOTE(review): a second `has_sound` (range: float, example -45.0 — a
      # noise-floor level in dB per the description tables) was a duplicate
      # key and is removed here; it needs its own slot — TODO assign one.
      # contain:
      #   range: SoundEventTypeEnum
      #   multivalued: true
      #   required: false
      #   examples:
      #     - value: '[APPLAUSE, CROWD_NOISE]'
      #       description: Applause and crowd sounds detected
      # contain:
      #   range: VideoTimeSegment
      #   multivalued: true
      #   required: false
      #   inlined_as_list: true
      #   examples:
      #     - value: '[{start_seconds: 45.0, end_seconds: 48.0}]'
      #       description: 3-second silence
      has_silence:
        range: float
        required: false
        minimum_value: 0.0
        examples:
          - value: 15.5
      has_score:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.85
      has_ratio:
        range: float
        required: false
        examples:
          - value: 25.0
    comments:
      - Audio event detection for video content
      - Supports speech, music, silence, and sound event detection
      - Speaker diarization for interview navigation
      - Language detection for multilingual heritage content
      - Audio quality metrics for preservation assessment
    see_also:
      - https://www.w3.org/TR/annotation-model/
      - https://arxiv.org/abs/2111.08085
    annotations:
      specificity_score: 0.1
      specificity_rationale: Generic utility class/slot created during migration
      custodian_types: "['*']"

  SpeechSegment:
    class_uri: hc:SpeechSegment
    description: |-
      A speech segment with speaker and language information.

      Extends VideoTimeSegment with speech-specific metadata.
    slots:
      - has_confidence_measure
      - end_of_the_end
      - in_language
      - begin_of_the_begin
      - has_speaker
      - speaker_label
      - has_spoken_words
      - has_score
    slot_usage:
      begin_of_the_begin:
        range: float
        required: true
        minimum_value: 0.0
      end_of_the_end:
        range: float
        required: true
        minimum_value: 0.0
      has_speaker:
        # range: string
        required: false
      speaker_label:
        # range: string
        required: false
      in_language:
        # range: string
        required: false
      has_confidence_measure:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      has_spoken_words:
        # range: string
        required: false

  DiarizationSegment:
    class_uri: hc:DiarizationSegment
    description: |-
      A diarization segment identifying speaker and time boundaries.

      Focused on "who spoke when" rather than transcript content.
    slots:
      - has_provenance
      - temporal_extent
      - contain
      - overlap_with
      - has_score
    slot_usage:
      temporal_extent:
        range: TimeSpan
        inlined: true
        required: true
      contain:
        range: Speaker
        inlined: true
        required: true
      has_provenance:
        range: Provenance
        inlined: true
        required: false
      overlap_with:
        range: boolean
        required: false

  MusicSegment:
    class_uri: hc:MusicSegment
    description: |-
      A segment of detected music with classification.
    # NOTE(review): the migrated source listed `has_music` three times here,
    # with conflicting ranges (two required floats with minimum 0.0, plus
    # MusicTypeEnum). The two floats match the begin/end pattern used by
    # SpeechSegment and are mapped to begin_of_the_begin / end_of_the_end
    # (both imported) — TODO confirm against the pre-migration schema.
    slots:
      - in_background
      - begin_of_the_begin
      - end_of_the_end
      - has_music
      - has_genre
      - has_confidence_measure
      - has_score
    slot_usage:
      begin_of_the_begin:
        range: float
        required: true
        minimum_value: 0.0
      end_of_the_end:
        range: float
        required: true
        minimum_value: 0.0
      has_music:
        range: MusicTypeEnum
        required: false
      has_genre:
        # range: string
        required: false
      has_confidence_measure:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      in_background:
        range: boolean
        required: false