---
id: https://nde.nl/ontology/hc/class/VideoAudioAnnotation
name: video_audio_annotation_class
title: Video Audio Annotation Class

imports:
  - linkml:types
  - ./VideoAnnotation
  - ./VideoTimeSegment
  - ./AudioEventSegment
  - ../slots/has_or_had_segment
  - ../slots/has_audio_quality_score
  - ../slots/has_or_had_provenance
  - ./Provenance
  - ./ConfidenceScore
  - ../slots/is_or_was_diarized
  - ./DiarizationStatus
  - ../slots/temporal_extent
  - ./TimeSpan
  # NOTE(review): these two slot imports say start_of_the_start/end_of_the_end,
  # but the DiarizationSegment example below uses begin_of_the_begin and
  # end_of_the_end — confirm which slot names the TimeSpan class actually uses.
  - ../slots/start_of_the_start
  - ../slots/end_of_the_end
  - ./Timestamp
  - ../slots/contains_or_contained
  - ./Speaker
  - ../slots/has_or_had_identifier
  - ./Identifier
  - ../slots/has_or_had_label
  - ./Label
  - ../slots/has_clipping
  - ../slots/is_background
  - ../slots/is_overlapping
  - ../slots/languages_detected
  - ../slots/music_confidence
  - ../slots/music_detected
  - ../slots/music_end_seconds
  - ../slots/music_genre
  - ../slots/music_genres_detected
  - ../slots/music_segment_confidence
  - ../slots/has_or_had_music_segment
  - ../slots/music_start_seconds
  - ../slots/music_type
  - ../slots/noise_floor_db
  - ../slots/has_or_had_type
  - ../slots/segment_confidence
  - ../slots/segment_end_seconds
  - ../slots/segment_language
  - ../slots/segment_start_seconds
  - ../slots/silence_total_seconds
  - ../slots/snr_db
  - ../slots/sound_events_detected
  - ../slots/speaker_count
  - ../slots/speaker_id
  - ../slots/speaker_label
  - ../slots/specificity_annotation
  - ../slots/speech_detected
  - ../slots/speech_language
  - ../slots/speech_language_confidence
  - ../slots/speech_text
  - ../slots/has_or_had_score
  # Cleanup: removed a self-import of ./VideoAudioAnnotation (this schema),
  # four duplicate ../slots/contains_or_contained entries, and a duplicate
  # ./AudioEventSegment entry.
  - ./SpecificityAnnotation
  - ./TemplateSpecificityScore
  - ./TemplateSpecificityType
  - ./TemplateSpecificityTypes
  - ../enums/AudioEventTypeEnum
  - ../enums/SoundEventTypeEnum
  - ../enums/MusicTypeEnum

prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  ma: http://www.w3.org/ns/ma-ont#
  wd: http://www.wikidata.org/entity/
default_prefix: hc

classes:
  VideoAudioAnnotation:
    is_a: VideoAnnotation
    class_uri: hc:VideoAudioAnnotation
    abstract: false
    description: |
      Annotation for audio events detected in video content.

      **DEFINITION**:

      VideoAudioAnnotation captures structured information derived from audio
      analysis of video content. This includes speech, music, silence, and
      various sound events.

      **AUDIO ANALYSIS TYPES**:

      | Type | Description | Use Case |
      |------|-------------|----------|
      | **Speech Detection** | Identify spoken segments | Transcript alignment |
      | **Speaker Diarization** | Who spoke when | Interview navigation |
      | **Music Detection** | Identify musical segments | Content classification |
      | **Sound Events** | Applause, laughter, etc. | Audience engagement |
      | **Silence Detection** | Find quiet segments | Quality assessment |
      | **Language Detection** | Identify spoken languages | Multilingual content |

      **SPEAKER DIARIZATION**:

      Diarization answers "who spoke when":

      ```
      0:00-0:15 Speaker 1 (Curator)
      0:15-0:45 Speaker 2 (Artist)
      0:45-1:00 Speaker 1 (Curator)
      1:00-1:30 Speaker 3 (Museum Director)
      ```

      Heritage applications:
      - Navigate to specific speakers in interviews
      - Count speaking time per person
      - Identify unnamed speakers for annotation
      - Build speaker databases for recognition

      **MUSIC DETECTION**:

      Music detection classifies audio segments as containing music:

      | Category | Examples |
      |----------|----------|
      | **Background music** | Documentary soundtracks |
      | **Featured music** | Concert recordings, performances |
      | **Historical music** | Archival recordings |
      | **Licensed music** | Rights-managed content |

      Music segments may also include:
      - Genre classification (classical, jazz, folk)
      - Mood/tempo analysis
      - Fingerprinting for identification

      **SOUND EVENT DETECTION**:

      Non-speech, non-music audio events:

      | Event Type | Heritage Context |
      |------------|------------------|
      | APPLAUSE | Lecture recordings, openings |
      | LAUGHTER | Tour guides, educational content |
      | CROWD_NOISE | Event documentation |
      | DOOR/FOOTSTEPS | Ambient archive recordings |
      | NATURE_SOUNDS | Outdoor heritage site recordings |
      | MACHINERY | Industrial heritage, conservation |

      **LANGUAGE DETECTION**:

      Multilingual heritage content requires language identification:

      ```yaml
      contains_or_contained:
        - start: 0.0
          end: 120.0
          language: nl
          speaker_id: speaker_001
        - start: 120.0
          end: 240.0
          language: en
          speaker_id: speaker_001  # Same speaker, switched language
      ```

      **AUDIO QUALITY ANALYSIS**:

      Audio quality metrics for preservation and accessibility:

      | Metric | Description | Threshold |
      |--------|-------------|-----------|
      | SNR | Signal-to-noise ratio | > 20 dB good |
      | Clipping | Peak distortion | None ideal |
      | Noise floor | Background noise level | < -50 dB good |
      | Frequency response | Bandwidth | Full-range ideal |

      **HERITAGE INSTITUTION USE CASES**:

      | Content Type | Audio Analysis Need |
      |--------------|---------------------|
      | Oral histories | Diarization, transcription alignment |
      | Curator interviews | Speaker identification, language |
      | Virtual tours | Background music, voiceover detection |
      | Lecture recordings | Audience reactions, Q&A segments |
      | Conservation videos | Narration vs demonstration audio |
      | Archival footage | Speech recovery, noise reduction |

      **RELATIONSHIP TO VideoTranscript**:

      VideoAudioAnnotation is complementary to VideoTranscript:

      - **VideoTranscript**: The text content of speech (WHAT was said)
      - **VideoAudioAnnotation**: Audio structure (WHO spoke, music, sounds)

      Together they provide complete audio understanding:

      ```
      VideoAudioAnnotation: Speaker 1 spoke 0:00-0:15
      VideoTranscript: "Welcome to the Rijksmuseum..." (0:00-0:15)
      → Combined: Curator said "Welcome to the Rijksmuseum..."
      ```
    exact_mappings:
      - hc:VideoAudioAnnotation
    close_mappings:
      - ma:AudioTrack
      - crm:E13_Attribute_Assignment
    related_mappings:
      # Fixed: these used the undeclared prefix `wikidata:`; the prefixes
      # block declares `wd:` for http://www.wikidata.org/entity/.
      - wd:Q11028
      - wd:Q638
    slots:
      - has_or_had_segment
      # - contains_or_contained - DUPLICATE REMOVED
      - has_audio_quality_score
      - is_or_was_diarized
      # - contains_or_contained - DUPLICATE REMOVED
      - has_clipping
      - languages_detected
      - music_confidence
      - music_detected
      - music_genres_detected
      - has_or_had_music_segment
      - noise_floor_db
      - has_or_had_type
      # - contains_or_contained - DUPLICATE REMOVED
      - silence_total_seconds
      - snr_db
      # - contains_or_contained - DUPLICATE REMOVED
      - sound_events_detected
      - speaker_count
      - speaker_label
      - specificity_annotation
      - speech_detected
      - speech_language
      - speech_language_confidence
      - contains_or_contained
      - has_or_had_score
    slot_usage:
      has_or_had_segment:
        description: >-
          MIGRATED from audio_event_segments (Rule 53). Audio event segments
          detected in the video content.
        range: AudioEventSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{has_or_had_type: SPEECH, start_seconds: 0.0, end_seconds: 15.0, segment_text: "Speech detected - Speaker 1", confidence: 0.95}]'
            description: Speech detection segment
          - value: '[{has_or_had_type: MUSIC, start_seconds: 30.0, end_seconds: 60.0, segment_text: "Background classical music", confidence: 0.88}]'
            description: Music detection segment
      contains_or_contained:
        range: DiarizationSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value:
              temporal_extent:
                begin_of_the_begin: 0.0
                end_of_the_end: 15.0
              contains_or_contained:
                has_or_had_identifier: spk_001
                has_or_had_label: Curator
            description: Curator speaking for first 15 seconds
      speaker_count:
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 3
            description: Three distinct speakers detected
      speaker_label:
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[Curator, Artist, Museum Director]'
            description: Three identified speakers
      music_detected:
        range: boolean
        required: false
        examples:
          - value: true
            description: Music present in video
      has_or_had_music_segment:
        range: MusicSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 30.0, music_type: ''BACKGROUND'', genre: ''classical''}]'
            description: Classical background music
      music_genres_detected:
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[classical, baroque]'
            description: Classical and baroque music detected
      music_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.88
            description: High confidence music detection
      sound_events_detected:
        range: boolean
        required: false
        examples:
          - value: true
            description: Sound events detected
      # Disabled variants kept for reference: re-enabling either would
      # duplicate the contains_or_contained key used for diarization above.
      # contains_or_contained:
      #   range: SoundEventTypeEnum
      #   multivalued: true
      #   required: false
      #   examples:
      #     - value: '[APPLAUSE, CROWD_NOISE]'
      #       description: Applause and crowd sounds detected
      # contains_or_contained:
      #   range: VideoTimeSegment
      #   multivalued: true
      #   required: false
      #   inlined_as_list: true
      #   examples:
      #     - value: '[{start_seconds: 45.0, end_seconds: 48.0}]'
      #       description: 3-second silence
      silence_total_seconds:
        range: float
        required: false
        minimum_value: 0.0
        examples:
          - value: 15.5
            description: 15.5 seconds of total silence
      noise_floor_db:
        range: float
        required: false
        examples:
          - value: -45.0
            description: Good quality, moderate noise floor
      has_audio_quality_score:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.85
            description: High audio quality
      snr_db:
        range: float
        required: false
        examples:
          - value: 25.0
            description: Good signal-to-noise ratio
      has_clipping:
        range: boolean
        required: false
        examples:
          - value: false
            description: No clipping detected
    comments:
      - Audio event detection for video content
      - Supports speech, music, silence, and sound event detection
      - Speaker diarization for interview navigation
      - Language detection for multilingual heritage content
      - Audio quality metrics for preservation assessment
    see_also:
      - https://www.w3.org/TR/annotation-model/
      - https://arxiv.org/abs/2111.08085
    annotations:
      specificity_score: 0.1
      specificity_rationale: Generic utility class/slot created during migration
      custodian_types: "['*']"
      custodian_types_rationale: Universal utility concept

  # NOTE(review): the description says this "Extends VideoTimeSegment", but no
  # is_a is declared — confirm whether an is_a: VideoTimeSegment is intended.
  SpeechSegment:
    class_uri: hc:SpeechSegment
    description: >-
      A speech segment with speaker and language information. Extends
      VideoTimeSegment with speech-specific metadata.
    slots:
      - segment_confidence
      - segment_end_seconds
      - segment_language
      - segment_start_seconds
      - speaker_id
      - speaker_label
      - specificity_annotation
      - speech_text
      - has_or_had_score
    slot_usage:
      segment_start_seconds:
        range: float
        required: true
        minimum_value: 0.0
      segment_end_seconds:
        range: float
        required: true
        minimum_value: 0.0
      speaker_id:
        range: string
        required: false
      speaker_label:
        range: string
        required: false
      segment_language:
        range: string
        required: false
      segment_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      speech_text:
        range: string
        required: false

  DiarizationSegment:
    class_uri: hc:DiarizationSegment
    description: >-
      A diarization segment identifying speaker and time boundaries. Focused
      on "who spoke when" rather than transcript content.
    slots:
      - has_or_had_provenance
      - temporal_extent
      - contains_or_contained
      - is_overlapping
      - specificity_annotation
      - has_or_had_score
    slot_usage:
      temporal_extent:
        description: Time range of the diarization segment.
        range: TimeSpan
        inlined: true
        required: true
      contains_or_contained:
        description: Speaker identified in this segment.
        range: Speaker
        inlined: true
        required: true
      has_or_had_provenance:
        description: Provenance metadata including confidence score.
        range: Provenance
        inlined: true
        required: false
      is_overlapping:
        range: boolean
        required: false

  MusicSegment:
    class_uri: hc:MusicSegment
    description: >-
      A segment of detected music with classification.
    slots:
      - is_background
      - music_end_seconds
      - music_genre
      - music_segment_confidence
      - music_start_seconds
      - music_type
      - specificity_annotation
      - has_or_had_score
    slot_usage:
      music_start_seconds:
        range: float
        required: true
        minimum_value: 0.0
      music_end_seconds:
        range: float
        required: true
        minimum_value: 0.0
      music_type:
        range: MusicTypeEnum
        required: false
      music_genre:
        range: string
        required: false
      music_segment_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      is_background:
        range: boolean
        required: false