# LinkML schema: VideoAudioAnnotation and its segment classes.
# Reconstructed into canonical block-style YAML (2-space indent); the prior
# serialization had lost all line structure.
id: https://nde.nl/ontology/hc/class/VideoAudioAnnotation
name: video_audio_annotation_class
title: Video Audio Annotation Class
imports:
  - linkml:types
  - ./VideoAnnotation
  - ./VideoTimeSegment
  - ../slots/audio_event_segments
  - ../slots/has_audio_quality_score
  - ../slots/diarization_confidence
  - ../slots/diarization_enabled
  - ../slots/diarization_end_seconds
  - ../slots/has_or_had_diarization_segment
  - ../slots/diarization_speaker_id
  - ../slots/diarization_speaker_label
  - ../slots/diarization_start_seconds
  - ../slots/has_clipping
  - ../slots/is_background
  - ../slots/is_overlapping
  - ../slots/languages_detected
  - ../slots/music_confidence
  - ../slots/music_detected
  - ../slots/music_end_seconds
  - ../slots/music_genre
  - ../slots/music_genres_detected
  - ../slots/music_segment_confidence
  - ../slots/has_or_had_music_segment
  - ../slots/music_start_seconds
  - ../slots/music_type
  - ../slots/noise_floor_db
  - ../slots/primary_audio_event_type
  - ../slots/segment_confidence
  - ../slots/segment_end_seconds
  - ../slots/segment_language
  - ../slots/segment_start_seconds
  - ../slots/has_or_had_silence_segment
  - ../slots/silence_total_seconds
  - ../slots/snr_db
  - ../slots/has_or_had_sound_event_type
  - ../slots/sound_events_detected
  - ../slots/speaker_count
  - ../slots/speaker_id
  # NOTE(review): '../slots/speaker_label' was listed twice; duplicate removed.
  - ../slots/speaker_label
  - ../slots/specificity_annotation
  - ../slots/speech_detected
  - ../slots/speech_language
  - ../slots/speech_language_confidence
  - ../slots/has_or_had_speech_segment
  - ../slots/speech_text
  - ../slots/template_specificity
  - ./DiarizationSegment
  - ./MusicSegment
  - ./SpecificityAnnotation
  - ./SpeechSegment
  - ./TemplateSpecificityScores
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  ma: http://www.w3.org/ns/ma-ont#
  wd: http://www.wikidata.org/entity/
default_prefix: hc
classes:
  VideoAudioAnnotation:
    is_a: VideoAnnotation
    class_uri: hc:VideoAudioAnnotation
    abstract: false
    description: |
      Annotation for audio events detected in video content.

      **DEFINITION**: VideoAudioAnnotation captures structured information
      derived from audio analysis of video content. This includes speech,
      music, silence, and various sound events.

      **AUDIO ANALYSIS TYPES**:

      | Type | Description | Use Case |
      |------|-------------|----------|
      | **Speech Detection** | Identify spoken segments | Transcript alignment |
      | **Speaker Diarization** | Who spoke when | Interview navigation |
      | **Music Detection** | Identify musical segments | Content classification |
      | **Sound Events** | Applause, laughter, etc. | Audience engagement |
      | **Silence Detection** | Find quiet segments | Quality assessment |
      | **Language Detection** | Identify spoken languages | Multilingual content |

      **SPEAKER DIARIZATION**:

      Diarization answers "who spoke when":

      ```
      0:00-0:15 Speaker 1 (Curator)
      0:15-0:45 Speaker 2 (Artist)
      0:45-1:00 Speaker 1 (Curator)
      1:00-1:30 Speaker 3 (Museum Director)
      ```

      Heritage applications:
      - Navigate to specific speakers in interviews
      - Count speaking time per person
      - Identify unnamed speakers for annotation
      - Build speaker databases for recognition

      **MUSIC DETECTION**:

      Music detection classifies audio segments as containing music:

      | Category | Examples |
      |----------|----------|
      | **Background music** | Documentary soundtracks |
      | **Featured music** | Concert recordings, performances |
      | **Historical music** | Archival recordings |
      | **Licensed music** | Rights-managed content |

      Music segments may also include:
      - Genre classification (classical, jazz, folk)
      - Mood/tempo analysis
      - Fingerprinting for identification

      **SOUND EVENT DETECTION**:

      Non-speech, non-music audio events:

      | Event Type | Heritage Context |
      |------------|------------------|
      | APPLAUSE | Lecture recordings, openings |
      | LAUGHTER | Tour guides, educational content |
      | CROWD_NOISE | Event documentation |
      | DOOR/FOOTSTEPS | Ambient archive recordings |
      | NATURE_SOUNDS | Outdoor heritage site recordings |
      | MACHINERY | Industrial heritage, conservation |

      **LANGUAGE DETECTION**:

      Multilingual heritage content requires language identification:

      ```yaml
      has_or_had_speech_segment:
        - start: 0.0
          end: 120.0
          language: nl
          speaker_id: speaker_001
        - start: 120.0
          end: 240.0
          language: en
          speaker_id: speaker_001  # Same speaker, switched language
      ```

      **AUDIO QUALITY ANALYSIS**:

      Audio quality metrics for preservation and accessibility:

      | Metric | Description | Threshold |
      |--------|-------------|-----------|
      | SNR | Signal-to-noise ratio | > 20 dB good |
      | Clipping | Peak distortion | None ideal |
      | Noise floor | Background noise level | < -50 dB good |
      | Frequency response | Bandwidth | Full-range ideal |

      **HERITAGE INSTITUTION USE CASES**:

      | Content Type | Audio Analysis Need |
      |--------------|---------------------|
      | Oral histories | Diarization, transcription alignment |
      | Curator interviews | Speaker identification, language |
      | Virtual tours | Background music, voiceover detection |
      | Lecture recordings | Audience reactions, Q&A segments |
      | Conservation videos | Narration vs demonstration audio |
      | Archival footage | Speech recovery, noise reduction |

      **RELATIONSHIP TO VideoTranscript**:

      VideoAudioAnnotation is complementary to VideoTranscript:
      - **VideoTranscript**: The text content of speech (WHAT was said)
      - **VideoAudioAnnotation**: Audio structure (WHO spoke, music, sounds)

      Together they provide complete audio understanding:
      ```
      VideoAudioAnnotation: Speaker 1 spoke 0:00-0:15
      VideoTranscript: "Welcome to the Rijksmuseum..." (0:00-0:15)
      → Combined: Curator said "Welcome to the Rijksmuseum..."
      ```
    exact_mappings:
      - hc:VideoAudioAnnotation
    close_mappings:
      - ma:AudioTrack
      - crm:E13_Attribute_Assignment
    related_mappings:
      # NOTE(review): was 'wikidata:Q11028'/'wikidata:Q638', but only the
      # 'wd' Wikidata prefix is declared above — normalized to 'wd:'.
      - wd:Q11028
      - wd:Q638
    slots:
      - audio_event_segments
      # NOTE(review): was 'audio_quality_score'; the import and the
      # slot_usage entry below both use 'has_audio_quality_score'.
      - has_audio_quality_score
      - diarization_enabled
      - has_or_had_diarization_segment
      - has_clipping
      - languages_detected
      - music_confidence
      - music_detected
      - music_genres_detected
      - has_or_had_music_segment
      - noise_floor_db
      - primary_audio_event_type
      - has_or_had_silence_segment
      - silence_total_seconds
      - snr_db
      - has_or_had_sound_event_type
      - sound_events_detected
      - speaker_count
      - speaker_label
      - specificity_annotation
      - speech_detected
      - speech_language
      - speech_language_confidence
      - has_or_had_speech_segment
      - template_specificity
    slot_usage:
      # NOTE(review): key was 'has_audio_event_segment', which is not in the
      # class slots list; normalized to 'audio_event_segments' per the import.
      audio_event_segments:
        slot_uri: oa:hasBody
        description: |
          Time-coded segments with detected audio events.
          Web Annotation: hasBody links annotation to content.

          Each segment contains:
          - Start/end time boundaries
          - Event type (SPEECH, MUSIC, SILENCE, etc.)
          - Confidence score
          - Additional metadata (speaker ID, language, etc.)

          Segments may overlap (e.g., speech over background music).
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, segment_text: ''Speech detected - Speaker 1''}]'
            description: Speech detection segment
      primary_audio_event_type:
        slot_uri: dcterms:type
        description: |
          The primary type of audio analysis performed.
          Dublin Core: type for categorization.

          **Types**:
          - SPEECH: Speech detection and diarization
          - MUSIC: Music detection and classification
          - SOUND_EVENTS: Environmental sound detection
          - MIXED: Multiple analysis types combined
        range: AudioEventTypeEnum
        required: true
        examples:
          - value: SPEECH
            description: Primary focus on speech analysis
      speech_detected:
        slot_uri: hc:speechDetected
        description: |
          Whether speech was detected in the video audio.
          High-level flag for presence of speech content.

          - true: At least one speech segment detected
          - false: No speech detected (music-only, silent, etc.)
        range: boolean
        required: false
        examples:
          - value: true
            description: Speech is present in video
      has_or_had_speech_segment:
        slot_uri: hc:speechSegments
        description: |
          Detailed speech segments with speaker and language info.
          Each segment represents continuous speech from one speaker.

          Used for:
          - Transcript alignment
          - Speaker navigation
          - Language segmentation
        range: SpeechSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: ''spk_001'', language: ''nl''}]'
            description: Dutch speech from speaker 1
      speech_language:
        slot_uri: dcterms:language
        description: |
          Primary language of speech content (ISO 639-1 code).
          Dublin Core: language for primary language.

          For multilingual content, this is the predominant language.
          See `languages_detected` for all languages.
        range: string
        required: false
        examples:
          - value: nl
            description: Dutch is primary language
          - value: en
            description: English is primary language
      speech_language_confidence:
        slot_uri: hc:languageConfidence
        description: |
          Confidence score for language detection (0.0-1.0).

          Higher confidence when:
          - Longer speech segments
          - Clear audio quality
          - Distinct language features

          Lower confidence when:
          - Short utterances
          - Background noise
          - Code-switching
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.95
            description: High confidence language detection
      languages_detected:
        slot_uri: hc:languagesDetected
        description: |
          All languages detected in speech (ISO 639-1 codes).

          Heritage content often includes multiple languages:
          - Exhibition videos with translations
          - Interviews with multilingual speakers
          - Historical content with period languages

          Ordered by speaking time (most spoken first).
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[nl, en, de]'
            description: Dutch, English, and German detected
      diarization_enabled:
        slot_uri: hc:diarizationEnabled
        description: |
          Whether speaker diarization was performed.
          Diarization = identifying distinct speakers and their segments.

          - true: Speaker IDs assigned to speech segments
          - false: Speech detected but speakers not distinguished
        range: boolean
        required: false
        examples:
          - value: true
            description: Diarization was performed
      has_or_had_diarization_segment:
        slot_uri: hc:diarizationSegments
        description: |
          Detailed diarization results with speaker assignments.

          Each segment identifies:
          - Time boundaries
          - Speaker ID (anonymous: "spk_001", "spk_002")
          - Optional speaker name (if identified)
          - Confidence score

          Enables "who spoke when" analysis.
        range: DiarizationSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: ''spk_001'', speaker_label: ''Curator''}]'
            description: Curator speaking for first 15 seconds
      speaker_count:
        slot_uri: hc:speakerCount
        description: |
          Number of distinct speakers detected.

          Useful for:
          - Interview classification (1 = monologue, 2+ = dialog)
          - Content type inference
          - Accessibility planning
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 3
            description: Three distinct speakers detected
      speaker_label:
        slot_uri: hc:speakerLabels
        description: |
          Labels or names assigned to detected speakers.

          May be:
          - Anonymous: ["Speaker 1", "Speaker 2"]
          - Identified: ["Dr. Taco Dibbits", "Interviewer"]
          - Role-based: ["Curator", "Artist", "Host"]

          Ordered by speaking time (most speaking first).
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[Curator, Artist, Museum Director]'
            description: Three identified speakers
      music_detected:
        slot_uri: hc:musicDetected
        description: |
          Whether music was detected in the audio.

          - true: Musical content detected (any amount)
          - false: No music detected (speech-only, silence)
        range: boolean
        required: false
        examples:
          - value: true
            description: Music present in video
      has_or_had_music_segment:
        slot_uri: hc:musicSegments
        description: |
          Time segments containing music.

          Each segment includes:
          - Time boundaries
          - Music type (background, featured)
          - Genre classification (if detected)
          - Confidence score
        range: MusicSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 30.0, music_type: ''BACKGROUND'', genre: ''classical''}]'
            description: Classical background music
      music_genres_detected:
        slot_uri: hc:musicGenresDetected
        description: |
          Music genres detected in audio.

          **Common Heritage Genres**:
          - classical: Art music, orchestral
          - baroque: Period-specific classical
          - jazz: Jazz performances
          - folk: Traditional/folk music
          - ambient: Background/atmospheric
          - electronic: Modern electronic music
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[classical, baroque]'
            description: Classical and baroque music detected
      music_confidence:
        slot_uri: hc:musicConfidence
        description: |
          Overall confidence of music detection (0.0-1.0).
          Average confidence across all music segments.
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.88
            description: High confidence music detection
      sound_events_detected:
        slot_uri: hc:soundEventsDetected
        description: |
          Whether non-speech, non-music sound events were detected.
          Sound events include applause, laughter, environmental sounds, etc.
        range: boolean
        required: false
        examples:
          - value: true
            description: Sound events detected
      has_or_had_sound_event_type:
        slot_uri: hc:soundEventTypes
        description: |
          Types of sound events detected.

          **Heritage-Relevant Events**:
          - APPLAUSE: Lecture endings, openings
          - LAUGHTER: Tour guide humor
          - CROWD_NOISE: Event atmosphere
          - FOOTSTEPS: Gallery ambiance
          - NATURE_SOUNDS: Outdoor heritage sites
          - BELLS: Church/temple recordings
        range: SoundEventTypeEnum
        multivalued: true
        required: false
        examples:
          - value: '[APPLAUSE, CROWD_NOISE]'
            description: Applause and crowd sounds detected
      has_or_had_silence_segment:
        slot_uri: hc:silenceSegments
        description: |
          Time segments containing silence or very low audio.

          Silence detection useful for:
          - Finding pauses between segments
          - Quality assessment (unexpected silence)
          - Identifying chapter/scene boundaries

          Threshold typically: audio below -40 dB for > 2 seconds.
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 45.0, end_seconds: 48.0}]'
            description: 3-second silence
      silence_total_seconds:
        slot_uri: hc:silenceTotalSeconds
        description: |
          Total duration of silence in the video (seconds).

          High silence percentage may indicate:
          - Extended pauses
          - Silent segments (B-roll without audio)
          - Audio issues
        range: float
        required: false
        minimum_value: 0.0
        examples:
          - value: 15.5
            description: 15.5 seconds of total silence
      noise_floor_db:
        slot_uri: hc:noiseFloorDb
        description: |
          Background noise floor level in decibels.

          **Quality Guidelines**:
          - < -60 dB: Excellent (studio quality)
          - -60 to -40 dB: Good (professional recording)
          - -40 to -30 dB: Acceptable (field recording)
          - > -30 dB: Poor (noisy environment)
        range: float
        required: false
        examples:
          - value: -45.0
            description: Good quality, moderate noise floor
      has_audio_quality_score:
        slot_uri: hc:audioQualityScore
        description: |
          Overall audio quality score (0.0-1.0).

          Composite score based on:
          - Signal-to-noise ratio
          - Clipping presence
          - Frequency response
          - Clarity of speech

          **Interpretation**:
          - > 0.8: High quality, suitable for all uses
          - 0.6-0.8: Good quality, minor issues
          - 0.4-0.6: Acceptable, some degradation
          - < 0.4: Poor quality, may need enhancement
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.85
            description: High audio quality
      snr_db:
        slot_uri: hc:snrDb
        description: |
          Signal-to-noise ratio in decibels.

          Higher is better:
          - > 30 dB: Excellent
          - 20-30 dB: Good
          - 10-20 dB: Acceptable
          - < 10 dB: Poor (speech intelligibility affected)
        range: float
        required: false
        examples:
          - value: 25.0
            description: Good signal-to-noise ratio
      has_clipping:
        slot_uri: hc:hasClipping
        description: |
          Whether audio clipping (peak distortion) was detected.

          Clipping occurs when audio exceeds maximum level:
          - true: Clipping detected (distortion present)
          - false: No clipping (clean audio)

          Clipping is permanent quality loss.
        range: boolean
        required: false
        examples:
          - value: false
            description: No clipping detected
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    comments:
      - Audio event detection for video content
      - Supports speech, music, silence, and sound event detection
      - Speaker diarization for interview navigation
      - Language detection for multilingual heritage content
      - Audio quality metrics for preservation assessment
    see_also:
      - https://www.w3.org/TR/annotation-model/
      - https://arxiv.org/abs/2111.08085
  SpeechSegment:
    class_uri: hc:SpeechSegment
    description: |
      A speech segment with speaker and language information.
      Extends VideoTimeSegment with speech-specific metadata.
    slots:
      - segment_confidence
      - segment_end_seconds
      - segment_language
      - segment_start_seconds
      - speaker_id
      - speaker_label
      - specificity_annotation
      - speech_text
      - template_specificity
    slot_usage:
      segment_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0
      segment_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0
      speaker_id:
        slot_uri: hc:speakerId
        description: |
          Unique identifier for the speaker.

          Format: "spk_001", "spk_002", etc. (anonymous)
          Or: "taco_dibbits" (identified)
        range: string
        required: false
      speaker_label:
        slot_uri: schema:name
        description: Human-readable speaker name or role
        range: string
        required: false
      segment_language:
        slot_uri: dcterms:language
        description: Language of speech in this segment (ISO 639-1)
        range: string
        required: false
      segment_confidence:
        slot_uri: hc:confidence
        description: Confidence score for this segment (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      speech_text:
        slot_uri: hc:speechText
        description: |
          Transcript text for this segment (if available).
          Links to VideoTranscript for full transcript.
        range: string
        required: false
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
  DiarizationSegment:
    class_uri: hc:DiarizationSegment
    description: |
      A diarization segment identifying speaker and time boundaries.
      Focused on "who spoke when" rather than transcript content.
    slots:
      - diarization_confidence
      - diarization_end_seconds
      - diarization_speaker_id
      - diarization_speaker_label
      - diarization_start_seconds
      - is_overlapping
      - specificity_annotation
      - template_specificity
    slot_usage:
      diarization_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0
      diarization_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0
      diarization_speaker_id:
        slot_uri: hc:speakerId
        description: Anonymous speaker identifier (spk_001, spk_002, etc.)
        range: string
        required: true
      diarization_speaker_label:
        slot_uri: schema:name
        description: Optional identified name or role
        range: string
        required: false
      diarization_confidence:
        slot_uri: hc:confidence
        description: Diarization confidence (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      is_overlapping:
        slot_uri: hc:isOverlapping
        description: |
          Whether this segment overlaps with another speaker.
          Overlapping speech occurs when multiple people speak simultaneously.
        range: boolean
        required: false
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
  MusicSegment:
    class_uri: hc:MusicSegment
    description: |
      A segment of detected music with classification.
    slots:
      - is_background
      - music_end_seconds
      - music_genre
      - music_segment_confidence
      - music_start_seconds
      - music_type
      - specificity_annotation
      - template_specificity
    slot_usage:
      music_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0
      music_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0
      music_type:
        slot_uri: dcterms:type
        description: Type of music (BACKGROUND, FEATURED, ARCHIVAL)
        range: MusicTypeEnum
        required: false
      music_genre:
        slot_uri: hc:genre
        description: Detected music genre
        range: string
        required: false
      music_segment_confidence:
        slot_uri: hc:confidence
        description: Music detection confidence (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      is_background:
        slot_uri: hc:isBackground
        description: |
          Whether music is background (under speech) vs featured.

          - true: Music is background/ambient
          - false: Music is primary audio
        range: boolean
        required: false
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
enums:
  AudioEventTypeEnum:
    description: |
      Types of audio events detected in video.
    permissible_values:
      SPEECH:
        description: Speech/voice detection and analysis
      MUSIC:
        description: Music detection and classification
      SILENCE:
        description: Silence or very low audio
      SOUND_EVENT:
        description: Non-speech, non-music sound events
      NOISE:
        description: Noise detection (for quality assessment)
      MIXED:
        description: Multiple audio event types analyzed
  SoundEventTypeEnum:
    description: |
      Types of non-speech, non-music sound events.
    permissible_values:
      APPLAUSE:
        description: Clapping, applause
      LAUGHTER:
        description: Laughter from audience or speakers
      CROWD_NOISE:
        description: General crowd/audience noise
      FOOTSTEPS:
        description: Walking, footsteps
      DOOR:
        description: Door opening/closing sounds
      NATURE_SOUNDS:
        description: Birds, wind, water, etc.
      TRAFFIC:
        description: Vehicles, urban sounds
      BELLS:
        description: Church bells, temple bells, etc.
      MACHINERY:
        description: Industrial, mechanical sounds
      COUGHING:
        description: Coughing, clearing throat
      PAPER:
        description: Paper rustling
      TYPING:
        description: Keyboard typing
      PHONE:
        description: Phone ringing or notification
      MUSIC_INSTRUMENT:
        description: Individual instrument sounds
      OTHER:
        description: Other sound event type
  MusicTypeEnum:
    description: |
      Types of music presence in audio.
    permissible_values:
      BACKGROUND:
        description: Background/ambient music under other content
      FEATURED:
        description: Primary audio is music (performance, recording)
      ARCHIVAL:
        description: Historical/archival music recording
      INTRO_OUTRO:
        description: Opening or closing music/jingle
      TRANSITION:
        description: Music used for scene transitions
      DIEGETIC:
        description: Music from within the scene (radio, live performance)
      NON_DIEGETIC:
        description: Music added in post-production
slots:
  # NOTE(review): key was 'has_audio_event_segment'; normalized to
  # 'audio_event_segments' to match the class slots list and the import.
  audio_event_segments:
    description: Time-coded segments with detected audio events
    range: VideoTimeSegment
    multivalued: true
  primary_audio_event_type:
    description: Primary type of audio analysis performed
    range: AudioEventTypeEnum
  speech_detected:
    description: Whether speech was detected
    range: boolean
  has_or_had_speech_segment:
    description: Detailed speech segments with speaker info
    range: SpeechSegment
    multivalued: true
  speech_language:
    description: Primary language of speech (ISO 639-1)
    range: string
  speech_language_confidence:
    description: Confidence of language detection
    range: float
  languages_detected:
    description: All languages detected in speech
    range: string
    multivalued: true
  diarization_enabled:
    description: Whether speaker diarization was performed
    range: boolean
  has_or_had_diarization_segment:
    description: Detailed diarization results
    range: DiarizationSegment
    multivalued: true
  speaker_count:
    description: Number of distinct speakers detected
    range: integer
  # NOTE(review): 'speaker_label' was defined twice in this mapping (a YAML
  # duplicate-key defect: once multivalued "Labels or names for detected
  # speakers", once singular "Speaker name or role"). Kept a single base
  # definition; per-class slot_usage above specializes it where needed.
  speaker_label:
    description: Labels or names for detected speakers
    range: string
    multivalued: true
  music_detected:
    description: Whether music was detected
    range: boolean
  has_or_had_music_segment:
    description: Time segments containing music
    range: MusicSegment
    multivalued: true
  music_genres_detected:
    description: Music genres detected
    range: string
    multivalued: true
  music_confidence:
    description: Overall music detection confidence
    range: float
  sound_events_detected:
    description: Whether sound events were detected
    range: boolean
  has_or_had_sound_event_type:
    description: Types of sound events detected
    range: SoundEventTypeEnum
    multivalued: true
  has_or_had_silence_segment:
    description: Time segments with silence
    range: VideoTimeSegment
    multivalued: true
  silence_total_seconds:
    description: Total silence duration
    range: float
  noise_floor_db:
    description: Background noise floor in dB
    range: float
  has_audio_quality_score:
    description: Overall audio quality (0.0-1.0)
    range: float
  snr_db:
    description: Signal-to-noise ratio in dB
    range: float
  has_clipping:
    description: Whether audio clipping was detected
    range: boolean
  segment_start_seconds:
    description: Segment start time
    range: float
  segment_end_seconds:
    description: Segment end time
    range: float
  speaker_id:
    description: Speaker identifier
    range: string
  segment_language:
    description: Language of segment
    range: string
  segment_confidence:
    description: Segment confidence score
    range: float
  speech_text:
    description: Transcript text for segment
    range: string
  diarization_start_seconds:
    description: Diarization segment start
    range: float
  diarization_end_seconds:
    description: Diarization segment end
    range: float
  diarization_speaker_id:
    description: Speaker ID in diarization
    range: string
  diarization_speaker_label:
    description: Speaker label in diarization
    range: string
  diarization_confidence:
    description: Diarization confidence
    range: float
  is_overlapping:
    description: Whether segment has overlapping speech
    range: boolean
  music_start_seconds:
    description: Music segment start
    range: float
  music_end_seconds:
    description: Music segment end
    range: float
  music_type:
    description: Type of music presence
    range: MusicTypeEnum
  music_genre:
    description: Detected music genre
    range: string
  music_segment_confidence:
    description: Music segment confidence
    range: float
  is_background:
    description: Whether music is background
    range: boolean