---
# LinkML schema module: VideoAudioAnnotation class and supporting segment classes.
id: https://nde.nl/ontology/hc/class/VideoAudioAnnotation
name: video_audio_annotation_class
title: Video Audio Annotation Class

imports:
  - linkml:types
  - ./VideoAnnotation
  - ./VideoTimeSegment
  - ../slots/audio_event_segments
  - ../slots/has_audio_quality_score
  - ../slots/diarization_confidence
  - ../slots/diarization_enabled
  - ../slots/diarization_end_seconds
  - ../slots/has_or_had_diarization_segment
  - ../slots/diarization_speaker_id
  - ../slots/diarization_speaker_label
  - ../slots/diarization_start_seconds
  - ../slots/has_clipping
  - ../slots/is_background
  - ../slots/is_overlapping
  - ../slots/languages_detected
  - ../slots/music_confidence
  - ../slots/music_detected
  - ../slots/music_end_seconds
  - ../slots/music_genre
  - ../slots/music_genres_detected
  - ../slots/music_segment_confidence
  - ../slots/has_or_had_music_segment
  - ../slots/music_start_seconds
  - ../slots/music_type
  - ../slots/noise_floor_db
  - ../slots/primary_audio_event_type
  - ../slots/segment_confidence
  - ../slots/segment_end_seconds
  - ../slots/segment_language
  - ../slots/segment_start_seconds
  - ../slots/has_or_had_silence_segment
  - ../slots/silence_total_seconds
  - ../slots/snr_db
  - ../slots/has_or_had_sound_event_type
  - ../slots/sound_events_detected
  - ../slots/speaker_count
  - ../slots/speaker_id
  # was listed twice in the original file; duplicate entry removed
  - ../slots/speaker_label
  - ../slots/specificity_annotation
  - ../slots/speech_detected
  - ../slots/speech_language
  - ../slots/speech_language_confidence
  - ../slots/has_or_had_speech_segment
  - ../slots/speech_text
  - ../slots/template_specificity
  - ./DiarizationSegment
  - ./MusicSegment
  - ./SpecificityAnnotation
  - ./SpeechSegment
  - ./TemplateSpecificityScores
  - ../slots/has_audio_event_segment

prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  ma: http://www.w3.org/ns/ma-ont#
  wd: http://www.wikidata.org/entity/
default_prefix: hc

classes:
  VideoAudioAnnotation:
    is_a: VideoAnnotation
    class_uri: hc:VideoAudioAnnotation
    abstract: false
    description: |-
      Annotation for audio events detected in video content.

      **DEFINITION**:

      VideoAudioAnnotation captures structured information derived from audio
      analysis of video content. This includes speech, music, silence, and
      various sound events.

      **AUDIO ANALYSIS TYPES**:

      | Type | Description | Use Case |
      |------|-------------|----------|
      | **Speech Detection** | Identify spoken segments | Transcript alignment |
      | **Speaker Diarization** | Who spoke when | Interview navigation |
      | **Music Detection** | Identify musical segments | Content classification |
      | **Sound Events** | Applause, laughter, etc. | Audience engagement |
      | **Silence Detection** | Find quiet segments | Quality assessment |
      | **Language Detection** | Identify spoken languages | Multilingual content |

      **SPEAKER DIARIZATION**:

      Diarization answers "who spoke when":

      ```
      0:00-0:15 Speaker 1 (Curator)
      0:15-0:45 Speaker 2 (Artist)
      0:45-1:00 Speaker 1 (Curator)
      1:00-1:30 Speaker 3 (Museum Director)
      ```

      Heritage applications:
      - Navigate to specific speakers in interviews
      - Count speaking time per person
      - Identify unnamed speakers for annotation
      - Build speaker databases for recognition

      **MUSIC DETECTION**:

      Music detection classifies audio segments as containing music:

      | Category | Examples |
      |----------|----------|
      | **Background music** | Documentary soundtracks |
      | **Featured music** | Concert recordings, performances |
      | **Historical music** | Archival recordings |
      | **Licensed music** | Rights-managed content |

      Music segments may also include:
      - Genre classification (classical, jazz, folk)
      - Mood/tempo analysis
      - Fingerprinting for identification

      **SOUND EVENT DETECTION**:

      Non-speech, non-music audio events:

      | Event Type | Heritage Context |
      |------------|------------------|
      | APPLAUSE | Lecture recordings, openings |
      | LAUGHTER | Tour guides, educational content |
      | CROWD_NOISE | Event documentation |
      | DOOR/FOOTSTEPS | Ambient archive recordings |
      | NATURE_SOUNDS | Outdoor heritage site recordings |
      | MACHINERY | Industrial heritage, conservation |

      **LANGUAGE DETECTION**:

      Multilingual heritage content requires language identification:

      ```yaml
      has_or_had_speech_segment:
        - start: 0.0
          end: 120.0
          language: nl
          speaker_id: speaker_001
        - start: 120.0
          end: 240.0
          language: en
          speaker_id: speaker_001  # Same speaker, switched language
      ```

      **AUDIO QUALITY ANALYSIS**:

      Audio quality metrics for preservation and accessibility:

      | Metric | Description | Threshold |
      |--------|-------------|-----------|
      | SNR | Signal-to-noise ratio | > 20 dB good |
      | Clipping | Peak distortion | None ideal |
      | Noise floor | Background noise level | < -50 dB good |
      | Frequency response | Bandwidth | Full-range ideal |

      **HERITAGE INSTITUTION USE CASES**:

      | Content Type | Audio Analysis Need |
      |--------------|---------------------|
      | Oral histories | Diarization, transcription alignment |
      | Curator interviews | Speaker identification, language |
      | Virtual tours | Background music, voiceover detection |
      | Lecture recordings | Audience reactions, Q&A segments |
      | Conservation videos | Narration vs demonstration audio |
      | Archival footage | Speech recovery, noise reduction |

      **RELATIONSHIP TO VideoTranscript**:

      VideoAudioAnnotation is complementary to VideoTranscript:

      - **VideoTranscript**: The text content of speech (WHAT was said)
      - **VideoAudioAnnotation**: Audio structure (WHO spoke, music, sounds)

      Together they provide complete audio understanding:

      ```
      VideoAudioAnnotation: Speaker 1 spoke 0:00-0:15
      VideoTranscript: "Welcome to the Rijksmuseum..." (0:00-0:15)
      → Combined: Curator said "Welcome to the Rijksmuseum..."
      ```
    exact_mappings:
      - hc:VideoAudioAnnotation
    close_mappings:
      - ma:AudioTrack
      - crm:E13_Attribute_Assignment
    related_mappings:
      # fixed: original used undeclared prefix "wikidata:"; the declared
      # prefix for http://www.wikidata.org/entity/ is "wd:"
      - wd:Q11028
      - wd:Q638
    slots:
      - audio_event_segments
      # NOTE(review): added — slot_usage below customizes this imported slot;
      # confirm whether it should replace audio_event_segments entirely
      - has_audio_event_segment
      # fixed: original said "audio_quality_score", but the imported slot and
      # the slot_usage entry below are "has_audio_quality_score"
      - has_audio_quality_score
      - diarization_enabled
      - has_or_had_diarization_segment
      - has_clipping
      - languages_detected
      - music_confidence
      - music_detected
      - music_genres_detected
      - has_or_had_music_segment
      - noise_floor_db
      - primary_audio_event_type
      - has_or_had_silence_segment
      - silence_total_seconds
      - snr_db
      - has_or_had_sound_event_type
      - sound_events_detected
      - speaker_count
      - speaker_label
      - specificity_annotation
      - speech_detected
      - speech_language
      - speech_language_confidence
      - has_or_had_speech_segment
      - template_specificity
    slot_usage:
      has_audio_event_segment:
        slot_uri: oa:hasBody
        description: |-
          Time-coded segments with detected audio events.

          Web Annotation: hasBody links annotation to content.

          Each segment contains:
          - Start/end time boundaries
          - Event type (SPEECH, MUSIC, SILENCE, etc.)
          - Confidence score
          - Additional metadata (speaker ID, language, etc.)

          Segments may overlap (e.g., speech over background music).
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, segment_text: ''Speech detected - Speaker 1''}]'
            description: Speech detection segment
      primary_audio_event_type:
        slot_uri: dcterms:type
        description: |-
          The primary type of audio analysis performed.

          Dublin Core: type for categorization.

          **Types**:
          - SPEECH: Speech detection and diarization
          - MUSIC: Music detection and classification
          - SOUND_EVENTS: Environmental sound detection
          - MIXED: Multiple analysis types combined
        range: AudioEventTypeEnum
        required: true
        examples:
          - value: SPEECH
            description: Primary focus on speech analysis
      speech_detected:
        slot_uri: hc:speechDetected
        description: |-
          Whether speech was detected in the video audio.

          High-level flag for presence of speech content.
          - true: At least one speech segment detected
          - false: No speech detected (music-only, silent, etc.)
        range: boolean
        required: false
        examples:
          - value: true
            description: Speech is present in video
      has_or_had_speech_segment:
        slot_uri: hc:speechSegments
        description: |-
          Detailed speech segments with speaker and language info.

          Each segment represents continuous speech from one speaker.

          Used for:
          - Transcript alignment
          - Speaker navigation
          - Language segmentation
        range: SpeechSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: ''spk_001'', language: ''nl''}]'
            description: Dutch speech from speaker 1
      speech_language:
        slot_uri: dcterms:language
        description: |-
          Primary language of speech content (ISO 639-1 code).

          Dublin Core: language for primary language.

          For multilingual content, this is the predominant language.
          See `languages_detected` for all languages.
        range: string
        required: false
        examples:
          - value: nl
            description: Dutch is primary language
          - value: en
            description: English is primary language
      speech_language_confidence:
        slot_uri: hc:languageConfidence
        description: |-
          Confidence score for language detection (0.0-1.0).

          Higher confidence when:
          - Longer speech segments
          - Clear audio quality
          - Distinct language features

          Lower confidence when:
          - Short utterances
          - Background noise
          - Code-switching
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.95
            description: High confidence language detection
      languages_detected:
        slot_uri: hc:languagesDetected
        description: |-
          All languages detected in speech (ISO 639-1 codes).

          Heritage content often includes multiple languages:
          - Exhibition videos with translations
          - Interviews with multilingual speakers
          - Historical content with period languages

          Ordered by speaking time (most spoken first).
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[nl, en, de]'
            description: Dutch, English, and German detected
      diarization_enabled:
        slot_uri: hc:diarizationEnabled
        description: |-
          Whether speaker diarization was performed.

          Diarization = identifying distinct speakers and their segments.
          - true: Speaker IDs assigned to speech segments
          - false: Speech detected but speakers not distinguished
        range: boolean
        required: false
        examples:
          - value: true
            description: Diarization was performed
      has_or_had_diarization_segment:
        slot_uri: hc:diarizationSegments
        description: |-
          Detailed diarization results with speaker assignments.

          Each segment identifies:
          - Time boundaries
          - Speaker ID (anonymous: "spk_001", "spk_002")
          - Optional speaker name (if identified)
          - Confidence score

          Enables "who spoke when" analysis.
        range: DiarizationSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: ''spk_001'', speaker_label: ''Curator''}]'
            description: Curator speaking for first 15 seconds
      speaker_count:
        slot_uri: hc:speakerCount
        description: |-
          Number of distinct speakers detected.

          Useful for:
          - Interview classification (1 = monologue, 2+ = dialog)
          - Content type inference
          - Accessibility planning
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 3
            description: Three distinct speakers detected
      speaker_label:
        slot_uri: hc:speakerLabels
        description: |-
          Labels or names assigned to detected speakers.

          May be:
          - Anonymous: ["Speaker 1", "Speaker 2"]
          - Identified: ["Dr. Taco Dibbits", "Interviewer"]
          - Role-based: ["Curator", "Artist", "Host"]

          Ordered by speaking time (most speaking first).
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[Curator, Artist, Museum Director]'
            description: Three identified speakers
      music_detected:
        slot_uri: hc:musicDetected
        description: |-
          Whether music was detected in the audio.

          - true: Musical content detected (any amount)
          - false: No music detected (speech-only, silence)
        range: boolean
        required: false
        examples:
          - value: true
            description: Music present in video
      has_or_had_music_segment:
        slot_uri: hc:musicSegments
        description: |-
          Time segments containing music.

          Each segment includes:
          - Time boundaries
          - Music type (background, featured)
          - Genre classification (if detected)
          - Confidence score
        range: MusicSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 30.0, music_type: ''BACKGROUND'', genre: ''classical''}]'
            description: Classical background music
      music_genres_detected:
        slot_uri: hc:musicGenresDetected
        description: |-
          Music genres detected in audio.

          **Common Heritage Genres**:
          - classical: Art music, orchestral
          - baroque: Period-specific classical
          - jazz: Jazz performances
          - folk: Traditional/folk music
          - ambient: Background/atmospheric
          - electronic: Modern electronic music
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[classical, baroque]'
            description: Classical and baroque music detected
      music_confidence:
        slot_uri: hc:musicConfidence
        description: |-
          Overall confidence of music detection (0.0-1.0).

          Average confidence across all music segments.
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.88
            description: High confidence music detection
      sound_events_detected:
        slot_uri: hc:soundEventsDetected
        description: |-
          Whether non-speech, non-music sound events were detected.

          Sound events include applause, laughter, environmental sounds, etc.
        range: boolean
        required: false
        examples:
          - value: true
            description: Sound events detected
      has_or_had_sound_event_type:
        slot_uri: hc:soundEventTypes
        description: |-
          Types of sound events detected.

          **Heritage-Relevant Events**:
          - APPLAUSE: Lecture endings, openings
          - LAUGHTER: Tour guide humor
          - CROWD_NOISE: Event atmosphere
          - FOOTSTEPS: Gallery ambiance
          - NATURE_SOUNDS: Outdoor heritage sites
          - BELLS: Church/temple recordings
        range: SoundEventTypeEnum
        multivalued: true
        required: false
        examples:
          - value: '[APPLAUSE, CROWD_NOISE]'
            description: Applause and crowd sounds detected
      has_or_had_silence_segment:
        slot_uri: hc:silenceSegments
        description: |-
          Time segments containing silence or very low audio.

          Silence detection useful for:
          - Finding pauses between segments
          - Quality assessment (unexpected silence)
          - Identifying chapter/scene boundaries

          Threshold typically: audio below -40 dB for > 2 seconds.
        range: VideoTimeSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 45.0, end_seconds: 48.0}]'
            description: 3-second silence
      silence_total_seconds:
        slot_uri: hc:silenceTotalSeconds
        description: |-
          Total duration of silence in the video (seconds).

          High silence percentage may indicate:
          - Extended pauses
          - Silent segments (B-roll without audio)
          - Audio issues
        range: float
        required: false
        minimum_value: 0.0
        examples:
          - value: 15.5
            description: 15.5 seconds of total silence
      noise_floor_db:
        slot_uri: hc:noiseFloorDb
        description: |-
          Background noise floor level in decibels.

          **Quality Guidelines**:
          - < -60 dB: Excellent (studio quality)
          - -60 to -40 dB: Good (professional recording)
          - -40 to -30 dB: Acceptable (field recording)
          - > -30 dB: Poor (noisy environment)
        range: float
        required: false
        examples:
          - value: -45.0
            description: Good quality, moderate noise floor
      has_audio_quality_score:
        slot_uri: hc:audioQualityScore
        description: |-
          Overall audio quality score (0.0-1.0).

          Composite score based on:
          - Signal-to-noise ratio
          - Clipping presence
          - Frequency response
          - Clarity of speech

          **Interpretation**:
          - > 0.8: High quality, suitable for all uses
          - 0.6-0.8: Good quality, minor issues
          - 0.4-0.6: Acceptable, some degradation
          - < 0.4: Poor quality, may need enhancement
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.85
            description: High audio quality
      snr_db:
        slot_uri: hc:snrDb
        description: |-
          Signal-to-noise ratio in decibels.

          Higher is better:
          - > 30 dB: Excellent
          - 20-30 dB: Good
          - 10-20 dB: Acceptable
          - < 10 dB: Poor (speech intelligibility affected)
        range: float
        required: false
        examples:
          - value: 25.0
            description: Good signal-to-noise ratio
      has_clipping:
        slot_uri: hc:hasClipping
        description: |-
          Whether audio clipping (peak distortion) was detected.

          Clipping occurs when audio exceeds maximum level:
          - true: Clipping detected (distortion present)
          - false: No clipping (clean audio)

          Clipping is permanent quality loss.
        range: boolean
        required: false
        examples:
          - value: false
            description: No clipping detected
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    comments:
      - Audio event detection for video content
      - Supports speech, music, silence, and sound event detection
      - Speaker diarization for interview navigation
      - Language detection for multilingual heritage content
      - Audio quality metrics for preservation assessment
    see_also:
      - https://www.w3.org/TR/annotation-model/
      - https://arxiv.org/abs/2111.08085

  SpeechSegment:
    class_uri: hc:SpeechSegment
    description: |-
      A speech segment with speaker and language information.

      Extends VideoTimeSegment with speech-specific metadata.
    slots:
      - segment_confidence
      - segment_end_seconds
      - segment_language
      - segment_start_seconds
      - speaker_id
      - speaker_label
      - specificity_annotation
      - speech_text
      - template_specificity
    slot_usage:
      segment_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0
      segment_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0
      speaker_id:
        slot_uri: hc:speakerId
        description: |-
          Unique identifier for the speaker.

          Format: "spk_001", "spk_002", etc. (anonymous)
          Or: "taco_dibbits" (identified)
        range: string
        required: false
      speaker_label:
        slot_uri: schema:name
        description: Human-readable speaker name or role
        range: string
        required: false
      segment_language:
        slot_uri: dcterms:language
        description: Language of speech in this segment (ISO 639-1)
        range: string
        required: false
      segment_confidence:
        slot_uri: hc:confidence
        description: Confidence score for this segment (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      speech_text:
        slot_uri: hc:speechText
        description: |-
          Transcript text for this segment (if available).

          Links to VideoTranscript for full transcript.
        range: string
        required: false
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true

  DiarizationSegment:
    class_uri: hc:DiarizationSegment
    description: |-
      A diarization segment identifying speaker and time boundaries.

      Focused on "who spoke when" rather than transcript content.
    slots:
      - diarization_confidence
      - diarization_end_seconds
      - diarization_speaker_id
      - diarization_speaker_label
      - diarization_start_seconds
      - is_overlapping
      - specificity_annotation
      - template_specificity
    slot_usage:
      diarization_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0
      diarization_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0
      diarization_speaker_id:
        slot_uri: hc:speakerId
        description: Anonymous speaker identifier (spk_001, spk_002, etc.)
        range: string
        required: true
      diarization_speaker_label:
        slot_uri: schema:name
        description: Optional identified name or role
        range: string
        required: false
      diarization_confidence:
        slot_uri: hc:confidence
        description: Diarization confidence (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      is_overlapping:
        slot_uri: hc:isOverlapping
        description: |-
          Whether this segment overlaps with another speaker.

          Overlapping speech occurs when multiple people speak simultaneously.
        range: boolean
        required: false
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true

  MusicSegment:
    class_uri: hc:MusicSegment
    description: |-
      A segment of detected music with classification.
    slots:
      - is_background
      - music_end_seconds
      - music_genre
      - music_segment_confidence
      - music_start_seconds
      - music_type
      - specificity_annotation
      - template_specificity
    slot_usage:
      music_start_seconds:
        slot_uri: ma:hasStartTime
        description: Start time in seconds
        range: float
        required: true
        minimum_value: 0.0
      music_end_seconds:
        slot_uri: ma:hasEndTime
        description: End time in seconds
        range: float
        required: true
        minimum_value: 0.0
      music_type:
        slot_uri: dcterms:type
        description: Type of music (BACKGROUND, FEATURED, ARCHIVAL)
        range: MusicTypeEnum
        required: false
      music_genre:
        slot_uri: hc:genre
        description: Detected music genre
        range: string
        required: false
      music_segment_confidence:
        slot_uri: hc:confidence
        description: Music detection confidence (0.0-1.0)
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      is_background:
        slot_uri: hc:isBackground
        description: |-
          Whether music is background (under speech) vs featured.

          - true: Music is background/ambient
          - false: Music is primary audio
        range: boolean
        required: false
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true

enums:
  AudioEventTypeEnum:
    description: |-
      Types of audio events detected in video.
    permissible_values:
      SPEECH:
        description: Speech/voice detection and analysis
      MUSIC:
        description: Music detection and classification
      SILENCE:
        description: Silence or very low audio
      SOUND_EVENT:
        description: Non-speech, non-music sound events
      NOISE:
        description: Noise detection (for quality assessment)
      MIXED:
        description: Multiple audio event types analyzed

  SoundEventTypeEnum:
    description: |-
      Types of non-speech, non-music sound events.
    permissible_values:
      APPLAUSE:
        description: Clapping, applause
      LAUGHTER:
        description: Laughter from audience or speakers
      CROWD_NOISE:
        description: General crowd/audience noise
      FOOTSTEPS:
        description: Walking, footsteps
      DOOR:
        description: Door opening/closing sounds
      NATURE_SOUNDS:
        description: Birds, wind, water, etc.
      TRAFFIC:
        description: Vehicles, urban sounds
      BELLS:
        description: Church bells, temple bells, etc.
      MACHINERY:
        description: Industrial, mechanical sounds
      COUGHING:
        description: Coughing, clearing throat
      PAPER:
        description: Paper rustling
      TYPING:
        description: Keyboard typing
      PHONE:
        description: Phone ringing or notification
      MUSIC_INSTRUMENT:
        description: Individual instrument sounds
      OTHER:
        description: Other sound event type

  MusicTypeEnum:
    description: |-
      Types of music presence in audio.
    permissible_values:
      BACKGROUND:
        description: Background/ambient music under other content
      FEATURED:
        description: Primary audio is music (performance, recording)
      ARCHIVAL:
        description: Historical/archival music recording
      INTRO_OUTRO:
        description: Opening or closing music/jingle
      TRANSITION:
        description: Music used for scene transitions
      DIEGETIC:
        description: Music from within the scene (radio, live performance)
      NON_DIEGETIC:
        description: Music added in post-production