# glam/schemas/20251121/linkml/modules/classes/VideoAudioAnnotation.yaml
#
# kempersc 4034c2a00a Refactor schema slots across multiple classes to improve consistency and clarity
# - Removed unused slots from TaxonomicAuthority, TechnicalFeature, TelevisionArchive, TentativeWorldHeritageSite, Threat, TimeSpan, Title, TradeRegister, TradeUnionArchive, TradeUnionArchiveRecordSetType, TransferEvent, UNESCODomain, UnitIdentifier, UniversityArchive, UnspecifiedType, UserCommunity, Venue, Vereinsarchiv, Verlagsarchiv, VerlagsarchivRecordSetType, Version, Verwaltungsarchiv, VideoAnnotationTypes, VideoAudioAnnotation, VideoFrame, VideoPost, VideoSubtitle, VideoTextContent, Warehouse, WebArchive, WebClaim, WebClaimsBlock, WebLink, WebPortal, WebPortalTypes, WomensArchives, WordCount, WorldHeritageSite, WritingSystem, and XPathScore.
# - Introduced new slot is_or_was_retrieved_at for tracking data retrieval timestamps.
# 2026-01-31 00:28:09 +01:00
#
# (366 lines, 14 KiB, YAML)
---
# LinkML schema module defining VideoAudioAnnotation and its segment classes.
id: https://nde.nl/ontology/hc/class/VideoAudioAnnotation
name: video_audio_annotation_class
title: Video Audio Annotation Class
imports:
  - linkml:types
  # Enumerations used as slot ranges in this module.
  - ../enums/AudioEventTypeEnum
  - ../enums/MusicTypeEnum
  - ../enums/SoundEventTypeEnum
  # Slot definitions referenced by the classes below (alphabetical).
  - ../slots/contains_or_contained
  - ../slots/end_of_the_end
  - ../slots/has_audio_quality_score
  - ../slots/has_or_had_identifier
  - ../slots/has_or_had_label
  - ../slots/has_or_had_provenance
  - ../slots/has_or_had_score
  - ../slots/has_or_had_segment
  - ../slots/has_or_had_type
  - ../slots/is_background
  - ../slots/is_or_was_diarized
  - ../slots/is_overlapping
  - ../slots/languages_detected
  - ../slots/music_confidence
  - ../slots/music_detected
  - ../slots/music_end_seconds
  - ../slots/music_genre
  - ../slots/music_genres_detected
  - ../slots/music_segment_confidence
  - ../slots/music_start_seconds
  - ../slots/music_type
  - ../slots/noise_floor_db
  - ../slots/segment_confidence
  - ../slots/segment_end_seconds
  - ../slots/segment_language
  - ../slots/segment_start_seconds
  - ../slots/silence_total_seconds
  - ../slots/snr_db
  - ../slots/sound_events_detected
  - ../slots/speaker_count
  - ../slots/speaker_id
  - ../slots/speaker_label
  - ../slots/specificity_annotation
  - ../slots/speech_detected
  - ../slots/speech_language
  - ../slots/speech_language_confidence
  - ../slots/speech_text
  - ../slots/start_of_the_start
  - ../slots/temporal_extent
  # NOTE(review): the slots `has_clipping` and `has_or_had_music_segment` are
  # used by VideoAudioAnnotation below but have no ../slots/ import here —
  # verify they are defined in an already-imported module, or add imports.
  # Class imports (alphabetical).
  - ./AudioEventSegment
  - ./ConfidenceScore
  - ./DiarizationStatus
  - ./Identifier
  - ./Label
  - ./Provenance
  - ./Speaker
  - ./SpecificityAnnotation
  - ./TemplateSpecificityScore
  - ./TemplateSpecificityType
  - ./TemplateSpecificityTypes
  - ./TimeSpan
  - ./Timestamp
  - ./VideoAnnotation
  - ./VideoTimeSegment
  # Fix: removed `./VideoAudioAnnotation` — this module importing itself
  # created an import cycle.
  # Fix: removed `./DiarizationSegment` and `./MusicSegment` — both classes
  # are defined in this module (below), so importing same-named modules risks
  # duplicate definitions. NOTE(review): verify no separate files with those
  # names are expected to exist.
# CURIE prefix declarations for mappings and class URIs in this module.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  ma: http://www.w3.org/ns/ma-ont#
  wd: http://www.wikidata.org/entity/
  # Fix: the related_mappings on VideoAudioAnnotation use the `wikidata:`
  # CURIE prefix (wikidata:Q11028, wikidata:Q638), which was previously
  # undeclared — only `wd:` was. Both now expand to the Wikidata entity
  # namespace; `wd:` is kept for backward compatibility.
  wikidata: http://www.wikidata.org/entity/
default_prefix: hc
classes:
  # Primary annotation class: structured results of audio analysis over video
  # content (speech, music, silence, sound events, quality metrics).
  VideoAudioAnnotation:
    is_a: VideoAnnotation
    class_uri: hc:VideoAudioAnnotation
    abstract: false
    description: "Annotation for audio events detected in video content.\n\n**DEFINITION**:\n\nVideoAudioAnnotation captures structured information derived from audio\nanalysis of video content. This includes speech, music, silence, and\nvarious sound events.\n\n**AUDIO ANALYSIS TYPES**:\n\n| Type | Description | Use Case |\n|------|-------------|----------|\n| **Speech Detection** | Identify spoken segments | Transcript alignment |\n| **Speaker Diarization** | Who spoke when | Interview navigation |\n| **Music Detection** | Identify musical segments | Content classification |\n| **Sound Events** | Applause, laughter, etc. | Audience engagement |\n| **Silence Detection** | Find quiet segments | Quality assessment |\n| **Language Detection** | Identify spoken languages | Multilingual content |\n\n**SPEAKER DIARIZATION**:\n\nDiarization answers \"who spoke when\":\n\n```\n0:00-0:15 Speaker 1 (Curator)\n0:15-0:45 Speaker 2 (Artist)\n0:45-1:00 Speaker 1 (Curator)\n1:00-1:30 Speaker 3 (Museum\
      \ Director)\n```\n\nHeritage applications:\n- Navigate to specific speakers in interviews\n- Count speaking time per person\n- Identify unnamed speakers for annotation\n- Build speaker databases for recognition\n\n**MUSIC DETECTION**:\n\nMusic detection classifies audio segments as containing music:\n\n| Category | Examples |\n|----------|----------|\n| **Background music** | Documentary soundtracks |\n| **Featured music** | Concert recordings, performances |\n| **Historical music** | Archival recordings |\n| **Licensed music** | Rights-managed content |\n\nMusic segments may also include:\n- Genre classification (classical, jazz, folk)\n- Mood/tempo analysis\n- Fingerprinting for identification\n\n**SOUND EVENT DETECTION**:\n\nNon-speech, non-music audio events:\n\n| Event Type | Heritage Context |\n|------------|------------------|\n| APPLAUSE | Lecture recordings, openings |\n| LAUGHTER | Tour guides, educational content |\n| CROWD_NOISE | Event documentation |\n| DOOR/FOOTSTEPS\
      \ | Ambient archive recordings |\n| NATURE_SOUNDS | Outdoor heritage site recordings |\n| MACHINERY | Industrial heritage, conservation |\n\n**LANGUAGE DETECTION**:\n\nMultilingual heritage content requires language identification:\n\n```yaml\ncontains_or_contained:\n - start: 0.0\n end: 120.0\n language: nl\n speaker_id: speaker_001\n - start: 120.0\n end: 240.0\n language: en\n speaker_id: speaker_001 # Same speaker, switched language\n```\n\n**AUDIO QUALITY ANALYSIS**:\n\nAudio quality metrics for preservation and accessibility:\n\n| Metric | Description | Threshold |\n|--------|-------------|-----------|\n| SNR | Signal-to-noise ratio | > 20 dB good |\n| Clipping | Peak distortion | None ideal |\n| Noise floor | Background noise level | < -50 dB good |\n| Frequency response | Bandwidth | Full-range ideal |\n\n**HERITAGE INSTITUTION USE CASES**:\n\n| Content Type | Audio Analysis Need |\n|--------------|---------------------|\n| Oral histories | Diarization,\
      \ transcription alignment |\n| Curator interviews | Speaker identification, language |\n| Virtual tours | Background music, voiceover detection |\n| Lecture recordings | Audience reactions, Q&A segments |\n| Conservation videos | Narration vs demonstration audio |\n| Archival footage | Speech recovery, noise reduction |\n\n**RELATIONSHIP TO VideoTranscript**:\n\nVideoAudioAnnotation is complementary to VideoTranscript:\n\n- **VideoTranscript**: The text content of speech (WHAT was said)\n- **VideoAudioAnnotation**: Audio structure (WHO spoke, music, sounds)\n\nTogether they provide complete audio understanding:\n\n```\nVideoAudioAnnotation: Speaker 1 spoke 0:00-0:15\nVideoTranscript: \"Welcome to the Rijksmuseum...\" (0:00-0:15)\n\u2192 Combined: Curator said \"Welcome to the Rijksmuseum...\"\n```\n"
    # NOTE(review): exact_mappings points at this class's own URI, which
    # duplicates class_uri above — confirm this self-mapping is intended.
    exact_mappings:
      - hc:VideoAudioAnnotation
    close_mappings:
      - ma:AudioTrack
      - crm:E13_Attribute_Assignment
    # NOTE(review): the `wikidata:` CURIE prefix is not declared in this
    # module's prefixes (only `wd:` is) — verify these CURIEs resolve.
    related_mappings:
      - wikidata:Q11028
      - wikidata:Q638
    # Slot list; `contains_or_contained` duplicates were commented out during
    # the migration (duplicate keys would be invalid) and are retained below
    # for traceability.
    slots:
      - has_or_had_segment
      # - contains_or_contained - DUPLICATE REMOVED
      - has_audio_quality_score
      - is_or_was_diarized
      # - contains_or_contained - DUPLICATE REMOVED
      - has_clipping
      - languages_detected
      - music_confidence
      - music_detected
      - music_genres_detected
      - has_or_had_music_segment
      - noise_floor_db
      - has_or_had_type
      # - contains_or_contained - DUPLICATE REMOVED
      - silence_total_seconds
      - snr_db
      # - contains_or_contained - DUPLICATE REMOVED
      - sound_events_detected
      - speaker_count
      - speaker_label
      - specificity_annotation
      - speech_detected
      - speech_language
      - speech_language_confidence
      - contains_or_contained
      - has_or_had_score
    slot_usage:
      # Typed audio event segments (speech/music/etc. with time bounds).
      has_or_had_segment:
        description: 'MIGRATED from audio_event_segments (Rule 53).
          Audio event segments detected in the video content.
          '
        range: AudioEventSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{has_or_had_type: SPEECH, start_seconds: 0.0, end_seconds: 15.0, segment_text: "Speech detected - Speaker 1", confidence: 0.95}]'
            description: Speech detection segment
          - value: '[{has_or_had_type: MUSIC, start_seconds: 30.0, end_seconds: 60.0, segment_text: "Background classical music", confidence: 0.88}]'
            description: Music detection segment
      # NOTE(review): range is `string` here, yet the example value is a
      # nested structure, and DiarizationSegment below gives this same slot
      # range `Speaker` — verify the intended range and example format
      # (LinkML example values are conventionally strings).
      contains_or_contained:
        range: string
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value:
              temporal_extent:
                begin_of_the_begin: 0.0
                end_of_the_end: 15.0
              contains_or_contained:
                has_or_had_identifier: spk_001
                has_or_had_label: Curator
            description: Curator speaking for first 15 seconds
      # Number of distinct speakers detected; non-negative.
      speaker_count:
        range: integer
        required: false
        minimum_value: 0
        examples:
          - value: 3
            description: Three distinct speakers detected
      speaker_label:
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[Curator, Artist, Museum Director]'
            description: Three identified speakers
      music_detected:
        range: boolean
        required: false
        examples:
          - value: true
            description: Music present in video
      # Music segments use the MusicSegment class defined later in this module.
      has_or_had_music_segment:
        range: MusicSegment
        multivalued: true
        required: false
        inlined_as_list: true
        examples:
          - value: '[{start_seconds: 0.0, end_seconds: 30.0, music_type: ''BACKGROUND'', genre: ''classical''}]'
            description: Classical background music
      music_genres_detected:
        range: string
        multivalued: true
        required: false
        examples:
          - value: '[classical, baroque]'
            description: Classical and baroque music detected
      # Confidence bounded to [0.0, 1.0].
      music_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.88
            description: High confidence music detection
      sound_events_detected:
        range: boolean
        required: false
        examples:
          - value: true
            description: Sound events detected
      # Commented-out duplicate `contains_or_contained` slot_usage entries,
      # retained from the migration (duplicate mapping keys are invalid YAML):
      # contains_or_contained:
      #   range: SoundEventTypeEnum
      #   multivalued: true
      #   required: false
      #   examples:
      #     - value: '[APPLAUSE, CROWD_NOISE]'
      #       description: Applause and crowd sounds detected
      # contains_or_contained:
      #   range: VideoTimeSegment
      #   multivalued: true
      #   required: false
      #   inlined_as_list: true
      #   examples:
      #     - value: '[{start_seconds: 45.0, end_seconds: 48.0}]'
      #       description: 3-second silence
      silence_total_seconds:
        range: float
        required: false
        minimum_value: 0.0
        examples:
          - value: 15.5
            description: 15.5 seconds of total silence
      # Noise floor in dB (negative values expected; see example).
      noise_floor_db:
        range: float
        required: false
        examples:
          - value: -45.0
            description: Good quality, moderate noise floor
      # Overall audio quality score bounded to [0.0, 1.0].
      has_audio_quality_score:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
          - value: 0.85
            description: High audio quality
      snr_db:
        range: float
        required: false
        examples:
          - value: 25.0
            description: Good signal-to-noise ratio
      has_clipping:
        range: boolean
        required: false
        examples:
          - value: false
            description: No clipping detected
    comments:
      - Audio event detection for video content
      - Supports speech, music, silence, and sound event detection
      - Speaker diarization for interview navigation
      - Language detection for multilingual heritage content
      - Audio quality metrics for preservation assessment
    see_also:
      - https://www.w3.org/TR/annotation-model/
      - https://arxiv.org/abs/2111.08085
    # Migration metadata; custodian_types is a string-encoded list.
    annotations:
      specificity_score: 0.1
      specificity_rationale: Generic utility class/slot created during migration
      custodian_types: "['*']"
  # Time-bounded speech segment with speaker and language metadata.
  # NOTE(review): the description says this "Extends VideoTimeSegment", but no
  # `is_a: VideoTimeSegment` is declared — confirm whether inheritance was
  # intended or the description is stale.
  SpeechSegment:
    class_uri: hc:SpeechSegment
    description: 'A speech segment with speaker and language information.
      Extends VideoTimeSegment with speech-specific metadata.
      '
    slots:
      - segment_confidence
      - segment_end_seconds
      - segment_language
      - segment_start_seconds
      - speaker_id
      - speaker_label
      - specificity_annotation
      - speech_text
      - has_or_had_score
    slot_usage:
      # Start/end are required, non-negative seconds.
      segment_start_seconds:
        range: float
        required: true
        minimum_value: 0.0
      segment_end_seconds:
        range: float
        required: true
        minimum_value: 0.0
      speaker_id:
        range: string
        required: false
      speaker_label:
        range: string
        required: false
      segment_language:
        range: string
        required: false
      # Confidence bounded to [0.0, 1.0].
      segment_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      speech_text:
        range: string
        required: false
  # Diarization segment: speaker identity plus time boundaries ("who spoke
  # when"), without transcript content.
  DiarizationSegment:
    class_uri: hc:DiarizationSegment
    description: 'A diarization segment identifying speaker and time boundaries.
      Focused on "who spoke when" rather than transcript content.
      '
    slots:
      - has_or_had_provenance
      - temporal_extent
      - contains_or_contained
      - is_overlapping
      - specificity_annotation
      - has_or_had_score
    slot_usage:
      # Required time range, inlined as a TimeSpan object.
      temporal_extent:
        description: Time range of the diarization segment.
        range: TimeSpan
        inlined: true
        required: true
      # Required speaker reference, inlined as a Speaker object.
      contains_or_contained:
        description: Speaker identified in this segment.
        range: Speaker
        inlined: true
        required: true
      has_or_had_provenance:
        description: Provenance metadata including confidence score.
        range: Provenance
        inlined: true
        required: false
      # NOTE(review): presumably flags overlap with another segment's speech —
      # confirm against the shared is_overlapping slot definition.
      is_overlapping:
        range: boolean
        required: false
  # Time-bounded segment of detected music with type/genre classification.
  MusicSegment:
    class_uri: hc:MusicSegment
    description: 'A segment of detected music with classification.
      '
    slots:
      - is_background
      - music_end_seconds
      - music_genre
      - music_segment_confidence
      - music_start_seconds
      - music_type
      - specificity_annotation
      - has_or_had_score
    slot_usage:
      # Start/end are required, non-negative seconds.
      music_start_seconds:
        range: float
        required: true
        minimum_value: 0.0
      music_end_seconds:
        range: float
        required: true
        minimum_value: 0.0
      # Constrained to the imported MusicTypeEnum.
      music_type:
        range: MusicTypeEnum
        required: false
      music_genre:
        range: string
        required: false
      # Confidence bounded to [0.0, 1.0].
      music_segment_confidence:
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
      is_background:
        range: boolean
        required: false