From 51554947a0d089a8229138e33536effbae2b858c Mon Sep 17 00:00:00 2001 From: kempersc Date: Tue, 16 Dec 2025 20:03:17 +0100 Subject: [PATCH] feat(schema): Add video content schema with comprehensive examples Video Schema Classes (9 files): - VideoPost, VideoComment: Social media video modeling - VideoTextContent: Base class for text content extraction - VideoTranscript, VideoSubtitle: Text with timing and formatting - VideoTimeSegment: Time code handling with ISO 8601 duration - VideoAnnotation: Base annotation with W3C Web Annotation alignment - VideoAnnotationTypes: Scene, Object, OCR detection annotations - VideoChapter, VideoChapterList: Navigation and chapter structure - VideoAudioAnnotation: Speaker diarization, music, sound events Enumerations (15 enums): - VideoDefinitionEnum, LiveBroadcastStatusEnum - TranscriptFormatEnum, SubtitleFormatEnum, SubtitlePositionEnum - AnnotationTypeEnum, AnnotationMotivationEnum - DetectionLevelEnum, SceneTypeEnum, TransitionTypeEnum, TextTypeEnum - ChapterSourceEnum, AudioEventTypeEnum, SoundEventTypeEnum, MusicTypeEnum Examples (904 lines, 10 comprehensive heritage-themed examples): - Rijksmuseum virtual tour chapters (5 chapters with heritage entity refs) - Operation Night Watch documentary chapters (5 chapters) - VideoAudioAnnotation: curator interview, exhibition promo, museum lecture All examples reference real heritage entities with Wikidata IDs: Q5598 (Rembrandt), Q41264 (Vermeer), Q219831 (The Night Watch) --- .../examples/video_content_examples.yaml | 904 ++++++++++++ .../modules/classes/VideoAnnotation.yaml | 542 +++++++ .../modules/classes/VideoAnnotationTypes.yaml | 1312 +++++++++++++++++ .../modules/classes/VideoAudioAnnotation.yaml | 1108 ++++++++++++++ .../linkml/modules/classes/VideoChapter.yaml | 621 ++++++++ .../linkml/modules/classes/VideoPost.yaml | 763 ++++++++++ .../linkml/modules/classes/VideoSubtitle.yaml | 632 ++++++++ .../modules/classes/VideoTextContent.yaml | 524 +++++++ 
.../modules/classes/VideoTimeSegment.yaml | 375 +++++ .../modules/classes/VideoTranscript.yaml | 469 ++++++ 10 files changed, 7250 insertions(+) create mode 100644 schemas/20251121/linkml/examples/video_content_examples.yaml create mode 100644 schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml create mode 100644 schemas/20251121/linkml/modules/classes/VideoAnnotationTypes.yaml create mode 100644 schemas/20251121/linkml/modules/classes/VideoAudioAnnotation.yaml create mode 100644 schemas/20251121/linkml/modules/classes/VideoChapter.yaml create mode 100644 schemas/20251121/linkml/modules/classes/VideoPost.yaml create mode 100644 schemas/20251121/linkml/modules/classes/VideoSubtitle.yaml create mode 100644 schemas/20251121/linkml/modules/classes/VideoTextContent.yaml create mode 100644 schemas/20251121/linkml/modules/classes/VideoTimeSegment.yaml create mode 100644 schemas/20251121/linkml/modules/classes/VideoTranscript.yaml diff --git a/schemas/20251121/linkml/examples/video_content_examples.yaml b/schemas/20251121/linkml/examples/video_content_examples.yaml new file mode 100644 index 0000000000..d0889a0efb --- /dev/null +++ b/schemas/20251121/linkml/examples/video_content_examples.yaml @@ -0,0 +1,904 @@ +# Video Content Examples +# Instance data demonstrating video schema classes for heritage institutions +# Covers: VideoPost, VideoComment, VideoTranscript, VideoSubtitle, VideoAnnotation types +# +# Part of Heritage Custodian Ontology v0.9.10 +# +# HERITAGE INSTITUTION VIDEO USE CASES: +# - Virtual museum tours +# - Conservation documentation +# - Curator interviews +# - Collection spotlights +# - Educational content +# - Live event recordings + +# ============================================================================ +# EXAMPLE 1: Museum Virtual Tour Video +# Complete VideoPost with transcript, subtitles, and scene annotations +# ============================================================================ + +video_posts: + + - post_id: 
"https://nde.nl/ontology/hc/video/nl/rijksmuseum-gallery-honour" + platform_type: YOUTUBE + platform_id: "UCo2sQFl0mV4K2v6D4d8Z9bQ" + platform_post_id: "dQw4w9WgXcQ" + post_url: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + post_title: "The Gallery of Honour - Rijksmuseum Virtual Tour" + post_description: | + Take a virtual walk through the famous Gallery of Honour at the Rijksmuseum + in Amsterdam. This corridor displays masterpieces of Dutch Golden Age painting, + culminating in Rembrandt's Night Watch. Our curator guides you through the + history and significance of these iconic works. + + # Video technical properties + duration: "PT15M42S" + definition: hd + aspect_ratio: "16:9" + frame_rate: 30.0 + + # Caption and language + caption_available: true + default_language: "nl" + default_audio_language: "nl" + available_caption_languages: + - "nl" + - "en" + - "de" + - "fr" + - "zh" + + # Engagement metrics (observational) + view_count: 125847 + like_count: 3421 + dislike_count: 42 + comment_count: 287 + favorite_count: 892 + metrics_observed_at: "2025-12-15T10:30:00Z" + + # Platform-specific + video_category_id: "27" # Education + live_broadcast_content: none + is_licensed_content: false + is_embeddable: true + is_made_for_kids: false + + # Publishing info (inherited from SocialMediaPost) + published_at: "2023-03-15T14:00:00Z" + last_updated_at: "2023-03-15T14:00:00Z" + + # Comments + comments_fetched: 50 + video_comments: + - comment_id: "Ugw3x9K2mL8f7nPqR1" + comment_author: "ArtHistoryFan" + comment_author_channel_id: "UC7f8n2p3m4x5L6qR7sT8vW" + comment_text: "This virtual tour is amazing! I visited last year and seeing it again brings back wonderful memories. The Night Watch looks even more spectacular in 4K." 
+ comment_published_at: "2023-03-16T09:22:15Z" + comment_like_count: 45 + comment_reply_count: 3 + comment_replies: + - comment_id: "Ugw3x9K2mL8f7nPqR1.8nRq" + comment_author: "Rijksmuseum" + comment_author_channel_id: "UCo2sQFl0mV4K2v6D4d8Z9bQ" + comment_text: "Thank you for visiting and for your kind words! We hope to see you again soon." + comment_published_at: "2023-03-16T11:45:30Z" + comment_like_count: 12 + comment_reply_count: 0 + + - comment_id: "Ugw5y7T4nM9g8oPsS2" + comment_author: "DutchHeritageExplorer" + comment_author_channel_id: "UC9g0n3p4m5x6L7qR8sT9vX" + comment_text: "Great explanation of the Vermeer paintings! Would love to see more content about the restoration process." + comment_published_at: "2023-03-17T16:33:45Z" + comment_like_count: 28 + comment_reply_count: 1 + +# ============================================================================ +# EXAMPLE 2: Video Transcript (Full Text) +# ============================================================================ + +video_transcripts: + + - content_id: "https://nde.nl/ontology/hc/transcript/nl/rijksmuseum-gallery-honour-full" + source_video_url: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + content_language: "nl" + + full_transcript: | + Welkom in de Eregalerij van het Rijksmuseum. Deze iconische gang is het hart + van het museum en herbergt de grootste meesterwerken uit de Gouden Eeuw. + + We beginnen onze wandeling bij de ingang, waar we direct worden begroet door + Frans Hals' portret van Isaac Massa en Beatrix van der Laen. Dit schilderij + uit 1622 toont de levendige penseelstreek waarmee Hals bekend staat. + + Verderop zien we werken van Jan Steen, bekend om zijn humoristische taferelen + van het dagelijks leven. Zijn schilderij "De vrolijke huishouding" illustreert + het Nederlandse spreekwoord "een huishouden van Jan Steen." + + Aan het einde van de galerie staat het beroemdste schilderij van Nederland: + De Nachtwacht van Rembrandt. 
Dit monumentale werk uit 1642 toont de + schutterij van kapitein Frans Banninck Cocq in actie. + + word_count: 142 + generation_method: AUTOMATIC + generation_model: "whisper-large-v3" + generation_confidence: 0.94 + manual_corrections: true + + # Provenance + generated_by: "OpenAI Whisper" + generation_timestamp: "2025-12-01T08:15:00Z" + reviewed_by: "Rijksmuseum Digital Team" + review_timestamp: "2025-12-02T14:30:00Z" + + transcript_format: PLAIN_TEXT + +# ============================================================================ +# EXAMPLE 3: Video Subtitles (Time-Coded) +# ============================================================================ + +video_subtitles: + + - content_id: "https://nde.nl/ontology/hc/subtitle/nl/rijksmuseum-gallery-honour-en" + source_video_url: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + content_language: "en" + + subtitle_format: VTT + total_cues: 45 + + subtitle_entries: + - sequence_number: 1 + start_time: "00:00:00.000" + end_time: "00:00:04.500" + text: "Welcome to the Gallery of Honour at the Rijksmuseum." + speaker_label: "Curator" + + - sequence_number: 2 + start_time: "00:00:04.500" + end_time: "00:00:09.200" + text: "This iconic corridor is the heart of the museum" + speaker_label: "Curator" + + - sequence_number: 3 + start_time: "00:00:09.200" + end_time: "00:00:14.800" + text: "and houses the greatest masterpieces from the Golden Age." + speaker_label: "Curator" + + - sequence_number: 4 + start_time: "00:00:14.800" + end_time: "00:00:20.500" + text: "We begin our walk at the entrance, where we are immediately greeted" + speaker_label: "Curator" + + - sequence_number: 5 + start_time: "00:00:20.500" + end_time: "00:00:27.000" + text: "by Frans Hals' portrait of Isaac Massa and Beatrix van der Laen." 
+ speaker_label: "Curator" + + is_closed_captions: false + is_sdh: false + + generation_method: HUMAN + reviewed_by: "Rijksmuseum Translation Team" + review_timestamp: "2023-03-10T16:00:00Z" + +# ============================================================================ +# EXAMPLE 4: Scene Annotations (Computer Vision) +# ============================================================================ + +video_scene_annotations: + + - annotation_id: "https://nde.nl/ontology/hc/annotation/scene/rijksmuseum-01" + source_video_url: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + annotation_type: SCENE + annotation_motivation: DESCRIBING + + time_segment: + segment_id: "scene-01" + start_time: "00:00:00.000" + end_time: "00:00:45.000" + duration_seconds: 45.0 + + scene_type: ESTABLISHING + scene_label: "Gallery Entrance Introduction" + scene_description: | + Wide shot of the Gallery of Honour entrance. Camera slowly pans + from left to right, revealing the long corridor with paintings + on both walls. Natural light streams in from skylights above. + + detected_elements: + - "architectural interior" + - "museum gallery" + - "natural lighting" + - "oil paintings" + - "parquet flooring" + + dominant_colors: + - "#8B7355" # Brown/wood tones + - "#F5F5DC" # Cream walls + - "#DAA520" # Golden frames + + confidence_score: 0.92 + detection_model: "google-video-intelligence-v1" + detection_timestamp: "2025-12-01T09:00:00Z" + + - annotation_id: "https://nde.nl/ontology/hc/annotation/scene/rijksmuseum-02" + source_video_url: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + annotation_type: SCENE + annotation_motivation: DESCRIBING + + time_segment: + segment_id: "scene-02" + start_time: "00:00:45.000" + end_time: "00:02:30.000" + duration_seconds: 105.0 + + scene_type: CLOSE_UP + scene_label: "Frans Hals Portrait Detail" + scene_description: | + Close-up shots of Frans Hals' portrait painting showing + brushwork detail and color palette. 
Camera moves slowly + across canvas surface highlighting texture. + + detected_elements: + - "oil painting" + - "portrait" + - "17th century costume" + - "lace collar" + - "dark background" + + confidence_score: 0.88 + detection_model: "google-video-intelligence-v1" + detection_timestamp: "2025-12-01T09:00:00Z" + +# ============================================================================ +# EXAMPLE 5: Object Annotations (Artwork Detection) +# ============================================================================ + +video_object_annotations: + + - annotation_id: "https://nde.nl/ontology/hc/annotation/object/rijksmuseum-night-watch" + source_video_url: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + annotation_type: OBJECT + annotation_motivation: IDENTIFYING + + time_segment: + segment_id: "night-watch-segment" + start_time: "00:12:30.000" + end_time: "00:15:42.000" + duration_seconds: 192.0 + + detected_objects: + - object_id: "obj-night-watch-001" + object_label: "The Night Watch" + object_category: "painting" + confidence: 0.98 + bounding_box_x: 120 + bounding_box_y: 80 + bounding_box_width: 1680 + bounding_box_height: 920 + wikidata_entity: "Q219831" + artist: "Rembrandt van Rijn" + creation_year: 1642 + + - object_id: "obj-captain-001" + object_label: "Captain Frans Banninck Cocq" + object_category: "person (depicted)" + confidence: 0.91 + bounding_box_x: 450 + bounding_box_y: 150 + bounding_box_width: 380 + bounding_box_height: 720 + wikidata_entity: "Q467089" + + detection_level: FRAME + confidence_score: 0.95 + detection_model: "artwork-recognition-v2" + detection_timestamp: "2025-12-01T09:15:00Z" + +# ============================================================================ +# EXAMPLE 6: OCR Annotations (Text in Video) +# ============================================================================ + +video_ocr_annotations: + + - annotation_id: "https://nde.nl/ontology/hc/annotation/ocr/rijksmuseum-label-01" + source_video_url: 
"https://www.youtube.com/watch?v=dQw4w9WgXcQ" + annotation_type: OCR + annotation_motivation: TRANSCRIBING + + time_segment: + segment_id: "label-segment-01" + start_time: "00:05:15.000" + end_time: "00:05:22.000" + duration_seconds: 7.0 + + detected_text_regions: + - region_id: "text-001" + detected_text: "Johannes Vermeer" + text_language: "nl" + text_type: ARTWORK_LABEL + bounding_box_x: 100 + bounding_box_y: 650 + bounding_box_width: 280 + bounding_box_height: 35 + confidence: 0.97 + + - region_id: "text-002" + detected_text: "Het melkmeisje, ca. 1660" + text_language: "nl" + text_type: ARTWORK_LABEL + bounding_box_x: 100 + bounding_box_y: 690 + bounding_box_width: 320 + bounding_box_height: 30 + confidence: 0.94 + + - region_id: "text-003" + detected_text: "Olieverf op doek" + text_language: "nl" + text_type: CAPTION + bounding_box_x: 100 + bounding_box_y: 725 + bounding_box_width: 200 + bounding_box_height: 25 + confidence: 0.91 + + detection_level: FRAME + confidence_score: 0.94 + detection_model: "google-cloud-vision-ocr" + detection_timestamp: "2025-12-01T09:20:00Z" + +# ============================================================================ +# EXAMPLE 7: Conservation Documentation Video +# Archive use case with technical annotations +# ============================================================================ + +conservation_videos: + + - post_id: "https://nde.nl/ontology/hc/video/nl/rijksmuseum-night-watch-restoration" + platform_type: YOUTUBE + platform_id: "UCo2sQFl0mV4K2v6D4d8Z9bQ" + platform_post_id: "abcd1234efgh" + post_url: "https://www.youtube.com/watch?v=abcd1234efgh" + post_title: "Operation Night Watch - Restoration Process Documentary" + post_description: | + Follow the largest and most detailed art research and conservation project + ever undertaken on a single painting. Operation Night Watch uses cutting-edge + technology to study and restore Rembrandt's masterpiece. 
+ + duration: "PT45M30S" + definition: uhd + aspect_ratio: "16:9" + frame_rate: 24.0 + + caption_available: true + default_language: "en" + default_audio_language: "en" + available_caption_languages: + - "en" + - "nl" + - "de" + - "ja" + + view_count: 892341 + like_count: 28456 + comment_count: 1523 + metrics_observed_at: "2025-12-15T10:30:00Z" + + video_category_id: "28" # Science & Technology + live_broadcast_content: none + is_licensed_content: false + is_embeddable: true + is_made_for_kids: false + + published_at: "2021-06-22T12:00:00Z" + +# ============================================================================ +# EXAMPLE 8: Video Chapters (Navigation Segments) +# YouTube chapters, virtual tour sections, conservation phases +# ============================================================================ + +video_chapters: + + # Rijksmuseum Virtual Tour - Gallery of Honour chapters + - chapter_id: "dQw4w9WgXcQ_chapter_0" + chapter_title: "Introduction - Welcome to the Rijksmuseum" + chapter_index: 0 + chapter_start_seconds: 0.0 + chapter_end_seconds: 45.0 + chapter_start_time: "PT0S" + chapter_end_time: "PT45S" + chapter_description: | + Opening shot of the Gallery of Honour entrance with curator introduction. + Overview of what visitors will see during the virtual tour. + auto_generated: false + chapter_source: MANUAL + chapter_thumbnail_url: "https://i.ytimg.com/vi/dQw4w9WgXcQ/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLBp1" + + - chapter_id: "dQw4w9WgXcQ_chapter_1" + chapter_title: "Frans Hals and Early Portraits" + chapter_index: 1 + chapter_start_seconds: 45.0 + chapter_end_seconds: 180.0 + chapter_start_time: "PT45S" + chapter_end_time: "PT3M" + chapter_description: | + Exploration of Frans Hals' portrait of Isaac Massa and Beatrix van der Laen. + Discussion of Hals' innovative brushwork techniques. 
+ auto_generated: false + chapter_source: MANUAL + heritage_entities_mentioned: + - entity_id: "Q167654" # Frans Hals + entity_type: "Person" + entity_label: "Frans Hals" + - entity_id: "Q2628540" # Portrait of Isaac Massa and Beatrix van der Laen + entity_type: "Artwork" + entity_label: "Portrait of Isaac Massa and Beatrix van der Laen" + + - chapter_id: "dQw4w9WgXcQ_chapter_2" + chapter_title: "Jan Steen's Household Scenes" + chapter_index: 2 + chapter_start_seconds: 180.0 + chapter_end_seconds: 360.0 + chapter_start_time: "PT3M" + chapter_end_time: "PT6M" + chapter_description: | + The humorous domestic scenes of Jan Steen and the meaning behind + the Dutch expression "een huishouden van Jan Steen." + auto_generated: false + chapter_source: MANUAL + heritage_entities_mentioned: + - entity_id: "Q205863" # Jan Steen + entity_type: "Person" + entity_label: "Jan Steen" + + - chapter_id: "dQw4w9WgXcQ_chapter_3" + chapter_title: "Vermeer's Masterpieces" + chapter_index: 3 + chapter_start_seconds: 360.0 + chapter_end_seconds: 600.0 + chapter_start_time: "PT6M" + chapter_end_time: "PT10M" + chapter_description: | + Close examination of Johannes Vermeer's The Milkmaid and other works. + Analysis of Vermeer's distinctive use of light and color. + auto_generated: false + chapter_source: MANUAL + heritage_entities_mentioned: + - entity_id: "Q41264" # Johannes Vermeer + entity_type: "Person" + entity_label: "Johannes Vermeer" + - entity_id: "Q154349" # The Milkmaid + entity_type: "Artwork" + entity_label: "Het melkmeisje (The Milkmaid)" + + - chapter_id: "dQw4w9WgXcQ_chapter_4" + chapter_title: "The Night Watch - Rembrandt's Masterpiece" + chapter_index: 4 + chapter_start_seconds: 600.0 + chapter_end_seconds: 942.0 + chapter_start_time: "PT10M" + chapter_end_time: "PT15M42S" + chapter_description: | + Culmination of the tour at Rembrandt's iconic Night Watch. + Discussion of the painting's history, composition, and restoration. 
+ auto_generated: false + chapter_source: MANUAL + heritage_entities_mentioned: + - entity_id: "Q5598" # Rembrandt + entity_type: "Person" + entity_label: "Rembrandt van Rijn" + - entity_id: "Q219831" # The Night Watch + entity_type: "Artwork" + entity_label: "De Nachtwacht (The Night Watch)" + + # Conservation Documentary - Operation Night Watch chapters + - chapter_id: "abcd1234efgh_chapter_0" + chapter_title: "Project Overview" + chapter_index: 0 + chapter_start_seconds: 0.0 + chapter_end_seconds: 300.0 + chapter_start_time: "PT0S" + chapter_end_time: "PT5M" + chapter_description: | + Introduction to Operation Night Watch, the most extensive research + and conservation project ever undertaken on a single painting. + auto_generated: false + chapter_source: MANUAL + + - chapter_id: "abcd1234efgh_chapter_1" + chapter_title: "Technical Imaging and Analysis" + chapter_index: 1 + chapter_start_seconds: 300.0 + chapter_end_seconds: 900.0 + chapter_start_time: "PT5M" + chapter_end_time: "PT15M" + chapter_description: | + Multi-spectral imaging, X-ray analysis, and macro photography + revealing hidden layers and underdrawings in the painting. + auto_generated: false + chapter_source: MANUAL + conservation_phase: "DOCUMENTATION" + + - chapter_id: "abcd1234efgh_chapter_2" + chapter_title: "Condition Assessment" + chapter_index: 2 + chapter_start_seconds: 900.0 + chapter_end_seconds: 1500.0 + chapter_start_time: "PT15M" + chapter_end_time: "PT25M" + chapter_description: | + Detailed examination of the painting's condition, including + craquelure patterns, varnish degradation, and previous restorations. 
+ auto_generated: false + chapter_source: MANUAL + conservation_phase: "ASSESSMENT" + + - chapter_id: "abcd1234efgh_chapter_3" + chapter_title: "Cleaning Process" + chapter_index: 3 + chapter_start_seconds: 1500.0 + chapter_end_seconds: 2100.0 + chapter_start_time: "PT25M" + chapter_end_time: "PT35M" + chapter_description: | + The meticulous cleaning process using specialized solvents and + techniques to remove centuries of accumulated dirt and varnish. + auto_generated: false + chapter_source: MANUAL + conservation_phase: "TREATMENT" + + - chapter_id: "abcd1234efgh_chapter_4" + chapter_title: "AI-Assisted Reconstruction" + chapter_index: 4 + chapter_start_seconds: 2100.0 + chapter_end_seconds: 2730.0 + chapter_start_time: "PT35M" + chapter_end_time: "PT45M30S" + chapter_description: | + How artificial intelligence was used to digitally reconstruct + missing portions of the painting that were cut off in 1715. + auto_generated: false + chapter_source: MANUAL + conservation_phase: "DIGITAL_RECONSTRUCTION" + +# ============================================================================ +# EXAMPLE 9: Video Chapter Lists (Complete Sets) +# ============================================================================ + +video_chapter_lists: + + # Complete chapter list for Rijksmuseum virtual tour + - list_id: "https://nde.nl/ontology/hc/chapterlist/rijksmuseum-gallery-honour" + video_id: "dQw4w9WgXcQ" + video_url: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + video_title: "The Gallery of Honour - Rijksmuseum Virtual Tour" + + chapters: + - "dQw4w9WgXcQ_chapter_0" + - "dQw4w9WgXcQ_chapter_1" + - "dQw4w9WgXcQ_chapter_2" + - "dQw4w9WgXcQ_chapter_3" + - "dQw4w9WgXcQ_chapter_4" + + total_chapters: 5 + chapters_source: MANUAL + covers_full_video: true + video_duration_seconds: 942.0 + + extraction_timestamp: "2025-12-15T14:00:00Z" + extraction_method: "YouTube Data API v3" + + # Complete chapter list for Operation Night Watch documentary + - list_id: 
"https://nde.nl/ontology/hc/chapterlist/operation-night-watch" + video_id: "abcd1234efgh" + video_url: "https://www.youtube.com/watch?v=abcd1234efgh" + video_title: "Operation Night Watch - Restoration Process Documentary" + + chapters: + - "abcd1234efgh_chapter_0" + - "abcd1234efgh_chapter_1" + - "abcd1234efgh_chapter_2" + - "abcd1234efgh_chapter_3" + - "abcd1234efgh_chapter_4" + + total_chapters: 5 + chapters_source: MANUAL + covers_full_video: true + video_duration_seconds: 2730.0 + + extraction_timestamp: "2025-12-15T14:00:00Z" + extraction_method: "YouTube Data API v3" + +# ============================================================================ +# EXAMPLE 10: Video Audio Annotations (Speech, Music, Sound Events) +# ============================================================================ + +video_audio_annotations: + + # Example 1: Curator Interview with Speaker Diarization + - annotation_id: "https://nde.nl/ontology/hc/annotation/audio/rijksmuseum-interview-01" + source_video_url: "https://www.youtube.com/watch?v=xyz789curator" + annotation_type: AUDIO + annotation_motivation: TRANSCRIBING + + # Primary audio characteristics + primary_audio_event_type: SPEECH + speech_detected: true + speech_language: "nl" + languages_detected: + - "nl" + - "en" # Some English art terminology used + + # Speaker diarization (who spoke when) + diarization_enabled: true + speaker_count: 2 + speaker_labels: + - "Dr. Taco Dibbits" + - "Interviewer" + + diarization_segments: + - segment_id: "diar-001" + diarization_start_seconds: 0.0 + diarization_end_seconds: 8.5 + diarization_start_time: "PT0S" + diarization_end_time: "PT8.5S" + diarization_speaker_id: "spk_001" + diarization_speaker_label: "Interviewer" + diarization_confidence: 0.94 + transcript_snippet: "Welkom bij het Rijksmuseum. Vandaag spreken we met de directeur..." 
+ + - segment_id: "diar-002" + diarization_start_seconds: 8.5 + diarization_end_seconds: 45.0 + diarization_start_time: "PT8.5S" + diarization_end_time: "PT45S" + diarization_speaker_id: "spk_002" + diarization_speaker_label: "Dr. Taco Dibbits" + diarization_confidence: 0.97 + transcript_snippet: "Dank u wel. Het is een bijzonder moment voor het museum..." + + - segment_id: "diar-003" + diarization_start_seconds: 45.0 + diarization_end_seconds: 52.0 + diarization_start_time: "PT45S" + diarization_end_time: "PT52S" + diarization_speaker_id: "spk_001" + diarization_speaker_label: "Interviewer" + diarization_confidence: 0.92 + transcript_snippet: "Kunt u ons meer vertellen over de nieuwe tentoonstelling?" + + - segment_id: "diar-004" + diarization_start_seconds: 52.0 + diarization_end_seconds: 180.0 + diarization_start_time: "PT52S" + diarization_end_time: "PT3M" + diarization_speaker_id: "spk_002" + diarization_speaker_label: "Dr. Taco Dibbits" + diarization_confidence: 0.96 + transcript_snippet: "Jazeker. Deze tentoonstelling is uniek omdat we voor het eerst..." 
+ + # Audio quality metrics + audio_quality_score: 0.92 + snr_db: 28.0 + has_clipping: false + audio_channels: 2 + sample_rate_hz: 48000 + + # No music in this interview + music_detected: false + + # Detection metadata + detection_model: "whisper-large-v3-diarize" + detection_timestamp: "2025-12-15T16:00:00Z" + confidence_score: 0.94 + + # Example 2: Exhibition Promotional Video with Music + - annotation_id: "https://nde.nl/ontology/hc/annotation/audio/vangogh-exhibition-promo" + source_video_url: "https://www.youtube.com/watch?v=promo2025vgm" + annotation_type: AUDIO + annotation_motivation: DESCRIBING + + # Mixed speech and music + primary_audio_event_type: MIXED + speech_detected: true + music_detected: true + + speech_language: "en" + languages_detected: + - "en" + - "nl" + + # Speech segments (voiceover narration) + speech_segments: + - segment_id: "speech-001" + speech_start_seconds: 5.0 + speech_end_seconds: 25.0 + speech_start_time: "PT5S" + speech_end_time: "PT25S" + speaker_id: "narrator" + speaker_label: "Voiceover Narrator" + speech_type: NARRATION + transcript_snippet: "This spring, the Van Gogh Museum presents a groundbreaking exhibition..." + + - segment_id: "speech-002" + speech_start_seconds: 45.0 + speech_end_seconds: 60.0 + speech_start_time: "PT45S" + speech_end_time: "PT1M" + speaker_id: "curator" + speaker_label: "Exhibition Curator" + speech_type: INTERVIEW + transcript_snippet: "Van Gogh's use of color was revolutionary..." 
+ + # Music segments (background and featured) + music_segments: + - segment_id: "music-001" + music_start_seconds: 0.0 + music_end_seconds: 120.0 + music_start_time: "PT0S" + music_end_time: "PT2M" + music_type: BACKGROUND + music_genre: "classical" + is_background: true + volume_level: "low" + music_title: null # Unknown background track + + - segment_id: "music-002" + music_start_seconds: 90.0 + music_end_seconds: 115.0 + music_start_time: "PT1M30S" + music_end_time: "PT1M55S" + music_type: DRAMATIC + music_genre: "orchestral" + is_background: false + volume_level: "medium" + music_description: "Dramatic orchestral swell accompanying visual climax" + + music_genres_detected: + - "classical" + - "orchestral" + + # Audio quality metrics + audio_quality_score: 0.88 + snr_db: 22.0 # Lower due to music mixing + audio_channels: 2 + sample_rate_hz: 48000 + + detection_model: "audio-analysis-v2" + detection_timestamp: "2025-12-15T16:30:00Z" + confidence_score: 0.86 + + # Example 3: Museum Lecture Recording with Audience Reactions + - annotation_id: "https://nde.nl/ontology/hc/annotation/audio/stedelijk-lecture-2024" + source_video_url: "https://www.youtube.com/watch?v=lecture2024sted" + annotation_type: AUDIO + annotation_motivation: TRANSCRIBING + + primary_audio_event_type: SPEECH + speech_detected: true + music_detected: false + + speech_language: "nl" + languages_detected: + - "nl" + + # Main lecture content + diarization_enabled: true + speaker_count: 1 + speaker_labels: + - "Prof. Dr. Beatrix Ruf" + + diarization_segments: + - segment_id: "lecture-001" + diarization_start_seconds: 0.0 + diarization_end_seconds: 1800.0 + diarization_start_time: "PT0S" + diarization_end_time: "PT30M" + diarization_speaker_id: "spk_main" + diarization_speaker_label: "Prof. Dr. 
Beatrix Ruf" + diarization_confidence: 0.98 + + # Sound events detected (audience reactions) + sound_events_detected: true + sound_event_types: + - APPLAUSE + - LAUGHTER + - CROWD_NOISE + + sound_event_segments: + - segment_id: "sound-001" + sound_start_seconds: 420.0 + sound_end_seconds: 425.0 + sound_start_time: "PT7M" + sound_end_time: "PT7M5S" + sound_event_type: LAUGHTER + sound_confidence: 0.89 + sound_description: "Audience laughter in response to humorous anecdote" + + - segment_id: "sound-002" + sound_start_seconds: 1795.0 + sound_end_seconds: 1810.0 + sound_start_time: "PT29M55S" + sound_end_time: "PT30M10S" + sound_event_type: APPLAUSE + sound_confidence: 0.96 + sound_description: "Audience applause at conclusion of lecture" + + - segment_id: "sound-003" + sound_start_seconds: 1200.0 + sound_end_seconds: 1203.0 + sound_start_time: "PT20M" + sound_end_time: "PT20M3S" + sound_event_type: CROWD_NOISE + sound_confidence: 0.72 + sound_description: "Brief audience murmuring during slide transition" + + # Audio quality metrics (live recording) + audio_quality_score: 0.78 + snr_db: 18.0 # Lower due to room acoustics + has_reverb: true + audio_channels: 2 + sample_rate_hz: 44100 + + detection_model: "audio-event-detector-v1" + detection_timestamp: "2025-12-15T17:00:00Z" + confidence_score: 0.82 + +# ============================================================================ +# PROVENANCE METADATA +# ============================================================================ + +provenance: + data_source: EXAMPLE_INSTANCES + data_tier: TIER_4_INFERRED + extraction_date: "2025-12-16T00:00:00Z" + extraction_method: "Manual example creation for schema documentation" + confidence_score: 1.0 + notes: | + Example instances demonstrating video content modeling capabilities. + Based on real heritage institution video patterns but with synthetic data. 
+ + Classes demonstrated: + - VideoPost (with VideoComment) + - VideoTranscript + - VideoSubtitle + - VideoSceneAnnotation + - VideoObjectAnnotation + - VideoOCRAnnotation + - VideoChapter (NEW in v0.9.10) + - VideoChapterList (NEW in v0.9.10) + - VideoAudioAnnotation (NEW in v0.9.10) + - SpeechSegment + - DiarizationSegment + - MusicSegment + - SoundEventSegment + + Heritage use cases covered: + - Virtual museum tours + - Conservation documentation + - Artwork recognition + - Museum label OCR + - Video chapter navigation (NEW) + - Speaker diarization in interviews (NEW) + - Music detection in promotional content (NEW) + - Audience reaction detection in lectures (NEW) + + Enumerations demonstrated: + - ChapterSourceEnum: MANUAL, AUTO_GENERATED, YOUTUBE_API + - AudioEventTypeEnum: SPEECH, MUSIC, MIXED, AMBIENT, SILENCE + - SoundEventTypeEnum: APPLAUSE, LAUGHTER, CROWD_NOISE + - MusicTypeEnum: BACKGROUND, FOREGROUND, DRAMATIC + + Heritage entities referenced (Wikidata): + - Q5598 (Rembrandt van Rijn) + - Q41264 (Johannes Vermeer) + - Q167654 (Frans Hals) + - Q205863 (Jan Steen) + - Q219831 (The Night Watch) + - Q154349 (The Milkmaid) + - Q2628540 (Portrait of Isaac Massa and Beatrix van der Laen) diff --git a/schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml b/schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml new file mode 100644 index 0000000000..15229d97d7 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/VideoAnnotation.yaml @@ -0,0 +1,542 @@ +# Video Annotation Class +# Abstract base class for computer vision and multimodal video annotations +# +# Part of Heritage Custodian Ontology v0.9.5 +# +# HIERARCHY: +# E73_Information_Object (CIDOC-CRM) +# │ +# └── VideoTextContent (abstract base) +# │ +# ├── VideoTranscript (audio-derived) +# │ │ +# │ └── VideoSubtitle (time-coded captions) +# │ +# └── VideoAnnotation (this class - ABSTRACT) +# │ +# ├── VideoSceneAnnotation (scene/shot detection) +# ├── VideoObjectAnnotation 
(object/face/logo detection) +# └── VideoOCRAnnotation (text-in-video extraction) +# +# DESIGN RATIONALE: +# VideoAnnotation is the abstract parent for all annotations derived from +# visual analysis of video content. Unlike VideoTranscript (audio-derived), +# these annotations come from computer vision, multimodal AI, or manual +# visual analysis. +# +# Key differences from transcript branch: +# - Frame-based rather than audio-based analysis +# - Spatial information (bounding boxes, regions) +# - Detection thresholds and frame sampling +# - Multiple detection types per segment +# +# ONTOLOGY ALIGNMENT: +# - W3C Web Annotation (oa:Annotation) for annotation structure +# - CIDOC-CRM E13_Attribute_Assignment for attribution activities +# - IIIF Presentation API for spatial/temporal selectors + +id: https://nde.nl/ontology/hc/class/VideoAnnotation +name: video_annotation_class +title: Video Annotation Class + +imports: + - linkml:types + - ./VideoTextContent + - ./VideoTimeSegment + +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + oa: http://www.w3.org/ns/oa# + as: https://www.w3.org/ns/activitystreams# + +default_prefix: hc + +classes: + + VideoAnnotation: + is_a: VideoTextContent + class_uri: oa:Annotation + abstract: true + description: | + Abstract base class for computer vision and multimodal video annotations. + + **DEFINITION**: + + VideoAnnotation represents structured information derived from visual + analysis of video content. 
+          dcterms:conformsTo <http://www.w3.org/TR/media-frags/> ;
Thresholds filter noise: + + | Threshold | Use Case | + |-----------|----------| + | 0.9+ | High precision, production display | + | 0.7-0.9 | Balanced, general use | + | 0.5-0.7 | High recall, research/review | + | < 0.5 | Raw output, needs filtering | + + **MODEL ARCHITECTURE TRACKING**: + + Different model architectures have different characteristics: + + | Architecture | Examples | Strengths | + |--------------|----------|-----------| + | CNN | ResNet, VGG | Fast inference, good for objects | + | Transformer | ViT, CLIP | Better context, multimodal | + | Hybrid | DETR, Swin | Balance of speed and accuracy | + + **HERITAGE INSTITUTION CONTEXT**: + + Video annotations enable: + - **Discovery**: Find videos containing specific objects/artworks + - **Accessibility**: Scene descriptions for visually impaired + - **Research**: Analyze visual content at scale + - **Preservation**: Document visual content as text + - **Linking**: Connect detected artworks to collection records + + **CIDOC-CRM E13_Attribute_Assignment**: + + Annotations are attribute assignments - asserting properties about + video segments. The CV model or human annotator is the assigning agent. + + exact_mappings: + - oa:Annotation + + close_mappings: + - crm:E13_Attribute_Assignment + + related_mappings: + - as:Activity + - schema:ClaimReview + + slots: + # Annotation structure + - annotation_type + - annotation_segments + + # Detection parameters + - detection_threshold + - detection_count + + # Frame analysis + - frame_sample_rate + - total_frames_analyzed + - keyframe_extraction + + # Model details + - model_architecture + - model_task + + # Spatial information + - includes_bounding_boxes + - includes_segmentation_masks + + # Annotation motivation + - annotation_motivation + + slot_usage: + annotation_type: + slot_uri: dcterms:type + description: | + High-level type classification for this annotation. + + Dublin Core: type for resource categorization. 
+ + **Standard Types**: + - SCENE_DETECTION: Shot/scene boundary detection + - OBJECT_DETECTION: Object, face, logo detection + - OCR: Text-in-video extraction + - ACTION_RECOGNITION: Human action detection + - SEMANTIC_SEGMENTATION: Pixel-level classification + - MULTIMODAL: Combined audio+visual analysis + range: AnnotationTypeEnum + required: true + examples: + - value: "OBJECT_DETECTION" + description: "Object and face detection annotation" + + annotation_segments: + slot_uri: oa:hasBody + description: | + List of temporal segments with detection results. + + Web Annotation: hasBody links annotation to its content. + + Each segment contains: + - Time boundaries (start/end) + - Detection text/description + - Per-segment confidence + + Reuses VideoTimeSegment for consistent temporal modeling. + range: VideoTimeSegment + multivalued: true + required: false + inlined_as_list: true + examples: + - value: "[{start_seconds: 30.0, end_seconds: 35.0, segment_text: 'Night Watch painting visible'}]" + description: "Object detection segment" + + detection_threshold: + slot_uri: hc:detectionThreshold + description: | + Minimum confidence threshold used for detection filtering. + + Detections below this threshold were excluded from results. + + Range: 0.0 to 1.0 + + **Common Values**: + - 0.5: Standard threshold (balanced) + - 0.7: High precision mode + - 0.3: High recall mode (includes uncertain detections) + range: float + required: false + minimum_value: 0.0 + maximum_value: 1.0 + examples: + - value: 0.5 + description: "Standard detection threshold" + + detection_count: + slot_uri: hc:detectionCount + description: | + Total number of detections across all analyzed frames. + + Useful for: + - Understanding annotation density + - Quality assessment + - Performance metrics + + Note: May be higher than annotation_segments count if segments + are aggregated or filtered. 
+ range: integer + required: false + minimum_value: 0 + examples: + - value: 342 + description: "342 total detections found" + + frame_sample_rate: + slot_uri: hc:frameSampleRate + description: | + Number of frames analyzed per second of video. + + **Common Values**: + - 1.0: One frame per second (efficient) + - 5.0: Five frames per second (balanced) + - 30.0: Every frame at 30fps (thorough but expensive) + - 0.1: One frame every 10 seconds (overview only) + + Higher rates catch more content but increase compute cost. + range: float + required: false + minimum_value: 0.0 + examples: + - value: 1.0 + description: "Analyzed 1 frame per second" + + total_frames_analyzed: + slot_uri: hc:totalFramesAnalyzed + description: | + Total number of video frames that were analyzed. + + Calculated as: video_duration_seconds × frame_sample_rate + + Useful for: + - Understanding analysis coverage + - Cost estimation + - Reproducibility + range: integer + required: false + minimum_value: 0 + examples: + - value: 1800 + description: "Analyzed 1,800 frames (30 min video at 1 fps)" + + keyframe_extraction: + slot_uri: hc:keyframeExtraction + description: | + Whether keyframe extraction was used instead of uniform sampling. + + **Keyframe extraction** selects visually distinct frames + (scene changes, significant motion) rather than uniform intervals. + + - true: Keyframes extracted (variable frame selection) + - false: Uniform sampling at frame_sample_rate + + Keyframe extraction is more efficient but may miss content + between scene changes. + range: boolean + required: false + examples: + - value: true + description: "Used keyframe extraction" + + model_architecture: + slot_uri: hc:modelArchitecture + description: | + Architecture type of the CV/ML model used. 
+ + **Common Architectures**: + - CNN: Convolutional Neural Network (ResNet, VGG, EfficientNet) + - Transformer: Vision Transformer (ViT, Swin, CLIP) + - Hybrid: Combined architectures (DETR, ConvNeXt) + - RNN: Recurrent (for temporal analysis) + - GAN: Generative (for reconstruction tasks) + + Useful for understanding model capabilities and limitations. + range: string + required: false + examples: + - value: "Transformer" + description: "Vision Transformer architecture" + - value: "CNN" + description: "Convolutional Neural Network" + + model_task: + slot_uri: hc:modelTask + description: | + Specific task the model was trained for. + + **Common Tasks**: + - classification: Image/frame classification + - detection: Object detection with bounding boxes + - segmentation: Pixel-level classification + - captioning: Image/video captioning + - embedding: Feature extraction for similarity + + A model's task determines its output format. + range: string + required: false + examples: + - value: "detection" + description: "Object detection task" + - value: "captioning" + description: "Video captioning task" + + includes_bounding_boxes: + slot_uri: hc:includesBoundingBoxes + description: | + Whether annotation includes spatial bounding box coordinates. + + Bounding boxes define rectangular regions in frames where + objects/faces/text were detected. + + Format typically: [x, y, width, height] or [x1, y1, x2, y2] + + - true: Spatial coordinates available in segment data + - false: Only temporal information (no spatial) + range: boolean + required: false + examples: + - value: true + description: "Includes bounding box coordinates" + + includes_segmentation_masks: + slot_uri: hc:includesSegmentationMasks + description: | + Whether annotation includes pixel-level segmentation masks. + + Segmentation masks provide precise object boundaries + (more detailed than bounding boxes). 
+ + - true: Pixel masks available (typically as separate files) + - false: No segmentation data + + Masks are memory-intensive; often stored externally. + range: boolean + required: false + examples: + - value: false + description: "No segmentation masks included" + + annotation_motivation: + slot_uri: oa:motivatedBy + description: | + The motivation or purpose for creating this annotation. + + Web Annotation: motivatedBy describes why annotation was created. + + **Standard Motivations** (from W3C Web Annotation): + - classifying: Categorizing content + - describing: Adding description + - identifying: Identifying depicted things + - tagging: Adding tags/keywords + - linking: Linking to external resources + + **Heritage-Specific**: + - accessibility: For accessibility services + - discovery: For search/discovery + - preservation: For digital preservation + range: AnnotationMotivationEnum + required: false + examples: + - value: "CLASSIFYING" + description: "Annotation for classification purposes" + + comments: + - "Abstract base for all CV/multimodal video annotations" + - "Extends VideoTextContent with frame-based analysis parameters" + - "W3C Web Annotation compatible structure" + - "Supports both temporal and spatial annotation" + - "Tracks detection thresholds and model architecture" + + see_also: + - "https://www.w3.org/TR/annotation-model/" + - "http://www.cidoc-crm.org/cidoc-crm/E13_Attribute_Assignment" + - "https://iiif.io/api/presentation/3.0/" + +# ============================================================================ +# Enumerations +# ============================================================================ + +enums: + + AnnotationTypeEnum: + description: | + Types of video annotation based on analysis method. 
+ permissible_values: + SCENE_DETECTION: + description: Shot and scene boundary detection + OBJECT_DETECTION: + description: Object, face, and logo detection + OCR: + description: Optical character recognition (text-in-video) + ACTION_RECOGNITION: + description: Human action and activity detection + SEMANTIC_SEGMENTATION: + description: Pixel-level semantic classification + POSE_ESTIMATION: + description: Human body pose detection + EMOTION_RECOGNITION: + description: Facial emotion/expression analysis + MULTIMODAL: + description: Combined audio-visual analysis + CAPTIONING: + description: Automated video captioning/description + CUSTOM: + description: Custom annotation type + + AnnotationMotivationEnum: + description: | + Motivation for creating annotation (W3C Web Annotation aligned). + permissible_values: + CLASSIFYING: + description: Categorizing or classifying content + meaning: oa:classifying + DESCRIBING: + description: Adding descriptive information + meaning: oa:describing + IDENTIFYING: + description: Identifying depicted entities + meaning: oa:identifying + TAGGING: + description: Adding tags or keywords + meaning: oa:tagging + LINKING: + description: Linking to external resources + meaning: oa:linking + COMMENTING: + description: Adding commentary + meaning: oa:commenting + ACCESSIBILITY: + description: Providing accessibility support + DISCOVERY: + description: Enabling search and discovery + PRESERVATION: + description: Supporting digital preservation + RESEARCH: + description: Supporting research and analysis + +# ============================================================================ +# Slot Definitions +# ============================================================================ + +slots: + annotation_type: + description: High-level type of video annotation + range: AnnotationTypeEnum + + annotation_segments: + description: List of temporal segments with detection results + range: VideoTimeSegment + multivalued: true + + detection_threshold: 
+# Part of Heritage Custodian Ontology v0.9.10
+# │ - Face detection and recognition +# │ - Logo and landmark detection +# │ +# └── VideoOCRAnnotation (this file) +# - Text-in-video extraction +# - Title cards, captions, signs +# - Document and handwriting recognition +# +# HERITAGE INSTITUTION CONTEXT: +# These annotation types enable rich discovery and accessibility: +# - Find videos showing specific artworks or artifacts +# - Identify speakers and staff members in videos +# - Extract and index on-screen text +# - Navigate videos by scene or content type + +id: https://nde.nl/ontology/hc/class/VideoAnnotationTypes +name: video_annotation_types +title: Video Annotation Types + +imports: + - linkml:types + - ./VideoAnnotation + - ./VideoTimeSegment + +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + crm: http://www.cidoc-crm.org/cidoc-crm/ + oa: http://www.w3.org/ns/oa# + +default_prefix: hc + +# ============================================================================ +# Classes +# ============================================================================ + +classes: + + # ========================================================================== + # VideoSceneAnnotation - Shot/Scene Detection + # ========================================================================== + + VideoSceneAnnotation: + is_a: VideoAnnotation + class_uri: hc:VideoSceneAnnotation + abstract: false + description: | + Annotation for video scene and shot boundary detection. + + **DEFINITION**: + + VideoSceneAnnotation captures the temporal structure of video content + by identifying shot boundaries, scene changes, and transitions. 
+ + **TERMINOLOGY**: + + | Term | Definition | + |------|------------| + | **Shot** | Continuous footage from a single camera take | + | **Scene** | Semantic unit (may contain multiple shots) | + | **Transition** | Visual effect between shots (cut, fade, dissolve) | + + **SHOT vs SCENE**: + + - **Shot detection**: Technical boundary (camera edit points) + - **Scene detection**: Semantic boundary (content/location change) + + This class supports both levels of granularity via `detection_level`. + + **DETECTION METHODS**: + + | Method | Approach | Accuracy | + |--------|----------|----------| + | Histogram diff | Color histogram changes | Good for cuts | + | CNN-based | Learned visual features | Best for all types | + | Motion-based | Optical flow analysis | Good for dissolves | + | Audio-visual | Combined modalities | Best for scenes | + + **TRANSITION TYPES**: + + | Type | Visual Effect | + |------|---------------| + | CUT | Instantaneous change (most common) | + | FADE_IN | Gradual appearance from black | + | FADE_OUT | Gradual disappearance to black | + | DISSOLVE | One shot blends into next | + | WIPE | Geometric transition effect | + + **HERITAGE USE CASES**: + + - **Video navigation**: Jump to specific scenes/chapters + - **Highlight extraction**: Key scenes for previews + - **Content analysis**: Understanding video structure + - **Preservation**: Document editing decisions + + **SCENE TYPE CLASSIFICATION**: + + Heritage videos often contain predictable scene types: + - Exhibition halls and galleries + - Close-ups of artworks/artifacts + - Interviews with curators/experts + - Exterior establishing shots + - Archival footage or photographs + + exact_mappings: + - hc:VideoSceneAnnotation + + close_mappings: + - schema:Clip + + slots: + # Scene structure + - scene_segments + - scene_count + - average_scene_duration_seconds + + # Detection granularity + - detection_level + + # Scene classification + - scene_types_detected + - transition_types_detected + + # 
+        slot_uri: hc:averageSceneDurationSeconds
+ + **Common Heritage Scene Types**: + - INTERIOR: Indoor shots + - EXTERIOR: Outdoor shots + - CLOSEUP: Detail shots of objects + - INTERVIEW: Talking head / interview + - ARCHIVAL: Historical footage/photos + - TITLE_CARD: Text overlays + - B_ROLL: Supplementary footage + range: SceneTypeEnum + multivalued: true + required: false + examples: + - value: "[INTERIOR, CLOSEUP, INTERVIEW]" + description: "Scene types found in video" + + transition_types_detected: + slot_uri: hc:transitionTypesDetected + description: | + Types of transitions detected between scenes. + + Most heritage videos use simple cuts; complex + transitions may indicate professional production. + range: TransitionTypeEnum + multivalued: true + required: false + examples: + - value: "[CUT, FADE_IN, FADE_OUT]" + description: "Transitions found in video" + + cut_count: + slot_uri: hc:cutCount + description: | + Number of hard cuts (instantaneous transitions). + + Cuts are the most common transition type. + range: integer + required: false + minimum_value: 0 + examples: + - value: 42 + description: "42 cuts detected" + + fade_count: + slot_uri: hc:fadeCount + description: | + Number of fade transitions (fade in + fade out). + + Fades often indicate section boundaries or time passage. + range: integer + required: false + minimum_value: 0 + examples: + - value: 5 + description: "5 fades detected" + + dissolve_count: + slot_uri: hc:dissolveCount + description: | + Number of dissolve/crossfade transitions. + + Dissolves create smooth blending between scenes. 
+ range: integer + required: false + minimum_value: 0 + examples: + - value: 3 + description: "3 dissolves detected" + + comments: + - "Scene and shot boundary detection" + - "Supports both technical (shot) and semantic (scene) analysis" + - "Transition type classification" + - "Scene type labeling for heritage content" + + see_also: + - "https://en.wikipedia.org/wiki/Shot_(filmmaking)" + + # ========================================================================== + # VideoObjectAnnotation - Object/Face/Logo Detection + # ========================================================================== + + VideoObjectAnnotation: + is_a: VideoAnnotation + class_uri: hc:VideoObjectAnnotation + abstract: false + description: | + Annotation for object, face, and entity detection in video. + + **DEFINITION**: + + VideoObjectAnnotation captures visual entities detected in video frames: + + | Detection Type | Examples | + |----------------|----------| + | Objects | Paintings, sculptures, artifacts, furniture | + | Faces | People, staff, visitors | + | Logos | Institution logos, brand marks | + | Landmarks | Buildings, monuments, locations | + | Text regions | Signs, labels (see VideoOCRAnnotation for text extraction) | + + **OBJECT DETECTION vs CLASSIFICATION**: + + - **Classification**: What is in the frame? (labels only) + - **Detection**: What + where? (labels + bounding boxes) + - **Segmentation**: What + precise boundary? (pixel masks) + + This class supports all three via inherited flags. 
+ + **HERITAGE-SPECIFIC OBJECT CLASSES**: + + | Category | Objects | + |----------|---------| + | **Art** | Painting, sculpture, drawing, print, photograph | + | **Artifacts** | Pottery, jewelry, tools, textiles, furniture | + | **Documents** | Books, manuscripts, letters, maps | + | **Architecture** | Columns, arches, facades, interiors | + | **Natural** | Specimens, fossils, botanical samples | + + **FACE DETECTION AND RECOGNITION**: + + Two distinct capabilities: + - **Detection**: Locate faces (bounding boxes) + - **Recognition**: Identify who (requires reference database) + + Heritage use cases: + - Identify curators, directors, experts in videos + - Find videos featuring specific people + - Accessibility: Announce speaker changes + + **LINKING TO COLLECTION RECORDS**: + + Detected objects can be linked to collection database: + + ```yaml + detected_objects: + - label: "The Night Watch" + wikidata_id: Q219831 + collection_id: "SK-C-5" # Rijksmuseum ID + confidence: 0.95 + ``` + + **BOUNDING BOX FORMAT**: + + Coordinates are normalized (0.0-1.0) relative to frame dimensions: + - `x`: Left edge (0.0 = left, 1.0 = right) + - `y`: Top edge (0.0 = top, 1.0 = bottom) + - `width`: Box width as fraction of frame width + - `height`: Box height as fraction of frame height + + exact_mappings: + - hc:VideoObjectAnnotation + + close_mappings: + - crm:E1_CRM_Entity + + related_mappings: + - schema:ImageObject + + slots: + # Detected entities + - detected_objects + - detected_faces + - detected_logos + - detected_landmarks + + # Detection statistics + - unique_object_count + - unique_face_count + - object_classes_detected + + # Tracking + - includes_object_tracking + - tracking_ids_assigned + + # Linking + - linked_to_collection + + slot_usage: + detected_objects: + slot_uri: hc:detectedObjects + description: | + List of detected objects with labels and locations. 
+ + Each detection includes: + - Object class/label + - Confidence score + - Temporal segment (when visible) + - Bounding box (if includes_bounding_boxes=true) + + For heritage: paintings, artifacts, specimens, etc. + range: DetectedObject + multivalued: true + required: false + inlined_as_list: true + examples: + - value: "[{label: 'painting', confidence: 0.92, segment: {...}}]" + description: "Detected painting object" + + detected_faces: + slot_uri: hc:detectedFaces + description: | + List of detected faces with optional identity. + + Each detection includes: + - Face bounding box and confidence + - Temporal segment (when visible) + - Person identity (if recognized) + - Facial landmarks (if extracted) + range: DetectedFace + multivalued: true + required: false + inlined_as_list: true + examples: + - value: "[{confidence: 0.88, person_id: 'curator_001'}]" + description: "Detected and identified face" + + detected_logos: + slot_uri: hc:detectedLogos + description: | + List of detected logos or brand marks. + + Heritage use cases: + - Institution logos + - Sponsor logos + - Historical brand marks on artifacts + range: DetectedLogo + multivalued: true + required: false + inlined_as_list: true + examples: + - value: "[{label: 'Rijksmuseum logo', confidence: 0.95}]" + description: "Detected institution logo" + + detected_landmarks: + slot_uri: hc:detectedLandmarks + description: | + List of detected landmarks or buildings. + + Uses landmark recognition to identify: + - Famous buildings and monuments + - Museum facades + - Heritage sites + range: DetectedLandmark + multivalued: true + required: false + inlined_as_list: true + examples: + - value: "[{label: 'Rijksmuseum building', wikidata_id: 'Q190804'}]" + description: "Detected landmark with Wikidata link" + + unique_object_count: + slot_uri: hc:uniqueObjectCount + description: | + Number of unique objects detected (deduplicated). + + Same object appearing in multiple frames counts once. 
+ Requires object tracking or deduplication. + range: integer + required: false + minimum_value: 0 + examples: + - value: 15 + description: "15 unique objects identified" + + unique_face_count: + slot_uri: hc:uniqueFaceCount + description: | + Number of unique faces detected (deduplicated). + + Same person appearing multiple times counts once. + Requires face clustering or recognition. + range: integer + required: false + minimum_value: 0 + examples: + - value: 3 + description: "3 unique people identified" + + object_classes_detected: + slot_uri: hc:objectClassesDetected + description: | + List of unique object class labels detected. + + Vocabulary depends on model training: + - COCO: 80 common object categories + - ImageNet: 1000 categories + - Custom: Heritage-specific categories + range: string + multivalued: true + required: false + examples: + - value: "[painting, person, sculpture, book]" + description: "Object classes found in video" + + includes_object_tracking: + slot_uri: hc:includesObjectTracking + description: | + Whether objects are tracked across frames. + + Tracking maintains object identity over time: + - true: Same object has consistent ID across frames + - false: Independent per-frame detections + + Tracking enables counting unique objects and + understanding object movement/presence. + range: boolean + required: false + examples: + - value: true + description: "Objects tracked across frames" + + tracking_ids_assigned: + slot_uri: hc:trackingIdsAssigned + description: | + Number of unique tracking IDs assigned. + + Each tracked entity gets a unique ID maintained + across its visible duration. + range: integer + required: false + minimum_value: 0 + examples: + - value: 23 + description: "23 unique tracking IDs assigned" + + linked_to_collection: + slot_uri: hc:linkedToCollection + description: | + Whether detected objects are linked to collection database. 
+ + When true, detected objects have collection_id or + wikidata_id linking them to authoritative records. + + Enables: + - "Find videos showing artwork X" + - Rich metadata for detected items + - Cross-referencing with collection management + range: boolean + required: false + examples: + - value: true + description: "Objects linked to collection records" + + comments: + - "Object, face, and logo detection in video" + - "Supports bounding boxes and tracking" + - "Heritage-specific object vocabulary" + - "Enables linking to collection database" + + see_also: + - "https://cocodataset.org/" + - "https://iiif.io/api/presentation/3.0/" + + # ========================================================================== + # VideoOCRAnnotation - Text-in-Video Extraction + # ========================================================================== + + VideoOCRAnnotation: + is_a: VideoAnnotation + class_uri: hc:VideoOCRAnnotation + abstract: false + description: | + Annotation for extracting text visible in video frames (OCR). + + **DEFINITION**: + + VideoOCRAnnotation captures text that appears on-screen in video: + + | Text Type | Examples | + |-----------|----------| + | **Titles** | Opening titles, chapter headings | + | **Captions** | Burned-in subtitles, name cards | + | **Signs** | Museum signage, room labels | + | **Documents** | Letters, manuscripts, books shown | + | **Labels** | Artifact labels, exhibition text | + | **Graphics** | Infographics, charts, timelines | + + **OCR vs SUBTITLES**: + + - **VideoSubtitle**: Text derived from AUDIO (speech-to-text) + - **VideoOCRAnnotation**: Text derived from VIDEO (image-to-text) + + OCR captures text VISIBLE in frames, not spoken. + + **TEXT DETECTION PIPELINE**: + + 1. **Detection**: Locate text regions (bounding boxes) + 2. **Recognition**: Extract characters from regions + 3. 
**Post-processing**: Correct, normalize, structure
+
+      **HERITAGE USE CASES**:
+
+      | Use Case | Value |
+      |----------|-------|
+      | **Name cards** | Identify speakers automatically |
+      | **Document digitization** | Extract text from filmed documents |
+      | **Exhibition text** | Capture interpretive panels |
+      | **Historical signs** | Archive street names, shop signs |
+      | **Handwritten text** | Extract letters, diaries, notes |
+
+      **LANGUAGE DETECTION**:
+
+      OCR can detect and extract text in multiple languages:
+      - `text_languages_detected`: Languages found in video
+      - Mixed-language content is common in heritage videos
+
+      **TEXT REGION TYPES** (see TextTypeEnum for the full vocabulary):
+
+      | Type | Appearance |
+      |------|------------|
+      | TITLE_CARD / LOWER_THIRD | Digitally added text (titles, lower thirds) |
+      | SIGN / LABEL | Text in physical scene (signs, exhibition labels) |
+      | DOCUMENT | Text from filmed documents, books, archival material |
+      | HANDWRITTEN | Manuscript, notes, signatures |
+
+    exact_mappings:
+      - hc:VideoOCRAnnotation
+
+    close_mappings:
+      - schema:TextDigitalDocument
+
+    slots:
+      # Extracted text
+      - text_segments
+      - full_extracted_text
+
+      # Text classification
+      - text_types_detected
+      - text_languages_detected
+
+      # Statistics
+      - text_region_count
+      - total_characters_extracted
+
+      # Quality
+      - includes_handwriting
+      - handwriting_confidence
+
+      # Spatial
+      - text_regions
+
+    slot_usage:
+      text_segments:
+        slot_uri: hc:textSegments
+        description: |
+          Time-coded segments with extracted text.
+
+          Each segment contains:
+          - `start_seconds` / `end_seconds`: When text is visible
+          - `segment_text`: The extracted text content
+          - `confidence`: OCR confidence score
+
+          Segments may overlap if multiple text regions visible.
+        range: VideoTimeSegment
+        multivalued: true
+        required: true
+        inlined_as_list: true
+        examples:
+          - value: "[{start_seconds: 0.0, end_seconds: 5.0, segment_text: 'Rijksmuseum Presents'}]"
+            description: "Title card text extraction"
+
+      full_extracted_text:
+        slot_uri: hc:fullExtractedText
+        description: |
+          All extracted text concatenated as single string.
+
+          Useful for:
+          - Full-text search indexing
+          - Text analysis (NLP, keyword extraction)
+          - Quick review of all on-screen text
+
+          Ordered chronologically by appearance.
+        range: string
+        required: false
+        examples:
+          - value: "Rijksmuseum Presents... The Night Watch... Rembrandt van Rijn, 1642..."
+            description: "All text from video"
+
+      text_types_detected:
+        slot_uri: hc:textTypesDetected
+        description: |
+          Types of text regions found in video.
+
+          Classifying text type helps with:
+          - Filtering (e.g., show only name cards)
+          - Priority (titles more important than background signs)
+          - Accuracy expectations (overlays clearer than handwriting)
+        range: TextTypeEnum
+        multivalued: true
+        required: false
+        examples:
+          - value: "[TITLE_CARD, LOWER_THIRD, DOCUMENT]"
+            description: "Text types found in video"
+
+      text_languages_detected:
+        slot_uri: dcterms:language
+        description: |
+          Languages of detected text (ISO 639-1 codes).
+
+          Heritage videos often contain multilingual text:
+          - Exhibition labels in multiple languages
+          - Historical documents in period languages
+          - Modern overlays vs historical content
+        range: string
+        multivalued: true
+        required: false
+        examples:
+          - value: "[nl, en, la]"
+            description: "Dutch, English, and Latin text detected"
+
+      text_region_count:
+        slot_uri: hc:textRegionCount
+        description: |
+          Total number of text regions detected.
+
+          A region is a contiguous area of text.
+          Multiple regions may be visible simultaneously.
+ range: integer + required: false + minimum_value: 0 + examples: + - value: 28 + description: "28 text regions detected" + + total_characters_extracted: + slot_uri: hc:totalCharactersExtracted + description: | + Total character count of all extracted text. + + Useful for: + - Understanding OCR output volume + - Cost estimation (some OCR APIs charge per character) + range: integer + required: false + minimum_value: 0 + examples: + - value: 3456 + description: "3,456 characters extracted" + + includes_handwriting: + slot_uri: hc:includesHandwriting + description: | + Whether handwritten text was detected. + + Handwriting OCR is more challenging and typically + has lower confidence than printed text. + + Heritage relevance: manuscripts, letters, diaries, + annotations, signatures. + range: boolean + required: false + examples: + - value: true + description: "Handwritten text detected" + + handwriting_confidence: + slot_uri: hc:handwritingConfidence + description: | + Average confidence for handwriting recognition. + + Typically lower than printed text confidence. + + Useful for quality assessment and filtering. + range: float + required: false + minimum_value: 0.0 + maximum_value: 1.0 + examples: + - value: 0.68 + description: "Moderate handwriting recognition confidence" + + text_regions: + slot_uri: hc:textRegions + description: | + Detailed text region data with spatial coordinates. + + Each region includes: + - Bounding box coordinates + - Extracted text + - Region type classification + - Language detection + + For detailed spatial analysis beyond time segments. 
+        range: TextRegion
+        multivalued: true
+        required: false
+        inlined_as_list: true
+        examples:
+          - value: "[{text: 'Welcome', bbox: [0.1, 0.9, 0.4, 0.1], type: 'TITLE_CARD'}]"
+            description: "Text region with coordinates"
+
+    comments:
+      - "OCR extraction for text visible in video frames"
+      - "Distinct from subtitles (audio-derived)"
+      - "Supports printed and handwritten text"
+      - "Heritage use: documents, labels, signage, name cards"
+
+    see_also:
+      - "https://en.wikipedia.org/wiki/Optical_character_recognition"
+
+# ============================================================================
+# Supporting Classes (Embedded Types)
+# ============================================================================
+
+  DetectedObject:
+    class_uri: hc:DetectedObject
+    description: |
+      A single detected object with label, confidence, and location.
+    slots:
+      - object_label
+      - object_confidence
+      - object_segment
+      - object_bbox
+      - object_wikidata_id
+      - object_collection_id
+
+    slot_usage:
+      object_label:
+        slot_uri: rdfs:label
+        description: Object class label (e.g., "painting", "sculpture")
+        range: string
+        required: true
+      object_confidence:
+        slot_uri: hc:confidence
+        description: Detection confidence (0.0-1.0)
+        range: float
+        required: true
+      object_segment:
+        slot_uri: hc:segment
+        description: Time segment when object is visible
+        range: VideoTimeSegment
+        required: false
+      object_bbox:
+        slot_uri: hc:boundingBox
+        description: Bounding box as [x, y, width, height] normalized 0-1
+        range: float
+        multivalued: true
+        required: false
+      object_wikidata_id:
+        slot_uri: hc:wikidataId
+        description: Wikidata ID if object is identified
+        range: string
+        required: false
+      object_collection_id:
+        slot_uri: hc:collectionId
+        description: Collection database ID for artwork/artifact
+        range: string
+        required: false
+
+  DetectedFace:
+    class_uri: hc:DetectedFace
+    description: |
+      A detected face with optional identity and attributes.
+    slots:
+      - face_confidence
+      - face_segment
+      - face_bbox
+      - person_id
+      - person_name
+      - is_recognized
+
+    slot_usage:
+      face_confidence:
+        slot_uri: hc:confidence
+        description: Face detection confidence (0.0-1.0)
+        range: float
+        required: true
+      face_segment:
+        slot_uri: hc:segment
+        description: Time segment when face is visible
+        range: VideoTimeSegment
+        required: false
+      face_bbox:
+        slot_uri: hc:boundingBox
+        description: Face bounding box as [x, y, width, height]
+        range: float
+        multivalued: true
+        required: false
+      person_id:
+        slot_uri: hc:personId
+        description: Unique identifier for recognized person
+        range: string
+        required: false
+      person_name:
+        slot_uri: schema:name
+        description: Name of recognized person
+        range: string
+        required: false
+      is_recognized:
+        slot_uri: hc:isRecognized
+        description: Whether face was matched to known person
+        range: boolean
+        required: false
+
+  DetectedLogo:
+    class_uri: hc:DetectedLogo
+    description: |
+      A detected logo or brand mark.
+    slots:
+      - logo_label
+      - logo_confidence
+      - logo_segment
+      - logo_bbox
+      - logo_organization
+
+    slot_usage:
+      logo_label:
+        slot_uri: rdfs:label
+        description: Logo name or brand
+        range: string
+        required: true
+      logo_confidence:
+        slot_uri: hc:confidence
+        description: Detection confidence (0.0-1.0)
+        range: float
+        required: true
+      logo_segment:
+        slot_uri: hc:segment
+        description: Time segment when logo is visible
+        range: VideoTimeSegment
+        required: false
+      logo_bbox:
+        slot_uri: hc:boundingBox
+        description: Logo bounding box
+        range: float
+        multivalued: true
+        required: false
+      logo_organization:
+        slot_uri: hc:logoOrganization
+        description: Organization associated with logo
+        range: string
+        required: false
+
+  DetectedLandmark:
+    class_uri: hc:DetectedLandmark
+    description: |
+      A detected landmark or notable building.
+ slots: + - landmark_label + - landmark_confidence + - landmark_segment + - landmark_wikidata_id + - landmark_geonames_id + + slot_usage: + landmark_label: + slot_uri: rdfs:label + description: Landmark name + range: string + required: true + landmark_confidence: + slot_uri: hc:confidence + description: Detection confidence (0.0-1.0) + range: float + required: true + landmark_segment: + slot_uri: hc:segment + description: Time segment when landmark is visible + range: VideoTimeSegment + required: false + landmark_wikidata_id: + slot_uri: hc:wikidataId + description: Wikidata ID for landmark + range: string + required: false + landmark_geonames_id: + slot_uri: hc:geonamesId + description: GeoNames ID for location + range: string + required: false + + TextRegion: + class_uri: hc:TextRegion + description: | + A detected text region with extracted content and location. + slots: + - region_text + - region_confidence + - region_bbox + - region_type + - region_language + + slot_usage: + region_text: + slot_uri: oa:bodyValue + description: Extracted text content + range: string + required: true + region_confidence: + slot_uri: hc:confidence + description: OCR confidence (0.0-1.0) + range: float + required: true + region_bbox: + slot_uri: hc:boundingBox + description: Text region bounding box + range: float + multivalued: true + required: false + region_type: + slot_uri: dcterms:type + description: Type of text region + range: TextTypeEnum + required: false + region_language: + slot_uri: dcterms:language + description: Detected language (ISO 639-1) + range: string + required: false + +# ============================================================================ +# Enumerations +# ============================================================================ + +enums: + + DetectionLevelEnum: + description: Granularity of scene/shot detection + permissible_values: + SHOT: + description: Technical camera edit boundaries + SCENE: + description: Semantic content/location 
boundaries + BOTH: + description: Both shot and scene detection + + SceneTypeEnum: + description: Types of scenes in heritage videos + permissible_values: + INTERIOR: + description: Indoor/interior shots + EXTERIOR: + description: Outdoor/exterior shots + CLOSEUP: + description: Detail shots of objects/artworks + WIDE_SHOT: + description: Establishing or wide-angle shots + INTERVIEW: + description: Talking head / interview format + ARCHIVAL: + description: Historical footage or photographs + ANIMATION: + description: Animated or graphics sequence + TITLE_CARD: + description: Text overlay or title sequence + B_ROLL: + description: Supplementary/cutaway footage + DEMONSTRATION: + description: Process or technique demonstration + TOUR: + description: Walking tour or navigation sequence + + TransitionTypeEnum: + description: Types of video transitions + permissible_values: + CUT: + description: Instantaneous transition (hard cut) + FADE_IN: + description: Gradual appearance from black + FADE_OUT: + description: Gradual disappearance to black + DISSOLVE: + description: Cross-fade between shots + WIPE: + description: Geometric wipe transition + MORPH: + description: Morphing transition effect + FLASH: + description: Flash or strobe transition + OTHER: + description: Other transition type + + TextTypeEnum: + description: Types of on-screen text regions + permissible_values: + TITLE_CARD: + description: Opening/closing titles + LOWER_THIRD: + description: Name/title overlay at bottom + SUBTITLE: + description: Burned-in subtitles/captions + SIGN: + description: Physical signs in scene + LABEL: + description: Museum/exhibition labels + DOCUMENT: + description: Text from documents/books + HANDWRITTEN: + description: Handwritten text + GRAPHIC: + description: Infographic or chart text + WATERMARK: + description: Video watermark or logo + URL: + description: Website URL display + CREDITS: + description: Credits or attribution text + OTHER: + description: Other text type + +# 
============================================================================ +# Slot Definitions +# ============================================================================ + +slots: + # Scene annotation slots + scene_segments: + description: List of detected scene/shot segments + range: VideoTimeSegment + multivalued: true + scene_count: + description: Number of scenes detected + range: integer + average_scene_duration_seconds: + description: Average scene duration + range: float + detection_level: + description: Shot vs scene detection granularity + range: DetectionLevelEnum + scene_types_detected: + description: Scene type labels found + range: SceneTypeEnum + multivalued: true + transition_types_detected: + description: Transition types found + range: TransitionTypeEnum + multivalued: true + cut_count: + description: Number of hard cuts + range: integer + fade_count: + description: Number of fades + range: integer + dissolve_count: + description: Number of dissolves + range: integer + + # Object annotation slots + detected_objects: + description: List of detected objects + range: DetectedObject + multivalued: true + detected_faces: + description: List of detected faces + range: DetectedFace + multivalued: true + detected_logos: + description: List of detected logos + range: DetectedLogo + multivalued: true + detected_landmarks: + description: List of detected landmarks + range: DetectedLandmark + multivalued: true + unique_object_count: + description: Number of unique objects + range: integer + unique_face_count: + description: Number of unique faces + range: integer + object_classes_detected: + description: Object class labels found + range: string + multivalued: true + includes_object_tracking: + description: Whether objects tracked across frames + range: boolean + tracking_ids_assigned: + description: Number of tracking IDs + range: integer + linked_to_collection: + description: Whether linked to collection database + range: boolean + + # OCR annotation 
slots + text_segments: + description: Time-coded text extraction segments + range: VideoTimeSegment + multivalued: true + full_extracted_text: + description: All extracted text concatenated + range: string + text_types_detected: + description: Types of text regions found + range: TextTypeEnum + multivalued: true + text_languages_detected: + description: Languages detected in text + range: string + multivalued: true + text_region_count: + description: Number of text regions + range: integer + total_characters_extracted: + description: Total characters extracted + range: integer + includes_handwriting: + description: Whether handwriting detected + range: boolean + handwriting_confidence: + description: Handwriting OCR confidence + range: float + text_regions: + description: Detailed text region data + range: TextRegion + multivalued: true + + # Supporting class slots + object_label: + description: Object class label + range: string + object_confidence: + description: Object detection confidence + range: float + object_segment: + description: Object visibility segment + range: VideoTimeSegment + object_bbox: + description: Object bounding box + range: float + multivalued: true + object_wikidata_id: + description: Object Wikidata ID + range: string + object_collection_id: + description: Object collection database ID + range: string + + face_confidence: + description: Face detection confidence + range: float + face_segment: + description: Face visibility segment + range: VideoTimeSegment + face_bbox: + description: Face bounding box + range: float + multivalued: true + person_id: + description: Recognized person identifier + range: string + person_name: + description: Recognized person name + range: string + is_recognized: + description: Whether face was recognized + range: boolean + + logo_label: + description: Logo name + range: string + logo_confidence: + description: Logo detection confidence + range: float + logo_segment: + description: Logo visibility segment + 
range: VideoTimeSegment + logo_bbox: + description: Logo bounding box + range: float + multivalued: true + logo_organization: + description: Organization for logo + range: string + + landmark_label: + description: Landmark name + range: string + landmark_confidence: + description: Landmark detection confidence + range: float + landmark_segment: + description: Landmark visibility segment + range: VideoTimeSegment + landmark_wikidata_id: + description: Landmark Wikidata ID + range: string + landmark_geonames_id: + description: Landmark GeoNames ID + range: string + + region_text: + description: Extracted text content + range: string + region_confidence: + description: OCR confidence + range: float + region_bbox: + description: Text region bounding box + range: float + multivalued: true + region_type: + description: Text region type + range: TextTypeEnum + region_language: + description: Detected language + range: string diff --git a/schemas/20251121/linkml/modules/classes/VideoAudioAnnotation.yaml b/schemas/20251121/linkml/modules/classes/VideoAudioAnnotation.yaml new file mode 100644 index 0000000000..083ce1db6e --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/VideoAudioAnnotation.yaml @@ -0,0 +1,1108 @@ +# Video Audio Annotation Class +# Models audio event detection in video content (speech, music, silence, diarization) +# +# Part of Heritage Custodian Ontology v0.9.10 +# +# HIERARCHY: +# VideoAnnotation (abstract base) +# │ +# ├── VideoSceneAnnotation (scene/shot detection) +# ├── VideoObjectAnnotation (object/face/logo detection) +# ├── VideoOCRAnnotation (text-in-video extraction) +# └── VideoAudioAnnotation (this class) +# - Speech detection and diarization +# - Music detection and classification +# - Sound event detection +# - Silence/noise detection +# +# HERITAGE INSTITUTION USE CASES: +# - Speaker identification in curator interviews +# - Music detection in promotional videos +# - Silence detection for video quality analysis +# - Language 
detection for multilingual content +# - Applause/audience reaction in lecture recordings +# - Sound effects in exhibition media +# +# ONTOLOGY ALIGNMENT: +# - W3C Web Annotation for annotation structure +# - CIDOC-CRM E13_Attribute_Assignment for attribution +# - W3C Media Ontology for audio properties +# - Speech-to-Text standards for diarization + +id: https://nde.nl/ontology/hc/class/VideoAudioAnnotation +name: video_audio_annotation_class +title: Video Audio Annotation Class + +imports: + - linkml:types + - ./VideoAnnotation + - ./VideoTimeSegment + +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + oa: http://www.w3.org/ns/oa# + ma: http://www.w3.org/ns/ma-ont# + wikidata: http://www.wikidata.org/entity/ + +default_prefix: hc + +# ============================================================================ +# Classes +# ============================================================================ + +classes: + + VideoAudioAnnotation: + is_a: VideoAnnotation + class_uri: hc:VideoAudioAnnotation + abstract: false + description: | + Annotation for audio events detected in video content. + + **DEFINITION**: + + VideoAudioAnnotation captures structured information derived from audio + analysis of video content. This includes speech, music, silence, and + various sound events. + + **AUDIO ANALYSIS TYPES**: + + | Type | Description | Use Case | + |------|-------------|----------| + | **Speech Detection** | Identify spoken segments | Transcript alignment | + | **Speaker Diarization** | Who spoke when | Interview navigation | + | **Music Detection** | Identify musical segments | Content classification | + | **Sound Events** | Applause, laughter, etc. 
| Audience engagement | + | **Silence Detection** | Find quiet segments | Quality assessment | + | **Language Detection** | Identify spoken languages | Multilingual content | + + **SPEAKER DIARIZATION**: + + Diarization answers "who spoke when": + + ``` + 0:00-0:15 Speaker 1 (Curator) + 0:15-0:45 Speaker 2 (Artist) + 0:45-1:00 Speaker 1 (Curator) + 1:00-1:30 Speaker 3 (Museum Director) + ``` + + Heritage applications: + - Navigate to specific speakers in interviews + - Count speaking time per person + - Identify unnamed speakers for annotation + - Build speaker databases for recognition + + **MUSIC DETECTION**: + + Music detection classifies audio segments as containing music: + + | Category | Examples | + |----------|----------| + | **Background music** | Documentary soundtracks | + | **Featured music** | Concert recordings, performances | + | **Historical music** | Archival recordings | + | **Licensed music** | Rights-managed content | + + Music segments may also include: + - Genre classification (classical, jazz, folk) + - Mood/tempo analysis + - Fingerprinting for identification + + **SOUND EVENT DETECTION**: + + Non-speech, non-music audio events: + + | Event Type | Heritage Context | + |------------|------------------| + | APPLAUSE | Lecture recordings, openings | + | LAUGHTER | Tour guides, educational content | + | CROWD_NOISE | Event documentation | + | DOOR/FOOTSTEPS | Ambient archive recordings | + | NATURE_SOUNDS | Outdoor heritage site recordings | + | MACHINERY | Industrial heritage, conservation | + + **LANGUAGE DETECTION**: + + Multilingual heritage content requires language identification: + + ```yaml + speech_segments: + - start: 0.0 + end: 120.0 + language: nl + speaker_id: speaker_001 + - start: 120.0 + end: 240.0 + language: en + speaker_id: speaker_001 # Same speaker, switched language + ``` + + **AUDIO QUALITY ANALYSIS**: + + Audio quality metrics for preservation and accessibility: + + | Metric | Description | Threshold | + 
|--------|-------------|-----------| + | SNR | Signal-to-noise ratio | > 20 dB good | + | Clipping | Peak distortion | None ideal | + | Noise floor | Background noise level | < -50 dB good | + | Frequency response | Bandwidth | Full-range ideal | + + **HERITAGE INSTITUTION USE CASES**: + + | Content Type | Audio Analysis Need | + |--------------|---------------------| + | Oral histories | Diarization, transcription alignment | + | Curator interviews | Speaker identification, language | + | Virtual tours | Background music, voiceover detection | + | Lecture recordings | Audience reactions, Q&A segments | + | Conservation videos | Narration vs demonstration audio | + | Archival footage | Speech recovery, noise reduction | + + **RELATIONSHIP TO VideoTranscript**: + + VideoAudioAnnotation is complementary to VideoTranscript: + + - **VideoTranscript**: The text content of speech (WHAT was said) + - **VideoAudioAnnotation**: Audio structure (WHO spoke, music, sounds) + + Together they provide complete audio understanding: + + ``` + VideoAudioAnnotation: Speaker 1 spoke 0:00-0:15 + VideoTranscript: "Welcome to the Rijksmuseum..." (0:00-0:15) + → Combined: Curator said "Welcome to the Rijksmuseum..." 
+ ``` + + exact_mappings: + - hc:VideoAudioAnnotation + + close_mappings: + - ma:AudioTrack + - crm:E13_Attribute_Assignment + + related_mappings: + - wikidata:Q11028 # Speech + - wikidata:Q638 # Music + + slots: + # Audio event detection + - audio_event_segments + - primary_audio_event_type + + # Speech analysis + - speech_detected + - speech_segments + - speech_language + - speech_language_confidence + - languages_detected + + # Speaker diarization + - diarization_enabled + - diarization_segments + - speaker_count + - speaker_labels + + # Music detection + - music_detected + - music_segments + - music_genres_detected + - music_confidence + + # Sound events + - sound_events_detected + - sound_event_types + + # Silence/noise + - silence_segments + - silence_total_seconds + - noise_floor_db + + # Audio quality + - audio_quality_score + - snr_db + - has_clipping + + slot_usage: + audio_event_segments: + slot_uri: oa:hasBody + description: | + Time-coded segments with detected audio events. + + Web Annotation: hasBody links annotation to content. + + Each segment contains: + - Start/end time boundaries + - Event type (SPEECH, MUSIC, SILENCE, etc.) + - Confidence score + - Additional metadata (speaker ID, language, etc.) + + Segments may overlap (e.g., speech over background music). + range: VideoTimeSegment + multivalued: true + required: false + inlined_as_list: true + examples: + - value: "[{start_seconds: 0.0, end_seconds: 15.0, segment_text: 'Speech detected - Speaker 1'}]" + description: "Speech detection segment" + + primary_audio_event_type: + slot_uri: dcterms:type + description: | + The primary type of audio analysis performed. + + Dublin Core: type for categorization. 
+ + **Types**: + - SPEECH: Speech detection and diarization + - MUSIC: Music detection and classification + - SOUND_EVENTS: Environmental sound detection + - MIXED: Multiple analysis types combined + range: AudioEventTypeEnum + required: true + examples: + - value: "SPEECH" + description: "Primary focus on speech analysis" + + speech_detected: + slot_uri: hc:speechDetected + description: | + Whether speech was detected in the video audio. + + High-level flag for presence of speech content. + + - true: At least one speech segment detected + - false: No speech detected (music-only, silent, etc.) + range: boolean + required: false + examples: + - value: true + description: "Speech is present in video" + + speech_segments: + slot_uri: hc:speechSegments + description: | + Detailed speech segments with speaker and language info. + + Each segment represents continuous speech from one speaker. + + Used for: + - Transcript alignment + - Speaker navigation + - Language segmentation + range: SpeechSegment + multivalued: true + required: false + inlined_as_list: true + examples: + - value: "[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: 'spk_001', language: 'nl'}]" + description: "Dutch speech from speaker 1" + + speech_language: + slot_uri: dcterms:language + description: | + Primary language of speech content (ISO 639-1 code). + + Dublin Core: language for primary language. + + For multilingual content, this is the predominant language. + See `languages_detected` for all languages. + range: string + required: false + examples: + - value: "nl" + description: "Dutch is primary language" + - value: "en" + description: "English is primary language" + + speech_language_confidence: + slot_uri: hc:languageConfidence + description: | + Confidence score for language detection (0.0-1.0). 
+ + Higher confidence when: + - Longer speech segments + - Clear audio quality + - Distinct language features + + Lower confidence when: + - Short utterances + - Background noise + - Code-switching + range: float + required: false + minimum_value: 0.0 + maximum_value: 1.0 + examples: + - value: 0.95 + description: "High confidence language detection" + + languages_detected: + slot_uri: hc:languagesDetected + description: | + All languages detected in speech (ISO 639-1 codes). + + Heritage content often includes multiple languages: + - Exhibition videos with translations + - Interviews with multilingual speakers + - Historical content with period languages + + Ordered by speaking time (most spoken first). + range: string + multivalued: true + required: false + examples: + - value: "[nl, en, de]" + description: "Dutch, English, and German detected" + + diarization_enabled: + slot_uri: hc:diarizationEnabled + description: | + Whether speaker diarization was performed. + + Diarization = identifying distinct speakers and their segments. + + - true: Speaker IDs assigned to speech segments + - false: Speech detected but speakers not distinguished + range: boolean + required: false + examples: + - value: true + description: "Diarization was performed" + + diarization_segments: + slot_uri: hc:diarizationSegments + description: | + Detailed diarization results with speaker assignments. + + Each segment identifies: + - Time boundaries + - Speaker ID (anonymous: "spk_001", "spk_002") + - Optional speaker name (if identified) + - Confidence score + + Enables "who spoke when" analysis. + range: DiarizationSegment + multivalued: true + required: false + inlined_as_list: true + examples: + - value: "[{start_seconds: 0.0, end_seconds: 15.0, speaker_id: 'spk_001', speaker_label: 'Curator'}]" + description: "Curator speaking for first 15 seconds" + + speaker_count: + slot_uri: hc:speakerCount + description: | + Number of distinct speakers detected. 
+ + Useful for: + - Interview classification (1 = monologue, 2+ = dialog) + - Content type inference + - Accessibility planning + range: integer + required: false + minimum_value: 0 + examples: + - value: 3 + description: "Three distinct speakers detected" + + speaker_labels: + slot_uri: hc:speakerLabels + description: | + Labels or names assigned to detected speakers. + + May be: + - Anonymous: ["Speaker 1", "Speaker 2"] + - Identified: ["Dr. Taco Dibbits", "Interviewer"] + - Role-based: ["Curator", "Artist", "Host"] + + Ordered by speaking time (most speaking first). + range: string + multivalued: true + required: false + examples: + - value: "[Curator, Artist, Museum Director]" + description: "Three identified speakers" + + music_detected: + slot_uri: hc:musicDetected + description: | + Whether music was detected in the audio. + + - true: Musical content detected (any amount) + - false: No music detected (speech-only, silence) + range: boolean + required: false + examples: + - value: true + description: "Music present in video" + + music_segments: + slot_uri: hc:musicSegments + description: | + Time segments containing music. + + Each segment includes: + - Time boundaries + - Music type (background, featured) + - Genre classification (if detected) + - Confidence score + range: MusicSegment + multivalued: true + required: false + inlined_as_list: true + examples: + - value: "[{start_seconds: 0.0, end_seconds: 30.0, music_type: 'BACKGROUND', genre: 'classical'}]" + description: "Classical background music" + + music_genres_detected: + slot_uri: hc:musicGenresDetected + description: | + Music genres detected in audio. 
+ + **Common Heritage Genres**: + - classical: Art music, orchestral + - baroque: Period-specific classical + - jazz: Jazz performances + - folk: Traditional/folk music + - ambient: Background/atmospheric + - electronic: Modern electronic music + range: string + multivalued: true + required: false + examples: + - value: "[classical, baroque]" + description: "Classical and baroque music detected" + + music_confidence: + slot_uri: hc:musicConfidence + description: | + Overall confidence of music detection (0.0-1.0). + + Average confidence across all music segments. + range: float + required: false + minimum_value: 0.0 + maximum_value: 1.0 + examples: + - value: 0.88 + description: "High confidence music detection" + + sound_events_detected: + slot_uri: hc:soundEventsDetected + description: | + Whether non-speech, non-music sound events were detected. + + Sound events include applause, laughter, environmental sounds, etc. + range: boolean + required: false + examples: + - value: true + description: "Sound events detected" + + sound_event_types: + slot_uri: hc:soundEventTypes + description: | + Types of sound events detected. + + **Heritage-Relevant Events**: + - APPLAUSE: Lecture endings, openings + - LAUGHTER: Tour guide humor + - CROWD_NOISE: Event atmosphere + - FOOTSTEPS: Gallery ambiance + - NATURE_SOUNDS: Outdoor heritage sites + - BELLS: Church/temple recordings + range: SoundEventTypeEnum + multivalued: true + required: false + examples: + - value: "[APPLAUSE, CROWD_NOISE]" + description: "Applause and crowd sounds detected" + + silence_segments: + slot_uri: hc:silenceSegments + description: | + Time segments containing silence or very low audio. + + Silence detection useful for: + - Finding pauses between segments + - Quality assessment (unexpected silence) + - Identifying chapter/scene boundaries + + Threshold typically: audio below -40 dB for > 2 seconds. 
+ range: VideoTimeSegment + multivalued: true + required: false + inlined_as_list: true + examples: + - value: "[{start_seconds: 45.0, end_seconds: 48.0}]" + description: "3-second silence" + + silence_total_seconds: + slot_uri: hc:silenceTotalSeconds + description: | + Total duration of silence in the video (seconds). + + High silence percentage may indicate: + - Extended pauses + - Silent segments (B-roll without audio) + - Audio issues + range: float + required: false + minimum_value: 0.0 + examples: + - value: 15.5 + description: "15.5 seconds of total silence" + + noise_floor_db: + slot_uri: hc:noiseFloorDb + description: | + Background noise floor level in decibels. + + **Quality Guidelines**: + - < -60 dB: Excellent (studio quality) + - -60 to -40 dB: Good (professional recording) + - -40 to -30 dB: Acceptable (field recording) + - > -30 dB: Poor (noisy environment) + range: float + required: false + examples: + - value: -45.0 + description: "Good quality, moderate noise floor" + + audio_quality_score: + slot_uri: hc:audioQualityScore + description: | + Overall audio quality score (0.0-1.0). + + Composite score based on: + - Signal-to-noise ratio + - Clipping presence + - Frequency response + - Clarity of speech + + **Interpretation**: + - > 0.8: High quality, suitable for all uses + - 0.6-0.8: Good quality, minor issues + - 0.4-0.6: Acceptable, some degradation + - < 0.4: Poor quality, may need enhancement + range: float + required: false + minimum_value: 0.0 + maximum_value: 1.0 + examples: + - value: 0.85 + description: "High audio quality" + + snr_db: + slot_uri: hc:snrDb + description: | + Signal-to-noise ratio in decibels. 
+ + Higher is better: + - > 30 dB: Excellent + - 20-30 dB: Good + - 10-20 dB: Acceptable + - < 10 dB: Poor (speech intelligibility affected) + range: float + required: false + examples: + - value: 25.0 + description: "Good signal-to-noise ratio" + + has_clipping: + slot_uri: hc:hasClipping + description: | + Whether audio clipping (peak distortion) was detected. + + Clipping occurs when audio exceeds maximum level: + - true: Clipping detected (distortion present) + - false: No clipping (clean audio) + + Clipping is permanent quality loss. + range: boolean + required: false + examples: + - value: false + description: "No clipping detected" + + comments: + - "Audio event detection for video content" + - "Supports speech, music, silence, and sound event detection" + - "Speaker diarization for interview navigation" + - "Language detection for multilingual heritage content" + - "Audio quality metrics for preservation assessment" + + see_also: + - "https://www.w3.org/TR/annotation-model/" + - "https://arxiv.org/abs/2111.08085" # Speaker diarization survey + +# ============================================================================ +# Supporting Classes +# ============================================================================ + + SpeechSegment: + class_uri: hc:SpeechSegment + description: | + A speech segment with speaker and language information. + + Extends VideoTimeSegment with speech-specific metadata. + + slots: + - segment_start_seconds + - segment_end_seconds + - speaker_id + - speaker_label + - segment_language + - segment_confidence + - speech_text + + slot_usage: + segment_start_seconds: + slot_uri: ma:hasStartTime + description: Start time in seconds + range: float + required: true + minimum_value: 0.0 + + segment_end_seconds: + slot_uri: ma:hasEndTime + description: End time in seconds + range: float + required: true + minimum_value: 0.0 + + speaker_id: + slot_uri: hc:speakerId + description: | + Unique identifier for the speaker. 
+ + Format: "spk_001", "spk_002", etc. (anonymous) + Or: "taco_dibbits" (identified) + range: string + required: false + + speaker_label: + slot_uri: schema:name + description: Human-readable speaker name or role + range: string + required: false + + segment_language: + slot_uri: dcterms:language + description: Language of speech in this segment (ISO 639-1) + range: string + required: false + + segment_confidence: + slot_uri: hc:confidence + description: Confidence score for this segment (0.0-1.0) + range: float + required: false + minimum_value: 0.0 + maximum_value: 1.0 + + speech_text: + slot_uri: hc:speechText + description: | + Transcript text for this segment (if available). + + Links to VideoTranscript for full transcript. + range: string + required: false + + + DiarizationSegment: + class_uri: hc:DiarizationSegment + description: | + A diarization segment identifying speaker and time boundaries. + + Focused on "who spoke when" rather than transcript content. + + slots: + - diarization_start_seconds + - diarization_end_seconds + - diarization_speaker_id + - diarization_speaker_label + - diarization_confidence + - is_overlapping + + slot_usage: + diarization_start_seconds: + slot_uri: ma:hasStartTime + description: Start time in seconds + range: float + required: true + minimum_value: 0.0 + + diarization_end_seconds: + slot_uri: ma:hasEndTime + description: End time in seconds + range: float + required: true + minimum_value: 0.0 + + diarization_speaker_id: + slot_uri: hc:speakerId + description: Anonymous speaker identifier (spk_001, spk_002, etc.) 
+ range: string + required: true + + diarization_speaker_label: + slot_uri: schema:name + description: Optional identified name or role + range: string + required: false + + diarization_confidence: + slot_uri: hc:confidence + description: Diarization confidence (0.0-1.0) + range: float + required: false + minimum_value: 0.0 + maximum_value: 1.0 + + is_overlapping: + slot_uri: hc:isOverlapping + description: | + Whether this segment overlaps with another speaker. + + Overlapping speech occurs when multiple people speak simultaneously. + range: boolean + required: false + + + MusicSegment: + class_uri: hc:MusicSegment + description: | + A segment of detected music with classification. + + slots: + - music_start_seconds + - music_end_seconds + - music_type + - music_genre + - music_segment_confidence + - is_background + + slot_usage: + music_start_seconds: + slot_uri: ma:hasStartTime + description: Start time in seconds + range: float + required: true + minimum_value: 0.0 + + music_end_seconds: + slot_uri: ma:hasEndTime + description: End time in seconds + range: float + required: true + minimum_value: 0.0 + + music_type: + slot_uri: dcterms:type + description: Type of music (BACKGROUND, FEATURED, ARCHIVAL) + range: MusicTypeEnum + required: false + + music_genre: + slot_uri: hc:genre + description: Detected music genre + range: string + required: false + + music_segment_confidence: + slot_uri: hc:confidence + description: Music detection confidence (0.0-1.0) + range: float + required: false + minimum_value: 0.0 + maximum_value: 1.0 + + is_background: + slot_uri: hc:isBackground + description: | + Whether music is background (under speech) vs featured. 
+ + - true: Music is background/ambient + - false: Music is primary audio + range: boolean + required: false + + +# ============================================================================ +# Enumerations +# ============================================================================ + +enums: + + AudioEventTypeEnum: + description: | + Types of audio events detected in video. + permissible_values: + SPEECH: + description: Speech/voice detection and analysis + MUSIC: + description: Music detection and classification + SILENCE: + description: Silence or very low audio + SOUND_EVENT: + description: Non-speech, non-music sound events + NOISE: + description: Noise detection (for quality assessment) + MIXED: + description: Multiple audio event types analyzed + + SoundEventTypeEnum: + description: | + Types of non-speech, non-music sound events. + permissible_values: + APPLAUSE: + description: Clapping, applause + LAUGHTER: + description: Laughter from audience or speakers + CROWD_NOISE: + description: General crowd/audience noise + FOOTSTEPS: + description: Walking, footsteps + DOOR: + description: Door opening/closing sounds + NATURE_SOUNDS: + description: Birds, wind, water, etc. + TRAFFIC: + description: Vehicles, urban sounds + BELLS: + description: Church bells, temple bells, etc. + MACHINERY: + description: Industrial, mechanical sounds + COUGHING: + description: Coughing, clearing throat + PAPER: + description: Paper rustling + TYPING: + description: Keyboard typing + PHONE: + description: Phone ringing or notification + MUSIC_INSTRUMENT: + description: Individual instrument sounds + OTHER: + description: Other sound event type + + MusicTypeEnum: + description: | + Types of music presence in audio. 
+ permissible_values: + BACKGROUND: + description: Background/ambient music under other content + FEATURED: + description: Primary audio is music (performance, recording) + ARCHIVAL: + description: Historical/archival music recording + INTRO_OUTRO: + description: Opening or closing music/jingle + TRANSITION: + description: Music used for scene transitions + DIEGETIC: + description: Music from within the scene (radio, live performance) + NON_DIEGETIC: + description: Music added in post-production + + +# ============================================================================ +# Slot Definitions +# ============================================================================ + +slots: + # Audio event slots + audio_event_segments: + description: Time-coded segments with detected audio events + range: VideoTimeSegment + multivalued: true + + primary_audio_event_type: + description: Primary type of audio analysis performed + range: AudioEventTypeEnum + + # Speech slots + speech_detected: + description: Whether speech was detected + range: boolean + + speech_segments: + description: Detailed speech segments with speaker info + range: SpeechSegment + multivalued: true + + speech_language: + description: Primary language of speech (ISO 639-1) + range: string + + speech_language_confidence: + description: Confidence of language detection + range: float + + languages_detected: + description: All languages detected in speech + range: string + multivalued: true + + # Diarization slots + diarization_enabled: + description: Whether speaker diarization was performed + range: boolean + + diarization_segments: + description: Detailed diarization results + range: DiarizationSegment + multivalued: true + + speaker_count: + description: Number of distinct speakers detected + range: integer + + speaker_labels: + description: Labels or names for detected speakers + range: string + multivalued: true + + # Music slots + music_detected: + description: Whether music was detected + range: 
boolean + + music_segments: + description: Time segments containing music + range: MusicSegment + multivalued: true + + music_genres_detected: + description: Music genres detected + range: string + multivalued: true + + music_confidence: + description: Overall music detection confidence + range: float + + # Sound event slots + sound_events_detected: + description: Whether sound events were detected + range: boolean + + sound_event_types: + description: Types of sound events detected + range: SoundEventTypeEnum + multivalued: true + + # Silence/noise slots + silence_segments: + description: Time segments with silence + range: VideoTimeSegment + multivalued: true + + silence_total_seconds: + description: Total silence duration + range: float + + noise_floor_db: + description: Background noise floor in dB + range: float + + # Audio quality slots + audio_quality_score: + description: Overall audio quality (0.0-1.0) + range: float + + snr_db: + description: Signal-to-noise ratio in dB + range: float + + has_clipping: + description: Whether audio clipping was detected + range: boolean + + # SpeechSegment slots + segment_start_seconds: + description: Segment start time + range: float + + segment_end_seconds: + description: Segment end time + range: float + + speaker_id: + description: Speaker identifier + range: string + + speaker_label: + description: Speaker name or role + range: string + + segment_language: + description: Language of segment + range: string + + segment_confidence: + description: Segment confidence score + range: float + + speech_text: + description: Transcript text for segment + range: string + + # DiarizationSegment slots + diarization_start_seconds: + description: Diarization segment start + range: float + + diarization_end_seconds: + description: Diarization segment end + range: float + + diarization_speaker_id: + description: Speaker ID in diarization + range: string + + diarization_speaker_label: + description: Speaker label in diarization + 
range: string + + diarization_confidence: + description: Diarization confidence + range: float + + is_overlapping: + description: Whether segment has overlapping speech + range: boolean + + # MusicSegment slots + music_start_seconds: + description: Music segment start + range: float + + music_end_seconds: + description: Music segment end + range: float + + music_type: + description: Type of music presence + range: MusicTypeEnum + + music_genre: + description: Detected music genre + range: string + + music_segment_confidence: + description: Music segment confidence + range: float + + is_background: + description: Whether music is background + range: boolean diff --git a/schemas/20251121/linkml/modules/classes/VideoChapter.yaml b/schemas/20251121/linkml/modules/classes/VideoChapter.yaml new file mode 100644 index 0000000000..0ff4fe5d5d --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/VideoChapter.yaml @@ -0,0 +1,621 @@ +# Video Chapter Class +# Models video chapter markers (YouTube chapters, manual/auto-generated sections) +# +# Part of Heritage Custodian Ontology v0.9.10 +# +# STRUCTURE: +# VideoChapter (this class) +# - chapter_title, chapter_index +# - start/end times (via VideoTimeSegment composition) +# - auto_generated flag +# - thumbnail references +# +# USE CASES: +# - YouTube video chapters (manual creator-defined) +# - Auto-generated chapters (YouTube AI, third-party tools) +# - Museum virtual tour sections +# - Conservation documentation phases +# - Interview segments +# +# ONTOLOGY ALIGNMENT: +# - Schema.org Clip for media segments +# - W3C Media Fragments for temporal addressing +# - CIDOC-CRM E52_Time-Span for temporal extent + +id: https://nde.nl/ontology/hc/class/VideoChapter +name: video_chapter_class +title: Video Chapter Class + +imports: + - linkml:types + - ./VideoTimeSegment + +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: 
http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + oa: http://www.w3.org/ns/oa# + ma: http://www.w3.org/ns/ma-ont# + wikidata: http://www.wikidata.org/entity/ + +default_prefix: hc + +classes: + + VideoChapter: + class_uri: schema:Clip + abstract: false + description: | + A named chapter or section within a video, defined by temporal boundaries. + + **DEFINITION**: + + VideoChapter represents a titled segment of video content, typically used for + navigation and content organization. Chapters appear in video player interfaces + (YouTube chapters, Vimeo chapters) allowing viewers to jump to specific sections. + + **PLATFORM SUPPORT**: + + | Platform | Chapter Support | Auto-Generated | Custom Thumbnails | + |----------|-----------------|----------------|-------------------| + | YouTube | Yes (2020+) | Yes | No (keyframe) | + | Vimeo | Yes | No | Yes | + | Facebook | Limited | No | No | + | Wistia | Yes | No | Yes | + + **YOUTUBE CHAPTER REQUIREMENTS**: + + For YouTube to recognize chapters: + - First chapter MUST start at 0:00 + - Minimum 3 chapters required + - Each chapter must be at least 10 seconds + - Timestamps in description in `MM:SS` or `HH:MM:SS` format + + **HERITAGE INSTITUTION USE CASES**: + + | Content Type | Chapter Examples | + |--------------|------------------| + | Virtual tour | "Main Hall", "Dutch Masters", "Gift Shop" | + | Conservation | "Assessment", "Cleaning", "Retouching", "Varnishing" | + | Interview | "Introduction", "Early Career", "Major Works", "Legacy" | + | Exhibition | "Curator Introduction", "Theme 1", "Theme 2", "Conclusion" | + | Lecture | "Overview", "Case Study 1", "Case Study 2", "Q&A" | + + **AUTO-GENERATED VS MANUAL CHAPTERS**: + + | Source | Characteristics | Quality | + |--------|-----------------|---------| + | Manual (creator) | Semantic, meaningful titles | High | + | YouTube AI | Scene-based, generic titles | Variable | + | Third-party tools | Transcript-based, keyword titles | Medium | + + The 
`auto_generated` flag distinguishes these sources. + + **RELATIONSHIP TO VideoTimeSegment**: + + VideoChapter USES VideoTimeSegment for temporal boundaries rather than + extending it. This composition pattern allows: + - Reuse of segment validation (start < end) + - Consistent time representation across schema + - Separation of structural (chapter) and temporal (segment) concerns + + **MEDIA FRAGMENTS URI**: + + Chapters can be addressed via W3C Media Fragments: + ``` + https://youtube.com/watch?v=ABC123#t=120,300 + ``` + Corresponds to chapter starting at 2:00, ending at 5:00. + + **NESTED CHAPTERS**: + + Some platforms support hierarchical chapters (parent/child). + Use `parent_chapter_id` for nested structure: + + ``` + Chapter 1: Dutch Golden Age + └─ 1.1: Rembrandt + └─ 1.2: Vermeer + Chapter 2: Modern Art + ``` + + exact_mappings: + - schema:Clip + + close_mappings: + - ma:MediaFragment + - crm:E52_Time-Span + + related_mappings: + - wikidata:Q1454986 # Chapter (division of a book/document) + + slots: + # Chapter identification + - chapter_id + - chapter_title + - chapter_index + - chapter_description + + # Temporal boundaries (composition with VideoTimeSegment) + - chapter_start_seconds + - chapter_end_seconds + - chapter_start_time + - chapter_end_time + + # Generation metadata + - auto_generated + - chapter_source + + # Visual + - chapter_thumbnail_url + - chapter_thumbnail_timestamp + + # Hierarchy + - parent_chapter_id + - nesting_level + + slot_usage: + chapter_id: + slot_uri: dcterms:identifier + description: | + Unique identifier for this chapter. + + Dublin Core: identifier for unique identification. 
+ + **Format**: Platform-specific or UUID + - YouTube: No native chapter ID (use index) + - Generated: `{video_id}_chapter_{index}` + range: string + required: true + examples: + - value: "ABC123_chapter_0" + description: "First chapter of video ABC123" + - value: "550e8400-e29b-41d4-a716-446655440000" + description: "UUID-based chapter ID" + + chapter_title: + slot_uri: schema:name + description: | + Title of the chapter as displayed to viewers. + + Schema.org: name for the chapter's title. + + **Best Practices**: + - Keep titles concise (under 50 characters) + - Use descriptive, meaningful titles + - Avoid timestamps in title (redundant) + + **Auto-Generated Titles**: + - YouTube AI: Often generic ("Introduction", "Main Content") + - May need manual refinement for heritage content + range: string + required: true + examples: + - value: "De Nachtwacht (The Night Watch)" + description: "Chapter about specific artwork" + - value: "Curator Interview: Conservation Process" + description: "Interview segment chapter" + + chapter_index: + slot_uri: hc:chapterIndex + description: | + Zero-based index of this chapter within the video. + + **Ordering**: + - 0: First chapter (typically starts at 0:00) + - Subsequent chapters in temporal order + + Used for: + - Reconstruction of chapter sequence + - Navigation (previous/next) + - Display ordering + range: integer + required: true + minimum_value: 0 + examples: + - value: 0 + description: "First chapter" + - value: 5 + description: "Sixth chapter (zero-indexed)" + + chapter_description: + slot_uri: schema:description + description: | + Optional detailed description of chapter content. + + Schema.org: description for chapter details. + + Longer-form description than title. May include: + - Topics covered + - Featured artworks + - Key points discussed + + Not all platforms display chapter descriptions. + range: string + required: false + examples: + - value: "Dr. 
Dibbits discusses the restoration of Rembrandt's masterpiece, including the controversial 2019 operation." + description: "Detailed chapter description" + + chapter_start_seconds: + slot_uri: ma:hasStartTime + description: | + Start time of chapter in seconds from video beginning. + + Media Ontology: hasStartTime for temporal start. + + **First Chapter Rule**: + For YouTube chapters to be recognized, the first chapter + MUST start at 0.0 seconds. + + Floating point for millisecond precision. + range: float + required: true + minimum_value: 0.0 + examples: + - value: 0.0 + description: "First chapter starts at video beginning" + - value: 120.5 + description: "Chapter starts at 2:00.5" + + chapter_end_seconds: + slot_uri: ma:hasEndTime + description: | + End time of chapter in seconds from video beginning. + + Media Ontology: hasEndTime for temporal end. + + **Chapter Boundaries**: + - End time of chapter N = start time of chapter N+1 + - Last chapter ends at video duration + - No gaps between chapters (continuous coverage) + range: float + required: false + minimum_value: 0.0 + examples: + - value: 120.0 + description: "Chapter ends at 2:00" + + chapter_start_time: + slot_uri: hc:chapterStartTime + description: | + Start time as ISO 8601 duration for display/serialization. + + Derived from chapter_start_seconds. + + **Format**: ISO 8601 duration (e.g., "PT2M30S" = 2:30) + range: string + required: false + pattern: "^PT(\\d+H)?(\\d+M)?(\\d+(\\.\\d+)?S)?$" + examples: + - value: "PT0S" + description: "Start of video" + - value: "PT10M30S" + description: "10 minutes 30 seconds" + + chapter_end_time: + slot_uri: hc:chapterEndTime + description: | + End time as ISO 8601 duration for display/serialization. + + Derived from chapter_end_seconds. 
+ range: string + required: false + pattern: "^PT(\\d+H)?(\\d+M)?(\\d+(\\.\\d+)?S)?$" + examples: + - value: "PT5M0S" + description: "5 minutes" + + auto_generated: + slot_uri: hc:autoGenerated + description: | + Whether this chapter was auto-generated by AI/ML. + + **Sources**: + - true: YouTube AI chapters, third-party tools, ASR-based + - false: Manual creator-defined chapters + + Auto-generated chapters may have: + - Generic titles + - Less semantic meaning + - Scene-based rather than topic-based boundaries + range: boolean + required: false + examples: + - value: false + description: "Manual creator-defined chapter" + - value: true + description: "YouTube AI auto-generated" + + chapter_source: + slot_uri: prov:wasAttributedTo + description: | + Source or method that created this chapter. + + PROV-O: wasAttributedTo for attribution. + + **Common Values**: + - MANUAL: Creator-defined in video description + - YOUTUBE_AI: YouTube auto-chapters feature + - WHISPER_CHAPTERS: Generated from Whisper transcript + - SCENE_DETECTION: Based on visual scene changes + - THIRD_PARTY: External tool (specify in notes) + range: ChapterSourceEnum + required: false + examples: + - value: "MANUAL" + description: "Creator manually added chapters" + + chapter_thumbnail_url: + slot_uri: schema:thumbnailUrl + description: | + URL to thumbnail image for this chapter. + + Schema.org: thumbnailUrl for preview image. + + **Platform Behavior**: + - YouTube: Auto-selects keyframe from chapter start + - Vimeo: Allows custom chapter thumbnails + + Thumbnail helps viewers preview chapter content. + range: uri + required: false + examples: + - value: "https://i.ytimg.com/vi/ABC123/hq1.jpg" + description: "YouTube chapter thumbnail" + + chapter_thumbnail_timestamp: + slot_uri: hc:thumbnailTimestamp + description: | + Timestamp (in seconds) of frame used for thumbnail. + + May differ slightly from chapter_start_seconds if + a more visually representative frame was selected. 
+ range: float + required: false + minimum_value: 0.0 + examples: + - value: 122.5 + description: "Thumbnail from 2:02.5" + + parent_chapter_id: + slot_uri: dcterms:isPartOf + description: | + Reference to parent chapter for hierarchical chapters. + + Dublin Core: isPartOf for containment relationship. + + Enables nested chapter structure: + ``` + Chapter 1: Dutch Masters + └─ 1.1: Rembrandt + └─ 1.2: Vermeer + ``` + + null/empty for top-level chapters. + range: string + required: false + examples: + - value: "ABC123_chapter_0" + description: "This is a sub-chapter of chapter 0" + + nesting_level: + slot_uri: hc:nestingLevel + description: | + Depth level in chapter hierarchy. + + - 0: Top-level chapter + - 1: First-level sub-chapter + - 2: Second-level sub-chapter + - etc. + + Most platforms only support level 0 (flat chapters). + range: integer + required: false + minimum_value: 0 + examples: + - value: 0 + description: "Top-level chapter" + - value: 1 + description: "Sub-chapter" + + comments: + - "Models video chapters for navigation (YouTube chapters, etc.)" + - "Supports both manual and auto-generated chapters" + - "Temporal boundaries via composition with VideoTimeSegment pattern" + - "Hierarchical chapters supported via parent_chapter_id" + - "Schema.org Clip alignment for semantic web compatibility" + + see_also: + - "https://support.google.com/youtube/answer/9884579" + - "https://schema.org/Clip" + - "https://www.w3.org/TR/media-frags/" + + + # ========================================================================== + # Supporting Class: VideoChapterList + # ========================================================================== + + VideoChapterList: + class_uri: schema:ItemList + description: | + A collection of chapters for a video. + + Groups all chapters for a video with metadata about the chapter set. 
+ + Enables bulk operations on chapters: + - Import/export of chapter lists + - Validation of chapter coverage + - Source tracking for entire chapter set + + exact_mappings: + - schema:ItemList + + slots: + - video_id + - chapters + - total_chapters + - chapters_source + - chapters_generated_at + - covers_full_video + + slot_usage: + video_id: + slot_uri: schema:isPartOf + description: Reference to the parent video + range: string + required: true + + chapters: + slot_uri: schema:itemListElement + description: Ordered list of chapters + range: VideoChapter + multivalued: true + required: true + inlined_as_list: true + + total_chapters: + slot_uri: hc:totalChapters + description: Total number of chapters + range: integer + required: false + minimum_value: 0 + + chapters_source: + slot_uri: prov:wasAttributedTo + description: Primary source for this chapter list + range: ChapterSourceEnum + required: false + + chapters_generated_at: + slot_uri: prov:generatedAtTime + description: When chapters were generated/extracted + range: datetime + required: false + + covers_full_video: + slot_uri: hc:coversFullVideo + description: | + Whether chapters cover the entire video duration. + + - true: No gaps, first chapter at 0:00, last ends at video end + - false: Partial coverage (gaps between chapters) + range: boolean + required: false + +# ============================================================================ +# Enumerations +# ============================================================================ + +enums: + + ChapterSourceEnum: + description: | + Source or method that created video chapters. 
+ permissible_values: + MANUAL: + description: Creator manually defined chapters in video description + YOUTUBE_AI: + description: YouTube auto-chapters feature (AI-generated) + WHISPER_CHAPTERS: + description: Generated from Whisper transcript analysis + SCENE_DETECTION: + description: Based on visual scene change detection + TRANSCRIPT_ANALYSIS: + description: Topic segmentation from transcript + THIRD_PARTY: + description: External tool or service + IMPORTED: + description: Imported from another platform/format + UNKNOWN: + description: Chapter source not determined + +# ============================================================================ +# Slot Definitions +# ============================================================================ + +slots: + chapter_id: + description: Unique identifier for chapter + range: string + + chapter_title: + description: Display title of chapter + range: string + + chapter_index: + description: Zero-based index in chapter sequence + range: integer + + chapter_description: + description: Detailed description of chapter content + range: string + + chapter_start_seconds: + description: Start time in seconds + range: float + + chapter_end_seconds: + description: End time in seconds + range: float + + chapter_start_time: + description: Start time as ISO 8601 duration + range: string + + chapter_end_time: + description: End time as ISO 8601 duration + range: string + + auto_generated: + description: Whether chapter was auto-generated by AI + range: boolean + + chapter_source: + description: Source that created this chapter + range: ChapterSourceEnum + + chapter_thumbnail_url: + description: URL to chapter thumbnail image + range: uri + + chapter_thumbnail_timestamp: + description: Timestamp of thumbnail frame + range: float + + parent_chapter_id: + description: Reference to parent chapter for nesting + range: string + + nesting_level: + description: Depth level in chapter hierarchy + range: integer + + # VideoChapterList slots 
+
+ video_id:
+ description: Reference to parent video
+ range: string
+
+ chapters:
+ description: Ordered list of video chapters
+ range: VideoChapter
+ multivalued: true
+
+ total_chapters:
+ description: Total number of chapters
+ range: integer
+
+ chapters_source:
+ description: Primary source for chapter list
+ range: ChapterSourceEnum
+
+ chapters_generated_at:
+ description: When chapters were generated
+ range: datetime
+
+ covers_full_video:
+ description: Whether chapters cover entire video
+ range: boolean
diff --git a/schemas/20251121/linkml/modules/classes/VideoPost.yaml b/schemas/20251121/linkml/modules/classes/VideoPost.yaml
new file mode 100644
index 0000000000..228aec6055
--- /dev/null
+++ b/schemas/20251121/linkml/modules/classes/VideoPost.yaml
@@ -0,0 +1,763 @@
+# Video Post Class
+# Concrete subclass of SocialMediaPost for video content with platform-specific properties
+#
+# Part of Heritage Custodian Ontology v0.9.10
+#
+# STRUCTURE:
+# SocialMediaPost (parent)
+# └── VideoPost (this class)
+# - duration, definition, captions
+# - view/like/comment metrics
+# - YouTube-specific fields
+#
+# DATA SOURCE EXAMPLE:
+# From data/custodian/NL-GE-AAL-M-NOM-nationaal_onderduikmuseum.yaml:
+# youtube_enrichment:
+# videos:
+# - video_id: FbIoC-Owy-M
+# duration: PT10M59S
+# definition: hd
+# caption_available: false
+# view_count: 132
+# like_count: 2
+# comment_count: 0
+
+id: https://nde.nl/ontology/hc/class/VideoPost
+name: video_post_class
+title: Video Post Class
+
+imports:
+ - linkml:types
+ - ./SocialMediaPost
+ - ./SocialMediaPostTypes
+ - ../slots/language
+
+prefixes:
+ linkml: https://w3id.org/linkml/
+ hc: https://nde.nl/ontology/hc/
+ schema: http://schema.org/
+ dcterms: http://purl.org/dc/terms/
+ prov: http://www.w3.org/ns/prov#
+ crm: http://www.cidoc-crm.org/cidoc-crm/
+ skos: http://www.w3.org/2004/02/skos/core#
+ as: https://www.w3.org/ns/activitystreams#
+ wikidata: http://www.wikidata.org/entity/
+
+default_prefix: hc
+
+classes:
+ + VideoPost: + is_a: SocialMediaPost + class_uri: as:Video + abstract: false + description: | + Concrete class for video content with platform-specific properties. + + **DEFINITION**: + + VideoPost is a specialized SocialMediaPost for video content. It extends + the base post class with video-specific slots for duration, resolution, + captions, and engagement metrics. + + **EXTENDS**: SocialMediaPost + + This class adds: + - Video technical properties (duration, definition, aspect ratio) + - Caption and subtitle availability + - Engagement metrics (views, likes, comments) + - Platform-specific fields (YouTube category, live broadcast status) + - Temporal markers (chapters, segments) + + **ONTOLOGY MAPPINGS**: + + | Property | Activity Streams | Schema.org | + |----------|------------------|------------| + | Class | as:Video | schema:VideoObject | + | duration | as:duration | schema:duration | + | definition | - | schema:videoQuality | + | caption | - | schema:caption | + | view_count | - | schema:interactionStatistic | + + **PLATFORM SUPPORT**: + + | Platform | Duration Limit | Resolution | Captions | + |----------|----------------|------------|----------| + | YouTube | 12 hours (verified) | Up to 8K | VTT, SRT | + | Vimeo | Varies by plan | Up to 8K | VTT, SRT | + | Facebook | 4 hours | Up to 4K | Auto-generated | + | TikTok | 10 minutes | 1080p | Auto-generated | + | Instagram Reels | 90 seconds | 1080p | Auto-generated | + + **HERITAGE INSTITUTION USE CASES**: + + | Content Type | Typical Duration | Platform | + |--------------|------------------|----------| + | Virtual tours | 10-30 min | YouTube | + | Conservation docs | 5-20 min | YouTube, Vimeo | + | Curator interviews | 15-60 min | YouTube | + | Object spotlights | 2-5 min | YouTube, Instagram | + | Short clips | 15-60 sec | TikTok, Reels | + | Live recordings | 30-120 min | YouTube | + + **METRICS OBSERVATION**: + + Video metrics (views, likes, comments) are observational data that change + constantly. 
Each metric reading should include: + - `metrics_observed_at`: When metrics were recorded + - `retrieval_timestamp`: When API call was made + + **RELATIONSHIP TO VideoPostType**: + + - VideoPost is a **concrete post instance** with video content + - VideoPostType is a **type classification** for categorizing posts + - A VideoPost typically has `post_types: [VideoPostType]` + - But may also have multiple types: `[LiveStreamPostType, VideoPostType]` + + **CAPTION AND SUBTITLE DISTINCTION**: + + Related classes for textual content derived from video: + - VideoSubtitle: Time-coded text (SRT/VTT format) + - VideoTranscript: Full text without timestamps + - VideoAnnotation: Computer vision derived content + + See VideoTextContent hierarchy for detailed modeling. + + exact_mappings: + - as:Video + - schema:VideoObject + + close_mappings: + - crm:E73_Information_Object + + related_mappings: + - wikidata:Q34508 # Video + - wikidata:Q604644 # Online video + + slots: + # ======================================== + # Video Technical Properties + # ======================================== + - duration + - definition + - aspect_ratio + - frame_rate + + # ======================================== + # Caption and Subtitle Availability + # ======================================== + - caption_available + - default_language + - default_audio_language + - available_caption_languages + + # ======================================== + # Engagement Metrics + # ======================================== + - view_count + - like_count + - dislike_count + - comment_count + - favorite_count + - metrics_observed_at + + # ======================================== + # Platform-Specific + # ======================================== + - video_category_id + - live_broadcast_content + - is_licensed_content + - is_embeddable + - is_made_for_kids + + # ======================================== + # Comments/Replies + # ======================================== + - comments_fetched + - video_comments + + 
+    slot_usage:
+      # --- Video Technical Properties ---
+
+      duration:
+        slot_uri: schema:duration
+        description: |
+          Duration of the video in ISO 8601 format.
+
+          Schema.org: duration for media length.
+
+          **Format**: ISO 8601 duration (e.g., "PT10M59S" = 10 minutes 59 seconds)
+
+          **Common Patterns**:
+          - PT30S = 30 seconds
+          - PT5M = 5 minutes
+          - PT1H30M = 1 hour 30 minutes
+          - PT2H15M30S = 2 hours 15 minutes 30 seconds
+        range: string
+        required: false
+        pattern: "^P(\\d+D)?(T(\\d+H)?(\\d+M)?(\\d+S)?)?$"
+        examples:
+          - value: "PT10M59S"
+            description: "10 minutes and 59 seconds"
+          - value: "PT1H30M"
+            description: "1 hour 30 minutes"
+
+      definition:
+        slot_uri: schema:videoQuality
+        description: |
+          Video resolution/definition quality.
+
+          Schema.org: videoQuality for resolution class.
+
+          **Values**:
+          - sd: Standard definition (480p or lower)
+          - hd: High definition (720p, 1080p)
+          - uhd / 4k: Ultra HD (2160p)
+          - 8k: Full Ultra HD (4320p)
+        range: VideoDefinitionEnum
+        required: false
+        examples:
+          - value: "hd"
+            description: "High definition (720p/1080p)"
+
+      aspect_ratio:
+        slot_uri: hc:aspectRatio
+        description: |
+          Video aspect ratio.
+
+          **Common Values**:
+          - 16:9: Standard widescreen (YouTube default)
+          - 9:16: Vertical (Shorts, Reels, TikTok)
+          - 4:3: Classic TV format
+          - 1:1: Square (Instagram legacy)
+          - 21:9: Cinematic ultrawide
+        range: string
+        required: false
+        examples:
+          - value: "16:9"
+            description: "Standard widescreen"
+          - value: "9:16"
+            description: "Vertical format for Shorts/Reels"
+
+      frame_rate:
+        slot_uri: hc:frameRate
+        description: |
+          Video frame rate in frames per second.
+
+          **Common Values**:
+          - 24: Cinema standard
+          - 25: PAL standard
+          - 30: NTSC standard
+          - 60: High frame rate
+        range: float
+        required: false
+        examples:
+          - value: 30.0
+            description: "30 frames per second"
+
+      # --- Caption and Subtitle Availability ---
+
+      caption_available:
+        slot_uri: hc:captionAvailable
+        description: |
+          Whether captions/subtitles are available for this video.
+
+          Indicates if the video has any caption tracks (auto-generated or manual).
+
+          Related: Use `available_caption_languages` for specific languages.
+        range: boolean
+        required: false
+        examples:
+          - value: true
+            description: "Video has captions available"
+          - value: false
+            description: "No captions available"
+
+      default_language:
+        slot_uri: schema:inLanguage
+        description: |
+          Default/primary language of the video content.
+
+          Schema.org: inLanguage for content language.
+
+          ISO 639-1 code (e.g., "nl", "en", "de").
+
+          Refers to on-screen text, title, description language.
+        range: string
+        required: false
+        examples:
+          - value: "nl"
+            description: "Dutch language content"
+
+      default_audio_language:
+        slot_uri: hc:defaultAudioLanguage
+        description: |
+          Language of the video's default audio track.
+
+          ISO 639-1 code. May differ from `default_language` for
+          dubbed or multilingual content.
+        range: string
+        required: false
+        examples:
+          - value: "nl"
+            description: "Dutch audio track"
+
+      available_caption_languages:
+        slot_uri: hc:availableCaptionLanguages
+        description: |
+          List of languages for which captions are available.
+
+          ISO 639-1 codes for all caption tracks.
+        range: string
+        multivalued: true
+        required: false
+        examples:
+          - value: "nl"
+            description: "Dutch caption track (full example list also includes en, de)"
+
+      # --- Engagement Metrics ---
+
+      view_count:
+        slot_uri: schema:interactionStatistic
+        description: |
+          Number of views for this video.
+
+          Schema.org: interactionStatistic for view statistic.
+
+          **OBSERVATIONAL**: This value changes constantly.
+ Always record `metrics_observed_at` timestamp. + range: integer + required: false + minimum_value: 0 + examples: + - value: 132 + description: "132 views at observation time" + + like_count: + slot_uri: hc:likeCount + description: | + Number of likes/upvotes for this video. + + Platform-specific: YouTube likes, Facebook reactions, etc. + + **OBSERVATIONAL**: Record with `metrics_observed_at`. + range: integer + required: false + minimum_value: 0 + examples: + - value: 2 + description: "2 likes at observation time" + + dislike_count: + slot_uri: hc:dislikeCount + description: | + Number of dislikes/downvotes (if available). + + Note: YouTube hid public dislike counts in Nov 2021. + API may still return dislike data for channel owners. + range: integer + required: false + minimum_value: 0 + + comment_count: + slot_uri: hc:commentCount + description: | + Number of comments on this video. + + **OBSERVATIONAL**: Record with `metrics_observed_at`. + range: integer + required: false + minimum_value: 0 + examples: + - value: 0 + description: "No comments at observation time" + + favorite_count: + slot_uri: hc:favoriteCount + description: | + Number of times video was favorited/saved. + + Platform-specific availability. + range: integer + required: false + minimum_value: 0 + + metrics_observed_at: + slot_uri: prov:atTime + description: | + Timestamp when engagement metrics were recorded. + + PROV-O: atTime for observation timestamp. + + **CRITICAL**: Metrics change constantly. This timestamp + indicates when view_count, like_count, etc. were observed. + range: datetime + required: false + examples: + - value: "2025-12-01T23:16:22.294232+00:00" + description: "Metrics observed December 1, 2025" + + # --- Platform-Specific --- + + video_category_id: + slot_uri: hc:videoCategoryId + description: | + Platform-specific category identifier. 
+ + **YouTube Category IDs**: + - 1: Film & Animation + - 2: Autos & Vehicles + - 10: Music + - 15: Pets & Animals + - 17: Sports + - 19: Travel & Events + - 20: Gaming + - 22: People & Blogs + - 23: Comedy + - 24: Entertainment + - 25: News & Politics + - 26: Howto & Style + - 27: Education + - 28: Science & Technology + - 29: Nonprofits & Activism + range: string + required: false + examples: + - value: "22" + description: "YouTube: People & Blogs" + - value: "27" + description: "YouTube: Education" + + live_broadcast_content: + slot_uri: hc:liveBroadcastContent + description: | + Live broadcast status of the video. + + **Values**: + - none: Not a live broadcast (standard video) + - live: Currently broadcasting live + - upcoming: Scheduled live stream not yet started + + When `live` or `upcoming` becomes `none`, video is archived. + range: LiveBroadcastStatusEnum + required: false + examples: + - value: "none" + description: "Standard video (not live)" + - value: "live" + description: "Currently broadcasting" + + is_licensed_content: + slot_uri: hc:isLicensedContent + description: | + Whether the video contains licensed content (music, clips). + + Affects monetization and regional availability. + range: boolean + required: false + + is_embeddable: + slot_uri: hc:isEmbeddable + description: | + Whether the video can be embedded on external sites. + + Publisher-controlled setting. + range: boolean + required: false + + is_made_for_kids: + slot_uri: hc:isMadeForKids + description: | + Whether the video is designated as made for children. + + COPPA compliance flag. Affects comments, ads, features. + range: boolean + required: false + + # --- Comments --- + + comments_fetched: + slot_uri: hc:commentsFetched + description: | + Number of comments actually fetched/archived. + + May be less than `comment_count` due to API limits, + deleted comments, or pagination. 
+ range: integer + required: false + minimum_value: 0 + examples: + - value: 0 + description: "No comments fetched" + + video_comments: + slot_uri: hc:videoComments + description: | + Collection of comments on this video. + + Structured comment data with author, text, timestamp, likes. + + Note: Comments may contain nested replies. + range: VideoComment + multivalued: true + required: false + inlined: true + + comments: + - "Extends SocialMediaPost with video-specific properties" + - "Maps to as:Video and schema:VideoObject" + - "Metrics are observational - always include metrics_observed_at" + - "Caption availability signals but not content (see VideoSubtitle)" + - "YouTube is primary platform for heritage institution video content" + + see_also: + - "https://www.w3.org/ns/activitystreams#Video" + - "https://schema.org/VideoObject" + - "https://developers.google.com/youtube/v3/docs/videos" + + # ========================================================================== + # Supporting Class: VideoComment + # ========================================================================== + + VideoComment: + class_uri: schema:Comment + description: | + A comment on a video post. + + Models user-generated comments with author, text, timestamp, + and engagement metrics. Supports nested reply threads. 
+ + exact_mappings: + - schema:Comment + - as:Note + + slots: + - comment_id + - comment_author + - comment_author_channel_id + - comment_text + - comment_published_at + - comment_updated_at + - comment_like_count + - comment_reply_count + - comment_replies + + slot_usage: + comment_id: + slot_uri: dcterms:identifier + description: Unique identifier for the comment + range: string + required: true + + comment_author: + slot_uri: schema:author + description: Display name of comment author + range: string + required: true + + comment_author_channel_id: + slot_uri: hc:authorChannelId + description: Platform channel/account ID of author + range: string + required: false + + comment_text: + slot_uri: schema:text + description: Full text content of the comment + range: string + required: true + + comment_published_at: + slot_uri: dcterms:created + description: When comment was originally posted + range: datetime + required: true + + comment_updated_at: + slot_uri: dcterms:modified + description: When comment was last edited + range: datetime + required: false + + comment_like_count: + slot_uri: hc:likeCount + description: Number of likes on this comment + range: integer + required: false + minimum_value: 0 + + comment_reply_count: + slot_uri: hc:replyCount + description: Number of replies to this comment + range: integer + required: false + minimum_value: 0 + + comment_replies: + slot_uri: schema:comment + description: Nested reply comments + range: VideoComment + multivalued: true + required: false + inlined: true + +# ============================================================================ +# Supporting Enumerations +# ============================================================================ + +enums: + + VideoDefinitionEnum: + description: | + Video resolution/definition quality categories. + + Based on common platform standards. 
+    permissible_values:
+      sd:
+        description: Standard definition (480p or lower)
+      hd:
+        description: High definition (720p, 1080p)
+      uhd:
+        description: Ultra HD (2160p/4K)
+      4k:
+        description: 4K resolution (2160p) - alias for uhd
+      8k:
+        description: Full Ultra HD (4320p)
+
+  LiveBroadcastStatusEnum:
+    description: |
+      Live broadcast status values for video content.
+
+      Based on YouTube API liveBroadcastContent values.
+    permissible_values:
+      none:
+        description: Not a live broadcast (standard uploaded video)
+      live:
+        description: Currently broadcasting live
+      upcoming:
+        description: Scheduled live stream that hasn't started yet
+
+# ============================================================================
+# Slot Definitions
+# ============================================================================
+
+slots:
+  duration:
+    description: Duration in ISO 8601 format
+    range: string
+
+  definition:
+    description: Video resolution quality (sd, hd, uhd, 4k, 8k)
+    range: VideoDefinitionEnum
+
+  aspect_ratio:
+    description: Video aspect ratio (16:9, 9:16, 4:3, etc.)
+ range: string + + frame_rate: + description: Frame rate in FPS + range: float + + caption_available: + description: Whether captions are available + range: boolean + + default_audio_language: + description: Language of default audio track + range: string + + available_caption_languages: + description: Languages for which captions exist + range: string + multivalued: true + + view_count: + description: Number of views + range: integer + + like_count: + description: Number of likes + range: integer + + dislike_count: + description: Number of dislikes + range: integer + + comment_count: + description: Number of comments + range: integer + + favorite_count: + description: Number of favorites/saves + range: integer + + metrics_observed_at: + description: When metrics were recorded + range: datetime + + video_category_id: + description: Platform category identifier + range: string + + live_broadcast_content: + description: Live broadcast status + range: LiveBroadcastStatusEnum + + is_licensed_content: + description: Contains licensed content + range: boolean + + is_embeddable: + description: Can be embedded externally + range: boolean + + is_made_for_kids: + description: COPPA kids content flag + range: boolean + + comments_fetched: + description: Number of comments actually retrieved + range: integer + + video_comments: + description: Collection of video comments + range: VideoComment + multivalued: true + + # VideoComment slots + comment_id: + description: Unique comment identifier + range: string + + comment_author: + description: Comment author display name + range: string + + comment_author_channel_id: + description: Author's channel/account ID + range: string + + comment_text: + description: Comment text content + range: string + + comment_published_at: + description: When comment was posted + range: datetime + + comment_updated_at: + description: When comment was edited + range: datetime + + comment_like_count: + description: Likes on this comment + range: 
integer + + comment_reply_count: + description: Number of replies + range: integer + + comment_replies: + description: Nested reply comments + range: VideoComment + multivalued: true diff --git a/schemas/20251121/linkml/modules/classes/VideoSubtitle.yaml b/schemas/20251121/linkml/modules/classes/VideoSubtitle.yaml new file mode 100644 index 0000000000..e7335daba7 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/VideoSubtitle.yaml @@ -0,0 +1,632 @@ +# Video Subtitle Class +# Time-coded caption/subtitle content extending VideoTranscript +# +# Part of Heritage Custodian Ontology v0.9.5 +# +# HIERARCHY: +# E73_Information_Object (CIDOC-CRM) +# │ +# └── VideoTextContent (abstract - provenance) +# │ +# └── VideoTranscript (full text transcription) +# │ +# └── VideoSubtitle (this class - time-coded captions) +# +# DESIGN RATIONALE: +# VideoSubtitle extends VideoTranscript because subtitles ARE transcripts +# with additional time-coding and display metadata: +# +# 1. A subtitle file (SRT, VTT) contains complete spoken content (transcript) +# 2. Plus precise start/end times for each caption +# 3. Plus display formatting (position, styling in some formats) +# +# You can always derive a plain transcript from subtitles by stripping times. +# This inheritance enables polymorphic handling: treat subtitles as transcripts +# when time-coding isn't needed. 
+# +# SUBTITLE FORMATS SUPPORTED: +# - SRT (SubRip): Most common, simple time + text +# - VTT (WebVTT): W3C standard, supports styling +# - TTML (DFXP): XML-based, broadcast standard +# - SBV (YouTube): YouTube's native format +# - ASS/SSA: Advanced styling, anime subtitles + +id: https://nde.nl/ontology/hc/class/VideoSubtitle +name: video_subtitle_class +title: Video Subtitle Class + +imports: + - linkml:types + - ./VideoTranscript + - ./VideoTimeSegment + +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + ma: http://www.w3.org/ns/ma-ont# + +default_prefix: hc + +classes: + + VideoSubtitle: + is_a: VideoTranscript + class_uri: hc:VideoSubtitle + abstract: false + description: | + Time-coded caption/subtitle content for video. + + **DEFINITION**: + + VideoSubtitle represents caption/subtitle tracks that provide time-coded + text synchronized with video playback. It extends VideoTranscript because + subtitles contain complete transcription PLUS temporal synchronization. + + **INHERITANCE FROM VideoTranscript**: + + VideoSubtitle inherits all transcript capabilities: + - `full_text`: Complete subtitle text concatenated + - `segments`: Time-coded entries (REQUIRED for subtitles) + - `includes_timestamps`: Always true for subtitles + - `content_language`: Language of subtitle text + - All provenance from VideoTextContent + + And adds subtitle-specific properties: + - `subtitle_format`: SRT, VTT, TTML, SBV, ASS + - `is_closed_caption`: CC vs regular subtitles + - `is_sdh`: Subtitles for Deaf/Hard-of-Hearing + - `includes_sound_descriptions`: Non-speech audio descriptions + + **SCHEMA.ORG ALIGNMENT**: + + Maps to `schema:caption` property: + > "For downloadable machine formats (closed caption, subtitles etc.) 
+ > use the MediaObject.encodingFormat property." + + **SUBTITLE vs CAPTION vs TRANSCRIPT**: + + | Type | Time-coded | Purpose | Audience | + |------|------------|---------|----------| + | Transcript | Optional | Reading, search | Everyone | + | Subtitle | Required | Language translation | Hearing viewers | + | Caption (CC) | Required | Accessibility | Deaf/HoH viewers | + | SDH | Required | Full accessibility | Deaf viewers, noisy environments | + + **SDH (Subtitles for Deaf/Hard-of-Hearing)**: + + SDH differs from regular subtitles by including: + - Speaker identification: "(John) Hello" + - Sound effects: "[door slams]", "[music playing]" + - Music descriptions: "♪ upbeat jazz ♪" + - Emotional cues: "[laughing]", "[whispering]" + + **SUBTITLE FORMATS**: + + | Format | Extension | Features | Use Case | + |--------|-----------|----------|----------| + | SRT | .srt | Simple, universal | Most video players | + | VTT | .vtt | W3C standard, styling | HTML5 video, web | + | TTML | .ttml/.dfxp | XML, rich styling | Broadcast, streaming | + | SBV | .sbv | YouTube native | YouTube uploads | + | ASS | .ass | Advanced styling | Anime, complex layouts | + + **SRT FORMAT EXAMPLE**: + + ``` + 1 + 00:00:00,000 --> 00:00:03,500 + Welcome to the Rijksmuseum. + + 2 + 00:00:03,500 --> 00:00:08,200 + Today we'll explore the Night Watch gallery. + ``` + + **VTT FORMAT EXAMPLE**: + + ``` + WEBVTT + + 00:00:00.000 --> 00:00:03.500 + Welcome to the Rijksmuseum. + + 00:00:03.500 --> 00:00:08.200 + Today we'll explore the Night Watch gallery. + ``` + + **HERITAGE INSTITUTION CONTEXT**: + + Subtitles are critical for heritage video accessibility: + + 1. **Accessibility Compliance**: WCAG 2.1, Section 508 + 2. **Multilingual Access**: Translate for international audiences + 3. **Silent Viewing**: Social media, public displays, quiet spaces + 4. **Search Discovery**: Subtitle text is indexed by platforms + 5. 
**Preservation**: Text outlasts video format obsolescence + + **YOUTUBE API INTEGRATION**: + + Subtitle tracks from YouTube API populate: + - `subtitle_format`: Typically VTT or SRT + - `generation_method`: PLATFORM_PROVIDED or ASR_AUTOMATIC + - `content_language`: From track language code + - `is_auto_generated`: YouTube auto-caption flag + + **SEGMENTS ARE REQUIRED**: + + Unlike VideoTranscript where segments are optional, VideoSubtitle + REQUIRES the `segments` slot to be populated with VideoTimeSegment + entries that include start_seconds, end_seconds, and segment_text. + + exact_mappings: + - schema:caption + + close_mappings: + - ma:CaptioningFormat + + related_mappings: + - schema:transcript + + slots: + # Subtitle-specific format + - subtitle_format + - raw_subtitle_content + + # Accessibility metadata + - is_closed_caption + - is_sdh + - includes_sound_descriptions + - includes_music_descriptions + - includes_speaker_identification + + # Source/generation info + - is_auto_generated + - track_name + - track_id + + # Positioning (for formats that support it) + - default_position + + # Entry counts + - entry_count + - average_entry_duration_seconds + + slot_usage: + # Override segments to be required for subtitles + segments: + required: true + description: | + Time-coded subtitle entries as VideoTimeSegment objects. + + **REQUIRED for VideoSubtitle** (optional in parent VideoTranscript). + + Each segment represents one caption display unit: + - start_seconds: When caption appears + - end_seconds: When caption disappears + - segment_text: Caption text content + - segment_index: Order in subtitle track + - confidence: For auto-generated captions + + Segments are ordered by start_seconds for proper playback. + + # Override includes_timestamps to default true + includes_timestamps: + ifabsent: "true" + description: | + Whether subtitle includes time markers. + + **Always true for VideoSubtitle** - time-coding is definitional. 
+ + subtitle_format: + slot_uri: dcterms:format + description: | + Subtitle file format. + + Dublin Core: format for resource format. + + Specifies the encoding format of the subtitle content. + Affects parsing and rendering capabilities. + range: SubtitleFormatEnum + required: true + examples: + - value: "VTT" + description: "WebVTT format (W3C standard)" + - value: "SRT" + description: "SubRip format (most common)" + + raw_subtitle_content: + slot_uri: hc:rawSubtitleContent + description: | + Original subtitle file content as raw string. + + Preserves the complete subtitle file in its native format. + Useful for: + - Format conversion + - Re-parsing with different tools + - Archive preservation + + May be large - consider storing separately for large files. + range: string + required: false + examples: + - value: | + WEBVTT + + 00:00:00.000 --> 00:00:03.500 + Welcome to the museum. + description: "Complete VTT file content" + + is_closed_caption: + slot_uri: hc:isClosedCaption + description: | + Whether this is a closed caption track (CC). + + Closed captions differ from subtitles: + - **CC (true)**: Designed for Deaf/HoH, includes non-speech audio + - **Subtitles (false)**: Translation of dialogue only + + CC typically includes [MUSIC], [APPLAUSE], speaker ID, etc. + range: boolean + required: false + ifabsent: "false" + examples: + - value: true + description: "This is a closed caption track" + + is_sdh: + slot_uri: hc:isSDH + description: | + Whether these are Subtitles for Deaf/Hard-of-Hearing (SDH). + + SDH combines subtitle translation with CC-style annotations: + - Dialogue translation (like subtitles) + - Sound descriptions (like CC) + - Speaker identification + + Typically marked "[SDH]" on streaming platforms. 
+ range: boolean + required: false + ifabsent: "false" + examples: + - value: true + description: "SDH subtitle track" + + includes_sound_descriptions: + slot_uri: hc:includesSoundDescriptions + description: | + Whether subtitle includes non-speech sound descriptions. + + Examples of sound descriptions: + - [door slams] + - [phone ringing] + - [thunder] + - [footsteps approaching] + + Characteristic of CC and SDH tracks. + range: boolean + required: false + ifabsent: "false" + examples: + - value: true + description: "Contains sound effect descriptions" + + includes_music_descriptions: + slot_uri: hc:includesMusicDescriptions + description: | + Whether subtitle includes music/song descriptions. + + Examples: + - ♪ upbeat jazz playing ♪ + - [classical music] + - ♪ singing in Dutch ♪ + - [somber orchestral music] + + Important for heritage content with significant musical elements. + range: boolean + required: false + ifabsent: "false" + examples: + - value: true + description: "Contains music descriptions" + + includes_speaker_identification: + slot_uri: hc:includesSpeakerIdentification + description: | + Whether subtitle identifies speakers. + + Speaker identification patterns: + - (John): Hello there. + - NARRATOR: Welcome to the museum. + - [Curator] This painting dates from 1642. + + Different from transcript speaker_id which is per-segment; + this indicates whether the TEXT CONTENT includes labels. + range: boolean + required: false + ifabsent: "false" + examples: + - value: true + description: "Subtitle text includes speaker labels" + + is_auto_generated: + slot_uri: hc:isAutoGenerated + description: | + Whether subtitle was auto-generated by the platform. + + Distinct from generation_method (inherited from VideoTextContent): + - `is_auto_generated`: Platform flag (YouTube, Vimeo) + - `generation_method`: How WE know it was generated + + Auto-generated captions typically have lower accuracy. 
+ range: boolean + required: false + ifabsent: "false" + examples: + - value: true + description: "YouTube auto-generated caption" + + track_name: + slot_uri: schema:name + description: | + Human-readable name of the subtitle track. + + Schema.org: name for track label. + + Examples from YouTube: + - "English" + - "English (auto-generated)" + - "Dutch - Nederlands" + - "English (United Kingdom)" + range: string + required: false + examples: + - value: "English (auto-generated)" + description: "YouTube auto-caption track name" + + track_id: + slot_uri: dcterms:identifier + description: | + Platform-specific identifier for this subtitle track. + + Dublin Core: identifier for unique ID. + + Used to fetch/update specific tracks via API. + range: string + required: false + examples: + - value: "en.3OWxR1w4QfE" + description: "YouTube caption track ID" + + default_position: + slot_uri: hc:defaultPosition + description: | + Default display position for captions. + + For formats that support positioning (VTT, TTML, ASS): + - BOTTOM: Default, below video content + - TOP: Above video content + - MIDDLE: Center of video + + May be overridden per-segment in advanced formats. + range: SubtitlePositionEnum + required: false + ifabsent: "string(BOTTOM)" + examples: + - value: "BOTTOM" + description: "Standard bottom caption position" + + entry_count: + slot_uri: hc:entryCount + description: | + Number of subtitle entries (caption cues). + + Equals length of segments array. + Useful for content sizing without loading full segments. + range: integer + required: false + minimum_value: 0 + examples: + - value: 127 + description: "127 caption cues in this track" + + average_entry_duration_seconds: + slot_uri: hc:averageEntryDuration + description: | + Average duration of subtitle entries in seconds. 
+ + Typical ranges: + - 2-4 seconds: Normal speech rate + - < 2 seconds: Rapid dialogue + - > 5 seconds: Slow narration or long displays + + Useful for quality assessment - very short or long entries + may indicate timing issues. + range: float + required: false + minimum_value: 0.0 + examples: + - value: 3.2 + description: "Average 3.2 seconds per caption" + + rules: + - postconditions: + description: | + segments must be populated for VideoSubtitle. + This is enforced by making segments required in slot_usage. + + comments: + - "Time-coded caption/subtitle content" + - "Extends VideoTranscript - subtitles ARE transcripts plus time codes" + - "Supports multiple formats: SRT, VTT, TTML, SBV, ASS" + - "Accessibility metadata: CC, SDH, sound/music descriptions" + - "Critical for heritage video accessibility compliance" + + see_also: + - "https://schema.org/caption" + - "https://www.w3.org/TR/webvtt1/" + - "https://developer.mozilla.org/en-US/docs/Web/API/WebVTT_API" + - "https://www.3playmedia.com/learn/popular-topics/closed-captioning/" + +# ============================================================================ +# Enumerations +# ============================================================================ + +enums: + + SubtitleFormatEnum: + description: | + Subtitle/caption file formats. + + Each format has different capabilities for timing precision, + styling, positioning, and metadata. + permissible_values: + SRT: + description: | + SubRip subtitle format (.srt). + Most widely supported format. + Simple: sequence number, timecode, text. + No styling or positioning support. + VTT: + description: | + WebVTT format (.vtt). + W3C standard for HTML5 video. + Supports styling (CSS), positioning, cue settings. + Recommended for web delivery. + TTML: + description: | + Timed Text Markup Language (.ttml/.dfxp/.xml). + W3C XML-based standard. + Rich styling, regions, timing. + Used in broadcast and streaming (Netflix, Amazon). 
+ SBV: + description: | + YouTube SubViewer format (.sbv). + Simple format similar to SRT. + Native YouTube caption format. + ASS: + description: | + Advanced SubStation Alpha (.ass). + Advanced styling, positioning, effects. + Popular for anime subtitles. + Includes SSA (.ssa) as predecessor. + STL: + description: | + EBU STL format (.stl). + European Broadcasting Union standard. + Used in broadcast television. + Binary format with teletext compatibility. + CAP: + description: | + Scenarist Closed Caption (.scc/.cap). + Used for broadcast closed captioning. + CEA-608/CEA-708 compliant. + SAMI: + description: | + Synchronized Accessible Media Interchange (.smi/.sami). + Microsoft format for Windows Media. + HTML-like markup with timing. + LRC: + description: | + LRC lyrics format (.lrc). + Simple format for song lyrics. + Line-by-line timing, no duration. + JSON: + description: | + JSON-based subtitle format. + Used by some APIs (YouTube transcript API). + Structure varies by source. + UNKNOWN: + description: | + Unknown or unrecognized format. + May require manual parsing or conversion. + + SubtitlePositionEnum: + description: | + Default caption display position on video. + + May be overridden by format-specific positioning (VTT, TTML, ASS). + permissible_values: + BOTTOM: + description: | + Bottom of video frame (standard position). + Most common for subtitles and captions. + Typically in lower 10-15% of frame. + TOP: + description: | + Top of video frame. + Used when bottom is occluded. + Common for some broadcast formats. + MIDDLE: + description: | + Center of video frame. + Rarely used except for specific effects. + LEFT: + description: | + Left side of frame (vertical text). + Rare, used for specific languages/effects. + RIGHT: + description: | + Right side of frame (vertical text). + Rare, used for specific languages/effects. 
+ +# ============================================================================ +# Slot Definitions +# ============================================================================ + +slots: + subtitle_format: + description: Subtitle file format (SRT, VTT, TTML, etc.) + range: SubtitleFormatEnum + + raw_subtitle_content: + description: Original subtitle file content as raw string + range: string + + is_closed_caption: + description: Whether this is a closed caption (CC) track + range: boolean + + is_sdh: + description: Whether these are Subtitles for Deaf/Hard-of-Hearing + range: boolean + + includes_sound_descriptions: + description: Whether subtitle includes non-speech sound descriptions + range: boolean + + includes_music_descriptions: + description: Whether subtitle includes music descriptions + range: boolean + + includes_speaker_identification: + description: Whether subtitle text includes speaker labels + range: boolean + + is_auto_generated: + description: Whether subtitle was auto-generated by platform + range: boolean + + track_name: + description: Human-readable name of subtitle track + range: string + + track_id: + description: Platform-specific identifier for subtitle track + range: string + + default_position: + description: Default display position for captions + range: SubtitlePositionEnum + + entry_count: + description: Number of subtitle entries (caption cues) + range: integer + + average_entry_duration_seconds: + description: Average duration of subtitle entries in seconds + range: float diff --git a/schemas/20251121/linkml/modules/classes/VideoTextContent.yaml b/schemas/20251121/linkml/modules/classes/VideoTextContent.yaml new file mode 100644 index 0000000000..1b798c2a53 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/VideoTextContent.yaml @@ -0,0 +1,524 @@ +# Video Text Content Class +# Abstract base class for all textual/derived content from videos +# +# Part of Heritage Custodian Ontology v0.9.5 +# +# HIERARCHY: +# 
E73_Information_Object (CIDOC-CRM) +# │ +# └── VideoTextContent (this class - ABSTRACT) +# │ +# ├── VideoTranscript (full text transcription) +# │ │ +# │ └── VideoSubtitle (time-coded captions) +# │ +# └── VideoAnnotation (CV/multimodal derived) +# │ +# ├── VideoSceneAnnotation +# ├── VideoObjectAnnotation +# └── VideoOCRAnnotation +# +# DESIGN RATIONALE: +# All text derived from video (transcripts, subtitles, annotations) shares +# common provenance requirements: +# - Source video reference +# - Generation method (ASR, manual, CV model) +# - Generation timestamp +# - Model/tool version +# - Overall confidence score +# +# This abstract base ensures consistent provenance tracking across all +# video-derived text content types. + +id: https://nde.nl/ontology/hc/class/VideoTextContent +name: video_text_content_class +title: Video Text Content Class + +imports: + - linkml:types + - ./VideoPost + +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + oa: http://www.w3.org/ns/oa# + +default_prefix: hc + +classes: + + VideoTextContent: + class_uri: crm:E73_Information_Object + abstract: true + description: | + Abstract base class for all textual/derived content from videos. + + **DEFINITION**: + + VideoTextContent is the abstract parent for all text that is extracted, + transcribed, or derived from video content. This includes: + + | Subclass | Source | Description | + |----------|--------|-------------| + | VideoTranscript | Audio | Full text transcription of spoken content | + | VideoSubtitle | Audio | Time-coded caption entries (SRT/VTT) | + | VideoAnnotation | Visual | CV/multimodal-derived descriptions | + + **PROVENANCE REQUIREMENTS**: + + All video-derived text MUST include comprehensive provenance: + + 1. 
**Source**: Which video was processed (`source_video`) + 2. **Method**: How was content generated (`generation_method`) + 3. **Agent**: Who/what generated it (`generated_by`) + 4. **Time**: When was it generated (`generation_timestamp`) + 5. **Version**: Tool/model version (`model_version`) + 6. **Quality**: Overall confidence (`overall_confidence`) + + **PROV-O ALIGNMENT**: + + Maps to W3C PROV-O for provenance tracking: + + ```turtle + :transcript a hc:VideoTranscript ; + prov:wasGeneratedBy :asr_activity ; + prov:wasAttributedTo :whisper_model ; + prov:generatedAtTime "2025-12-01T10:00:00Z" ; + prov:wasDerivedFrom :source_video . + ``` + + **CIDOC-CRM E73_Information_Object**: + + - E73 is the base for all identifiable immaterial items + - Includes texts, computer programs, songs, recipes + - VideoTextContent are E73 instances derived from video (E73) + + **GENERATION METHODS**: + + | Method | Description | Typical Confidence | + |--------|-------------|-------------------| + | ASR_AUTOMATIC | Automatic speech recognition | 0.75-0.95 | + | ASR_ENHANCED | ASR with post-processing | 0.85-0.98 | + | MANUAL_TRANSCRIPTION | Human transcription | 0.98-1.0 | + | MANUAL_CORRECTION | Human-corrected ASR | 0.95-1.0 | + | CV_AUTOMATIC | Computer vision detection | 0.60-0.90 | + | MULTIMODAL | Combined audio+visual AI | 0.70-0.95 | + | OCR | Optical character recognition | 0.80-0.98 | + | PLATFORM_PROVIDED | From YouTube/Vimeo API | 0.85-0.95 | + + **HERITAGE INSTITUTION CONTEXT**: + + Video text content is critical for: + - **Accessibility**: Deaf/HoH users need accurate captions + - **Discovery**: Full-text search over video collections + - **Preservation**: Text outlasts video format obsolescence + - **Research**: Analyzing spoken content at scale + - **Translation**: Multilingual access to heritage content + + **LANGUAGE SUPPORT**: + + - `content_language`: Primary language of text content + - May differ from video's default_audio_language if translated + - ISO 639-1 
codes (e.g., "nl", "en", "de") + + exact_mappings: + - crm:E73_Information_Object + + close_mappings: + - prov:Entity + + related_mappings: + - schema:CreativeWork + - dcterms:Text + + slots: + # Source reference + - source_video + - source_video_url + + # Content metadata + - content_language + - content_title + + # Provenance - Generation + - generated_by + - generation_method + - generation_timestamp + - model_version + - model_provider + + # Quality + - overall_confidence + - is_verified + - verified_by + - verification_date + + # Processing metadata + - processing_duration_seconds + - word_count + - character_count + + slot_usage: + source_video: + slot_uri: prov:wasDerivedFrom + description: | + Reference to the VideoPost from which this content was derived. + + PROV-O: wasDerivedFrom links derived content to source. + + Links to the video's unique identifier (post_id). + range: string + required: true + examples: + - value: "FbIoC-Owy-M" + description: "YouTube video ID as source reference" + + source_video_url: + slot_uri: schema:url + description: | + URL of the source video. + + Convenience field for direct video access. + Derived from source_video but stored for quick reference. + range: uri + required: false + examples: + - value: "https://www.youtube.com/watch?v=FbIoC-Owy-M" + description: "Full YouTube video URL" + + content_language: + slot_uri: dcterms:language + description: | + Primary language of the text content. + + Dublin Core: language for content language. + + ISO 639-1 code. May differ from video's audio language + if this is a translation or localization. + range: string + required: true + examples: + - value: "nl" + description: "Dutch language content" + - value: "en" + description: "English translation" + + content_title: + slot_uri: dcterms:title + description: | + Title or label for this text content. + + Dublin Core: title for content name. 
+ + Examples: + - "Rijksmuseum Tour - Full Transcript" + - "Dutch Subtitles - Auto-generated" + - "Scene Annotations - CV Model v2.1" + range: string + required: false + examples: + - value: "De Vrijheidsroute Ep.3 - Dutch Transcript" + description: "Descriptive title for transcript" + + generated_by: + slot_uri: prov:wasAttributedTo + description: | + The agent (model, service, person) that generated this content. + + PROV-O: wasAttributedTo identifies the responsible agent. + + **Examples**: + - AI Models: "openai/whisper-large-v3", "google/speech-to-text" + - Services: "YouTube Auto-captions", "Rev.com" + - Human: "transcriber:jane.doe@museum.nl" + range: string + required: true + examples: + - value: "openai/whisper-large-v3" + description: "OpenAI Whisper ASR model" + - value: "YouTube Auto-captions" + description: "Platform-provided captions" + - value: "manual:curator@rijksmuseum.nl" + description: "Human transcriber" + + generation_method: + slot_uri: prov:wasGeneratedBy + description: | + The method used to generate this content. + + PROV-O: wasGeneratedBy for generation activity type. + + See GenerationMethodEnum for standardized values. + range: GenerationMethodEnum + required: true + examples: + - value: "ASR_AUTOMATIC" + description: "Automatic speech recognition" + - value: "MANUAL_TRANSCRIPTION" + description: "Human transcription" + + generation_timestamp: + slot_uri: prov:generatedAtTime + description: | + When this content was generated. + + PROV-O: generatedAtTime for creation timestamp. + + ISO 8601 datetime. Critical for versioning and reproducibility. + range: datetime + required: true + examples: + - value: "2025-12-01T10:30:00Z" + description: "Generated December 1, 2025 at 10:30 UTC" + + model_version: + slot_uri: schema:softwareVersion + description: | + Version of the model or tool used for generation. + + Schema.org: softwareVersion for version tracking. + + Critical for reproducibility and quality assessment. 
+ range: string + required: false + examples: + - value: "large-v3" + description: "Whisper model version" + - value: "v2.3.1" + description: "Software version number" + + model_provider: + slot_uri: schema:provider + description: | + Provider or vendor of the generation model/service. + + Schema.org: provider for service provider. + range: string + required: false + examples: + - value: "OpenAI" + description: "Model provider" + - value: "Google Cloud" + description: "Cloud service provider" + + overall_confidence: + slot_uri: hc:overallConfidence + description: | + Overall confidence score for the generated content. + + Range: 0.0 (no confidence) to 1.0 (complete certainty) + + Aggregated from per-segment confidence scores or + provided by the generation model. + + **Thresholds** (suggested): + - > 0.9: High quality, production-ready + - 0.75-0.9: Good, may have minor errors + - 0.6-0.75: Usable, should be reviewed + - < 0.6: Low quality, needs significant correction + range: float + required: false + minimum_value: 0.0 + maximum_value: 1.0 + examples: + - value: 0.92 + description: "High confidence ASR output" + + is_verified: + slot_uri: hc:isVerified + description: | + Whether content has been verified by a human. + + - **true**: Human-reviewed and approved + - **false**: Not yet verified (default for AI-generated) + + Critical for quality assurance in heritage contexts. + range: boolean + required: false + ifabsent: "false" + examples: + - value: true + description: "Human-verified transcript" + + verified_by: + slot_uri: prov:wasAttributedTo + description: | + Identity of the person who verified the content. + + Only populated when is_verified = true. + range: string + required: false + examples: + - value: "curator@rijksmuseum.nl" + description: "Staff member who verified" + + verification_date: + slot_uri: dcterms:dateAccepted + description: | + Date when content was verified. + + Dublin Core: dateAccepted for approval date. 
+ range: datetime + required: false + examples: + - value: "2025-12-02T15:00:00Z" + description: "Verified December 2, 2025" + + processing_duration_seconds: + slot_uri: hc:processingDuration + description: | + Time taken to generate this content, in seconds. + + Useful for performance monitoring and cost estimation. + range: float + required: false + minimum_value: 0.0 + examples: + - value: 45.3 + description: "Processed in 45.3 seconds" + + word_count: + slot_uri: hc:wordCount + description: | + Total number of words in the text content. + + Useful for content sizing and analysis. + range: integer + required: false + minimum_value: 0 + examples: + - value: 1523 + description: "1,523 words in transcript" + + character_count: + slot_uri: hc:characterCount + description: | + Total number of characters in the text content. + + Includes spaces. Useful for storage estimation. + range: integer + required: false + minimum_value: 0 + examples: + - value: 8742 + description: "8,742 characters" + + comments: + - "Abstract base for all video-derived text content" + - "Comprehensive PROV-O provenance tracking" + - "Confidence scoring for AI-generated content" + - "Verification workflow support" + - "Critical for heritage accessibility and discovery" + + see_also: + - "https://www.w3.org/TR/prov-o/" + - "http://www.cidoc-crm.org/cidoc-crm/E73_Information_Object" + +# ============================================================================ +# Enumerations +# ============================================================================ + +enums: + + GenerationMethodEnum: + description: | + Methods for generating video-derived text content. + + Standardized values for provenance tracking. 
+ permissible_values: + ASR_AUTOMATIC: + description: Automatic speech recognition (raw output) + ASR_ENHANCED: + description: ASR with post-processing (punctuation, normalization) + MANUAL_TRANSCRIPTION: + description: Fully human-transcribed content + MANUAL_CORRECTION: + description: Human-corrected ASR output + CV_AUTOMATIC: + description: Computer vision detection (raw output) + CV_ENHANCED: + description: CV with post-processing or filtering + MULTIMODAL: + description: Combined audio+visual AI processing + OCR: + description: Optical character recognition from video frames + PLATFORM_PROVIDED: + description: Content from platform API (YouTube, Vimeo captions) + HYBRID: + description: Combination of automated and manual methods + UNKNOWN: + description: Generation method not recorded + +# ============================================================================ +# Slot Definitions +# ============================================================================ + +slots: + source_video: + description: Reference to source VideoPost (video ID) + range: string + + source_video_url: + description: URL of the source video + range: uri + + content_language: + description: Primary language of text content (ISO 639-1) + range: string + + content_title: + description: Title or label for this text content + range: string + + generated_by: + description: Agent that generated this content (model, service, person) + range: string + + generation_method: + description: Method used to generate content + range: GenerationMethodEnum + + generation_timestamp: + description: When content was generated + range: datetime + + model_version: + description: Version of model/tool used + range: string + + model_provider: + description: Provider of model/service + range: string + + overall_confidence: + description: Overall confidence score (0.0-1.0) + range: float + + is_verified: + description: Whether content has been human-verified + range: boolean + + verified_by: + description: 
Person who verified the content + range: string + + verification_date: + description: Date content was verified + range: datetime + + processing_duration_seconds: + description: Time taken to generate content + range: float + + word_count: + description: Total word count + range: integer + + character_count: + description: Total character count + range: integer diff --git a/schemas/20251121/linkml/modules/classes/VideoTimeSegment.yaml b/schemas/20251121/linkml/modules/classes/VideoTimeSegment.yaml new file mode 100644 index 0000000000..0f5d252455 --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/VideoTimeSegment.yaml @@ -0,0 +1,375 @@ +# Video Time Segment Class +# Reusable temporal segment for video content (subtitles, annotations, chapters) +# +# Part of Heritage Custodian Ontology v0.9.5 +# +# STRUCTURE: +# VideoTimeSegment (this class) +# - start_time, end_time (ISO 8601 duration) +# - start_seconds, end_seconds (float for computation) +# - segment_text (text content for this segment) +# - confidence (for ASR/CV generated content) +# +# USED BY: +# - VideoSubtitle (time-coded caption entries) +# - VideoAnnotation (scene/object detection segments) +# - VideoChapter (user-defined chapters) +# +# ONTOLOGY ALIGNMENT: +# - Maps to Media Fragments URI 1.0 (W3C) for temporal addressing +# - CIDOC-CRM E52_Time-Span for temporal extent +# - Web Annotation oa:FragmentSelector for annotation targets + +id: https://nde.nl/ontology/hc/class/VideoTimeSegment +name: video_time_segment_class +title: Video Time Segment Class + +imports: + - linkml:types + +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + crm: http://www.cidoc-crm.org/cidoc-crm/ + oa: http://www.w3.org/ns/oa# + ma: http://www.w3.org/ns/ma-ont# + +default_prefix: hc + +classes: + + VideoTimeSegment: + class_uri: crm:E52_Time-Span + abstract: false + description: | + A temporal segment within a video, 
defined by start and end times. + + **DEFINITION**: + + VideoTimeSegment represents a bounded temporal portion of video content. + It is the foundational unit for time-coded content including: + - Subtitle/caption entries (text displayed at specific times) + - Annotation segments (detected scenes, objects, faces) + - Chapter markers (user-defined content sections) + + **DUAL TIME REPRESENTATION**: + + Times are stored in two formats for different use cases: + + | Format | Example | Use Case | + |--------|---------|----------| + | ISO 8601 duration | PT0M30S | Human-readable, serialization | + | Seconds (float) | 30.0 | Computation, synchronization | + + Both representations MUST be kept in sync. The seconds format is + primary for computation; ISO 8601 is derived for display/storage. + + **MEDIA FRAGMENTS URI (W3C)**: + + VideoTimeSegment aligns with W3C Media Fragments URI 1.0 specification + for addressing temporal fragments of video: + + ``` + https://example.com/video.mp4#t=30,35 + ``` + + The `start_seconds` and `end_seconds` map directly to the `t=` parameter. 
+ + **WEB ANNOTATION COMPATIBILITY**: + + When used as an annotation target selector: + - Maps to `oa:FragmentSelector` with `conformsTo` Media Fragments + - Enables interoperability with W3C Web Annotation Data Model + + **CIDOC-CRM E52_Time-Span**: + + In cultural heritage documentation: + - E52_Time-Span is the extent of a time-span + - Used for temporal properties of cultural objects + - VideoTimeSegment extends this to media-specific temporal segments + + **CONFIDENCE SCORING**: + + For segments generated by ASR (speech recognition) or CV (computer vision): + - `confidence`: 0.0-1.0 score for segment accuracy + - Enables filtering by quality threshold + - Critical for AI-generated transcripts and annotations + + **HERITAGE USE CASES**: + + | Use Case | Example | Start | End | + |----------|---------|-------|-----| + | Subtitle entry | "Welcome to the museum" | 0:30 | 0:35 | + | Scene annotation | "Exhibition hall panorama" | 1:00 | 1:30 | + | Chapter marker | "Introduction" | 0:00 | 2:00 | + | Object detection | "Painting: Night Watch" | 3:15 | 3:20 | + | Speaker change | "Curator speaking" | 5:00 | 7:30 | + + exact_mappings: + - crm:E52_Time-Span + - oa:FragmentSelector + + close_mappings: + - ma:MediaFragment + + related_mappings: + - schema:Clip + + slots: + # Time boundaries (ISO 8601 duration format) + - start_time + - end_time + + # Time boundaries (seconds for computation) + - start_seconds + - end_seconds + + # Content + - segment_text + - segment_index + + # Quality + - confidence + + # Metadata + - speaker_id + - speaker_label + + slot_usage: + start_time: + slot_uri: ma:hasStartTime + description: | + Start time of segment as ISO 8601 duration from video beginning. + + Media Ontology: hasStartTime for temporal start. 
+
+          **Format**: ISO 8601 duration (e.g., "PT0M30S" = 30 seconds from start)
+
+          **Common Patterns**:
+          - PT0S = Start of video (0 seconds)
+          - PT30S = 30 seconds
+          - PT1M30S = 1 minute 30 seconds
+          - PT1H15M30S = 1 hour 15 minutes 30 seconds
+        range: string
+        required: false
+        pattern: "^PT(?=\\d)(\\d+H)?(\\d+M)?(\\d+(\\.\\d+)?S)?$"
+        examples:
+          - value: "PT0M30S"
+            description: "30 seconds from video start"
+          - value: "PT1H15M30S"
+            description: "1 hour 15 minutes 30 seconds"
+
+      end_time:
+        slot_uri: ma:hasEndTime
+        description: |
+          End time of segment as ISO 8601 duration from video beginning.
+
+          Media Ontology: hasEndTime for temporal end.
+
+          Must be greater than or equal to start_time.
+        range: string
+        required: false
+        pattern: "^PT(?=\\d)(\\d+H)?(\\d+M)?(\\d+(\\.\\d+)?S)?$"
+        examples:
+          - value: "PT0M35S"
+            description: "35 seconds from video start"
+
+      start_seconds:
+        slot_uri: hc:startSeconds
+        description: |
+          Start time in seconds (floating point) from video beginning.
+
+          **PRIMARY for computation**. Use for:
+          - Video player synchronization
+          - Duration calculations
+          - Time-based sorting and filtering
+
+          Precision to milliseconds (3 decimal places) is typical.
+        range: float
+        required: true
+        minimum_value: 0.0
+        examples:
+          - value: 30.0
+            description: "30 seconds from start"
+          - value: "30.500"
+            description: "30.5 seconds (millisecond precision)"
+
+      end_seconds:
+        slot_uri: hc:endSeconds
+        description: |
+          End time in seconds (floating point) from video beginning.
+
+          Must be greater than start_seconds.
+
+          For single-frame annotations (e.g., object detection in one frame),
+          end_seconds may equal start_seconds or be slightly greater.
+        range: float
+        required: true
+        minimum_value: 0.0
+        examples:
+          - value: 35.0
+            description: "35 seconds from start"
+
+      segment_text:
+        slot_uri: oa:bodyValue
+        description: |
+          Text content for this segment.
+
+          Web Annotation: bodyValue for textual content.
+ + **Usage by content type**: + - Subtitles: Displayed caption text + - Transcripts: Spoken words during this segment + - Annotations: Description of detected content + - Chapters: Chapter title/description + range: string + required: false + examples: + - value: "Welkom bij het Rijksmuseum" + description: "Dutch subtitle text" + - value: "The curator explains the painting's history" + description: "Transcript segment" + + segment_index: + slot_uri: hc:segmentIndex + description: | + Sequential index of this segment within the parent content. + + Zero-based index for ordering segments: + - Subtitle: Order in which captions appear + - Annotation: Detection sequence + + Enables reconstruction of segment order when times overlap + or for stable sorting. + range: integer + required: false + minimum_value: 0 + examples: + - value: 0 + description: "First segment" + - value: 42 + description: "43rd segment (zero-indexed)" + + confidence: + slot_uri: hc:confidence + description: | + Confidence score for AI-generated content. + + Range: 0.0 (no confidence) to 1.0 (complete certainty) + + **Applies to**: + - ASR-generated transcript/subtitle segments + - CV-detected scene or object annotations + - OCR-extracted text from video frames + + **Thresholds** (suggested): + - > 0.9: High confidence, suitable for display + - 0.7-0.9: Medium, may need review + - < 0.7: Low, flag for human verification + range: float + required: false + minimum_value: 0.0 + maximum_value: 1.0 + examples: + - value: 0.95 + description: "High confidence ASR segment" + - value: 0.72 + description: "Medium confidence, may contain errors" + + speaker_id: + slot_uri: hc:speakerId + description: | + Identifier for the speaker during this segment. + + For transcripts with speaker diarization: + - Links to identified speaker (e.g., "SPEAKER_01") + - May be resolved to actual person identity + + Enables multi-speaker transcript navigation. 
+ range: string + required: false + examples: + - value: "SPEAKER_01" + description: "First identified speaker" + - value: "curator_taco_dibbits" + description: "Resolved speaker identity" + + speaker_label: + slot_uri: hc:speakerLabel + description: | + Human-readable label for the speaker. + + Display name for the speaker during this segment: + - May be generic ("Narrator", "Interviewer") + - May be specific ("Dr. Taco Dibbits, Museum Director") + + Distinguished from speaker_id which is a machine identifier. + range: string + required: false + examples: + - value: "Narrator" + description: "Generic speaker label" + - value: "Dr. Taco Dibbits, Museum Director" + description: "Specific identified speaker" + + rules: + - postconditions: + description: end_seconds must be >= start_seconds + # Note: LinkML doesn't support direct comparison rules, + # but this documents the constraint for validation + + comments: + - "Reusable time segment for subtitles, annotations, chapters" + - "Dual time format: ISO 8601 for serialization, seconds for computation" + - "Aligns with W3C Media Fragments URI specification" + - "Confidence scoring for AI-generated content" + - "Speaker diarization support for multi-speaker transcripts" + + see_also: + - "https://www.w3.org/TR/media-frags/" + - "https://www.w3.org/TR/annotation-model/" + - "https://www.w3.org/ns/ma-ont" + - "http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span" + +# ============================================================================ +# Slot Definitions +# ============================================================================ + +slots: + start_time: + description: Start time as ISO 8601 duration from video beginning + range: string + + end_time: + description: End time as ISO 8601 duration from video beginning + range: string + + start_seconds: + description: Start time in seconds (float) from video beginning + range: float + + end_seconds: + description: End time in seconds (float) from video beginning + 
range: float + + segment_text: + description: Text content for this time segment + range: string + + segment_index: + description: Sequential index of segment within parent + range: integer + + confidence: + description: Confidence score for AI-generated content (0.0-1.0) + range: float + + speaker_id: + description: Identifier for speaker during this segment + range: string + + speaker_label: + description: Human-readable label for speaker + range: string diff --git a/schemas/20251121/linkml/modules/classes/VideoTranscript.yaml b/schemas/20251121/linkml/modules/classes/VideoTranscript.yaml new file mode 100644 index 0000000000..e847949fda --- /dev/null +++ b/schemas/20251121/linkml/modules/classes/VideoTranscript.yaml @@ -0,0 +1,469 @@ +# Video Transcript Class +# Full text transcription of video audio content +# +# Part of Heritage Custodian Ontology v0.9.5 +# +# HIERARCHY: +# E73_Information_Object (CIDOC-CRM) +# │ +# └── VideoTextContent (abstract base - provenance) +# │ +# └── VideoTranscript (this class) +# │ +# └── VideoSubtitle (time-coded extension) +# +# DESIGN RATIONALE: +# VideoTranscript represents the complete textual representation of spoken +# content in a video. It extends VideoTextContent to inherit comprehensive +# provenance tracking and adds transcript-specific slots: +# +# - full_text: Complete transcript as single text block +# - transcript_format: How the text is structured (plain, paragraphed, etc.) +# - segments: Optional structured breakdown into VideoTimeSegments +# - includes_timestamps/speakers: Metadata about content structure +# +# VideoSubtitle extends this because subtitles ARE transcripts plus time-codes. 
+ +id: https://nde.nl/ontology/hc/class/VideoTranscript +name: video_transcript_class +title: Video Transcript Class + +imports: + - linkml:types + - ./VideoTextContent + - ./VideoTimeSegment + +prefixes: + linkml: https://w3id.org/linkml/ + hc: https://nde.nl/ontology/hc/ + schema: http://schema.org/ + dcterms: http://purl.org/dc/terms/ + prov: http://www.w3.org/ns/prov# + crm: http://www.cidoc-crm.org/cidoc-crm/ + skos: http://www.w3.org/2004/02/skos/core# + +default_prefix: hc + +classes: + + VideoTranscript: + is_a: VideoTextContent + class_uri: crm:E33_Linguistic_Object + abstract: false + description: | + Full text transcription of video audio content. + + **DEFINITION**: + + A VideoTranscript is the complete textual representation of all spoken + content in a video. It extends VideoTextContent with transcript-specific + properties and inherits all provenance tracking capabilities. + + **RELATIONSHIP TO VideoSubtitle**: + + VideoSubtitle is a subclass of VideoTranscript because: + 1. A subtitle file contains everything a transcript needs PLUS time codes + 2. You can derive a plain transcript from subtitles by stripping times + 3. This inheritance allows polymorphic handling of text content + + ``` + VideoTranscript VideoSubtitle (is_a VideoTranscript) + ├── full_text ├── full_text (inherited) + ├── segments[] ├── segments[] (required, with times) + └── (optional times) └── subtitle_format (SRT, VTT, etc.) + ``` + + **SCHEMA.ORG ALIGNMENT**: + + Maps to `schema:transcript` property: + > "If this MediaObject is an AudioObject or VideoObject, + > the transcript of that object." + + **CIDOC-CRM E33_Linguistic_Object**: + + E33 is the class comprising: + > "identifiable expressions in natural language or code" + + A transcript is a linguistic object derived from the audio track of + a video (which is itself an E73_Information_Object). 
+ + **TRANSCRIPT FORMATS**: + + | Format | Description | Use Case | + |--------|-------------|----------| + | PLAIN_TEXT | Continuous text, no structure | Simple search indexing | + | PARAGRAPHED | Text broken into paragraphs | Human reading | + | STRUCTURED | Segments with speaker labels | Research, analysis | + | TIMESTAMPED | Segments with time markers | Navigation, subtitling | + + **GENERATION METHODS** (inherited from VideoTextContent): + + | Method | Typical Use | Quality | + |--------|-------------|---------| + | ASR_AUTOMATIC | Whisper, Google STT | 0.80-0.95 | + | MANUAL_TRANSCRIPTION | Human transcriber | 0.98-1.0 | + | PLATFORM_PROVIDED | YouTube auto-captions | 0.75-0.90 | + | HYBRID | ASR + human correction | 0.95-1.0 | + + **HERITAGE INSTITUTION CONTEXT**: + + Transcripts are critical for heritage video collections: + + 1. **Discovery**: Full-text search over video content + 2. **Accessibility**: Deaf/HoH access to spoken content + 3. **Preservation**: Text outlasts video format obsolescence + 4. **Research**: Corpus analysis, keyword extraction + 5. **Translation**: Base for multilingual access + 6. **SEO**: Search engine indexing of video content + + **STRUCTURED SEGMENTS**: + + When `segments` is populated, the transcript has structural breakdown: + + ```yaml + segments: + - segment_index: 0 + start_seconds: 0.0 + end_seconds: 5.5 + segment_text: "Welcome to the Rijksmuseum." + speaker_label: "Narrator" + confidence: 0.94 + - segment_index: 1 + start_seconds: 5.5 + end_seconds: 12.3 + segment_text: "Today we'll explore the Night Watch gallery." + speaker_label: "Narrator" + confidence: 0.91 + ``` + + **PROVENANCE** (inherited from VideoTextContent): + + All transcripts include: + - `source_video`: Which video was transcribed + - `generated_by`: Tool/person that created transcript + - `generation_method`: ASR_AUTOMATIC, MANUAL_TRANSCRIPTION, etc. 
+ - `generation_timestamp`: When transcript was created + - `overall_confidence`: Aggregate quality score + - `is_verified`: Whether human-reviewed + + exact_mappings: + - crm:E33_Linguistic_Object + + close_mappings: + - schema:transcript + + related_mappings: + - dcterms:Text + + slots: + # Core content + - full_text + - transcript_format + + # Structural information + - includes_timestamps + - includes_speakers + - segments + + # Speaker metadata + - speaker_count + - primary_speaker + + # Additional metadata + - source_language_auto_detected + - paragraph_count + - sentence_count + + slot_usage: + full_text: + slot_uri: schema:text + description: | + Complete transcript text as a single string. + + Schema.org: text for primary textual content. + + Contains all spoken content from the video, concatenated. + May include: + - Speaker labels (if includes_speakers = true) + - Timestamps (if includes_timestamps = true) + - Paragraph breaks (if format = PARAGRAPHED or STRUCTURED) + + For structured access, use the `segments` slot instead. + range: string + required: true + examples: + - value: | + Welcome to the Rijksmuseum. Today we'll explore the masterpieces + of Dutch Golden Age painting. Our first stop is the Night Watch + by Rembrandt van Rijn, painted in 1642. + description: "Plain text transcript excerpt" + - value: | + [Narrator] Welcome to the Rijksmuseum. + [Narrator] Today we'll explore the masterpieces of Dutch Golden Age painting. + [Curator] Our first stop is the Night Watch by Rembrandt van Rijn. + description: "Transcript with speaker labels" + + transcript_format: + slot_uri: dcterms:format + description: | + Format/structure of the transcript text. + + Dublin Core: format for resource format. 
+
+          Indicates how the full_text is structured (see TranscriptFormatEnum for VERBATIM and CLEAN as well):
+          - PLAIN_TEXT: Continuous text without breaks
+          - PARAGRAPHED: Broken into paragraphs
+          - STRUCTURED: Includes speaker labels, times, or both
+          - TIMESTAMPED: Includes inline time markers
+        range: TranscriptFormatEnum
+        required: false
+        ifabsent: "string(PLAIN_TEXT)"
+        examples:
+          - value: "STRUCTURED"
+            description: "Text with speaker labels and paragraph breaks"
+
+      includes_timestamps:
+        slot_uri: hc:includesTimestamps
+        description: |
+          Whether the transcript includes time markers.
+
+          - **true**: Timestamps are embedded in full_text or segments have times
+          - **false**: No temporal information (default)
+
+          If true, prefer using `segments` for programmatic access.
+        range: boolean
+        required: false
+        ifabsent: "false"
+        examples:
+          - value: true
+            description: "Transcript has time codes"
+
+      includes_speakers:
+        slot_uri: hc:includesSpeakers
+        description: |
+          Whether the transcript includes speaker identification.
+
+          - **true**: Speaker labels/diarization available
+          - **false**: Single speaker or no identification (default)
+
+          When true, check `speaker_count` for number of distinct speakers.
+        range: boolean
+        required: false
+        ifabsent: "false"
+        examples:
+          - value: true
+            description: "Multi-speaker transcript with diarization"
+
+      segments:
+        slot_uri: hc:transcriptSegments
+        description: |
+          Structured breakdown of transcript into time-coded segments.
+
+          Optional for VideoTranscript (plain transcripts may not have times).
+          Required for VideoSubtitle (subtitles must have time codes).
+ + Each segment is a VideoTimeSegment with: + - start_seconds / end_seconds: Time boundaries + - segment_text: Text for this segment + - confidence: Per-segment accuracy score + - speaker_id / speaker_label: Speaker identification + + Use segments for: + - Video player synchronization + - Jump-to-time navigation + - Per-segment quality analysis + - Speaker-separated views + range: VideoTimeSegment + required: false + multivalued: true + inlined: true + inlined_as_list: true + examples: + - value: | + - segment_index: 0 + start_seconds: 0.0 + end_seconds: 3.5 + segment_text: "Welcome to the museum." + confidence: 0.95 + description: "Single structured segment" + + speaker_count: + slot_uri: hc:speakerCount + description: | + Number of distinct speakers identified in the transcript. + + Only meaningful when includes_speakers = true. + + 0 = Unknown/not analyzed + 1 = Single speaker (monologue) + 2+ = Multi-speaker (dialogue, panel, interview) + range: integer + required: false + minimum_value: 0 + examples: + - value: 3 + description: "Three speakers identified" + + primary_speaker: + slot_uri: hc:primarySpeaker + description: | + Identifier or name of the main/dominant speaker. + + For interviews: the interviewee (not interviewer) + For presentations: the presenter + For tours: the guide + + May be generic ("Narrator") or specific ("Dr. Taco Dibbits"). + range: string + required: false + examples: + - value: "Narrator" + description: "Generic primary speaker" + - value: "Dr. Taco Dibbits, Museum Director" + description: "Named primary speaker" + + source_language_auto_detected: + slot_uri: hc:sourceLanguageAutoDetected + description: | + Whether the content_language was auto-detected by ASR. + + - **true**: Language detected by ASR model + - **false**: Language was specified/known (default) + + Useful for quality assessment - auto-detection may be wrong. 
+ range: boolean + required: false + ifabsent: "false" + examples: + - value: true + description: "Language was auto-detected" + + paragraph_count: + slot_uri: hc:paragraphCount + description: | + Number of paragraphs in the transcript. + + Only meaningful when transcript_format = PARAGRAPHED or STRUCTURED. + + Useful for content sizing and readability assessment. + range: integer + required: false + minimum_value: 0 + examples: + - value: 15 + description: "Transcript has 15 paragraphs" + + sentence_count: + slot_uri: hc:sentenceCount + description: | + Approximate number of sentences in the transcript. + + Derived from punctuation analysis or NLP sentence segmentation. + + Useful for content analysis and readability metrics. + range: integer + required: false + minimum_value: 0 + examples: + - value: 47 + description: "Transcript has ~47 sentences" + + comments: + - "Full text transcription of video audio content" + - "Extends VideoTextContent with transcript-specific properties" + - "Base class for VideoSubtitle (subtitles are transcripts + time codes)" + - "Supports both plain text and structured segment-based transcripts" + - "Critical for accessibility, discovery, and preservation" + + see_also: + - "https://schema.org/transcript" + - "http://www.cidoc-crm.org/cidoc-crm/E33_Linguistic_Object" + +# ============================================================================ +# Enumerations +# ============================================================================ + +enums: + + TranscriptFormatEnum: + description: | + Format/structure of transcript text content. + + Indicates how the full_text is organized. + permissible_values: + PLAIN_TEXT: + description: | + Continuous text without structural markers. + No speaker labels, no timestamps, no paragraph breaks. + Suitable for simple full-text search indexing. + PARAGRAPHED: + description: | + Text broken into paragraphs. + May be based on topic changes, speaker pauses, or semantic units. 
+ Improves human readability. + STRUCTURED: + description: | + Text with speaker labels and/or section markers. + Format: "[Speaker] Text content" or similar. + Enables speaker-specific analysis. + TIMESTAMPED: + description: | + Text with inline time markers. + Format: "[00:30] Text content" or similar. + Enables temporal navigation in text view. + VERBATIM: + description: | + Exact transcription including fillers, false starts, overlaps. + "[um]", "[pause]", "[crosstalk]" markers. + Used for linguistic analysis or legal transcripts. + CLEAN: + description: | + Edited for readability - fillers removed, grammar corrected. + May diverge slightly from literal spoken content. + Suitable for publication or accessibility. + +# ============================================================================ +# Slot Definitions +# ============================================================================ + +slots: + full_text: + description: Complete transcript text as single string + range: string + + transcript_format: + description: Format/structure of transcript text + range: TranscriptFormatEnum + + includes_timestamps: + description: Whether transcript includes time markers + range: boolean + + includes_speakers: + description: Whether transcript includes speaker identification + range: boolean + + segments: + description: Structured breakdown into time-coded segments + range: VideoTimeSegment + multivalued: true + + speaker_count: + description: Number of distinct speakers identified + range: integer + + primary_speaker: + description: Identifier/name of main speaker + range: string + + source_language_auto_detected: + description: Whether language was auto-detected by ASR + range: boolean + + paragraph_count: + description: Number of paragraphs in transcript + range: integer + + sentence_count: + description: Number of sentences in transcript + range: integer