# glam/schemas/20251121/linkml/modules/classes/VideoTextContent.yaml

# LinkML schema module that declares the abstract VideoTextContent class,
# its generation-method enum, and the slots the class uses.
id: https://nde.nl/ontology/hc/class/VideoTextContent
name: video_text_content_class
title: Video Text Content Class
# Modules this schema builds on: the LinkML built-in types, the VideoPost
# class module, and the class_metadata_slots slot module.
imports:
- linkml:types
- ./VideoPost
- ../slots/class_metadata_slots
# CURIE prefixes available to class_uri, slot_uri and mapping values.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  skos: http://www.w3.org/2004/02/skos/core#
  oa: http://www.w3.org/ns/oa#
# Elements without an explicit URI are placed in the hc: namespace.
default_prefix: hc
# Class definitions contributed by this module.
classes:
  # Abstract parent for text derived from video. It is typed as a CIDOC-CRM
  # E73 Information Object and cannot be instantiated directly.
  VideoTextContent:
    class_uri: crm:E73_Information_Object
    abstract: true
    description: |
      Abstract base class for all textual/derived content from videos.

      **DEFINITION**:

      VideoTextContent is the abstract parent for all text that is extracted,
      transcribed, or derived from video content. This includes:

      | Subclass | Source | Description |
      |----------|--------|-------------|
      | VideoTranscript | Audio | Full text transcription of spoken content |
      | VideoSubtitle | Audio | Time-coded caption entries (SRT/VTT) |
      | VideoAnnotation | Visual | CV/multimodal-derived descriptions |

      **PROVENANCE REQUIREMENTS**:

      All video-derived text MUST include comprehensive provenance:

      1. **Source**: Which video was processed (`source_video`)
      2. **Method**: How was content generated (`generation_method`)
      3. **Agent**: Who/what generated it (`generated_by`)
      4. **Time**: When was it generated (`generation_timestamp`)
      5. **Version**: Tool/model version (`model_version`)
      6. **Quality**: Overall confidence (`overall_confidence`)

      **PROV-O ALIGNMENT**:

      Maps to W3C PROV-O for provenance tracking:

      ```turtle
      :transcript a hc:VideoTranscript ;
          prov:wasGeneratedBy :asr_activity ;
          prov:wasAttributedTo :whisper_model ;
          prov:generatedAtTime "2025-12-01T10:00:00Z" ;
          prov:wasDerivedFrom :source_video .
      ```

      **CIDOC-CRM E73_Information_Object**:

      - E73 is the base for all identifiable immaterial items
      - Includes texts, computer programs, songs, recipes
      - VideoTextContent are E73 instances derived from video (E73)

      **GENERATION METHODS**:

      | Method | Description | Typical Confidence |
      |--------|-------------|-------------------|
      | ASR_AUTOMATIC | Automatic speech recognition | 0.75-0.95 |
      | ASR_ENHANCED | ASR with post-processing | 0.85-0.98 |
      | MANUAL_TRANSCRIPTION | Human transcription | 0.98-1.0 |
      | MANUAL_CORRECTION | Human-corrected ASR | 0.95-1.0 |
      | CV_AUTOMATIC | Computer vision detection | 0.60-0.90 |
      | MULTIMODAL | Combined audio+visual AI | 0.70-0.95 |
      | OCR | Optical character recognition | 0.80-0.98 |
      | PLATFORM_PROVIDED | From YouTube/Vimeo API | 0.85-0.95 |

      GenerationMethodEnum also defines CV_ENHANCED, HYBRID, and UNKNOWN;
      no typical confidence range is documented for these values.

      **HERITAGE INSTITUTION CONTEXT**:

      Video text content is critical for:
      - **Accessibility**: Deaf/HoH users need accurate captions
      - **Discovery**: Full-text search over video collections
      - **Preservation**: Text outlasts video format obsolescence
      - **Research**: Analyzing spoken content at scale
      - **Translation**: Multilingual access to heritage content

      **LANGUAGE SUPPORT**:

      - `content_language`: Primary language of text content
      - May differ from video's default_audio_language if translated
      - ISO 639-1 codes (e.g., "nl", "en", "de")
    # Ontology alignments, from strongest (exact) to weakest (related).
    exact_mappings:
    - crm:E73_Information_Object
    close_mappings:
    - prov:Entity
    related_mappings:
    - schema:CreativeWork
    - dcterms:Text
    # Slots carried by this class, listed alphabetically. Every entry has a
    # matching refinement under slot_usage.
    slots:
    - character_count
    - content_language
    - content_title
    - generated_by
    - generation_method
    - generation_timestamp
    - is_verified
    - model_provider
    - model_version
    - overall_confidence
    - processing_duration_seconds
    - source_video
    - source_video_url
    - specificity_annotation
    - template_specificity
    - verification_date
    - verified_by
    - word_count
    # Class-specific refinements (URI, range, cardinality, examples) of the
    # slots listed above.
    slot_usage:
      # Required identifier of the source video, serialized as
      # prov:wasDerivedFrom. The range is a plain string, not a VideoPost
      # object reference.
      source_video:
        slot_uri: prov:wasDerivedFrom
        description: |
          Reference to the VideoPost from which this content was derived.

          PROV-O: wasDerivedFrom links derived content to source.

          Links to the video's unique identifier (post_id).
        range: string
        required: true
        examples:
        - value: FbIoC-Owy-M
          description: YouTube video ID as source reference
      # Optional URI of the source video, serialized as schema:url.
      source_video_url:
        slot_uri: schema:url
        description: |
          URL of the source video.

          Convenience field for direct video access.
          Derived from source_video but stored for quick reference.
        range: uri
        required: false
        examples:
        - value: https://www.youtube.com/watch?v=FbIoC-Owy-M
          description: Full YouTube video URL
      # Required language code of the text, serialized as dcterms:language.
      # No pattern is declared here, so the ISO 639-1 format described below
      # is a convention rather than a validated constraint.
      content_language:
        slot_uri: dcterms:language
        description: |
          Primary language of the text content.

          Dublin Core: language for content language.

          ISO 639-1 code. May differ from video's audio language
          if this is a translation or localization.
        range: string
        required: true
        examples:
        - value: nl
          description: Dutch language content
        - value: en
          description: English translation
      # Optional human-readable title, serialized as dcterms:title.
      content_title:
        slot_uri: dcterms:title
        description: |
          Title or label for this text content.

          Dublin Core: title for content name.

          Examples:
          - "Rijksmuseum Tour - Full Transcript"
          - "Dutch Subtitles - Auto-generated"
          - "Scene Annotations - CV Model v2.1"
        range: string
        required: false
        examples:
        - value: De Vrijheidsroute Ep.3 - Dutch Transcript
          description: Descriptive title for transcript
      # Required free-text identifier of the generating agent, serialized as
      # prov:wasAttributedTo.
      # NOTE(review): the description shows a human agent as "transcriber:..."
      # while the example below uses "manual:..." - confirm which prefix is
      # the intended convention.
      generated_by:
        slot_uri: prov:wasAttributedTo
        description: |
          The agent (model, service, person) that generated this content.

          PROV-O: wasAttributedTo identifies the responsible agent.

          **Examples**:
          - AI Models: "openai/whisper-large-v3", "google/speech-to-text"
          - Services: "YouTube Auto-captions", "Rev.com"
          - Human: "transcriber:jane.doe@museum.nl"
        range: string
        required: true
        examples:
        - value: openai/whisper-large-v3
          description: OpenAI Whisper ASR model
        - value: YouTube Auto-captions
          description: Platform-provided captions
        - value: manual:curator@rijksmuseum.nl
          description: Human transcriber
      # Required generation method, restricted to GenerationMethodEnum values.
      # NOTE(review): PROV-O defines prov:wasGeneratedBy as pointing to an
      # activity, whereas this slot holds an enum value - confirm this mapping
      # is intended.
      generation_method:
        slot_uri: prov:wasGeneratedBy
        description: |
          The method used to generate this content.

          PROV-O: wasGeneratedBy for generation activity type.

          See GenerationMethodEnum for standardized values.
        range: GenerationMethodEnum
        required: true
        examples:
        - value: ASR_AUTOMATIC
          description: Automatic speech recognition
        - value: MANUAL_TRANSCRIPTION
          description: Human transcription
      # Required datetime of generation, serialized as prov:generatedAtTime.
      generation_timestamp:
        slot_uri: prov:generatedAtTime
        description: |
          When this content was generated.

          PROV-O: generatedAtTime for creation timestamp.

          ISO 8601 datetime. Critical for versioning and reproducibility.
        range: datetime
        required: true
        examples:
        # Quoted so the YAML loader keeps the timestamp as a string.
        - value: '2025-12-01T10:30:00Z'
          description: Generated December 1, 2025 at 10:30 UTC
      # Optional version string of the generating model or tool, serialized
      # as schema:softwareVersion.
      model_version:
        slot_uri: schema:softwareVersion
        description: |
          Version of the model or tool used for generation.

          Schema.org: softwareVersion for version tracking.

          Critical for reproducibility and quality assessment.
        range: string
        required: false
        examples:
        - value: large-v3
          description: Whisper model version
        - value: v2.3.1
          description: Software version number
      # Optional vendor name of the generating model or service, serialized
      # as schema:provider.
      model_provider:
        slot_uri: schema:provider
        description: |
          Provider or vendor of the generation model/service.

          Schema.org: provider for service provider.
        range: string
        required: false
        examples:
        - value: OpenAI
          description: Model provider
        - value: Google Cloud
          description: Cloud service provider
      # Optional confidence score, constrained to the closed interval
      # [0.0, 1.0] by minimum_value/maximum_value.
      overall_confidence:
        slot_uri: hc:overallConfidence
        description: |
          Overall confidence score for the generated content.

          Range: 0.0 (no confidence) to 1.0 (complete certainty)

          Aggregated from per-segment confidence scores or
          provided by the generation model.

          **Thresholds** (suggested):
          - > 0.9: High quality, production-ready
          - 0.75-0.9: Good, may have minor errors
          - 0.6-0.75: Usable, should be reviewed
          - < 0.6: Low quality, needs significant correction
        range: float
        required: false
        minimum_value: 0.0
        maximum_value: 1.0
        examples:
        - value: 0.92
          description: High confidence ASR output
      # Optional human-verification flag; ifabsent makes it default to false
      # when the slot is omitted.
      is_verified:
        slot_uri: hc:isVerified
        description: |
          Whether content has been verified by a human.

          - **true**: Human-reviewed and approved
          - **false**: Not yet verified (default for AI-generated)

          Critical for quality assurance in heritage contexts.
        range: boolean
        required: false
        ifabsent: 'false'
        examples:
        # Example values are strings in the LinkML metamodel. A bare YAML
        # boolean would be stringified by the loader as "True", so the value
        # is quoted to keep the lowercase literal.
        - value: 'true'
          description: Human-verified transcript
      # Optional free-text identity of the human verifier.
      # It uses its own predicate because generated_by in this class already
      # maps to prov:wasAttributedTo. Two slots of one class that share a
      # slot_uri cannot be told apart once serialized to RDF, so the PROV
      # term is kept only as a related mapping.
      verified_by:
        slot_uri: hc:verifiedBy
        description: |
          Identity of the person who verified the content.

          Only populated when is_verified = true.
        related_mappings:
        - prov:wasAttributedTo
        range: string
        required: false
        examples:
        - value: curator@rijksmuseum.nl
          description: Staff member who verified
      # Optional datetime of verification, serialized as dcterms:dateAccepted.
      verification_date:
        slot_uri: dcterms:dateAccepted
        description: |
          Date when content was verified.

          Dublin Core: dateAccepted for approval date.
        range: datetime
        required: false
        examples:
        # Quoted so the YAML loader keeps the timestamp as a string.
        - value: '2025-12-02T15:00:00Z'
          description: Verified December 2, 2025
      # Optional generation time in seconds; must be non-negative.
      processing_duration_seconds:
        slot_uri: hc:processingDuration
        description: |
          Time taken to generate this content, in seconds.

          Useful for performance monitoring and cost estimation.
        range: float
        required: false
        minimum_value: 0.0
        examples:
        - value: 45.3
          description: Processed in 45.3 seconds
      # Optional non-negative integer count of words in the text.
      word_count:
        slot_uri: hc:wordCount
        description: |
          Total number of words in the text content.

          Useful for content sizing and analysis.
        range: integer
        required: false
        minimum_value: 0
        examples:
        - value: 1523
          description: 1,523 words in transcript
      # Optional non-negative integer count of characters, spaces included.
      character_count:
        slot_uri: hc:characterCount
        description: |
          Total number of characters in the text content.

          Includes spaces. Useful for storage estimation.
        range: integer
        required: false
        minimum_value: 0
        examples:
        - value: 8742
          description: 8,742 characters
      # Both specificity slots hold inlined objects rather than references.
      # SpecificityAnnotation and TemplateSpecificityScores are not defined in
      # this file - presumably they come from an imported module; verify.
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    # Short editorial notes summarizing what the class provides.
    comments:
    - Abstract base for all video-derived text content
    - Comprehensive PROV-O provenance tracking
    - Confidence scoring for AI-generated content
    - Verification workflow support
    - Critical for heritage accessibility and discovery
    # External specifications this class is aligned with.
    see_also:
    - https://www.w3.org/TR/prov-o/
    - http://www.cidoc-crm.org/cidoc-crm/E73_Information_Object
# Enumerations contributed by this module.
enums:
  # Controlled vocabulary used as the range of the generation_method slot.
  GenerationMethodEnum:
    description: |
      Methods for generating video-derived text content.

      Standardized values for provenance tracking.
    permissible_values:
      # Speech-based methods
      ASR_AUTOMATIC:
        description: Automatic speech recognition (raw output)
      ASR_ENHANCED:
        description: ASR with post-processing (punctuation, normalization)
      # Human-produced or human-corrected text
      MANUAL_TRANSCRIPTION:
        description: Fully human-transcribed content
      MANUAL_CORRECTION:
        description: Human-corrected ASR output
      # Vision-based and combined methods
      CV_AUTOMATIC:
        description: Computer vision detection (raw output)
      CV_ENHANCED:
        description: CV with post-processing or filtering
      MULTIMODAL:
        description: Combined audio+visual AI processing
      OCR:
        description: Optical character recognition from video frames
      # Externally sourced, mixed, or unrecorded provenance
      PLATFORM_PROVIDED:
        description: Content from platform API (YouTube, Vimeo captions)
      HYBRID:
        description: Combination of automated and manual methods
      UNKNOWN:
        description: Generation method not recorded
# Module-level slot declarations giving only a short description and a range.
# The VideoTextContent slot_usage section layers slot_uri, required flags,
# value bounds and examples on top of these.
# specificity_annotation and template_specificity are not declared here -
# presumably they are provided by an imported module; verify.
slots:
  # Source video reference
  source_video:
    description: Reference to source VideoPost (video ID)
    range: string
  source_video_url:
    description: URL of the source video
    range: uri
  # Content descriptors
  content_language:
    description: Primary language of text content (ISO 639-1)
    range: string
  content_title:
    description: Title or label for this text content
    range: string
  # Generation provenance
  generated_by:
    description: Agent that generated this content (model, service, person)
    range: string
  generation_method:
    description: Method used to generate content
    range: GenerationMethodEnum
  generation_timestamp:
    description: When content was generated
    range: datetime
  model_version:
    description: Version of model/tool used
    range: string
  model_provider:
    description: Provider of model/service
    range: string
  # Quality and verification
  overall_confidence:
    description: Overall confidence score (0.0-1.0)
    range: float
  is_verified:
    description: Whether content has been human-verified
    range: boolean
  verified_by:
    description: Person who verified the content
    range: string
  verification_date:
    description: Date content was verified
    range: datetime
  # Processing and size metrics
  processing_duration_seconds:
    description: Time taken to generate content
    range: float
  word_count:
    description: Total word count
    range: integer
  character_count:
    description: Total character count
    range: integer