# File: glam/data/entity_annotation/modules/advanced/tei/cmc.yaml

# =============================================================================
# GLAM-NER: TEI P5 COMPUTER-MEDIATED COMMUNICATION (CMC) MODULE
# =============================================================================
# Module: modules/advanced/tei/cmc.yaml
# TEI Chapter: 9 (Computer-mediated Communication)
# TEI Module Name: cmc
# Version: 1.0.0
# Status: Complete
# =============================================================================
#
# This module provides LinkML representations of TEI P5 elements for encoding
# computer-mediated communication including social media posts, chat messages,
# forum threads, wiki discussions, and other digital discourse. Essential for
# social media NER, online discourse analysis, and digital heritage collections.
#
# Key Features:
#   - Post element for CMC turns/messages
#   - Threading and reply structures
#   - Emoji and emoticon encoding
#   - Multimodal content (text, images, audio, video)
#   - Participant metadata and anonymization
#   - Bot/automated content detection
#   - Cross-platform CMC normalization
#
# TEI Source: https://tei-c.org/release/doc/tei-p5-doc/en/html/CMC.html
# =============================================================================

# Schema identity: canonical URI, machine-readable name, display title, version.
id: https://w3id.org/glam/ner/tei/cmc
name: glam-ner-tei-cmc
title: 'TEI P5 Computer-mediated Communication Module for GLAM-NER'
version: '1.0.0'

# Licensing and external reference documentation.
license: https://creativecommons.org/licenses/by/4.0/
see_also:
  - https://tei-c.org/release/doc/tei-p5-doc/en/html/CMC.html
  - https://www.w3.org/community/ontolex/wiki/CMC

# CURIE prefix map. Every prefix used in class_uri / slot_uri values or in
# the *_mapping annotations of this schema should be declared here.
prefixes:
  # Core / project namespaces
  linkml: https://w3id.org/linkml/
  glam: https://w3id.org/glam/ner/
  tei: http://www.tei-c.org/ns/1.0/
  # External vocabularies used for class and slot mappings
  schema: http://schema.org/
  sioc: http://rdfs.org/sioc/ns#
  as: https://www.w3.org/ns/activitystreams#
  foaf: http://xmlns.com/foaf/0.1/
  dcterms: http://purl.org/dc/terms/
  crm: http://www.cidoc-crm.org/cidoc-crm/
  prov: http://www.w3.org/ns/prov#
  oa: http://www.w3.org/ns/oa#
  # Referenced by the void_mapping (CMCCorpus) and nif_mapping
  # (CMCEntityMention) annotations; previously undeclared.
  void: http://rdfs.org/ns/void#
  nif: http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#

# Range applied to any slot that does not declare its own `range`.
default_range: string

# Brings in the standard LinkML type definitions that slots in this schema
# use as ranges (e.g. integer, float, boolean, datetime, uri).
imports:
  - linkml:types

# =============================================================================
# ENUMERATIONS
# =============================================================================

enums:

  # ---------------------------------------------------------------------------
  # CMC Modality (written vs spoken)
  # ---------------------------------------------------------------------------
  CMCModalityEnum:
    description: >-
      Modality of computer-mediated communication. Distinguishes between written
      text-based communication and spoken/audio-based communication transmitted
      via digital channels.
    # Range of the `modality` slot.
    permissible_values:
      written:
        description: 'Text-based CMC (chat, email, forum posts, social media text)'
      spoken:
        description: 'Voice-based CMC (voice messages, audio calls, podcasts)'
      mixed:
        description: 'Combines written and spoken modalities (video with text overlays)'

  # ---------------------------------------------------------------------------
  # Content Generation Source
  # ---------------------------------------------------------------------------
  GeneratedByEnum:
    description: >-
      Source of content generation for CMC posts. Distinguishes between
      human-authored content and various forms of automated/system-generated
      content. Essential for NER to identify bot-generated text.
    # Range of the `generated_by` slot (slot_uri tei:generatedBy).
    permissible_values:
      human:
        description: 'Content authored by a human user'
      template:
        description: 'Content generated from a template with user input'
      system:
        description: 'System-generated content (notifications, status updates)'
      bot:
        description: 'Content generated by an automated bot/AI agent'
      # NOTE(review): TEI's @generatedBy appears to spell this case
      # "unspecified" -- confirm before round-tripping values to TEI XML.
      unknown:
        description: 'Generation source cannot be determined'

  # ---------------------------------------------------------------------------
  # CMC Platform Type
  # ---------------------------------------------------------------------------
  CMCPlatformTypeEnum:
    description: >-
      Type of computer-mediated communication platform. Categorizes the
      technological context of CMC for normalization and analysis.
    # The example platforms in the descriptions are illustrative; some
    # platforms are named under more than one value.
    permissible_values:
      social_media:
        description: 'Social networking platforms (Twitter/X, Facebook, Instagram, LinkedIn)'
      chat:
        description: 'Synchronous chat platforms (WhatsApp, Telegram, Slack, Discord)'
      forum:
        description: 'Asynchronous discussion forums (Reddit, Stack Overflow, phpBB)'
      wiki_talk:
        description: 'Wiki discussion pages (Wikipedia talk pages, MediaWiki)'
      email:
        description: 'Email communication (Gmail, Outlook, mailing lists)'
      blog:
        description: 'Blog platforms (WordPress, Medium, Blogger)'
      comment:
        description: 'Comment sections (news articles, video comments)'
      microblog:
        description: 'Microblogging platforms (Twitter/X, Mastodon, Bluesky)'
      video_live:
        description: 'Live video streaming chat (YouTube Live, Twitch)'
      gaming:
        description: 'Gaming communication (in-game chat, Discord gaming)'
      dating:
        description: 'Dating platform messaging'
      professional:
        description: 'Professional networking (LinkedIn messages)'
      customer_support:
        description: 'Customer service chat systems'
      sms:
        description: 'SMS/text messaging'
      # Catch-all for platforms not covered above.
      other:
        description: 'Other CMC platform type'

  # ---------------------------------------------------------------------------
  # Post Type
  # ---------------------------------------------------------------------------
  CMCPostTypeEnum:
    description: >-
      Type of CMC post within a communication thread. Distinguishes between
      original posts, replies, reposts/shares, and other post types.
    # Range of the `post_type` slot.
    permissible_values:
      original:
        description: 'Original post starting a new thread'
      reply:
        description: 'Reply to another post'
      repost:
        description: 'Share/repost of another post (RT, reblog)'
      quote:
        description: 'Quote post with commentary'
      reaction:
        description: 'Reaction-only post (emoji reaction, like)'
      edit:
        description: 'Edit of a previous post'
      deletion:
        description: 'Deletion marker (post was deleted)'
      system:
        description: 'System message (join/leave notifications)'
      pinned:
        description: 'Pinned/sticky post'

  # ---------------------------------------------------------------------------
  # Emoji Category
  # ---------------------------------------------------------------------------
  EmojiCategoryEnum:
    description: >-
      Unicode emoji category classification. Used for emoji/emoticon encoding
      in CMC transcriptions.
    # Range of the `emoji_category` slot. The values are mutually exclusive:
    # the example lists for `objects` and `symbols` no longer name members of
    # the sibling `symbols` / `flags` categories.
    permissible_values:
      smileys_emotion:
        description: "Smileys & Emotion (face expressions, hearts)"
      people_body:
        description: "People & Body (hand gestures, people)"
      animals_nature:
        description: "Animals & Nature (animals, plants)"
      food_drink:
        description: "Food & Drink"
      travel_places:
        description: "Travel & Places (buildings, transport)"
      activities:
        description: "Activities (sports, arts)"
      objects:
        description: "Objects (tools, household and office items)"
      symbols:
        description: "Symbols (arrows, zodiac, punctuation)"
      flags:
        description: "Flags (country flags, special flags)"
      component:
        description: "Component (skin tones, hair)"

  # ---------------------------------------------------------------------------
  # Participant Anonymization Level
  # ---------------------------------------------------------------------------
  AnonymizationLevelEnum:
    description: >-
      Level of participant anonymization applied to CMC data. Important for
      privacy protection in CMC corpora and compliance with GDPR.
    # Range of the `anonymization_level` slot.
    permissible_values:
      none:
        description: 'No anonymization (original usernames preserved)'
      pseudonymized:
        description: 'Usernames replaced with consistent pseudonyms'
      anonymized:
        description: 'Full anonymization (no identifying information)'
      aggregated:
        description: 'Aggregated data (no individual posts)'

  # ---------------------------------------------------------------------------
  # Thread Structure Type
  # ---------------------------------------------------------------------------
  ThreadStructureEnum:
    description: >-
      Type of threading structure in CMC platform. Affects how replies and
      conversations are organized and visualized.
    # Range of the `thread_structure` slot.
    permissible_values:
      flat:
        description: 'Flat chronological list (no threading)'
      linear:
        description: 'Linear thread with reply references'
      nested:
        description: 'Nested/threaded replies (Reddit-style)'
      wiki_indent:
        description: 'Wiki-style indentation threading'
      graph:
        description: 'Graph structure (multiple parents)'

  # ---------------------------------------------------------------------------
  # Multimodal Content Type
  # ---------------------------------------------------------------------------
  CMCMediaTypeEnum:
    description: >-
      Type of multimodal content embedded in CMC posts. Posts may contain text
      plus images, video, audio, or other media.
    # Range of the `media_type` slot.
    permissible_values:
      text_only:
        description: 'Text-only post'
      image:
        description: 'Post with image(s)'
      video:
        description: 'Post with video'
      audio:
        description: 'Post with audio/voice message'
      gif:
        description: 'Post with animated GIF'
      sticker:
        description: 'Post with sticker'
      file:
        description: 'Post with file attachment'
      poll:
        description: 'Post with poll'
      location:
        description: 'Post with location/check-in'
      link:
        description: 'Post with link preview'
      mixed:
        description: 'Post with multiple media types'

# =============================================================================
# CLASSES
# =============================================================================

classes:

  # ===========================================================================
  # CORE POST ELEMENT
  # ===========================================================================

  # One CMC contribution, mapped to tei:post. xml_id and content_text are
  # the only slots marked required.
  CMCPost:
    description: >-
      A single post, message, or contribution in computer-mediated
      communication. The fundamental unit of CMC discourse, corresponding to
      TEI <post> element. Can represent tweets, chat messages, forum posts,
      wiki talk contributions, email messages, blog comments, etc.
    class_uri: tei:post
    slots:
      # Identity and classification
      - xml_id
      - post_type
      - modality
      - generated_by
      # Threading
      - reply_to
      - indent_level
      # Authorship and timing
      - who
      - when_written
      - when_posted
      - synch
      # Content and in-text features
      - content_text
      - embedded_media
      - mentions
      - hashtags
      - emojis
      - urls
      # Revision history, engagement, platform details
      - edit_history
      - reactions
      - platform_metadata
    slot_usage:
      xml_id:
        required: true
      content_text:
        required: true
    annotations:
      tei_element: 'post'
      tei_module: 'cmc'
      glam_hypernym: 'TXT.CMC.PST'
      ner_relevance: |
        CMC posts are primary sources for NER in social media and online discourse.
        May contain informal language, abbreviations, hashtags, and @mentions.
        Entity recognition must handle: platform-specific formats (@ for users,
        # for topics), emoji as sentiment markers, URLs as references, and
        non-standard orthography typical of CMC.

  # ===========================================================================
  # THREADING AND STRUCTURE
  # ===========================================================================

  # A thread of posts, mapped to sioc:Thread. Must carry an xml_id and at
  # least the `posts` slot.
  CMCThread:
    description: >-
      A thread of related CMC posts, representing a conversation or
      discussion. Contains an ordered collection of posts with reply
      relationships. Used for forum threads, Twitter threads, email chains,
      chat conversations.
    class_uri: sioc:Thread
    slots:
      # Identity and structure
      - xml_id
      - thread_id
      - thread_title
      - thread_structure
      # Contents
      - original_post
      - posts
      # Summary statistics and timing
      - post_count
      - participant_count
      - start_time
      - last_activity
      # State flags and platform link
      - is_closed
      - is_pinned
      - platform_thread_url
    slot_usage:
      xml_id:
        required: true
      posts:
        multivalued: true
        required: true
    annotations:
      glam_hypernym: 'TXT.CMC.THR'
      sioc_mapping: 'sioc:Thread'

  # Container for one or more threads, mapped to sioc:Forum.
  CMCConversation:
    description: >-
      A conversation context containing one or more related threads. Represents
      broader discourse context, such as all discussion on a topic across
      multiple threads, or a chat room conversation over time.
    class_uri: sioc:Forum
    slots:
      - xml_id
      - conversation_id
      - conversation_title
      - threads
      # Platform context
      - platform_type
      - channel_name
      # Time span and size
      - start_time
      - end_time
      - total_posts
      - participants
    annotations:
      glam_hypernym: 'TXT.CMC.CNV'
      sioc_mapping: 'sioc:Forum'

  # ===========================================================================
  # PARTICIPANT METADATA
  # ===========================================================================

  # A user account on a CMC platform, mapped to sioc:UserAccount.
  CMCParticipant:
    description: >-
      A participant in computer-mediated communication. Represents a user
      account or identity within a CMC platform. May be pseudonymized or
      anonymized for privacy. Links to TEI <person> for speaker
      identification.
    class_uri: sioc:UserAccount
    slots:
      # Identity
      - xml_id
      - participant_id
      - username
      - display_name
      # Privacy handling
      - anonymized_id
      - anonymization_level
      # Account properties
      - account_created
      - account_verified
      - is_bot
      - platform_user_url
      # Links and corpus-level information
      - person_ref
      - role_in_conversation
      - post_count_in_corpus
      - demographic_info
    slot_usage:
      xml_id:
        required: true
    annotations:
      glam_hypernym: 'AGT.CMC.USR'
      sioc_mapping: 'sioc:UserAccount'
      foaf_mapping: 'foaf:OnlineAccount'
      privacy_note: |
        CMC participant data must comply with privacy regulations (GDPR, CCPA).
        Use anonymization_level to document privacy protection measures.
        Real identities should only be preserved when explicit consent exists
        or for public figures in public discourse.

  # A set of participants, mapped to sioc:Usergroup.
  CMCParticipantGroup:
    description: >-
      A group of participants in CMC, such as members of a chat room, forum
      community, or social media follower group.
    class_uri: sioc:Usergroup
    slots:
      - xml_id
      - group_id
      - group_name
      - group_type
      # Membership
      - members
      - member_count
      - creation_date
      - platform_group_url
    annotations:
      glam_hypernym: 'GRP.CMC'
      sioc_mapping: 'sioc:Usergroup'

  # ===========================================================================
  # EMOJI AND EMOTICON ENCODING
  # ===========================================================================

  # An emoji occurrence inside a post; no class_uri is assigned.
  CMCEmoji:
    description: >-
      An emoji or emoticon in CMC text. Encodes both Unicode emoji and
      text-based emoticons (e.g., :) :-P). Essential for sentiment analysis and
      understanding informal CMC expression.
    slots:
      - xml_id
      # The emoji itself
      - emoji_char
      - emoji_codepoint
      - emoji_name
      - emoji_category
      # Custom (platform-specific) emoji
      - is_custom
      - custom_emoji_url
      # Interpretation and location
      - text_equivalent
      - position_in_post
      - sentiment_valence
    annotations:
      glam_hypernym: 'TXT.CMC.EMJ'
      ner_note: |
        Emoji can function as: sentiment markers, entity references (flag emoji
        for countries), topic markers, or standalone expressions. NER pipelines
        should consider emoji context for entity disambiguation and sentiment.

  # An ASCII emoticon occurrence; shares position_in_post and
  # sentiment_valence with CMCEmoji.
  CMCEmoticon:
    description: >-
      A text-based emoticon in CMC (e.g., :), :-), :P, <3, XD). Distinguished
      from Unicode emoji as ASCII-based representations.
    slots:
      - xml_id
      - emoticon_text
      - normalized_form
      - emoji_equivalent
      - position_in_post
      - sentiment_valence
    annotations:
      glam_hypernym: 'TXT.CMC.EMO'

  # ===========================================================================
  # HASHTAGS AND MENTIONS
  # ===========================================================================

  # A hashtag occurrence inside a post, optionally linked to an entity.
  CMCHashtag:
    description: >-
      A hashtag in CMC text (e.g., #BlackLivesMatter, #AI, #heritage). Hashtags
      function as topic markers, community identifiers, and sometimes as named
      entities themselves (event names, campaign names).
    slots:
      - xml_id
      - hashtag_text
      - hashtag_normalized
      - position_in_post
      - is_trending
      - topic_category
      - entity_ref
    annotations:
      glam_hypernym: 'APP.CMC.HTG'
      ner_note: |
        Hashtags may represent: events (#Olympics2024), organizations (#UNESCO),
        movements (#MeToo), topics (#AI), or locations (#Paris). NER should
        consider hashtags as potential entity mentions.

  # An @-mention occurrence inside a post, optionally linked to an entity.
  CMCMention:
    description: >-
      An @-mention of a user in CMC text (e.g., @username, @NASA). Mentions
      explicitly reference other users or accounts and function as named entity
      references within CMC.
    slots:
      - xml_id
      # Surface form and target account
      - mention_text
      - mentioned_username
      - mentioned_user_ref
      - position_in_post
      - is_reply_mention
      # Entity linking
      - entity_type
      - entity_ref
    annotations:
      glam_hypernym: 'APP.CMC.MEN'
      ner_note: |
        @-mentions can reference: individuals (@jack), organizations (@NASA),
        bots (@github-actions), or fictional entities. NER should resolve
        mentions to known entities when possible.

  # ===========================================================================
  # MULTIMODAL CONTENT
  # ===========================================================================

  # A media item attached to a post; range of the `embedded_media` slot.
  CMCEmbeddedMedia:
    description: >-
      Media content embedded in a CMC post (images, videos, audio, GIFs,
      stickers, files). Extends TEI <figure>/<graphic> for CMC context.
    slots:
      - xml_id
      # Location and description
      - media_type
      - media_url
      - thumbnail_url
      - alt_text
      - caption
      # Technical properties
      - duration_seconds
      - file_size
      - mime_type
      # Quoted content
      - is_quoted_content
      - original_post_ref
      # Text derived from the media
      - ocr_text
      - transcription
    annotations:
      glam_hypernym: 'THG.CMC.MED'
      ner_note: |
        Embedded media may contain entities: images of people/places,
        screenshots with text, quoted posts with entity mentions.
        OCR/transcription enables NER on visual content.

  # ===========================================================================
  # REACTIONS AND ENGAGEMENT
  # ===========================================================================

  # A single reaction record (type, reactor, time, count).
  CMCReaction:
    description: >-
      A reaction to a CMC post (like, love, laugh, angry, etc.). Captures
      engagement metrics and sentiment signals.
    slots:
      - xml_id
      - reaction_type
      - reaction_emoji
      - reactor_ref
      - reaction_time
      - reaction_count
    annotations:
      glam_hypernym: 'TXT.CMC.RXN'

  # Aggregate reaction summary; range of the `reactions` slot.
  CMCReactionSet:
    description: >-
      Aggregated reactions on a CMC post, summarizing all reaction types and
      counts. Useful for engagement analysis.
    slots:
      - xml_id
      - total_reactions
      - reaction_breakdown
      - top_reactors
    annotations:
      glam_hypernym: 'TXT.CMC.RXS'

  # ===========================================================================
  # PLATFORM METADATA
  # ===========================================================================

  # Platform-level metadata (as opposed to the per-post CMCPostMetadata).
  CMCPlatformMetadata:
    description: >-
      Platform-specific metadata for CMC content. Captures technical and
      contextual information from the originating platform.
    slots:
      - xml_id
      # Platform identification
      - platform_name
      - platform_type
      - platform_version
      - platform_url
      - api_version
      # Data collection
      - collection_date
      - collection_method
      # Access conditions
      - terms_of_service_url
      - rate_limit_info
      - geographic_availability
    annotations:
      glam_hypernym: 'DOC.MET.CMC'
      prov_mapping: 'prov:Activity'

  # Per-post platform metadata; range of the `platform_metadata` slot.
  CMCPostMetadata:
    description: >-
      Platform-specific metadata for an individual CMC post. Includes platform
      IDs, engagement metrics, and technical details.
    slots:
      - xml_id
      # Platform identifiers
      - platform_post_id
      - platform_post_url
      # Engagement counters
      - view_count
      - reply_count
      - repost_count
      - like_count
      - quote_count
      - bookmark_count
      # Edit state
      - is_edited
      - edit_count
      # Language and flags
      - language_detected
      - is_sensitive
      - is_sponsored
      - visibility
    annotations:
      glam_hypernym: 'DOC.MET.CMC.PST'

  # ===========================================================================
  # CMC CORPUS STRUCTURE
  # ===========================================================================

  # Corpus-level description of a CMC collection.
  CMCCorpus:
    description: >-
      A corpus of CMC data for linguistic or NER research. Extends TEI
      <teiCorpus> for CMC-specific collection contexts.
    slots:
      - xml_id
      # Identification
      - corpus_id
      - corpus_title
      - corpus_description
      # Coverage
      - platforms_included
      - time_range_start
      - time_range_end
      # Size
      - total_posts
      - total_participants
      - total_threads
      - languages_included
      # Methodology, ethics, licensing
      - collection_methodology
      - sampling_strategy
      - anonymization_applied
      - ethical_approval
      - license
    annotations:
      glam_hypernym: 'DOC.CRP.CMC'
      void_mapping: 'void:Dataset'

  # ===========================================================================
  # NER EXTENSIONS FOR CMC
  # ===========================================================================

  # An NER entity mention found in CMC text, with CMC-specific flags.
  CMCEntityMention:
    description: >-
      An entity mention extracted from CMC text. Extends standard NER entity
      mention with CMC-specific attributes like platform context, mention
      format (@user, #hashtag), and informal language handling.
    slots:
      - xml_id
      # Entity identification
      - entity_text
      - entity_type
      - entity_ref
      # CMC surface format
      - mention_format
      - is_hashtag_entity
      - is_mention_entity
      - is_url_entity
      # Normalization
      - informal_variant
      - canonical_form
      # Extraction details
      - confidence_score
      - context_window
      - post_ref
    annotations:
      glam_hypernym: 'NER.CMC.ENT'
      nif_mapping: 'nif:String'

  # Record mapping an informal mention to its normalized form.
  CMCEntityNormalization:
    description: >-
      Normalization of informal CMC entity mentions to canonical forms. Handles
      abbreviations, typos, slang, and platform-specific formats.
    slots:
      - xml_id
      - original_text
      - normalized_text
      - entity_ref
      # How the normalization was produced
      - normalization_type
      - normalization_confidence
      - normalization_method
    annotations:
      glam_hypernym: 'NER.CMC.NRM'

# =============================================================================
# SLOTS
# =============================================================================

slots:

  # ---------------------------------------------------------------------------
  # Common Identifiers
  # ---------------------------------------------------------------------------

  # Shared identifier slot, listed first in every class of this module.
  # `identifier: true` marks it as the key of each class that uses it.
  xml_id:
    description: 'Unique identifier for the element'
    slot_uri: tei:id
    identifier: true
    range: string

  # ---------------------------------------------------------------------------
  # Post Slots
  # ---------------------------------------------------------------------------

  # --- Classification ---

  post_type:
    description: 'Type of CMC post (original, reply, repost, etc.)'
    range: CMCPostTypeEnum

  modality:
    description: 'Modality of communication (written, spoken, mixed)'
    slot_uri: tei:modality
    range: CMCModalityEnum

  generated_by:
    description: 'Source of content generation (human, bot, system, template)'
    slot_uri: tei:generatedBy
    range: GeneratedByEnum

  # --- Threading ---

  reply_to:
    description: 'Reference to post being replied to'
    slot_uri: tei:replyTo
    range: string
    annotations:
      note: 'Value is xml:id of the post being replied to'

  indent_level:
    description: 'Indentation level for wiki-style threading (0=root)'
    slot_uri: tei:indentLevel
    range: integer
    minimum_value: 0

  # --- Authorship and timing ---

  who:
    description: 'Reference to participant who authored the post'
    slot_uri: tei:who
    range: string

  # Composition time; kept separate from publication time (when_posted).
  when_written:
    description: 'Time when the post was written/composed'
    range: datetime

  when_posted:
    description: 'Time when the post was published/sent'
    slot_uri: tei:when
    range: datetime

  synch:
    description: 'Synchronization point for temporal alignment'
    slot_uri: tei:synch
    range: string

  # --- Content and in-text features ---

  content_text:
    description: 'Text content of the post'
    range: string

  embedded_media:
    description: 'Media embedded in the post'
    multivalued: true
    range: CMCEmbeddedMedia

  mentions:
    description: '@-mentions in the post'
    multivalued: true
    range: CMCMention

  hashtags:
    description: 'Hashtags in the post'
    multivalued: true
    range: CMCHashtag

  emojis:
    description: 'Emoji in the post'
    multivalued: true
    range: CMCEmoji

  urls:
    description: 'URLs in the post'
    multivalued: true
    range: string

  # --- Revision, engagement, platform details ---

  edit_history:
    description: 'History of edits to the post'
    multivalued: true
    range: string

  # Single-valued: points at one aggregate CMCReactionSet.
  reactions:
    description: 'Reactions to the post'
    range: CMCReactionSet

  platform_metadata:
    description: 'Platform-specific metadata for the post'
    range: CMCPostMetadata

  # ---------------------------------------------------------------------------
  # Thread Slots
  # ---------------------------------------------------------------------------

  thread_id:
    description: "Platform-specific thread identifier"
    range: string

  thread_title:
    description: "Title of the thread"
    range: string

  thread_structure:
    description: "Type of threading structure"
    range: ThreadStructureEnum

  original_post:
    description: "Reference to the original/root post"
    range: CMCPost

  posts:
    description: "Posts in the thread"
    range: CMCPost
    multivalued: true

  # Counts cannot be negative; bounded below like indent_level.
  post_count:
    description: "Total number of posts in thread"
    range: integer
    minimum_value: 0

  participant_count:
    description: "Number of unique participants"
    range: integer
    minimum_value: 0

  # Also listed by CMCConversation.
  start_time:
    description: "Time of first post"
    range: datetime

  last_activity:
    description: "Time of most recent activity"
    range: datetime

  is_closed:
    description: "Whether thread is closed for new replies"
    range: boolean

  is_pinned:
    description: "Whether thread is pinned/sticky"
    range: boolean

  platform_thread_url:
    description: "URL to thread on platform"
    range: uri

  # ---------------------------------------------------------------------------
  # Conversation Slots
  # ---------------------------------------------------------------------------

  conversation_id:
    description: "Identifier for the conversation"
    range: string

  conversation_title:
    description: "Title of the conversation"
    range: string

  threads:
    description: "Threads in the conversation"
    range: CMCThread
    multivalued: true

  channel_name:
    description: "Name of the channel/room"
    range: string

  end_time:
    description: "Time of conversation end"
    range: datetime

  # Also listed by CMCCorpus. A count cannot be negative; bounded below
  # like indent_level.
  total_posts:
    description: "Total posts in conversation"
    range: integer
    minimum_value: 0

  participants:
    description: "Participants in the conversation"
    range: CMCParticipant
    multivalued: true

  # ---------------------------------------------------------------------------
  # Participant Slots
  # ---------------------------------------------------------------------------

  # --- Identity ---

  participant_id:
    description: "Unique identifier for participant"
    range: string

  username:
    description: "Platform username"
    range: string

  display_name:
    description: "Display name (may differ from username)"
    range: string

  # --- Privacy handling ---

  anonymized_id:
    description: "Anonymized identifier (when privacy applied)"
    range: string

  anonymization_level:
    description: "Level of anonymization applied"
    range: AnonymizationLevelEnum

  # --- Account properties ---

  account_created:
    description: "When the account was created"
    range: datetime

  account_verified:
    description: "Whether account is verified"
    range: boolean

  is_bot:
    description: "Whether account is a bot"
    range: boolean

  platform_user_url:
    description: "URL to user profile on platform"
    range: uri

  # --- Links and corpus-level information ---

  person_ref:
    description: "Reference to TEI person element"
    range: string

  role_in_conversation:
    description: "Role in conversation (moderator, admin, member)"
    range: string

  # A count cannot be negative; bounded below like indent_level.
  post_count_in_corpus:
    description: "Number of posts by this participant in corpus"
    range: integer
    minimum_value: 0

  demographic_info:
    description: "Demographic information (if available and consented)"
    range: string

  # ---------------------------------------------------------------------------
  # Group Slots
  # ---------------------------------------------------------------------------

  group_id:
    description: "Identifier for the group"
    range: string

  group_name:
    description: "Name of the group"
    range: string

  group_type:
    description: "Type of group (public, private, etc.)"
    range: string

  members:
    description: "Members of the group"
    range: CMCParticipant
    multivalued: true

  # A count cannot be negative; bounded below like indent_level.
  member_count:
    description: "Number of members"
    range: integer
    minimum_value: 0

  creation_date:
    description: "When the group was created"
    range: datetime

  platform_group_url:
    description: "URL to group on platform"
    range: uri

  # ---------------------------------------------------------------------------
  # Emoji Slots
  # ---------------------------------------------------------------------------

  emoji_char:
    description: "The emoji character"
    range: string

  emoji_codepoint:
    description: "Unicode codepoint(s) for the emoji"
    range: string

  emoji_name:
    description: "Short name/description of emoji"
    range: string

  emoji_category:
    description: "Emoji category"
    range: EmojiCategoryEnum

  is_custom:
    description: "Whether this is a custom/platform-specific emoji"
    range: boolean

  custom_emoji_url:
    description: "URL to custom emoji image"
    range: uri

  text_equivalent:
    description: "Text equivalent or description"
    range: string

  # Shared by the emoji, emoticon, hashtag and mention classes. An offset
  # into the post text cannot be negative, so it is bounded below at 0.
  # NOTE(review): whether offsets are 0- or 1-based is not stated here.
  position_in_post:
    description: "Character position in post text"
    range: integer
    minimum_value: 0

  # Shared by CMCEmoji and CMCEmoticon; bounds mirror the description.
  sentiment_valence:
    description: "Sentiment valence (-1.0 to 1.0)"
    range: float
    minimum_value: -1.0
    maximum_value: 1.0

  # ---------------------------------------------------------------------------
  # Emoticon Slots
  # ---------------------------------------------------------------------------

  # Raw emoticon as it appears in the post. The description stays
  # double-quoted because it contains single quotes.
  emoticon_text:
    range: string
    description: "The emoticon text (e.g., ':)', 'XD')"

  normalized_form:
    range: string
    description: 'Normalized emoticon form'

  emoji_equivalent:
    range: string
    description: 'Unicode emoji equivalent'

  # ---------------------------------------------------------------------------
  # Hashtag Slots
  # ---------------------------------------------------------------------------

  # Surface form, with the leading '#'.
  hashtag_text:
    range: string
    description: 'Full hashtag text including #'

  # Comparison form, without the leading '#'.
  hashtag_normalized:
    range: string
    description: 'Normalized hashtag (lowercase, no #)'

  is_trending:
    range: boolean
    description: 'Whether hashtag is trending'

  topic_category:
    range: string
    description: 'Topic category for hashtag'

  # ---------------------------------------------------------------------------
  # Mention Slots
  # ---------------------------------------------------------------------------

  # Mention exactly as written, with the leading '@'.
  mention_text:
    range: string
    description: "Full mention text including @"

  # Username that the mention points at.
  mentioned_username:
    range: string
    description: "Username being mentioned"

  # NOTE(review): typed as a plain string rather than a class reference to
  # CMCParticipant - confirm this is intentional.
  mentioned_user_ref:
    range: string
    description: "Reference to CMCParticipant"

  # Flag for mentions that were added automatically as part of a reply.
  is_reply_mention:
    range: boolean
    description: "Whether mention is auto-added reply mention"

  # NER entity type, held as free text.
  # NOTE(review): no enum is referenced here - confirm against the type system
  # used elsewhere in the schema.
  entity_type:
    range: string
    description: "Entity type for NER"

  # URI of a known entity in an external authority.
  entity_ref:
    range: uri
    description: "Reference to known entity (Wikidata, VIAF, etc.)"

  # ---------------------------------------------------------------------------
  # Media Slots
  # ---------------------------------------------------------------------------

  # Slots grouped under the media section: kind, locations, accessibility
  # text, size/duration, quoting information and extracted text.
  media_type:
    description: "Type of media"
    range: CMCMediaTypeEnum

  media_url:
    description: "URL to media content"
    range: uri

  thumbnail_url:
    description: "URL to thumbnail"
    range: uri

  alt_text:
    description: "Alt text for accessibility"
    range: string

  caption:
    description: "Caption for media"
    range: string

  # A duration cannot be negative; the lower bound makes validators reject
  # such values.
  duration_seconds:
    description: "Duration in seconds (for audio/video)"
    range: float
    minimum_value: 0.0

  # A byte count cannot be negative.
  file_size:
    description: "File size in bytes"
    range: integer
    minimum_value: 0

  mime_type:
    description: "MIME type"
    range: string

  is_quoted_content:
    description: "Whether this is quoted/embedded content"
    range: boolean

  # NOTE(review): held as a plain string, not a typed reference to a post
  # class - confirm this is intentional.
  original_post_ref:
    description: "Reference to original post (if quoted)"
    range: string

  ocr_text:
    description: "OCR-extracted text from image"
    range: string

  transcription:
    description: "Transcription of audio/video"
    range: string

  # ---------------------------------------------------------------------------
  # Reaction Slots
  # ---------------------------------------------------------------------------

  # Slots grouped under the reaction section: individual reactions and
  # aggregate reaction statistics.
  reaction_type:
    description: "Type of reaction (like, love, angry, etc.)"
    range: string

  reaction_emoji:
    description: "Emoji for reaction"
    range: string

  # NOTE(review): held as a plain string, not a typed reference to a
  # participant class - confirm this is intentional.
  reactor_ref:
    description: "Reference to reactor participant"
    range: string

  reaction_time:
    description: "Time of reaction"
    range: datetime

  # A count cannot be negative, so a lower bound of 0 is declared.
  reaction_count:
    description: "Number of this reaction type"
    range: integer
    minimum_value: 0

  # A count cannot be negative, so a lower bound of 0 is declared.
  total_reactions:
    description: "Total reactions"
    range: integer
    minimum_value: 0

  # NOTE(review): the breakdown is carried in a single string; its
  # serialization format is not specified here.
  reaction_breakdown:
    description: "Breakdown by reaction type"
    range: string

  top_reactors:
    description: "Top reactors"
    range: string
    multivalued: true

  # ---------------------------------------------------------------------------
  # Platform Metadata Slots
  # ---------------------------------------------------------------------------

  # Human-readable platform name.
  platform_name:
    range: string
    description: "Name of platform"

  # Platform kind, taken from CMCPlatformTypeEnum.
  platform_type:
    range: CMCPlatformTypeEnum
    description: "Type of platform"

  # Version of the platform or of its API.
  platform_version:
    range: string
    description: "Version of platform/API"

  # Address of the platform.
  platform_url:
    range: uri
    description: "URL to platform"

  # API version that the collection used.
  api_version:
    range: string
    description: "API version used for collection"

  # NOTE(review): described as a date but typed as datetime - confirm which
  # precision is intended.
  collection_date:
    range: datetime
    description: "Date of data collection"

  # How the data was collected, as free text.
  collection_method:
    range: string
    description: "Method of data collection"

  # Address of the platform's terms of service.
  terms_of_service_url:
    range: uri
    description: "URL to platform ToS"

  # Rate limit details, as free text.
  rate_limit_info:
    range: string
    description: "Rate limit information"

  # Where the platform is available, as free text.
  geographic_availability:
    range: string
    description: "Geographic availability of platform"

  # ---------------------------------------------------------------------------
  # Post Metadata Slots
  # ---------------------------------------------------------------------------

  # Slots grouped under the post metadata section: platform identifiers,
  # engagement counters, edit state and moderation/visibility flags.
  platform_post_id:
    description: "Platform-specific post ID"
    range: string

  platform_post_url:
    description: "URL to post on platform"
    range: uri

  # Engagement counters. Each declares minimum_value: 0 because a count
  # cannot be negative.
  view_count:
    description: "Number of views"
    range: integer
    minimum_value: 0

  reply_count:
    description: "Number of replies"
    range: integer
    minimum_value: 0

  repost_count:
    description: "Number of reposts/shares"
    range: integer
    minimum_value: 0

  like_count:
    description: "Number of likes"
    range: integer
    minimum_value: 0

  quote_count:
    description: "Number of quote posts"
    range: integer
    minimum_value: 0

  bookmark_count:
    description: "Number of bookmarks"
    range: integer
    minimum_value: 0

  is_edited:
    description: "Whether post was edited"
    range: boolean

  # A count cannot be negative, so a lower bound of 0 is declared.
  edit_count:
    description: "Number of edits"
    range: integer
    minimum_value: 0

  # NOTE(review): the language code scheme (e.g. BCP 47) is not specified
  # here.
  language_detected:
    description: "Detected language of post"
    range: string

  is_sensitive:
    description: "Whether post is marked sensitive"
    range: boolean

  is_sponsored:
    description: "Whether post is sponsored/promoted"
    range: boolean

  # Free-text setting; the description lists public, private and followers
  # as example values.
  visibility:
    description: "Visibility setting (public, private, followers)"
    range: string

  # ---------------------------------------------------------------------------
  # Corpus Slots
  # ---------------------------------------------------------------------------

  # Slots grouped under the corpus section: identification, coverage,
  # size, collection methodology, and ethics/licensing.
  corpus_id:
    description: "Identifier for corpus"
    range: string

  corpus_title:
    description: "Title of corpus"
    range: string

  corpus_description:
    description: "Description of corpus"
    range: string

  # NOTE(review): values are free-text strings, not CMCPlatformTypeEnum
  # members - confirm this is intentional.
  platforms_included:
    description: "Platforms included in corpus"
    range: string
    multivalued: true

  time_range_start:
    description: "Start of time range"
    range: datetime

  time_range_end:
    description: "End of time range"
    range: datetime

  # A total cannot be negative, so a lower bound of 0 is declared.
  total_participants:
    description: "Total participants in corpus"
    range: integer
    minimum_value: 0

  # A total cannot be negative, so a lower bound of 0 is declared.
  total_threads:
    description: "Total threads in corpus"
    range: integer
    minimum_value: 0

  languages_included:
    description: "Languages in corpus"
    range: string
    multivalued: true

  collection_methodology:
    description: "Methodology for collection"
    range: string

  sampling_strategy:
    description: "Sampling strategy used"
    range: string

  anonymization_applied:
    description: "Anonymization applied to corpus"
    range: AnonymizationLevelEnum

  ethical_approval:
    description: "Ethical approval information"
    range: string

  # Slot for a corpus's license; distinct from the schema-level `license`
  # key at the top of this file.
  license:
    description: "License for corpus"
    range: string

  # ---------------------------------------------------------------------------
  # NER Entity Slots
  # ---------------------------------------------------------------------------

  # Entity mention text as originally written.
  entity_text:
    range: string
    description: "Original text of entity mention"

  # How the mention was expressed: hashtag, @mention, URL or plain.
  mention_format:
    range: string
    description: "Format of mention (hashtag, @mention, URL, plain)"

  # Flag: the entity comes from a hashtag.
  is_hashtag_entity:
    range: boolean
    description: "Whether entity is from hashtag"

  # Flag: the entity comes from an @mention.
  is_mention_entity:
    range: boolean
    description: "Whether entity is from @mention"

  # Flag: the entity comes from a URL.
  is_url_entity:
    range: boolean
    description: "Whether entity is from URL"

  # Informal or slang variant of the entity.
  informal_variant:
    range: string
    description: "Informal/slang variant of entity"

  # Canonical form of the entity.
  canonical_form:
    range: string
    description: "Canonical form of entity"

  # NER confidence, constrained to the closed interval 0.0-1.0.
  confidence_score:
    range: float
    minimum_value: 0.0
    maximum_value: 1.0
    description: "NER confidence score"

  # Text surrounding the entity.
  context_window:
    range: string
    description: "Surrounding context for entity"

  # NOTE(review): held as a plain string, not a typed reference to a post
  # class - confirm this is intentional.
  post_ref:
    range: string
    description: "Reference to source post"

  # ---------------------------------------------------------------------------
  # Normalization Slots
  # ---------------------------------------------------------------------------

  # Slots grouped under the normalization section: text before and after
  # normalization, plus the type, confidence and method of the step.
  original_text:
    description: "Original text before normalization"
    range: string

  normalized_text:
    description: "Normalized text"
    range: string

  normalization_type:
    description: "Type of normalization applied"
    range: string

  # Bounded to 0.0-1.0, matching the bounds declared on confidence_score,
  # so that out-of-range confidences fail validation.
  normalization_confidence:
    description: "Confidence in normalization (0.0 to 1.0)"
    range: float
    minimum_value: 0.0
    maximum_value: 1.0

  normalization_method:
    description: "Method used for normalization"
    range: string

# =============================================================================
# ONTOLOGY MAPPINGS SUMMARY
# =============================================================================
#
# SIOC (Semantically-Interlinked Online Communities):
#   - CMCPost: sioc:Post
#   - CMCThread: sioc:Thread
#   - CMCConversation: sioc:Forum
#   - CMCParticipant: sioc:UserAccount
#   - CMCParticipantGroup: sioc:Usergroup
#
# Activity Streams 2.0:
#   - CMCPost: as:Note, as:Article
#   - CMCReaction: as:Like, as:Announce
#   - CMCParticipant: as:Person
#
# Schema.org:
#   - CMCPost: schema:SocialMediaPosting
#   - CMCThread: schema:DiscussionForumPosting
#   - CMCParticipant: schema:Person
#   - CMCEmbeddedMedia: schema:MediaObject
#
# FOAF:
#   - CMCParticipant: foaf:OnlineAccount
#   - CMCParticipantGroup: foaf:Group
#
# CIDOC-CRM:
#   - CMCPost: crm:E33_Linguistic_Object
#   - CMCParticipant: crm:E39_Actor
#
# PROV-O:
#   - CMCPlatformMetadata: prov:Activity
#   - CMCCorpus: prov:Collection
#
# Web Annotation (W3C OA):
#   - CMCEntityMention: oa:Annotation
#
# NIF:
#   - CMCEntityMention: nif:String
#
# =============================================================================

# =============================================================================
# GLAM-NER HYPERNYM MAPPINGS
# =============================================================================
#
# TXT.CMC: Computer-mediated communication
#   TXT.CMC.PST: CMC post
#   TXT.CMC.THR: CMC thread
#   TXT.CMC.CNV: CMC conversation
#   TXT.CMC.EMJ: Emoji
#   TXT.CMC.EMO: Emoticon
#   TXT.CMC.RXN: Reaction
#   TXT.CMC.RXS: Reaction set
#
# AGT.CMC: CMC agents
#   AGT.CMC.USR: CMC user account
#
# GRP.CMC: CMC groups
#
# APP.CMC: CMC appellations
#   APP.CMC.HTG: Hashtag
#   APP.CMC.MEN: @-mention
#
# THG.CMC: CMC things
#   THG.CMC.MED: CMC embedded media
#
# DOC.MET.CMC: CMC metadata
#   DOC.MET.CMC.PST: Post metadata
#
# DOC.CRP.CMC: CMC corpus
#
# NER.CMC: NER for CMC
#   NER.CMC.ENT: CMC entity mention
#   NER.CMC.NRM: CMC entity normalization
#
# =============================================================================