# glam/data/entity_annotation/modules/advanced/tei/cmc.yaml
# 2025-12-05 15:30:23 +01:00
#
# 1478 lines
# 41 KiB
# YAML

# =============================================================================
# GLAM-NER: TEI P5 COMPUTER-MEDIATED COMMUNICATION (CMC) MODULE
# =============================================================================
# Module: modules/advanced/tei/cmc.yaml
# TEI Chapter: 9 (Computer-mediated Communication)
# TEI Module Name: cmc
# Version: 1.0.0
# Status: Complete
# =============================================================================
#
# This module provides LinkML representations of TEI P5 elements for encoding
# computer-mediated communication including social media posts, chat messages,
# forum threads, wiki discussions, and other digital discourse. Essential for
# social media NER, online discourse analysis, and digital heritage collections.
#
# Key Features:
# - Post element for CMC turns/messages
# - Threading and reply structures
# - Emoji and emoticon encoding
# - Multimodal content (text, images, audio, video)
# - Participant metadata and anonymization
# - Bot/automated content detection
# - Cross-platform CMC normalization
#
# TEI Source: https://tei-c.org/release/doc/tei-p5-doc/en/html/CMC.html
# =============================================================================
# Schema identity and descriptive metadata.
id: https://w3id.org/glam/ner/tei/cmc
name: glam-ner-tei-cmc
title: "TEI P5 Computer-mediated Communication Module for GLAM-NER"
version: "1.0.0"
license: https://creativecommons.org/licenses/by/4.0/
see_also:
  - https://tei-c.org/release/doc/tei-p5-doc/en/html/CMC.html
  - https://www.w3.org/community/ontolex/wiki/CMC

# Namespace prefixes for the CURIEs used in this schema. `void` and `nif` are
# declared because class annotations reference void:Dataset and nif:String.
prefixes:
  linkml: https://w3id.org/linkml/
  glam: https://w3id.org/glam/ner/
  tei: http://www.tei-c.org/ns/1.0/
  schema: http://schema.org/
  sioc: http://rdfs.org/sioc/ns#
  as: https://www.w3.org/ns/activitystreams#
  foaf: http://xmlns.com/foaf/0.1/
  dcterms: http://purl.org/dc/terms/
  crm: http://www.cidoc-crm.org/cidoc-crm/
  prov: http://www.w3.org/ns/prov#
  oa: http://www.w3.org/ns/oa#
  void: http://rdfs.org/ns/void#
  nif: http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#

# Range applied to any slot that does not declare one explicitly.
default_range: string

imports:
  - linkml:types
# =============================================================================
# ENUMERATIONS
# =============================================================================
enums:

  # Communication channel: written, spoken, or both.
  CMCModalityEnum:
    description: >-
      Modality of computer-mediated communication. Distinguishes between
      written text-based communication and spoken/audio-based communication
      transmitted via digital channels.
    permissible_values:
      written:
        description: 'Text-based CMC (chat, email, forum posts, social media text)'
      spoken:
        description: 'Voice-based CMC (voice messages, audio calls, podcasts)'
      mixed:
        description: 'Combines written and spoken modalities (video with text overlays)'

  # Who or what produced the content of a post.
  GeneratedByEnum:
    description: >-
      Source of content generation for CMC posts. Distinguishes between
      human-authored content and various forms of automated/system-generated
      content. Essential for NER to identify bot-generated text.
    permissible_values:
      human:
        description: 'Content authored by a human user'
      template:
        description: 'Content generated from a template with user input'
      system:
        description: 'System-generated content (notifications, status updates)'
      bot:
        description: 'Content generated by an automated bot/AI agent'
      unknown:
        description: 'Generation source cannot be determined'

  # Kind of platform the communication took place on.
  CMCPlatformTypeEnum:
    description: >-
      Type of computer-mediated communication platform. Categorizes the
      technological context of CMC for normalization and analysis.
    permissible_values:
      social_media:
        description: 'Social networking platforms (Twitter/X, Facebook, Instagram, LinkedIn)'
      chat:
        description: 'Synchronous chat platforms (WhatsApp, Telegram, Slack, Discord)'
      forum:
        description: 'Asynchronous discussion forums (Reddit, Stack Overflow, phpBB)'
      wiki_talk:
        description: 'Wiki discussion pages (Wikipedia talk pages, MediaWiki)'
      email:
        description: 'Email communication (Gmail, Outlook, mailing lists)'
      blog:
        description: 'Blog platforms (WordPress, Medium, Blogger)'
      comment:
        description: 'Comment sections (news articles, video comments)'
      microblog:
        description: 'Microblogging platforms (Twitter/X, Mastodon, Bluesky)'
      video_live:
        description: 'Live video streaming chat (YouTube Live, Twitch)'
      gaming:
        description: 'Gaming communication (in-game chat, Discord gaming)'
      dating:
        description: 'Dating platform messaging'
      professional:
        description: 'Professional networking (LinkedIn messages)'
      customer_support:
        description: 'Customer service chat systems'
      sms:
        description: 'SMS/text messaging'
      other:
        description: 'Other CMC platform type'

  # Role a post plays inside its thread.
  CMCPostTypeEnum:
    description: >-
      Type of CMC post within a communication thread. Distinguishes between
      original posts, replies, reposts/shares, and other post types.
    permissible_values:
      original:
        description: 'Original post starting a new thread'
      reply:
        description: 'Reply to another post'
      repost:
        description: 'Share/repost of another post (RT, reblog)'
      quote:
        description: 'Quote post with commentary'
      reaction:
        description: 'Reaction-only post (emoji reaction, like)'
      edit:
        description: 'Edit of a previous post'
      deletion:
        description: 'Deletion marker (post was deleted)'
      system:
        description: 'System message (join/leave notifications)'
      pinned:
        description: 'Pinned/sticky post'

  # Category labels used to classify emoji.
  EmojiCategoryEnum:
    description: >-
      Unicode emoji category classification. Used for emoji/emoticon encoding
      in CMC transcriptions.
    permissible_values:
      smileys_emotion:
        description: 'Smileys & Emotion (face expressions, hearts)'
      people_body:
        description: 'People & Body (hand gestures, people)'
      animals_nature:
        description: 'Animals & Nature (animals, plants)'
      food_drink:
        description: 'Food & Drink'
      travel_places:
        description: 'Travel & Places (buildings, transport)'
      activities:
        description: 'Activities (sports, arts)'
      objects:
        description: 'Objects (tools, symbols)'
      symbols:
        description: 'Symbols (arrows, zodiac, flags)'
      flags:
        description: 'Flags (country flags, special flags)'
      component:
        description: 'Component (skin tones, hair)'

  # How strongly participant identities have been protected.
  AnonymizationLevelEnum:
    description: >-
      Level of participant anonymization applied to CMC data. Important
      for privacy protection in CMC corpora and compliance with GDPR.
    permissible_values:
      none:
        description: 'No anonymization (original usernames preserved)'
      pseudonymized:
        description: 'Usernames replaced with consistent pseudonyms'
      anonymized:
        description: 'Full anonymization (no identifying information)'
      aggregated:
        description: 'Aggregated data (no individual posts)'

  # Shape of the reply structure offered by a platform.
  ThreadStructureEnum:
    description: >-
      Type of threading structure in CMC platform. Affects how replies
      and conversations are organized and visualized.
    permissible_values:
      flat:
        description: 'Flat chronological list (no threading)'
      linear:
        description: 'Linear thread with reply references'
      nested:
        description: 'Nested/threaded replies (Reddit-style)'
      wiki_indent:
        description: 'Wiki-style indentation threading'
      graph:
        description: 'Graph structure (multiple parents)'

  # Kind of non-text payload attached to a post.
  CMCMediaTypeEnum:
    description: >-
      Type of multimodal content embedded in CMC posts. Posts may contain
      text plus images, video, audio, or other media.
    permissible_values:
      text_only:
        description: 'Text-only post'
      image:
        description: 'Post with image(s)'
      video:
        description: 'Post with video'
      audio:
        description: 'Post with audio/voice message'
      gif:
        description: 'Post with animated GIF'
      sticker:
        description: 'Post with sticker'
      file:
        description: 'Post with file attachment'
      poll:
        description: 'Post with poll'
      location:
        description: 'Post with location/check-in'
      link:
        description: 'Post with link preview'
      mixed:
        description: 'Post with multiple media types'
# =============================================================================
# CLASSES
# =============================================================================
classes:

  # ---- Core post element ----------------------------------------------------

  # One message/turn; mapped to the TEI <post> element via class_uri.
  CMCPost:
    description: >-
      A single post, message, or contribution in computer-mediated communication.
      The fundamental unit of CMC discourse, corresponding to TEI <post> element.
      Can represent tweets, chat messages, forum posts, wiki talk contributions,
      email messages, blog comments, etc.
    class_uri: tei:post
    slots:
      - xml_id
      - post_type
      - modality
      - generated_by
      - reply_to
      - indent_level
      - who
      - when_written
      - when_posted
      - synch
      - content_text
      - embedded_media
      - mentions
      - hashtags
      - emojis
      - urls
      - edit_history
      - reactions
      - platform_metadata
    slot_usage:
      xml_id:
        required: true
      content_text:
        required: true
    annotations:
      tei_element: 'post'
      tei_module: 'cmc'
      glam_hypernym: 'TXT.CMC.PST'
      ner_relevance: |
        CMC posts are primary sources for NER in social media and online discourse.
        May contain informal language, abbreviations, hashtags, and @mentions.
        Entity recognition must handle: platform-specific formats (@ for users,
        # for topics), emoji as sentiment markers, URLs as references, and
        non-standard orthography typical of CMC.

  # ---- Threading and structure ----------------------------------------------

  # A set of posts linked by reply relationships; `posts` is mandatory here.
  CMCThread:
    description: >-
      A thread of related CMC posts, representing a conversation or discussion.
      Contains an ordered collection of posts with reply relationships.
      Used for forum threads, Twitter threads, email chains, chat conversations.
    class_uri: sioc:Thread
    slots:
      - xml_id
      - thread_id
      - thread_title
      - thread_structure
      - original_post
      - posts
      - post_count
      - participant_count
      - start_time
      - last_activity
      - is_closed
      - is_pinned
      - platform_thread_url
    slot_usage:
      xml_id:
        required: true
      posts:
        required: true
        multivalued: true
    annotations:
      glam_hypernym: 'TXT.CMC.THR'
      sioc_mapping: 'sioc:Thread'

  # Container for one or more threads sharing a wider context.
  CMCConversation:
    description: >-
      A conversation context containing one or more related threads.
      Represents broader discourse context, such as all discussion on a topic
      across multiple threads, or a chat room conversation over time.
    class_uri: sioc:Forum
    slots:
      - xml_id
      - conversation_id
      - conversation_title
      - threads
      - platform_type
      - channel_name
      - start_time
      - end_time
      - total_posts
      - participants
    annotations:
      glam_hypernym: 'TXT.CMC.CNV'
      sioc_mapping: 'sioc:Forum'

  # ---- Participant metadata -------------------------------------------------

  # A user account on a platform, optionally pseudonymized or anonymized.
  CMCParticipant:
    description: >-
      A participant in computer-mediated communication. Represents a user
      account or identity within a CMC platform. May be pseudonymized or
      anonymized for privacy. Links to TEI <person> for speaker identification.
    class_uri: sioc:UserAccount
    slots:
      - xml_id
      - participant_id
      - username
      - display_name
      - anonymized_id
      - anonymization_level
      - account_created
      - account_verified
      - is_bot
      - platform_user_url
      - person_ref
      - role_in_conversation
      - post_count_in_corpus
      - demographic_info
    slot_usage:
      xml_id:
        required: true
    annotations:
      glam_hypernym: 'AGT.CMC.USR'
      sioc_mapping: 'sioc:UserAccount'
      foaf_mapping: 'foaf:OnlineAccount'
      privacy_note: |
        CMC participant data must comply with privacy regulations (GDPR, CCPA).
        Use anonymization_level to document privacy protection measures.
        Real identities should only be preserved when explicit consent exists
        or for public figures in public discourse.

  # A named collection of participants.
  CMCParticipantGroup:
    description: >-
      A group of participants in CMC, such as members of a chat room,
      forum community, or social media follower group.
    class_uri: sioc:Usergroup
    slots:
      - xml_id
      - group_id
      - group_name
      - group_type
      - members
      - member_count
      - creation_date
      - platform_group_url
    annotations:
      glam_hypernym: 'GRP.CMC'
      sioc_mapping: 'sioc:Usergroup'

  # ---- Emoji and emoticon encoding ------------------------------------------

  # A single emoji occurrence, standard or platform-custom.
  CMCEmoji:
    description: >-
      An emoji or emoticon in CMC text. Encodes both Unicode emoji and
      text-based emoticons (e.g., :) :-P). Essential for sentiment analysis
      and understanding informal CMC expression.
    slots:
      - xml_id
      - emoji_char
      - emoji_codepoint
      - emoji_name
      - emoji_category
      - is_custom
      - custom_emoji_url
      - text_equivalent
      - position_in_post
      - sentiment_valence
    annotations:
      glam_hypernym: 'TXT.CMC.EMJ'
      ner_note: |
        Emoji can function as: sentiment markers, entity references (flag emoji
        for countries), topic markers, or standalone expressions. NER pipelines
        should consider emoji context for entity disambiguation and sentiment.

  # A single ASCII emoticon occurrence.
  CMCEmoticon:
    description: >-
      A text-based emoticon in CMC (e.g., :), :-), :P, <3, XD).
      Distinguished from Unicode emoji as ASCII-based representations.
    slots:
      - xml_id
      - emoticon_text
      - normalized_form
      - emoji_equivalent
      - position_in_post
      - sentiment_valence
    annotations:
      glam_hypernym: 'TXT.CMC.EMO'

  # ---- Hashtags and mentions ------------------------------------------------

  # A #hashtag occurrence, optionally linked to a known entity.
  CMCHashtag:
    description: >-
      A hashtag in CMC text (e.g., #BlackLivesMatter, #AI, #heritage).
      Hashtags function as topic markers, community identifiers, and
      sometimes as named entities themselves (event names, campaign names).
    slots:
      - xml_id
      - hashtag_text
      - hashtag_normalized
      - position_in_post
      - is_trending
      - topic_category
      - entity_ref
    annotations:
      glam_hypernym: 'APP.CMC.HTG'
      ner_note: |
        Hashtags may represent: events (#Olympics2024), organizations (#UNESCO),
        movements (#MeToo), topics (#AI), or locations (#Paris). NER should
        consider hashtags as potential entity mentions.

  # An @mention occurrence, optionally linked to a participant or entity.
  CMCMention:
    description: >-
      An @-mention of a user in CMC text (e.g., @username, @NASA).
      Mentions explicitly reference other users or accounts and
      function as named entity references within CMC.
    slots:
      - xml_id
      - mention_text
      - mentioned_username
      - mentioned_user_ref
      - position_in_post
      - is_reply_mention
      - entity_type
      - entity_ref
    annotations:
      glam_hypernym: 'APP.CMC.MEN'
      ner_note: |
        @-mentions can reference: individuals (@jack), organizations (@NASA),
        bots (@github-actions), or fictional entities. NER should resolve
        mentions to known entities when possible.

  # ---- Multimodal content ---------------------------------------------------

  # A media attachment plus any text recovered from it (OCR, transcription).
  CMCEmbeddedMedia:
    description: >-
      Media content embedded in a CMC post (images, videos, audio,
      GIFs, stickers, files). Extends TEI <figure>/<graphic> for CMC context.
    slots:
      - xml_id
      - media_type
      - media_url
      - thumbnail_url
      - alt_text
      - caption
      - duration_seconds
      - file_size
      - mime_type
      - is_quoted_content
      - original_post_ref
      - ocr_text
      - transcription
    annotations:
      glam_hypernym: 'THG.CMC.MED'
      ner_note: |
        Embedded media may contain entities: images of people/places,
        screenshots with text, quoted posts with entity mentions.
        OCR/transcription enables NER on visual content.

  # ---- Reactions and engagement ---------------------------------------------

  # One reaction (or one reaction type with its count) on a post.
  CMCReaction:
    description: >-
      A reaction to a CMC post (like, love, laugh, angry, etc.).
      Captures engagement metrics and sentiment signals.
    slots:
      - xml_id
      - reaction_type
      - reaction_emoji
      - reactor_ref
      - reaction_time
      - reaction_count
    annotations:
      glam_hypernym: 'TXT.CMC.RXN'

  # Summary of all reactions on a post.
  CMCReactionSet:
    description: >-
      Aggregated reactions on a CMC post, summarizing all reaction types
      and counts. Useful for engagement analysis.
    slots:
      - xml_id
      - total_reactions
      - reaction_breakdown
      - top_reactors
    annotations:
      glam_hypernym: 'TXT.CMC.RXS'

  # ---- Platform metadata ----------------------------------------------------

  # Platform-level details, including how and when data was collected.
  CMCPlatformMetadata:
    description: >-
      Platform-specific metadata for CMC content. Captures technical
      and contextual information from the originating platform.
    slots:
      - xml_id
      - platform_name
      - platform_type
      - platform_version
      - platform_url
      - api_version
      - collection_date
      - collection_method
      - terms_of_service_url
      - rate_limit_info
      - geographic_availability
    annotations:
      glam_hypernym: 'DOC.MET.CMC'
      prov_mapping: 'prov:Activity'

  # Post-level platform identifiers, engagement counters, and flags.
  CMCPostMetadata:
    description: >-
      Platform-specific metadata for an individual CMC post.
      Includes platform IDs, engagement metrics, and technical details.
    slots:
      - xml_id
      - platform_post_id
      - platform_post_url
      - view_count
      - reply_count
      - repost_count
      - like_count
      - quote_count
      - bookmark_count
      - is_edited
      - edit_count
      - language_detected
      - is_sensitive
      - is_sponsored
      - visibility
    annotations:
      glam_hypernym: 'DOC.MET.CMC.PST'

  # ---- CMC corpus structure -------------------------------------------------

  # Corpus-level description: scope, size, methodology, ethics, licensing.
  CMCCorpus:
    description: >-
      A corpus of CMC data for linguistic or NER research. Extends
      TEI <teiCorpus> for CMC-specific collection contexts.
    slots:
      - xml_id
      - corpus_id
      - corpus_title
      - corpus_description
      - platforms_included
      - time_range_start
      - time_range_end
      - total_posts
      - total_participants
      - total_threads
      - languages_included
      - collection_methodology
      - sampling_strategy
      - anonymization_applied
      - ethical_approval
      - license
    annotations:
      glam_hypernym: 'DOC.CRP.CMC'
      void_mapping: 'void:Dataset'

  # ---- NER extensions for CMC -----------------------------------------------

  # An entity mention found in a post, with CMC-specific provenance flags.
  CMCEntityMention:
    description: >-
      An entity mention extracted from CMC text. Extends standard NER
      entity mention with CMC-specific attributes like platform context,
      mention format (@user, #hashtag), and informal language handling.
    slots:
      - xml_id
      - entity_text
      - entity_type
      - entity_ref
      - mention_format
      - is_hashtag_entity
      - is_mention_entity
      - is_url_entity
      - informal_variant
      - canonical_form
      - confidence_score
      - context_window
      - post_ref
    annotations:
      glam_hypernym: 'NER.CMC.ENT'
      nif_mapping: 'nif:String'

  # Mapping from an informal surface form to its canonical form.
  CMCEntityNormalization:
    description: >-
      Normalization of informal CMC entity mentions to canonical forms.
      Handles abbreviations, typos, slang, and platform-specific formats.
    slots:
      - xml_id
      - original_text
      - normalized_text
      - entity_ref
      - normalization_type
      - normalization_confidence
      - normalization_method
    annotations:
      glam_hypernym: 'NER.CMC.NRM'
# =============================================================================
# SLOTS
# =============================================================================
slots:
# ---------------------------------------------------------------------------
# Common Identifiers
# ---------------------------------------------------------------------------
xml_id:
description: "Unique identifier for the element"
range: string
identifier: true
slot_uri: tei:id
# ---------------------------------------------------------------------------
# Post Slots
# ---------------------------------------------------------------------------
post_type:
description: "Type of CMC post (original, reply, repost, etc.)"
range: CMCPostTypeEnum
modality:
description: "Modality of communication (written, spoken, mixed)"
range: CMCModalityEnum
slot_uri: tei:modality
generated_by:
description: "Source of content generation (human, bot, system, template)"
range: GeneratedByEnum
slot_uri: tei:generatedBy
reply_to:
description: "Reference to post being replied to"
range: string
slot_uri: tei:replyTo
annotations:
note: "Value is xml:id of the post being replied to"
indent_level:
description: "Indentation level for wiki-style threading (0=root)"
range: integer
slot_uri: tei:indentLevel
minimum_value: 0
who:
description: "Reference to participant who authored the post"
range: string
slot_uri: tei:who
when_written:
description: "Time when the post was written/composed"
range: datetime
when_posted:
description: "Time when the post was published/sent"
range: datetime
slot_uri: tei:when
synch:
description: "Synchronization point for temporal alignment"
range: string
slot_uri: tei:synch
content_text:
description: "Text content of the post"
range: string
embedded_media:
description: "Media embedded in the post"
range: CMCEmbeddedMedia
multivalued: true
mentions:
description: "@-mentions in the post"
range: CMCMention
multivalued: true
hashtags:
description: "Hashtags in the post"
range: CMCHashtag
multivalued: true
emojis:
description: "Emoji in the post"
range: CMCEmoji
multivalued: true
urls:
description: "URLs in the post"
range: string
multivalued: true
edit_history:
description: "History of edits to the post"
range: string
multivalued: true
reactions:
description: "Reactions to the post"
range: CMCReactionSet
platform_metadata:
description: "Platform-specific metadata for the post"
range: CMCPostMetadata
# ---------------------------------------------------------------------------
# Thread Slots
# ---------------------------------------------------------------------------
thread_id:
description: "Platform-specific thread identifier"
range: string
thread_title:
description: "Title of the thread"
range: string
thread_structure:
description: "Type of threading structure"
range: ThreadStructureEnum
original_post:
description: "Reference to the original/root post"
range: CMCPost
posts:
description: "Posts in the thread"
range: CMCPost
multivalued: true
post_count:
description: "Total number of posts in thread"
range: integer
participant_count:
description: "Number of unique participants"
range: integer
start_time:
description: "Time of first post"
range: datetime
last_activity:
description: "Time of most recent activity"
range: datetime
is_closed:
description: "Whether thread is closed for new replies"
range: boolean
is_pinned:
description: "Whether thread is pinned/sticky"
range: boolean
platform_thread_url:
description: "URL to thread on platform"
range: uri
# ---------------------------------------------------------------------------
# Conversation Slots
# ---------------------------------------------------------------------------
conversation_id:
description: "Identifier for the conversation"
range: string
conversation_title:
description: "Title of the conversation"
range: string
threads:
description: "Threads in the conversation"
range: CMCThread
multivalued: true
channel_name:
description: "Name of the channel/room"
range: string
end_time:
description: "Time of conversation end"
range: datetime
total_posts:
description: "Total posts in conversation"
range: integer
participants:
description: "Participants in the conversation"
range: CMCParticipant
multivalued: true
# ---------------------------------------------------------------------------
# Participant Slots
# ---------------------------------------------------------------------------
participant_id:
description: "Unique identifier for participant"
range: string
username:
description: "Platform username"
range: string
display_name:
description: "Display name (may differ from username)"
range: string
anonymized_id:
description: "Anonymized identifier (when privacy applied)"
range: string
anonymization_level:
description: "Level of anonymization applied"
range: AnonymizationLevelEnum
account_created:
description: "When the account was created"
range: datetime
account_verified:
description: "Whether account is verified"
range: boolean
is_bot:
description: "Whether account is a bot"
range: boolean
platform_user_url:
description: "URL to user profile on platform"
range: uri
person_ref:
description: "Reference to TEI person element"
range: string
role_in_conversation:
description: "Role in conversation (moderator, admin, member)"
range: string
post_count_in_corpus:
description: "Number of posts by this participant in corpus"
range: integer
demographic_info:
description: "Demographic information (if available and consented)"
range: string
# ---------------------------------------------------------------------------
# Group Slots
# ---------------------------------------------------------------------------
group_id:
description: "Identifier for the group"
range: string
group_name:
description: "Name of the group"
range: string
group_type:
description: "Type of group (public, private, etc.)"
range: string
members:
description: "Members of the group"
range: CMCParticipant
multivalued: true
member_count:
description: "Number of members"
range: integer
creation_date:
description: "When the group was created"
range: datetime
platform_group_url:
description: "URL to group on platform"
range: uri
# ---------------------------------------------------------------------------
# Emoji Slots
# ---------------------------------------------------------------------------
emoji_char:
description: "The emoji character"
range: string
emoji_codepoint:
description: "Unicode codepoint(s) for the emoji"
range: string
emoji_name:
description: "Short name/description of emoji"
range: string
emoji_category:
description: "Emoji category"
range: EmojiCategoryEnum
is_custom:
description: "Whether this is a custom/platform-specific emoji"
range: boolean
custom_emoji_url:
description: "URL to custom emoji image"
range: uri
text_equivalent:
description: "Text equivalent or description"
range: string
position_in_post:
description: "Character position in post text"
range: integer
sentiment_valence:
description: "Sentiment valence (-1.0 to 1.0)"
range: float
minimum_value: -1.0
maximum_value: 1.0
# ---------------------------------------------------------------------------
# Emoticon Slots
# ---------------------------------------------------------------------------
emoticon_text:
description: "The emoticon text (e.g., ':)', 'XD')"
range: string
normalized_form:
description: "Normalized emoticon form"
range: string
emoji_equivalent:
description: "Unicode emoji equivalent"
range: string
# ---------------------------------------------------------------------------
# Hashtag Slots
# ---------------------------------------------------------------------------
hashtag_text:
description: "Full hashtag text including #"
range: string
hashtag_normalized:
description: "Normalized hashtag (lowercase, no #)"
range: string
is_trending:
description: "Whether hashtag is trending"
range: boolean
topic_category:
description: "Topic category for hashtag"
range: string
# ---------------------------------------------------------------------------
# Mention Slots
# ---------------------------------------------------------------------------
mention_text:
description: "Full mention text including @"
range: string
mentioned_username:
description: "Username being mentioned"
range: string
mentioned_user_ref:
description: "Reference to CMCParticipant"
range: string
is_reply_mention:
description: "Whether mention is auto-added reply mention"
range: boolean
entity_type:
description: "Entity type for NER"
range: string
entity_ref:
description: "Reference to known entity (Wikidata, VIAF, etc.)"
range: uri
# ---------------------------------------------------------------------------
# Media Slots
# ---------------------------------------------------------------------------
media_type:
description: "Type of media"
range: CMCMediaTypeEnum
media_url:
description: "URL to media content"
range: uri
thumbnail_url:
description: "URL to thumbnail"
range: uri
alt_text:
description: "Alt text for accessibility"
range: string
caption:
description: "Caption for media"
range: string
duration_seconds:
description: "Duration in seconds (for audio/video)"
range: float
file_size:
description: "File size in bytes"
range: integer
mime_type:
description: "MIME type"
range: string
is_quoted_content:
description: "Whether this is quoted/embedded content"
range: boolean
original_post_ref:
description: "Reference to original post (if quoted)"
range: string
ocr_text:
description: "OCR-extracted text from image"
range: string
transcription:
description: "Transcription of audio/video"
range: string
# ---------------------------------------------------------------------------
# Reaction Slots
# ---------------------------------------------------------------------------
reaction_type:
description: "Type of reaction (like, love, angry, etc.)"
range: string
reaction_emoji:
description: "Emoji for reaction"
range: string
reactor_ref:
description: "Reference to reactor participant"
range: string
reaction_time:
description: "Time of reaction"
range: datetime
reaction_count:
description: "Number of this reaction type"
range: integer
total_reactions:
description: "Total reactions"
range: integer
reaction_breakdown:
description: "Breakdown by reaction type"
range: string
top_reactors:
description: "Top reactors"
range: string
multivalued: true
# ---------------------------------------------------------------------------
# Platform Metadata Slots
# ---------------------------------------------------------------------------
platform_name:
description: "Name of platform"
range: string
platform_type:
description: "Type of platform"
range: CMCPlatformTypeEnum
platform_version:
description: "Version of platform/API"
range: string
platform_url:
description: "URL to platform"
range: uri
api_version:
description: "API version used for collection"
range: string
collection_date:
description: "Date of data collection"
range: datetime
collection_method:
description: "Method of data collection"
range: string
terms_of_service_url:
description: "URL to platform ToS"
range: uri
rate_limit_info:
description: "Rate limit information"
range: string
geographic_availability:
description: "Geographic availability of platform"
range: string
# ---------------------------------------------------------------------------
# Post Metadata Slots
# ---------------------------------------------------------------------------
# Per-post identifiers, engagement counters and status flags.
# The *_count slots are integers and the is_* slots are booleans;
# language_detected and visibility are unconstrained strings.
platform_post_id:
  description: "Platform-specific post ID"
  range: string
platform_post_url:
  description: "URL to post on platform"
  range: uri
view_count:
  description: "Number of views"
  range: integer
reply_count:
  description: "Number of replies"
  range: integer
repost_count:
  description: "Number of reposts/shares"
  range: integer
like_count:
  description: "Number of likes"
  range: integer
quote_count:
  description: "Number of quote posts"
  range: integer
bookmark_count:
  description: "Number of bookmarks"
  range: integer
is_edited:
  description: "Whether post was edited"
  range: boolean
edit_count:
  description: "Number of edits"
  range: integer
language_detected:
  description: "Detected language of post"
  range: string
is_sensitive:
  description: "Whether post is marked sensitive"
  range: boolean
is_sponsored:
  description: "Whether post is sponsored/promoted"
  range: boolean
visibility:
  description: "Visibility setting (public, private, followers)"
  range: string
# ---------------------------------------------------------------------------
# Corpus Slots
# ---------------------------------------------------------------------------
# Corpus-level description: identity, coverage, size and provenance.
# platforms_included and languages_included accept multiple values;
# anonymization_applied is the only enum-typed slot (AnonymizationLevelEnum).
corpus_id:
  description: "Identifier for corpus"
  range: string
corpus_title:
  description: "Title of corpus"
  range: string
corpus_description:
  description: "Description of corpus"
  range: string
platforms_included:
  description: "Platforms included in corpus"
  range: string
  multivalued: true
time_range_start:
  description: "Start of time range"
  range: datetime
time_range_end:
  description: "End of time range"
  range: datetime
total_participants:
  description: "Total participants in corpus"
  range: integer
total_threads:
  description: "Total threads in corpus"
  range: integer
languages_included:
  description: "Languages in corpus"
  range: string
  multivalued: true
collection_methodology:
  description: "Methodology for collection"
  range: string
sampling_strategy:
  description: "Sampling strategy used"
  range: string
anonymization_applied:
  description: "Anonymization applied to corpus"
  range: AnonymizationLevelEnum
ethical_approval:
  description: "Ethical approval information"
  range: string
# NOTE(review): shares its name with the schema-level `license` key at the
# top of the file; the two stay distinct only while this entry is nested
# under the schema's slot mapping - confirm.
license:
  description: "License for corpus"
  range: string
# ---------------------------------------------------------------------------
# NER Entity Slots
# ---------------------------------------------------------------------------
# Slots for a named-entity mention found in CMC text: the surface string,
# how it was written (hashtag, @mention, URL, plain), its normalized forms
# and where it came from.
entity_text:
  description: "Original text of entity mention"
  range: string
mention_format:
  description: "Format of mention (hashtag, @mention, URL, plain)"
  range: string
is_hashtag_entity:
  description: "Whether entity is from hashtag"
  range: boolean
is_mention_entity:
  description: "Whether entity is from @mention"
  range: boolean
is_url_entity:
  description: "Whether entity is from URL"
  range: boolean
informal_variant:
  description: "Informal/slang variant of entity"
  range: string
canonical_form:
  description: "Canonical form of entity"
  range: string
# The only bounded slot in this group: values must lie in [0.0, 1.0].
confidence_score:
  description: "NER confidence score"
  range: float
  minimum_value: 0.0
  maximum_value: 1.0
context_window:
  description: "Surrounding context for entity"
  range: string
# Plain string reference; the identifier scheme it uses is not stated here.
post_ref:
  description: "Reference to source post"
  range: string
# ---------------------------------------------------------------------------
# Normalization Slots
# ---------------------------------------------------------------------------
# Slots recording one text normalization: the before/after strings plus the
# type, method and confidence of the transformation.
original_text:
  description: "Original text before normalization"
  range: string
normalized_text:
  description: "Normalized text"
  range: string
normalization_type:
  description: "Type of normalization applied"
  range: string
normalization_confidence:
  description: "Confidence in normalization"
  range: float
  # Bounded to [0.0, 1.0] so it agrees with confidence_score, the other
  # confidence slot in this schema, instead of accepting any float.
  minimum_value: 0.0
  maximum_value: 1.0
normalization_method:
  description: "Method used for normalization"
  range: string
# =============================================================================
# ONTOLOGY MAPPINGS SUMMARY
# =============================================================================
#
# SIOC (Semantically-Interlinked Online Communities):
# - CMCPost: sioc:Post
# - CMCThread: sioc:Thread
# - CMCConversation: sioc:Forum
# - CMCParticipant: sioc:UserAccount
# - CMCParticipantGroup: sioc:Usergroup
#
# Activity Streams 2.0:
# - CMCPost: as:Note, as:Article
# - CMCReaction: as:Like, as:Announce
# - CMCParticipant: as:Person
#
# Schema.org:
# - CMCPost: schema:SocialMediaPosting
# - CMCThread: schema:DiscussionForumPosting
# - CMCParticipant: schema:Person
# - CMCEmbeddedMedia: schema:MediaObject
#
# FOAF:
# - CMCParticipant: foaf:OnlineAccount
# - CMCParticipantGroup: foaf:Group
#
# CIDOC-CRM:
# - CMCPost: crm:E33_Linguistic_Object
# - CMCParticipant: crm:E39_Actor
#
# PROV-O:
# - CMCPlatformMetadata: prov:Activity
# - CMCCorpus: prov:Collection
#
# Web Annotation (W3C OA):
# - CMCEntityMention: oa:Annotation
#
# NIF:
# - CMCEntityMention: nif:String
#
# =============================================================================
# =============================================================================
# GLAM-NER HYPERNYM MAPPINGS
# =============================================================================
#
# TXT.CMC: Computer-mediated communication
# TXT.CMC.PST: CMC post
# TXT.CMC.THR: CMC thread
# TXT.CMC.CNV: CMC conversation
# TXT.CMC.EMJ: Emoji
# TXT.CMC.EMO: Emoticon
# TXT.CMC.RXN: Reaction
# TXT.CMC.RXS: Reaction set
#
# AGT.CMC: CMC agents
# AGT.CMC.USR: CMC user account
#
# GRP.CMC: CMC groups
#
# APP.CMC: CMC appellations
# APP.CMC.HTG: Hashtag
# APP.CMC.MEN: @mention
#
# THG.CMC: CMC things
# THG.CMC.MED: CMC embedded media
#
# DOC.MET.CMC: CMC metadata
# DOC.MET.CMC.PST: Post metadata
#
# DOC.CRP.CMC: CMC corpus
#
# NER.CMC: NER for CMC
# NER.CMC.ENT: CMC entity mention
# NER.CMC.NRM: CMC entity normalization
#
# =============================================================================