# =============================================================================
# GLAM-NER: TEI P5 COMPUTER-MEDIATED COMMUNICATION (CMC) MODULE
# =============================================================================
# Module: modules/advanced/tei/cmc.yaml
# TEI Chapter: 9 (Computer-mediated Communication)
# TEI Module Name: cmc
# Version: 1.0.0
# Status: Complete
# =============================================================================
#
# This module provides LinkML representations of TEI P5 elements for encoding
# computer-mediated communication including social media posts, chat messages,
# forum threads, wiki discussions, and other digital discourse. Essential for
# social media NER, online discourse analysis, and digital heritage collections.
#
# Key Features:
# - Post element for CMC turns/messages
# - Threading and reply structures
# - Emoji and emoticon encoding
# - Multimodal content (text, images, audio, video)
# - Participant metadata and anonymization
# - Bot/automated content detection
# - Cross-platform CMC normalization
#
# TEI Source: https://tei-c.org/release/doc/tei-p5-doc/en/html/CMC.html
# =============================================================================

id: https://w3id.org/glam/ner/tei/cmc
name: glam-ner-tei-cmc
title: "TEI P5 Computer-mediated Communication Module for GLAM-NER"
version: "1.0.0"

license: https://creativecommons.org/licenses/by/4.0/
see_also:
  - https://tei-c.org/release/doc/tei-p5-doc/en/html/CMC.html
  - https://www.w3.org/community/ontolex/wiki/CMC

# Namespace prefixes usable as CURIE prefixes in class_uri / slot_uri values
# and in the *_mapping annotations of this schema.
prefixes:
  linkml: https://w3id.org/linkml/
  glam: https://w3id.org/glam/ner/
  tei: http://www.tei-c.org/ns/1.0/
  schema: http://schema.org/
  sioc: http://rdfs.org/sioc/ns#
  as: https://www.w3.org/ns/activitystreams#
  foaf: http://xmlns.com/foaf/0.1/
  dcterms: http://purl.org/dc/terms/
  crm: http://www.cidoc-crm.org/cidoc-crm/
  prov: http://www.w3.org/ns/prov#
  oa: http://www.w3.org/ns/oa#
  # Declared because class annotations in this file use "void:Dataset" and
  # "nif:String", which were previously unresolvable CURIEs.
  void: http://rdfs.org/ns/void#
  nif: http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#

# Slots without an explicit range are treated as strings.
default_range: string

imports:
  - linkml:types

# =============================================================================
# ENUMERATIONS
# =============================================================================

enums:

# ---------------------------------------------------------------------------
# CMC Modality (written vs spoken)
# ---------------------------------------------------------------------------
  # Used as the range of the `modality` slot.
  CMCModalityEnum:
    description: >-
      Modality of computer-mediated communication. Distinguishes between
      written text-based communication and spoken/audio-based communication
      transmitted via digital channels.
    permissible_values:
      written:
        description: "Text-based CMC (chat, email, forum posts, social media text)"
      spoken:
        description: "Voice-based CMC (voice messages, audio calls, podcasts)"
      mixed:
        description: "Combines written and spoken modalities (video with text overlays)"

# ---------------------------------------------------------------------------
# Content Generation Source
# ---------------------------------------------------------------------------
  # Used as the range of the `generated_by` slot (slot_uri tei:generatedBy).
  # NOTE(review): TEI's generatedBy attribute appears to suggest the value
  # "unspecified" where this enum uses "unknown" - confirm against the TEI CMC
  # chapter before round-tripping values to TEI XML.
  GeneratedByEnum:
    description: >-
      Source of content generation for CMC posts. Distinguishes between
      human-authored content and various forms of automated/system-generated
      content. Essential for NER to identify bot-generated text.
    permissible_values:
      human:
        description: "Content authored by a human user"
      template:
        description: "Content generated from a template with user input"
      system:
        description: "System-generated content (notifications, status updates)"
      bot:
        description: "Content generated by an automated bot/AI agent"
      unknown:
        description: "Generation source cannot be determined"

# ---------------------------------------------------------------------------
# CMC Platform Type
# ---------------------------------------------------------------------------
  # Coarse platform categories; `other` is the catch-all value.
  CMCPlatformTypeEnum:
    description: >-
      Type of computer-mediated communication platform. Categorizes the
      technological context of CMC for normalization and analysis.
    permissible_values:
      social_media:
        description: "Social networking platforms (Twitter/X, Facebook, Instagram, LinkedIn)"
      chat:
        description: "Synchronous chat platforms (WhatsApp, Telegram, Slack, Discord)"
      forum:
        description: "Asynchronous discussion forums (Reddit, Stack Overflow, phpBB)"
      wiki_talk:
        description: "Wiki discussion pages (Wikipedia talk pages, MediaWiki)"
      email:
        description: "Email communication (Gmail, Outlook, mailing lists)"
      blog:
        description: "Blog platforms (WordPress, Medium, Blogger)"
      comment:
        description: "Comment sections (news articles, video comments)"
      microblog:
        description: "Microblogging platforms (Twitter/X, Mastodon, Bluesky)"
      video_live:
        description: "Live video streaming chat (YouTube Live, Twitch)"
      gaming:
        description: "Gaming communication (in-game chat, Discord gaming)"
      dating:
        description: "Dating platform messaging"
      professional:
        description: "Professional networking (LinkedIn messages)"
      customer_support:
        description: "Customer service chat systems"
      sms:
        description: "SMS/text messaging"
      other:
        description: "Other CMC platform type"

# ---------------------------------------------------------------------------
# Post Type
# ---------------------------------------------------------------------------
  # Used as the range of the `post_type` slot.
  CMCPostTypeEnum:
    description: >-
      Type of CMC post within a communication thread. Distinguishes between
      original posts, replies, reposts/shares, and other post types.
    permissible_values:
      original:
        description: "Original post starting a new thread"
      reply:
        description: "Reply to another post"
      repost:
        description: "Share/repost of another post (RT, reblog)"
      quote:
        description: "Quote post with commentary"
      reaction:
        description: "Reaction-only post (emoji reaction, like)"
      edit:
        description: "Edit of a previous post"
      deletion:
        description: "Deletion marker (post was deleted)"
      system:
        description: "System message (join/leave notifications)"
      pinned:
        description: "Pinned/sticky post"

# ---------------------------------------------------------------------------
# Emoji Category
# ---------------------------------------------------------------------------
  # Used as the range of the `emoji_category` slot.
  EmojiCategoryEnum:
    description: >-
      Unicode emoji category classification. Used for emoji/emoticon encoding
      in CMC transcriptions.
    permissible_values:
      smileys_emotion:
        description: "Smileys & Emotion (face expressions, hearts)"
      people_body:
        description: "People & Body (hand gestures, people)"
      animals_nature:
        description: "Animals & Nature (animals, plants)"
      food_drink:
        description: "Food & Drink"
      travel_places:
        description: "Travel & Places (buildings, transport)"
      activities:
        description: "Activities (sports, arts)"
      objects:
        description: "Objects (tools, symbols)"
      symbols:
        description: "Symbols (arrows, zodiac, flags)"
      flags:
        description: "Flags (country flags, special flags)"
      component:
        description: "Component (skin tones, hair)"

# ---------------------------------------------------------------------------
# Participant Anonymization Level
# ---------------------------------------------------------------------------
  # Used as the range of the `anonymization_level` slot.
  AnonymizationLevelEnum:
    description: >-
      Level of participant anonymization applied to CMC data. Important
      for privacy protection in CMC corpora and compliance with GDPR.
    permissible_values:
      none:
        description: "No anonymization (original usernames preserved)"
      pseudonymized:
        description: "Usernames replaced with consistent pseudonyms"
      anonymized:
        description: "Full anonymization (no identifying information)"
      aggregated:
        description: "Aggregated data (no individual posts)"

# ---------------------------------------------------------------------------
# Thread Structure Type
# ---------------------------------------------------------------------------
  # Used as the range of the `thread_structure` slot.
  ThreadStructureEnum:
    description: >-
      Type of threading structure in CMC platform. Affects how replies
      and conversations are organized and visualized.
    permissible_values:
      flat:
        description: "Flat chronological list (no threading)"
      linear:
        description: "Linear thread with reply references"
      nested:
        description: "Nested/threaded replies (Reddit-style)"
      wiki_indent:
        description: "Wiki-style indentation threading"
      graph:
        description: "Graph structure (multiple parents)"

# ---------------------------------------------------------------------------
# Multimodal Content Type
# ---------------------------------------------------------------------------
  # Used as the range of the `media_type` slot.
  CMCMediaTypeEnum:
    description: >-
      Type of multimodal content embedded in CMC posts. Posts may contain
      text plus images, video, audio, or other media.
    permissible_values:
      text_only:
        description: "Text-only post"
      image:
        description: "Post with image(s)"
      video:
        description: "Post with video"
      audio:
        description: "Post with audio/voice message"
      gif:
        description: "Post with animated GIF"
      sticker:
        description: "Post with sticker"
      file:
        description: "Post with file attachment"
      poll:
        description: "Post with poll"
      location:
        description: "Post with location/check-in"
      link:
        description: "Post with link preview"
      mixed:
        description: "Post with multiple media types"

# =============================================================================
# CLASSES
# =============================================================================

classes:

# ===========================================================================
# CORE POST ELEMENT
# ===========================================================================

  CMCPost:
    description: >-
      A single post, message, or contribution in computer-mediated communication.
      The fundamental unit of CMC discourse, corresponding to TEI <post> element.
      Can represent tweets, chat messages, forum posts, wiki talk contributions,
      email messages, blog comments, etc.
    class_uri: tei:post
    slots:
      - xml_id
      - post_type
      - modality
      - generated_by
      - reply_to
      - indent_level
      - who
      - when_written
      - when_posted
      - synch
      - content_text
      - embedded_media
      - mentions
      - hashtags
      - emojis
      - urls
      - edit_history
      - reactions
      - platform_metadata
    # Every post must carry an identifier and text content.
    slot_usage:
      xml_id:
        required: true
      content_text:
        required: true
    annotations:
      tei_element: "post"
      tei_module: "cmc"
      glam_hypernym: "TXT.CMC.PST"
      # Literal block scalar: the line starting with "#" below is part of the
      # text, not a YAML comment, and must stay indented with the other lines.
      ner_relevance: |
        CMC posts are primary sources for NER in social media and online discourse.
        May contain informal language, abbreviations, hashtags, and @mentions.
        Entity recognition must handle: platform-specific formats (@ for users,
        # for topics), emoji as sentiment markers, URLs as references, and
        non-standard orthography typical of CMC.

# ===========================================================================
# THREADING AND STRUCTURE
# ===========================================================================

  CMCThread:
    description: >-
      A thread of related CMC posts, representing a conversation or discussion.
      Contains an ordered collection of posts with reply relationships.
      Used for forum threads, Twitter threads, email chains, chat conversations.
    class_uri: sioc:Thread
    slots:
      - xml_id
      - thread_id
      - thread_title
      - thread_structure
      - original_post
      - posts
      - post_count
      - participant_count
      - start_time
      - last_activity
      - is_closed
      - is_pinned
      - platform_thread_url
    # A thread must have an identifier and at least one post.
    slot_usage:
      xml_id:
        required: true
      posts:
        required: true
        multivalued: true
    annotations:
      glam_hypernym: "TXT.CMC.THR"
      sioc_mapping: "sioc:Thread"

  # Groups one or more threads; shares start_time with CMCThread.
  CMCConversation:
    description: >-
      A conversation context containing one or more related threads.
      Represents broader discourse context, such as all discussion on a topic
      across multiple threads, or a chat room conversation over time.
    class_uri: sioc:Forum
    slots:
      - xml_id
      - conversation_id
      - conversation_title
      - threads
      - platform_type
      - channel_name
      - start_time
      - end_time
      - total_posts
      - participants
    annotations:
      glam_hypernym: "TXT.CMC.CNV"
      sioc_mapping: "sioc:Forum"

# ===========================================================================
# PARTICIPANT METADATA
# ===========================================================================

  CMCParticipant:
    description: >-
      A participant in computer-mediated communication. Represents a user
      account or identity within a CMC platform. May be pseudonymized or
      anonymized for privacy. Links to TEI <person> for speaker identification.
    class_uri: sioc:UserAccount
    slots:
      - xml_id
      - participant_id
      - username
      - display_name
      - anonymized_id
      - anonymization_level
      - account_created
      - account_verified
      - is_bot
      - platform_user_url
      - person_ref
      - role_in_conversation
      - post_count_in_corpus
      - demographic_info
    slot_usage:
      xml_id:
        required: true
    annotations:
      glam_hypernym: "AGT.CMC.USR"
      sioc_mapping: "sioc:UserAccount"
      foaf_mapping: "foaf:OnlineAccount"
      privacy_note: |
        CMC participant data must comply with privacy regulations (GDPR, CCPA).
        Use anonymization_level to document privacy protection measures.
        Real identities should only be preserved when explicit consent exists
        or for public figures in public discourse.

  # Collection of CMCParticipant instances (via the `members` slot).
  CMCParticipantGroup:
    description: >-
      A group of participants in CMC, such as members of a chat room,
      forum community, or social media follower group.
    class_uri: sioc:Usergroup
    slots:
      - xml_id
      - group_id
      - group_name
      - group_type
      - members
      - member_count
      - creation_date
      - platform_group_url
    annotations:
      glam_hypernym: "GRP.CMC"
      sioc_mapping: "sioc:Usergroup"

# ===========================================================================
# EMOJI AND EMOTICON ENCODING
# ===========================================================================

  CMCEmoji:
    description: >-
      An emoji or emoticon in CMC text. Encodes both Unicode emoji and
      text-based emoticons (e.g., :) :-P). Essential for sentiment analysis
      and understanding informal CMC expression.
    slots:
      - xml_id
      - emoji_char
      - emoji_codepoint
      - emoji_name
      - emoji_category
      - is_custom
      - custom_emoji_url
      - text_equivalent
      - position_in_post
      - sentiment_valence
    annotations:
      glam_hypernym: "TXT.CMC.EMJ"
      ner_note: |
        Emoji can function as: sentiment markers, entity references (flag emoji
        for countries), topic markers, or standalone expressions. NER pipelines
        should consider emoji context for entity disambiguation and sentiment.

  # Shares position_in_post and sentiment_valence with CMCEmoji.
  CMCEmoticon:
    description: >-
      A text-based emoticon in CMC (e.g., :), :-), :P, <3, XD).
      Distinguished from Unicode emoji as ASCII-based representations.
    slots:
      - xml_id
      - emoticon_text
      - normalized_form
      - emoji_equivalent
      - position_in_post
      - sentiment_valence
    annotations:
      glam_hypernym: "TXT.CMC.EMO"

# ===========================================================================
# HASHTAGS AND MENTIONS
# ===========================================================================

  CMCHashtag:
    description: >-
      A hashtag in CMC text (e.g., #BlackLivesMatter, #AI, #heritage).
      Hashtags function as topic markers, community identifiers, and
      sometimes as named entities themselves (event names, campaign names).
    slots:
      - xml_id
      - hashtag_text
      - hashtag_normalized
      - position_in_post
      - is_trending
      - topic_category
      - entity_ref
    annotations:
      glam_hypernym: "APP.CMC.HTG"
      ner_note: |
        Hashtags may represent: events (#Olympics2024), organizations (#UNESCO),
        movements (#MeToo), topics (#AI), or locations (#Paris). NER should
        consider hashtags as potential entity mentions.

  # Shares position_in_post and entity_ref with CMCHashtag.
  CMCMention:
    description: >-
      An @-mention of a user in CMC text (e.g., @username, @NASA).
      Mentions explicitly reference other users or accounts and
      function as named entity references within CMC.
    slots:
      - xml_id
      - mention_text
      - mentioned_username
      - mentioned_user_ref
      - position_in_post
      - is_reply_mention
      - entity_type
      - entity_ref
    annotations:
      glam_hypernym: "APP.CMC.MEN"
      ner_note: |
        @-mentions can reference: individuals (@jack), organizations (@NASA),
        bots (@github-actions), or fictional entities. NER should resolve
        mentions to known entities when possible.

# ===========================================================================
# MULTIMODAL CONTENT
# ===========================================================================

  # Range of the multivalued `embedded_media` slot.
  CMCEmbeddedMedia:
    description: >-
      Media content embedded in a CMC post (images, videos, audio,
      GIFs, stickers, files). Extends TEI <figure>/<graphic> for CMC context.
    slots:
      - xml_id
      - media_type
      - media_url
      - thumbnail_url
      - alt_text
      - caption
      - duration_seconds
      - file_size
      - mime_type
      - is_quoted_content
      - original_post_ref
      - ocr_text
      - transcription
    annotations:
      glam_hypernym: "THG.CMC.MED"
      ner_note: |
        Embedded media may contain entities: images of people/places,
        screenshots with text, quoted posts with entity mentions.
        OCR/transcription enables NER on visual content.

# ===========================================================================
# REACTIONS AND ENGAGEMENT
# ===========================================================================

  CMCReaction:
    description: >-
      A reaction to a CMC post (like, love, laugh, angry, etc.).
      Captures engagement metrics and sentiment signals.
    slots:
      - xml_id
      - reaction_type
      - reaction_emoji
      - reactor_ref
      - reaction_time
      - reaction_count
    annotations:
      glam_hypernym: "TXT.CMC.RXN"

  # Range of the `reactions` slot.
  CMCReactionSet:
    description: >-
      Aggregated reactions on a CMC post, summarizing all reaction types
      and counts. Useful for engagement analysis.
    slots:
      - xml_id
      - total_reactions
      - reaction_breakdown
      - top_reactors
    annotations:
      glam_hypernym: "TXT.CMC.RXS"

# ===========================================================================
# PLATFORM METADATA
# ===========================================================================

  CMCPlatformMetadata:
    description: >-
      Platform-specific metadata for CMC content. Captures technical
      and contextual information from the originating platform.
    slots:
      - xml_id
      - platform_name
      - platform_type
      - platform_version
      - platform_url
      - api_version
      - collection_date
      - collection_method
      - terms_of_service_url
      - rate_limit_info
      - geographic_availability
    annotations:
      glam_hypernym: "DOC.MET.CMC"
      prov_mapping: "prov:Activity"

  # Range of the `platform_metadata` slot.
  CMCPostMetadata:
    description: >-
      Platform-specific metadata for an individual CMC post.
      Includes platform IDs, engagement metrics, and technical details.
    slots:
      - xml_id
      - platform_post_id
      - platform_post_url
      - view_count
      - reply_count
      - repost_count
      - like_count
      - quote_count
      - bookmark_count
      - is_edited
      - edit_count
      - language_detected
      - is_sensitive
      - is_sponsored
      - visibility
    annotations:
      glam_hypernym: "DOC.MET.CMC.PST"

# ===========================================================================
# CMC CORPUS STRUCTURE
# ===========================================================================

  # Corpus-level description; shares total_posts with CMCConversation.
  CMCCorpus:
    description: >-
      A corpus of CMC data for linguistic or NER research. Extends
      TEI <teiCorpus> for CMC-specific collection contexts.
    slots:
      - xml_id
      - corpus_id
      - corpus_title
      - corpus_description
      - platforms_included
      - time_range_start
      - time_range_end
      - total_posts
      - total_participants
      - total_threads
      - languages_included
      - collection_methodology
      - sampling_strategy
      - anonymization_applied
      - ethical_approval
      - license
    annotations:
      glam_hypernym: "DOC.CRP.CMC"
      void_mapping: "void:Dataset"

# ===========================================================================
# NER EXTENSIONS FOR CMC
# ===========================================================================

  CMCEntityMention:
    description: >-
      An entity mention extracted from CMC text. Extends standard NER
      entity mention with CMC-specific attributes like platform context,
      mention format (@user, #hashtag), and informal language handling.
    slots:
      - xml_id
      - entity_text
      - entity_type
      - entity_ref
      - mention_format
      - is_hashtag_entity
      - is_mention_entity
      - is_url_entity
      - informal_variant
      - canonical_form
      - confidence_score
      - context_window
      - post_ref
    annotations:
      glam_hypernym: "NER.CMC.ENT"
      nif_mapping: "nif:String"

  # Pairs an original surface form with its normalized form.
  CMCEntityNormalization:
    description: >-
      Normalization of informal CMC entity mentions to canonical forms.
      Handles abbreviations, typos, slang, and platform-specific formats.
    slots:
      - xml_id
      - original_text
      - normalized_text
      - entity_ref
      - normalization_type
      - normalization_confidence
      - normalization_method
    annotations:
      glam_hypernym: "NER.CMC.NRM"

# =============================================================================
# SLOTS
# =============================================================================

slots:

# ---------------------------------------------------------------------------
# Common Identifiers
# ---------------------------------------------------------------------------

  # Identifier slot listed by every class in this schema.
  xml_id:
    description: "Unique identifier for the element"
    range: string
    identifier: true
    slot_uri: tei:id

# ---------------------------------------------------------------------------
# Post Slots
# ---------------------------------------------------------------------------

  post_type:
    description: "Type of CMC post (original, reply, repost, etc.)"
    range: CMCPostTypeEnum

  modality:
    description: "Modality of communication (written, spoken, mixed)"
    range: CMCModalityEnum
    slot_uri: tei:modality

  generated_by:
    description: "Source of content generation (human, bot, system, template)"
    range: GeneratedByEnum
    slot_uri: tei:generatedBy

  # Held as a plain string reference rather than a CMCPost-ranged slot.
  reply_to:
    description: "Reference to post being replied to"
    range: string
    slot_uri: tei:replyTo
    annotations:
      note: "Value is xml:id of the post being replied to"

  indent_level:
    description: "Indentation level for wiki-style threading (0=root)"
    range: integer
    slot_uri: tei:indentLevel
    minimum_value: 0

  who:
    description: "Reference to participant who authored the post"
    range: string
    slot_uri: tei:who

  when_written:
    description: "Time when the post was written/composed"
    range: datetime

  when_posted:
    description: "Time when the post was published/sent"
    range: datetime
    slot_uri: tei:when

  synch:
    description: "Synchronization point for temporal alignment"
    range: string
    slot_uri: tei:synch

  content_text:
    description: "Text content of the post"
    range: string

  embedded_media:
    description: "Media embedded in the post"
    range: CMCEmbeddedMedia
    multivalued: true

  mentions:
    description: "@-mentions in the post"
    range: CMCMention
    multivalued: true

  hashtags:
    description: "Hashtags in the post"
    range: CMCHashtag
    multivalued: true

  emojis:
    description: "Emoji in the post"
    range: CMCEmoji
    multivalued: true

  urls:
    description: "URLs in the post"
    range: string
    multivalued: true

  edit_history:
    description: "History of edits to the post"
    range: string
    multivalued: true

  # Single-valued: points at one aggregated CMCReactionSet.
  reactions:
    description: "Reactions to the post"
    range: CMCReactionSet

  platform_metadata:
    description: "Platform-specific metadata for the post"
    range: CMCPostMetadata

# ---------------------------------------------------------------------------
# Thread Slots
# ---------------------------------------------------------------------------

  thread_id:
    description: "Platform-specific thread identifier"
    range: string

  thread_title:
    description: "Title of the thread"
    range: string

  thread_structure:
    description: "Type of threading structure"
    range: ThreadStructureEnum

  original_post:
    description: "Reference to the original/root post"
    range: CMCPost

  posts:
    description: "Posts in the thread"
    range: CMCPost
    multivalued: true

  post_count:
    description: "Total number of posts in thread"
    range: integer

  participant_count:
    description: "Number of unique participants"
    range: integer

  # Also listed by CMCConversation.
  start_time:
    description: "Time of first post"
    range: datetime

  last_activity:
    description: "Time of most recent activity"
    range: datetime

  is_closed:
    description: "Whether thread is closed for new replies"
    range: boolean

  is_pinned:
    description: "Whether thread is pinned/sticky"
    range: boolean

  platform_thread_url:
    description: "URL to thread on platform"
    range: uri

# ---------------------------------------------------------------------------
# Conversation Slots
# ---------------------------------------------------------------------------

  conversation_id:
    description: "Identifier for the conversation"
    range: string

  conversation_title:
    description: "Title of the conversation"
    range: string

  threads:
    description: "Threads in the conversation"
    range: CMCThread
    multivalued: true

  channel_name:
    description: "Name of the channel/room"
    range: string

  end_time:
    description: "Time of conversation end"
    range: datetime

  # Also listed by CMCCorpus.
  total_posts:
    description: "Total posts in conversation"
    range: integer

  participants:
    description: "Participants in the conversation"
    range: CMCParticipant
    multivalued: true

# ---------------------------------------------------------------------------
# Participant Slots
# ---------------------------------------------------------------------------

  participant_id:
    description: "Unique identifier for participant"
    range: string

  username:
    description: "Platform username"
    range: string

  display_name:
    description: "Display name (may differ from username)"
    range: string

  anonymized_id:
    description: "Anonymized identifier (when privacy applied)"
    range: string

  anonymization_level:
    description: "Level of anonymization applied"
    range: AnonymizationLevelEnum

  account_created:
    description: "When the account was created"
    range: datetime

  account_verified:
    description: "Whether account is verified"
    range: boolean

  is_bot:
    description: "Whether account is a bot"
    range: boolean

  platform_user_url:
    description: "URL to user profile on platform"
    range: uri

  person_ref:
    description: "Reference to TEI person element"
    range: string

  role_in_conversation:
    description: "Role in conversation (moderator, admin, member)"
    range: string

  post_count_in_corpus:
    description: "Number of posts by this participant in corpus"
    range: integer

  demographic_info:
    description: "Demographic information (if available and consented)"
    range: string

# ---------------------------------------------------------------------------
# Group Slots
# ---------------------------------------------------------------------------

  group_id:
    description: "Identifier for the group"
    range: string

  group_name:
    description: "Name of the group"
    range: string

  group_type:
    description: "Type of group (public, private, etc.)"
    range: string

  members:
    description: "Members of the group"
    range: CMCParticipant
    multivalued: true

  member_count:
    description: "Number of members"
    range: integer

  creation_date:
    description: "When the group was created"
    range: datetime

  platform_group_url:
    description: "URL to group on platform"
    range: uri

# ---------------------------------------------------------------------------
# Emoji Slots
# ---------------------------------------------------------------------------

  emoji_char:
    description: "The emoji character"
    range: string

  emoji_codepoint:
    description: "Unicode codepoint(s) for the emoji"
    range: string

  emoji_name:
    description: "Short name/description of emoji"
    range: string

  emoji_category:
    description: "Emoji category"
    range: EmojiCategoryEnum

  is_custom:
    description: "Whether this is a custom/platform-specific emoji"
    range: boolean

  custom_emoji_url:
    description: "URL to custom emoji image"
    range: uri

  text_equivalent:
    description: "Text equivalent or description"
    range: string

  # Listed by the emoji, emoticon, hashtag and mention classes.
  position_in_post:
    description: "Character position in post text"
    range: integer

  # Bounded to the closed interval [-1.0, 1.0].
  sentiment_valence:
    description: "Sentiment valence (-1.0 to 1.0)"
    range: float
    minimum_value: -1.0
    maximum_value: 1.0

# ---------------------------------------------------------------------------
# Emoticon Slots
# ---------------------------------------------------------------------------

  emoticon_text:
    description: "The emoticon text (e.g., ':)', 'XD')"
    range: string

  normalized_form:
    description: "Normalized emoticon form"
    range: string

  emoji_equivalent:
    description: "Unicode emoji equivalent"
    range: string

# ---------------------------------------------------------------------------
# Hashtag Slots
# ---------------------------------------------------------------------------

  # Descriptions are quoted because they contain "#".
  hashtag_text:
    description: "Full hashtag text including #"
    range: string

  hashtag_normalized:
    description: "Normalized hashtag (lowercase, no #)"
    range: string

  is_trending:
    description: "Whether hashtag is trending"
    range: boolean

  topic_category:
    description: "Topic category for hashtag"
    range: string

# ---------------------------------------------------------------------------
|
|
# Mention Slots
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Complete mention text, leading '@' included (string).
mention_text:
  range: string
  description: 'Full mention text including @'
|
|
|
|
# Username that the mention points at (string).
mentioned_username:
  range: string
  description: 'Username being mentioned'
|
|
|
|
# Reference to a CMCParticipant, held as a plain string rather than a
# class-typed range.
mentioned_user_ref:
  range: string
  description: 'Reference to CMCParticipant'
|
|
|
|
# Boolean flag: mention was added automatically as part of a reply.
is_reply_mention:
  range: boolean
  description: 'Whether mention is auto-added reply mention'
|
|
|
|
# NER entity type, as a free-text string.
entity_type:
  range: string
  description: 'Entity type for NER'
|
|
|
|
# URI reference to a known entity (e.g. Wikidata or VIAF).
entity_ref:
  range: uri
  description: 'Reference to known entity (Wikidata, VIAF, etc.)'
|
|
|
|
# ---------------------------------------------------------------------------
# Media Slots
# ---------------------------------------------------------------------------
|
|
|
|
# Kind of media; range is CMCMediaTypeEnum.
media_type:
  range: CMCMediaTypeEnum
  description: 'Type of media'
|
|
|
|
# URI of the media content.
media_url:
  range: uri
  description: 'URL to media content'
|
|
|
|
# URI of the thumbnail.
thumbnail_url:
  range: uri
  description: 'URL to thumbnail'
|
|
|
|
# Accessibility alt text (string).
alt_text:
  range: string
  description: 'Alt text for accessibility'
|
|
|
|
# Caption attached to the media (string).
caption:
  range: string
  description: 'Caption for media'
|
|
|
|
# Duration of audio/video media in seconds (float).
duration_seconds:
  range: float
  description: 'Duration in seconds (for audio/video)'
|
|
|
|
# File size in bytes (integer).
file_size:
  range: integer
  description: 'File size in bytes'
|
|
|
|
# MIME type of the media (string).
mime_type:
  range: string
  description: 'MIME type'
|
|
|
|
# Boolean flag: content is quoted or embedded.
is_quoted_content:
  range: boolean
  description: 'Whether this is quoted/embedded content'
|
|
|
|
# String reference to the original post when the content is quoted.
original_post_ref:
  range: string
  description: 'Reference to original post (if quoted)'
|
|
|
|
# Text extracted from an image via OCR (string).
ocr_text:
  range: string
  description: 'OCR-extracted text from image'
|
|
|
|
# Transcription of audio or video content (string).
transcription:
  range: string
  description: 'Transcription of audio/video'
|
|
|
|
# ---------------------------------------------------------------------------
# Reaction Slots
# ---------------------------------------------------------------------------
|
|
|
|
# Kind of reaction (e.g. like, love, angry), as a free-text string.
reaction_type:
  range: string
  description: 'Type of reaction (like, love, angry, etc.)'
|
|
|
|
# Emoji used for the reaction (string).
reaction_emoji:
  range: string
  description: 'Emoji for reaction'
|
|
|
|
# String reference to the participant who reacted.
reactor_ref:
  range: string
  description: 'Reference to reactor participant'
|
|
|
|
# Datetime at which the reaction occurred.
reaction_time:
  range: datetime
  description: 'Time of reaction'
|
|
|
|
# Integer count for a single reaction type.
reaction_count:
  range: integer
  description: 'Number of this reaction type'
|
|
|
|
# Integer total of reactions.
total_reactions:
  range: integer
  description: 'Total reactions'
|
|
|
|
# Per-type reaction breakdown, carried as a single string value.
reaction_breakdown:
  range: string
  description: 'Breakdown by reaction type'
|
|
|
|
# Top reactors; a multivalued list of strings.
top_reactors:
  range: string
  multivalued: true
  description: 'Top reactors'
|
|
|
|
# ---------------------------------------------------------------------------
# Platform Metadata Slots
# ---------------------------------------------------------------------------
|
|
|
|
# Name of the platform (string).
platform_name:
  range: string
  description: 'Name of platform'
|
|
|
|
# Kind of platform; range is CMCPlatformTypeEnum.
platform_type:
  range: CMCPlatformTypeEnum
  description: 'Type of platform'
|
|
|
|
# Version of the platform or its API (string).
platform_version:
  range: string
  description: 'Version of platform/API'
|
|
|
|
# URI of the platform.
platform_url:
  range: uri
  description: 'URL to platform'
|
|
|
|
# API version used during collection (string).
api_version:
  range: string
  description: 'API version used for collection'
|
|
|
|
# When the data was collected; typed as a datetime.
collection_date:
  range: datetime
  description: 'Date of data collection'
|
|
|
|
# How the data was collected (free-text string).
collection_method:
  range: string
  description: 'Method of data collection'
|
|
|
|
# URI of the platform's terms of service.
terms_of_service_url:
  range: uri
  description: 'URL to platform ToS'
|
|
|
|
# Rate-limit information (free-text string).
rate_limit_info:
  range: string
  description: 'Rate limit information'
|
|
|
|
# Where the platform is geographically available (free-text string).
geographic_availability:
  range: string
  description: 'Geographic availability of platform'
|
|
|
|
# ---------------------------------------------------------------------------
# Post Metadata Slots
# ---------------------------------------------------------------------------
|
|
|
|
# Post identifier as assigned by the platform (string).
platform_post_id:
  range: string
  description: 'Platform-specific post ID'
|
|
|
|
# URI of the post on its platform.
platform_post_url:
  range: uri
  description: 'URL to post on platform'
|
|
|
|
# Integer number of views.
view_count:
  range: integer
  description: 'Number of views'
|
|
|
|
# Integer number of replies.
reply_count:
  range: integer
  description: 'Number of replies'
|
|
|
|
# Integer number of reposts or shares.
repost_count:
  range: integer
  description: 'Number of reposts/shares'
|
|
|
|
# Integer number of likes.
like_count:
  range: integer
  description: 'Number of likes'
|
|
|
|
# Integer number of quote posts.
quote_count:
  range: integer
  description: 'Number of quote posts'
|
|
|
|
# Integer number of bookmarks.
bookmark_count:
  range: integer
  description: 'Number of bookmarks'
|
|
|
|
# Boolean flag: post has been edited.
is_edited:
  range: boolean
  description: 'Whether post was edited'
|
|
|
|
# Integer number of edits.
edit_count:
  range: integer
  description: 'Number of edits'
|
|
|
|
# Detected language of the post, as a string.
language_detected:
  range: string
  description: 'Detected language of post'
|
|
|
|
# Boolean flag: post is marked as sensitive.
is_sensitive:
  range: boolean
  description: 'Whether post is marked sensitive'
|
|
|
|
# Boolean flag: post is sponsored or promoted.
is_sponsored:
  range: boolean
  description: 'Whether post is sponsored/promoted'
|
|
|
|
# Visibility setting of the post; a free-text string, not an enum.
visibility:
  range: string
  description: 'Visibility setting (public, private, followers)'
|
|
|
|
# ---------------------------------------------------------------------------
# Corpus Slots
# ---------------------------------------------------------------------------
|
|
|
|
# Identifier of the corpus (string).
corpus_id:
  range: string
  description: 'Identifier for corpus'
|
|
|
|
# Title of the corpus (string).
corpus_title:
  range: string
  description: 'Title of corpus'
|
|
|
|
# Free-text description of the corpus.
corpus_description:
  range: string
  description: 'Description of corpus'
|
|
|
|
# Platforms covered by the corpus; a multivalued list of strings.
platforms_included:
  range: string
  multivalued: true
  description: 'Platforms included in corpus'
|
|
|
|
# Datetime at which the time range begins.
time_range_start:
  range: datetime
  description: 'Start of time range'
|
|
|
|
# Datetime at which the time range ends.
time_range_end:
  range: datetime
  description: 'End of time range'
|
|
|
|
# Integer total of participants in the corpus.
total_participants:
  range: integer
  description: 'Total participants in corpus'
|
|
|
|
# Integer total of threads in the corpus.
total_threads:
  range: integer
  description: 'Total threads in corpus'
|
|
|
|
# Languages present in the corpus; a multivalued list of strings.
languages_included:
  range: string
  multivalued: true
  description: 'Languages in corpus'
|
|
|
|
# Methodology followed for collection (free-text string).
collection_methodology:
  range: string
  description: 'Methodology for collection'
|
|
|
|
# Sampling strategy that was used (free-text string).
sampling_strategy:
  range: string
  description: 'Sampling strategy used'
|
|
|
|
# Anonymization applied to the corpus; range is AnonymizationLevelEnum.
anonymization_applied:
  range: AnonymizationLevelEnum
  description: 'Anonymization applied to corpus'
|
|
|
|
# Information about ethical approval (free-text string).
ethical_approval:
  range: string
  description: 'Ethical approval information'
|
|
|
|
# License of the corpus, as a string. This is a slot definition and is
# separate from the schema-level `license` key at the top of the file.
license:
  range: string
  description: 'License for corpus'
|
|
|
|
# ---------------------------------------------------------------------------
# NER Entity Slots
# ---------------------------------------------------------------------------
|
|
|
|
# Original surface text of the entity mention (string).
entity_text:
  range: string
  description: 'Original text of entity mention'
|
|
|
|
# Format of the mention (hashtag, @mention, URL or plain), as a string.
mention_format:
  range: string
  description: 'Format of mention (hashtag, @mention, URL, plain)'
|
|
|
|
# Boolean flag: entity originates from a hashtag.
is_hashtag_entity:
  range: boolean
  description: 'Whether entity is from hashtag'
|
|
|
|
# Boolean flag: entity originates from an @mention.
is_mention_entity:
  range: boolean
  description: 'Whether entity is from @mention'
|
|
|
|
# Boolean flag: entity originates from a URL.
is_url_entity:
  range: boolean
  description: 'Whether entity is from URL'
|
|
|
|
# Informal or slang variant of the entity (string).
informal_variant:
  range: string
  description: 'Informal/slang variant of entity'
|
|
|
|
# Canonical form of the entity (string).
canonical_form:
  range: string
  description: 'Canonical form of entity'
|
|
|
|
# NER confidence as a float, bounded to the closed interval [0.0, 1.0].
confidence_score:
  range: float
  minimum_value: 0.0
  maximum_value: 1.0
  description: 'NER confidence score'
|
|
|
|
# Text surrounding the entity (string).
context_window:
  range: string
  description: 'Surrounding context for entity'
|
|
|
|
# String reference to the source post.
post_ref:
  range: string
  description: 'Reference to source post'
|
|
|
|
# ---------------------------------------------------------------------------
# Normalization Slots
# ---------------------------------------------------------------------------
|
|
|
|
# Text as it was before normalization (string).
original_text:
  range: string
  description: 'Original text before normalization'
|
|
|
|
# Text after normalization (string).
normalized_text:
  range: string
  description: 'Normalized text'
|
|
|
|
# Kind of normalization that was applied (free-text string).
normalization_type:
  range: string
  description: 'Type of normalization applied'
|
|
|
|
# Confidence in the applied normalization, as a float.
# Bounded to the closed interval [0.0, 1.0] so it is validated the same way as
# confidence_score, the other confidence slot in this schema; previously any
# float (including negative values or values above 1) was accepted.
normalization_confidence:
  description: "Confidence in normalization"
  range: float
  minimum_value: 0.0
  maximum_value: 1.0
|
|
|
|
# Method by which normalization was performed (free-text string).
normalization_method:
  range: string
  description: 'Method used for normalization'
|
|
|
|
# =============================================================================
# ONTOLOGY MAPPINGS SUMMARY
# =============================================================================
#
# SIOC (Semantically-Interlinked Online Communities):
#   - CMCPost: sioc:Post
#   - CMCThread: sioc:Thread
#   - CMCConversation: sioc:Forum
#   - CMCParticipant: sioc:UserAccount
#   - CMCParticipantGroup: sioc:Usergroup
#
# Activity Streams 2.0:
#   - CMCPost: as:Note, as:Article
#   - CMCReaction: as:Like, as:Announce
#   - CMCParticipant: as:Person
#
# Schema.org:
#   - CMCPost: schema:SocialMediaPosting
#   - CMCThread: schema:DiscussionForumPosting
#   - CMCParticipant: schema:Person
#   - CMCEmbeddedMedia: schema:MediaObject
#
# FOAF:
#   - CMCParticipant: foaf:OnlineAccount
#   - CMCParticipantGroup: foaf:Group
#
# CIDOC-CRM:
#   - CMCPost: crm:E33_Linguistic_Object
#   - CMCParticipant: crm:E39_Actor
#
# PROV-O:
#   - CMCPlatformMetadata: prov:Activity
#   - CMCCorpus: prov:Collection
#
# Web Annotation (W3C OA):
#   - CMCEntityMention: oa:Annotation
#
# NIF:
#   - CMCEntityMention: nif:String
#
# =============================================================================
|
|
|
|
# =============================================================================
# GLAM-NER HYPERNYM MAPPINGS
# =============================================================================
#
# TXT.CMC: Computer-mediated communication
#   TXT.CMC.PST: CMC post
#   TXT.CMC.THR: CMC thread
#   TXT.CMC.CNV: CMC conversation
#   TXT.CMC.EMJ: Emoji
#   TXT.CMC.EMO: Emoticon
#   TXT.CMC.RXN: Reaction
#   TXT.CMC.RXS: Reaction set
#
# AGT.CMC: CMC agents
#   AGT.CMC.USR: CMC user account
#
# GRP.CMC: CMC groups
#
# APP.CMC: CMC appellations
#   APP.CMC.HTG: Hashtag
#   APP.CMC.MEN: @-mention
#
# THG.CMC: CMC things
#   THG.CMC.MED: CMC embedded media
#
# DOC.MET.CMC: CMC metadata
#   DOC.MET.CMC.PST: Post metadata
#
# DOC.CRP.CMC: CMC corpus
#
# NER.CMC: NER for CMC
#   NER.CMC.ENT: CMC entity mention
#   NER.CMC.NRM: CMC entity normalization
#
# =============================================================================
|