# ============================================================================= # GLAM-NER: TEI P5 COMPUTER-MEDIATED COMMUNICATION (CMC) MODULE # ============================================================================= # Module: modules/advanced/tei/cmc.yaml # TEI Chapter: 9 (Computer-mediated Communication) # TEI Module Name: cmc # Version: 1.0.0 # Status: Complete # ============================================================================= # # This module provides LinkML representations of TEI P5 elements for encoding # computer-mediated communication including social media posts, chat messages, # forum threads, wiki discussions, and other digital discourse. Essential for # social media NER, online discourse analysis, and digital heritage collections. # # Key Features: # - Post element for CMC turns/messages # - Threading and reply structures # - Emoji and emoticon encoding # - Multimodal content (text, images, audio, video) # - Participant metadata and anonymization # - Bot/automated content detection # - Cross-platform CMC normalization # # TEI Source: https://tei-c.org/release/doc/tei-p5-doc/en/html/CMC.html # ============================================================================= id: https://w3id.org/glam/ner/tei/cmc name: glam-ner-tei-cmc title: "TEI P5 Computer-mediated Communication Module for GLAM-NER" version: "1.0.0" license: https://creativecommons.org/licenses/by/4.0/ see_also: - https://tei-c.org/release/doc/tei-p5-doc/en/html/CMC.html - https://www.w3.org/community/ontolex/wiki/CMC prefixes: linkml: https://w3id.org/linkml/ glam: https://w3id.org/glam/ner/ tei: http://www.tei-c.org/ns/1.0/ schema: http://schema.org/ sioc: http://rdfs.org/sioc/ns# as: https://www.w3.org/ns/activitystreams# foaf: http://xmlns.com/foaf/0.1/ dcterms: http://purl.org/dc/terms/ crm: http://www.cidoc-crm.org/cidoc-crm/ prov: http://www.w3.org/ns/prov# oa: http://www.w3.org/ns/oa# default_range: string imports: - linkml:types # 
============================================================================= # ENUMERATIONS # ============================================================================= enums: # --------------------------------------------------------------------------- # CMC Modality (written vs spoken) # --------------------------------------------------------------------------- CMCModalityEnum: description: >- Modality of computer-mediated communication. Distinguishes between written text-based communication and spoken/audio-based communication transmitted via digital channels. permissible_values: written: description: "Text-based CMC (chat, email, forum posts, social media text)" spoken: description: "Voice-based CMC (voice messages, audio calls, podcasts)" mixed: description: "Combines written and spoken modalities (video with text overlays)" # --------------------------------------------------------------------------- # Content Generation Source # --------------------------------------------------------------------------- GeneratedByEnum: description: >- Source of content generation for CMC posts. Distinguishes between human-authored content and various forms of automated/system-generated content. Essential for NER to identify bot-generated text. permissible_values: human: description: "Content authored by a human user" template: description: "Content generated from a template with user input" system: description: "System-generated content (notifications, status updates)" bot: description: "Content generated by an automated bot/AI agent" unknown: description: "Generation source cannot be determined" # --------------------------------------------------------------------------- # CMC Platform Type # --------------------------------------------------------------------------- CMCPlatformTypeEnum: description: >- Type of computer-mediated communication platform. Categorizes the technological context of CMC for normalization and analysis. 
permissible_values: social_media: description: "Social networking platforms (Twitter/X, Facebook, Instagram, LinkedIn)" chat: description: "Synchronous chat platforms (WhatsApp, Telegram, Slack, Discord)" forum: description: "Asynchronous discussion forums (Reddit, Stack Overflow, phpBB)" wiki_talk: description: "Wiki discussion pages (Wikipedia talk pages, MediaWiki)" email: description: "Email communication (Gmail, Outlook, mailing lists)" blog: description: "Blog platforms (WordPress, Medium, Blogger)" comment: description: "Comment sections (news articles, video comments)" microblog: description: "Microblogging platforms (Twitter/X, Mastodon, Bluesky)" video_live: description: "Live video streaming chat (YouTube Live, Twitch)" gaming: description: "Gaming communication (in-game chat, Discord gaming)" dating: description: "Dating platform messaging" professional: description: "Professional networking (LinkedIn messages)" customer_support: description: "Customer service chat systems" sms: description: "SMS/text messaging" other: description: "Other CMC platform type" # --------------------------------------------------------------------------- # Post Type # --------------------------------------------------------------------------- CMCPostTypeEnum: description: >- Type of CMC post within a communication thread. Distinguishes between original posts, replies, reposts/shares, and other post types. 
permissible_values: original: description: "Original post starting a new thread" reply: description: "Reply to another post" repost: description: "Share/repost of another post (RT, reblog)" quote: description: "Quote post with commentary" reaction: description: "Reaction-only post (emoji reaction, like)" edit: description: "Edit of a previous post" deletion: description: "Deletion marker (post was deleted)" system: description: "System message (join/leave notifications)" pinned: description: "Pinned/sticky post" # --------------------------------------------------------------------------- # Emoji Category # --------------------------------------------------------------------------- EmojiCategoryEnum: description: >- Unicode emoji category classification. Used for emoji/emoticon encoding in CMC transcriptions. permissible_values: smileys_emotion: description: "Smileys & Emotion (face expressions, hearts)" people_body: description: "People & Body (hand gestures, people)" animals_nature: description: "Animals & Nature (animals, plants)" food_drink: description: "Food & Drink" travel_places: description: "Travel & Places (buildings, transport)" activities: description: "Activities (sports, arts)" objects: description: "Objects (tools, symbols)" symbols: description: "Symbols (arrows, zodiac, flags)" flags: description: "Flags (country flags, special flags)" component: description: "Component (skin tones, hair)" # --------------------------------------------------------------------------- # Participant Anonymization Level # --------------------------------------------------------------------------- AnonymizationLevelEnum: description: >- Level of participant anonymization applied to CMC data. Important for privacy protection in CMC corpora and compliance with GDPR. 
permissible_values: none: description: "No anonymization (original usernames preserved)" pseudonymized: description: "Usernames replaced with consistent pseudonyms" anonymized: description: "Full anonymization (no identifying information)" aggregated: description: "Aggregated data (no individual posts)" # --------------------------------------------------------------------------- # Thread Structure Type # --------------------------------------------------------------------------- ThreadStructureEnum: description: >- Type of threading structure in CMC platform. Affects how replies and conversations are organized and visualized. permissible_values: flat: description: "Flat chronological list (no threading)" linear: description: "Linear thread with reply references" nested: description: "Nested/threaded replies (Reddit-style)" wiki_indent: description: "Wiki-style indentation threading" graph: description: "Graph structure (multiple parents)" # --------------------------------------------------------------------------- # Multimodal Content Type # --------------------------------------------------------------------------- CMCMediaTypeEnum: description: >- Type of multimodal content embedded in CMC posts. Posts may contain text plus images, video, audio, or other media. 
permissible_values: text_only: description: "Text-only post" image: description: "Post with image(s)" video: description: "Post with video" audio: description: "Post with audio/voice message" gif: description: "Post with animated GIF" sticker: description: "Post with sticker" file: description: "Post with file attachment" poll: description: "Post with poll" location: description: "Post with location/check-in" link: description: "Post with link preview" mixed: description: "Post with multiple media types" # ============================================================================= # CLASSES # ============================================================================= classes: # =========================================================================== # CORE POST ELEMENT # =========================================================================== CMCPost: description: >- A single post, message, or contribution in computer-mediated communication. The fundamental unit of CMC discourse, corresponding to TEI element. Can represent tweets, chat messages, forum posts, wiki talk contributions, email messages, blog comments, etc. class_uri: tei:post slots: - xml_id - post_type - modality - generated_by - reply_to - indent_level - who - when_written - when_posted - synch - content_text - embedded_media - mentions - hashtags - emojis - urls - edit_history - reactions - platform_metadata slot_usage: xml_id: required: true content_text: required: true annotations: tei_element: "post" tei_module: "cmc" glam_hypernym: "TXT.CMC.PST" ner_relevance: | CMC posts are primary sources for NER in social media and online discourse. May contain informal language, abbreviations, hashtags, and @mentions. Entity recognition must handle: platform-specific formats (@ for users, # for topics), emoji as sentiment markers, URLs as references, and non-standard orthography typical of CMC. 
# =========================================================================== # THREADING AND STRUCTURE # =========================================================================== CMCThread: description: >- A thread of related CMC posts, representing a conversation or discussion. Contains an ordered collection of posts with reply relationships. Used for forum threads, Twitter threads, email chains, chat conversations. class_uri: sioc:Thread slots: - xml_id - thread_id - thread_title - thread_structure - original_post - posts - post_count - participant_count - start_time - last_activity - is_closed - is_pinned - platform_thread_url slot_usage: xml_id: required: true posts: required: true multivalued: true annotations: glam_hypernym: "TXT.CMC.THR" sioc_mapping: "sioc:Thread" CMCConversation: description: >- A conversation context containing one or more related threads. Represents broader discourse context, such as all discussion on a topic across multiple threads, or a chat room conversation over time. class_uri: sioc:Forum slots: - xml_id - conversation_id - conversation_title - threads - platform_type - channel_name - start_time - end_time - total_posts - participants annotations: glam_hypernym: "TXT.CMC.CNV" sioc_mapping: "sioc:Forum" # =========================================================================== # PARTICIPANT METADATA # =========================================================================== CMCParticipant: description: >- A participant in computer-mediated communication. Represents a user account or identity within a CMC platform. May be pseudonymized or anonymized for privacy. Links to TEI for speaker identification. 
class_uri: sioc:UserAccount slots: - xml_id - participant_id - username - display_name - anonymized_id - anonymization_level - account_created - account_verified - is_bot - platform_user_url - person_ref - role_in_conversation - post_count_in_corpus - demographic_info slot_usage: xml_id: required: true annotations: glam_hypernym: "AGT.CMC.USR" sioc_mapping: "sioc:UserAccount" foaf_mapping: "foaf:OnlineAccount" privacy_note: | CMC participant data must comply with privacy regulations (GDPR, CCPA). Use anonymization_level to document privacy protection measures. Real identities should only be preserved when explicit consent exists or for public figures in public discourse. CMCParticipantGroup: description: >- A group of participants in CMC, such as members of a chat room, forum community, or social media follower group. class_uri: sioc:Usergroup slots: - xml_id - group_id - group_name - group_type - members - member_count - creation_date - platform_group_url annotations: glam_hypernym: "GRP.CMC" sioc_mapping: "sioc:Usergroup" # =========================================================================== # EMOJI AND EMOTICON ENCODING # =========================================================================== CMCEmoji: description: >- An emoji or emoticon in CMC text. Encodes both Unicode emoji and text-based emoticons (e.g., :) :-P). Essential for sentiment analysis and understanding informal CMC expression. slots: - xml_id - emoji_char - emoji_codepoint - emoji_name - emoji_category - is_custom - custom_emoji_url - text_equivalent - position_in_post - sentiment_valence annotations: glam_hypernym: "TXT.CMC.EMJ" ner_note: | Emoji can function as: sentiment markers, entity references (flag emoji for countries), topic markers, or standalone expressions. NER pipelines should consider emoji context for entity disambiguation and sentiment. CMCEmoticon: description: >- A text-based emoticon in CMC (e.g., :), :-), :P, <3, XD). 
Distinguished from Unicode emoji as ASCII-based representations. slots: - xml_id - emoticon_text - normalized_form - emoji_equivalent - position_in_post - sentiment_valence annotations: glam_hypernym: "TXT.CMC.EMO" # =========================================================================== # HASHTAGS AND MENTIONS # =========================================================================== CMCHashtag: description: >- A hashtag in CMC text (e.g., #BlackLivesMatter, #AI, #heritage). Hashtags function as topic markers, community identifiers, and sometimes as named entities themselves (event names, campaign names). slots: - xml_id - hashtag_text - hashtag_normalized - position_in_post - is_trending - topic_category - entity_ref annotations: glam_hypernym: "APP.CMC.HTG" ner_note: | Hashtags may represent: events (#Olympics2024), organizations (#UNESCO), movements (#MeToo), topics (#AI), or locations (#Paris). NER should consider hashtags as potential entity mentions. CMCMention: description: >- An @-mention of a user in CMC text (e.g., @username, @NASA). Mentions explicitly reference other users or accounts and function as named entity references within CMC. slots: - xml_id - mention_text - mentioned_username - mentioned_user_ref - position_in_post - is_reply_mention - entity_type - entity_ref annotations: glam_hypernym: "APP.CMC.MEN" ner_note: | @-mentions can reference: individuals (@jack), organizations (@NASA), bots (@github-actions), or fictional entities. NER should resolve mentions to known entities when possible. # =========================================================================== # MULTIMODAL CONTENT # =========================================================================== CMCEmbeddedMedia: description: >- Media content embedded in a CMC post (images, videos, audio, GIFs, stickers, files). Extends TEI
/ for CMC context. slots: - xml_id - media_type - media_url - thumbnail_url - alt_text - caption - duration_seconds - file_size - mime_type - is_quoted_content - original_post_ref - ocr_text - transcription annotations: glam_hypernym: "THG.CMC.MED" ner_note: | Embedded media may contain entities: images of people/places, screenshots with text, quoted posts with entity mentions. OCR/transcription enables NER on visual content. # =========================================================================== # REACTIONS AND ENGAGEMENT # =========================================================================== CMCReaction: description: >- A reaction to a CMC post (like, love, laugh, angry, etc.). Captures engagement metrics and sentiment signals. slots: - xml_id - reaction_type - reaction_emoji - reactor_ref - reaction_time - reaction_count annotations: glam_hypernym: "TXT.CMC.RXN" CMCReactionSet: description: >- Aggregated reactions on a CMC post, summarizing all reaction types and counts. Useful for engagement analysis. slots: - xml_id - total_reactions - reaction_breakdown - top_reactors annotations: glam_hypernym: "TXT.CMC.RXS" # =========================================================================== # PLATFORM METADATA # =========================================================================== CMCPlatformMetadata: description: >- Platform-specific metadata for CMC content. Captures technical and contextual information from the originating platform. slots: - xml_id - platform_name - platform_type - platform_version - platform_url - api_version - collection_date - collection_method - terms_of_service_url - rate_limit_info - geographic_availability annotations: glam_hypernym: "DOC.MET.CMC" prov_mapping: "prov:Activity" CMCPostMetadata: description: >- Platform-specific metadata for an individual CMC post. Includes platform IDs, engagement metrics, and technical details. 
slots: - xml_id - platform_post_id - platform_post_url - view_count - reply_count - repost_count - like_count - quote_count - bookmark_count - is_edited - edit_count - language_detected - is_sensitive - is_sponsored - visibility annotations: glam_hypernym: "DOC.MET.CMC.PST" # =========================================================================== # CMC CORPUS STRUCTURE # =========================================================================== CMCCorpus: description: >- A corpus of CMC data for linguistic or NER research. Extends TEI for CMC-specific collection contexts. slots: - xml_id - corpus_id - corpus_title - corpus_description - platforms_included - time_range_start - time_range_end - total_posts - total_participants - total_threads - languages_included - collection_methodology - sampling_strategy - anonymization_applied - ethical_approval - license annotations: glam_hypernym: "DOC.CRP.CMC" void_mapping: "void:Dataset" # =========================================================================== # NER EXTENSIONS FOR CMC # =========================================================================== CMCEntityMention: description: >- An entity mention extracted from CMC text. Extends standard NER entity mention with CMC-specific attributes like platform context, mention format (@user, #hashtag), and informal language handling. slots: - xml_id - entity_text - entity_type - entity_ref - mention_format - is_hashtag_entity - is_mention_entity - is_url_entity - informal_variant - canonical_form - confidence_score - context_window - post_ref annotations: glam_hypernym: "NER.CMC.ENT" nif_mapping: "nif:String" CMCEntityNormalization: description: >- Normalization of informal CMC entity mentions to canonical forms. Handles abbreviations, typos, slang, and platform-specific formats. 
slots: - xml_id - original_text - normalized_text - entity_ref - normalization_type - normalization_confidence - normalization_method annotations: glam_hypernym: "NER.CMC.NRM" # ============================================================================= # SLOTS # ============================================================================= slots: # --------------------------------------------------------------------------- # Common Identifiers # --------------------------------------------------------------------------- xml_id: description: "Unique identifier for the element" range: string identifier: true slot_uri: tei:id # --------------------------------------------------------------------------- # Post Slots # --------------------------------------------------------------------------- post_type: description: "Type of CMC post (original, reply, repost, etc.)" range: CMCPostTypeEnum modality: description: "Modality of communication (written, spoken, mixed)" range: CMCModalityEnum slot_uri: tei:modality generated_by: description: "Source of content generation (human, bot, system, template)" range: GeneratedByEnum slot_uri: tei:generatedBy reply_to: description: "Reference to post being replied to" range: string slot_uri: tei:replyTo annotations: note: "Value is xml:id of the post being replied to" indent_level: description: "Indentation level for wiki-style threading (0=root)" range: integer slot_uri: tei:indentLevel minimum_value: 0 who: description: "Reference to participant who authored the post" range: string slot_uri: tei:who when_written: description: "Time when the post was written/composed" range: datetime when_posted: description: "Time when the post was published/sent" range: datetime slot_uri: tei:when synch: description: "Synchronization point for temporal alignment" range: string slot_uri: tei:synch content_text: description: "Text content of the post" range: string embedded_media: description: "Media embedded in the post" range: CMCEmbeddedMedia 
multivalued: true mentions: description: "@-mentions in the post" range: CMCMention multivalued: true hashtags: description: "Hashtags in the post" range: CMCHashtag multivalued: true emojis: description: "Emoji in the post" range: CMCEmoji multivalued: true urls: description: "URLs in the post" range: string multivalued: true edit_history: description: "History of edits to the post" range: string multivalued: true reactions: description: "Reactions to the post" range: CMCReactionSet platform_metadata: description: "Platform-specific metadata for the post" range: CMCPostMetadata # --------------------------------------------------------------------------- # Thread Slots # --------------------------------------------------------------------------- thread_id: description: "Platform-specific thread identifier" range: string thread_title: description: "Title of the thread" range: string thread_structure: description: "Type of threading structure" range: ThreadStructureEnum original_post: description: "Reference to the original/root post" range: CMCPost posts: description: "Posts in the thread" range: CMCPost multivalued: true post_count: description: "Total number of posts in thread" range: integer participant_count: description: "Number of unique participants" range: integer start_time: description: "Time of first post" range: datetime last_activity: description: "Time of most recent activity" range: datetime is_closed: description: "Whether thread is closed for new replies" range: boolean is_pinned: description: "Whether thread is pinned/sticky" range: boolean platform_thread_url: description: "URL to thread on platform" range: uri # --------------------------------------------------------------------------- # Conversation Slots # --------------------------------------------------------------------------- conversation_id: description: "Identifier for the conversation" range: string conversation_title: description: "Title of the conversation" range: string threads: 
description: "Threads in the conversation" range: CMCThread multivalued: true channel_name: description: "Name of the channel/room" range: string end_time: description: "Time of conversation end" range: datetime total_posts: description: "Total posts in conversation" range: integer participants: description: "Participants in the conversation" range: CMCParticipant multivalued: true # --------------------------------------------------------------------------- # Participant Slots # --------------------------------------------------------------------------- participant_id: description: "Unique identifier for participant" range: string username: description: "Platform username" range: string display_name: description: "Display name (may differ from username)" range: string anonymized_id: description: "Anonymized identifier (when privacy applied)" range: string anonymization_level: description: "Level of anonymization applied" range: AnonymizationLevelEnum account_created: description: "When the account was created" range: datetime account_verified: description: "Whether account is verified" range: boolean is_bot: description: "Whether account is a bot" range: boolean platform_user_url: description: "URL to user profile on platform" range: uri person_ref: description: "Reference to TEI person element" range: string role_in_conversation: description: "Role in conversation (moderator, admin, member)" range: string post_count_in_corpus: description: "Number of posts by this participant in corpus" range: integer demographic_info: description: "Demographic information (if available and consented)" range: string # --------------------------------------------------------------------------- # Group Slots # --------------------------------------------------------------------------- group_id: description: "Identifier for the group" range: string group_name: description: "Name of the group" range: string group_type: description: "Type of group (public, private, etc.)" range: 
string members: description: "Members of the group" range: CMCParticipant multivalued: true member_count: description: "Number of members" range: integer creation_date: description: "When the group was created" range: datetime platform_group_url: description: "URL to group on platform" range: uri # --------------------------------------------------------------------------- # Emoji Slots # --------------------------------------------------------------------------- emoji_char: description: "The emoji character" range: string emoji_codepoint: description: "Unicode codepoint(s) for the emoji" range: string emoji_name: description: "Short name/description of emoji" range: string emoji_category: description: "Emoji category" range: EmojiCategoryEnum is_custom: description: "Whether this is a custom/platform-specific emoji" range: boolean custom_emoji_url: description: "URL to custom emoji image" range: uri text_equivalent: description: "Text equivalent or description" range: string position_in_post: description: "Character position in post text" range: integer sentiment_valence: description: "Sentiment valence (-1.0 to 1.0)" range: float minimum_value: -1.0 maximum_value: 1.0 # --------------------------------------------------------------------------- # Emoticon Slots # --------------------------------------------------------------------------- emoticon_text: description: "The emoticon text (e.g., ':)', 'XD')" range: string normalized_form: description: "Normalized emoticon form" range: string emoji_equivalent: description: "Unicode emoji equivalent" range: string # --------------------------------------------------------------------------- # Hashtag Slots # --------------------------------------------------------------------------- hashtag_text: description: "Full hashtag text including #" range: string hashtag_normalized: description: "Normalized hashtag (lowercase, no #)" range: string is_trending: description: "Whether hashtag is trending" range: boolean 
topic_category: description: "Topic category for hashtag" range: string # --------------------------------------------------------------------------- # Mention Slots # --------------------------------------------------------------------------- mention_text: description: "Full mention text including @" range: string mentioned_username: description: "Username being mentioned" range: string mentioned_user_ref: description: "Reference to CMCParticipant" range: string is_reply_mention: description: "Whether mention is auto-added reply mention" range: boolean entity_type: description: "Entity type for NER" range: string entity_ref: description: "Reference to known entity (Wikidata, VIAF, etc.)" range: uri # --------------------------------------------------------------------------- # Media Slots # --------------------------------------------------------------------------- media_type: description: "Type of media" range: CMCMediaTypeEnum media_url: description: "URL to media content" range: uri thumbnail_url: description: "URL to thumbnail" range: uri alt_text: description: "Alt text for accessibility" range: string caption: description: "Caption for media" range: string duration_seconds: description: "Duration in seconds (for audio/video)" range: float file_size: description: "File size in bytes" range: integer mime_type: description: "MIME type" range: string is_quoted_content: description: "Whether this is quoted/embedded content" range: boolean original_post_ref: description: "Reference to original post (if quoted)" range: string ocr_text: description: "OCR-extracted text from image" range: string transcription: description: "Transcription of audio/video" range: string # --------------------------------------------------------------------------- # Reaction Slots # --------------------------------------------------------------------------- reaction_type: description: "Type of reaction (like, love, angry, etc.)" range: string reaction_emoji: description: "Emoji for 
reaction" range: string reactor_ref: description: "Reference to reactor participant" range: string reaction_time: description: "Time of reaction" range: datetime reaction_count: description: "Number of this reaction type" range: integer total_reactions: description: "Total reactions" range: integer reaction_breakdown: description: "Breakdown by reaction type" range: string top_reactors: description: "Top reactors" range: string multivalued: true # --------------------------------------------------------------------------- # Platform Metadata Slots # --------------------------------------------------------------------------- platform_name: description: "Name of platform" range: string platform_type: description: "Type of platform" range: CMCPlatformTypeEnum platform_version: description: "Version of platform/API" range: string platform_url: description: "URL to platform" range: uri api_version: description: "API version used for collection" range: string collection_date: description: "Date of data collection" range: datetime collection_method: description: "Method of data collection" range: string terms_of_service_url: description: "URL to platform ToS" range: uri rate_limit_info: description: "Rate limit information" range: string geographic_availability: description: "Geographic availability of platform" range: string # --------------------------------------------------------------------------- # Post Metadata Slots # --------------------------------------------------------------------------- platform_post_id: description: "Platform-specific post ID" range: string platform_post_url: description: "URL to post on platform" range: uri view_count: description: "Number of views" range: integer reply_count: description: "Number of replies" range: integer repost_count: description: "Number of reposts/shares" range: integer like_count: description: "Number of likes" range: integer quote_count: description: "Number of quote posts" range: integer bookmark_count: 
description: "Number of bookmarks" range: integer is_edited: description: "Whether post was edited" range: boolean edit_count: description: "Number of edits" range: integer language_detected: description: "Detected language of post" range: string is_sensitive: description: "Whether post is marked sensitive" range: boolean is_sponsored: description: "Whether post is sponsored/promoted" range: boolean visibility: description: "Visibility setting (public, private, followers)" range: string # --------------------------------------------------------------------------- # Corpus Slots # --------------------------------------------------------------------------- corpus_id: description: "Identifier for corpus" range: string corpus_title: description: "Title of corpus" range: string corpus_description: description: "Description of corpus" range: string platforms_included: description: "Platforms included in corpus" range: string multivalued: true time_range_start: description: "Start of time range" range: datetime time_range_end: description: "End of time range" range: datetime total_participants: description: "Total participants in corpus" range: integer total_threads: description: "Total threads in corpus" range: integer languages_included: description: "Languages in corpus" range: string multivalued: true collection_methodology: description: "Methodology for collection" range: string sampling_strategy: description: "Sampling strategy used" range: string anonymization_applied: description: "Anonymization applied to corpus" range: AnonymizationLevelEnum ethical_approval: description: "Ethical approval information" range: string license: description: "License for corpus" range: string # --------------------------------------------------------------------------- # NER Entity Slots # --------------------------------------------------------------------------- entity_text: description: "Original text of entity mention" range: string mention_format: description: "Format of 
mention (hashtag, @mention, URL, plain)" range: string is_hashtag_entity: description: "Whether entity is from hashtag" range: boolean is_mention_entity: description: "Whether entity is from @mention" range: boolean is_url_entity: description: "Whether entity is from URL" range: boolean informal_variant: description: "Informal/slang variant of entity" range: string canonical_form: description: "Canonical form of entity" range: string confidence_score: description: "NER confidence score" range: float minimum_value: 0.0 maximum_value: 1.0 context_window: description: "Surrounding context for entity" range: string post_ref: description: "Reference to source post" range: string # --------------------------------------------------------------------------- # Normalization Slots # Pair an original text span with its normalized form and record the type, # method and confidence of the normalization. # normalization_confidence is bounded to 0.0-1.0, matching confidence_score. # --------------------------------------------------------------------------- original_text: description: "Original text before normalization" range: string normalized_text: description: "Normalized text" range: string normalization_type: description: "Type of normalization applied" range: string normalization_confidence: description: "Confidence in normalization" range: float minimum_value: 0.0 maximum_value: 1.0 normalization_method: description: "Method used for normalization" range: string # ============================================================================= # ONTOLOGY MAPPINGS SUMMARY # ============================================================================= # # SIOC (Semantically-Interlinked Online Communities): # - CMCPost: sioc:Post # - CMCThread: sioc:Thread # - CMCConversation: sioc:Forum # - CMCParticipant: sioc:UserAccount # - CMCParticipantGroup: sioc:Usergroup # # Activity Streams 2.0: # - CMCPost: as:Note, as:Article # - CMCReaction: as:Like, as:Announce # - CMCParticipant: as:Person # # Schema.org: # - CMCPost: schema:SocialMediaPosting # - CMCThread: schema:DiscussionForumPosting # - CMCParticipant: schema:Person # - CMCEmbeddedMedia: schema:MediaObject # # FOAF: # - CMCParticipant: foaf:OnlineAccount #
- CMCParticipantGroup: foaf:Group # # CIDOC-CRM: # - CMCPost: crm:E33_Linguistic_Object # - CMCParticipant: crm:E39_Actor # # PROV-O: # - CMCPlatformMetadata: prov:Activity # - CMCCorpus: prov:Collection # # Web Annotation (W3C OA): # - CMCEntityMention: oa:Annotation # # NIF (NLP Interchange Format): # - CMCEntityMention: nif:String # NOTE: the nif: prefix is not declared in this module's prefixes block. # # ============================================================================= # ============================================================================= # GLAM-NER HYPERNYM MAPPINGS # ============================================================================= # # TXT.CMC: Computer-mediated communication # TXT.CMC.PST: CMC post # TXT.CMC.THR: CMC thread # TXT.CMC.CNV: CMC conversation # TXT.CMC.EMJ: Emoji # TXT.CMC.EMO: Emoticon # TXT.CMC.RXN: Reaction # TXT.CMC.RXS: Reaction set # # AGT.CMC: CMC agents # AGT.CMC.USR: CMC user account # # GRP.CMC: CMC groups # # APP.CMC: CMC appellations # APP.CMC.HTG: Hashtag # APP.CMC.MEN: @-mention # # THG.CMC: CMC things # THG.CMC.MED: CMC embedded media # # DOC.MET.CMC: CMC metadata # DOC.MET.CMC.PST: Post metadata # # DOC.CRP.CMC: CMC corpus # # NER.CMC: NER for CMC # NER.CMC.ENT: CMC entity mention # NER.CMC.NRM: CMC entity normalization # # =============================================================================