# glam/data/entity_annotation/modules/advanced/tei/gaiji.yaml
# 2025-12-05 15:30:23 +01:00
#
# 948 lines
# 28 KiB
# YAML

# =============================================================================
# GLAM-NER: TEI P5 GAIJI MODULE (LINKML)
# =============================================================================
# Module: modules/advanced/tei/gaiji.yaml
# TEI Chapter: 5 - Characters, Glyphs, and Writing Modes
# TEI Module: gaiji
# Version: 1.0.0
# =============================================================================
#
# This module provides LinkML class definitions for TEI P5 character and
# glyph documentation elements. Essential for manuscript studies, East Asian
# texts, historical documents, and any text using non-standard characters.
#
# TEI Source: https://tei-c.org/release/doc/tei-p5-doc/en/html/WD.html
#
# Key Element Groups:
# - Character declarations: charDecl, char, glyph
# - Character reference: g (gaiji)
# - Character properties: charName, charProp, unicodeName, localProp
# - Character mapping: mapping, figure, graphic
# - Writing modes: style attributes for directionality
#
# Use Cases:
# - Medieval manuscript transcription (special letter forms)
# - CJK (Chinese, Japanese, Korean) character variants
# - Unicode Private Use Area documentation
# - Historical script representation
# - Rare or obsolete character forms
#
# =============================================================================
# Schema identity.
id: https://w3id.org/glam/ner/tei/gaiji
name: tei-gaiji
title: TEI Gaiji Module for GLAM-NER
version: '1.0.0'

# CURIE prefix declarations.
prefixes:
  linkml: https://w3id.org/linkml/
  tei: http://www.tei-c.org/ns/1.0/
  glam: https://w3id.org/glam/ner/
  crm: http://www.cidoc-crm.org/cidoc-crm/
  schema: http://schema.org/
  # NOTE(review): this IRI looks like the W3C URW3 incubator-group path
  # rather than a Unicode vocabulary; confirm it is the intended namespace.
  unicode: http://www.w3.org/2005/Incubator/urw3/
  dcterms: http://purl.org/dc/terms/
  skos: http://www.w3.org/2004/02/skos/core#

# Unprefixed names resolve against glam; slots without a range are strings.
default_prefix: glam
default_range: string

imports:
  - linkml:types
# =============================================================================
# ENUMERATIONS
# =============================================================================
enums:
  # -------------------------------------------------------------------------
  # Mapping Type
  # -------------------------------------------------------------------------
  CharMappingTypeEnum:
    description: >-
      Type of character mapping. From TEI @type on mapping element.
    permissible_values:
      standard:
        description: Standard Unicode character mapping
      PUA:
        description: Unicode Private Use Area codepoint
      variant:
        description: Variant character form
      normalized:
        description: Normalized form for searching/indexing
      simplified:
        description: Simplified form (e.g., simplified Chinese)
      traditional:
        description: Traditional form (e.g., traditional Chinese)
      historical:
        description: Historical character form

  # -------------------------------------------------------------------------
  # Unicode Category
  # -------------------------------------------------------------------------
  UnicodeCategoryEnum:
    description: >-
      Unicode General Category values.
    permissible_values:
      # Letters
      Lu:
        description: Letter, Uppercase
      Ll:
        description: Letter, Lowercase
      Lt:
        description: Letter, Titlecase
      Lm:
        description: Letter, Modifier
      Lo:
        description: Letter, Other
      # Marks
      Mn:
        description: Mark, Nonspacing
      Mc:
        description: Mark, Spacing Combining
      Me:
        description: Mark, Enclosing
      # Numbers
      Nd:
        description: Number, Decimal Digit
      Nl:
        description: Number, Letter
      # Quoted on purpose: YAML 1.1 loaders (e.g. PyYAML) resolve a bare No
      # to boolean false, which would replace this key with a non-string.
      'No':
        description: Number, Other
      # Punctuation
      Pc:
        description: Punctuation, Connector
      Pd:
        description: Punctuation, Dash
      Ps:
        description: Punctuation, Open
      Pe:
        description: Punctuation, Close
      Pi:
        description: Punctuation, Initial quote
      Pf:
        description: Punctuation, Final quote
      Po:
        description: Punctuation, Other
      # Symbols
      Sm:
        description: Symbol, Math
      Sc:
        description: Symbol, Currency
      Sk:
        description: Symbol, Modifier
      So:
        description: Symbol, Other
      # Separators
      Zs:
        description: Separator, Space
      Zl:
        description: Separator, Line
      Zp:
        description: Separator, Paragraph
      # Other
      Cc:
        description: Other, Control
      Cf:
        description: Other, Format
      Cs:
        description: Other, Surrogate
      Co:
        description: Other, Private Use
      Cn:
        description: Other, Not Assigned

  # -------------------------------------------------------------------------
  # Writing Direction
  # -------------------------------------------------------------------------
  WritingDirectionEnum:
    description: >-
      Writing direction for text blocks. Based on CSS Writing Modes.
    permissible_values:
      ltr:
        description: Left to right (English, Latin scripts)
      rtl:
        description: Right to left (Arabic, Hebrew)
      ttb:
        description: Top to bottom (vertical East Asian)
      btt:
        description: Bottom to top (rare, e.g., some Ogham)

  # -------------------------------------------------------------------------
  # Writing Mode
  # -------------------------------------------------------------------------
  WritingModeEnum:
    description: >-
      CSS Writing Mode values for text orientation.
    permissible_values:
      horizontal-tb:
        description: Horizontal, top to bottom line progression
      vertical-rl:
        description: Vertical, right to left line progression
      vertical-lr:
        description: Vertical, left to right line progression
      sideways-rl:
        description: Sideways right to left
      sideways-lr:
        description: Sideways left to right

  # -------------------------------------------------------------------------
  # Text Orientation
  # -------------------------------------------------------------------------
  TextOrientationEnum:
    description: >-
      CSS text-orientation values for glyph orientation in vertical text.
    permissible_values:
      mixed:
        description: Mixed orientation (CJK upright, others rotated)
      upright:
        description: All glyphs upright
      sideways:
        description: All glyphs rotated 90 degrees

  # -------------------------------------------------------------------------
  # Character Property Type
  # -------------------------------------------------------------------------
  CharPropertyTypeEnum:
    description: >-
      Types of character properties that can be documented.
    permissible_values:
      unicode_name:
        description: Official Unicode character name
      unicode_category:
        description: Unicode general category
      unicode_block:
        description: Unicode block name
      unicode_script:
        description: Unicode script property
      radical:
        description: CJK radical number
      stroke_count:
        description: Number of strokes (CJK)
      reading:
        description: Pronunciation/reading
      meaning:
        description: Semantic meaning
      decomposition:
        description: Character decomposition
      variant_of:
        description: Base character this is a variant of
# =============================================================================
# SLOTS (ATTRIBUTES)
# =============================================================================
slots:
  # -------------------------------------------------------------------------
  # Character identification
  # -------------------------------------------------------------------------
  unicode_codepoint:
    description: Unicode code point (e.g., U+4E2D)
    range: string
    # "U+" followed by 4 to 6 hexadecimal digits, either case.
    pattern: '^U\+[0-9A-Fa-f]{4,6}$'

  unicode_name:
    description: Official Unicode character name
    range: string

  unicode_category:
    description: Unicode general category
    range: UnicodeCategoryEnum

  unicode_block:
    description: Unicode block name
    range: string

  # -------------------------------------------------------------------------
  # Character representation
  # -------------------------------------------------------------------------
  glyph_ref:
    description: Reference to glyph definition
    range: string

  character_string:
    description: The actual character as a string
    range: string

  # -------------------------------------------------------------------------
  # Mapping attributes
  # -------------------------------------------------------------------------
  mapping_type:
    description: Type of character mapping
    range: CharMappingTypeEnum

  mapping_target:
    description: Target character or codepoint of mapping
    range: string

  # -------------------------------------------------------------------------
  # Property attributes
  # -------------------------------------------------------------------------
  property_name:
    description: Name of the character property
    range: string

  property_value:
    description: Value of the character property
    range: string

  property_type:
    description: Type of character property
    range: CharPropertyTypeEnum

  # -------------------------------------------------------------------------
  # Writing mode attributes
  # -------------------------------------------------------------------------
  writing_direction:
    description: Direction of text flow
    range: WritingDirectionEnum

  writing_mode:
    description: CSS writing mode
    range: WritingModeEnum

  text_orientation:
    description: Glyph orientation in vertical text
    range: TextOrientationEnum
# =============================================================================
# CLASSES
# =============================================================================
classes:
  # =========================================================================
  # DECLARATION CLASSES
  # =========================================================================
  # -------------------------------------------------------------------------
  # CharDecl - charDecl
  # -------------------------------------------------------------------------
  CharDecl:
    description: >-
      Container for character and glyph declarations used in a document.
      Corresponds to TEI <charDecl> element. Typically placed in encodingDesc.
    class_uri: crm:E90_Symbolic_Object
    annotations:
      tei_element: charDecl
      tei_module: gaiji
      glam_hypernym: DOC.MET.CHR
    attributes:
      xml_id:
        description: Unique identifier
        range: string
      desc:
        description: Description of the character declaration set
        range: string
      characters:
        description: Character definitions
        range: CharDef
        multivalued: true
      glyphs:
        description: Glyph definitions
        range: GlyphDef
        multivalued: true

  # -------------------------------------------------------------------------
  # CharDef - char
  # -------------------------------------------------------------------------
  CharDef:
    description: >-
      Definition of a non-standard character used in the document.
      Corresponds to TEI <char> element. Used for characters not in Unicode
      or requiring special documentation.
    class_uri: crm:E90_Symbolic_Object
    annotations:
      tei_element: char
      tei_module: gaiji
      glam_hypernym: TXT.CHR
    slots:
      - unicode_codepoint
    attributes:
      xml_id:
        description: Unique identifier for the character
        range: string
        required: true
      char_name:
        description: Name of the character
        range: CharName
      char_props:
        description: Unicode and other properties
        range: CharProp
        multivalued: true
      mappings:
        description: Mappings to other characters
        range: CharMapping
        multivalued: true
      figure:
        description: Graphical representation
        range: CharFigure
      desc:
        description: Description of the character
        range: string
      note:
        description: Editorial notes
        range: string
        multivalued: true

  # -------------------------------------------------------------------------
  # GlyphDef - glyph
  # -------------------------------------------------------------------------
  GlyphDef:
    description: >-
      Definition of a specific glyph (visual form) of a character.
      Corresponds to TEI <glyph> element. Used when multiple visual forms
      exist for a single character.
    class_uri: crm:E90_Symbolic_Object
    annotations:
      tei_element: glyph
      tei_module: gaiji
      glam_hypernym: TXT.GLY
    attributes:
      xml_id:
        description: Unique identifier for the glyph
        range: string
        required: true
      glyph_name:
        description: Name of the glyph
        range: GlyphName
      mappings:
        description: Mappings to characters
        range: CharMapping
        multivalued: true
      figure:
        description: Graphical representation
        range: CharFigure
      desc:
        description: Description of the glyph
        range: string
      note:
        description: Editorial notes
        range: string
        multivalued: true

  # =========================================================================
  # NAME AND PROPERTY CLASSES
  # =========================================================================
  # -------------------------------------------------------------------------
  # CharName - charName
  # -------------------------------------------------------------------------
  CharName:
    description: >-
      Name of a non-standard character.
      Corresponds to TEI <charName> element.
    # NOTE(review): skos:prefLabel is a SKOS property, not a class; confirm
    # that using it as class_uri is intended.
    class_uri: skos:prefLabel
    annotations:
      tei_element: charName
      tei_module: gaiji
      glam_hypernym: TXT.CHR.NAM
    attributes:
      value:
        description: Character name text
        range: string
        required: true

  # -------------------------------------------------------------------------
  # GlyphName - glyphName
  # -------------------------------------------------------------------------
  GlyphName:
    description: >-
      Name of a glyph.
      Corresponds to TEI <glyphName> element.
    # NOTE(review): skos:prefLabel is a SKOS property, not a class; confirm
    # that using it as class_uri is intended.
    class_uri: skos:prefLabel
    annotations:
      tei_element: glyphName
      tei_module: gaiji
      glam_hypernym: TXT.GLY.NAM
    attributes:
      value:
        description: Glyph name text
        range: string
        required: true

  # -------------------------------------------------------------------------
  # CharProp - charProp
  # -------------------------------------------------------------------------
  CharProp:
    description: >-
      A property of a character (Unicode name, category, etc.).
      Corresponds to TEI <charProp> element.
    class_uri: crm:E55_Type
    annotations:
      tei_element: charProp
      tei_module: gaiji
      glam_hypernym: TXT.CHR.PRP
    slots:
      - property_type
    attributes:
      local_name:
        description: Local property name element
        range: LocalProp
      # NOTE(review): shares its name with the schema-level unicode_name slot
      # (range string) but has range UnicodeName; confirm generators that
      # flatten slot names handle this as intended.
      unicode_name:
        description: Unicode property name element
        range: UnicodeName
      value:
        description: Property value element
        range: PropValue

  # -------------------------------------------------------------------------
  # LocalProp - localProp
  # -------------------------------------------------------------------------
  LocalProp:
    description: >-
      Local (project-specific) property name for a character.
      Corresponds to TEI <localProp> element.
    annotations:
      tei_element: localProp
      tei_module: gaiji
      glam_hypernym: TXT.CHR.LCL
    slots:
      - property_name
    attributes:
      name:
        description: Property name
        range: string
        required: true
      value:
        description: Property value
        range: string
        required: true

  # -------------------------------------------------------------------------
  # UnicodeName - unicodeName
  # -------------------------------------------------------------------------
  UnicodeName:
    description: >-
      Official Unicode property name.
      Corresponds to TEI <unicodeName> element.
    class_uri: unicode:Property
    annotations:
      tei_element: unicodeName
      tei_module: gaiji
      glam_hypernym: TXT.CHR.UNI
    slots:
      - unicode_name
    attributes:
      value:
        description: Unicode property name
        range: string
        required: true

  # -------------------------------------------------------------------------
  # PropValue - value
  # -------------------------------------------------------------------------
  PropValue:
    description: >-
      Value element for character properties.
      Corresponds to TEI <value> element within charProp.
    annotations:
      tei_element: value
      tei_module: gaiji
      glam_hypernym: TXT.CHR.VAL
    slots:
      - property_value
    attributes:
      value:
        description: Property value content
        range: string
        required: true

  # =========================================================================
  # MAPPING CLASSES
  # =========================================================================
  # -------------------------------------------------------------------------
  # CharMapping - mapping
  # -------------------------------------------------------------------------
  CharMapping:
    description: >-
      Maps a character to another character or codepoint.
      Corresponds to TEI <mapping> element. Used for normalization,
      variant relationships, and PUA mappings.
    # NOTE(review): skos:exactMatch is a SKOS property, not a class; confirm
    # that using it as class_uri is intended.
    class_uri: skos:exactMatch
    annotations:
      tei_element: mapping
      tei_module: gaiji
      glam_hypernym: TXT.CHR.MAP
    slots:
      - mapping_type
      - mapping_target
    attributes:
      type:
        description: Type of mapping (standard, PUA, variant, normalized)
        range: CharMappingTypeEnum
      target_char:
        description: Target character string
        range: string
      target_codepoint:
        description: Target Unicode codepoint
        range: string
      subtype:
        description: More specific mapping type
        range: string

  # =========================================================================
  # FIGURE/GRAPHIC CLASSES
  # =========================================================================
  # -------------------------------------------------------------------------
  # CharFigure - figure
  # -------------------------------------------------------------------------
  CharFigure:
    description: >-
      Graphical representation of a character or glyph.
      Corresponds to TEI <figure> element within char/glyph context.
      Contains graphic elements pointing to glyph images.
    class_uri: schema:ImageObject
    annotations:
      tei_element: figure
      tei_module: gaiji
      glam_hypernym: THG.IMG.CHR
    attributes:
      xml_id:
        description: Unique identifier
        range: string
      graphics:
        description: Graphic representations
        range: CharGraphic
        multivalued: true
      desc:
        description: Description of the figure
        range: string

  # -------------------------------------------------------------------------
  # CharGraphic - graphic
  # -------------------------------------------------------------------------
  CharGraphic:
    description: >-
      Graphic image representing a character glyph.
      Corresponds to TEI <graphic> element within figure.
    class_uri: schema:ImageObject
    annotations:
      tei_element: graphic
      tei_module: gaiji
      glam_hypernym: THG.IMG
    attributes:
      url:
        description: URL to the glyph image
        range: uri
        required: true
      width:
        description: Image width
        range: string
      height:
        description: Image height
        range: string
      mime_type:
        description: MIME type of the image
        range: string
      desc:
        description: Description
        range: string

  # =========================================================================
  # REFERENCE CLASS
  # =========================================================================
  # -------------------------------------------------------------------------
  # GaijiRef - g
  # -------------------------------------------------------------------------
  GaijiRef:
    description: >-
      Inline reference to a defined character or glyph.
      Corresponds to TEI <g> element. Used in text to mark non-standard
      characters that are defined in charDecl.
    class_uri: crm:E90_Symbolic_Object
    annotations:
      tei_element: g
      tei_module: gaiji
      glam_hypernym: TXT.CHR.REF
    slots:
      - glyph_ref
      - character_string
    attributes:
      ref:
        description: Reference to char or glyph definition (URI fragment)
        range: string
        required: true
      # Quoted defensively: some YAML 1.1 resolvers treat a bare n as a
      # boolean, which would turn this attribute name into a non-string key.
      'n':
        description: Number or identifier
        range: string
      content:
        description: Fallback character content
        range: string

  # =========================================================================
  # WRITING MODE CLASSES
  # =========================================================================
  # -------------------------------------------------------------------------
  # WritingModeSpec - (style attribute values)
  # -------------------------------------------------------------------------
  WritingModeSpec:
    description: >-
      Specification of writing mode properties for a text block.
      Captures CSS Writing Modes properties as used in TEI @style attribute.
      Not a TEI element but a value structure.
    annotations:
      # NOTE(review): null annotation value (no TEI element); confirm the
      # schema loader in use accepts an annotation without a value.
      tei_element: null
      tei_module: gaiji
      glam_hypernym: TXT.WRT.MOD
    slots:
      - writing_direction
      - writing_mode
      - text_orientation
    # writing_mode and text_orientation are declared once, through the shared
    # slots above. Declaring same-named entries under attributes as well would
    # define each name twice on this class; slot_usage supplies the
    # class-specific descriptions instead (the ranges come from the slots).
    slot_usage:
      writing_mode:
        description: CSS writing-mode property
      text_orientation:
        description: CSS text-orientation property
    attributes:
      direction:
        description: Text direction (ltr, rtl)
        range: WritingDirectionEnum
      transform:
        description: CSS transform for rotation
        range: string

  # -------------------------------------------------------------------------
  # BiDiOverride - (for explicit directionality)
  # -------------------------------------------------------------------------
  BiDiOverride:
    description: >-
      Specification for bidirectional text override.
      Captures Unicode Bidi Algorithm override values.
    annotations:
      tei_element: null
      tei_module: gaiji
      glam_hypernym: TXT.WRT.BDI
    attributes:
      unicode_bidi:
        description: Unicode-bidi property (normal, embed, override, isolate)
        range: string
      direction:
        description: Direction property (ltr, rtl)
        range: WritingDirectionEnum

  # =========================================================================
  # CJK-SPECIFIC CLASSES
  # =========================================================================
  # -------------------------------------------------------------------------
  # CJKCharProperties - (extension for CJK characters)
  # -------------------------------------------------------------------------
  CJKCharProperties:
    description: >-
      Extended properties for CJK (Chinese, Japanese, Korean) characters.
      Captures radical, stroke count, readings, and variant relationships.
    annotations:
      tei_element: null
      tei_module: gaiji
      glam_hypernym: TXT.CHR.CJK
    attributes:
      radical_number:
        description: Kangxi radical number (1-214)
        range: integer
        minimum_value: 1
        maximum_value: 214
      additional_strokes:
        description: Strokes beyond the radical
        range: integer
      total_strokes:
        description: Total stroke count
        range: integer
      pinyin:
        description: Mandarin pinyin reading(s)
        range: string
        multivalued: true
      on_reading:
        description: Japanese on'yomi reading(s)
        range: string
        multivalued: true
      kun_reading:
        description: Japanese kun'yomi reading(s)
        range: string
        multivalued: true
      hangul:
        description: Korean hangul reading(s)
        range: string
        multivalued: true
      vietnamese:
        description: Vietnamese reading(s)
        range: string
        multivalued: true
      simplified_variant:
        description: Simplified Chinese variant
        range: string
      traditional_variant:
        description: Traditional Chinese variant
        range: string
      japanese_variant:
        description: Japanese shinjitai variant
        range: string
      semantic_variant:
        description: Semantic variant characters
        range: string
        multivalued: true
      z_variant:
        description: Z-variant (graphical variant) characters
        range: string
        multivalued: true

  # =========================================================================
  # MEDIEVAL MANUSCRIPT CLASSES
  # =========================================================================
  # -------------------------------------------------------------------------
  # MedievalCharProperties - (extension for medieval scripts)
  # -------------------------------------------------------------------------
  MedievalCharProperties:
    description: >-
      Extended properties for medieval manuscript characters.
      Captures letterforms, abbreviation marks, and scribal conventions.
    annotations:
      tei_element: null
      tei_module: gaiji
      glam_hypernym: TXT.CHR.MED
    attributes:
      letterform:
        description: Type of letterform (long s, round r, etc.)
        range: string
      abbreviation_mark:
        description: Type of abbreviation mark if applicable
        range: string
      expansion:
        description: Expanded form if abbreviation
        range: string
      script_type:
        description: Script type (caroline, gothic, etc.)
        range: string
      period:
        description: Typical period of use
        range: string
      region:
        description: Geographic region of use
        range: string
# =============================================================================
# ONTOLOGY MAPPINGS SUMMARY
# =============================================================================
#
# CIDOC-CRM:
# - CharDecl: crm:E90_Symbolic_Object
# - CharDef: crm:E90_Symbolic_Object
# - GlyphDef: crm:E90_Symbolic_Object
# - CharProp: crm:E55_Type
# - GaijiRef: crm:E90_Symbolic_Object
#
# SKOS:
# - CharName: skos:prefLabel
# - GlyphName: skos:prefLabel
# - CharMapping: skos:exactMatch
#
# Schema.org:
# - CharFigure: schema:ImageObject
# - CharGraphic: schema:ImageObject
#
# Unicode (W3C):
# - UnicodeName: unicode:Property
#
# =============================================================================