# File: glam/data/entity_annotation/modules/advanced/tei/analysis.yaml
# Timestamp: 2025-12-05 15:30:23 +01:00
#
# 976 lines
# 28 KiB
# YAML

# =============================================================================
# GLAM-NER: TEI Analysis Module (LinkML)
# =============================================================================
# Module: modules/advanced/tei/analysis.yaml
# TEI Chapter: 18 - Simple Analytic Mechanisms
# TEI Module: analysis
# Version: 1.0.0
# =============================================================================
#
# This module defines LinkML classes for TEI P5 Chapter 18 elements used for
# simple analytic mechanisms including linguistic segmentation, POS tagging,
# morphological analysis, syntactic annotation, and interpretive markup.
#
# Key Element Groups:
# - Linguistic Segments: s, cl, phr, w, m, c, pc
# - Spans and Interpretations: span, spanGrp, interp, interpGrp
# - Analysis Attributes: @ana, @lemma, @pos, @msd, @join
#
# GLAM-NER Integration:
# - Linguistic annotation for NLP preprocessing
# - POS tagging and lemmatization output
# - Syntactic structure annotation
# - Interpretive annotation for entity classification
#
# Ontology Alignments:
# - NIF (NLP Interchange Format): Linguistic annotation
# - OntoLex-Lemon: Lexical entries and forms
# - OLiA (Ontologies of Linguistic Annotation): POS tags
# - CIDOC-CRM: Interpretive assertions
# - Web Annotation: Span annotations
#
# =============================================================================
# Schema identity, versioning and licensing.
id: https://w3id.org/glam/ner/tei/analysis
name: glam-ner-tei-analysis
title: 'TEI Analysis Module for GLAM-NER'
version: '1.0.0'
license: https://creativecommons.org/licenses/by/4.0/
see_also:
  - https://tei-c.org/release/doc/tei-p5-doc/en/html/AI.html

# CURIE prefixes used by `meaning`, `slot_uri`, `class_uri` and the
# *_mapping annotations throughout this schema.
prefixes:
  linkml: https://w3id.org/linkml/
  tei: http://www.tei-c.org/ns/1.0/
  glam: https://w3id.org/glam/ner/
  nif: http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#
  ontolex: http://www.w3.org/ns/lemon/ontolex#
  olia: http://purl.org/olia/olia.owl#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  skos: http://www.w3.org/2004/02/skos/core#
  schema: http://schema.org/

# Elements without an explicit URI fall under `glam`; slots without an
# explicit range are strings.
default_prefix: glam
default_range: string

imports:
  - linkml:types
# =============================================================================
# ENUMERATIONS
# =============================================================================
enums:
# ---------------------------------------------------------------------------
# Linguistic Unit Type Enum
# ---------------------------------------------------------------------------
# Segmentation granularity levels; this enum is the range of the
# `segment_type` slot.
LinguisticUnitTypeEnum:
  description: Types of linguistic units for segmentation.
  permissible_values:
    sentence:
      meaning: nif:Sentence
      description: Sentence unit (s-unit)
    clause:
      meaning: olia:Clause
      description: Clause (syntactic unit)
    phrase:
      meaning: olia:Phrase
      description: Phrase (noun phrase, verb phrase, etc.)
    word:
      meaning: nif:Word
      description: Word (grammatical word)
      aliases:
        - token
    morpheme:
      meaning: olia:Morpheme
      description: Morpheme (minimal meaningful unit)
    character:
      meaning: nif:Character
      description: Character (single grapheme)
    punctuation:
      meaning: olia:Punctuation
      description: Punctuation mark
# ---------------------------------------------------------------------------
# Part of Speech Enum (Universal Dependencies tagset)
# ---------------------------------------------------------------------------
# Coarse part-of-speech inventory: 17 tags, each mapped to an OLiA
# concept via `meaning`.
POSTagEnum:
  description: >-
    Part of speech tags based on Universal Dependencies (UD) tagset. Extended
    with additional categories for compatibility with other tagsets.
  permissible_values:
    # --- Open class words ---
    ADJ:
      meaning: olia:Adjective
      description: Adjective
    ADV:
      meaning: olia:Adverb
      description: Adverb
    INTJ:
      meaning: olia:Interjection
      description: Interjection
    NOUN:
      meaning: olia:Noun
      description: Noun
    PROPN:
      meaning: olia:ProperNoun
      description: Proper noun
    VERB:
      meaning: olia:Verb
      description: Verb
    # --- Closed class words ---
    ADP:
      meaning: olia:Adposition
      description: Adposition (preposition/postposition)
    AUX:
      meaning: olia:AuxiliaryVerb
      description: Auxiliary verb
    CCONJ:
      meaning: olia:CoordinatingConjunction
      description: Coordinating conjunction
    DET:
      meaning: olia:Determiner
      description: Determiner
    NUM:
      meaning: olia:Numeral
      description: Numeral
    PART:
      meaning: olia:Particle
      description: Particle
    PRON:
      meaning: olia:Pronoun
      description: Pronoun
    SCONJ:
      meaning: olia:SubordinatingConjunction
      description: Subordinating conjunction
    # --- Other ---
    PUNCT:
      meaning: olia:Punctuation
      description: Punctuation
    SYM:
      meaning: olia:Symbol
      description: Symbol
    X:
      meaning: olia:Residual
      description: Other/Unknown
# ---------------------------------------------------------------------------
# Phrase Type Enum
# ---------------------------------------------------------------------------
# Phrase-level category labels; this enum is the range of the `phrase_type`
# slot. CP, DP and IP carry no ontology mapping.
PhraseTypeEnum:
  description: Types of syntactic phrases.
  permissible_values:
    NP:
      meaning: olia:NounPhrase
      description: Noun phrase
    VP:
      meaning: olia:VerbPhrase
      description: Verb phrase
    PP:
      meaning: olia:PrepositionalPhrase
      description: Prepositional phrase
    AP:
      meaning: olia:AdjectivePhrase
      description: Adjective phrase
    ADVP:
      meaning: olia:AdverbPhrase
      description: Adverb phrase
    S:
      meaning: olia:Sentence
      description: Sentence/clause
    SBAR:
      meaning: olia:SubordinateClause
      description: Subordinate clause
    CP:
      description: Complementizer phrase
    DP:
      description: Determiner phrase
    IP:
      description: Inflectional phrase
# ---------------------------------------------------------------------------
# Join Direction Enum
# ---------------------------------------------------------------------------
# Values of the TEI @join attribute; this enum is the range of the
# `join_direction` slot.
JoinDirectionEnum:
  description: >-
    Direction of token joining (for handling whitespace).
  permissible_values:
    left:
      description: Join to preceding token (no space before)
    right:
      description: Join to following token (no space after)
    both:
      description: Join to both adjacent tokens
    overlap:
      description: Token overlaps with neighbors
    # TEI @join also allows an explicit "not adjacent" value. The key is
    # quoted because YAML 1.1 loaders read a bare `no` as boolean false.
    'no':
      description: Token is not adjacent to another token
# ---------------------------------------------------------------------------
# Morphological Feature Type Enum
# ---------------------------------------------------------------------------
# Names of morphological features; this enum is the range of
# `MorphFeature.feature_name`. Poss, PronType and Reflex carry no ontology
# mapping.
MorphFeatureTypeEnum:
  description: Types of morphological features (Universal Features).
  permissible_values:
    # --- Nominal features ---
    Case:
      meaning: olia:hasCase
      description: Grammatical case
    Definite:
      meaning: olia:hasDefiniteness
      description: Definiteness
    Gender:
      meaning: olia:hasGender
      description: Grammatical gender
    Number:
      meaning: olia:hasNumber
      description: Grammatical number
    # --- Verbal features ---
    Aspect:
      meaning: olia:hasAspect
      description: Grammatical aspect
    Mood:
      meaning: olia:hasMood
      description: Grammatical mood
    Person:
      meaning: olia:hasPerson
      description: Grammatical person
    Tense:
      meaning: olia:hasTense
      description: Grammatical tense
    VerbForm:
      meaning: olia:hasVerbForm
      description: Verb form (finite, infinitive, participle, etc.)
    Voice:
      meaning: olia:hasVoice
      description: Grammatical voice
    # --- Other features ---
    Degree:
      meaning: olia:hasDegree
      description: Degree of comparison
    Polarity:
      meaning: olia:hasPolarity
      description: Polarity (affirmative/negative)
    Poss:
      description: Possessive
    PronType:
      description: Pronoun type
    Reflex:
      description: Reflexive
# ---------------------------------------------------------------------------
# Interpretation Type Enum
# ---------------------------------------------------------------------------
# Categories of interpretive annotation; this enum is the range of the
# `interp_type` slot. Only `entity` carries an ontology mapping.
InterpretationTypeEnum:
  description: Types of interpretive annotation.
  permissible_values:
    semantic:
      description: Semantic interpretation
    thematic:
      description: Thematic/topic interpretation
    stylistic:
      description: Stylistic interpretation
    pragmatic:
      description: Pragmatic interpretation
    discourse:
      description: Discourse-level interpretation
    rhetorical:
      description: Rhetorical interpretation
    cultural:
      description: Cultural/historical interpretation
    entity:
      meaning: oa:identifying
      description: Named entity interpretation
# =============================================================================
# SLOTS (Attributes)
# =============================================================================
slots:
# ---------------------------------------------------------------------------
# Global Analysis Attributes
# ---------------------------------------------------------------------------
# Pointer(s) from an element to its interpretation(s); multivalued.
ana:
  slot_uri: tei:ana
  range: uriorcurie
  multivalued: true
  description: >-
    Analysis reference - points to interpretation element(s) for this
    segment. TEI @ana attribute, equivalent to NIF annotation reference.
  annotations:
    tei_attribute: ana
    nif_mapping: nif:annotation
# Dictionary form of a token, stored as a free string.
lemma:
  slot_uri: tei:lemma
  range: string
  description: >-
    Base form (lemma/dictionary form) of a word. TEI @lemma attribute.
  annotations:
    tei_attribute: lemma
    ontolex_mapping: ontolex:canonicalForm
# Free-string POS tag; deliberately not restricted to POSTagEnum so that any
# tagset can be recorded.
pos:
  slot_uri: tei:pos
  range: string
  description: >-
    Part of speech tag. Can use any tagset but Universal Dependencies
    recommended for interoperability.
  annotations:
    tei_attribute: pos
    olia_mapping: olia:hasTag
# Morphosyntactic description kept as a single opaque string; the
# structured alternative in this schema is the `morph_features` slot.
msd:
  slot_uri: tei:msd
  range: string
  description: >-
    Morphosyntactic description - detailed morphological features.
    Typically in CoNLL-U or similar format (e.g., "Case=Nom|Number=Sing").
  annotations:
    tei_attribute: msd
# Adjacency of a token to its neighbours; values come from JoinDirectionEnum.
join_direction:
  slot_uri: tei:join
  range: JoinDirectionEnum
  description: >-
    How this token joins with adjacent tokens (whitespace handling). TEI
    @join attribute.
  annotations:
    tei_attribute: join
# ---------------------------------------------------------------------------
# Segmentation Attributes
# ---------------------------------------------------------------------------
# Which kind of linguistic unit a segment is; concrete classes set a default
# for this slot through slot_usage.
segment_type:
  range: LinguisticUnitTypeEnum
  description: Type of linguistic segment.
# Free-text role label for a segment.
segment_function:
  range: string
  description: Grammatical or functional role of segment.
# Boolean flag separating attested text from editorial/analytical additions.
real:
  slot_uri: tei:real
  range: boolean
  description: >-
    Whether segment represents actual occurrence in source. False for
    editorial/analytical additions.
  annotations:
    tei_attribute: real
# ---------------------------------------------------------------------------
# Word/Token Attributes
# ---------------------------------------------------------------------------
# Regularized spelling of a token.
norm:
  slot_uri: tei:norm
  range: string
  description: Normalized/regularized form of word.
  annotations:
    tei_attribute: norm
# Source spelling of a token, the counterpart of `norm`.
orig:
  slot_uri: tei:orig
  range: string
  description: Original form (before normalization).
  annotations:
    tei_attribute: orig
# ---------------------------------------------------------------------------
# Span Attributes
# ---------------------------------------------------------------------------
# Pointer to where a span begins.
span_from:
  slot_uri: tei:from
  range: uriorcurie
  description: Start point of span annotation (TEI @from).
  annotations:
    tei_attribute: from
# Pointer to where a span ends.
span_to:
  slot_uri: tei:to
  range: uriorcurie
  description: End point of span annotation (TEI @to).
  annotations:
    tei_attribute: to
# One or more pointers to the annotated element(s); no slot_uri or TEI
# attribute is declared for this slot.
span_target:
  range: uriorcurie
  multivalued: true
  description: Target element(s) of span annotation.
# ---------------------------------------------------------------------------
# Interpretation Attributes
# ---------------------------------------------------------------------------
# Category of an interpretation, restricted to InterpretationTypeEnum.
interp_type:
  range: InterpretationTypeEnum
  description: Type of interpretation.
# Free-text payload of an interpretation.
interp_value:
  range: string
  description: Value or content of interpretation.
# Pointers to the elements an interpretation is attached to; multivalued.
inst:
  slot_uri: tei:inst
  range: uriorcurie
  multivalued: true
  description: >-
    Instances (element IDs) to which interpretation applies. TEI @inst
    attribute.
  annotations:
    tei_attribute: inst
# ---------------------------------------------------------------------------
# Phrase Attributes
# ---------------------------------------------------------------------------
# Syntactic category label; listed as a slot on both Clause and Phrase.
phrase_type:
  range: PhraseTypeEnum
  description: Type of syntactic phrase.
# ---------------------------------------------------------------------------
# Morphological Feature Slots
# ---------------------------------------------------------------------------
# Structured morphology: a list of MorphFeature objects embedded in place
# (inlined as a list, not referenced by identifier).
morph_features:
  range: MorphFeature
  multivalued: true
  inlined: true
  inlined_as_list: true
  description: Collection of morphological features as key-value pairs.
# =============================================================================
# CLASSES
# =============================================================================
classes:
# ---------------------------------------------------------------------------
# BASE CLASSES
# ---------------------------------------------------------------------------
# Abstract parent of LinguisticSegment, Span, SpanGroup, Interp and
# InterpGroup; contributes the shared `ana` slot.
AnalysisElement:
  abstract: true
  class_uri: tei:AnalysisElement
  description: Abstract base class for all analysis module elements.
  slots:
    - ana
  annotations:
    tei_module: analysis
# Abstract parent of the segment classes (Sentence, Clause, Phrase, Word,
# Morpheme, Character, Punctuation); adds the three segmentation slots on
# top of the inherited `ana`.
LinguisticSegment:
  abstract: true
  is_a: AnalysisElement
  class_uri: nif:String
  description: >-
    Abstract base class for linguistic segmentation elements. Maps to NIF
    String for interoperability.
  slots:
    - segment_type
    - segment_function
    - real
  annotations:
    tei_module: analysis
    nif_mapping: nif:String
# ---------------------------------------------------------------------------
# SENTENCE AND CLAUSE ELEMENTS
# ---------------------------------------------------------------------------
# TEI <s>: sentence-level segment. `segment_type` is inherited from
# LinguisticSegment, so it is refined via slot_usage only and is not
# re-listed under `slots`.
Sentence:
  is_a: LinguisticSegment
  class_uri: tei:s
  description: >-
    Sentence or s-unit - a grammatical sentence or equivalent unit.
    TEI <s> element. Maps to NIF Sentence.
  slot_usage:
    segment_type:
      # The slot's range is an enum, so the default uses LinkML's
      # EnumName(PermissibleValue) form rather than string(...).
      ifabsent: 'LinguisticUnitTypeEnum(sentence)'
  annotations:
    tei_element: s
    tei_module: analysis
    glam_hypernym: TXT.ANA.SEN
    nif_mapping: nif:Sentence
# TEI <cl>: clause-level segment; adds `phrase_type` to the slots inherited
# from LinguisticSegment.
Clause:
  is_a: LinguisticSegment
  class_uri: tei:cl
  description: >-
    Clause - a syntactic unit including a finite or non-finite verb.
    TEI <cl> element.
  slots:
    - phrase_type
  slot_usage:
    segment_type:
      # The slot's range is an enum, so the default uses LinkML's
      # EnumName(PermissibleValue) form rather than string(...).
      ifabsent: 'LinguisticUnitTypeEnum(clause)'
  annotations:
    tei_element: cl
    tei_module: analysis
    glam_hypernym: TXT.ANA.CLS
    olia_mapping: olia:Clause
# TEI <phr>: phrase-level segment; adds `phrase_type` to the slots inherited
# from LinguisticSegment.
Phrase:
  is_a: LinguisticSegment
  class_uri: tei:phr
  description: >-
    Phrase - a syntactic phrase (NP, VP, PP, etc.).
    TEI <phr> element.
  slots:
    - phrase_type
  slot_usage:
    segment_type:
      # The slot's range is an enum, so the default uses LinkML's
      # EnumName(PermissibleValue) form rather than string(...).
      ifabsent: 'LinguisticUnitTypeEnum(phrase)'
  annotations:
    tei_element: phr
    tei_module: analysis
    glam_hypernym: TXT.ANA.PHR
    olia_mapping: olia:Phrase
# ---------------------------------------------------------------------------
# TOKEN-LEVEL ELEMENTS
# ---------------------------------------------------------------------------
# TEI <w>: token-level segment. Adds the per-token linguistic slots (lemma,
# POS, morphology, joining, normalization) to those inherited from
# LinguisticSegment. Parent class of POSTaggedToken.
Word:
  is_a: LinguisticSegment
  class_uri: tei:w
  description: >-
    Word - a grammatical (not necessarily orthographic) word.
    TEI <w> element. Core element for POS tagging and lemmatization.
    Maps to NIF Word and OntoLex Form.
  slots:
    - lemma
    - pos
    - msd
    - join_direction
    - norm
    - orig
    - morph_features
  slot_usage:
    segment_type:
      # The slot's range is an enum, so the default uses LinkML's
      # EnumName(PermissibleValue) form rather than string(...).
      ifabsent: 'LinguisticUnitTypeEnum(word)'
  annotations:
    tei_element: w
    tei_module: analysis
    glam_hypernym: TXT.ANA.WRD
    nif_mapping: nif:Word
    ontolex_mapping: ontolex:Form
# TEI <m>: sub-word segment. Reuses lemma/pos/msd and adds a class-local
# `base_form` attribute (TEI @baseForm).
Morpheme:
  is_a: LinguisticSegment
  class_uri: tei:m
  description: >-
    Morpheme - a minimal meaningful unit within a word.
    TEI <m> element.
  slots:
    - lemma
    - pos
    - msd
  attributes:
    base_form:
      description: >-
        Base form of the morpheme.
      range: string
      annotations:
        tei_attribute: baseForm
  slot_usage:
    segment_type:
      # The slot's range is an enum, so the default uses LinkML's
      # EnumName(PermissibleValue) form rather than string(...).
      ifabsent: 'LinguisticUnitTypeEnum(morpheme)'
  annotations:
    tei_element: m
    tei_module: analysis
    glam_hypernym: TXT.ANA.MOR
    olia_mapping: olia:Morpheme
# TEI <c>: character-level segment; declares no slots beyond those inherited
# from LinguisticSegment.
Character:
  is_a: LinguisticSegment
  class_uri: tei:c
  description: >-
    Character - a single character (grapheme).
    TEI <c> element. Used for character-level annotation.
  slot_usage:
    segment_type:
      # The slot's range is an enum, so the default uses LinkML's
      # EnumName(PermissibleValue) form rather than string(...).
      ifabsent: 'LinguisticUnitTypeEnum(character)'
  annotations:
    tei_element: c
    tei_module: analysis
    glam_hypernym: TXT.ANA.CHR
    nif_mapping: nif:Character
# TEI <pc>: punctuation segment. Reuses `pos` and `join_direction` and adds
# three class-local attributes mirroring TEI @unit, @pre and @force.
Punctuation:
  is_a: LinguisticSegment
  class_uri: tei:pc
  description: >-
    Punctuation character or string regarded as a single punctuation mark.
    TEI <pc> element.
  slots:
    - pos
    - join_direction
  attributes:
    unit:
      description: >-
        Name for the kind of unit delimited by this punctuation mark.
      range: string
      annotations:
        tei_attribute: unit
    pre:
      # NOTE(review): TEI types @pre as a truth value. The range is kept as
      # string for backward compatibility - consider migrating to boolean.
      description: >-
        Whether this punctuation mark precedes ("true") or follows ("false")
        the unit it delimits.
      range: string
      annotations:
        tei_attribute: pre
    force:
      description: >-
        Extent to which this punctuation mark conventionally separates words
        or phrases (strong, weak, inter).
      range: string
      annotations:
        tei_attribute: force
  slot_usage:
    segment_type:
      # The slot's range is an enum, so the default uses LinkML's
      # EnumName(PermissibleValue) form rather than string(...).
      ifabsent: 'LinguisticUnitTypeEnum(punctuation)'
  annotations:
    tei_element: pc
    tei_module: analysis
    glam_hypernym: TXT.ANA.PNC
    olia_mapping: olia:Punctuation
# ---------------------------------------------------------------------------
# MORPHOLOGICAL FEATURES
# ---------------------------------------------------------------------------
# One name/value pair of morphology; this class is the range of the
# `morph_features` slot. Both parts are required.
MorphFeature:
  class_uri: olia:MorphologicalFeature
  description: >-
    A single morphological feature (key-value pair). Used to represent
    Universal Dependencies-style morphological annotation.
  attributes:
    feature_name:
      range: MorphFeatureTypeEnum
      required: true
      description: Name of morphological feature (e.g., Case, Number, Gender).
    feature_value:
      range: string
      required: true
      description: Value of morphological feature (e.g., Nom, Sing, Masc).
  annotations:
    glam_hypernym: TXT.ANA.MFT
# ---------------------------------------------------------------------------
# SPAN AND INTERPRETATION ELEMENTS
# ---------------------------------------------------------------------------
# TEI <span>: an interpretation attached to a stretch of text, located via
# from/to pointers and/or explicit targets.
Span:
  is_a: AnalysisElement
  class_uri: tei:span
  description: >-
    Span - associates interpretive annotation with a span of text. TEI <span>
    element. Maps to Web Annotation for standoff annotation.
  slots:
    - span_from
    - span_to
    - span_target
    - interp_type
    - interp_value
  annotations:
    tei_element: span
    tei_module: analysis
    glam_hypernym: TXT.ANA.SPN
    oa_mapping: oa:Annotation
# TEI <spanGrp>: container holding an inlined list of Span objects and a
# group-level `interp_type`.
SpanGroup:
  is_a: AnalysisElement
  class_uri: tei:spanGrp
  description: >-
    Span group - groups related span annotations. TEI <spanGrp> element.
  slots:
    - interp_type
  attributes:
    spans:
      range: Span
      multivalued: true
      inlined: true
      inlined_as_list: true
      description: Collection of span annotations in this group.
  annotations:
    tei_element: spanGrp
    tei_module: analysis
    glam_hypernym: TXT.ANA.SPG
# TEI <interp>: a single interpretation with a type, a value and pointers to
# the instances it applies to.
Interp:
  is_a: AnalysisElement
  class_uri: tei:interp
  description: >-
    Interpretation - summarizes an interpretive annotation. TEI <interp>
    element. Used for entity classification, thematic coding, etc.
  slots:
    - interp_type
    - interp_value
    - inst
  annotations:
    tei_element: interp
    tei_module: analysis
    glam_hypernym: TXT.ANA.INT
    crm_mapping: crm:E13_Attribute_Assignment
# TEI <interpGrp>: container holding an inlined list of Interp objects and a
# group-level `interp_type`.
InterpGroup:
  is_a: AnalysisElement
  class_uri: tei:interpGrp
  description: >-
    Interpretation group - collects related interpretation elements. TEI
    <interpGrp> element. Can represent a tagset or coding scheme.
  slots:
    - interp_type
  attributes:
    interps:
      range: Interp
      multivalued: true
      inlined: true
      inlined_as_list: true
      description: Collection of interpretation definitions.
  annotations:
    tei_element: interpGrp
    tei_module: analysis
    glam_hypernym: TXT.ANA.IGP
    skos_mapping: skos:ConceptScheme
# ---------------------------------------------------------------------------
# NLP OUTPUT CLASSES
# ---------------------------------------------------------------------------
# Top-level container for tokenizer output: an inlined list of
# TokenizedSentence objects plus the tokenizer name and a timestamp.
TokenizedText:
  class_uri: nif:Context
  description: >-
    Container for tokenized text output from NLP pipeline. Holds a sequence
    of sentences, each containing tokens.
  attributes:
    sentences:
      range: TokenizedSentence
      multivalued: true
      inlined: true
      inlined_as_list: true
      description: Tokenized sentences.
    tokenizer:
      range: string
      description: 'Name of tokenizer used (e.g., "spacy-en_core_web_sm").'
    tokenization_date:
      range: datetime
      description: Date/time of tokenization.
  annotations:
    glam_hypernym: TXT.ANA.TOK
# Sentence subclass for tokenizer output: adds an inlined list of Word
# tokens, the raw sentence text and begin/end character offsets.
TokenizedSentence:
  is_a: Sentence
  class_uri: nif:Sentence
  description: >-
    A tokenized sentence with word/token sequence. Extends TEI Sentence with
    NLP-specific fields.
  attributes:
    tokens:
      range: Word
      multivalued: true
      inlined: true
      inlined_as_list: true
      description: Sequence of tokens in sentence.
    sentence_text:
      range: string
      description: Original sentence text.
    begin_offset:
      range: integer
      description: Character offset of sentence start in source document.
    end_offset:
      range: integer
      description: Character offset of sentence end in source document.
  annotations:
    glam_hypernym: TXT.ANA.TSN
# Word subclass for tagger output: an enum-constrained `pos_tag` (separate
# from the inherited free-string `pos` slot), a confidence bounded to
# [0.0, 1.0], and an inlined list of alternative tags.
POSTaggedToken:
  is_a: Word
  class_uri: olia:Token
  description: >-
    A token with POS tag and optional morphological analysis. Specialized
    Word class for POS tagger output.
  attributes:
    pos_tag:
      range: POSTagEnum
      description: Part of speech tag.
    pos_confidence:
      range: float
      minimum_value: 0.0
      maximum_value: 1.0
      description: Confidence score for POS tag (0.0-1.0).
    pos_alternatives:
      range: POSAlternative
      multivalued: true
      inlined: true
      inlined_as_list: true
      description: Alternative POS tags with scores.
  annotations:
    glam_hypernym: TXT.ANA.POS
# One alternative tag with its score; this class is the range of
# `POSTaggedToken.pos_alternatives`. Neither attribute is marked required.
POSAlternative:
  description: An alternative POS tag with confidence score.
  attributes:
    tag:
      range: POSTagEnum
      description: Alternative POS tag.
    score:
      range: float
      minimum_value: 0.0
      maximum_value: 1.0
      description: Confidence score (0.0-1.0).
  annotations:
    glam_hypernym: TXT.ANA.PAL
# Parse result for one sentence: a pointer to the sentence, an inlined list
# of DependencyRelation edges, and the name of the parser.
DependencyParse:
  class_uri: nif:DependencyTree
  description: >-
    Dependency parse tree for a sentence. Represents syntactic dependencies
    between tokens.
  attributes:
    sentence_ref:
      range: uriorcurie
      description: Reference to source sentence.
    dependencies:
      range: DependencyRelation
      multivalued: true
      inlined: true
      inlined_as_list: true
      description: Dependency relations.
    parse_method:
      range: string
      description: 'Parser used (e.g., "spacy", "stanza", "stanford").'
  annotations:
    glam_hypernym: TXT.ANA.DEP
# One edge of a dependency parse. `head`, `dependent` and `relation` are
# required; `enhanced` is an optional flag.
DependencyRelation:
  description: A single dependency relation between two tokens.
  attributes:
    head:
      range: integer
      required: true
      description: Index of head token (0 = root).
    dependent:
      range: integer
      required: true
      description: Index of dependent token.
    relation:
      range: string
      required: true
      description: Dependency relation type (e.g., nsubj, dobj, amod).
    enhanced:
      range: boolean
      description: Whether this is an enhanced dependency.
  annotations:
    glam_hypernym: TXT.ANA.DRL
# ---------------------------------------------------------------------------
# LINGUISTIC ANNOTATION SCHEMES
# ---------------------------------------------------------------------------
# Describes a tagset/coding scheme. `scheme_id` is the identifier slot,
# `scheme_name` is required, and the tag definitions are inlined as a list.
AnnotationScheme:
  class_uri: skos:ConceptScheme
  description: >-
    Definition of a linguistic annotation scheme (tagset, coding scheme). Used
    to document the vocabulary for @ana, @pos, etc.
  attributes:
    scheme_id:
      range: uriorcurie
      identifier: true
      description: Unique identifier for scheme.
    scheme_name:
      range: string
      required: true
      description: Human-readable name.
    scheme_uri:
      range: uri
      description: URI of external scheme definition.
    scheme_type:
      range: string
      description: >-
        Type of scheme (pos_tagset, ner_tagset, dependency_scheme, etc.).
    tags:
      range: TagDefinition
      multivalued: true
      inlined: true
      inlined_as_list: true
      description: Tag definitions in scheme.
  annotations:
    glam_hypernym: DOC.MET.SCH
# One entry of an AnnotationScheme's `tags` list. Only `tag_id` is required;
# `tag_parent` is a plain string, not a typed reference to another tag.
TagDefinition:
  class_uri: skos:Concept
  description: Definition of a single tag in an annotation scheme.
  attributes:
    tag_id:
      range: string
      required: true
      description: Tag identifier/code.
    tag_label:
      range: string
      description: Human-readable label.
    tag_description:
      range: string
      description: Full description of tag meaning.
    tag_uri:
      range: uri
      description: URI linking to external definition (OLiA, etc.).
    tag_parent:
      range: string
      description: Parent tag in hierarchy (if applicable).
  annotations:
    glam_hypernym: DOC.MET.TAG