# =============================================================================
# GLAM-NER: TEI Analysis Module (LinkML)
# =============================================================================
# Module: modules/advanced/tei/analysis.yaml
# TEI Chapter: 18 - Simple Analytic Mechanisms
# TEI Module: analysis
# Version: 1.0.0
# =============================================================================
#
# This module defines LinkML classes for TEI P5 Chapter 18 elements used for
# simple analytic mechanisms including linguistic segmentation, POS tagging,
# morphological analysis, syntactic annotation, and interpretive markup.
#
# Key Element Groups:
# - Linguistic Segments: s, cl, phr, w, m, c, pc
# - Spans and Interpretations: span, spanGrp, interp, interpGrp
# - Analysis Attributes: @ana, @lemma, @pos, @msd, @join
#
# GLAM-NER Integration:
# - Linguistic annotation for NLP preprocessing
# - POS tagging and lemmatization output
# - Syntactic structure annotation
# - Interpretive annotation for entity classification
#
# Ontology Alignments:
# - NIF (NLP Interchange Format): Linguistic annotation
# - OntoLex-Lemon: Lexical entries and forms
# - OLiA (Ontologies of Linguistic Annotation): POS tags
# - CIDOC-CRM: Interpretive assertions
# - Web Annotation: Span annotations
#
# =============================================================================

id: https://w3id.org/glam/ner/tei/analysis
name: glam-ner-tei-analysis
title: "TEI Analysis Module for GLAM-NER"
version: "1.0.0"
license: https://creativecommons.org/licenses/by/4.0/
see_also:
  - https://tei-c.org/release/doc/tei-p5-doc/en/html/AI.html

prefixes:
  linkml: https://w3id.org/linkml/
  tei: http://www.tei-c.org/ns/1.0/
  glam: https://w3id.org/glam/ner/
  nif: http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#
  ontolex: http://www.w3.org/ns/lemon/ontolex#
  olia: http://purl.org/olia/olia.owl#
  crm: http://www.cidoc-crm.org/cidoc-crm/
  oa: http://www.w3.org/ns/oa#
  skos: http://www.w3.org/2004/02/skos/core#
  schema: http://schema.org/

default_prefix: glam
default_range: string

imports:
  - linkml:types

# =============================================================================
# ENUMERATIONS
# =============================================================================

enums:

  # ---------------------------------------------------------------------------
  # Linguistic Unit Type Enum
  # ---------------------------------------------------------------------------
  LinguisticUnitTypeEnum:
    description: >-
      Types of linguistic units for segmentation.
    permissible_values:
      sentence:
        description: Sentence unit (s-unit)
        meaning: nif:Sentence
      clause:
        description: Clause (syntactic unit)
        meaning: olia:Clause
      phrase:
        description: Phrase (noun phrase, verb phrase, etc.)
        meaning: olia:Phrase
      word:
        description: Word (grammatical word)
        meaning: nif:Word
        aliases:
          - token
      morpheme:
        description: Morpheme (minimal meaningful unit)
        meaning: olia:Morpheme
      character:
        description: Character (single grapheme)
        meaning: nif:Character
      punctuation:
        description: Punctuation mark
        meaning: olia:Punctuation

  # ---------------------------------------------------------------------------
  # Part of Speech Enum (Universal Dependencies tagset)
  # ---------------------------------------------------------------------------
  POSTagEnum:
    description: >-
      Part of speech tags based on Universal Dependencies (UD) tagset.
      Extended with additional categories for compatibility with other tagsets.
    permissible_values:
      # Open class words
      ADJ:
        description: Adjective
        meaning: olia:Adjective
      ADV:
        description: Adverb
        meaning: olia:Adverb
      INTJ:
        description: Interjection
        meaning: olia:Interjection
      NOUN:
        description: Noun
        meaning: olia:Noun
      PROPN:
        description: Proper noun
        meaning: olia:ProperNoun
      VERB:
        description: Verb
        meaning: olia:Verb
      # Closed class words
      ADP:
        description: Adposition (preposition/postposition)
        meaning: olia:Adposition
      AUX:
        description: Auxiliary verb
        meaning: olia:AuxiliaryVerb
      CCONJ:
        description: Coordinating conjunction
        meaning: olia:CoordinatingConjunction
      DET:
        description: Determiner
        meaning: olia:Determiner
      NUM:
        description: Numeral
        meaning: olia:Numeral
      PART:
        description: Particle
        meaning: olia:Particle
      PRON:
        description: Pronoun
        meaning: olia:Pronoun
      SCONJ:
        description: Subordinating conjunction
        meaning: olia:SubordinatingConjunction
      # Other
      PUNCT:
        description: Punctuation
        meaning: olia:Punctuation
      SYM:
        description: Symbol
        meaning: olia:Symbol
      X:
        description: Other/Unknown
        meaning: olia:Residual

  # ---------------------------------------------------------------------------
  # Phrase Type Enum
  # ---------------------------------------------------------------------------
  PhraseTypeEnum:
    description: >-
      Types of syntactic phrases.
    permissible_values:
      NP:
        description: Noun phrase
        meaning: olia:NounPhrase
      VP:
        description: Verb phrase
        meaning: olia:VerbPhrase
      PP:
        description: Prepositional phrase
        meaning: olia:PrepositionalPhrase
      AP:
        description: Adjective phrase
        meaning: olia:AdjectivePhrase
      ADVP:
        description: Adverb phrase
        meaning: olia:AdverbPhrase
      S:
        description: Sentence/clause
        meaning: olia:Sentence
      SBAR:
        description: Subordinate clause
        meaning: olia:SubordinateClause
      # The remaining phrase types carry no ontology mapping.
      CP:
        description: Complementizer phrase
      DP:
        description: Determiner phrase
      IP:
        description: Inflectional phrase

  # ---------------------------------------------------------------------------
  # Join Direction Enum
  # ---------------------------------------------------------------------------
  JoinDirectionEnum:
    description: >-
      Direction of token joining (for handling whitespace).
    permissible_values:
      # Quoted on purpose: a bare `no` is loaded as boolean false by
      # YAML 1.1 parsers. TEI @join lists "no" among its legal values.
      "no":
        description: Not joined (token is not adjacent to another token)
      left:
        description: Join to preceding token (no space before)
      right:
        description: Join to following token (no space after)
      both:
        description: Join to both adjacent tokens
      overlap:
        description: Token overlaps with neighbors

  # ---------------------------------------------------------------------------
  # Morphological Feature Type Enum
  # ---------------------------------------------------------------------------
  MorphFeatureTypeEnum:
    description: >-
      Types of morphological features (Universal Features).
    permissible_values:
      # Nominal features
      Case:
        description: Grammatical case
        meaning: olia:hasCase
      Definite:
        description: Definiteness
        meaning: olia:hasDefiniteness
      Gender:
        description: Grammatical gender
        meaning: olia:hasGender
      Number:
        description: Grammatical number
        meaning: olia:hasNumber
      # Verbal features
      Aspect:
        description: Grammatical aspect
        meaning: olia:hasAspect
      Mood:
        description: Grammatical mood
        meaning: olia:hasMood
      Person:
        description: Grammatical person
        meaning: olia:hasPerson
      Tense:
        description: Grammatical tense
        meaning: olia:hasTense
      VerbForm:
        description: Verb form (finite, infinitive, participle, etc.)
        meaning: olia:hasVerbForm
      Voice:
        description: Grammatical voice
        meaning: olia:hasVoice
      # Other features
      Degree:
        description: Degree of comparison
        meaning: olia:hasDegree
      Polarity:
        description: Polarity (affirmative/negative)
        meaning: olia:hasPolarity
      Poss:
        description: Possessive
      PronType:
        description: Pronoun type
      Reflex:
        description: Reflexive

  # ---------------------------------------------------------------------------
  # Interpretation Type Enum
  # ---------------------------------------------------------------------------
  InterpretationTypeEnum:
    description: >-
      Types of interpretive annotation.
    permissible_values:
      semantic:
        description: Semantic interpretation
      thematic:
        description: Thematic/topic interpretation
      stylistic:
        description: Stylistic interpretation
      pragmatic:
        description: Pragmatic interpretation
      discourse:
        description: Discourse-level interpretation
      rhetorical:
        description: Rhetorical interpretation
      cultural:
        description: Cultural/historical interpretation
      entity:
        description: Named entity interpretation
        meaning: oa:identifying

# =============================================================================
# SLOTS (Attributes)
# =============================================================================

slots:

  # ---------------------------------------------------------------------------
  # Global Analysis Attributes
  # ---------------------------------------------------------------------------
  ana:
    description: >-
      Analysis reference - points to interpretation element(s) for this segment.
      TEI @ana attribute, equivalent to NIF annotation reference.
    range: uriorcurie
    multivalued: true
    slot_uri: tei:ana
    annotations:
      tei_attribute: ana
      nif_mapping: nif:annotation

  lemma:
    description: >-
      Base form (lemma/dictionary form) of a word.
      TEI @lemma attribute.
    range: string
    slot_uri: tei:lemma
    annotations:
      tei_attribute: lemma
      ontolex_mapping: ontolex:canonicalForm

  pos:
    description: >-
      Part of speech tag.
      Can use any tagset but Universal Dependencies recommended for interoperability.
    range: string
    slot_uri: tei:pos
    annotations:
      tei_attribute: pos
      olia_mapping: olia:hasTag

  msd:
    description: >-
      Morphosyntactic description - detailed morphological features.
      Typically in CONLL-U or similar format (e.g., "Case=Nom|Number=Sing").
    range: string
    slot_uri: tei:msd
    annotations:
      tei_attribute: msd

  join_direction:
    description: >-
      How this token joins with adjacent tokens (whitespace handling).
      TEI @join attribute.
    range: JoinDirectionEnum
    slot_uri: tei:join
    annotations:
      tei_attribute: join

  # ---------------------------------------------------------------------------
  # Segmentation Attributes
  # ---------------------------------------------------------------------------
  segment_type:
    description: >-
      Type of linguistic segment.
    range: LinguisticUnitTypeEnum

  segment_function:
    description: >-
      Grammatical or functional role of segment.
    range: string

  real:
    description: >-
      Whether segment represents actual occurrence in source.
      False for editorial/analytical additions.
    range: boolean
    slot_uri: tei:real
    annotations:
      tei_attribute: real

  # ---------------------------------------------------------------------------
  # Word/Token Attributes
  # ---------------------------------------------------------------------------
  norm:
    description: >-
      Normalized/regularized form of word.
    range: string
    slot_uri: tei:norm
    annotations:
      tei_attribute: norm

  orig:
    description: >-
      Original form (before normalization).
    range: string
    slot_uri: tei:orig
    annotations:
      tei_attribute: orig

  # ---------------------------------------------------------------------------
  # Span Attributes
  # ---------------------------------------------------------------------------
  span_from:
    description: >-
      Start point of span annotation (TEI @from).
    range: uriorcurie
    slot_uri: tei:from
    annotations:
      tei_attribute: from

  span_to:
    description: >-
      End point of span annotation (TEI @to).
    range: uriorcurie
    slot_uri: tei:to
    annotations:
      tei_attribute: to

  span_target:
    description: >-
      Target element(s) of span annotation.
    range: uriorcurie
    multivalued: true

  # ---------------------------------------------------------------------------
  # Interpretation Attributes
  # ---------------------------------------------------------------------------
  interp_type:
    description: >-
      Type of interpretation.
    range: InterpretationTypeEnum

  interp_value:
    description: >-
      Value or content of interpretation.
    range: string

  inst:
    description: >-
      Instances (element IDs) to which interpretation applies.
      TEI @inst attribute.
    range: uriorcurie
    multivalued: true
    slot_uri: tei:inst
    annotations:
      tei_attribute: inst

  # ---------------------------------------------------------------------------
  # Phrase Attributes
  # ---------------------------------------------------------------------------
  phrase_type:
    description: >-
      Type of syntactic phrase.
    range: PhraseTypeEnum

  # ---------------------------------------------------------------------------
  # Morphological Feature Slots
  # ---------------------------------------------------------------------------
  morph_features:
    description: >-
      Collection of morphological features as key-value pairs.
    range: MorphFeature
    multivalued: true
    inlined: true
    inlined_as_list: true

# =============================================================================
# CLASSES
# =============================================================================

classes:

  # ---------------------------------------------------------------------------
  # BASE CLASSES
  # ---------------------------------------------------------------------------
  AnalysisElement:
    description: >-
      Abstract base class for all analysis module elements.
    abstract: true
    slots:
      - ana
    class_uri: tei:AnalysisElement
    annotations:
      tei_module: analysis

  LinguisticSegment:
    description: >-
      Abstract base class for linguistic segmentation elements.
      Maps to NIF String for interoperability.
    abstract: true
    is_a: AnalysisElement
    slots:
      - segment_type
      - segment_function
      - real
    class_uri: nif:String
    annotations:
      tei_module: analysis
      nif_mapping: nif:String

  # ---------------------------------------------------------------------------
  # SENTENCE AND CLAUSE ELEMENTS
  # ---------------------------------------------------------------------------
  Sentence:
    description: >-
      Sentence or s-unit - a grammatical sentence or equivalent unit.
      TEI <s> element. Maps to NIF Sentence.
    is_a: LinguisticSegment
    # segment_type is also listed on the parent LinguisticSegment; the
    # sibling subclasses below do not repeat it.
    slots:
      - segment_type
    slot_usage:
      segment_type:
        ifabsent: "string(sentence)"
    class_uri: tei:s
    annotations:
      tei_element: s
      tei_module: analysis
      glam_hypernym: TXT.ANA.SEN
      nif_mapping: nif:Sentence

  Clause:
    description: >-
      Clause - a syntactic unit including a finite or non-finite verb.
      TEI <cl> element.
    is_a: LinguisticSegment
    # NOTE(review): a clause reuses the phrase_type slot (PhraseTypeEnum);
    # presumably intentional, since that enum includes S and SBAR - confirm.
    slots:
      - phrase_type
    slot_usage:
      segment_type:
        ifabsent: "string(clause)"
    class_uri: tei:cl
    annotations:
      tei_element: cl
      tei_module: analysis
      glam_hypernym: TXT.ANA.CLS
      olia_mapping: olia:Clause

  Phrase:
    description: >-
      Phrase - a syntactic phrase (NP, VP, PP, etc.).
      TEI <phr> element.
    is_a: LinguisticSegment
    slots:
      - phrase_type
    slot_usage:
      segment_type:
        ifabsent: "string(phrase)"
    class_uri: tei:phr
    annotations:
      tei_element: phr
      tei_module: analysis
      glam_hypernym: TXT.ANA.PHR
      olia_mapping: olia:Phrase

  # ---------------------------------------------------------------------------
  # TOKEN-LEVEL ELEMENTS
  # ---------------------------------------------------------------------------
  Word:
    description: >-
      Word - a grammatical (not necessarily orthographic) word.
      TEI <w> element. Core element for POS tagging and lemmatization.
      Maps to NIF Word and OntoLex Form.
    is_a: LinguisticSegment
    slots:
      - lemma
      - pos
      - msd
      - join_direction
      - norm
      - orig
      - morph_features
    slot_usage:
      segment_type:
        ifabsent: "string(word)"
    class_uri: tei:w
    annotations:
      tei_element: w
      tei_module: analysis
      glam_hypernym: TXT.ANA.WRD
      nif_mapping: nif:Word
      ontolex_mapping: ontolex:Form

  Morpheme:
    description: >-
      Morpheme - a minimal meaningful unit within a word.
      TEI <m> element.
    is_a: LinguisticSegment
    slots:
      - lemma
      - pos
      - msd
    attributes:
      base_form:
        description: >-
          Base form of the morpheme.
        range: string
        annotations:
          tei_attribute: baseForm
    slot_usage:
      segment_type:
        ifabsent: "string(morpheme)"
    class_uri: tei:m
    annotations:
      tei_element: m
      tei_module: analysis
      glam_hypernym: TXT.ANA.MOR
      olia_mapping: olia:Morpheme

  Character:
    description: >-
      Character - a single character (grapheme).
      TEI <c> element. Used for character-level annotation.
    is_a: LinguisticSegment
    slot_usage:
      segment_type:
        ifabsent: "string(character)"
    class_uri: tei:c
    annotations:
      tei_element: c
      tei_module: analysis
      glam_hypernym: TXT.ANA.CHR
      nif_mapping: nif:Character

  Punctuation:
    description: >-
      Punctuation character or string regarded as a single punctuation mark.
      TEI <pc> element.
    is_a: LinguisticSegment
    slots:
      - pos
      - join_direction
    attributes:
      unit:
        description: >-
          Whether this is a unit-initial or unit-final punctuation.
        range: string
        annotations:
          tei_attribute: unit
      # NOTE(review): TEI P5 appears to define pc/@pre as a truth value
      # (mark precedes vs. follows the unit it delimits); the string range
      # and description are kept unchanged here - confirm against TEI P5.
      pre:
        description: >-
          Whitespace or punctuation appearing before.
        range: string
        annotations:
          tei_attribute: pre
      force:
        description: >-
          Strength of punctuation (strong, weak).
        range: string
        annotations:
          tei_attribute: force
    slot_usage:
      segment_type:
        ifabsent: "string(punctuation)"
    class_uri: tei:pc
    annotations:
      tei_element: pc
      tei_module: analysis
      glam_hypernym: TXT.ANA.PNC
      olia_mapping: olia:Punctuation

  # ---------------------------------------------------------------------------
  # MORPHOLOGICAL FEATURES
  # ---------------------------------------------------------------------------
  MorphFeature:
    description: >-
      A single morphological feature (key-value pair).
      Used to represent Universal Dependencies-style morphological annotation.
    attributes:
      feature_name:
        description: >-
          Name of morphological feature (e.g., Case, Number, Gender).
        range: MorphFeatureTypeEnum
        required: true
      feature_value:
        description: >-
          Value of morphological feature (e.g., Nom, Sing, Masc).
        range: string
        required: true
    class_uri: olia:MorphologicalFeature
    annotations:
      glam_hypernym: TXT.ANA.MFT

  # ---------------------------------------------------------------------------
  # SPAN AND INTERPRETATION ELEMENTS
  # ---------------------------------------------------------------------------
  Span:
    description: >-
      Span - associates interpretive annotation with a span of text.
      TEI <span> element. Maps to Web Annotation for standoff annotation.
    is_a: AnalysisElement
    slots:
      - span_from
      - span_to
      - span_target
      - interp_type
      - interp_value
    class_uri: tei:span
    annotations:
      tei_element: span
      tei_module: analysis
      glam_hypernym: TXT.ANA.SPN
      oa_mapping: oa:Annotation

  SpanGroup:
    description: >-
      Span group - groups related span annotations.
      TEI <spanGrp> element.
    is_a: AnalysisElement
    slots:
      - interp_type
    attributes:
      spans:
        description: >-
          Collection of span annotations in this group.
        range: Span
        multivalued: true
        inlined: true
        inlined_as_list: true
    class_uri: tei:spanGrp
    annotations:
      tei_element: spanGrp
      tei_module: analysis
      glam_hypernym: TXT.ANA.SPG

  Interp:
    description: >-
      Interpretation - summarizes an interpretive annotation.
      TEI <interp> element. Used for entity classification, thematic coding, etc.
    is_a: AnalysisElement
    slots:
      - interp_type
      - interp_value
      - inst
    class_uri: tei:interp
    annotations:
      tei_element: interp
      tei_module: analysis
      glam_hypernym: TXT.ANA.INT
      crm_mapping: crm:E13_Attribute_Assignment

  InterpGroup:
    description: >-
      Interpretation group - collects related interpretation elements.
      TEI <interpGrp> element. Can represent a tagset or coding scheme.
    is_a: AnalysisElement
    slots:
      - interp_type
    attributes:
      interps:
        description: >-
          Collection of interpretation definitions.
        range: Interp
        multivalued: true
        inlined: true
        inlined_as_list: true
    class_uri: tei:interpGrp
    annotations:
      tei_element: interpGrp
      tei_module: analysis
      glam_hypernym: TXT.ANA.IGP
      skos_mapping: skos:ConceptScheme

  # ---------------------------------------------------------------------------
  # NLP OUTPUT CLASSES
  # ---------------------------------------------------------------------------
  TokenizedText:
    description: >-
      Container for tokenized text output from NLP pipeline.
      Holds a sequence of sentences, each containing tokens.
    attributes:
      sentences:
        description: >-
          Tokenized sentences.
        range: TokenizedSentence
        multivalued: true
        inlined: true
        inlined_as_list: true
      tokenizer:
        description: >-
          Name of tokenizer used (e.g., "spacy-en_core_web_sm").
        range: string
      tokenization_date:
        description: >-
          Date/time of tokenization.
        range: datetime
    class_uri: nif:Context
    annotations:
      glam_hypernym: TXT.ANA.TOK

  TokenizedSentence:
    description: >-
      A tokenized sentence with word/token sequence.
      Extends TEI Sentence with NLP-specific fields.
    is_a: Sentence
    attributes:
      tokens:
        description: >-
          Sequence of tokens in sentence.
        range: Word
        multivalued: true
        inlined: true
        inlined_as_list: true
      sentence_text:
        description: >-
          Original sentence text.
        range: string
      begin_offset:
        description: >-
          Character offset of sentence start in source document.
        range: integer
      end_offset:
        description: >-
          Character offset of sentence end in source document.
        range: integer
    class_uri: nif:Sentence
    annotations:
      glam_hypernym: TXT.ANA.TSN

  POSTaggedToken:
    description: >-
      A token with POS tag and optional morphological analysis.
      Specialized Word class for POS tagger output.
    is_a: Word
    attributes:
      # Enum-constrained tag; the inherited `pos` slot on Word stays a
      # free string.
      pos_tag:
        description: >-
          Part of speech tag.
        range: POSTagEnum
      pos_confidence:
        description: >-
          Confidence score for POS tag (0.0-1.0).
        range: float
        minimum_value: 0.0
        maximum_value: 1.0
      pos_alternatives:
        description: >-
          Alternative POS tags with scores.
        range: POSAlternative
        multivalued: true
        inlined: true
        inlined_as_list: true
    class_uri: olia:Token
    annotations:
      glam_hypernym: TXT.ANA.POS

  POSAlternative:
    description: >-
      An alternative POS tag with confidence score.
    attributes:
      tag:
        description: >-
          Alternative POS tag.
        range: POSTagEnum
      score:
        description: >-
          Confidence score (0.0-1.0).
        range: float
        minimum_value: 0.0
        maximum_value: 1.0
    annotations:
      glam_hypernym: TXT.ANA.PAL

  DependencyParse:
    description: >-
      Dependency parse tree for a sentence.
      Represents syntactic dependencies between tokens.
    attributes:
      sentence_ref:
        description: >-
          Reference to source sentence.
        range: uriorcurie
      dependencies:
        description: >-
          Dependency relations.
        range: DependencyRelation
        multivalued: true
        inlined: true
        inlined_as_list: true
      parse_method:
        description: >-
          Parser used (e.g., "spacy", "stanza", "stanford").
        range: string
    class_uri: nif:DependencyTree
    annotations:
      glam_hypernym: TXT.ANA.DEP

  DependencyRelation:
    description: >-
      A single dependency relation between two tokens.
    attributes:
      head:
        description: >-
          Index of head token (0 = root).
        range: integer
        required: true
      dependent:
        description: >-
          Index of dependent token.
        range: integer
        required: true
      relation:
        description: >-
          Dependency relation type (e.g., nsubj, dobj, amod).
        range: string
        required: true
      enhanced:
        description: >-
          Whether this is an enhanced dependency.
        range: boolean
    annotations:
      glam_hypernym: TXT.ANA.DRL

  # ---------------------------------------------------------------------------
  # LINGUISTIC ANNOTATION SCHEMES
  # ---------------------------------------------------------------------------
  AnnotationScheme:
    description: >-
      Definition of a linguistic annotation scheme (tagset, coding scheme).
      Used to document the vocabulary for @ana, @pos, etc.
    attributes:
      scheme_id:
        description: >-
          Unique identifier for scheme.
        range: uriorcurie
        identifier: true
      scheme_name:
        description: >-
          Human-readable name.
        range: string
        required: true
      scheme_uri:
        description: >-
          URI of external scheme definition.
        range: uri
      scheme_type:
        description: >-
          Type of scheme (pos_tagset, ner_tagset, dependency_scheme, etc.).
        range: string
      tags:
        description: >-
          Tag definitions in scheme.
        range: TagDefinition
        multivalued: true
        inlined: true
        inlined_as_list: true
    class_uri: skos:ConceptScheme
    annotations:
      glam_hypernym: DOC.MET.SCH

  TagDefinition:
    description: >-
      Definition of a single tag in an annotation scheme.
    attributes:
      tag_id:
        description: >-
          Tag identifier/code.
        range: string
        required: true
      tag_label:
        description: >-
          Human-readable label.
        range: string
      tag_description:
        description: >-
          Full description of tag meaning.
        range: string
      tag_uri:
        description: >-
          URI linking to external definition (OLiA, etc.).
        range: uri
      tag_parent:
        description: >-
          Parent tag in hierarchy (if applicable).
        range: string
    class_uri: skos:Concept
    annotations:
      glam_hypernym: DOC.MET.TAG