# glam/data/entity_annotation/modules/advanced/tei/index.yaml
# 2025-12-05 15:30:23 +01:00
#
# 1792 lines
# 66 KiB
# YAML

# =============================================================================
# GLAM-NER: TEI P5 LINKML MODULES INDEX
# =============================================================================
# Module: modules/advanced/tei/index.yaml
# Purpose: Index of TEI P5 chapter modules converted to LinkML
# Source: TEI P5 4.10.2 (September 2025)
# =============================================================================
#
# This directory contains LinkML schema representations of TEI P5 element
# definitions, enabling interoperability between TEI XML annotations and
# GLAM-NER entity extraction pipelines.
#
# Each module corresponds to a TEI P5 chapter and provides:
# - LinkML class definitions for TEI elements
# - Mappings to CIDOC-CRM, Schema.org, FOAF, and other ontologies
# - GLAM-NER hypernym annotations
# - Authority file integration (VIAF, Wikidata, GeoNames, etc.)
#
# =============================================================================
# Identity of this index: URI, machine-readable name, display title, and a
# quoted version string (quoting keeps it a string rather than a number).
id: https://w3id.org/glam/ner/tei
name: glam-ner-tei-modules
title: TEI P5 LinkML Modules for GLAM-NER
version: "1.16.0"
# Upstream TEI P5 release the modules were converted from.
# NOTE(review): indentation is missing in this copy. The keys from
# `specification` down to `commit` presumably nest under `tei_source`;
# whether `namespace` and `total_elements` do as well cannot be told from
# here - confirm against the original before re-indenting. As written,
# `version` appears twice at the same level, which most parsers resolve
# silently as last-value-wins.
tei_source:
specification: "TEI P5 Guidelines"
version: "4.10.2"
release_date: "2025-09-04"
url: "https://tei-c.org/release/doc/tei-p5-doc/en/html/"
commit: "bcfa98f42"
namespace: "http://www.tei-c.org/ns/1.0"
total_elements: 588
# =============================================================================
# AVAILABLE MODULES
# =============================================================================
modules:
# ---------------------------------------------------------------------------
# TEI P5 chapter 2: TEI Header -> header.yaml
# ---------------------------------------------------------------------------
  header:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'header.yaml'
    tei_chapter: 2
    tei_module_name: 'header'
    title: 'TEI Header'
    description: |
      Metadata elements for describing TEI documents including file description,
      encoding description, profile description, and revision history. Essential
      for document provenance, rights management, and NER pipeline metadata.
    element_count: 85
    line_count: 3678
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - teiHeader: 'Header container (required)'
      - fileDesc: 'File description (required)'
      - titleStmt: 'Title statement with authors/editors'
      - publicationStmt: 'Publication/distribution info'
      - sourceDesc: 'Source description (provenance)'
      - encodingDesc: 'Encoding practices'
      - profileDesc: 'Non-bibliographic aspects'
      - revisionDesc: 'Revision history'
      - correspDesc: 'Correspondence description'
      - availability: 'Access/licence information'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      DOC.MET: ['teiHeader', 'fileDesc', 'encodingDesc', 'profileDesc', 'revisionDesc']
      AGT.PER: ['author', 'editor', 'principal', 'person']
      GRP.ORG: ['publisher', 'distributor', 'authority', 'sponsor', 'funder', 'orgName']
      GEO: ['pubPlace', 'place', 'setting']
      GEO.ADR: ['address', 'addrLine']
      TMP.DAT: ['date']
      TMP.EVT: ['change', 'correspAction', 'creation']
      WRK: ['title', 'bibl', 'biblStruct', 'biblFull']
      REL: ['relatedItem', 'correspContext', 'ref', 'ptr']
      QTY: ['measure', 'extent', 'unitDef']
      APP.LBL: ['idno']
      ROL: ['resp']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      dcterms:
        - TEIHeader: 'dcterms:BibliographicResource'
        - TitleStmt: 'dcterms:title'
        - Author: 'dcterms:creator'
        - Publisher: 'dcterms:publisher'
        - SourceDesc: 'dcterms:source'
        - Availability: 'dcterms:accessRights'
        - Licence: 'dcterms:license'
        - Abstract: 'dcterms:abstract'
      schema:
        - TEIHeader: 'schema:CreativeWork'
        - Author: 'schema:author'
        - Publisher: 'schema:publisher'
        - Keywords: 'schema:keywords'
        - Application: 'schema:SoftwareApplication'
      prov:
        - Change: 'prov:Activity'
        - RevisionDesc: 'prov:Activity'
        - CorrespAction: 'prov:Activity'
        - Creation: 'prov:wasGeneratedBy'
        - RespStmt: 'prov:wasAttributedTo'
        - Application: 'prov:SoftwareAgent'
      bibo:
        - Editor: 'bibo:editor'
        - Edition: 'bibo:edition'
        - BiblStruct: 'bibo:Document'
        - Series: 'bibo:Series'
      foaf:
        - Person: 'foaf:Person'
        - PersonGrp: 'foaf:Group'
        - OrgName: 'foaf:Organization'
      skos:
        - Taxonomy: 'skos:ConceptScheme'
        - Category: 'skos:Concept'
        - Term: 'skos:Concept'
      premis:
        - EncodingDesc: 'premis:PreservationMetadata'
        - Change: 'premis:Event'
# ---------------------------------------------------------------------------
# TEI P5 chapter 3: Core Elements -> core.yaml
# ---------------------------------------------------------------------------
  core:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'core.yaml'
    tei_chapter: 3
    tei_module_name: 'core'
    title: 'Core Elements'
    description: |
      Elements available in all TEI documents regardless of customization.
      Includes date, time, name, title, measure, and bibliographic elements.
    element_count: 45
    line_count: 1575
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - date: 'Date expressions (TimeML aligned)'
      - time: 'Time expressions'
      - name: 'Generic proper noun'
      - title: 'Titles of works'
      - measure: 'Quantities with units (QUDT aligned)'
      - num: 'Numeric values'
      - address: 'Postal addresses'
      - rs: 'Referencing strings'
      - bibl: 'Bibliographic citations'
      - quote: 'Quotations'
      - ptr: 'Pointers'
      - ref: 'References'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      TMP.DAT: ['date']
      TMP.TIM: ['time']
      APP: ['name', 'rs']
      WRK: ['title', 'bibl']
      QTY: ['measure', 'num']
      GEO.ADR: ['address']
# ---------------------------------------------------------------------------
# TEI P5 chapter 14: Names, Dates, People, Places -> namesdates.yaml
# ---------------------------------------------------------------------------
  namesdates:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'namesdates.yaml'
    tei_chapter: 14
    tei_module_name: 'namesdates'
    title: 'Names, Dates, People, Places'
    description: |
      TEI elements for encoding names and information about named entities
      including persons, places, organizations, and events. This is the
      primary module for Named Entity Recognition interoperability.
    element_count: 58
    line_count: 1962
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - persName: 'Personal names (with forename, surname, addName components)'
      - placeName: 'Place names (with settlement, region, country, geogName)'
      - orgName: 'Organization names'
      - objectName: 'Named objects (artworks, artifacts)'
      - eventName: 'Named events'
      - person: 'Person descriptions (standOff)'
      - place: 'Place descriptions (standOff)'
      - org: 'Organization descriptions (standOff)'
      - event: 'Event descriptions (standOff)'
      - relation: 'Relationships between entities'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      AGT.PER: ['persName', 'person', 'persona']
      GRP.ORG: ['orgName', 'org']
      GRP: ['personGrp']
      GEO: ['placeName', 'place', 'location']
      GEO.SET: ['settlement']
      GEO.REG: ['region']
      GEO.CTY: ['country']
      GEO.BLC: ['bloc']
      GEO.DST: ['district']
      GEO.FEA: ['geogName', 'geogFeat']
      TMP.EVT: ['event', 'eventName', 'birth', 'death']
      TMP.PER: ['floruit']
      THG.OBJ: ['objectName', 'object']
      ROL: ['roleName']
      ROL.OCC: ['occupation']
      ROL.AFF: ['affiliation']
      ROL.EDU: ['education']
      APP.REL: ['faith']
      REL: ['relation', 'listRelation']
# ---------------------------------------------------------------------------
# TEI P5 chapter 11: Manuscript Description -> msdescription.yaml
# ---------------------------------------------------------------------------
  msdescription:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'msdescription.yaml'
    tei_chapter: 11
    tei_module_name: 'msdescription'
    title: 'Manuscript Description'
    description: |
      Elements for describing manuscripts and similar primary sources including
      physical description, history, and intellectual content. Essential for
      heritage institution cataloging and archival finding aids.
    element_count: 58
    line_count: 1923
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - msDesc: 'Manuscript description container'
      - msIdentifier: 'Identifier (repository, shelfmark, settlement)'
      - physDesc: 'Physical description (support, extent, dimensions)'
      - objectDesc: 'Object form and layout'
      - handDesc: 'Handwriting description'
      - decoDesc: 'Decoration description'
      - bindingDesc: 'Binding description'
      - history: 'Origin, provenance, acquisition'
      - origin: 'Place and date of origin'
      - provenance: 'Provenance events'
      - msContents: 'Intellectual content'
      - msItem: 'Individual texts/works'
      - msPart: 'Composite manuscript parts'
    # GLAM-NER hypernym code -> names filed under that code. `provenance`
    # is listed under several codes, with a parenthesised qualifier.
    glam_hypernym_mappings:
      THG.OBJ: ['msDesc', 'msPart', 'seal', 'decoNote']
      AGT.PER: ['handNote', 'provenance (personal)', 'author']
      GRP.ORG: ['repository', 'institution', 'provenance (institutional)']
      GEO: ['origPlace', 'settlement', 'provenance (location)']
      TMP.EVT: ['provenance', 'acquisition', 'custEvent']
      TMP.PER: ['origDate']
      WRK: ['msItem', 'title']
      APP.LBL: ['msIdentifier', 'shelfmark']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      cidoc_crm:
        - msDesc: 'crm:E22_Human-Made_Object'
        - provenance: 'crm:E10_Transfer_of_Custody'
        - acquisition: 'crm:E8_Acquisition'
        - origin: 'crm:E12_Production'
      rico:
        - msDesc: 'rico:RecordResource'
        - repository: 'rico:Agent'
      schema:
        - msDesc: 'schema:ArchiveComponent'
        - msItem: 'schema:CreativeWork'
      frbroo:
        - msDesc: 'frbroo:F4_Manifestation_Singleton'
# ---------------------------------------------------------------------------
# TEI P5 chapter 17: Linking, Segmentation, Alignment -> linking.yaml
# ---------------------------------------------------------------------------
  linking:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'linking.yaml'
    tei_chapter: 17
    tei_module_name: 'linking'
    title: 'Linking, Segmentation, Alignment'
    description: |
      Elements for creating links, segments, and alignments within and
      between documents. Essential for standoff annotation and NER pipelines.
      Aligned with W3C Web Annotation Data Model.
    element_count: 20
    line_count: 1393
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - standOff: 'Standoff annotation container'
      - annotation: 'W3C Web Annotation compatible annotation'
      - listAnnotation: 'Annotation collection'
      - annotationBlock: 'Grouped annotations'
      - link: 'Hyperlink between elements'
      - linkGrp: 'Link group'
      - seg: 'Arbitrary text segment'
      - anchor: 'Anchor point for linking'
      - join: 'Fragment aggregation'
      - alt: 'Alternative readings'
      - timeline: 'Temporal alignment'
      - when: 'Time point'
      - certainty: 'Certainty annotation'
    # Linking module provides infrastructure, not entities.
    # Entity types are assigned via contained annotations.
    # Written as an explicit empty mapping: a bare `key:` loads as null,
    # whereas every other module exposes a mapping under this key.
    glam_hypernym_mappings: {}
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      web_annotation:
        - annotation: 'oa:Annotation'
        - listAnnotation: 'oa:AnnotationCollection'
        - selector: 'oa:Selector'
        - motivation: 'oa:Motivation'
      nif:
        - seg: 'nif:String'
        - EntityMention: 'nif:String'
        - begin_index: 'nif:beginIndex'
        - end_index: 'nif:endIndex'
      cidoc_crm:
        - annotation: 'crm:E13_Attribute_Assignment'
    # Free-text guidance for NER pipelines that build on this module.
    # NOTE(review): nesting under `linking` is inferred from the text below,
    # and any relative indentation of the bullet lines inside the block
    # scalars was lost in this copy - confirm against the original.
    ner_integration:
      description: |
        The linking module provides core infrastructure for NER annotation:
        - TEIAnnotation: Primary class for NER output
        - NERAnnotation: Specialized subclass with confidence scores
        - EntityMention: NIF-aligned entity span class
        - Selector: Target identification (text quote, offset, XPath)
        - TEISeg: Inline entity span markup
      output_format: |
        NER pipelines should output to <standOff><listAnnotation> with:
        - @motivation="identifying" (oa:identifying)
        - Selector with exact text match and offsets
        - Entity type in body (glam_hypernym)
        - Authority URI in @ref (VIAF, Wikidata, etc.)
# =============================================================================
# AVAILABLE MODULES (continued)
# =============================================================================
# ---------------------------------------------------------------------------
# TEI P5 chapter 8: Transcriptions of Speech -> spoken.yaml
# ---------------------------------------------------------------------------
  spoken:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'spoken.yaml'
    tei_chapter: 8
    tei_module_name: 'spoken'
    title: 'Transcriptions of Speech'
    description: |
      Elements for transcribing spoken language including utterances, pauses,
      vocal sounds, kinesic events, paralinguistic features, and temporal
      alignment. Essential for oral history, sociolinguistics, discourse
      analysis, and heritage institution audio/video collections.
    element_count: 18
    line_count: 1153
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - u: 'Utterance (primary speech unit)'
      - pause: 'Pause in speech'
      - vocal: 'Vocal sounds (laughter, cough, etc.)'
      - incident: 'Non-vocal events'
      - kinesic: 'Body language and gestures'
      - shift: 'Paralinguistic feature changes'
      - writing: 'Written text in spoken context'
      - unclear: 'Inaudible/unclear speech'
      - timeline: 'Temporal alignment structure'
      - when: 'Time synchronization point'
      - recording: 'Recording metadata'
      - broadcast: 'Broadcast source information'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      TXT.SPK: ['u', 'seg', 'unclear']
      TXT.SPK.VOC: ['vocal']
      TXT.SPK.KIN: ['kinesic']
      TXT.SPK.PAR: ['shift']
      TXT.WRT: ['writing']
      TMP.DUR: ['pause']
      TMP.EVT: ['incident', 'broadcast']
      TMP.TLN: ['timeline']
      TMP.PNT: ['when']
      THG.REC: ['recording']
      THG.EQP: ['equipment']
      DOC.MET: ['recordingStmt', 'transcriptionDesc']
      WRK: ['scriptStmt']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      cidoc_crm:
        - Utterance: 'crm:E33_Linguistic_Object'
        - Vocal: 'crm:E7_Activity'
        - Kinesic: 'crm:E7_Activity'
        - Incident: 'crm:E5_Event'
        - Pause: 'crm:E52_Time-Span'
        - Timeline: 'crm:E52_Time-Span'
        - When: 'crm:E61_Time_Primitive'
        - Shift: 'crm:E13_Attribute_Assignment'
        - Recording: 'crm:E65_Creation'
      schema:
        - Utterance: 'schema:SpeakAction'
        - Recording: 'schema:AudioObject'
        - Broadcast: 'schema:BroadcastEvent'
      time:
        - When: 'time:Instant'
# ---------------------------------------------------------------------------
# TEI P5 chapter 12: Representation of Primary Sources -> transcr.yaml
# ---------------------------------------------------------------------------
  transcr:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'transcr.yaml'
    tei_chapter: 12
    tei_module_name: 'transcr'
    title: 'Representation of Primary Sources'
    description: |
      Elements for transcribing primary source materials including facsimiles,
      surfaces, zones, damage, additions, deletions, substitutions, abbreviations,
      and editorial interventions. Essential for manuscript studies, diplomatic
      editions, genetic criticism, and archival work.
    element_count: 38
    line_count: 1746
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - facsimile: 'Digital facsimile container (images)'
      - sourceDoc: 'Source document with embedded transcription'
      - surface: 'Written surface (recto/verso/patch)'
      - zone: 'Area of interest on surface'
      - add: 'Text addition'
      - del: 'Text deletion'
      - subst: 'Substitution (deletion + addition)'
      - damage: 'Damaged text area'
      - gap: 'Gap/omission in transcription'
      - unclear: 'Illegible/uncertain text'
      - supplied: 'Editorially supplied text'
      - abbr: 'Abbreviation'
      - expan: 'Expansion of abbreviation'
      - metamark: 'Authorial/scribal markup'
      - handShift: 'Change of scribal hand'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      DOC.FAC: ['facsimile']
      DOC.SRC: ['sourceDoc']
      DOC.DMG: ['damage', 'damageSpan']
      DOC.MET: ['handNotes']
      THG.SRF: ['surface', 'surfaceGrp']
      THG.ZON: ['zone']
      THG.IMG: ['graphic']
      THG.PTH: ['path']
      TXT.LIN: ['line']
      TXT.ADD: ['add', 'addSpan']
      TXT.DEL: ['del', 'delSpan']
      TXT.SUB: ['subst', 'substJoin']
      TXT.GAP: ['gap']
      TXT.UNC: ['unclear']
      TXT.SUP: ['supplied']
      TXT.SUR: ['surplus']
      TXT.SEC: ['secl']
      TXT.ABR: ['abbr', 'am']
      TXT.EXP: ['expan', 'ex']
      TXT.MOD: ['mod']
      TXT.MRK: ['metamark']
      TXT.HND: ['handShift']
      TXT.RST: ['restore']
      TXT.RTR: ['retrace']
      TXT.UND: ['undo']
      TXT.RDO: ['redo']
      TXT.TRN: ['listTranspose', 'transpose']
      TXT.FW: ['fw']
      TXT.SPC: ['space']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      cidoc_crm:
        - SourceDoc: 'crm:E22_Human-Made_Object'
        - Surface: 'crm:E25_Human-Made_Feature'
        - Add: 'crm:E13_Attribute_Assignment'
        - Del: 'crm:E79_Part_Removal'
        - Damage: 'crm:E14_Condition_Assessment'
        - Metamark: 'crm:E37_Mark'
        - Line: 'crm:E33_Linguistic_Object'
      iiif:
        - Facsimile: 'iiif:Manifest'
        - Surface: 'iiif:Canvas'
        - Graphic: 'iiif:Image'
      schema:
        - Facsimile: 'schema:ImageObject'
        - SourceDoc: 'schema:ArchiveComponent'
        - Graphic: 'schema:ImageObject'
      web_annotation:
        - Zone: 'oa:FragmentSelector'
# ---------------------------------------------------------------------------
# TEI P5 chapter 13: Critical Apparatus -> textcrit.yaml
# ---------------------------------------------------------------------------
  textcrit:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'textcrit.yaml'
    tei_chapter: 13
    tei_module_name: 'textcrit'
    title: 'Critical Apparatus'
    description: |
      Elements for encoding textual variants and critical apparatus.
      Essential for scholarly editions, manuscript collation, and
      philological research. Supports both inline and external apparatus.
    element_count: 16
    line_count: 720
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - app: 'Apparatus entry (container for variants)'
      - lem: 'Lemma (base text reading)'
      - rdg: 'Reading (variant reading)'
      - rdgGrp: 'Reading group (related variants)'
      - witness: 'Witness description'
      - listWit: 'List of witnesses'
      - witDetail: 'Witness details'
      - witStart: 'Witness start marker'
      - witEnd: 'Witness end marker'
      - lacunaStart: 'Lacuna start marker'
      - lacunaEnd: 'Lacuna end marker'
      - listApp: 'List of apparatus entries'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      TXT.VAR: ['app', 'lem', 'rdg', 'rdgGrp']
      THG.OBJ: ['witness', 'listWit']
      DOC.MET: ['witDetail', 'witStart', 'witEnd', 'lacunaStart', 'lacunaEnd', 'listApp']
      WRK: ['bibl']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      cidoc_crm:
        - App: 'crm:E33_Linguistic_Object'
        - Lem: 'crm:E33_Linguistic_Object'
        - Rdg: 'crm:E33_Linguistic_Object'
        - Witness: 'crm:E22_Human-Made_Object'
      frbroo:
        - Witness: 'frbroo:F4_Manifestation_Singleton'
      dcterms:
        - Bibl: 'dcterms:bibliographicCitation'
      web_annotation:
        - Note: 'oa:Annotation'
# ---------------------------------------------------------------------------
# TEI P5 chapter 6: Verse -> verse.yaml
# ---------------------------------------------------------------------------
  verse:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'verse.yaml'
    tei_chapter: 6
    tei_module_name: 'verse'
    title: 'Verse'
    description: |
      Elements for encoding verse texts including verse lines, line groups
      (stanzas), rhyme, meter, caesura, and metrical analysis. Essential
      for poetry collections, literary archives, manuscript transcription,
      and text analysis.
    element_count: 7
    line_count: 689
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - l: 'Verse line'
      - lg: 'Line group (stanza)'
      - seg: 'Verse segment (metrical unit)'
      - rhyme: 'Rhyming portion'
      - caesura: 'Break point in verse line'
      - metDecl: 'Metrical notation declaration'
      - metSym: 'Metrical symbol definition'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      TXT.VRS.LIN: ['l']
      TXT.VRS.STZ: ['lg']
      TXT.VRS.SEG: ['seg']
      TXT.VRS.RHY: ['rhyme']
      TXT.VRS.CES: ['caesura']
      DOC.MET: ['metDecl']
      DOC.MET.SYM: ['metSym']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      cidoc_crm:
        - VerseLine: 'crm:E33_Linguistic_Object'
        - LineGroup: 'crm:E33_Linguistic_Object'
        - VerseSegment: 'crm:E33_Linguistic_Object'
        - Rhyme: 'crm:E33_Linguistic_Object'
      schema:
        - VerseLine: 'schema:CreativeWork'
# ---------------------------------------------------------------------------
# TEI P5 chapter 7: Performance Texts -> drama.yaml
# ---------------------------------------------------------------------------
  drama:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'drama.yaml'
    tei_chapter: 7
    tei_module_name: 'drama'
    title: 'Performance Texts'
    description: |
      Elements for encoding dramatic texts and performance scripts including
      cast lists, speeches, speakers, stage directions, and performance
      metadata. Essential for theater archives, opera collections, film
      scripts, and performance studies.
    element_count: 15
    line_count: 781
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - castList: 'Cast list container'
      - castGroup: 'Grouped cast members'
      - castItem: 'Individual cast entry'
      - role: 'Character/role name'
      - roleDesc: 'Role description'
      - actor: 'Actor/performer name'
      - sp: 'Speech container'
      - speaker: 'Speaker identification'
      - spGrp: 'Speech group'
      - stage: 'Stage direction'
      - move: 'Character movement'
      - set: 'Set/scenery description'
      - prologue: 'Prologue section'
      - epilogue: 'Epilogue section'
      - performance: 'Performance metadata'
    # GLAM-NER hypernym code -> names filed under that code. `castList` and
    # `performance` each appear under two codes.
    glam_hypernym_mappings:
      TXT.DRM.CST: ['castList', 'castGroup', 'castItem']
      TXT.DRM.ROL: ['role', 'roleDesc']
      TXT.DRM.SPK: ['sp', 'speaker', 'spGrp']
      TXT.DRM.STG: ['stage', 'move', 'set']
      TXT.DRM.FRM: ['prologue', 'epilogue']
      AGT.PER: ['actor']
      TMP.EVT: ['performance']
      WRK.DRM: ['castList', 'performance']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      cidoc_crm:
        - CastList: 'crm:E33_Linguistic_Object'
        - Role: 'crm:E33_Linguistic_Object'
        - Speech: 'crm:E33_Linguistic_Object'
        - StageDirection: 'crm:E33_Linguistic_Object'
        - Performance: 'crm:E7_Activity'
      schema:
        - CastList: 'schema:CreativeWork'
        - Actor: 'schema:Person'
        - Performance: 'schema:TheaterEvent'
      frbroo:
        - Performance: 'frbroo:F31_Performance'
        - Role: 'frbroo:F38_Character'
      foaf:
        - Actor: 'foaf:Person'
# ---------------------------------------------------------------------------
# TEI P5 chapter 10: Dictionaries -> dictionaries.yaml
# ---------------------------------------------------------------------------
  dictionaries:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'dictionaries.yaml'
    tei_chapter: 10
    tei_module_name: 'dictionaries'
    title: 'Dictionaries'
    description: |
      Elements for encoding dictionary entries including headwords, forms,
      pronunciation, grammatical information, senses, definitions, etymology,
      usage labels, and cross-references. Essential for lexicography, historical
      linguistics, terminology management, and heritage language collections.
    element_count: 35
    line_count: 1740
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - entry: 'Dictionary entry container'
      - form: 'Form information (orthography, pronunciation)'
      - orth: 'Orthographic (written) form'
      - pron: 'Pronunciation'
      - gramGrp: 'Grammatical information group'
      - pos: 'Part of speech'
      - sense: 'Sense/meaning definition'
      - def: 'Definition text'
      - cit: 'Citation/example'
      - etym: 'Etymology'
      - usg: 'Usage information'
      - xr: 'Cross-reference'
      - re: 'Related entry'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      TXT.LEX.ENT: ['entry', 'entryFree', 'superEntry']
      TXT.LEX.FRM: ['form']
      TXT.LEX.ORT: ['orth']
      TXT.LEX.PRN: ['pron']
      TXT.LEX.GRM: ['gramGrp']
      TXT.LEX.POS: ['pos']
      TXT.LEX.SNS: ['sense']
      TXT.LEX.DEF: ['def']
      TXT.LEX.CIT: ['cit']
      TXT.LEX.ETY: ['etym']
      TXT.LEX.USG: ['usg']
      TXT.LEX.XRF: ['xr']
      TXT.LEX.REL: ['re']
      APP.LNG: ['lang']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      ontolex:
        - DictEntry: 'ontolex:LexicalEntry'
        - DictForm: 'ontolex:Form'
        - DictSense: 'ontolex:LexicalSense'
        - DictOrth: 'ontolex:writtenRep'
        - DictPron: 'ontolex:phoneticRep'
      lexinfo:
        - DictGramGrp: 'lexinfo:MorphosyntacticProperty'
        - DictPOS: 'lexinfo:partOfSpeech'
        - DictGen: 'lexinfo:gender'
        - DictNumber: 'lexinfo:number'
        - DictCase: 'lexinfo:case'
        - DictEtym: 'lexinfo:etymology'
        - DictUsg: 'lexinfo:usageNote'
      skos:
        - DictDef: 'skos:definition'
        - DictXRef: 'skos:related'
      cidoc_crm:
        - DictMentioned: 'crm:E33_Linguistic_Object'
# ---------------------------------------------------------------------------
# TEI P5 chapter 5: Characters, Glyphs, and Writing Modes -> gaiji.yaml
# ---------------------------------------------------------------------------
  gaiji:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'gaiji.yaml'
    tei_chapter: 5
    tei_module_name: 'gaiji'
    title: 'Characters, Glyphs, and Writing Modes'
    description: |
      Elements for documenting non-standard characters, glyph variants, and
      writing modes. Essential for medieval manuscripts, CJK texts, historical
      documents, and texts using Unicode Private Use Area characters.
    element_count: 18
    line_count: 948
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - charDecl: 'Character declarations container'
      - char: 'Character definition'
      - glyph: 'Glyph variant definition'
      - g: 'Inline gaiji reference'
      - charName: 'Character name'
      - charProp: 'Character property'
      - unicodeName: 'Unicode property name'
      - localProp: 'Local property'
      - mapping: 'Character mapping'
      - figure: 'Glyph image container'
      - graphic: 'Glyph image'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      DOC.MET.CHR: ['charDecl']
      TXT.CHR: ['char']
      TXT.GLY: ['glyph']
      TXT.CHR.REF: ['g']
      TXT.CHR.NAM: ['charName']
      TXT.GLY.NAM: ['glyphName']
      TXT.CHR.PRP: ['charProp']
      TXT.CHR.UNI: ['unicodeName']
      TXT.CHR.LCL: ['localProp']
      TXT.CHR.MAP: ['mapping']
      THG.IMG: ['graphic', 'figure']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      cidoc_crm:
        - CharDecl: 'crm:E90_Symbolic_Object'
        - CharDef: 'crm:E90_Symbolic_Object'
        - GlyphDef: 'crm:E90_Symbolic_Object'
        - GaijiRef: 'crm:E90_Symbolic_Object'
        - CharProp: 'crm:E55_Type'
      skos:
        - CharName: 'skos:prefLabel'
        - GlyphName: 'skos:prefLabel'
        - CharMapping: 'skos:exactMatch'
      schema:
        - CharFigure: 'schema:ImageObject'
        - CharGraphic: 'schema:ImageObject'
# ---------------------------------------------------------------------------
# TEI P5 chapter 22: Certainty, Precision, and Responsibility -> certainty.yaml
# ---------------------------------------------------------------------------
  certainty:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'certainty.yaml'
    tei_chapter: 22
    tei_module_name: 'certainty'
    title: 'Certainty, Precision, and Responsibility'
    description: |
      Elements for encoding certainty, precision, and responsibility for
      annotations and assertions. Essential for NER confidence scoring,
      annotation provenance, and scholarly attribution of interpretations.
    element_count: 10
    line_count: 662
    status: 'complete'
    # Representative entries, each with a one-line gloss.
    key_elements:
      - certainty: 'Certainty annotation'
      - precision: 'Precision of values'
      - respons: 'Responsibility for content'
      - NERConfidenceScore: 'NER confidence metadata'
      - AnnotationProvenance: 'Annotation attribution'
      - ModelAssertionSet: 'ML model assertion bundle'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      DOC.MET.CRT: ['certainty']
      DOC.MET.PRC: ['precision']
      DOC.MET.RSP: ['respons']
      DOC.MET.CNF: ['NERConfidenceScore']
      DOC.MET.PRV: ['AnnotationProvenance']
      DOC.MET.ASR: ['ModelAssertionSet']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      prov:
        - Respons: 'prov:wasAttributedTo'
        - AnnotationProvenance: 'prov:Activity'
        - AnnotationAgent: 'prov:Agent'
      web_annotation:
        - Certainty: 'oa:Annotation'
        - Precision: 'oa:Annotation'
      cidoc_crm:
        - Certainty: 'crm:E13_Attribute_Assignment'
      mls:
        - NERConfidenceScore: 'mls:hasConfidence'
        - ModelAssertionSet: 'mls:Model'
# ---------------------------------------------------------------------------
# TEI P5 chapter 18: Simple Analytic Mechanisms -> analysis.yaml
# ---------------------------------------------------------------------------
  analysis:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'analysis.yaml'
    tei_chapter: 18
    tei_module_name: 'analysis'
    title: 'Simple Analytic Mechanisms'
    description: |
      Elements for linguistic annotation including segmentation, POS tagging,
      lemmatization, morphological analysis, syntactic parsing, and interpretive
      annotation. Essential for NLP preprocessing and corpus linguistics.
    element_count: 22
    line_count: 976
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - s: 'Sentence (s-unit)'
      - cl: 'Clause'
      - phr: 'Phrase (NP, VP, etc.)'
      - w: 'Word token'
      - m: 'Morpheme'
      - c: 'Character'
      - pc: 'Punctuation'
      - span: 'Span annotation'
      - spanGrp: 'Span group'
      - interp: 'Interpretation'
      - interpGrp: 'Interpretation group'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      TXT.ANA.SEN: ['s']
      TXT.ANA.CLS: ['cl']
      TXT.ANA.PHR: ['phr']
      TXT.ANA.WRD: ['w']
      TXT.ANA.MOR: ['m']
      TXT.ANA.CHR: ['c']
      TXT.ANA.PNC: ['pc']
      TXT.ANA.SPN: ['span']
      TXT.ANA.SPG: ['spanGrp']
      TXT.ANA.INT: ['interp']
      TXT.ANA.IGP: ['interpGrp']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      nif:
        - Sentence: 'nif:Sentence'
        - Word: 'nif:Word'
        - Character: 'nif:Character'
        - TokenizedText: 'nif:Context'
        - DependencyParse: 'nif:DependencyTree'
      olia:
        - Clause: 'olia:Clause'
        - Phrase: 'olia:Phrase'
        - Morpheme: 'olia:Morpheme'
        - Punctuation: 'olia:Punctuation'
        - POSTaggedToken: 'olia:Token'
      ontolex:
        - Word: 'ontolex:Form'
      cidoc_crm:
        - Interp: 'crm:E13_Attribute_Assignment'
      web_annotation:
        - Span: 'oa:Annotation'
      skos:
        - InterpGroup: 'skos:ConceptScheme'
        - AnnotationScheme: 'skos:ConceptScheme'
        - TagDefinition: 'skos:Concept'
# ---------------------------------------------------------------------------
# TEI P5 chapter 15: Tables, Formulae, Graphics, and Notated Music
# -> figures.yaml
# ---------------------------------------------------------------------------
  figures:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'figures.yaml'
    tei_chapter: 15
    tei_module_name: 'figures'
    title: 'Tables, Formulae, Graphics, and Notated Music'
    description: |
      Elements for encoding tables, mathematical formulae, graphic images,
      and notated music. Essential for scientific publications, illustrated
      manuscripts, heritage image collections, and digital humanities projects.
    element_count: 18
    line_count: 743
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - figure: 'Figure container'
      - graphic: 'Graphic image reference'
      - figDesc: 'Figure description (accessibility)'
      - table: 'Table container'
      - row: 'Table row'
      - cell: 'Table cell'
      - formula: 'Mathematical/chemical formula'
      - notatedMusic: 'Music notation'
      - media: 'Audio/video media'
      - binaryObject: 'Embedded binary data'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      THG.IMG.FIG: ['figure']
      THG.IMG.GRA: ['graphic']
      THG.IMG.DES: ['figDesc']
      THG.TBL: ['table']
      THG.TBL.ROW: ['row']
      THG.TBL.CEL: ['cell']
      THG.FRM: ['formula']
      THG.MUS: ['notatedMusic']
      THG.MED: ['media']
      THG.BIN: ['binaryObject']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      schema:
        - Figure: 'schema:ImageObject'
        - Table: 'schema:Table'
        - Cell: 'schema:TableCell'
        - Media: 'schema:MediaObject'
        - NotatedMusic: 'schema:MusicComposition'
      iiif:
        - Graphic: 'iiif:Image'
        - IIIFManifest: 'iiif:Manifest'
        - IIIFImageService: 'iiif:ImageService'
      cidoc_crm:
        - HeritageImageMetadata: 'crm:E38_Image'
      mathml:
        - Formula: 'mathml:math'
# ---------------------------------------------------------------------------
# TEI P5 chapter 4: Default Text Structure -> textstructure.yaml
# ---------------------------------------------------------------------------
  textstructure:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'textstructure.yaml'
    tei_chapter: 4
    tei_module_name: 'textstructure'
    title: 'Default Text Structure'
    description: |
      Elements for default document structure including divisions, front/back
      matter, title pages, and document organization. Essential for document
      parsing, structure navigation, and metadata extraction.
    element_count: 24
    line_count: 767
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - TEI: 'Root document element'
      - text: 'Text container'
      - body: 'Main body content'
      - front: 'Front matter'
      - back: 'Back matter'
      - div: 'Text division'
      - group: 'Text group (composite works)'
      - titlePage: 'Title page'
      - docTitle: 'Document title'
      - docAuthor: 'Document author'
      - docImprint: 'Publication imprint'
      - opener: 'Opening formula'
      - closer: 'Closing formula'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      DOC: ['TEI']
      DOC.TXT: ['text']
      DOC.TXT.BDY: ['body']
      DOC.TXT.FRT: ['front']
      DOC.TXT.BCK: ['back']
      DOC.TXT.DIV: ['div']
      DOC.TXT.GRP: ['group']
      DOC.TXT.TTP: ['titlePage']
      DOC.TXT.DTL: ['docTitle']
      AGT.PER.AUT: ['docAuthor']
      DOC.TXT.IMP: ['docImprint']
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      schema:
        - TEIDocument: 'schema:CreativeWork'
      bibo:
        - Division: 'bibo:DocumentPart'
      dcterms:
        - DocTitle: 'dcterms:title'
        - DocAuthor: 'dcterms:creator'
        - DocImprint: 'dcterms:publisher'
        - Dateline: 'dcterms:date'
      cidoc_crm:
        - Text: 'crm:E33_Linguistic_Object'
      foaf:
        - DocAuthor: 'foaf:Person'
# ---------------------------------------------------------------------------
# TEI P5 chapter 16: Language Corpora -> corpus.yaml
# ---------------------------------------------------------------------------
  corpus:
    # Module file and the TEI chapter/module it corresponds to.
    path: 'corpus.yaml'
    tei_chapter: 16
    tei_module_name: 'corpus'
    title: 'Language Corpora'
    description: |
      Elements for representing language corpora including corpus structure,
      text descriptions, participant information, and sampling methodology.
      Essential for NER training data management, corpus linguistics,
      and heritage document collections.
    element_count: 15
    line_count: 1050
    status: 'complete'
    # Representative elements, each with a one-line gloss.
    key_elements:
      - teiCorpus: 'Corpus container (collection of TEI documents)'
      - textDesc: 'Text description (genre, channel, purpose)'
      - channel: 'Communication channel (spoken, written, mixed)'
      - constitution: 'Text completeness information'
      - derivation: 'Original vs. derivative text status'
      - domain: 'Subject domain classification'
      - factuality: 'Factual vs. fictional status'
      - interaction: 'Interaction type (none, partial, active)'
      - preparedness: 'Degree of text preparation'
      - purpose: 'Communicative purpose'
      - particDesc: 'Participant description'
      - person: 'Participant person'
      - personGrp: 'Participant group'
      - settingDesc: 'Setting/context description'
      - setting: 'Individual setting'
    # GLAM-NER hypernym code -> names filed under that code.
    glam_hypernym_mappings:
      DOC.CRP: ['teiCorpus']
      DOC.MET.TXT: ['textDesc']
      DOC.MET.CHN: ['channel']
      DOC.MET.CON: ['constitution']
      DOC.MET.DRV: ['derivation']
      DOC.MET.DOM: ['domain']
      DOC.MET.FCT: ['factuality']
      DOC.MET.INT: ['interaction']
      DOC.MET.PRE: ['preparedness']
      DOC.MET.PUR: ['purpose']
      AGT: ['particDesc', 'person', 'personGrp']
      GEO: ['settingDesc', 'setting']
    # Additional named entries listed as GLAM-NER extensions, with glosses.
    glam_ner_extensions:
      - NERTrainingCorpus: 'Corpus for NER model training'
      - EntityTypeCount: 'Entity type statistics'
      - CorpusSplit: 'Train/dev/test splits'
      - HeritageDocumentCollection: 'Heritage document corpus'
    # Name -> external ontology term, grouped by vocabulary prefix.
    ontology_mappings:
      void:
        - TEICorpus: 'void:Dataset'
        - NERTrainingCorpus: 'void:Dataset'
      nif:
        - TEICorpus: 'nif:Context'
      oa:
        - CorpusAnnotation: 'oa:Annotation'
      dcat:
        - TEICorpus: 'dcat:Dataset'
        - CorpusSplit: 'dcat:Distribution'
      prov:
        - TextDesc: 'prov:Entity'
        - SamplingDeclaration: 'prov:Activity'
      foaf:
        - Participant: 'foaf:Person'
        - ParticipantGroup: 'foaf:Group'
      skos:
        - Domain: 'skos:Concept'
        - Purpose: 'skos:Concept'
# ---------------------------------------------------------------------------
# Chapter 20: Graphs, Networks, and Trees
# ---------------------------------------------------------------------------
nets:
  # Index entry for the TEI "nets" module (Graphs, Networks, and Trees).
  path: "nets.yaml"
  tei_chapter: 20
  tei_module_name: "nets"
  title: "Graphs, Networks, and Trees"
  description: |
    Elements for encoding graphs, networks, and tree structures including
    nodes, arcs, trees, forests, and stemmatology. Essential for entity
    relationship graphs, coreference chains, dependency parsing, and
    manuscript stemma visualization.
  element_count: 14
  line_count: 980
  status: "complete"
  # TEI element name -> one-line gloss.
  key_elements:
    - graph: "Graph container (directed, undirected)"
    - node: "Graph node"
    - arc: "Graph edge/arc"
    - tree: "Tree structure"
    - root: "Tree root node"
    - iNode: "Internal tree node"
    - leaf: "Tree leaf node"
    - label: "Node/arc label"
    - forest: "Collection of trees"
    - eTree: "Embedded tree"
    - triangle: "Collapsed subtree representation"
    - eLeaf: "Embedded leaf"
  # GLAM-NER hypernym code -> TEI elements filed under that code.
  glam_hypernym_mappings:
    DOC.GRF:
      - "graph"
      - "forest"
    DOC.GRF.NOD:
      - "node"
      - "root"
      - "iNode"
      - "leaf"
      - "eLeaf"
    DOC.GRF.ARC:
      - "arc"
    DOC.GRF.TRE:
      - "tree"
      - "eTree"
    DOC.GRF.LBL:
      - "label"
    DOC.GRF.TRI:
      - "triangle"
  # Classes added on top of the TEI elements for NER work.
  glam_ner_extensions:
    - EntityRelationGraph: "Entity relationship network"
    - EntityNode: "Entity as graph node"
    - RelationArc: "Typed relation between entities"
    - CoreferenceChain: "Coreference resolution chain"
    - EntityMention: "Entity mention in coreference"
    - DependencyTree: "Syntactic dependency tree"
    - TokenNode: "Token in dependency tree"
    - DependencyArc: "Dependency relation"
  # Classes added for manuscript stemma modelling.
  stemmatology_extensions:
    - Stemma: "Manuscript stemma (witness relationships)"
    - Witness: "Manuscript witness"
    - TransmissionRelation: "Transmission relationship type"
  # Ontology prefix -> list of (class name: ontology term) pairs.
  ontology_mappings:
    cidoc_crm:
      - Graph: "crm:E89_Propositional_Object"
      - Node: "crm:E1_CRM_Entity"
      - Arc: "crm:E13_Attribute_Assignment"
      - EntityRelationGraph: "crm:E89_Propositional_Object"
      - Stemma: "crm:E89_Propositional_Object"
      - Witness: "crm:E22_Human-Made_Object"
    nif:
      - DependencyTree: "nif:DependencyTree"
      - CoreferenceChain: "nif:String"
    oa:
      - EntityNode: "oa:Annotation"
      - RelationArc: "oa:Annotation"
    skos:
      - Label: "skos:prefLabel"
    owl:
      - RelationArc: "owl:ObjectProperty"
# ---------------------------------------------------------------------------
# Chapter 19: Feature Structures (ISO-FS)
# ---------------------------------------------------------------------------
iso-fs:
  # Index entry for the TEI "iso-fs" module (Feature Structures, chapter 19).
  path: "iso-fs.yaml"
  tei_chapter: 19
  tei_module_name: "iso-fs"
  title: "Feature Structures"
  description: |
    Elements for encoding feature structures based on ISO/IEC 24610 (Feature
    Structures) standard. Essential for morphological analysis, syntactic
    feature unification, lexical semantics, and NLP feature representation.
  element_count: 22
  line_count: 863
  status: "complete"
  # TEI element name -> one-line gloss.
  key_elements:
    - fs: "Feature structure container"
    - f: "Feature (name-value pair)"
    - binary: "Binary value (+/-)"
    - symbol: "Symbolic value"
    - numeric: "Numeric value"
    - string: "String value"
    - vColl: "Collection of values"
    - vAlt: "Alternative values (disjunction)"
    - vNot: "Negated value"
    - vMerge: "Merged values"
    - default: "Default value"
    - if: "Conditional feature"
    - then: "Conditional consequent"
    - fLib: "Feature structure library"
    - vLib: "Value library"
    - fvLib: "Feature-value library"
    - fsdDecl: "Feature system declaration"
    - fsDecl: "Feature structure type declaration"
    - fDecl: "Feature declaration"
    - vRange: "Valid value range"
  # GLAM-NER hypernym code -> TEI elements filed under that code.
  glam_hypernym_mappings:
    TXT.ANA.FS:
      - "fs"
    TXT.ANA.FEA:
      - "f"
    TXT.ANA.VAL:
      - "binary"
      - "symbol"
      - "numeric"
      - "string"
    TXT.ANA.COL:
      - "vColl"
      - "vAlt"
      - "vNot"
      - "vMerge"
    TXT.ANA.DEF:
      - "default"
    TXT.ANA.CND:
      - "if"
      - "then"
    DOC.MET.LIB:
      - "fLib"
      - "vLib"
      - "fvLib"
    DOC.MET.DCL:
      - "fsdDecl"
      - "fsDecl"
      - "fDecl"
      - "vRange"
  # Classes added on top of the TEI elements for NER work.
  glam_ner_extensions:
    - EntityFeatureStructure: "NER entity features"
    - MorphologicalFS: "Morphological analysis features"
    - SemanticRoleFS: "Semantic role features"
  # Ontology prefix -> list of (class name: ontology term) pairs.
  ontology_mappings:
    gold:
      - FeatureStructure: "gold:FeatureStructure"
      - Feature: "gold:Feature"
      - FeatureValue: "gold:FeatureValue"
    olia:
      - MorphologicalFS: "olia:MorphologicalCategory"
      - POS: "olia:PartOfSpeech"
      - Case: "olia:Case"
      - Gender: "olia:Gender"
      - Number: "olia:Number"
      - Tense: "olia:Tense"
    lexinfo:
      - MorphologicalFS: "lexinfo:MorphosyntacticProperty"
    cidoc_crm:
      - FeatureStructure: "crm:E55_Type"
    skos:
      - SymbolValue: "skos:Concept"
# ---------------------------------------------------------------------------
# Chapter 23: Documentation Elements (TEI ODD)
# ---------------------------------------------------------------------------
tagdocs:
  # Index entry for the TEI "tagdocs" module (ODD documentation elements).
  path: "tagdocs.yaml"
  # Documentation Elements is chapter 23 in the TEI P5 numbering this index
  # follows elsewhere (Dictionaries = 10, Certainty = 22). Chapter 9 in that
  # numbering is Computer-mediated Communication, not tagdocs.
  tei_chapter: 23
  tei_module_name: "tagdocs"
  title: "Documentation Elements (TEI ODD)"
  description: |
    Elements for TEI ODD (One Document Does it all) customization including
    schema specifications, element definitions, attribute declarations,
    content models, and constraint rules. Essential for defining custom
    annotation schemas and entity type taxonomies for GLAM-NER pipelines.
  element_count: 35
  line_count: 1542
  status: "complete"
  # TEI element name -> one-line gloss.
  key_elements:
    - schemaSpec: "Schema specification container"
    - moduleRef: "Reference to TEI module"
    - moduleSpec: "Module definition"
    - elementSpec: "Element documentation"
    - attDef: "Attribute definition"
    - attList: "Attribute list"
    - classSpec: "Class specification (model/atts)"
    - memberOf: "Class membership declaration"
    - content: "Content model specification"
    - sequence: "Ordered content sequence"
    - alternate: "Content alternation (choice)"
    - elementRef: "Element reference in content model"
    - classRef: "Class reference in content model"
    - macroSpec: "Reusable pattern/macro"
    - constraintSpec: "Constraint specification (Schematron, etc.)"
    - constraint: "Individual constraint rule"
    - datatype: "Data type specification"
    - valList: "Valid values list"
    - valItem: "Valid value item"
    - exemplum: "Usage example"
    - remarks: "Additional documentation"
  # GLAM-NER hypernym code -> TEI elements filed under that code.
  glam_hypernym_mappings:
    DOC.SCH: ["schemaSpec"]
    DOC.SCH.MOD: ["moduleRef", "moduleSpec"]
    DOC.SCH.ELM: ["elementSpec"]
    DOC.SCH.ATT: ["attDef"]
    DOC.SCH.ATL: ["attList"]
    DOC.SCH.CLS: ["classSpec"]
    DOC.SCH.MEM: ["memberOf"]
    DOC.SCH.CNT: ["content"]
    DOC.SCH.SEQ: ["sequence"]
    DOC.SCH.ALT: ["alternate"]
    DOC.SCH.CSP: ["constraintSpec"]
    DOC.SCH.CON: ["constraint"]
    DOC.SCH.DTP: ["datatype"]
    DOC.SCH.VLL: ["valList"]
    DOC.SCH.VLI: ["valItem"]
    DOC.SCH.EXM: ["exemplum"]
    DOC.SCH.REM: ["remarks"]
  # Classes added on top of the TEI elements for NER work.
  glam_ner_extensions:
    - EntityTypeSpec: "Entity type definition for GLAM-NER"
    - AnnotationSchemaSpec: "Complete annotation schema"
    - RelationTypeSpec: "Relation type definition"
    - OntologyMapping: "Mapping to ontology classes"
  # Ontology prefix -> list of (class name: ontology term) pairs.
  ontology_mappings:
    owl:
      - SchemaSpec: "owl:Ontology"
      - ModuleRef: "owl:imports"
    rdfs:
      - ElementSpec: "rdfs:Class"
      # Property lives in the RDF namespace; there is no rdfs:Property term.
      - AttDef: "rdf:Property"
      - ClassSpec: "rdfs:Class"
      - MemberOf: "rdfs:subClassOf"
      - Datatype: "rdfs:Datatype"
    skos:
      - ValList: "skos:ConceptScheme"
      - ValItem: "skos:Concept"
      - Exemplum: "skos:example"
      - Remarks: "skos:note"
    shacl:
      - ConstraintSpec: "sh:NodeShape"
      - Constraint: "sh:PropertyShape"
    dcterms:
      - ListRef: "dcterms:references"
# ---------------------------------------------------------------------------
# Chapter 9: Computer-mediated Communication (CMC)
# ---------------------------------------------------------------------------
cmc:
  # Index entry for the TEI "cmc" module (Computer-mediated Communication).
  path: "cmc.yaml"
  # Integer chapter number, like every other module entry in this index.
  # CMC is chapter 9 in the TEI P5 numbering used here (Dictionaries = 10).
  tei_chapter: 9
  tei_module_name: "cmc"
  title: "Computer-mediated Communication"
  description: |
    Elements for encoding computer-mediated communication including social
    media posts, chat messages, forum threads, wiki discussions, and other
    digital discourse. Essential for social media NER, online discourse
    analysis, and digital heritage collections. Covers the TEI <post> element
    and associated attributes for modality, threading, and content generation.
  element_count: 17
  line_count: 1478
  status: "complete"
  # Element/class name -> one-line gloss. Only `post` is a TEI element name;
  # the CMC-prefixed names are class names used by this module.
  key_elements:
    - post: "CMC post/message (primary element)"
    - CMCThread: "Thread of related posts"
    - CMCConversation: "Conversation context"
    - CMCParticipant: "User account/identity"
    - CMCEmoji: "Emoji encoding"
    - CMCHashtag: "Hashtag encoding"
    - CMCMention: "@-mention encoding"
    - CMCEmbeddedMedia: "Multimodal content"
    - CMCReaction: "Reactions/engagement"
    - CMCCorpus: "CMC corpus structure"
    - CMCEntityMention: "NER entity from CMC"
  # GLAM-NER hypernym code -> elements/classes filed under that code.
  glam_hypernym_mappings:
    TXT.CMC: ["post"]
    TXT.CMC.PST: ["CMCPost"]
    TXT.CMC.THR: ["CMCThread"]
    TXT.CMC.CNV: ["CMCConversation"]
    TXT.CMC.EMJ: ["CMCEmoji"]
    TXT.CMC.EMO: ["CMCEmoticon"]
    TXT.CMC.RXN: ["CMCReaction"]
    AGT.CMC.USR: ["CMCParticipant"]
    GRP.CMC: ["CMCParticipantGroup"]
    APP.CMC.HTG: ["CMCHashtag"]
    APP.CMC.MEN: ["CMCMention"]
    THG.CMC.MED: ["CMCEmbeddedMedia"]
    DOC.MET.CMC: ["CMCPlatformMetadata", "CMCPostMetadata"]
    DOC.CRP.CMC: ["CMCCorpus"]
    NER.CMC.ENT: ["CMCEntityMention"]
    NER.CMC.NRM: ["CMCEntityNormalization"]
  # Classes added on top of the TEI elements for NER work.
  glam_ner_extensions:
    - CMCEntityMention: "Entity mention from CMC text"
    - CMCEntityNormalization: "Informal entity normalization"
    - CMCCorpus: "CMC corpus for NER training"
  # Ontology prefix -> list of (class name: ontology term) pairs.
  ontology_mappings:
    sioc:
      - CMCPost: "sioc:Post"
      - CMCThread: "sioc:Thread"
      - CMCConversation: "sioc:Forum"
      - CMCParticipant: "sioc:UserAccount"
      - CMCParticipantGroup: "sioc:Usergroup"
    activitystreams:
      - CMCPost: "as:Note"
      - CMCReaction: "as:Like"
      - CMCParticipant: "as:Person"
    schema:
      - CMCPost: "schema:SocialMediaPosting"
      - CMCThread: "schema:DiscussionForumPosting"
      - CMCParticipant: "schema:Person"
      - CMCEmbeddedMedia: "schema:MediaObject"
    foaf:
      - CMCParticipant: "foaf:OnlineAccount"
      - CMCParticipantGroup: "foaf:Group"
    cidoc_crm:
      - CMCPost: "crm:E33_Linguistic_Object"
      - CMCParticipant: "crm:E39_Actor"
    prov:
      - CMCPlatformMetadata: "prov:Activity"
      - CMCCorpus: "prov:Collection"
    nif:
      - CMCEntityMention: "nif:String"
# =============================================================================
# PLANNED MODULES
# =============================================================================
# No modules are currently planned. An explicit empty mapping is used instead
# of a bare key, which would parse as null and break consumers that iterate.
# NOTE(review): assumes planned entries are keyed by module name like
# `modules` - switch to [] if consumers expect a list.
planned_modules: {}
# =============================================================================
# INTEGRATION NOTES
# =============================================================================
# Free-text integration guidance; each key below holds a literal block scalar.
integration:
# Shows two ways to list the modules in a LinkML `imports:` section: by
# w3id.org URI and by local relative path. The same four modules (header,
# namesdates, msdescription, linking) appear in both examples.
linkml_usage: |
These modules can be imported into LinkML schemas using:
```yaml
imports:
- https://w3id.org/glam/ner/tei/header
- https://w3id.org/glam/ner/tei/namesdates
- https://w3id.org/glam/ner/tei/msdescription
- https://w3id.org/glam/ner/tei/linking
```
Or locally:
```yaml
imports:
- modules/advanced/tei/header
- modules/advanced/tei/namesdates
- modules/advanced/tei/msdescription
- modules/advanced/tei/linking
```
# Free-text list of how TEI XML constructs (elements, @xml:id, pointer
# attributes, attribute classes, standOff/annotation) correspond to LinkML
# classes, identifiers, and mixins.
tei_xml_conversion: |
TEI XML documents can be converted to LinkML-compliant YAML/JSON using
XSLT or Python transformations. The key mappings are:
- TEI element → LinkML class
- TEI @xml:id → LinkML identifier
- TEI @ref/@sameAs → LinkML TEIPointer (URI reference)
- TEI att.datable → TEIDatableAttributes mixin
- TEI att.global → TEIGlobalAttributes mixin
- TEI <standOff> → LinkML TEIStandOff class
- TEI <annotation> → LinkML TEIAnnotation class
# Step-by-step guidance for emitting NER results as annotation instances,
# followed by a worked example.
# The example offsets are kept self-consistent: "William Shakespeare" is 19
# characters, so with start_position 45 the end offset is 64 (end treated as
# exclusive, as in the W3C TextPositionSelector - confirm against the
# Selector class definition).
ner_pipeline_integration: |
  NER pipelines can output annotations using these LinkML classes:
  1. Extract entity mentions from text
  2. Create TEIAnnotation or NERAnnotation instances
  3. Use Selector (TextQuoteSelector) for target identification
  4. Set motivation to "identifying" for entity recognition
  5. Populate body with:
     - entity_type (GLAM-NER hypernym)
     - entity_ref (authority URI: VIAF, Wikidata, GeoNames)
     - entity_label (human-readable name)
  6. Add confidence_score and ner_method
  7. Serialize to JSON-LD, YAML, or TEI XML
  Example output structure:
  ```yaml
  - class: NERAnnotation
    xml_id: ann-001
    motivation: identifying
    glam_hypernym: AGT.PER
    annotation_target:
      source_uri: "document.xml"
      selector:
        selector_type: text_quote
        exact_match: "William Shakespeare"
        prefix_context: "the playwright "
        suffix_context: " was born"
        start_position: 45
        end_position: 64
    annotation_body:
      entity_type: AGT.PER
      entity_ref: https://viaf.org/viaf/96994048
      entity_label: "William Shakespeare"
    confidence_score: 0.95
    ner_method: "spacy-en-core-web-trf"
  ```
# Free-text walkthrough for cataloguing a manuscript with the msdescription
# module: container (MsDesc), identifier, physical description, history, and
# contents, followed by the ontologies the mappings target.
manuscript_cataloging: |
For manuscript cataloging, use msdescription module:
1. Create MsDesc instance as container
2. Populate MsIdentifier with:
- repository_name, institution_name
- settlement_name, country_name
- shelfmark (required)
3. Add MsPhysDesc for physical attributes:
- object_form (codex, scroll, etc.)
- support_material (parchment, paper, etc.)
- dimensions, extent
- hand_desc, deco_desc, binding_desc
4. Add MsHistory for provenance:
- origin (place and date)
- provenance_events (ownership chain)
- acquisition_info
5. Add MsContents for intellectual content:
- content_items (MsItem instances)
- text_language
Ontology mappings ensure interoperability with:
- CIDOC-CRM (museum/heritage sector)
- RiC-O (archives)
- Schema.org (web discovery)
- FRBRoo (bibliographic)
# =============================================================================
# STATISTICS
# =============================================================================
# Aggregate figures for the completed modules.
statistics:
  completed_modules: 21
  # Equals the sum of the numerators in element_coverage below.
  # NOTE(review): 590 exceeds tei_source.total_elements (588) even though core
  # is listed at 45/90; the per-module counts appear to include GLAM-NER
  # extension classes rather than TEI elements only - confirm.
  total_elements_covered: 590
  total_line_count: 27367
  # Module name -> "covered/total (percent)".
  element_coverage:
    header: "85/85 (100%)"
    core: "45/90 (50%)"
    namesdates: "58/58 (100%)"
    msdescription: "58/58 (100%)"
    linking: "20/20 (100%)"
    textcrit: "16/16 (100%)"
    spoken: "18/18 (100%)"
    transcr: "38/38 (100%)"
    verse: "7/7 (100%)"
    drama: "15/15 (100%)"
    dictionaries: "35/35 (100%)"
    gaiji: "18/18 (100%)"
    certainty: "10/10 (100%)"
    analysis: "22/22 (100%)"
    figures: "18/18 (100%)"
    textstructure: "24/24 (100%)"
    corpus: "15/15 (100%)"
    nets: "14/14 (100%)"
    iso-fs: "22/22 (100%)"
    tagdocs: "35/35 (100%)"
    cmc: "17/17 (100%)"
  # Ontology -> free-text summary of where and how fully it is used.
  ontology_coverage:
    dcterms: "Full alignment for header metadata"
    cidoc_crm: "Full alignment for heritage classes"
    schema_org: "Full alignment for web discovery"
    web_annotation: "Full alignment for linking module"
    prov_o: "Full alignment for header provenance"
    nif: "Full alignment for analysis module (NLP)"
    rico: "Partial alignment for archival classes"
    frbroo: "Partial alignment for bibliographic classes"
    foaf: "Used in namesdates, header, corpus, cmc modules"
    qudt: "Used in core and header modules (measures)"
    timeml: "Used in core module (temporal)"
    bibo: "Used in header module (bibliography)"
    # Module list extended to every module that declares SKOS mappings:
    # dictionaries and gaiji (per the version history) and tagdocs.
    skos: "Used in header, dictionaries, gaiji, corpus, nets, iso-fs, tagdocs modules (taxonomies)"
    premis: "Used in header module (preservation)"
    olia: "Full alignment for analysis, iso-fs modules (linguistic annotation)"
    iiif: "Full alignment for figures module (image delivery)"
    mathml: "Partial alignment for figures module (formulae)"
    void: "Used in corpus module (datasets)"
    dcat: "Used in corpus module (data catalogs)"
    gold: "Used in iso-fs module (feature structures)"
    lexinfo: "Used in dictionaries, iso-fs modules (lexical info)"
    ontolex: "Used in dictionaries module (lexical entries)"
    owl: "Used in nets, tagdocs modules (relations, ontology definitions)"
    rdfs: "Used in tagdocs module (class/property definitions)"
    shacl: "Used in tagdocs module (constraint shapes)"
    sioc: "Full alignment for cmc module (online communities)"
    activitystreams: "Used in cmc module (social activities)"
# =============================================================================
# VERSION HISTORY
# =============================================================================
# Release log, newest entry first. Each entry carries a version string, a
# release date, and a list of free-text change notes. From 1.2.0 onward the
# notes also record the running totals (line count, elements covered) after
# that release.
version_history:
- version: "1.16.0"
date: "2025-12-04"
changes:
- "Added cmc.yaml (Computer-mediated Communication) - 1478 lines, 17 classes, 8 enums"
- "TEI CMC module for social media, chat, forums, wiki discussions"
- "Core post element with modality, generatedBy, replyTo, indentLevel attributes"
- "Threading structures: CMCThread, CMCConversation"
- "Participant metadata: CMCParticipant, CMCParticipantGroup with anonymization"
- "Emoji/emoticon encoding: CMCEmoji, CMCEmoticon with Unicode and sentiment"
- "Hashtag and mention encoding: CMCHashtag, CMCMention for entity extraction"
- "Multimodal content: CMCEmbeddedMedia for images, video, audio, GIFs"
- "Engagement tracking: CMCReaction, CMCReactionSet"
- "Platform metadata: CMCPlatformMetadata, CMCPostMetadata"
- "Corpus support: CMCCorpus for NER training data management"
- "NER extensions: CMCEntityMention, CMCEntityNormalization for informal text"
- "Ontology mappings: SIOC (online communities), Activity Streams, Schema.org"
- "Privacy support: anonymization levels for GDPR compliance"
- "Bot detection: generatedBy attribute for human/bot/system content"
- "Updated total line count to 27367"
- "Updated total elements covered to 590"
- "Updated total modules to 21"
# NOTE(review): tagdocs is labelled "Chapter 9" below, but the other entries
# follow a numbering where Dictionaries is chapter 10 and Certainty is 22;
# Documentation Elements would then be chapter 23 - confirm.
- version: "1.15.0"
date: "2025-12-04"
changes:
- "Added tagdocs.yaml (Chapter 9) - 1542 lines, 35 classes, 8 enums"
- "TEI ODD documentation elements for schema specification"
- "Schema elements: schemaSpec, moduleRef, moduleSpec"
- "Element definition: elementSpec, attDef, attList, classSpec, memberOf"
- "Content models: content, sequence, alternate, elementRef, classRef"
- "Constraints: constraintSpec, constraint (Schematron/RelaxNG)"
- "Data types and values: datatype, valList, valItem"
- "Documentation: exemplum, remarks, listRef"
- "GLAM-NER extensions: EntityTypeSpec, AnnotationSchemaSpec, RelationTypeSpec"
- "Ontology mappings: OWL (ontology), RDFS (class definitions), SHACL (constraints)"
- "Updated total line count to 25889"
- "Updated total elements covered to 573"
- "Updated total modules to 20"
# NOTE(review): the previous total (21456) plus the three modules added here
# (1050 + 980 + 863) is 24349, not the 24347 recorded below; later totals
# build on 24347. Verify against the actual module line counts.
- version: "1.14.0"
date: "2025-12-03"
changes:
- "Added corpus.yaml (Chapter 16) - 1050 lines, 15 classes, 9 enums"
- "Added nets.yaml (Chapter 20) - 980 lines, 14 classes, 6 enums"
- "Added iso-fs.yaml (Chapter 19) - 863 lines, 22 classes, 7 enums"
- "Corpus module: TEI corpus structure, text descriptions, participant info"
- "GLAM-NER corpus extensions: NERTrainingCorpus, CorpusSplit, EntityTypeCount"
- "Heritage document collection support for corpus linguistics"
- "Nets module: graphs, trees, forests for relationship modeling"
- "Entity relationship graphs and coreference chains for NER"
- "Stemmatology extensions for manuscript tradition visualization"
- "Dependency tree support for syntactic parsing output"
- "ISO-FS module: feature structures per ISO/IEC 24610"
- "Morphological feature structures for linguistic analysis"
- "Feature libraries and declarations for reusable feature sets"
- "Ontology mappings: void, dcat, nif, gold, OLiA, LexInfo"
- "Updated total line count to 24347"
- "Updated total elements covered to 538"
- "Updated total modules to 19"
- version: "1.13.0"
date: "2025-12-03"
changes:
- "Added textstructure.yaml (Chapter 4) - 767 lines, 24 classes, 4 enums"
- "Moved textstructure from planned to completed modules"
- "Document structure: TEI, text, body, front, back, group"
- "Divisions: div with type enumeration"
- "Front matter: titlePage, docTitle, docAuthor, docImprint, byline, epigraph"
- "Back matter: trailer, closer, opener, dateline, postscript"
- "Floating text support for embedded texts"
- "Ontology mappings: Schema.org, BIBO, Dublin Core, CIDOC-CRM, FOAF"
- "Updated total line count to 21456"
- "Updated total elements covered to 487"
- version: "1.12.0"
date: "2025-12-03"
changes:
- "Added figures.yaml (Chapter 15) - 743 lines, 18 classes, 6 enums"
- "Moved figures from planned to completed modules"
- "Figure elements: figure, graphic, figDesc"
- "Table elements: table, row, cell"
- "Formula support: formula with MathML, TeX, image options"
- "Notated music: notatedMusic with MEI, MusicXML support"
- "Media elements: media, binaryObject"
- "IIIF integration: IIIFManifest, IIIFImageService classes"
- "Heritage image metadata with CIDOC-CRM alignment"
- "Ontology mappings: Schema.org, IIIF, CIDOC-CRM, MathML"
- "Updated total line count to 20689"
- "Updated total elements covered to 463"
- version: "1.11.0"
date: "2025-12-03"
changes:
- "Added analysis.yaml (Chapter 18) - 976 lines, 22 classes, 6 enums"
- "Moved analysis from planned to completed modules"
- "Linguistic segments: s (sentence), cl (clause), phr (phrase)"
- "Token elements: w (word), m (morpheme), c (character), pc (punctuation)"
- "Span annotations: span, spanGrp for standoff annotation"
- "Interpretations: interp, interpGrp for coding schemes"
- "NLP output classes: TokenizedText, POSTaggedToken, DependencyParse"
- "Universal Dependencies POS tagset integration"
- "Morphological feature support (CONLL-U compatible)"
- "Ontology mappings: NIF, OLiA, OntoLex, Web Annotation"
- "Updated total line count to 19946"
- "Updated total elements covered to 445"
- version: "1.10.0"
date: "2025-12-03"
changes:
- "Added certainty.yaml (Chapter 22) - 662 lines, 10 classes, 7 enums"
- "Moved certainty from planned to completed modules"
- "TEI certainty elements: certainty, precision, respons"
- "NER confidence scoring: NERConfidenceScore, ConfidenceMetrics"
- "Annotation provenance: AnnotationProvenance, AnnotationAgent"
- "ML model support: ModelAssertionSet for batch predictions"
- "Ontology mappings: PROV-O, Web Annotation, CIDOC-CRM, ML Schema"
- "Updated total line count to 18970"
- "Updated total elements covered to 423"
- version: "1.9.0"
date: "2025-12-03"
changes:
- "Added gaiji.yaml (Chapter 5) - 948 lines, 18 classes, 6 enums"
- "Moved gaiji from planned to completed modules"
- "Character declarations: charDecl, char, glyph"
- "Character properties: charName, charProp, unicodeName, localProp"
- "Character mappings: mapping with type support"
- "Glyph graphics: figure, graphic"
- "Inline reference: g (gaiji)"
- "Writing modes: direction, writing-mode, text-orientation"
- "CJK extensions: radical, strokes, readings"
- "Medieval manuscript extensions: letterforms, abbreviations"
- "Ontology mappings: CIDOC-CRM, SKOS, Schema.org"
- "Updated total line count to 18308"
- "Updated total elements covered to 413"
- version: "1.8.0"
date: "2025-12-03"
changes:
- "Added dictionaries.yaml (Chapter 10) - 1740 lines, 35 classes, 13 enums"
- "Moved dictionaries from planned to completed modules"
- "Entry structure: entry, entryFree, superEntry, hom"
- "Form elements: form, orth, pron, hyph, syll, stress"
- "Grammar elements: gramGrp, pos, gen, number, case, tns, mood, per, iType, subc"
- "Sense elements: sense, def, cit, quote, trans"
- "Etymology elements: etym, lang, mentioned, gloss"
- "Usage and cross-references: usg, lbl, xr, re"
- "Ontology mappings: OntoLex-Lemon, LexInfo, SKOS, CIDOC-CRM"
- "Updated total line count to 17360"
- "Updated total elements covered to 395"
- version: "1.7.0"
date: "2025-12-03"
changes:
- "Added drama.yaml (Chapter 7) - 781 lines, 15 classes, 4 enums"
- "Moved drama from planned to completed modules"
- "Cast list elements: castList, castGroup, castItem, role, roleDesc, actor"
- "Speech elements: sp (speech), speaker, spGrp (speech group)"
- "Stage directions: stage, move, set"
- "Framing elements: prologue, epilogue"
- "Performance metadata tracking"
- "Ontology mappings: CIDOC-CRM, Schema.org, FRBRoo, FOAF"
- "Updated total line count to 15620"
- "Updated total elements covered to 360"
- version: "1.6.0"
date: "2025-12-03"
changes:
- "Added verse.yaml (Chapter 6) - 689 lines, 7 classes, 6 enums"
- "Moved verse from planned to completed modules"
- "Core verse elements: l (line), lg (stanza), seg, rhyme, caesura"
- "Metrical declaration: metDecl, metSym"
- "Support for meter patterns, rhyme schemes, enjambment"
- "Ontology mappings: CIDOC-CRM, Schema.org"
- "Updated total line count to 14839"
- "Updated total elements covered to 345"
- version: "1.5.0"
date: "2025-12-03"
changes:
- "Added transcr.yaml (Chapter 12) - 1746 lines, 38 classes, 11 enums"
- "Moved transcr from planned to completed modules"
- "Full facsimile/surface support for digital editions (IIIF aligned)"
- "Editorial interventions: add, del, subst, restore, retrace"
- "Damage/illegibility: damage, gap, unclear, supplied, surplus, secl"
- "Abbreviation handling: abbr, expan, am, ex"
- "Metamark and transposition support"
- "Hand tracking: handNotes, handShift"
- "Ontology mappings: CIDOC-CRM, Schema.org, IIIF, Web Annotation"
- "Updated total line count to 14150"
- "Updated total elements covered to 338"
- version: "1.4.0"
date: "2025-12-03"
changes:
- "Added spoken.yaml (Chapter 8) - 1153 lines, 18 classes, 10 enums"
- "Moved spoken from planned to completed modules"
- "Full utterance, pause, vocal, kinesic, incident support"
- "Temporal alignment (timeline, when) for audio/video sync"
- "Recording metadata (recordingStmt, recording, equipment, broadcast)"
- "Paralinguistic feature tracking (shift)"
- "Ontology mappings: CIDOC-CRM, Schema.org, W3C Time"
- "Updated total line count to 12404"
- "Updated total elements covered to 300"
- version: "1.3.0"
date: "2025-12-03"
changes:
- "Added textcrit.yaml (Chapter 13) - 720 lines, 16 classes"
- "Moved textcrit from planned to completed modules"
- "Updated total line count to 11251"
- "Updated total elements covered to 282"
- version: "1.2.0"
date: "2025-12-03"
changes:
- "Added header.yaml (Chapter 2) - 3678 lines, 85 classes"
- "Comprehensive TEI Header metadata support"
- "Full correspondence description (correspDesc, correspAction)"
- "Complete encoding description coverage"
- "Revision history tracking (revisionDesc, change, listChange)"
- "Rich ontology mappings: Dublin Core, Schema.org, PROV-O, BIBO, FOAF, SKOS, PREMIS"
- "Updated total line count to 10531"
- "Updated total elements covered to 266"
- version: "1.1.0"
date: "2025-12-03"
changes:
- "Added msdescription.yaml (Chapter 11) - 1923 lines"
- "Added linking.yaml (Chapter 17) - 1393 lines"
- "Marked core.yaml as complete - 1575 lines"
- "Updated statistics and element coverage"
- "Added detailed NER pipeline integration notes"
- "Added manuscript cataloging integration notes"
- version: "1.0.0"
date: "2025-12-02"
changes:
- "Initial release with namesdates.yaml (Chapter 14)"
- "Created core.yaml structure (partial)"
- "Established module architecture and index"