# glam/docs/convention/schema/transcription_convention.yaml
# 2025-12-02 14:36:01 +01:00
#
# 565 lines
# 17 KiB
# YAML

# LinkML schema metadata: identity, title, version and licence.
id: https://w3id.org/nationaalarchief/transcription-convention
name: transcription-convention
title: Transcription and Annotation Convention Schema
description: >-
  LinkML schema capturing the convention for correcting and annotating
  transcribed historical records (version 1.4.3). This schema encodes rules
  for layout correction, diplomatic transcription, and entity annotation
  following the Gado2 annotation scheme for historical languages.
# Quoted so every YAML loader keeps the version as a string.
version: '1.4.3'
license: https://creativecommons.org/publicdomain/zero/1.0/

# CURIE prefixes used by class_uri values and imports in this schema.
prefixes:
  linkml: https://w3id.org/linkml/
  schema: http://schema.org/
  na: https://w3id.org/nationaalarchief/
  skos: http://www.w3.org/2004/02/skos/core#
  dcterms: http://purl.org/dc/terms/
# Elements without an explicit prefix resolve against `na`.
default_prefix: na
# Slots that omit `range` default to string.
default_range: string

imports:
  - linkml:types
# Class definitions. Nesting is: class -> attributes -> slot -> slot properties.
#
# NOTE(review): several multivalued slots below point at classes that declare
# an `identifier` (e.g. TextRegionType, BaselineRule, EntityTypeRules). LinkML
# treats such values as references unless `inlined` / `inlined_as_list` is
# set -- confirm this matches how instance data is actually written.
classes:

  # Root object: bundles the layout, transcription and annotation rule sets.
  TranscriptionConvention:
    class_uri: na:TranscriptionConvention
    description: >-
      Top-level container for all transcription convention rules and guidelines
    attributes:
      version:
        description: Convention version identifier
        range: string
        required: true
      publication_date:
        description: Date of convention publication
        range: date
      status:
        description: Current status of the convention
        range: ConventionStatus
      layout_rules:
        description: Rules for correcting document layout
        range: LayoutRules
        required: true
      transcription_rules:
        description: Rules for diplomatic transcription
        range: TranscriptionRules
        required: true
      annotation_rules:
        description: Rules for entity annotation
        range: AnnotationRules
        required: true
      formulaic_phrases:
        description: Corpus-specific formulaic phrase patterns
        range: FormulaicPhraseRules
        multivalued: true

  # Layout correction: region types, baseline rules and line ordering.
  LayoutRules:
    class_uri: na:LayoutRules
    description: >-
      Rules for correcting layouts in transcribed documents, including
      text regions, text lines, and baselines
    attributes:
      text_region_types:
        description: Defined types of text regions
        range: TextRegionType
        multivalued: true
        required: true
      baseline_rules:
        description: Rules for correcting baseline placement
        range: BaselineRule
        multivalued: true
      text_line_ordering:
        description: Rules for ordering text lines within regions
        range: TextLineOrderingRule

  # One text region type, identified by its TextRegionTypeName value.
  TextRegionType:
    class_uri: na:TextRegionType
    description: >-
      A type of text region that can appear in historical documents
    attributes:
      name:
        description: Name of the text region type
        range: TextRegionTypeName
        required: true
        identifier: true
      description:
        description: Detailed description of the text region type
        range: string
        required: true
      ordering_rules:
        description: Rules for ordering this type of region
        range: string

  # One baseline correction rule; `action` draws from BaselineAction.
  BaselineRule:
    class_uri: na:BaselineRule
    description: Rules for correcting and adjusting baselines
    attributes:
      rule_id:
        description: Unique identifier for the baseline rule
        range: string
        required: true
        identifier: true
      description:
        description: Description of the baseline rule
        range: string
        required: true
      applies_to:
        description: Context where this rule applies
        range: string
      action:
        description: Action to take when rule applies
        range: BaselineAction

  # How text lines are ordered, optionally scoped to one region type.
  TextLineOrderingRule:
    class_uri: na:TextLineOrderingRule
    description: Rules for ordering text lines within text regions
    attributes:
      method:
        description: Method used for ordering (e.g., coordinate-based)
        range: string
        required: true
      applies_to_region:
        description: Text region type this applies to
        range: TextRegionTypeName

  # Transcription rule set: character, abbreviation, formatting, symbol rules.
  TranscriptionRules:
    class_uri: na:TranscriptionRules
    description: >-
      Rules for diplomatic transcription of historical documents
    attributes:
      is_diplomatic:
        description: Whether transcription is diplomatic (no spelling correction)
        range: boolean
        required: true
      character_rules:
        description: Rules for transcribing specific characters
        range: CharacterTranscriptionRule
        multivalued: true
      abbreviation_rules:
        description: Rules for handling abbreviations
        range: AbbreviationRule
        multivalued: true
      formatting_rules:
        description: Rules for transcribing text formatting
        range: FormattingRule
        multivalued: true
      special_symbol_rules:
        description: Rules for special symbols and marks
        range: SpecialSymbolRule
        multivalued: true

  # Maps source characters to a target transcription, with optional examples.
  CharacterTranscriptionRule:
    class_uri: na:CharacterTranscriptionRule
    description: Rule for transcribing specific characters or character classes
    attributes:
      rule_id:
        description: Unique identifier for the rule
        range: string
        required: true
        identifier: true
      name:
        description: Name of the character rule
        range: string
        required: true
      description:
        description: Description of when and how to apply the rule
        range: string
        required: true
      source_characters:
        description: Characters in source document
        range: string
        multivalued: true
      target_transcription:
        description: How to transcribe in output
        range: string
      examples:
        description: Examples of rule application
        range: TranscriptionExample
        multivalued: true

  # Abbreviation handling: an expansion policy plus listed special cases.
  AbbreviationRule:
    class_uri: na:AbbreviationRule
    description: Rule for handling abbreviations in transcription
    attributes:
      rule_id:
        description: Unique identifier for the abbreviation rule
        range: string
        required: true
        identifier: true
      abbreviation_symbol:
        description: Symbol used for abbreviation (e.g., colon for superscript)
        range: string
      expansion_policy:
        description: Whether and when to expand abbreviations
        range: ExpansionPolicy
        required: true
      special_cases:
        description: Special abbreviations that should be expanded
        range: SpecialAbbreviation
        multivalued: true

  # A single symbol/expansion pair referenced from AbbreviationRule.
  SpecialAbbreviation:
    class_uri: na:SpecialAbbreviation
    description: A specific abbreviation pattern with defined expansion
    attributes:
      symbol:
        description: Abbreviation symbol or pattern
        range: string
        required: true
      expansion:
        description: Expanded form of the abbreviation
        range: string
        required: true
      comment:
        description: Additional notes about usage
        range: string

  # How one formatting type is represented, per source kind.
  FormattingRule:
    class_uri: na:FormattingRule
    description: Rule for handling text formatting in transcription
    attributes:
      rule_id:
        description: Unique identifier for the formatting rule
        range: string
        required: true
        identifier: true
      formatting_type:
        # Parenthetical lists every FormattingType permissible value.
        description: >-
          Type of formatting (bold, italic, underline, strikethrough,
          superscript, subscript)
        range: FormattingType
        required: true
      applies_to_handwritten:
        description: Whether rule applies to handwritten sources
        range: boolean
      applies_to_printed:
        description: Whether rule applies to printed sources
        range: boolean
      transcription_method:
        description: How to represent formatting in transcription
        range: string

  # A named special symbol with optional Unicode form and usage notes.
  SpecialSymbolRule:
    class_uri: na:SpecialSymbolRule
    description: Rule for transcribing special symbols and marks
    attributes:
      rule_id:
        description: Unique identifier for the symbol rule
        range: string
        required: true
        identifier: true
      symbol_name:
        description: Name of the special symbol
        range: string
        required: true
      symbol_unicode:
        description: Unicode representation of the symbol
        range: string
      usage_context:
        description: Context where symbol appears
        range: string
      transcription_note:
        description: Notes on how to transcribe
        range: string

  # Source/transcription pair used by CharacterTranscriptionRule.examples.
  TranscriptionExample:
    class_uri: na:TranscriptionExample
    description: Example demonstrating a transcription rule
    attributes:
      source:
        description: Source text/image representation
        range: string
        required: true
      transcription:
        description: Correct transcription
        range: string
        required: true
      explanation:
        description: Explanation of rule application
        range: string

  # Annotation rule set: scheme name, tagging policies, per-entity-type rules.
  AnnotationRules:
    class_uri: na:AnnotationRules
    description: >-
      Rules for annotating entities in transcribed texts following
      the Gado2 annotation scheme (7 entity types)
    attributes:
      annotation_scheme:
        description: Name of annotation scheme (e.g., Gado2)
        range: string
        required: true
      no_double_tagging:
        description: Policy against double tagging entities
        range: boolean
        required: true
      context_sensitive:
        description: Whether same word can have different tags based on context
        range: boolean
        required: true
      entity_types:
        description: Entity type annotation rules
        range: EntityTypeRules
        multivalued: true
        required: true

  # Rules for one entity type, identified by its EntityType value.
  EntityTypeRules:
    class_uri: na:EntityTypeRules
    description: Rules for annotating a specific entity type
    attributes:
      entity_type:
        description: Type of entity
        range: EntityType
        required: true
        identifier: true
      description:
        description: Description of the entity type
        range: string
        required: true
      inclusion_rules:
        description: Rules for what to include in annotation
        range: InclusionRule
        multivalued: true
      exclusion_rules:
        description: Rules for what to exclude from annotation
        range: ExclusionRule
        multivalued: true
      subcategories:
        description: Subcategories of this entity type
        range: EntitySubcategory
        multivalued: true
      examples:
        description: Annotated examples
        range: AnnotationExample
        multivalued: true

  # What belongs inside an annotated span, with conditions and examples.
  InclusionRule:
    class_uri: na:InclusionRule
    description: Rule specifying what should be included in entity annotation
    attributes:
      rule_id:
        description: Unique identifier for the inclusion rule
        range: string
        required: true
        identifier: true
      description:
        description: Description of what to include
        range: string
        required: true
      conditions:
        description: Conditions under which rule applies
        range: string
        multivalued: true
      examples:
        description: Examples of rule application
        range: string
        multivalued: true

  # What must stay outside an annotated span, with a rationale.
  ExclusionRule:
    class_uri: na:ExclusionRule
    description: Rule specifying what should be excluded from entity annotation
    attributes:
      rule_id:
        description: Unique identifier for the exclusion rule
        range: string
        required: true
        identifier: true
      description:
        description: Description of what to exclude
        range: string
        required: true
      rationale:
        description: Reason for exclusion
        range: string
      examples:
        description: Examples of rule application
        range: string
        multivalued: true

  # A named subcategory of an entity type, identified by `name`.
  EntitySubcategory:
    class_uri: na:EntitySubcategory
    description: A subcategory within an entity type
    attributes:
      name:
        description: Name of the subcategory
        range: string
        required: true
        identifier: true
      description:
        description: Description of the subcategory
        range: string
        required: true
      examples:
        description: Examples of entities in this subcategory
        range: string
        multivalued: true

  # Worked annotation example: text, annotated span and its entity type.
  AnnotationExample:
    class_uri: na:AnnotationExample
    description: Example of entity annotation
    attributes:
      text:
        description: Text containing the entity
        range: string
        required: true
      entity_span:
        description: The annotated entity span
        range: string
        required: true
      entity_type:
        description: Type of entity annotated
        range: EntityType
        required: true
      explanation:
        description: Explanation of annotation decision
        range: string

  # Formulaic phrase patterns grouped per corpus and language.
  FormulaicPhraseRules:
    class_uri: na:FormulaicPhraseRules
    description: >-
      Corpus-specific formulaic phrase patterns used to distinguish
      text region types
    attributes:
      corpus:
        description: Corpus to which these patterns apply
        range: string
        required: true
      language:
        description: Language of the corpus
        range: string
        required: true
      century:
        description: Century of the corpus
        range: string
      phrase_patterns:
        description: Formulaic phrase patterns
        range: FormulaicPhrasePattern
        multivalued: true
        required: true

  # One phrase pattern tied to the text region type it identifies.
  FormulaicPhrasePattern:
    class_uri: na:FormulaicPhrasePattern
    description: A formulaic phrase pattern with entity placeholders
    attributes:
      pattern_id:
        description: Unique identifier for the pattern
        range: string
        required: true
        identifier: true
      text_region_type:
        description: Text region type this pattern identifies
        range: TextRegionTypeName
        required: true
      pattern:
        description: Pattern with entity type placeholders
        range: string
        required: true
      example:
        description: Example instantiation of the pattern
        range: string
        multivalued: true
      description:
        description: Description of when pattern appears
        range: string
# Controlled vocabularies referenced as `range` values by the classes.
enums:

  # Range of TranscriptionConvention.status.
  ConventionStatus:
    description: Status of the convention specification
    permissible_values:
      DRAFT:
        description: Convention is in draft status
      SPECIFIED:
        description: Convention is specified for specific archives
      FINALIZED:
        description: Convention is finalized for general use

  # Range of TextRegionType.name, TextLineOrderingRule.applies_to_region
  # and FormulaicPhrasePattern.text_region_type.
  TextRegionTypeName:
    description: Names of text region types
    permissible_values:
      PARAGRAPH:
        description: Running text within the type area
      PAGE_NUMBER:
        description: Page or folium number
      HEADER:
        description: Header text at top margin
      FOOTER:
        description: Footer text at bottom margin
      HEADING:
        description: Title or section heading
      FOOTNOTE:
        description: Footnotes below running text
      TABLE:
        description: Tabular layout
      MARGINALIA:
        description: Notes in margins
      CAPTION:
        description: Image captions
      COLOPHON:
        description: Author/creation information

  # Range of BaselineRule.action.
  BaselineAction:
    description: Actions for baseline correction
    permissible_values:
      REMOVE:
        description: Remove baseline or transcription
      SHORTEN:
        description: Shorten baseline
      LENGTHEN:
        description: Lengthen baseline
      SPLIT:
        description: Split baseline into multiple
      MERGE:
        description: Merge with main baseline
      ADJUST:
        description: Adjust position

  # Range of AbbreviationRule.expansion_policy.
  ExpansionPolicy:
    description: Policy for expanding abbreviations
    permissible_values:
      NO_EXPANSION:
        description: Do not expand abbreviations
      EXPAND_SPECIAL:
        description: Only expand special cases
      EXPAND_ALL:
        description: Expand all abbreviations

  # Range of FormattingRule.formatting_type.
  FormattingType:
    description: Types of text formatting
    permissible_values:
      BOLD:
        description: Bold text
      ITALIC:
        description: Italic text
      UNDERLINE:
        description: Underlined text
      STRIKETHROUGH:
        description: Struck through text
      SUPERSCRIPT:
        description: Superscript text
      SUBSCRIPT:
        description: Subscript text

  # Range of EntityTypeRules.entity_type and AnnotationExample.entity_type.
  EntityType:
    description: Types of entities in Gado2 annotation scheme
    permissible_values:
      PERSON:
        description: >-
          Personal names and specific references to persons (named or unnamed)
      PLACE:
        description: >-
          Geographic locations including streets, cities, landforms, buildings
      ORGANISATION:
        description: >-
          Organizations including companies, institutions, governments
      DENOMINATION:
        description: >-
          Ethnicity, profession, religion, demonym, ideology - references to
          groups or attributes
      QUANTITY:
        description: >-
          Quantities including currency, counts, distances, weights
      TEMPORAL_REFERENCE:
        description: >-
          Temporal references including dates, time periods, campaigns
      TEXTUAL_REFERENCE:
        description: >-
          References to written sources, laws, documents, titles