# File listing metadata: YAML, 565 lines, 17 KiB
id: https://w3id.org/nationaalarchief/transcription-convention
|
|
name: transcription-convention
|
|
title: Transcription and Annotation Convention Schema
|
|
description: >-
|
|
LinkML schema capturing the convention for correcting and annotating
|
|
transcribed historical records (version 1.4.3). This schema encodes rules
|
|
for layout correction, diplomatic transcription, and entity annotation
|
|
following the Gado2 annotation scheme for historical languages.
|
|
version: 1.4.3
|
|
license: https://creativecommons.org/publicdomain/zero/1.0/
|
|
|
|
prefixes:
|
|
linkml: https://w3id.org/linkml/
|
|
schema: http://schema.org/
|
|
na: https://w3id.org/nationaalarchief/
|
|
skos: http://www.w3.org/2004/02/skos/core#
|
|
dcterms: http://purl.org/dc/terms/
|
|
|
|
default_prefix: na
|
|
default_range: string
|
|
|
|
imports:
|
|
- linkml:types
|
|
|
|
classes:
|
|
TranscriptionConvention:
|
|
class_uri: na:TranscriptionConvention
|
|
description: >-
|
|
Top-level container for all transcription convention rules and guidelines
|
|
attributes:
|
|
version:
|
|
description: Convention version identifier
|
|
range: string
|
|
required: true
|
|
publication_date:
|
|
description: Date of convention publication
|
|
range: date
|
|
status:
|
|
description: Current status of the convention
|
|
range: ConventionStatus
|
|
layout_rules:
|
|
description: Rules for correcting document layout
|
|
range: LayoutRules
|
|
required: true
|
|
transcription_rules:
|
|
description: Rules for diplomatic transcription
|
|
range: TranscriptionRules
|
|
required: true
|
|
annotation_rules:
|
|
description: Rules for entity annotation
|
|
range: AnnotationRules
|
|
required: true
|
|
formulaic_phrases:
|
|
description: Corpus-specific formulaic phrase patterns
|
|
range: FormulaicPhraseRules
|
|
multivalued: true
|
|
|
|
LayoutRules:
|
|
class_uri: na:LayoutRules
|
|
description: >-
|
|
Rules for correcting layouts in transcribed documents, including
|
|
text regions, text lines, and baselines
|
|
attributes:
|
|
text_region_types:
|
|
description: Defined types of text regions
|
|
range: TextRegionType
|
|
multivalued: true
|
|
required: true
|
|
baseline_rules:
|
|
description: Rules for correcting baseline placement
|
|
range: BaselineRule
|
|
multivalued: true
|
|
text_line_ordering:
|
|
description: Rules for ordering text lines within regions
|
|
range: TextLineOrderingRule
|
|
|
|
TextRegionType:
|
|
class_uri: na:TextRegionType
|
|
description: >-
|
|
A type of text region that can appear in historical documents
|
|
attributes:
|
|
name:
|
|
description: Name of the text region type
|
|
range: TextRegionTypeName
|
|
required: true
|
|
identifier: true
|
|
description:
|
|
description: Detailed description of the text region type
|
|
range: string
|
|
required: true
|
|
ordering_rules:
|
|
description: Rules for ordering this type of region
|
|
range: string
|
|
|
|
BaselineRule:
|
|
class_uri: na:BaselineRule
|
|
description: Rules for correcting and adjusting baselines
|
|
attributes:
|
|
rule_id:
|
|
description: Unique identifier for the baseline rule
|
|
range: string
|
|
required: true
|
|
identifier: true
|
|
description:
|
|
description: Description of the baseline rule
|
|
range: string
|
|
required: true
|
|
applies_to:
|
|
description: Context where this rule applies
|
|
range: string
|
|
action:
|
|
description: Action to take when rule applies
|
|
range: BaselineAction
|
|
|
|
TextLineOrderingRule:
|
|
class_uri: na:TextLineOrderingRule
|
|
description: Rules for ordering text lines within text regions
|
|
attributes:
|
|
method:
|
|
description: Method used for ordering (e.g., coordinate-based)
|
|
range: string
|
|
required: true
|
|
applies_to_region:
|
|
description: Text region type this applies to
|
|
range: TextRegionTypeName
|
|
|
|
TranscriptionRules:
|
|
class_uri: na:TranscriptionRules
|
|
description: >-
|
|
Rules for diplomatic transcription of historical documents
|
|
attributes:
|
|
is_diplomatic:
|
|
description: Whether transcription is diplomatic (no spelling correction)
|
|
range: boolean
|
|
required: true
|
|
character_rules:
|
|
description: Rules for transcribing specific characters
|
|
range: CharacterTranscriptionRule
|
|
multivalued: true
|
|
abbreviation_rules:
|
|
description: Rules for handling abbreviations
|
|
range: AbbreviationRule
|
|
multivalued: true
|
|
formatting_rules:
|
|
description: Rules for transcribing text formatting
|
|
range: FormattingRule
|
|
multivalued: true
|
|
special_symbol_rules:
|
|
description: Rules for special symbols and marks
|
|
range: SpecialSymbolRule
|
|
multivalued: true
|
|
|
|
CharacterTranscriptionRule:
|
|
class_uri: na:CharacterTranscriptionRule
|
|
description: Rule for transcribing specific characters or character classes
|
|
attributes:
|
|
rule_id:
|
|
description: Unique identifier for the rule
|
|
range: string
|
|
required: true
|
|
identifier: true
|
|
name:
|
|
description: Name of the character rule
|
|
range: string
|
|
required: true
|
|
description:
|
|
description: Description of when and how to apply the rule
|
|
range: string
|
|
required: true
|
|
source_characters:
|
|
description: Characters in source document
|
|
range: string
|
|
multivalued: true
|
|
target_transcription:
|
|
description: How to transcribe in output
|
|
range: string
|
|
examples:
|
|
description: Examples of rule application
|
|
range: TranscriptionExample
|
|
multivalued: true
|
|
|
|
AbbreviationRule:
|
|
class_uri: na:AbbreviationRule
|
|
description: Rule for handling abbreviations in transcription
|
|
attributes:
|
|
rule_id:
|
|
description: Unique identifier for the abbreviation rule
|
|
range: string
|
|
required: true
|
|
identifier: true
|
|
abbreviation_symbol:
|
|
description: Symbol used for abbreviation (e.g., colon for superscript)
|
|
range: string
|
|
expansion_policy:
|
|
description: Whether and when to expand abbreviations
|
|
range: ExpansionPolicy
|
|
required: true
|
|
special_cases:
|
|
description: Special abbreviations that should be expanded
|
|
range: SpecialAbbreviation
|
|
multivalued: true
|
|
|
|
SpecialAbbreviation:
|
|
class_uri: na:SpecialAbbreviation
|
|
description: A specific abbreviation pattern with defined expansion
|
|
attributes:
|
|
symbol:
|
|
description: Abbreviation symbol or pattern
|
|
range: string
|
|
required: true
|
|
expansion:
|
|
description: Expanded form of the abbreviation
|
|
range: string
|
|
required: true
|
|
comment:
|
|
description: Additional notes about usage
|
|
range: string
|
|
|
|
FormattingRule:
|
|
class_uri: na:FormattingRule
|
|
description: Rule for handling text formatting in transcription
|
|
attributes:
|
|
rule_id:
|
|
description: Unique identifier for the formatting rule
|
|
range: string
|
|
required: true
|
|
identifier: true
|
|
formatting_type:
|
|
description: Type of formatting (bold, italic, underline, strikethrough)
|
|
range: FormattingType
|
|
required: true
|
|
applies_to_handwritten:
|
|
description: Whether rule applies to handwritten sources
|
|
range: boolean
|
|
applies_to_printed:
|
|
description: Whether rule applies to printed sources
|
|
range: boolean
|
|
transcription_method:
|
|
description: How to represent formatting in transcription
|
|
range: string
|
|
|
|
SpecialSymbolRule:
|
|
class_uri: na:SpecialSymbolRule
|
|
description: Rule for transcribing special symbols and marks
|
|
attributes:
|
|
rule_id:
|
|
description: Unique identifier for the symbol rule
|
|
range: string
|
|
required: true
|
|
identifier: true
|
|
symbol_name:
|
|
description: Name of the special symbol
|
|
range: string
|
|
required: true
|
|
symbol_unicode:
|
|
description: Unicode representation of the symbol
|
|
range: string
|
|
usage_context:
|
|
description: Context where symbol appears
|
|
range: string
|
|
transcription_note:
|
|
description: Notes on how to transcribe
|
|
range: string
|
|
|
|
TranscriptionExample:
|
|
class_uri: na:TranscriptionExample
|
|
description: Example demonstrating a transcription rule
|
|
attributes:
|
|
source:
|
|
description: Source text/image representation
|
|
range: string
|
|
required: true
|
|
transcription:
|
|
description: Correct transcription
|
|
range: string
|
|
required: true
|
|
explanation:
|
|
description: Explanation of rule application
|
|
range: string
|
|
|
|
AnnotationRules:
|
|
class_uri: na:AnnotationRules
|
|
description: >-
|
|
Rules for annotating entities in transcribed texts following
|
|
the Gado2 annotation scheme (7 entity types)
|
|
attributes:
|
|
annotation_scheme:
|
|
description: Name of annotation scheme (e.g., Gado2)
|
|
range: string
|
|
required: true
|
|
no_double_tagging:
|
|
description: Policy against double tagging entities
|
|
range: boolean
|
|
required: true
|
|
context_sensitive:
|
|
description: Whether same word can have different tags based on context
|
|
range: boolean
|
|
required: true
|
|
entity_types:
|
|
description: Entity type annotation rules
|
|
range: EntityTypeRules
|
|
multivalued: true
|
|
required: true
|
|
|
|
EntityTypeRules:
|
|
class_uri: na:EntityTypeRules
|
|
description: Rules for annotating a specific entity type
|
|
attributes:
|
|
entity_type:
|
|
description: Type of entity
|
|
range: EntityType
|
|
required: true
|
|
identifier: true
|
|
description:
|
|
description: Description of the entity type
|
|
range: string
|
|
required: true
|
|
inclusion_rules:
|
|
description: Rules for what to include in annotation
|
|
range: InclusionRule
|
|
multivalued: true
|
|
exclusion_rules:
|
|
description: Rules for what to exclude from annotation
|
|
range: ExclusionRule
|
|
multivalued: true
|
|
subcategories:
|
|
description: Subcategories of this entity type
|
|
range: EntitySubcategory
|
|
multivalued: true
|
|
examples:
|
|
description: Annotated examples
|
|
range: AnnotationExample
|
|
multivalued: true
|
|
|
|
InclusionRule:
|
|
class_uri: na:InclusionRule
|
|
description: Rule specifying what should be included in entity annotation
|
|
attributes:
|
|
rule_id:
|
|
description: Unique identifier for the inclusion rule
|
|
range: string
|
|
required: true
|
|
identifier: true
|
|
description:
|
|
description: Description of what to include
|
|
range: string
|
|
required: true
|
|
conditions:
|
|
description: Conditions under which rule applies
|
|
range: string
|
|
multivalued: true
|
|
examples:
|
|
description: Examples of rule application
|
|
range: string
|
|
multivalued: true
|
|
|
|
ExclusionRule:
|
|
class_uri: na:ExclusionRule
|
|
description: Rule specifying what should be excluded from entity annotation
|
|
attributes:
|
|
rule_id:
|
|
description: Unique identifier for the exclusion rule
|
|
range: string
|
|
required: true
|
|
identifier: true
|
|
description:
|
|
description: Description of what to exclude
|
|
range: string
|
|
required: true
|
|
rationale:
|
|
description: Reason for exclusion
|
|
range: string
|
|
examples:
|
|
description: Examples of rule application
|
|
range: string
|
|
multivalued: true
|
|
|
|
EntitySubcategory:
|
|
class_uri: na:EntitySubcategory
|
|
description: A subcategory within an entity type
|
|
attributes:
|
|
name:
|
|
description: Name of the subcategory
|
|
range: string
|
|
required: true
|
|
identifier: true
|
|
description:
|
|
description: Description of the subcategory
|
|
range: string
|
|
required: true
|
|
examples:
|
|
description: Examples of entities in this subcategory
|
|
range: string
|
|
multivalued: true
|
|
|
|
AnnotationExample:
|
|
class_uri: na:AnnotationExample
|
|
description: Example of entity annotation
|
|
attributes:
|
|
text:
|
|
description: Text containing the entity
|
|
range: string
|
|
required: true
|
|
entity_span:
|
|
description: The annotated entity span
|
|
range: string
|
|
required: true
|
|
entity_type:
|
|
description: Type of entity annotated
|
|
range: EntityType
|
|
required: true
|
|
explanation:
|
|
description: Explanation of annotation decision
|
|
range: string
|
|
|
|
FormulaicPhraseRules:
|
|
class_uri: na:FormulaicPhraseRules
|
|
description: >-
|
|
Corpus-specific formulaic phrase patterns used to distinguish
|
|
text region types
|
|
attributes:
|
|
corpus:
|
|
description: Corpus to which these patterns apply
|
|
range: string
|
|
required: true
|
|
language:
|
|
description: Language of the corpus
|
|
range: string
|
|
required: true
|
|
century:
|
|
description: Century of the corpus
|
|
range: string
|
|
phrase_patterns:
|
|
description: Formulaic phrase patterns
|
|
range: FormulaicPhrasePattern
|
|
multivalued: true
|
|
required: true
|
|
|
|
FormulaicPhrasePattern:
|
|
class_uri: na:FormulaicPhrasePattern
|
|
description: A formulaic phrase pattern with entity placeholders
|
|
attributes:
|
|
pattern_id:
|
|
description: Unique identifier for the pattern
|
|
range: string
|
|
required: true
|
|
identifier: true
|
|
text_region_type:
|
|
description: Text region type this pattern identifies
|
|
range: TextRegionTypeName
|
|
required: true
|
|
pattern:
|
|
description: Pattern with entity type placeholders
|
|
range: string
|
|
required: true
|
|
example:
|
|
description: Example instantiation of the pattern
|
|
range: string
|
|
multivalued: true
|
|
description:
|
|
description: Description of when pattern appears
|
|
range: string
|
|
|
|
enums:
|
|
ConventionStatus:
|
|
description: Status of the convention specification
|
|
permissible_values:
|
|
DRAFT:
|
|
description: Convention is in draft status
|
|
SPECIFIED:
|
|
description: Convention is specified for specific archives
|
|
FINALIZED:
|
|
description: Convention is finalized for general use
|
|
|
|
TextRegionTypeName:
|
|
description: Names of text region types
|
|
permissible_values:
|
|
PARAGRAPH:
|
|
description: Running text within the type area
|
|
PAGE_NUMBER:
|
|
description: Page or folium number
|
|
HEADER:
|
|
description: Header text at top margin
|
|
FOOTER:
|
|
description: Footer text at bottom margin
|
|
HEADING:
|
|
description: Title or section heading
|
|
FOOTNOTE:
|
|
description: Footnotes below running text
|
|
TABLE:
|
|
description: Tabular layout
|
|
MARGINALIA:
|
|
description: Notes in margins
|
|
CAPTION:
|
|
description: Image captions
|
|
COLOPHON:
|
|
description: Author/creation information
|
|
|
|
BaselineAction:
|
|
description: Actions for baseline correction
|
|
permissible_values:
|
|
REMOVE:
|
|
description: Remove baseline or transcription
|
|
SHORTEN:
|
|
description: Shorten baseline
|
|
LENGTHEN:
|
|
description: Lengthen baseline
|
|
SPLIT:
|
|
description: Split baseline into multiple
|
|
MERGE:
|
|
description: Merge with main baseline
|
|
ADJUST:
|
|
description: Adjust position
|
|
|
|
ExpansionPolicy:
|
|
description: Policy for expanding abbreviations
|
|
permissible_values:
|
|
NO_EXPANSION:
|
|
description: Do not expand abbreviations
|
|
EXPAND_SPECIAL:
|
|
description: Only expand special cases
|
|
EXPAND_ALL:
|
|
description: Expand all abbreviations
|
|
|
|
FormattingType:
|
|
description: Types of text formatting
|
|
permissible_values:
|
|
BOLD:
|
|
description: Bold text
|
|
ITALIC:
|
|
description: Italic text
|
|
UNDERLINE:
|
|
description: Underlined text
|
|
STRIKETHROUGH:
|
|
description: Struck through text
|
|
SUPERSCRIPT:
|
|
description: Superscript text
|
|
SUBSCRIPT:
|
|
description: Subscript text
|
|
|
|
EntityType:
|
|
description: Types of entities in Gado2 annotation scheme
|
|
permissible_values:
|
|
PERSON:
|
|
description: >-
|
|
Personal names and specific references to persons (named or unnamed)
|
|
PLACE:
|
|
description: >-
|
|
Geographic locations including streets, cities, landforms, buildings
|
|
ORGANISATION:
|
|
description: >-
|
|
Organizations including companies, institutions, governments
|
|
DENOMINATION:
|
|
description: >-
|
|
Ethnicity, profession, religion, demonym, ideology - references to
|
|
groups or attributes
|
|
QUANTITY:
|
|
description: >-
|
|
Quantities including currency, counts, distances, weights
|
|
TEMPORAL_REFERENCE:
|
|
description: >-
|
|
Temporal references including dates, time periods, campaigns
|
|
TEXTUAL_REFERENCE:
|
|
description: >-
|
|
References to written sources, laws, documents, titles
|