352 lines
13 KiB
YAML
352 lines
13 KiB
YAML
# Transcription Rules Instance - Section 2 of Convention v1.4.3
|
||
# Captures all diplomatic transcription rules
|
||
|
||
is_diplomatic: true
|
||
|
||
character_rules:
|
||
# Section 2.1 - Characters often mixed by computer
|
||
- rule_id: CHAR001
|
||
name: distinguish_c_e_a_o
|
||
description: >-
|
||
Pay attention to distinguish between c, e, a, and o which are often
|
||
confused by HTR models depending on handwriting style
|
||
source_characters: ["c", "e", "a", "o"]
|
||
target_transcription: Use correct character as appears in image
|
||
examples:
|
||
- source: "unclear character between o and a"
|
||
transcription: "a"
|
||
explanation: "Context indicates this is the letter 'a'"
|
||
|
||
- rule_id: CHAR002
|
||
name: distinguish_m_nn_nl_en_ri
|
||
description: >-
|
||
Distinguish between m, nn, nl, en, and ri which have similar shapes
|
||
in cursive handwriting
|
||
source_characters: ["m", "nn", "nl", "en", "ri"]
|
||
target_transcription: Use correct character sequence as appears in image
|
||
|
||
- rule_id: CHAR003
|
||
name: distinguish_n_r_ri_m
|
||
description: Distinguish between n, r, ri, and m
|
||
source_characters: ["n", "r", "ri", "m"]
|
||
target_transcription: Use correct character as appears in image
|
||
|
||
- rule_id: CHAR004
|
||
name: distinguish_rn_m_ar_n_em
|
||
description: Distinguish between rn, m, ar, n, and em
|
||
source_characters: ["rn", "m", "ar", "n", "em"]
|
||
target_transcription: Use correct character sequence as appears in image
|
||
|
||
- rule_id: CHAR005
|
||
name: distinguish_D_o_0_G
|
||
description: Distinguish between uppercase D, lowercase o, digit 0, and uppercase G
|
||
source_characters: ["D", "o", "0", "G"]
|
||
target_transcription: Use correct character as appears in image
|
||
|
||
- rule_id: CHAR006
|
||
name: distinguish_b_h
|
||
description: Distinguish between lowercase b and lowercase h
|
||
source_characters: ["b", "h"]
|
||
target_transcription: Use correct character as appears in image
|
||
|
||
- rule_id: CHAR007
|
||
name: distinguish_Z_2
|
||
description: Distinguish between uppercase Z and digit 2
|
||
source_characters: ["Z", "2"]
|
||
target_transcription: Use correct character as appears in image
|
||
|
||
- rule_id: CHAR008
|
||
name: distinguish_R_B
|
||
description: Distinguish between uppercase R and uppercase B
|
||
source_characters: ["R", "B"]
|
||
target_transcription: Use correct character as appears in image
|
||
|
||
- rule_id: CHAR009
|
||
name: distinguish_G_6
|
||
description: Distinguish between uppercase G and digit 6
|
||
source_characters: ["G", "6"]
|
||
target_transcription: Use correct character as appears in image
|
||
|
||
- rule_id: CHAR010
|
||
name: distinguish_I_l_i_t_1_slash_exclamation_pipe_J_j
|
||
description: >-
|
||
Distinguish between uppercase I, lowercase l, lowercase i, lowercase t,
|
||
digit 1, forward slash, exclamation mark, pipe, uppercase J, and lowercase j
|
||
source_characters: ["I", "l", "i", "t", "1", "/", "!", "|", "J", "j"]
|
||
target_transcription: Use correct character as appears in image
|
||
|
||
- rule_id: CHAR011
|
||
name: distinguish_IJ_Y_ij_y
|
||
description: >-
|
||
Distinguish between uppercase IJ ligature, uppercase Y, lowercase ij
|
||
ligature, and lowercase y
|
||
source_characters: ["IJ", "Y", "ij", "y"]
|
||
target_transcription: Use correct character as appears in image
|
||
|
||
- rule_id: CHAR012
|
||
name: distinguish_v_y
|
||
description: Distinguish between lowercase v and lowercase y
|
||
source_characters: ["v", "y"]
|
||
target_transcription: Use correct character as appears in image
|
||
|
||
- rule_id: CHAR013
|
||
name: distinguish_S_8_5
|
||
description: Distinguish between uppercase S, digit 8, and digit 5
|
||
source_characters: ["S", "8", "5"]
|
||
target_transcription: Use correct character as appears in image
|
||
|
||
- rule_id: CHAR014
|
||
name: distinguish_HI_M
|
||
description: Distinguish between uppercase H and I together versus uppercase M
|
||
source_characters: ["HI", "M"]
|
||
target_transcription: Use correct characters as they appear in image
|
||
|
||
- rule_id: CHAR015
|
||
name: distinguish_K_pipe_less_than
|
||
description: Distinguish between uppercase K, pipe, and less than symbol
|
||
source_characters: ["K", "|", "<"]
|
||
target_transcription: Use correct character as appears in image
|
||
|
||
- rule_id: CHAR016
|
||
name: distinguish_ine_the
|
||
description: Distinguish between letter sequences 'ine' and 'the'
|
||
source_characters: ["ine", "the"]
|
||
target_transcription: Use correct sequence as appears in image
|
||
|
||
# Section 2.3 - Merge and split words
|
||
- rule_id: CHAR017
|
||
name: merge_split_as_shown
|
||
description: >-
|
||
Merge and split words as they are shown on the image of the page,
|
||
not as is common in modern texts
|
||
source_characters: []
|
||
target_transcription: Preserve word boundaries as shown in source
|
||
|
||
# Section 2.4 - Upper and lowercase
|
||
- rule_id: CHAR018
|
||
name: case_as_shown
|
||
description: >-
|
||
Lower- (minuscule) and uppercase (majuscule) are transcribed as shown
|
||
on the page of the image, even if capitals occur halfway through words. In case
|
||
ambiguity arises over whether a letter is written in lower- or uppercase,
|
||
modern orthographic rules will be used for capitalizing words and sentences.
|
||
source_characters: []
|
||
target_transcription: Preserve case as shown, use modern rules for ambiguous cases
|
||
|
||
# Section 2.8 - Medial s
|
||
- rule_id: CHAR019
|
||
name: medial_s_transcription
|
||
description: >-
|
||
The medial 's' (ſ) and short 's' are not distinguished; both are
|
||
transcribed as 's'
|
||
source_characters: ["ſ", "s"]
|
||
target_transcription: "s"
|
||
|
||
# Section 2.9 - Font
|
||
- rule_id: CHAR020
|
||
name: fonts_not_distinguished
|
||
description: Fonts like Kurrent or Antiqua are not transcribed differently
|
||
source_characters: []
|
||
target_transcription: Transcribe characters regardless of font style
|
||
|
||
# Section 2.12 - Diacritics
|
||
- rule_id: CHAR021
|
||
name: diacritics_preserve
|
||
description: >-
|
||
Diacritics are used in printed and handwritten texts. In case the diacritics are difficult
|
||
to distinguish (e.g. — ~) one of the diacritics needs to be chosen and
|
||
applied consistently within the handwritten corpus in which it occurs.
|
||
Lines above the letter 'u' are not transcribed in case they are merely
|
||
meant to distinguish the letter 'u' from the letter 'v'. This is common
|
||
in early modern Dutch texts.
|
||
source_characters: []
|
||
target_transcription: Preserve diacritics consistently
|
||
|
||
abbreviation_rules:
|
||
# Section 2.6 - Super and subscript
|
||
- rule_id: ABB001
|
||
abbreviation_symbol: ":"
|
||
expansion_policy: EXPAND_SPECIAL
|
||
description: >-
|
||
Use a colon (:) for super- and subscripts in handwritten sources.
|
||
Superscript is only transcribed for printed sources.
|
||
special_cases:
|
||
- symbol: "Ed:le"
|
||
expansion: "Ed:le"
|
||
comment: "Use colon instead of superscript notation"
|
||
- symbol: "et=a"
|
||
expansion: "etc."
|
||
comment: "Expand to etc. for consistency"
|
||
- symbol: "et:a"
|
||
expansion: "etc."
|
||
comment: "Expand to etc. for consistency"
|
||
- symbol: "et cetera variations"
|
||
expansion: "etc."
|
||
comment: "General transcription for all et cetera abbreviations"
|
||
|
||
# Section 2.16 - Abbreviations
|
||
- rule_id: ABB002
|
||
abbreviation_symbol: "ver-symbol"
|
||
expansion_policy: EXPAND_SPECIAL
|
||
description: Abbreviations are not written out except in specific cases
|
||
special_cases:
|
||
- symbol: "⁊ (ver symbol)"
|
||
expansion: "ver"
|
||
comment: "Expand the ver abbreviation"
|
||
- symbol: "ende symbol"
|
||
expansion: "ende"
|
||
comment: "Expand the ende abbreviation"
|
||
- symbol: "per symbol"
|
||
expansion: "per"
|
||
comment: "Expand the per abbreviation"
|
||
|
||
formatting_rules:
|
||
# Section 2.10 - Cross outs
|
||
- rule_id: FMT001
|
||
formatting_type: STRIKETHROUGH
|
||
applies_to_handwritten: true
|
||
applies_to_printed: true
|
||
transcription_method: >-
|
||
Use cross outs when they occur in the image of the page. Indicate typos
|
||
and blurred or covered letters by crossing them out, e.g. apfel --> ap~~f~~el.
|
||
Cross outs are registered as a textStyle within the PAGE-xmls.
|
||
|
||
# Section 2.11 - Underlining
|
||
- rule_id: FMT002
|
||
formatting_type: UNDERLINE
|
||
applies_to_handwritten: true
|
||
applies_to_printed: true
|
||
transcription_method: >-
|
||
Underline characters if they are underlined on the image of the page.
|
||
Underlining is registered as a textStyle within the PAGE-xmls.
|
||
|
||
# Section 2.12 - Bold and Italics
|
||
- rule_id: FMT003
|
||
formatting_type: BOLD
|
||
applies_to_handwritten: false
|
||
applies_to_printed: true
|
||
transcription_method: >-
|
||
Bold type is only transcribed for printed texts, not handwritten ones.
|
||
|
||
- rule_id: FMT004
|
||
formatting_type: ITALIC
|
||
applies_to_handwritten: false
|
||
applies_to_printed: true
|
||
transcription_method: >-
|
||
Italic type is only transcribed for printed texts, not handwritten ones.
|
||
|
||
special_symbol_rules:
|
||
# Section 2.5 - Hyphens and dashes
|
||
- rule_id: SYM001
|
||
symbol_name: hyphen
|
||
symbol_unicode: "-"
|
||
usage_context: Within words or at line breaks
|
||
transcription_note: >-
|
||
Hyphens (-) and dashes (—) are transcribed based on the length and
|
||
semantic meaning within the phrase in which they occur.
|
||
|
||
- rule_id: SYM002
|
||
symbol_name: negation_sign
|
||
symbol_unicode: "¬"
|
||
usage_context: End of line word breaks
|
||
transcription_note: >-
|
||
Hyphens at the end of a line are transcribed with a negation sign (¬)
|
||
unless their purpose is to join two words or parts which happen to have
|
||
been split across two different lines.
|
||
examples:
|
||
- source: "zon-ne"
|
||
transcription: "zon¬ne"
|
||
explanation: "Word split at line break, not a compound word"
|
||
- source: "zonne-energie"
|
||
transcription: "zonne-energie"
|
||
explanation: "Compound word with hyphen"
|
||
|
||
- rule_id: SYM003
|
||
symbol_name: em_dash
|
||
symbol_unicode: "—"
|
||
usage_context: Longer dashes for emphasis or breaks
|
||
transcription_note: Transcribe based on length in source
|
||
|
||
# Section 2.7 - Interpunction and punctuation
|
||
- rule_id: SYM004
|
||
symbol_name: full_stop
|
||
symbol_unicode: "."
|
||
usage_context: End of sentences
|
||
transcription_note: >-
|
||
Lines of ink at the end of text lines are rarely meant as interpunction.
|
||
They often serve to clean the quill before dipping it into the inkwell.
|
||
Transcribing these marks will distort natural language processing and
|
||
should be avoided.
|
||
|
||
- rule_id: SYM005
|
||
symbol_name: quotation_marks
|
||
symbol_unicode: "\" \" , ' ' „ \" « » ‹ ›"
|
||
usage_context: Direct speech and quotations
|
||
transcription_note: Distinguish different quotation mark styles
|
||
|
||
- rule_id: SYM006
|
||
symbol_name: punctuation_marks
|
||
symbol_unicode: "! ?"
|
||
usage_context: End of sentences
|
||
transcription_note: Take care to distinguish the exclamation mark from the question mark
|
||
|
||
- rule_id: SYM007
|
||
symbol_name: separators
|
||
symbol_unicode: "/ | \\ ( ) /:"
|
||
usage_context: Various separation and grouping
|
||
transcription_note: Take care to distinguish these separator symbols from one another
|
||
|
||
# Section 2.7 - Special symbols
|
||
- rule_id: SYM008
|
||
symbol_name: f_variations
|
||
symbol_unicode: "f ƒ fl"
|
||
usage_context: Letter f and ligatures
|
||
transcription_note: Make sure to distinguish
|
||
|
||
- rule_id: SYM009
|
||
symbol_name: tironian_et_vs_seven
|
||
symbol_unicode: "⁊ 7"
|
||
usage_context: Marginalia marker vs number
|
||
transcription_note: >-
|
||
⁊ (often used to refer to marginalia) vs 7 (the number seven)
|
||
|
||
# Section 2.14 - Flourish of approval
|
||
- rule_id: SYM010
|
||
symbol_name: flourish_of_approval
|
||
symbol_unicode: "₰"
|
||
usage_context: >-
|
||
Dutch check mark in 19th-21st century texts from the Netherlands, Indonesia,
|
||
South Africa, Belgium and Dutch Caribbean islands
|
||
transcription_note: >-
|
||
Represented by German penny symbol (₰). It is advised to substitute this
|
||
symbol with a unique one in the near future, given that the dele—which has
|
||
the opposite meaning to the flourish—is also transcribed as the German penny
|
||
symbol.
|
||
|
||
# Section 2.13 - Unclear words
|
||
- rule_id: SYM011
|
||
symbol_name: unclear_marker
|
||
symbol_unicode: ""
|
||
usage_context: Illegible or uncertain text
|
||
transcription_note: >-
|
||
Tag unclear words as 'unclear'. NEVER delete part of the baseline as this
|
||
will inhibit subsequent Natural Language Processing.
|
||
|
||
# Section 2.15 - Initials and signatures
|
||
- rule_id: SYM012
|
||
symbol_name: unclear_signature
|
||
symbol_unicode: ""
|
||
usage_context: Illegible signatures
|
||
transcription_note: >-
|
||
Note unclear initials and signatures as 'unclear'. If signatures are
|
||
impossible to decipher, their baseline can be deleted. Try to discuss
|
||
this with the editing team first.
|
||
|
||
# Section 2.2 - Baselines
|
||
- rule_id: SYM013
|
||
symbol_name: partial_characters
|
||
symbol_unicode: ""
|
||
usage_context: Characters outside baseline range
|
||
transcription_note: >-
|
||
Characters which are displayed partially outside of the range of baselines,
|
||
but which are part of the phrase should be transcribed too.
|