# glam/docs/convention/schema/transcription_rules_instance.yaml

# Transcription Rules Instance - Section 2 of Convention v1.4.3
# Captures all diplomatic transcription rules

# Flags the rule set in this file as a diplomatic transcription.
# NOTE(review): how consumers act on this flag is not visible here.
is_diplomatic: true

character_rules:
  # Section 2.1 - Characters often mixed by computer
  # Every rule names a group of look-alike glyphs (source_characters) and
  # tells the transcriber to keep whatever the page image actually shows.

  # CHAR001: c / e / a / o
  - rule_id: CHAR001
    name: distinguish_c_e_a_o
    description: >-
      Pay attention to distinguish between c, e, a, and o
      which are often confused by HTR models
      depending on handwriting style
    source_characters:
      - 'c'
      - 'e'
      - 'a'
      - 'o'
    target_transcription: 'Use correct character as appears in image'
    examples:
      - source: 'unclear character between o and a'
        transcription: 'a'
        explanation: "Context indicates this is the letter 'a'"

  # CHAR002: m / nn / nl / en / ri
  - rule_id: CHAR002
    name: distinguish_m_nn_nl_en_ri
    description: >-
      Distinguish between m, nn, nl, en, and ri
      which have similar shapes in cursive handwriting
    source_characters:
      - 'm'
      - 'nn'
      - 'nl'
      - 'en'
      - 'ri'
    target_transcription: 'Use correct character sequence as appears in image'

  # CHAR003: n / r / ri / m
  - rule_id: CHAR003
    name: distinguish_n_r_ri_m
    description: >-
      Distinguish between n, r, ri, and m
    source_characters:
      - 'n'
      - 'r'
      - 'ri'
      - 'm'
    target_transcription: 'Use correct character as appears in image'

  # CHAR004: rn / m / ar / n / em
  - rule_id: CHAR004
    name: distinguish_rn_m_ar_n_em
    description: >-
      Distinguish between rn, m, ar, n, and em
    source_characters:
      - 'rn'
      - 'm'
      - 'ar'
      - 'n'
      - 'em'
    target_transcription: 'Use correct character sequence as appears in image'

  # CHAR005: D / o / 0 / G (digits stay quoted so they remain strings)
  - rule_id: CHAR005
    name: distinguish_D_o_0_G
    description: >-
      Distinguish between uppercase D, lowercase o,
      digit 0, and uppercase G
    source_characters:
      - 'D'
      - 'o'
      - '0'
      - 'G'
    target_transcription: 'Use correct character as appears in image'

  # CHAR006: b / h
  - rule_id: CHAR006
    name: distinguish_b_h
    description: >-
      Distinguish between lowercase b and lowercase h
    source_characters:
      - 'b'
      - 'h'
    target_transcription: 'Use correct character as appears in image'

  # CHAR007: Z / 2
  - rule_id: CHAR007
    name: distinguish_Z_2
    description: >-
      Distinguish between uppercase Z and digit 2
    source_characters:
      - 'Z'
      - '2'
    target_transcription: 'Use correct character as appears in image'

  # CHAR008: R / B
  - rule_id: CHAR008
    name: distinguish_R_B
    description: >-
      Distinguish between uppercase R and uppercase B
    source_characters:
      - 'R'
      - 'B'
    target_transcription: 'Use correct character as appears in image'

  # CHAR009: G / 6
  - rule_id: CHAR009
    name: distinguish_G_6
    description: >-
      Distinguish between uppercase G and digit 6
    source_characters:
      - 'G'
      - '6'
    target_transcription: 'Use correct character as appears in image'

  # CHAR010: thin vertical strokes (letters, digit 1, and punctuation)
  - rule_id: CHAR010
    name: distinguish_I_l_i_t_1_slash_exclamation_pipe_J_j
    description: >-
      Distinguish between uppercase I, lowercase l, lowercase i,
      lowercase t, digit 1, forward slash, exclamation mark, pipe,
      uppercase J, and lowercase j
    source_characters:
      - 'I'
      - 'l'
      - 'i'
      - 't'
      - '1'
      - '/'
      - '!'
      - '|'
      - 'J'
      - 'j'
    target_transcription: 'Use correct character as appears in image'

  # CHAR011: IJ / Y / ij / y
  - rule_id: CHAR011
    name: distinguish_IJ_Y_ij_y
    description: >-
      Distinguish between uppercase IJ ligature, uppercase Y,
      lowercase ij ligature, and lowercase y
    source_characters:
      - 'IJ'
      - 'Y'
      - 'ij'
      - 'y'
    target_transcription: 'Use correct character as appears in image'

  # CHAR012: v / y
  - rule_id: CHAR012
    name: distinguish_v_y
    description: >-
      Distinguish between lowercase v and lowercase y
    source_characters:
      - 'v'
      - 'y'
    target_transcription: 'Use correct character as appears in image'

  # CHAR013: S / 8 / 5
  - rule_id: CHAR013
    name: distinguish_S_8_5
    description: >-
      Distinguish between uppercase S, digit 8, and digit 5
    source_characters:
      - 'S'
      - '8'
      - '5'
    target_transcription: 'Use correct character as appears in image'

  # CHAR014: the pair "HI" versus a single M
  - rule_id: CHAR014
    name: distinguish_HI_M
    description: >-
      Distinguish between uppercase H and I together
      versus uppercase M
    source_characters:
      - 'HI'
      - 'M'
    target_transcription: 'Use correct characters as appears in image'

  # CHAR015: K / | / <
  - rule_id: CHAR015
    name: distinguish_K_pipe_less_than
    description: >-
      Distinguish between uppercase K, pipe, and less than symbol
    source_characters:
      - 'K'
      - '|'
      - '<'
    target_transcription: 'Use correct character as appears in image'

  # CHAR016: the sequences "ine" and "the"
  - rule_id: CHAR016
    name: distinguish_ine_the
    description: >-
      Distinguish between letter sequences 'ine' and 'the'
    source_characters:
      - 'ine'
      - 'the'
    target_transcription: 'Use correct sequence as appears in image'

  # Section 2.3 - Merge and split words
  # Word boundaries follow the page image; no character list applies, so
  # source_characters is an explicit empty list.
  - rule_id: CHAR017
    name: merge_split_as_shown
    description: >-
      Merge and split words as they are shown
      on the image of the page, not as is common
      in modern texts
    source_characters: []
    target_transcription: 'Preserve word boundaries as shown in source'

  # Section 2.4 - Upper and lowercase
  # Case follows the page image; only when the case of a letter is ambiguous
  # do modern capitalization rules decide. No character list applies, so
  # source_characters is an explicit empty list.
  - rule_id: CHAR018
    name: case_as_shown
    description: >-
      Lower- (minuscule) and uppercase (majuscule) are transcribed as shown
      on the page of the image, even if capitals occur halfway words. In case
      ambiguity arises over whether a letter is written in lower- or uppercase,
      modern orthographic rules will be used for capitalizing words and sentences.
    source_characters: []
    target_transcription: Preserve case as shown, use modern rules for ambiguous cases

  # Section 2.8 - Medial s
  # Both s forms collapse to a plain 's' in the transcription.
  - rule_id: CHAR019
    name: medial_s_transcription
    description: >-
      The medial 's' (ſ) and short 's' are not distinguished;
      both are transcribed as 's'
    source_characters:
      - 'ſ'
      - 's'
    target_transcription: 's'

  # Section 2.9 - Font
  # Script or typeface does not change the transcription.
  - rule_id: CHAR020
    name: fonts_not_distinguished
    description: >-
      Fonts like Kurrent or Antiqua are not transcribed differently
    source_characters: []
    target_transcription: 'Transcribe characters regardless of font style'

  # Section 2.12 - Diacritics
  # Ambiguous diacritics get one consistent choice per corpus; a line over
  # 'u' that only separates it from 'v' is left out.
  - rule_id: CHAR021
    name: diacritics_preserve
    description: >-
      Used in printed and handwritten texts. In case the diacritics
      are difficult to distinguish (e.g. — ~) one of the diacritics
      needs to be chosen and applied consistently within the
      handwritten corpus in which it occurs. Lines above the letter 'u'
      are not transcribed in case they are merely meant to distinguish
      the letter 'u' from the letter 'v'. This is common in early
      modern Dutch texts.
    source_characters: []
    target_transcription: 'Preserve diacritics consistently'

abbreviation_rules:
  # Section 2.6 - Super and subscript
  # A colon stands in for super-/subscript in handwritten sources; the
  # special cases list the concrete symbol-to-expansion mappings.
  - rule_id: ABB001
    abbreviation_symbol: ':'
    expansion_policy: EXPAND_SPECIAL
    description: >-
      Use a colon (:) for super- and subscripts
      in handwritten sources. Superscript is only
      transcribed for printed sources.
    special_cases:
      # Kept as written: the colon form is itself the transcription.
      - symbol: 'Ed:le'
        expansion: 'Ed:le'
        comment: 'Use colon instead of superscript notation'
      # All et cetera spellings below resolve to the same "etc.".
      - symbol: 'et=a'
        expansion: 'etc.'
        comment: 'Expand to etc. for consistency'
      - symbol: 'et:a'
        expansion: 'etc.'
        comment: 'Expand to etc. for consistency'
      - symbol: 'et cetera variations'
        expansion: 'etc.'
        comment: 'General transcription for all et cetera abbreviations'

  # Section 2.16 - Abbreviations
  # Abbreviations stay abbreviated except for the three listed symbols.
  # NOTE(review): the "ver" entry uses the glyph ⁊, which the
  # tironian_et_vs_seven entry under special_symbol_rules describes as a
  # marginalia marker - confirm the intended glyph for "ver".
  - rule_id: ABB002
    abbreviation_symbol: 'ver-symbol'
    expansion_policy: EXPAND_SPECIAL
    description: >-
      Abbreviations are not written out except in specific cases
    special_cases:
      - symbol: '⁊ (ver symbol)'
        expansion: 'ver'
        comment: 'Expand the ver abbreviation'
      - symbol: 'ende symbol'
        expansion: 'ende'
        comment: 'Expand the ende abbreviation'
      - symbol: 'per symbol'
        expansion: 'per'
        comment: 'Expand the per abbreviation'

formatting_rules:
  # Section 2.10 - Cross outs
  # Applies to both handwritten and printed material.
  - rule_id: FMT001
    formatting_type: STRIKETHROUGH
    applies_to_handwritten: true
    applies_to_printed: true
    transcription_method: >-
      Use cross outs when they occur in the image of the page.
      Indicate typos and blurred or covered letters through
      crossing them, e.g. apfel --> ap~~f~~el. Cross outs are
      registered as a textStyle within the PAGE-xmls.

  # Section 2.11 - Underlining
  # Applies to both handwritten and printed material.
  - rule_id: FMT002
    formatting_type: UNDERLINE
    applies_to_handwritten: true
    applies_to_printed: true
    transcription_method: >-
      Underline characters if they are underlined
      on the image of the page. Underlining is registered
      as a textStyle within the PAGE-xmls.

  # Section 2.12 - Bold and Italics
  # Printed material only; both flags below are false for handwriting.
  # NOTE(review): section number 2.12 is also used for Diacritics under
  # character_rules - confirm the numbering against the convention.
  - rule_id: FMT003
    formatting_type: BOLD
    applies_to_handwritten: false
    applies_to_printed: true
    transcription_method: >-
      Bold type is only transcribed for printed texts,
      not handwritten ones.

  - rule_id: FMT004
    formatting_type: ITALIC
    applies_to_handwritten: false
    applies_to_printed: true
    transcription_method: >-
      Italic type is only transcribed for printed texts,
      not handwritten ones.

special_symbol_rules:
  # Section 2.5 - Hyphens and dashes
  # Three related marks: the plain hyphen, the negation sign used for
  # end-of-line word breaks, and the longer em dash.
  - rule_id: SYM001
    symbol_name: hyphen
    symbol_unicode: '-'
    usage_context: 'Within words or at line breaks'
    transcription_note: >-
      Hyphens (-) and dashes (—) are transcribed based on
      the length and semantic meaning within the phrase
      in which they occur.

  # The examples contrast a word broken across lines (gets the negation
  # sign) with a compound word (keeps its hyphen).
  - rule_id: SYM002
    symbol_name: negation_sign
    symbol_unicode: '¬'
    usage_context: 'End of line word breaks'
    transcription_note: >-
      Hyphens at the end of a line are transcribed with a
      negation sign (¬) unless their purpose is to join two words
      or parts which happen to have split on two different lines.
    examples:
      - source: 'zon-ne'
        transcription: 'zon¬ne'
        explanation: 'Word split at line break, not a compound word'
      - source: 'zonne-energie'
        transcription: 'zonne-energie'
        explanation: 'Compound word with hyphen'

  - rule_id: SYM003
    symbol_name: em_dash
    symbol_unicode: '—'
    usage_context: 'Longer dashes for emphasis or breaks'
    transcription_note: 'Transcribe based on length in source'

  # Section 2.7 - Interpunction and punctuation
  # SYM004 warns against transcribing stray ink lines at line ends as
  # punctuation; SYM005-SYM007 list marks that must be told apart.
  - rule_id: SYM004
    symbol_name: full_stop
    symbol_unicode: "."
    usage_context: End of sentences
    transcription_note: >-
      Lines of ink at the end of text lines are rarely meant as interpunction.
      They often serve to clean the quill before dipping it into the inkwell.
      Transcribing these marks will distort natural language processing and
      should be avoided.

  - rule_id: SYM005
    symbol_name: quotation_marks
    # The list is one double-quoted scalar, so every straight double quote
    # inside it is escaped; an unescaped one ends the scalar early and makes
    # the file unparseable.
    # NOTE(review): the mark after „ is kept as a straight quote; if the
    # convention means the typographic closing quote (U+201C), use that.
    symbol_unicode: "\" \" , ' ' „ \" « » ‹ ›"
    usage_context: Direct speech and quotations
    transcription_note: Distinguish different quotation mark styles

  - rule_id: SYM006
    symbol_name: punctuation_marks
    symbol_unicode: "! ?"
    usage_context: End of sentences
    transcription_note: Take care to distinguish

  - rule_id: SYM007
    symbol_name: separators
    # The backslash is doubled because the scalar is double-quoted; the
    # parsed value contains a single backslash.
    symbol_unicode: "/ | \\ ( ) /:"
    usage_context: Various separation and grouping
    transcription_note: Take care to distinguish

  # Section 2.7 - Special symbols
  # NOTE(review): section number 2.7 is also used for "Interpunction and
  # punctuation" in this file - confirm the numbering against the convention.
  - rule_id: SYM008
    symbol_name: f_variations
    symbol_unicode: 'f ƒ fl'
    usage_context: 'Letter f and ligatures'
    transcription_note: 'Make sure to distinguish'

  # The Tironian et glyph versus the digit seven.
  - rule_id: SYM009
    symbol_name: tironian_et_vs_seven
    symbol_unicode: '⁊ 7'
    usage_context: 'Marginalia marker vs number'
    transcription_note: >-
      ⁊ (often used to refer to marginalia)
      vs 7 (the number seven)

  # Section 2.14 - Flourish of approval
  # The glyph is a stand-in: the note itself recommends replacing it because
  # the dele is transcribed with the same symbol.
  - rule_id: SYM010
    symbol_name: flourish_of_approval
    symbol_unicode: '₰'
    usage_context: >-
      Dutch check mark in 19th-21st century texts from
      Netherlands, Indonesia, South Africa, Belgium
      and Dutch Caribbean islands
    transcription_note: >-
      Represented by German penny symbol (₰). It is advised to
      substitute this symbol with a unique one in the near future,
      given that the dele—which has an opposite meaning to the
      flourish—also is transcribed as the German penny symbol.

  # Section 2.13 - Unclear words
  # These entries describe tagging or baseline handling rather than a glyph,
  # so symbol_unicode is an explicit empty string.
  - rule_id: SYM011
    symbol_name: unclear_marker
    symbol_unicode: ''
    usage_context: 'Illegible or uncertain text'
    transcription_note: >-
      Tag unclear words as 'unclear'. NEVER delete part of
      the baseline as this will inhibit subsequent
      Natural Language Processing.

  # Section 2.15 - Initials and signatures
  # Unlike unclear words, an undecipherable signature may lose its baseline.
  - rule_id: SYM012
    symbol_name: unclear_signature
    symbol_unicode: ''
    usage_context: 'Illegible signatures'
    transcription_note: >-
      Note unclear initials and signatures as 'unclear'.
      If signatures are impossible to decipher, their baseline
      can be deleted. Try to discuss this with the editing team first.

  # Section 2.2 - Baselines
  - rule_id: SYM013
    symbol_name: partial_characters
    symbol_unicode: ''
    usage_context: 'Characters outside baseline range'
    transcription_note: >-
      Characters which are displayed partially outside of
      the range of baselines, but which are part of the phrase
      should be transcribed too.