# Complete Convention Instance - Version 1.4.3
# This file represents a complete instantiation of the transcription convention,
# combining layout rules, transcription rules, annotation rules, and formulaic phrases.

# Release identifiers of this convention instance. The version and date are
# quoted so they load as strings rather than as a float/date.
version: "1.4.3"
publication_date: "2022-10-04"
status: SPECIFIED

# Layout Rules (Section 1)
layout_rules:
  # Region types a page can be segmented into; each entry pairs a definition
  # with guidance on how the region is ordered relative to other content.
  text_region_types:
    - name: PARAGRAPH
      description: >-
        The running text within the type area. This is the main body text
        of the document.
      ordering_rules: >-
        Text lines within paragraphs should be ordered top-to-bottom based
        on baseline coordinates.

    - name: PAGE_NUMBER
      description: >-
        A number (either in digits or written out), letter, or combination of
        both indicating the order of a page or folium in a book or other type
        of writing.
      ordering_rules: >-
        Page numbers are typically located at top or bottom margins and should
        be identified as separate regions.

    - name: HEADER
      description: >-
        A general text at the top margin of a page or paragraph which can be
        assigned to multiple sections within a source.
      ordering_rules: >-
        Headers should be ordered before main body text when processing pages.

    - name: FOOTER
      description: >-
        A general text at the lower margin of a page or paragraph which can be
        assigned to multiple sections within a source.
      ordering_rules: >-
        Footers should be ordered after main body text when processing pages.

    - name: HEADING
      description: >-
        A title or index designation which only applies to a single section of
        a source, i.e., the paragraphs written directly underneath it.
      ordering_rules: >-
        Headings should be ordered immediately before the paragraphs they describe.

    - name: FOOTNOTE
      description: >-
        Indexed annotations and references which occur underneath the running text
        across multiple pages in a successive order.
      ordering_rules: >-
        Footnotes should be ordered by their index number/symbol and associated
        with their reference in main text.

    - name: TABLE
      description: >-
        Indices in which the layout of the text is more important than the syntax.
      ordering_rules: >-
        Table cells should preserve their row-column structure. Cell content
        should be ordered left-to-right, top-to-bottom.

    - name: MARGINALIA
      description: >-
        Notes, scribbles, and commentary in the margins of pages.
      ordering_rules: >-
        Marginalia should be associated with adjacent main text but marked as
        separate regions.

    - name: CAPTION
      description: >-
        Description of an image which is located in proximity to—often directly
        underneath—it.
      ordering_rules: >-
        Captions should be associated with their images and ordered after the
        image they describe.

    - name: COLOPHON
      description: >-
        A piece of text or section of a page in which the author or scribes of
        a textual source are mentioned or in which the creation, place of writing,
        or the delivery of the source are specified.
      ordering_rules: >-
        Colophons typically appear at the end of documents or sections.

  # Baseline corrections. `action` names the operation applied to the matched
  # baseline or region (values used here: REMOVE, SHORTEN, ADJUST, SPLIT, MERGE).
  baseline_rules:
    - rule_id: BL001
      description: Remove transcribed text on pages in the background
      applies_to: >-
        Text regions that do not belong to the current page being transcribed
      action: REMOVE

    - rule_id: BL002
      description: Shorten baselines extending to decorative textual elements
      applies_to: >-
        Baselines that incorrectly extend into decorative elements such as
        illuminated letters, flourishes, or ornamental borders
      action: SHORTEN

    - rule_id: BL003
      description: Add space dividers for unusually long distances between words
      applies_to: >-
        Distances between words which are longer than usual considering the
        handwriting style
      action: ADJUST

    - rule_id: BL004
      description: Split baseline when word distance exceeds half baseline length
      applies_to: >-
        When the distance between words extends beyond half of the total length
        of the baseline
      action: SPLIT

    - rule_id: BL005
      description: Connect inserted texts to main baseline
      applies_to: >-
        Inserted texts between baselines need to be connected to the main baseline
        of which they are part. This also applies to Lombardic capitals.
      action: MERGE

    - rule_id: BL006
      description: Cut text region in half when lines cross columns
      applies_to: >-
        Text lines that cross columns or text regions that extend too far
      action: SPLIT

  # How text lines are sequenced inside the named region type.
  text_line_ordering:
    method: coordinate-based
    applies_to_region: PARAGRAPH

# Transcription Rules (Section 2)
transcription_rules:
  is_diplomatic: true

  character_rules:
    # Key character confusion rules
    # (rule ids are not contiguous: only CHAR001 and CHAR017-CHAR019 are listed)
    - rule_id: CHAR001
      name: distinguish_c_e_a_o
      description: Distinguish between c, e, a, and o
      source_characters: ["c", "e", "a", "o"]
      target_transcription: Use correct character as appears in image

    - rule_id: CHAR017
      name: merge_split_as_shown
      description: Merge and split words as shown on page
      source_characters: []
      target_transcription: Preserve word boundaries as shown in source

    - rule_id: CHAR018
      name: case_as_shown
      description: Preserve case as shown, use modern rules for ambiguous cases
      source_characters: []
      target_transcription: Preserve case as shown

    - rule_id: CHAR019
      name: medial_s_transcription
      description: Medial 's' (ſ) and short 's' both transcribed as 's'
      source_characters: ["ſ", "s"]
      target_transcription: "s"

  # Abbreviation handling; `special_cases` map a source symbol to its expansion.
  abbreviation_rules:
    - rule_id: ABB001
      abbreviation_symbol: ":"
      expansion_policy: EXPAND_SPECIAL
      description: Use colon for super/subscripts in handwritten sources
      special_cases:
        - symbol: "et=a"
          expansion: "etc."
          comment: "Expand to etc. for consistency"
        - symbol: "et:a"
          expansion: "etc."
          comment: "Expand to etc. for consistency"

    - rule_id: ABB002
      abbreviation_symbol: "ver-symbol"
      expansion_policy: EXPAND_SPECIAL
      description: Expand specific abbreviation symbols
      special_cases:
        - symbol: "⁊ (ver symbol)"
          expansion: "ver"
          comment: "Expand the ver abbreviation"
        - symbol: "ende symbol"
          expansion: "ende"
          comment: "Expand the ende abbreviation"
        - symbol: "per symbol"
          expansion: "per"
          comment: "Expand the per abbreviation"

  # Text styling; the two flags state whether a style is recorded for
  # handwritten and/or printed material.
  formatting_rules:
    - rule_id: FMT001
      formatting_type: STRIKETHROUGH
      applies_to_handwritten: true
      applies_to_printed: true
      transcription_method: >-
        Use cross outs when they occur. Indicate typos through crossing,
        e.g. ap~~f~~el. Registered as textStyle in PAGE-xml.

    - rule_id: FMT002
      formatting_type: UNDERLINE
      applies_to_handwritten: true
      applies_to_printed: true
      transcription_method: >-
        Underline if underlined on page. Registered as textStyle in PAGE-xml.

    - rule_id: FMT003
      formatting_type: BOLD
      applies_to_handwritten: false
      applies_to_printed: true
      transcription_method: Only for printed texts

    - rule_id: FMT004
      formatting_type: ITALIC
      applies_to_handwritten: false
      applies_to_printed: true
      transcription_method: Only for printed texts

  special_symbol_rules:
    - rule_id: SYM002
      symbol_name: negation_sign
      symbol_unicode: "¬"
      usage_context: End of line word breaks
      transcription_note: >-
        Use negation sign (¬) for line-end hyphens unless joining compound words

    - rule_id: SYM004
      symbol_name: full_stop
      symbol_unicode: "."
      usage_context: End of sentences
      transcription_note: >-
        Lines of ink at line ends often clean the quill, not punctuation.
        Avoid transcribing these to prevent NLP distortion.

    - rule_id: SYM010
      symbol_name: flourish_of_approval
      symbol_unicode: "₰"
      usage_context: >-
        Dutch check mark in 19th-21st century texts from Netherlands,
        Indonesia, South Africa, Belgium and Dutch Caribbean
      transcription_note: >-
        Represented by German penny symbol (₰)

    - rule_id: SYM011
      symbol_name: unclear_marker
      # NOTE(review): no glyph is given; per the note below, unclear text is
      # tagged rather than marked with a character. Confirm the empty string
      # is intentional.
      symbol_unicode: ""
      usage_context: Illegible or uncertain text
      transcription_note: >-
        Tag unclear words as 'unclear'. NEVER delete baseline.

# Annotation Rules (Section 3) - Gado2 Scheme
annotation_rules:
  annotation_scheme: "Gado2"
  no_double_tagging: true
  context_sensitive: true

  entity_types:
    # Summary of each entity type with key rules
    - entity_type: PERSON
      description: >-
        Personal names and specific references to persons (named or unnamed).
        Includes animals, fictional characters, religious figures.
      key_inclusion_rules:
        - Include titles only when needed for identification
        - Include articles/demonstratives with specific references
        - Include only given name or surname (with title, not designation)
      key_exclusion_rules:
        - Do not tag abstract references (plural/indefinite)
        - Do not tag pronouns
        - Do not tag designations with complete names
        - Do not include associated organisations/places
      example_tags:
        - "Gouverneur-Generaal van Starkenborgh"
        - "de Koning van Pruisen"
        - "zijn Vrouw"
        - "gemelte gouvern:r"

    - entity_type: PLACE
      description: >-
        Geographic locations: streets, cities, infrastructure, landforms,
        public spaces, buildings, astronomical objects, coordinates.
      key_inclusion_rules:
        - Include relevant adjectives (directional, descriptive)
        - Include metonymy (entity linking will resolve)
        - Include articles for generic references
      key_exclusion_rules:
        - Do not include articles for specific proper names
        - Do not tag person representations of places
      example_tags:
        - "Indonesische eilandenrijk"
        - "West-Javaanse Bandoeng"
        - "Z. breete van 35 — 31 ten langte 5 — 15"

    - entity_type: ORGANISATION
      description: >-
        Organizations: companies, institutions, governments, military,
        sports teams, ships, religious orders.
      key_inclusion_rules:
        - Tag branches with placenames (no preposition between)
        - Tag frequently repeated organizational references
      key_exclusion_rules:
        - Do not include articles
        - Do not tag informal groups (these are denominations)
        - Tag publications as textual references in appropriate context
        - Separate placenames with prepositions
      example_tags:
        - "ING Rotterdam"
        - "Ministerie van Financiën"
        - "Stoomschip Sumatra"
        - "aen generael en raden"

    - entity_type: DENOMINATION
      description: >-
        Ethnicity, profession, religion, demonym, ideology, language,
        community references. Includes pejoratives.
      key_inclusion_rules:
        - Tag both adjective and noun in phrases
        - Tag profession/pejorative alone (becomes person per 3.1 rules)
      key_exclusion_rules:
        - Do not tag organisations (formal structure)
        - Do not tag currencies (textual ref or quantity)
        - Do not tag with numerals (becomes quantity)
        - Do not tag associated places/orgs/persons
      example_tags:
        - "Islamitische gemeenschap"
        - "Chinees"
        - "Slaaf"
        - "Ministers"
        - "Volk van West-Irian"

    - entity_type: QUANTITY
      description: >-
        Currency, merchandise, people counts, age, distance, weight,
        enumeration, measurements.
      key_inclusion_rules:
        - Infer single items in enumerations
        - Tag denominations with numerals as quantities
        - Tag travel time as distance (not temporal)
      key_exclusion_rules:
        - Do not tag textual references as quantities
        - Do not tag associated organisations
      example_tags:
        - "ƒ 1.50"
        - "23: inlandsche zieken"
        - "3 schootels, zadel, 2 stijgh beugels"
        - "drie â vier dagen varens"

    - entity_type: TEMPORAL_REFERENCE
      description: >-
        Days, dates, campaigns/wars, holidays, canonised periods,
        genitives, temporal adjectives.
      key_inclusion_rules:
        - Always tag days unless full date written
        - Tag campaigns/wars when referring to time period
      key_exclusion_rules:
        - Do not tag days when full date is present
      example_tags:
        - "Afgelopen Vrijdag"
        - "Tweede Wereldoorlog"
        - "Middeleeuwen"
        - "9 dezer"
        - "eighteenth-century Europe"

    - entity_type: TEXTUAL_REFERENCE
      description: >-
        Written sources, laws, titles, inventory numbers, accounts,
        currency types, URLs, policies, agreements, honours, flags.
      key_inclusion_rules:
        - Tag currency types (not amounts)
        - Tag publications as textual ref in appropriate context
        - Tag activities with recorded minutes
      key_exclusion_rules:
        - Do not tag currency amounts
        - Context determines if publication is org or textual ref
        - Do not confuse document references with quantities
      example_tags:
        - "Spaenschen reael"
        - "In de NRC stond ..."
        - "artikel 156 alinea 2"
        - "Ridder in de orde van Oranje Nassau"

# Formulaic Phrases (Section 4) - 17th Century Dutch VOC
# Bracketed placeholders in `pattern` ([LOC], [DAT], [PER], [DEN], [ORG],
# [QTY], [REF]) mark slots; the examples show them filled with source text.
formulaic_phrases:
  - corpus: "VOC Archives (East India Company)"
    language: "Dutch"
    century: "17th Century"
    phrase_patterns:
      - pattern_id: FP_HEADER_001
        text_region_type: HEADER
        pattern: "Int [LOC] [DAT]"
        example:
          - "Int Casteel Batavia A:o 1684: 28:e november"

      - pattern_id: FP_SALUTATION_001
        text_region_type: PARAGRAPH
        pattern: "Aen d' Edele Heer [PER] [DEN] en d' Edele Heeren [ORG] Van [LOC]"
        example:
          - "Aen d' Edele Heer Johannes Camphuijs Gouverneur Generael en d' Edele Heeren Raden Van India"

      - pattern_id: FP_SHIPPING_001
        text_region_type: PARAGRAPH
        pattern: "Een [DAT] komt alhier direct uyt [LOC] te verschynen de fluyt, [ORG]"
        description: Report of ship arrival

      - pattern_id: FP_MUSTER_001
        text_region_type: TABLE
        # NOTE(review): inside a double-quoted YAML scalar `\\n` is a literal
        # backslash followed by `n`, not a line break. Confirm the consumer
        # expects this two-character escape; otherwise use `\n`.
        pattern: "Den [DEN] [PER]\\n[DEN] [PER]\\nsterck [QTY]"
        description: Military muster list format
        example:
          - "Luijtenant adolf winckelaar.\\nvaandrigh Joannes van Buijtenhem\\nsterck 42. Coppen"

      - pattern_id: FP_CORRESPONDENCE_001
        text_region_type: PARAGRAPH
        pattern: "zijnde de [REF] van het [REF] door den [PER] aen ons gesonden"
        example:
          - "zijnde de copie van het briefie door den luijtenant grevingh aen ons gesonden"

# Metadata
metadata:
  author: "S.C. Kemper"
  role: "Data Scientist"
  department: "Dienstverlening/Services"
  institution: "Nationaal Archief"
  email: "simon.kemper@nationaalarchief.nl"
  # Dates are quoted so they load as strings, not date objects.
  first_publication: "2021-09-04"
  current_publication: "2022-10-04"
  scope: "Specified for 1.04.02, to be adjusted further to other archives"

  # NOTE(review): placed under `metadata` because it carries no section
  # comment of its own; confirm against the consuming schema whether this
  # key belongs at the top level instead.
  annotation_scheme_description: >-
    The Gado2 annotation scheme extends CoNLL-2002 (4 types) to 7 entity types,
    improving recognition of enslaved people, women, and minorities in historical
    texts. It moves from 'named entities' to all entities (named or not), captures
    quantitative and temporal data outside tables, and recognises textual references.
    Dependencies between the 7 categories improve entity disambiguation for linking
    to knowledge bases like Wikidata.