# Source path: glam/docs/convention/schema/convention_complete_instance.yaml
# Last modified: 2025-12-02 14:36:01 +01:00
#
# Note: this file contains non-ASCII characters (for example ſ, ⁊, ¬, ₰, ƒ)
# that may be confused with look-alike characters. They are part of the
# convention's symbol definitions and examples and must be preserved as-is.

# Complete Convention Instance - Version 1.4.3
# This file represents a complete instantiation of the transcription convention
# combining layout rules, transcription rules, annotation rules, and formulaic phrases
# Version of the convention captured by this instance.
version: "1.4.3"
# Publication date of this version; quoted so it stays a string rather than
# being converted to a date object by the parser.
publication_date: "2022-10-04"
# NOTE(review): the set of allowed status values is not defined in this file —
# confirm against the schema.
status: SPECIFIED
# Layout Rules (Section 1)
# Groups the text region types, the baseline correction rules, and the
# text-line ordering settings.
layout_rules:
  # Each region type carries a name, a description, and the rule that
  # determines where it falls in the reading order.
  text_region_types:
    - name: PARAGRAPH
      description: >-
        The running text within the type area. This is the main body text
        of the document.
      ordering_rules: >-
        Text lines within paragraphs should be ordered top-to-bottom based
        on baseline coordinates.
    - name: PAGE_NUMBER
      description: >-
        A number—either in digits or written out—, letter or combination of
        both indicating the order of a page or folium in a book or other type
        of writing.
      ordering_rules: >-
        Page numbers are typically located at top or bottom margins and should
        be identified as separate regions.
    - name: HEADER
      description: >-
        A general text at the top margin of a page or paragraph which can be
        assigned to multiple sections within a source.
      ordering_rules: >-
        Headers should be ordered before main body text when processing pages.
    - name: FOOTER
      description: >-
        A general text at the lower margin of a page or paragraph which can be
        assigned to multiple sections within a source.
      ordering_rules: >-
        Footers should be ordered after main body text when processing pages.
    - name: HEADING
      description: >-
        A title or index designation which only applies to a single section of
        a source, i.e., the paragraphs written directly underneath it.
      ordering_rules: >-
        Headings should be ordered immediately before the paragraphs they describe.
    - name: FOOTNOTE
      description: >-
        Indexed annotations and references which occur underneath the running text
        across multiple pages in a successive order.
      ordering_rules: >-
        Footnotes should be ordered by their index number/symbol and associated
        with their reference in main text.
    - name: TABLE
      description: >-
        Indices in which the layout of the text is more important than the syntax.
      ordering_rules: >-
        Table cells should preserve their row-column structure. Cell content
        should be ordered left-to-right, top-to-bottom.
    - name: MARGINALIA
      description: >-
        Notes, scribbles, and commentary in the margins of pages.
      ordering_rules: >-
        Marginalia should be associated with adjacent main text but marked as
        separate regions.
    - name: CAPTION
      description: >-
        Description of an image which is located in proximity to—often
        directly underneath—it.
      ordering_rules: >-
        Captions should be associated with their images and ordered after the
        image they describe.
    - name: COLOPHON
      description: >-
        A piece of text or section of a page in which the author or scribes of
        a textual source are mentioned or in which the creation, place of writing,
        or the delivery of the source are specified.
      ordering_rules: >-
        Colophons typically appear at the end of documents or sections.
  # Baseline corrections: `applies_to` describes the situation and `action`
  # names the kind of correction (REMOVE, SHORTEN, ADJUST, SPLIT, MERGE).
  baseline_rules:
    - rule_id: BL001
      description: Remove transcribed text on pages in the background
      applies_to: >-
        Text regions that do not belong to the current page being transcribed
      action: REMOVE
    - rule_id: BL002
      description: Shorten baselines extending to decorative textual elements
      applies_to: >-
        Baselines that incorrectly extend into decorative elements such as
        illuminated letters, flourishes, or ornamental borders
      action: SHORTEN
    - rule_id: BL003
      description: Add space dividers for unusually long distances between words
      applies_to: >-
        Distances between words which are longer than usual considering the
        handwriting style
      action: ADJUST
    - rule_id: BL004
      description: Split baseline when word distance exceeds half baseline length
      applies_to: >-
        When the distance between words extends beyond half of the total length
        of the baseline
      action: SPLIT
    - rule_id: BL005
      description: Connect inserted texts to main baseline
      applies_to: >-
        Inserted texts between baselines need to be connected to the main baseline
        of which they are part. This also applies to Lombardic capitals.
      action: MERGE
    - rule_id: BL006
      description: Cut text region in half when lines cross columns
      applies_to: >-
        Text lines that cross columns or text regions that extend too far
      action: SPLIT
  # NOTE(review): nesting of text_line_ordering under layout_rules is
  # reconstructed from the section layout — confirm against the schema.
  text_line_ordering:
    method: coordinate-based
    applies_to_region: PARAGRAPH
# Transcription Rules (Section 2)
# Character, abbreviation, formatting, and special-symbol rules.
transcription_rules:
  is_diplomatic: true
  character_rules:
    # Key character confusion rules
    # (rule ids are not contiguous; only a subset of rules is listed here)
    - rule_id: CHAR001
      name: distinguish_c_e_a_o
      description: Distinguish between c, e, a, and o
      source_characters: ["c", "e", "a", "o"]
      target_transcription: Use correct character as appears in image
    - rule_id: CHAR017
      name: merge_split_as_shown
      description: Merge and split words as shown on page
      source_characters: []
      target_transcription: Preserve word boundaries as shown in source
    - rule_id: CHAR018
      name: case_as_shown
      description: Preserve case as shown, use modern rules for ambiguous cases
      source_characters: []
      target_transcription: Preserve case as shown
    - rule_id: CHAR019
      name: medial_s_transcription
      description: Medial 's' (ſ) and short 's' both transcribed as 's'
      source_characters: ["ſ", "s"]
      target_transcription: "s"
  # Each abbreviation rule lists its own special cases as
  # symbol / expansion / comment triples.
  abbreviation_rules:
    - rule_id: ABB001
      abbreviation_symbol: ":"
      expansion_policy: EXPAND_SPECIAL
      description: Use colon for super/subscripts in handwritten sources
      special_cases:
        - symbol: "et=a"
          expansion: "etc."
          comment: "Expand to etc. for consistency"
        - symbol: "et:a"
          expansion: "etc."
          comment: "Expand to etc. for consistency"
    - rule_id: ABB002
      abbreviation_symbol: "ver-symbol"
      expansion_policy: EXPAND_SPECIAL
      description: Expand specific abbreviation symbols
      special_cases:
        - symbol: "⁊ (ver symbol)"
          expansion: "ver"
          comment: "Expand the ver abbreviation"
        - symbol: "ende symbol"
          expansion: "ende"
          comment: "Expand the ende abbreviation"
        - symbol: "per symbol"
          expansion: "per"
          comment: "Expand the per abbreviation"
  # The two applies_to_* flags state whether a formatting type is transcribed
  # for handwritten and/or printed sources.
  formatting_rules:
    - rule_id: FMT001
      formatting_type: STRIKETHROUGH
      applies_to_handwritten: true
      applies_to_printed: true
      transcription_method: >-
        Use cross outs when they occur. Indicate typos through crossing,
        e.g. ap~~f~~el. Registered as textStyle in PAGE-xml.
    - rule_id: FMT002
      formatting_type: UNDERLINE
      applies_to_handwritten: true
      applies_to_printed: true
      transcription_method: >-
        Underline if underlined on page. Registered as textStyle in PAGE-xml.
    - rule_id: FMT003
      formatting_type: BOLD
      applies_to_handwritten: false
      applies_to_printed: true
      transcription_method: Only for printed texts
    - rule_id: FMT004
      formatting_type: ITALIC
      applies_to_handwritten: false
      applies_to_printed: true
      transcription_method: Only for printed texts
  special_symbol_rules:
    - rule_id: SYM002
      symbol_name: negation_sign
      symbol_unicode: "¬"
      usage_context: End of line word breaks
      transcription_note: >-
        Use negation sign (¬) for line-end hyphens unless joining compound words
    - rule_id: SYM004
      symbol_name: full_stop
      symbol_unicode: "."
      usage_context: End of sentences
      transcription_note: >-
        Lines of ink at line ends often clean the quill, not punctuation.
        Avoid transcribing these to prevent NLP distortion.
    - rule_id: SYM010
      symbol_name: flourish_of_approval
      symbol_unicode: "₰"
      usage_context: >-
        Dutch check mark in 19th-21st century texts from Netherlands,
        Indonesia, South Africa, Belgium and Dutch Caribbean
      transcription_note: >-
        Represented by German penny symbol (₰)
    - rule_id: SYM011
      symbol_name: unclear_marker
      # Explicit empty string: unclear text is marked with a tag (see the
      # transcription note), not with a character.
      symbol_unicode: ""
      usage_context: Illegible or uncertain text
      transcription_note: >-
        Tag unclear words as 'unclear'. NEVER delete baseline.
# Annotation Rules (Section 3) - Gado2 Scheme
# Scheme-wide flags followed by one entry per entity type.
annotation_rules:
  annotation_scheme: "Gado2"
  no_double_tagging: true
  context_sensitive: true
  entity_types:
    # Summary of each entity type with key rules
    # Every entry has a description, inclusion rules, exclusion rules,
    # and example tags.
    - entity_type: PERSON
      description: >-
        Personal names and specific references to persons (named or unnamed).
        Includes animals, fictional characters, religious figures.
      key_inclusion_rules:
        - Include titles only when needed for identification
        - Include articles/demonstratives with specific references
        - Include only given name or surname (with title, not designation)
      key_exclusion_rules:
        - Do not tag abstract references (plural/indefinite)
        - Do not tag pronouns
        - Do not tag designations with complete names
        - Do not include associated organisations/places
      example_tags:
        - "Gouverneur-Generaal van Starkenborgh"
        - "de Koning van Pruisen"
        - "zijn Vrouw"
        - "gemelte gouvern:r"
    - entity_type: PLACE
      description: >-
        Geographic locations: streets, cities, infrastructure, landforms,
        public spaces, buildings, astronomical objects, coordinates.
      key_inclusion_rules:
        - Include relevant adjectives (directional, descriptive)
        - Include metonymy (entity linking will resolve)
        - Include articles for generic references
      key_exclusion_rules:
        - Do not include articles for specific proper names
        - Do not tag person representations of places
      example_tags:
        - "Indonesische eilandenrijk"
        - "West-Javaanse Bandoeng"
        - "Z. breete van 35 — 31 ten langte 5 — 15"
    - entity_type: ORGANISATION
      description: >-
        Organizations: companies, institutions, governments, military,
        sports teams, ships, religious orders.
      key_inclusion_rules:
        - Tag branches with placenames (no preposition between)
        - Tag frequently repeated organizational references
      key_exclusion_rules:
        - Do not include articles
        - Do not tag informal groups (these are denominations)
        - Tag publications as textual references in appropriate context
        - Separate placenames with prepositions
      example_tags:
        - "ING Rotterdam"
        - "Ministerie van Financiën"
        - "Stoomschip Sumatra"
        - "aen generael en raden"
    - entity_type: DENOMINATION
      description: >-
        Ethnicity, profession, religion, demonym, ideology, language,
        community references. Includes pejoratives.
      key_inclusion_rules:
        - Tag both adjective and noun in phrases
        - Tag profession/pejorative alone (becomes person per 3.1 rules)
      key_exclusion_rules:
        - Do not tag organisations (formal structure)
        - Do not tag currencies (textual ref or quantity)
        - Do not tag with numerals (becomes quantity)
        - Do not tag associated places/orgs/persons
      example_tags:
        - "Islamitische gemeenschap"
        - "Chinees"
        - "Slaaf"
        - "Ministers"
        - "Volk van West-Irian"
    - entity_type: QUANTITY
      description: >-
        Currency, merchandise, people counts, age, distance, weight,
        enumeration, measurements.
      key_inclusion_rules:
        - Infer single items in enumerations
        - Tag denominations with numerals as quantities
        - Tag travel time as distance (not temporal)
      key_exclusion_rules:
        - Do not tag textual references as quantities
        - Do not tag associated organisations
      example_tags:
        - "ƒ 1.50"
        - "23: inlandsche zieken"
        - "3 schootels, zadel, 2 stijgh beugels"
        - "drie â vier dagen varens"
    - entity_type: TEMPORAL_REFERENCE
      description: >-
        Days, dates, campaigns/wars, holidays, canonised periods,
        genitives, temporal adjectives.
      key_inclusion_rules:
        - Always tag days unless full date written
        - Tag campaigns/wars when referring to time period
      key_exclusion_rules:
        - Do not tag days when full date is present
      example_tags:
        - "Afgelopen Vrijdag"
        - "Tweede Wereldoorlog"
        - "Middeleeuwen"
        - "9 dezer"
        - "eighteenth-century Europe"
    - entity_type: TEXTUAL_REFERENCE
      description: >-
        Written sources, laws, titles, inventory numbers, accounts,
        currency types, URLs, policies, agreements, honours, flags.
      key_inclusion_rules:
        - Tag currency types (not amounts)
        - Tag publications as textual ref in appropriate context
        - Tag activities with recorded minutes
      key_exclusion_rules:
        - Do not tag currency amounts
        - Context determines if publication is org or textual ref
        - Do not confuse document references with quantities
      example_tags:
        - "Spaenschen reael"
        - "In de NRC stond ..."
        - "artikel 156 alinea 2"
        - "Ridder in de orde van Oranje Nassau"
# Formulaic Phrases (Section 4) - 17th Century Dutch VOC
# One entry per corpus; each corpus lists its phrase patterns. Bracketed
# placeholders in `pattern` (e.g. [LOC], [PER]) stand in for the varying
# parts shown in the matching `example`.
formulaic_phrases:
  - corpus: "VOC Archives (East India Company)"
    language: "Dutch"
    century: "17th Century"
    phrase_patterns:
      - pattern_id: FP_HEADER_001
        text_region_type: HEADER
        pattern: "Int [LOC] [DAT]"
        example:
          - "Int Casteel Batavia A:o 1684: 28:e november"
      - pattern_id: FP_SALUTATION_001
        text_region_type: PARAGRAPH
        pattern: "Aen d' Edele Heer [PER] [DEN] en d' Edele Heeren [ORG] Van [LOC]"
        example:
          - "Aen d' Edele Heer Johannes Camphuijs Gouverneur Generael en d' Edele Heeren Raden Van India"
      - pattern_id: FP_SHIPPING_001
        text_region_type: PARAGRAPH
        pattern: "Een [DAT] komt alhier direct uyt [LOC] te verschynen de fluyt, [ORG]"
        description: Report of ship arrival
      # In double-quoted scalars `\\n` is an escaped backslash followed by `n`,
      # i.e. the two-character marker \n, not an actual line break.
      # NOTE(review): confirm that consumers expect the literal marker.
      - pattern_id: FP_MUSTER_001
        text_region_type: TABLE
        pattern: "Den [DEN] [PER]\\n[DEN] [PER]\\nsterck [QTY]"
        description: Military muster list format
        example:
          - "Luijtenant adolf winckelaar.\\nvaandrigh Joannes van Buijtenhem\\nsterck 42. Coppen"
      - pattern_id: FP_CORRESPONDENCE_001
        text_region_type: PARAGRAPH
        pattern: "zijnde de [REF] van het [REF] door den [PER] aen ons gesonden"
        example:
          - "zijnde de copie van het briefie door den luijtenant grevingh aen ons gesonden"
# Metadata
# Authorship, publication dates, and scope of the convention.
metadata:
  author: "S.C. Kemper"
  role: "Data Scientist"
  department: "Dienstverlening/Services"
  institution: "Nationaal Archief"
  email: "simon.kemper@nationaalarchief.nl"
  # Dates are quoted so they stay strings rather than being converted to
  # date objects by the parser.
  first_publication: "2021-09-04"
  current_publication: "2022-10-04"
  scope: "Specified for 1.04.02, to be adjusted further to other archives"
  # NOTE(review): placement under metadata is reconstructed from key order —
  # confirm against the schema whether this belongs at the top level instead.
  annotation_scheme_description: >-
    The Gado2 annotation scheme extends CoNLL-2002 (4 types) to 7 entity types,
    improving recognition of enslaved people, women, and minorities in historical
    texts. It moves from 'named entities' to all entities (named or not), captures
    quantitative and temporal data outside tables, and recognises textual references.
    Dependencies between the 7 categories improve entity disambiguation for linking
    to knowledge bases like Wikidata.