# glam/docs/convention/schema/convention_complete_instance.yaml

# Complete Convention Instance - Version 1.4.3
# This file represents a complete instantiation of the transcription convention
# combining layout rules, transcription rules, annotation rules, and formulaic phrases

# Release identifiers; kept quoted so they always load as strings.
version: '1.4.3'
publication_date: '2022-10-04'
# Status of this convention instance.
status: 'SPECIFIED'

# Layout Rules (Section 1)
layout_rules:
  # One entry per text region type. Each entry carries:
  #   name           - identifier of the region type
  #   description    - what kind of content the region holds
  #   ordering_rules - how the region is ordered or associated with other content
  text_region_types:
    - name: PARAGRAPH
      description: >-
        The running text within the type area. This is the main body text
        of the document.
      ordering_rules: >-
        Text lines within paragraphs should be ordered top-to-bottom based
        on baseline coordinates.

    - name: PAGE_NUMBER
      description: >-
        A number—either in digits or written out—, letter or combination of
        both indicating the order of a page or folium in a book or other type
        of writing.
      ordering_rules: >-
        Page numbers are typically located at top or bottom margins and should
        be identified as separate regions.

    # HEADER and FOOTER may apply to multiple sections of a source;
    # HEADING (further below) applies to a single section only.
    - name: HEADER
      description: >-
        A general text at the top margin of a page or paragraph which can be
        assigned to multiple sections within a source.
      ordering_rules: >-
        Headers should be ordered before main body text when processing pages.

    - name: FOOTER
      description: >-
        A general text at the lower margin of a page or paragraph which can be
        assigned to multiple sections within a source.
      ordering_rules: >-
        Footers should be ordered after main body text when processing pages.

    - name: HEADING
      description: >-
        A title or index designation which only applies to a single section of
        a source, i.e., the paragraphs written directly underneath it.
      ordering_rules: >-
        Headings should be ordered immediately before the paragraphs they describe.

    - name: FOOTNOTE
      description: >-
        Indexed annotations and references which occur underneath the running text
        across multiple pages in a successive order.
      ordering_rules: >-
        Footnotes should be ordered by their index number/symbol and associated
        with their reference in main text.

    - name: TABLE
      description: >-
        Indices in which the layout of the text is more important than the syntax.
      ordering_rules: >-
        Table cells should preserve their row-column structure. Cell content
        should be ordered left-to-right, top-to-bottom.

    - name: MARGINALIA
      description: >-
        Notes, scribbles, and commentary in the margins of pages.
      ordering_rules: >-
        Marginalia should be associated with adjacent main text but marked as
        separate regions.

    # Description wording corrected: "located approximate ... it" was
    # ungrammatical; the intended meaning is "located close to ... it".
    - name: CAPTION
      description: >-
        Description of an image which is located close to—often directly
        underneath—it.
      ordering_rules: >-
        Captions should be associated with their images and ordered after the
        image they describe.

    - name: COLOPHON
      description: >-
        A piece of text or section of a page in which the author or scribes of
        a textual source are mentioned or in which the creation, place of writing,
        or the delivery of the source are specified.
      ordering_rules: >-
        Colophons typically appear at the end of documents or sections.

  # Baseline correction rules. Each rule has an id, a short description,
  # the situation it applies to, and the action to take
  # (REMOVE, SHORTEN, ADJUST, SPLIT or MERGE).
  baseline_rules:
    - rule_id: BL001
      description: 'Remove transcribed text on pages in the background'
      applies_to: >-
        Text regions that do not belong to the current page
        being transcribed
      action: REMOVE

    - rule_id: BL002
      description: 'Shorten baselines extending to decorative textual elements'
      applies_to: >-
        Baselines that incorrectly extend into decorative elements
        such as illuminated letters, flourishes, or ornamental borders
      action: SHORTEN

    - rule_id: BL003
      description: 'Add space dividers for unusually long distances between words'
      applies_to: >-
        Distances between words which are longer than usual
        considering the handwriting style
      action: ADJUST

    # BL004 and BL006 both use the SPLIT action, on baselines and on
    # text regions respectively.
    - rule_id: BL004
      description: 'Split baseline when word distance exceeds half baseline length'
      applies_to: >-
        When the distance between words extends beyond half of
        the total length of the baseline
      action: SPLIT

    - rule_id: BL005
      description: 'Connect inserted texts to main baseline'
      applies_to: >-
        Inserted texts between baselines need to be connected to the
        main baseline of which they are part. This also applies to
        Lombardic capitals.
      action: MERGE

    - rule_id: BL006
      description: 'Cut text region in half when lines cross columns'
      applies_to: >-
        Text lines that cross columns or text regions that
        extend too far
      action: SPLIT

  # Line-ordering method and the region type it applies to.
  text_line_ordering:
    method: 'coordinate-based'
    applies_to_region: 'PARAGRAPH'

# Transcription Rules (Section 2)
transcription_rules:
  # Flag marking the transcription as diplomatic.
  is_diplomatic: true

  # Character-level rules. Only a selection of rule ids is listed here
  # (CHAR001, then CHAR017-CHAR019).
  character_rules:
    - rule_id: CHAR001
      name: distinguish_c_e_a_o
      description: 'Distinguish between c, e, a, and o'
      source_characters:
        - 'c'
        - 'e'
        - 'a'
        - 'o'
      target_transcription: 'Use correct character as appears in image'

    # Rules with an empty source_characters list name no specific characters.
    - rule_id: CHAR017
      name: merge_split_as_shown
      description: 'Merge and split words as shown on page'
      source_characters: []
      target_transcription: 'Preserve word boundaries as shown in source'

    - rule_id: CHAR018
      name: case_as_shown
      description: 'Preserve case as shown, use modern rules for ambiguous cases'
      source_characters: []
      target_transcription: 'Preserve case as shown'

    - rule_id: CHAR019
      name: medial_s_transcription
      description: "Medial 's' (ſ) and short 's' both transcribed as 's'"
      source_characters:
        - 'ſ'
        - 's'
      target_transcription: 's'

  # Abbreviation rules. Each rule names a symbol, an expansion policy and a
  # list of special cases mapping a written form to its expansion.
  abbreviation_rules:
    # Both listed spellings ("et=a" and "et:a") expand to "etc.".
    - rule_id: ABB001
      abbreviation_symbol: ':'
      expansion_policy: EXPAND_SPECIAL
      description: 'Use colon for super/subscripts in handwritten sources'
      special_cases:
        - symbol: 'et=a'
          expansion: 'etc.'
          comment: 'Expand to etc. for consistency'
        - symbol: 'et:a'
          expansion: 'etc.'
          comment: 'Expand to etc. for consistency'

    # NOTE(review): the glyph in the first special case appears to be the
    # Tironian "et" sign; confirm it is the intended symbol for "ver".
    - rule_id: ABB002
      abbreviation_symbol: 'ver-symbol'
      expansion_policy: EXPAND_SPECIAL
      description: 'Expand specific abbreviation symbols'
      special_cases:
        - symbol: '⁊ (ver symbol)'
          expansion: 'ver'
          comment: 'Expand the ver abbreviation'
        - symbol: 'ende symbol'
          expansion: 'ende'
          comment: 'Expand the ende abbreviation'
        - symbol: 'per symbol'
          expansion: 'per'
          comment: 'Expand the per abbreviation'

  # Formatting rules. The two applies_to_* flags state whether a formatting
  # type is used for handwritten and/or printed sources.
  formatting_rules:
    - rule_id: FMT001
      formatting_type: STRIKETHROUGH
      applies_to_handwritten: true
      applies_to_printed: true
      transcription_method: >-
        Use cross outs when they occur. Indicate typos through
        crossing, e.g. ap~~f~~el. Registered as textStyle in PAGE-xml.

    - rule_id: FMT002
      formatting_type: UNDERLINE
      applies_to_handwritten: true
      applies_to_printed: true
      transcription_method: >-
        Underline if underlined on page. Registered as textStyle
        in PAGE-xml.

    # BOLD and ITALIC are flagged for printed sources only.
    - rule_id: FMT003
      formatting_type: BOLD
      applies_to_handwritten: false
      applies_to_printed: true
      transcription_method: 'Only for printed texts'

    - rule_id: FMT004
      formatting_type: ITALIC
      applies_to_handwritten: false
      applies_to_printed: true
      transcription_method: 'Only for printed texts'

  # Special symbol rules. symbol_unicode holds the literal character used in
  # the transcription (an empty string for SYM011).
  special_symbol_rules:
    - rule_id: SYM002
      symbol_name: negation_sign
      symbol_unicode: '¬'
      usage_context: 'End of line word breaks'
      transcription_note: >-
        Use negation sign (¬) for line-end hyphens unless joining
        compound words

    - rule_id: SYM004
      symbol_name: full_stop
      symbol_unicode: '.'
      usage_context: 'End of sentences'
      transcription_note: >-
        Lines of ink at line ends often clean the quill, not
        punctuation. Avoid transcribing these to prevent NLP distortion.

    - rule_id: SYM010
      symbol_name: flourish_of_approval
      symbol_unicode: '₰'
      usage_context: >-
        Dutch check mark in 19th-21st century texts from
        Netherlands, Indonesia, South Africa, Belgium and Dutch Caribbean
      transcription_note: >-
        Represented by German penny symbol (₰)

    # Unclear text is handled by tagging, so no symbol is given.
    - rule_id: SYM011
      symbol_name: unclear_marker
      symbol_unicode: ''
      usage_context: 'Illegible or uncertain text'
      transcription_note: >-
        Tag unclear words as 'unclear'. NEVER delete
        baseline.

# Annotation Rules (Section 3) - Gado2 Scheme
annotation_rules:
  # Scheme name and the scheme-wide flags.
  annotation_scheme: 'Gado2'
  no_double_tagging: true
  context_sensitive: true

  # One entry per entity type: description, key inclusion and exclusion
  # rules, and example tags.
  entity_types:
    # PERSON: named or unnamed specific references to persons.
    - entity_type: PERSON
      description: >-
        Personal names and specific references to persons (named
        or unnamed). Includes animals, fictional characters, religious
        figures.
      key_inclusion_rules:
        - 'Include titles only when needed for identification'
        - 'Include articles/demonstratives with specific references'
        - 'Include only given name or surname (with title, not designation)'
      key_exclusion_rules:
        - 'Do not tag abstract references (plural/indefinite)'
        - 'Do not tag pronouns'
        - 'Do not tag designations with complete names'
        - 'Do not include associated organisations/places'
      example_tags:
        - 'Gouverneur-Generaal van Starkenborgh'
        - 'de Koning van Pruisen'
        - 'zijn Vrouw'
        - 'gemelte gouvern:r'

    # PLACE: geographic locations, including coordinates.
    - entity_type: PLACE
      description: >-
        Geographic locations: streets, cities, infrastructure,
        landforms, public spaces, buildings, astronomical objects,
        coordinates.
      key_inclusion_rules:
        - 'Include relevant adjectives (directional, descriptive)'
        - 'Include metonymy (entity linking will resolve)'
        - 'Include articles for generic references'
      key_exclusion_rules:
        - 'Do not include articles for specific proper names'
        - 'Do not tag person representations of places'
      example_tags:
        - 'Indonesische eilandenrijk'
        - 'West-Javaanse Bandoeng'
        - 'Z. breete van 35 — 31 ten langte 5 — 15'

    # ORGANISATION: formally structured bodies.
    # Spelling in the description and rules follows the British form used
    # by the entity type name itself.
    - entity_type: ORGANISATION
      description: >-
        Organisations: companies, institutions, governments, military,
        sports teams, ships, religious orders.
      key_inclusion_rules:
        - Tag branches with placenames (no preposition between)
        - Tag frequently repeated organisational references
      # NOTE(review): the last two entries below read as positive instructions
      # rather than exclusions; confirm they belong in this list.
      key_exclusion_rules:
        - Do not include articles
        - Do not tag informal groups (these are denominations)
        - Tag publications as textual references in appropriate context
        - Separate placenames with prepositions
      example_tags:
        - "ING Rotterdam"
        - "Ministerie van Financiën"
        - "Stoomschip Sumatra"
        - "aen generael en raden"

    # DENOMINATION: group, role and identity designations.
    - entity_type: DENOMINATION
      description: >-
        Ethnicity, profession, religion, demonym, ideology,
        language, community references. Includes pejoratives.
      key_inclusion_rules:
        - 'Tag both adjective and noun in phrases'
        - 'Tag profession/pejorative alone (becomes person per 3.1 rules)'
      key_exclusion_rules:
        - 'Do not tag organisations (formal structure)'
        - 'Do not tag currencies (textual ref or quantity)'
        - 'Do not tag with numerals (becomes quantity)'
        - 'Do not tag associated places/orgs/persons'
      example_tags:
        - 'Islamitische gemeenschap'
        - 'Chinees'
        - 'Slaaf'
        - 'Ministers'
        - 'Volk van West-Irian'

    # QUANTITY: amounts, counts and measurements.
    - entity_type: QUANTITY
      description: >-
        Currency, merchandise, people counts, age, distance,
        weight, enumeration, measurements.
      key_inclusion_rules:
        - 'Infer single items in enumerations'
        - 'Tag denominations with numerals as quantities'
        - 'Tag travel time as distance (not temporal)'
      key_exclusion_rules:
        - 'Do not tag textual references as quantities'
        - 'Do not tag associated organisations'
      example_tags:
        - 'ƒ 1.50'
        - '23: inlandsche zieken'
        - '3 schootels, zadel, 2 stijgh beugels'
        - 'drie â vier dagen varens'

    # TEMPORAL_REFERENCE: days, dates and named periods.
    - entity_type: TEMPORAL_REFERENCE
      description: >-
        Days, dates, campaigns/wars, holidays, canonised periods,
        genitives, temporal adjectives.
      key_inclusion_rules:
        - Always tag days unless full date written
        - Tag campaigns/wars when referring to time period
      key_exclusion_rules:
        - Do not tag days when full date is present
      example_tags:
        - "Afgelopen Vrijdag"
        # Corrected from "Twee Wereldoorlog", which is not valid Dutch.
        - "Tweede Wereldoorlog"
        - "Middeleeuwen"
        - "9 dezer"
        - "eighteenth-century Europe"

    # TEXTUAL_REFERENCE: references to written sources and related items.
    - entity_type: TEXTUAL_REFERENCE
      description: >-
        Written sources, laws, titles, inventory numbers,
        accounts, currency types, URLs, policies, agreements, honours,
        flags.
      key_inclusion_rules:
        - 'Tag currency types (not amounts)'
        - 'Tag publications as textual ref in appropriate context'
        - 'Tag activities with recorded minutes'
      key_exclusion_rules:
        - 'Do not tag currency amounts'
        - 'Context determines if publication is org or textual ref'
        - 'Do not confuse document references with quantities'
      example_tags:
        - 'Spaenschen reael'
        - 'In de NRC stond ...'
        - 'artikel 156 alinea 2'
        - 'Ridder in de orde van Oranje Nassau'

# Formulaic Phrases (Section 4) - 17th Century Dutch VOC
# Formulaic phrase patterns per corpus. Bracketed placeholders in a pattern
# ([LOC], [DAT], [PER], [DEN], [ORG], [QTY], [REF]) mark entity slots;
# "example" lists concrete phrases for the pattern.
formulaic_phrases:
  - corpus: "VOC Archives (East India Company)"
    language: "Dutch"
    century: "17th Century"
    phrase_patterns:
      - pattern_id: FP_HEADER_001
        text_region_type: HEADER
        pattern: "Int [LOC] [DAT]"
        example:
          - "Int Casteel Batavia A:o 1684: 28:e november"

      - pattern_id: FP_SALUTATION_001
        text_region_type: PARAGRAPH
        pattern: "Aen d' Edele Heer [PER] [DEN] en d' Edele Heeren [ORG] Van [LOC]"
        example:
          - "Aen d' Edele Heer Johannes Camphuijs Gouverneur Generael en d' Edele Heeren Raden Van India"

      - pattern_id: FP_SHIPPING_001
        text_region_type: PARAGRAPH
        pattern: "Een [DAT] komt alhier direct uyt [LOC] te verschynen de fluyt, [ORG]"
        description: Report of ship arrival

      # Multi-line pattern. Written as literal block scalars so the line
      # breaks are real newlines; the previous "\\n" inside double quotes
      # produced a literal backslash followed by "n" instead.
      - pattern_id: FP_MUSTER_001
        text_region_type: TABLE
        pattern: |-
          Den [DEN] [PER]
          [DEN] [PER]
          sterck [QTY]
        description: Military muster list format
        example:
          - |-
            Luijtenant adolf winckelaar.
            vaandrigh Joannes van Buijtenhem
            sterck 42. Coppen

      - pattern_id: FP_CORRESPONDENCE_001
        text_region_type: PARAGRAPH
        pattern: "zijnde de [REF] van het [REF] door den [PER] aen ons gesonden"
        example:
          - "zijnde de copie van het briefie door den luijtenant grevingh aen ons gesonden"

# Metadata
# Authorship, publication dates and scope of this convention instance.
metadata:
  author: 'S.C. Kemper'
  role: 'Data Scientist'
  department: 'Dienstverlening/Services'
  institution: 'Nationaal Archief'
  email: 'simon.kemper@nationaalarchief.nl'
  # Dates are quoted so they load as strings.
  first_publication: '2021-09-04'
  current_publication: '2022-10-04'
  scope: 'Specified for 1.04.02, to be adjusted further to other archives'

  # Free-text summary of the annotation scheme, folded into a single line.
  annotation_scheme_description: >-
    The Gado2 annotation scheme extends CoNLL-2002 (4 types) to 7 entity
    types, improving recognition of enslaved people, women, and minorities
    in historical texts. It moves from 'named entities' to all entities
    (named or not), captures quantitative and temporal data outside
    tables, and recognises textual references. Dependencies between the
    7 categories improve entity disambiguation for linking to knowledge
    bases like Wikidata.