# Listing metadata (from file viewer): 3159 lines, 117 KiB, YAML
# =============================================================================
|
|
# GLAM-NER ENTITY ANNOTATION CONVENTION
|
|
# Module: Document Structure and Namespace Paths
|
|
# =============================================================================
|
|
#
|
|
# VERSION: 1.7.0
|
|
# STATUS: STABLE
|
|
# LAST_UPDATED: 2025-12-02
|
|
#
|
|
# DESCRIPTION:
|
|
# Comprehensive document structure annotation module defining the DOC hypernym
|
|
# and all document region types. Critical for understanding entity context
|
|
# within structured documents.
|
|
#
|
|
# CRITICAL: Namespaces and document paths are ESSENTIAL for clustering entities
|
|
# and distinguishing context-specific entity relationships.
|
|
#
|
|
# DEPENDENCIES:
|
|
# - core/namespaces.yaml (namespace prefixes)
|
|
# - core/convention.yaml (base provenance model)
|
|
#
|
|
# CONTENTS:
|
|
# - Purpose and design principles
|
|
# - Layout Semantic Ontology (DOC hypernym with ~25 region types)
|
|
# - Semantic role enumeration
|
|
# - Nested provenance model (layout claims + entity claims)
|
|
# - Structural contexts for clustering
|
|
# - Format-specific path conventions (PAGE-XML, HTML, JSON, plain text)
|
|
# - Clustering strategies
|
|
# - Provenance requirements
|
|
#
|
|
# =============================================================================
|
|
|
|
# Identity and release metadata for this convention module.
module:
  id: document_structure
  name: 'Document Structure and Namespace Paths'
  # Quoted so the release number is always read as a string.
  version: '1.7.0'
  status: stable
  category: advanced
|
|
|
|
imports:
|
|
- ../core/namespaces.yaml
|
|
- ../core/convention.yaml
|
|
|
|
# =============================================================================
|
|
# PURPOSE AND DESIGN PRINCIPLES
|
|
# =============================================================================
|
|
|
|
purpose: |
|
|
Entity annotations occur within STRUCTURED DOCUMENTS - not flat text streams.
|
|
The LOCATION of an entity within document structure determines:
|
|
|
|
1. SEMANTIC SCOPE: An entity in a header governs entities in subsequent paragraphs
|
|
2. CO-REFERENCE CLUSTERS: Entities in the same structural unit likely co-refer
|
|
3. RELATIONSHIP CONTEXT: Header-paragraph relations differ from paragraph-paragraph
|
|
4. PROVENANCE PRECISION: XPath/JSONPath enables exact location for verification
|
|
|
|
Without namespace paths, entity extraction loses critical context that
|
|
distinguishes "the director" in a museum description from "the director"
|
|
in a film credits section of the same document.
|
|
|
|
design_principles:
|
|
- principle: "STRUCTURE IS MEANING"
|
|
description: |
|
|
Document structure (headers, sections, paragraphs, lists) carries semantic
|
|
information. A person mentioned in a "Board of Directors" section has a
|
|
different relationship to the organization than a person mentioned in a
|
|
"Historical Overview" section.
|
|
|
|
- principle: "PATHS ENABLE CLUSTERING"
|
|
description: |
|
|
Entities sharing a common path prefix belong to the same structural context.
|
|
Clustering by path enables:
|
|
- Scoped entity resolution (disambiguate within section before document)
|
|
- Contextual relationship inference (section membership implies relationship)
|
|
- Provenance aggregation (all claims from same region share reliability)
|
|
|
|
- principle: "NAMESPACES PREVENT COLLISION"
|
|
description: |
|
|
The same entity mention in different structural contexts may require
|
|
different annotations or link to different knowledge base entities.
|
|
Namespaces ensure annotations are addressable without ambiguity.
|
|
|
|
- principle: "LAYOUT INFORMS SEMANTICS"
|
|
description: |
|
|
Visual layout (sidebars, captions, footnotes, marginalia) carries meaning
|
|
distinct from main body text. PAGE-XML text regions, HTML semantic elements,
|
|
and JSON structural keys all encode layout that affects interpretation.
|
|
|
|
# =============================================================================
|
|
# LAYOUT SEMANTIC ONTOLOGY
|
|
# =============================================================================
|
|
# Comprehensive hypernym classes and properties for document structure annotation.
|
|
# These classes are format-agnostic and apply to PAGE-XML, HTML, JSON, MD, EPUB, PDF.
|
|
#
|
|
# PRIMARY AUTHORITIES:
|
|
# - W3C Web Annotation Data Model (oa:) - annotation targeting and selectors
|
|
# - CIDOC-CRM (crm:) - information objects and carriers
|
|
# - RiC-O (rico:) - record parts and structure
|
|
# - PREMIS (premis:) - digital object hierarchy
|
|
# - Dublin Core (dcterms:) - part/whole relationships
|
|
# - NIF 2.0 (nif:) - text annotation interchange
|
|
# =============================================================================
|
|
|
|
layout_semantic_ontology:
|
|
|
|
purpose: |
|
|
This ontology defines semantic classes for document layout elements that
|
|
generative AI models can use to annotate/describe structural context.
|
|
|
|
CRITICAL: Layout claims are DISTINCT from entity claims. A complete annotation
|
|
requires TWO NESTED LAYERS of provenance:
|
|
|
|
1. LAYOUT CLAIM: "This text region is a sidebar"
|
|
- Has its own provenance (XPath, timestamp, confidence, agent)
|
|
- May be uncertain (is this a sidebar or a marginalia?)
|
|
|
|
2. ENTITY CLAIM: "This sidebar contains person name 'Rembrandt'"
|
|
- Has its own provenance (span offsets, NER model, confidence)
|
|
- REFERENCES the layout claim as context
|
|
|
|
This separation enables:
|
|
- Independent validation of layout vs. entity extraction
|
|
- Different confidence levels for structure vs. content
|
|
- Reasoning about how layout affects entity interpretation
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HYPERNYM: DOCUMENT_REGION (DOC)
|
|
# ---------------------------------------------------------------------------
|
|
# The top-level class for any identifiable region within a document.
|
|
# Analogous to CIDOC-CRM E73 Information Object but focused on structure.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
DOCUMENT_REGION:
|
|
code: "DOC"
|
|
definition: |
|
|
A discrete, identifiable region within a document that contains content
|
|
and has structural relationships to other regions. Document regions are
|
|
the fundamental unit of layout annotation.
|
|
|
|
Every document region has:
|
|
- BOUNDARIES: Start/end positions (character offsets, coordinates, paths)
|
|
- CONTAINMENT: Parent regions that contain it, child regions it contains
|
|
- SEQUENCE: Position relative to sibling regions
|
|
- SEMANTIC ROLE: Function within the document (header, body, supplement)
|
|
|
|
ontology_mappings:
|
|
primary_class: "crm:E73_Information_Object"
|
|
primary_class_definition: |
|
|
CIDOC-CRM E73 Information Object: "This class comprises identifiable
|
|
immaterial items, such as poems, jokes, data sets, images, texts,
|
|
multimedia objects, procedural prescriptions, computer program code,
|
|
algorithm or mathematical formulae, that have an objectively
|
|
recognizable structure."
|
|
alternative_classes:
|
|
- class: "rico:RecordPart"
|
|
note: "RiC-O class for component parts of archival records"
|
|
- class: "premis:Bitstream"
|
|
note: "PREMIS class for meaningful data segment within file"
|
|
- class: "oa:TextualBody"
|
|
note: "Web Annotation class for textual content"
|
|
linkml_mapping:
|
|
class_uri: "glam:DocumentRegion"
|
|
exact_mappings:
|
|
- "crm:E73_Information_Object"
|
|
close_mappings:
|
|
- "rico:RecordPart"
|
|
- "premis:Bitstream"
|
|
related_mappings:
|
|
- "oa:TextualBody"
|
|
- "nif:Context"
|
|
|
|
# Structural properties shared by every DocumentRegion: containment,
# sibling order, semantic role, and location (path / character offsets).
properties:
  - property: "hasParentRegion"
    uri: "glam:hasParentRegion"
    range: "DocumentRegion"
    owl_mapping: "dcterms:isPartOf"
    description: "The containing region (section contains paragraph)"

  - property: "hasChildRegion"
    uri: "glam:hasChildRegion"
    range: "DocumentRegion"
    owl_mapping: "dcterms:hasPart"
    description: "Contained regions (paragraph contains sentences)"

  # Sibling order is directional, so both directions map to RiC-O sequence
  # relations. The previous mapping (rico:isOrWasAdjacentTo) expresses
  # undirected adjacency between places, not document order.
  - property: "hasNextSibling"
    uri: "glam:hasNextSibling"
    range: "DocumentRegion"
    owl_mapping: "rico:precedesInSequence"
    description: "Next region in document order"

  - property: "hasPreviousSibling"
    uri: "glam:hasPreviousSibling"
    range: "DocumentRegion"
    owl_mapping: "rico:followsInSequence"
    description: "Previous region in document order"

  - property: "hasSemanticRole"
    uri: "glam:hasSemanticRole"
    range: "LayoutSemanticRole"
    description: "The semantic function of this region"

  # NOTE(review): oa:hasSelector normally points at a selector resource,
  # while the range here is a plain string - confirm this is intended.
  - property: "regionPath"
    uri: "glam:regionPath"
    range: "xsd:string"
    owl_mapping: "oa:hasSelector"
    description: "XPath, JSONPath, or other path expression"

  - property: "regionStart"
    uri: "glam:regionStart"
    range: "xsd:integer"
    owl_mapping: "nif:beginIndex"
    description: "Character offset of region start"

  - property: "regionEnd"
    uri: "glam:regionEnd"
    range: "xsd:integer"
    owl_mapping: "nif:endIndex"
    description: "Character offset of region end"
|
|
|
|
# =========================================================================
|
|
# SUBCATEGORIES - PRIMARY CONTENT REGIONS
|
|
# =========================================================================
|
|
|
|
subcategories:
|
|
|
|
# -----------------------------------------------------------------------
|
|
# PRIMARY CONTENT REGIONS
|
|
# -----------------------------------------------------------------------
|
|
|
|
DOC.HDR:
|
|
name: "HEADER"
|
|
definition: |
|
|
Heading or title region that introduces and governs subsequent content.
|
|
Headers establish scope for entity interpretation and relationship inference.
|
|
|
|
Includes: h1-h6, chapter titles, section headings, running headers,
|
|
PAGE-XML TextRegion[@type='header'], JSON object keys.
|
|
|
|
ontology_mappings:
|
|
primary_class: "schema:headline"
|
|
alternative_classes:
|
|
- "bf:Title"
|
|
- "dcterms:title"
|
|
nif_mapping: "nif:Title"
|
|
|
|
header_levels:
|
|
- level: 1
|
|
scope: "document"
|
|
html: "h1"
|
|
pagexml: "TextRegion[@type='heading'][@level='1']"
|
|
semantic: "Document title or primary topic"
|
|
- level: 2
|
|
scope: "chapter"
|
|
html: "h2"
|
|
pagexml: "TextRegion[@type='heading'][@level='2']"
|
|
semantic: "Major section or chapter heading"
|
|
- level: 3
|
|
scope: "section"
|
|
html: "h3"
|
|
semantic: "Subsection heading"
|
|
- level: 4
|
|
scope: "subsection"
|
|
html: "h4"
|
|
semantic: "Sub-subsection heading"
|
|
- level: 5
|
|
scope: "paragraph_group"
|
|
html: "h5"
|
|
semantic: "Minor heading or label"
|
|
- level: 6
|
|
scope: "inline"
|
|
html: "h6"
|
|
semantic: "Inline heading or run-in head"
|
|
|
|
# Scope relations for headers: which regions a header governs and
# where that governance stops.
governing_properties:
  - property: 'governs'
    uri: 'glam:governs'
    range: 'DocumentRegion'
    description: 'Regions semantically governed by this header'
  - property: 'governsUntil'
    uri: 'glam:governsUntil'
    range: 'DocumentRegion'
    description: 'Region where governance ends (next same-level header)'
|
|
|
|
DOC.PAR:
|
|
name: "PARAGRAPH"
|
|
definition: |
|
|
Block of continuous prose text forming a logical unit of discourse.
|
|
The primary content-bearing unit in most documents.
|
|
|
|
Includes: HTML <p>, PAGE-XML TextRegion[@type='paragraph'],
|
|
text separated by blank lines, JSON string values.
|
|
|
|
ontology_mappings:
|
|
primary_class: "crm:E33_Linguistic_Object"
|
|
alternative_classes:
|
|
- "schema:Text"
|
|
- "nif:Paragraph"
|
|
|
|
paragraph_properties:
|
|
- property: "paragraphIndex"
|
|
uri: "glam:paragraphIndex"
|
|
range: "xsd:integer"
|
|
description: "Zero-based index within containing section"
|
|
- property: "sentenceCount"
|
|
uri: "glam:sentenceCount"
|
|
range: "xsd:integer"
|
|
description: "Number of sentences in paragraph"
|
|
|
|
DOC.SEN:
|
|
name: "SENTENCE"
|
|
definition: |
|
|
A grammatical sentence within a paragraph. The minimal unit for
|
|
syntactic analysis and relationship extraction.
|
|
|
|
ontology_mappings:
|
|
primary_class: "nif:Sentence"
|
|
alternative_classes:
|
|
- "crm:E33_Linguistic_Object"
|
|
|
|
sentence_properties:
|
|
- property: "sentenceIndex"
|
|
uri: "glam:sentenceIndex"
|
|
range: "xsd:integer"
|
|
description: "Zero-based index within containing paragraph"
|
|
|
|
DOC.LST:
|
|
name: "LIST"
|
|
definition: |
|
|
Ordered or unordered enumeration of items sharing parallel structure.
|
|
|
|
Includes: HTML <ul>/<ol>, bulleted/numbered lists, PAGE-XML lists,
|
|
JSON arrays, markdown lists.
|
|
|
|
ontology_mappings:
|
|
primary_class: "schema:ItemList"
|
|
alternative_classes:
|
|
- "rdf:List"
|
|
- "rdf:Seq"
|
|
|
|
# Kinds of list, each with its HTML element and what item order means.
list_types:
  - type: 'ordered'
    html: 'ol'
    semantic: 'Sequence with meaningful order'
  - type: 'unordered'
    html: 'ul'
    semantic: 'Set without meaningful order'
  - type: 'definition'
    html: 'dl'
    semantic: 'Term-definition pairs'
|
|
|
|
DOC.LIT:
|
|
name: "LIST_ITEM"
|
|
definition: |
|
|
Individual item within a list. Shares type/relationship with siblings.
|
|
|
|
ontology_mappings:
|
|
primary_class: "schema:ListItem"
|
|
|
|
list_item_properties:
|
|
- property: "itemIndex"
|
|
uri: "glam:itemIndex"
|
|
range: "xsd:integer"
|
|
description: "Position within list (0-based)"
|
|
- property: "itemLabel"
|
|
uri: "glam:itemLabel"
|
|
range: "xsd:string"
|
|
description: "Bullet, number, or marker text"
|
|
|
|
DOC.TBL:
|
|
name: "TABLE"
|
|
definition: |
|
|
Structured tabular data with rows and columns.
|
|
|
|
Includes: HTML <table>, PAGE-XML tables, markdown tables,
|
|
CSV/TSV content, JSON arrays of objects.
|
|
|
|
ontology_mappings:
|
|
primary_class: "schema:Table"
|
|
alternative_classes:
|
|
- "csvw:Table"
|
|
|
|
# Sub-region codes for the parts of a table, with their HTML counterparts.
table_structure:
  - component: 'TABLE_HEADER'
    code: 'DOC.TBL.HDR'
    html: 'thead, th'
    description: 'Column headers defining semantics'
  - component: 'TABLE_BODY'
    code: 'DOC.TBL.BDY'
    html: 'tbody'
    description: 'Data rows'
  - component: 'TABLE_ROW'
    code: 'DOC.TBL.ROW'
    html: 'tr'
    description: 'Single row of cells'
  - component: 'TABLE_CELL'
    code: 'DOC.TBL.CEL'
    html: 'td'
    description: 'Individual data cell'
|
|
|
|
# -----------------------------------------------------------------------
|
|
# SUPPLEMENTARY CONTENT REGIONS
|
|
# -----------------------------------------------------------------------
|
|
|
|
DOC.SDB:
|
|
name: "SIDEBAR"
|
|
definition: |
|
|
Supplementary content presented alongside main text, typically
|
|
providing context, metadata, or related information.
|
|
|
|
HYPERNYM for: marginalia, infoboxes, callout boxes, pull quotes,
|
|
asides, floating content.
|
|
|
|
Includes: HTML <aside>, Wikipedia infoboxes, PAGE-XML marginal regions,
|
|
floating boxes, pull quotes.
|
|
|
|
ontology_mappings:
|
|
primary_class: "schema:WPSideBar"
|
|
alternative_classes:
|
|
- "html:aside"
|
|
- "crm:E73_Information_Object"
|
|
|
|
hyponyms:
|
|
|
|
DOC.SDB.MRG:
|
|
name: "MARGINALIA"
|
|
definition: |
|
|
Annotations or notes in the margin of a page. Common in historical
|
|
manuscripts, early printed books, and academic texts.
|
|
|
|
Marginalia often contain:
|
|
- Reader annotations (comments, corrections)
|
|
- Editorial marks (printer instructions)
|
|
- Cross-references (citations, page numbers)
|
|
- Dates or signatures
|
|
|
|
ontology_mappings:
|
|
primary_class: "crm:E34_Inscription"
|
|
note: "Marginalia as inscribed marks on document"
|
|
|
|
margin_positions:
|
|
- position: "left"
|
|
pagexml: "TextRegion[@type='marginalia-left']"
|
|
- position: "right"
|
|
pagexml: "TextRegion[@type='marginalia-right']"
|
|
- position: "top"
|
|
pagexml: "TextRegion[@type='marginalia-top']"
|
|
- position: "bottom"
|
|
pagexml: "TextRegion[@type='marginalia-bottom']"
|
|
- position: "interlinear"
|
|
description: "Between lines of main text"
|
|
|
|
DOC.SDB.IBX:
|
|
name: "INFOBOX"
|
|
definition: |
|
|
Structured summary box containing key facts about a topic.
|
|
Common in encyclopedias, Wikipedia, and reference works.
|
|
|
|
Infoboxes contain structured claims with high extraction value.
|
|
|
|
ontology_mappings:
|
|
primary_class: "schema:Table"
|
|
note: "Infoboxes are typically key-value tables"
|
|
|
|
# Properties specific to infobox regions.
infobox_properties:
  - property: "infoboxType"
    # Added so this entry declares a URI like the other property entries do.
    uri: "glam:infoboxType"
    range: "xsd:string"
    description: "Type of infobox (person, place, organization)"
|
|
|
|
DOC.SDB.CLT:
|
|
name: "CALLOUT"
|
|
definition: |
|
|
Highlighted text box drawing attention to key information.
|
|
Includes pull quotes, highlighted passages, tip boxes.
|
|
|
|
# Recognised kinds of callout box.
callout_types:
  - type: 'pull_quote'
    description: 'Excerpt from main text displayed prominently'
  - type: 'tip'
    description: 'Advice or recommendation'
  - type: 'warning'
    description: 'Caution or alert'
  - type: 'note'
    description: 'Additional information'
|
|
|
|
DOC.CAP:
|
|
name: "CAPTION"
|
|
definition: |
|
|
Text describing or explaining a figure, table, or other visual element.
|
|
Captions establish aboutness relationships with visual content.
|
|
|
|
Includes: HTML <figcaption>, PAGE-XML caption regions, alt text,
|
|
image descriptions.
|
|
|
|
ontology_mappings:
|
|
primary_class: "schema:caption"
|
|
alternative_classes:
|
|
- "dcterms:description"
|
|
- "crm:P3_has_note"
|
|
|
|
caption_properties:
|
|
- property: "captionOf"
|
|
uri: "glam:captionOf"
|
|
range: "DocumentRegion"
|
|
owl_mapping: "schema:about"
|
|
description: "The figure/table this caption describes"
|
|
|
|
DOC.FTN:
|
|
name: "FOOTNOTE"
|
|
definition: |
|
|
Reference note at bottom of page or end of section/document.
|
|
Contains supplementary information, citations, or clarifications.
|
|
|
|
Includes: Footnotes, endnotes, sidenotes, margin notes with references.
|
|
|
|
ontology_mappings:
|
|
primary_class: "bibo:Note"
|
|
alternative_classes:
|
|
- "schema:Comment"
|
|
- "crm:E33_Linguistic_Object"
|
|
|
|
footnote_types:
|
|
- type: "footnote"
|
|
location: "bottom_of_page"
|
|
pagexml: "TextRegion[@type='footnote']"
|
|
- type: "endnote"
|
|
location: "end_of_chapter"
|
|
- type: "sidenote"
|
|
location: "margin"
|
|
|
|
# Links a footnote to its marker symbol and to the place in the main
# text where that marker occurs.
footnote_properties:
  - property: 'footnoteMarker'
    uri: 'glam:footnoteMarker'
    range: 'xsd:string'
    description: 'The marker symbol (*, 1, a, etc.)'
  - property: 'referencesLocation'
    uri: 'glam:referencesLocation'
    range: 'DocumentRegion'
    description: 'Location of footnote marker in main text'
|
|
|
|
DOC.FIG:
|
|
name: "FIGURE"
|
|
definition: |
|
|
Single visual content item (image, diagram, chart).
|
|
For collections of images, use DOC.GAL (Gallery).
|
|
For geographic/spatial visualizations, use DOC.MAP (Map).
|
|
|
|
Entities mentioned in figures require visual analysis (OCR, object detection).
|
|
|
|
Includes: HTML <figure>, embedded images, diagrams, charts, illustrations.
|
|
|
|
ontology_mappings:
|
|
primary_class: "schema:ImageObject"
|
|
alternative_classes:
|
|
- "crm:E38_Image"
|
|
- "edm:WebResource"
|
|
- "foaf:Image"
|
|
linkml_mapping:
|
|
class_uri: "schema:ImageObject"
|
|
exact_mappings:
|
|
- "crm:E38_Image"
|
|
close_mappings:
|
|
- "edm:WebResource"
|
|
|
|
figure_types:
|
|
- type: "photograph"
|
|
description: "Photographic image"
|
|
- type: "illustration"
|
|
description: "Drawing, painting, or artwork"
|
|
- type: "diagram"
|
|
description: "Schematic, flowchart, or technical drawing"
|
|
- type: "chart"
|
|
description: "Data visualization (bar, line, pie, etc.)"
|
|
- type: "scan"
|
|
description: "Digitized physical document or object"
|
|
|
|
# -----------------------------------------------------------------------
|
|
# MEDIA COLLECTION REGIONS
|
|
# -----------------------------------------------------------------------
|
|
|
|
DOC.GAL:
|
|
name: "GALLERY"
|
|
code: "DOC.GAL"
|
|
definition: |
|
|
Collection of related visual or media items presented as a unit.
|
|
Distinguished from single figures by containing MULTIPLE items with
|
|
shared context, navigation, or thematic grouping.
|
|
|
|
Domain-agnostic applications:
|
|
- Heritage: Manuscript illuminations, artifact photo sets, exhibition views
|
|
- Web: Image carousels, product galleries, portfolio showcases
|
|
- Publishing: Photo essays, plate sections, illustration series
|
|
- Archives: Document set scans, correspondence series images
|
|
- Museums: Object photography series, conservation documentation
|
|
|
|
Entities in galleries share contextual scope from gallery title/caption.
|
|
Individual items may have their own DOC.FIG claims nested within.
|
|
|
|
ontology_mappings:
|
|
primary_class: "schema:ImageGallery"
|
|
alternative_classes:
|
|
- "schema:Collection"
|
|
- "crm:E78_Curated_Holding"
|
|
- "edm:Aggregation"
|
|
- "as:Collection"
|
|
linkml_mapping:
|
|
class_uri: "schema:ImageGallery"
|
|
close_mappings:
|
|
- "schema:Collection"
|
|
- "crm:E78_Curated_Holding"
|
|
related_mappings:
|
|
- "edm:Aggregation"
|
|
|
|
format_mappings:
|
|
html:
|
|
elements: ["div[class*='gallery']", "div[class*='carousel']", "div[class*='slider']", "ul[class*='gallery']"]
|
|
aria_roles: ["group", "listbox"]
|
|
json:
|
|
patterns: ["$.gallery", "$.images[]", "$.media.items[]"]
|
|
pagexml:
|
|
note: "Rare in PAGE-XML; group of consecutive ImageRegions"
|
|
|
|
gallery_types:
|
|
- type: "image_gallery"
|
|
description: "Collection of photographs or illustrations"
|
|
examples: ["museum object photos", "manuscript folios", "exhibition views"]
|
|
- type: "carousel"
|
|
description: "Horizontally/vertically scrolling media set"
|
|
- type: "slideshow"
|
|
description: "Sequential presentation with transitions"
|
|
- type: "lightbox"
|
|
description: "Thumbnail grid with modal expansion"
|
|
- type: "filmstrip"
|
|
description: "Linear sequence of video thumbnails or stills"
|
|
- type: "plate_section"
|
|
description: "Bound illustration pages in printed works"
|
|
note: "Common in historical scientific/art publications"
|
|
|
|
# Descriptive properties of a gallery region: title, size, members,
# and the agent responsible for assembling it.
gallery_properties:
  - property: "galleryTitle"
    uri: "schema:name"
    description: "Title of the gallery collection"
  - property: "itemCount"
    uri: "schema:numberOfItems"
    description: "Number of items in gallery"
  - property: "galleryItems"
    uri: "schema:hasPart"
    range: "DocumentRegion"
    description: "Individual items (DOC.FIG) within gallery"
  # schema.org defines no `curator` property; schema:creator is the nearest
  # existing term. The PROV attribution mapping is unchanged.
  - property: "curatedBy"
    uri: "schema:creator"
    owl_mapping: "prov:wasAttributedTo"
    description: "Agent who assembled the gallery"
|
|
|
|
entity_extraction_notes: |
|
|
Galleries present special extraction challenges:
|
|
1. Gallery TITLE provides context for all contained items
|
|
2. Individual CAPTIONS may override gallery-level context
|
|
3. Items may be ORDERED (narrative sequence) or UNORDERED (thematic group)
|
|
4. Navigation elements (prev/next, thumbnails) are structural, not content
|
|
|
|
Recommended approach:
|
|
- Create layout claim for gallery container (DOC.GAL)
|
|
- Create nested layout claims for each item (DOC.FIG)
|
|
- Entity claims reference their immediate container
|
|
- Inherit gallery context when item lacks own caption
|
|
|
|
DOC.MAP:
|
|
name: "MAP"
|
|
code: "DOC.MAP"
|
|
definition: |
|
|
Cartographic or spatial representation of geographic information.
|
|
Distinguished from generic figures by explicit spatial semantics.
|
|
|
|
CRITICAL DISTINCTION from GEOMETRY (GEO) entity type:
|
|
- DOC.MAP is a DOCUMENT REGION containing a map visualization
|
|
- GEO is an ENTITY TYPE for coordinate/shape data extracted from any region
|
|
|
|
Domain-agnostic applications:
|
|
- Heritage: Historical maps, archaeological site plans, building floorplans
|
|
- Web: Interactive maps, location widgets, route displays
|
|
- Publishing: Atlas plates, thematic maps, navigation charts
|
|
- Archives: Survey maps, cadastral records, military charts
|
|
- Urban planning: Zoning maps, infrastructure layouts
|
|
|
|
Entities in maps require spatial reasoning for extraction.
|
|
Place names (TOP) may appear as labels; geometry (GEO) as shapes/points.
|
|
|
|
ontology_mappings:
|
|
primary_class: "crm:E73_Information_Object"
|
|
alternative_classes:
|
|
- "schema:Map"
|
|
- "edm:WebResource"
|
|
- "geosparql:SpatialObject"
|
|
- "bibo:Map"
|
|
linkml_mapping:
|
|
class_uri: "schema:Map"
|
|
exact_mappings:
|
|
- "bibo:Map"
|
|
close_mappings:
|
|
- "crm:E73_Information_Object"
|
|
related_mappings:
|
|
- "geosparql:SpatialObject"
|
|
|
|
# How a map region is recognised in each source format.
format_mappings:
  html:
    # All entries are CSS selectors. The Leaflet / Mapbox GL / Google Maps
    # container names are class names, so they are written as class
    # selectors rather than bare tokens (which would match element names).
    elements:
      - "div[class*='map']"
      - "div[id*='map']"
      - ".leaflet-container"
      - ".mapboxgl-map"
      - ".gm-style"
    note: "Interactive maps often use Leaflet, Mapbox, Google Maps containers"
  json:
    patterns: ["$.map", "$.geojson", "$.geometry", "$.features[]"]
    standards: ["GeoJSON (RFC 7946)", "TopoJSON"]
  image_formats:
    # ".tif" added: GeoTIFF files commonly use the short extension.
    extensions: [".geotiff", ".tiff", ".tif", ".jpg", ".png"]
    note: "Georeferenced images contain embedded coordinate metadata"
  pagexml:
    # NOTE(review): newer PAGE schema versions appear to define a dedicated
    # MapRegion element - confirm whether GraphicRegion[@type='map'] is the
    # intended project convention.
    type: "GraphicRegion[@type='map']"
|
|
|
|
map_types:
|
|
- type: "reference_map"
|
|
description: "General purpose geographic reference"
|
|
examples: ["world map", "country outline", "street map"]
|
|
- type: "thematic_map"
|
|
description: "Data visualization on geographic base"
|
|
examples: ["choropleth", "heat map", "dot distribution"]
|
|
- type: "historical_map"
|
|
description: "Map from historical period (primary source)"
|
|
examples: ["17th century nautical chart", "medieval mappa mundi"]
|
|
- type: "site_plan"
|
|
description: "Architectural or archaeological layout"
|
|
examples: ["floor plan", "excavation grid", "campus map"]
|
|
- type: "route_map"
|
|
description: "Navigation or journey visualization"
|
|
examples: ["transit map", "pilgrimage route", "trade route"]
|
|
- type: "cadastral_map"
|
|
description: "Property boundary documentation"
|
|
examples: ["land survey", "parcel map", "deed plat"]
|
|
- type: "interactive_map"
|
|
description: "User-manipulable web map with layers/zoom"
|
|
examples: ["Leaflet widget", "Google Maps embed", "IIIF geo extension"]
|
|
|
|
# Component codes for the layers and furniture that make up a map.
map_components:
  - component: 'MAP.BAS'
    name: 'Base Layer'
    description: 'Background geographic reference (satellite, street, terrain)'
  - component: 'MAP.OVL'
    name: 'Overlay Layer'
    description: 'Thematic data layer on top of base'
  - component: 'MAP.MRK'
    name: 'Markers'
    description: 'Point features (pins, icons, labels)'
  - component: 'MAP.SHP'
    name: 'Shapes'
    description: 'Polygon/polyline features (boundaries, routes)'
  - component: 'MAP.LEG'
    name: 'Legend'
    description: 'Symbol key and scale information'
  - component: 'MAP.CTL'
    name: 'Controls'
    description: 'Zoom, pan, layer toggles (structural, not content)'
|
|
|
|
# Descriptive properties of a map region.
map_properties:
  - property: "mapTitle"
    uri: "schema:name"
    description: "Title of the map"
  - property: "spatialCoverage"
    uri: "schema:spatialCoverage"
    description: "Geographic extent represented"
  # Was schema:contentSize, which denotes a media object's file size and
  # not a cartographic scale; BIBFRAME provides a dedicated scale term.
  - property: "mapScale"
    uri: "bf:scale"
    description: "Representative fraction or verbal scale"
  # Was geosparql:hasSpatialAccuracy, which describes positional accuracy
  # and not the projection; BIBFRAME provides a dedicated projection term.
  - property: "projection"
    uri: "bf:projection"
    description: "Cartographic projection used"
  - property: "temporalCoverage"
    uri: "schema:temporalCoverage"
    description: "Time period depicted (for historical maps)"
|
|
|
|
entity_extraction_notes: |
|
|
Maps present unique extraction challenges:
|
|
|
|
1. TOPONYMS (TOP) appear as:
|
|
- Labels on map face
|
|
- Legend entries
|
|
- Title/caption text
|
|
- Popup/tooltip content (interactive maps)
|
|
|
|
2. GEOMETRY (GEO) data includes:
|
|
- Point coordinates (markers)
|
|
- Bounding boxes
|
|
- Polygon vertices (regions, buildings)
|
|
- Polylines (routes, rivers, borders)
|
|
|
|
3. TEMPORAL context:
|
|
- Historical maps show past geography (not current!)
|
|
- Map creation date ≠ depicted time period
|
|
- Boundary changes over time
|
|
|
|
4. PROVENANCE considerations:
|
|
- Georeferenced scans have transformation accuracy
|
|
- Interactive maps have tile source attribution
|
|
- Derived maps inherit source map provenance
|
|
|
|
Recommended approach:
|
|
- Create layout claim for map container (DOC.MAP)
|
|
- Extract toponyms with map-specific confidence (label legibility)
|
|
- Extract geometry with coordinate reference system metadata
|
|
- Track both map creation date AND depicted time period
|
|
|
|
# -----------------------------------------------------------------------
|
|
# AUDIOVISUAL CONTENT REGIONS
|
|
# -----------------------------------------------------------------------
|
|
|
|
DOC.AUD:
|
|
name: "AUDIO"
|
|
code: "DOC.AUD"
|
|
definition: |
|
|
Audio content region (sound recording, podcast, music, oral history).
|
|
Entities require speech-to-text or audio analysis for extraction.
|
|
|
|
Domain-agnostic applications:
|
|
- Heritage: Oral history recordings, ethnographic field recordings
|
|
- Web: Podcast episodes, music players, audio articles
|
|
- Archives: Radio broadcasts, interview recordings, speeches
|
|
- Museums: Audio guides, soundscapes, musical instrument recordings
|
|
- Linguistics: Language documentation, dialect samples
|
|
|
|
Audio content often has associated TRANSCRIPT (DOC.PAR) which is the
|
|
primary source for NER; audio itself provides prosodic/speaker metadata.
|
|
|
|
ontology_mappings:
|
|
primary_class: "schema:AudioObject"
|
|
alternative_classes:
|
|
- "crm:E73_Information_Object"
|
|
- "edm:WebResource"
|
|
- "premis:IntellectualEntity"
|
|
linkml_mapping:
|
|
class_uri: "schema:AudioObject"
|
|
close_mappings:
|
|
- "crm:E73_Information_Object"
|
|
|
|
format_mappings:
|
|
html:
|
|
elements: ["audio", "div[class*='audio-player']", "div[class*='podcast']"]
|
|
json:
|
|
patterns: ["$.audio", "$.podcast", "$.episodes[]", "$.tracks[]"]
|
|
file_formats:
|
|
extensions: [".mp3", ".wav", ".ogg", ".flac", ".m4a", ".aac"]
|
|
|
|
audio_types:
|
|
- type: "speech"
|
|
description: "Spoken word content"
|
|
examples: ["interview", "lecture", "oral history", "podcast"]
|
|
- type: "music"
|
|
description: "Musical performance or composition"
|
|
examples: ["concert recording", "album track", "folk song"]
|
|
- type: "soundscape"
|
|
description: "Environmental or ambient audio"
|
|
examples: ["field recording", "museum ambiance", "historic sound"]
|
|
- type: "narration"
|
|
description: "Scripted audio guide or documentary"
|
|
examples: ["museum audio guide", "audiobook chapter"]
|
|
|
|
audio_properties:
|
|
- property: "duration"
|
|
uri: "schema:duration"
|
|
description: "Length of audio content (ISO 8601 duration)"
|
|
- property: "transcript"
|
|
uri: "schema:transcript"
|
|
range: "DocumentRegion"
|
|
description: "Text transcription of audio (DOC.PAR)"
|
|
- property: "speaker"
|
|
uri: "schema:actor"
|
|
description: "Person(s) speaking in recording"
|
|
- property: "recordingDate"
|
|
uri: "schema:dateCreated"
|
|
description: "When audio was recorded"
|
|
- property: "recordingLocation"
|
|
uri: "schema:contentLocation"
|
|
description: "Where audio was recorded"
|
|
|
|
# Segment codes for partitioning an audio region by what is heard.
audio_segments:
  - segment: 'AUD.SPK'
    name: 'Speaker Turn'
    description: 'Contiguous speech by single speaker'
  - segment: 'AUD.MUS'
    name: 'Music Segment'
    description: 'Musical interlude or background'
  - segment: 'AUD.SIL'
    name: 'Silence'
    description: 'Intentional pause or gap'
  - segment: 'AUD.SFX'
    name: 'Sound Effect'
    description: 'Non-speech, non-music audio event'
|
|
|
|
entity_extraction_notes: |
|
|
Audio requires multi-modal extraction:
|
|
|
|
1. TRANSCRIPT-BASED (primary):
|
|
- Extract entities from associated transcript (DOC.PAR)
|
|
- Timestamps align text spans to audio segments
|
|
- Speaker diarization links entities to speakers
|
|
|
|
2. AUDIO-DIRECT (secondary):
|
|
- Named entity recognition from ASR output
|
|
- Speaker identification (voice biometrics)
|
|
- Language/dialect detection
|
|
- Prosodic analysis (emphasis, emotion)
|
|
|
|
3. METADATA (tertiary):
|
|
- ID3 tags (music files)
|
|
- Episode metadata (podcasts)
|
|
- Catalog records (archives)
|
|
|
|
Provenance must track:
|
|
- ASR model and confidence
|
|
- Human transcription vs. automated
|
|
- Timestamp precision
|
|
|
|
# DOC.VID - region type for video (moving image, with or without audio).
DOC.VID:
  name: "VIDEO"
  code: "DOC.VID"
  definition: |
    Video content region (moving image with or without audio).
    Entities require multimodal analysis: visual, audio, and text (captions/OCR).

    Domain-agnostic applications:
    - Heritage: Documentary footage, conservation documentation, exhibition videos
    - Web: Embedded videos, live streams, video articles
    - Archives: News broadcasts, home movies, surveillance footage
    - Museums: Artist interviews, performance recordings, virtual tours
    - Education: Lecture recordings, tutorials, demonstrations

    Videos combine visual (DOC.FIG), audio (DOC.AUD), and textual modalities.
    Entities may appear in any modality and require cross-modal linking.

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "schema:VideoObject"
    alternative_classes:
      - "crm:E73_Information_Object"
      - "edm:WebResource"
      - "premis:IntellectualEntity"
    linkml_mapping:
      class_uri: "schema:VideoObject"
      close_mappings:
        - "crm:E73_Information_Object"

  # How a video region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - "video"
        - "iframe[src*='youtube']"
        - "iframe[src*='vimeo']"
        - "div[class*='video-player']"
    json:
      patterns: ["$.video", "$.videos[]", "$.media.video"]
    file_formats:
      extensions: [".mp4", ".webm", ".mov", ".avi", ".mkv"]

  # Content genres a video region may be classified as.
  video_types:
    - type: "documentary"
      description: "Non-fiction narrative video"
    - type: "interview"
      description: "Recorded conversation with subject"
    - type: "performance"
      description: "Recorded artistic performance"
    - type: "instructional"
      description: "Tutorial or demonstration"
    - type: "surveillance"
      description: "Continuous monitoring footage"
    - type: "news"
      description: "Broadcast journalism content"
    - type: "archival"
      description: "Historical footage or home movies"

  # Descriptive properties; `range` is given only where the value is itself
  # a document region.
  video_properties:
    - property: "duration"
      uri: "schema:duration"
      description: "Length of video (ISO 8601 duration)"
    - property: "transcript"
      uri: "schema:transcript"
      range: "DocumentRegion"
      description: "Text transcription of speech"
    - property: "caption"
      uri: "schema:caption"
      range: "DocumentRegion"
      description: "Closed captions or subtitles"
    - property: "thumbnail"
      uri: "schema:thumbnail"
      description: "Representative still image"

  # Sub-segment codes (VID.*) for addressing parts of a video.
  video_segments:
    - segment: "VID.SCN"
      name: "Scene"
      description: "Continuous action segment"
    - segment: "VID.SHT"
      name: "Shot"
      description: "Single camera take"
    - segment: "VID.TTL"
      name: "Title Card"
      description: "Text overlay or title screen"
    - segment: "VID.CRD"
      name: "Credits"
      description: "Attribution information"
|
|
|
|
# DOC.EMB - region type for embedded interactive content (iframes, widgets,
# web components, embedded applications).
DOC.EMB:
  name: "EMBEDDED_INTERACTIVE"
  code: "DOC.EMB"
  definition: |
    Embedded interactive content from external source or rich application.
    Includes iframes, widgets, web components, and embedded applications.

    Domain-agnostic applications:
    - Heritage: IIIF viewers, 3D model viewers, timeline widgets
    - Web: Social media embeds, code playgrounds, data visualizations
    - Publishing: Interactive charts, explorable explanations
    - Museums: Virtual tours, interactive exhibits, AR/VR content
    - Archives: Document viewers, transcription interfaces

    Entities within embedded content may require API access or headless
    browser rendering to extract; simple iframe inspection is insufficient.

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "schema:WebApplication"
    alternative_classes:
      - "schema:CreativeWork"
      - "as:Application"
    linkml_mapping:
      class_uri: "schema:WebApplication"

  # How an embedded region is recognised in each source format.
  # NOTE(review): the bare "iframe" selector also matches the YouTube/Vimeo
  # iframes that DOC.VID lists; precedence between the two is not stated
  # here - confirm.
  # NOTE(review): "web-component" reads as a placeholder; as a CSS type
  # selector it only matches an element literally named <web-component> -
  # confirm intent.
  format_mappings:
    html:
      elements:
        - "iframe"
        - "embed"
        - "object"
        - "web-component"
        - "[data-widget]"
    json:
      patterns: ["$.embed", "$.widget", "$.interactive"]

  # Categories of embedded content, with representative tools/services.
  embed_types:
    - type: "iiif_viewer"
      description: "IIIF-compliant image viewer"
      examples: ["Mirador", "Universal Viewer", "OpenSeadragon"]
      note: "Extract manifest URL for entity-rich metadata"
    - type: "3d_viewer"
      description: "3D model visualization"
      examples: ["Sketchfab embed", "Three.js scene", "WebGL viewer"]
    - type: "timeline"
      description: "Temporal data visualization"
      examples: ["TimelineJS", "Chronoline", "d3 timeline"]
    - type: "social_embed"
      description: "Social media post or feed"
      examples: ["Twitter/X embed", "Instagram post", "YouTube video"]
    - type: "data_viz"
      description: "Interactive data visualization"
      examples: ["D3.js chart", "Plotly graph", "Tableau embed"]
    - type: "code_playground"
      description: "Executable code environment"
      examples: ["CodePen", "JSFiddle", "Observable notebook"]
    - type: "form"
      description: "Interactive input form"
      examples: ["Survey", "search widget", "booking form"]
    - type: "virtual_tour"
      description: "360 or spatial navigation experience"
      examples: ["Google Street View", "Matterport", "museum virtual tour"]

  # Descriptive properties recorded for an embed.
  embed_properties:
    - property: "embedSource"
      uri: "schema:embedUrl"
      description: "URL of embedded content"
    - property: "embedProvider"
      uri: "schema:provider"
      description: "Service providing the embed"
    - property: "embedType"
      uri: "glam:embedType"
      description: "Category of interactive content"
    - property: "requiresInteraction"
      uri: "glam:requiresInteraction"
      description: "Whether user action needed to reveal content"

  entity_extraction_notes: |
    Embedded content presents significant extraction challenges:

    1. ACCESSIBILITY:
       - Content may be behind authentication
       - May require JavaScript execution
       - May load asynchronously
       - Cross-origin policies may block access

    2. STRATEGIES by embed type:
       - IIIF: Fetch manifest JSON for rich metadata
       - Social: Use platform APIs (Twitter, Instagram)
       - Data viz: Extract from data source if accessible
       - 3D: Parse model metadata, texture labels

    3. PROVENANCE requirements:
       - Record embed source URL
       - Record extraction timestamp (content may change)
       - Record access method (API, headless browser, etc.)
       - Note if content was inaccessible

    4. FALLBACK strategies:
       - Use surrounding context (embed caption, link text)
       - Use embed URL structure for hints
       - Use embed provider metadata
|
|
|
|
# -----------------------------------------------------------------------
|
|
# NAVIGATION AND STRUCTURE REGIONS
|
|
# -----------------------------------------------------------------------
|
|
|
|
# DOC.NAV - generic navigational elements (breadcrumbs, menus, links).
# Tables of contents and indices have their own types (DOC.TOC, DOC.IDX).
DOC.NAV:
  name: "NAVIGATION"
  # Explicit code, mirroring the mapping key, as on the sibling region types
  # (DOC.TOC, DOC.IDX, ...).
  code: "DOC.NAV"
  definition: |
    Generic navigational elements (breadcrumbs, menus, links).
    For specific navigation structures, use dedicated types:
    - DOC.TOC for tables of contents
    - DOC.IDX for indices

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "schema:SiteNavigationElement"
    linkml_mapping:
      class_uri: "schema:SiteNavigationElement"

  # Navigation sub-kinds; `html` holds the CSS selector(s) where one is given.
  navigation_types:
    - type: "breadcrumb"
      name: "Breadcrumb Trail"
      html: "nav[aria-label='breadcrumb'], .breadcrumb"
    - type: "menu"
      name: "Navigation Menu"
      html: "nav, menu"
    - type: "sitemap"
      name: "Site Map"
      description: "Hierarchical site structure overview"
|
|
|
|
# DOC.TOC - table of contents: structured listing of document sections with
# page/location references.
DOC.TOC:
  name: "TABLE_OF_CONTENTS"
  code: "DOC.TOC"
  definition: |
    Structured listing of document sections with page/location references.
    Distinguished from general navigation by its document-internal scope
    and hierarchical structure reflecting document organization.

    Domain-agnostic applications:
    - Heritage: Manuscript tables, register indices, finding aid outlines
    - Publishing: Book TOCs, journal issue contents, report outlines
    - Web: Article outlines, documentation navigation, wiki page contents
    - Archives: Folder listings, series descriptions, container lists
    - Legal: Statute tables, case indices, contract section lists

    High-value for entity extraction: section titles often contain
    key entities (names, places, dates, topics) that scope content.

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "schema:ItemList"
      - "crm:E73_Information_Object"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"
      close_mappings:
        - "schema:ItemList"

  # How a TOC region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - "nav[role='doc-toc']"
        - ".toc"
        - "#table-of-contents"
        - "ol.toc"
    pagexml:
      type: "TextRegion[@type='table-of-contents']"
    tei:
      element: "<divGen type='toc'/>"
    epub:
      element: "nav[epub:type='toc']"

  # The main TOC plus the "list of ..." variants.
  toc_types:
    - type: "main_toc"
      description: "Primary document table of contents"
    - type: "list_of_figures"
      description: "Figure/illustration listing with page numbers"
      alias: "LOF"
    - type: "list_of_tables"
      description: "Table listing with page numbers"
      alias: "LOT"
    - type: "list_of_abbreviations"
      description: "Abbreviation/acronym listing"
    - type: "list_of_maps"
      description: "Map listing with page numbers"
    - type: "list_of_plates"
      description: "Plate/illustration listing (historical)"

  # Properties of individual TOC entries.
  toc_properties:
    - property: "tocEntry"
      uri: "schema:itemListElement"
      description: "Individual TOC entry"
    - property: "sectionTitle"
      uri: "schema:name"
      description: "Title of referenced section"
    - property: "pageReference"
      uri: "bibo:pageStart"
      description: "Page number or location reference"
    - property: "nestingLevel"
      uri: "glam:hierarchyLevel"
      description: "Depth in TOC hierarchy (1=chapter, 2=section, etc.)"

  entity_extraction_notes: |
    TOC entries are HIGH-VALUE for entity extraction:

    1. SECTION TITLES often contain:
       - Person names (biographical sections)
       - Place names (geographic chapters)
       - Date ranges (chronological sections)
       - Organization names (institutional histories)

    2. HIERARCHICAL CONTEXT:
       - Parent entries scope child entries
       - "Part I: The Netherlands" -> child sections are Dutch-related

    3. PAGE REFERENCES enable:
       - Linking entities to page ranges
       - Validating entity locations in document
|
|
|
|
# DOC.IDX - index: alphabetical or systematic listing of terms, names, or
# subjects with page/location references.
DOC.IDX:
  name: "INDEX"
  code: "DOC.IDX"
  definition: |
    Alphabetical or systematic listing of terms, names, or subjects
    with page/location references. Distinguished from TOC by its
    alphabetical organization and term-based (not section-based) structure.

    Domain-agnostic applications:
    - Heritage: Name indices, place indices, subject indices
    - Publishing: Back-of-book indices, periodical indices
    - Archives: Finding aid indices, name authority files
    - Legal: Case citation indices, statute indices
    - Academic: Author indices, keyword indices

    EXTREMELY high-value for NER: indices are curated entity lists
    with location references, essentially pre-annotated entity data.

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "schema:ItemList"
      - "skos:Collection"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"
      related_mappings:
        - "skos:Collection"

  # How an index region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - "div.index"
        - "#index"
        - "section[role='doc-index']"
    pagexml:
      type: "TextRegion[@type='index']"
    tei:
      element: "<divGen type='index'/>, <index>"

  # Index kinds; `entity_type` names the entity class the entries stand for.
  index_types:
    - type: "name_index"
      description: "Personal and corporate name index"
      entity_type: "AGT"
      note: "Pre-annotated AGENT entities"
    - type: "place_index"
      description: "Geographic and place name index"
      entity_type: "TOP"
      note: "Pre-annotated TOPONYM entities"
    - type: "subject_index"
      description: "Topic and subject index"
      entity_type: "mixed"
    - type: "title_index"
      description: "Work and publication title index"
      entity_type: "WRK"
      note: "Pre-annotated WORK entities"
    - type: "chronological_index"
      description: "Date and event index"
      entity_type: "TMP"
      note: "Pre-annotated TEMPORAL entities"

  # Properties of individual index entries.
  index_properties:
    - property: "indexTerm"
      uri: "skos:prefLabel"
      description: "The indexed term or name"
    - property: "pageReferences"
      uri: "bibo:pages"
      description: "Page number(s) where term appears"
    - property: "seeAlso"
      uri: "skos:related"
      description: "Cross-reference to related terms"
    - property: "subentry"
      uri: "skos:narrower"
      description: "Nested sub-entries under main term"

  entity_extraction_notes: |
    Indices are GOLD STANDARD entity sources:

    1. PRE-CURATED ENTITIES:
       - Name indices = curated person/org list
       - Place indices = curated gazetteer
       - Subject indices = controlled vocabulary

    2. EXTRACTION STRATEGY:
       - Parse index entries as entity mentions
       - Use index type to assign entity class
       - Page references locate entities in text

    3. CROSS-REFERENCE VALUE:
       - "See also" links indicate entity relationships
       - Sub-entries indicate hierarchical relationships
|
|
|
|
# -----------------------------------------------------------------------
|
|
# FRONT MATTER REGIONS
|
|
# -----------------------------------------------------------------------
|
|
|
|
# DOC.TTP - title page: the primary identifying page carrying title, author,
# publisher, date, and other key metadata.
DOC.TTP:
  name: "TITLE_PAGE"
  code: "DOC.TTP"
  definition: |
    Primary identifying page of a document containing title, author,
    publisher, date, and other key metadata. High-value structured
    information source.

    Domain-agnostic applications:
    - Heritage: Manuscript title pages, incunabula colophons, broadside headers
    - Publishing: Book title pages, journal covers, report covers
    - Archives: Folder titles, series title sheets
    - Legal: Document covers, contract title pages
    - Academic: Thesis title pages, paper headers

    Title pages contain STRUCTURED CLAIMS about the document itself.

  # Class-level alignment with external vocabularies.
  # NOTE(review): "schema:CoverPage" does not appear to be a term in the
  # schema.org vocabulary - confirm the intended class.
  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "schema:CoverPage"
      - "crm:E73_Information_Object"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"

  # How a title-page region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - ".title-page"
        - "#cover"
        - "section.frontmatter"
    pagexml:
      type: "TextRegion[@type='title-page']"
    tei:
      element: "<titlePage>"

  # Sub-component codes (TTP.*) with the vocabulary term each maps to.
  title_page_components:
    - component: "TTP.TTL"
      name: "Title"
      description: "Main document title"
      ontology: "dcterms:title"
    # NOTE(review): bibo:shortTitle denotes an abbreviated title rather than
    # a subtitle - confirm this mapping.
    - component: "TTP.STL"
      name: "Subtitle"
      description: "Secondary title"
      ontology: "bibo:shortTitle"
    - component: "TTP.AUT"
      name: "Author"
      description: "Creator attribution"
      ontology: "dcterms:creator"
    - component: "TTP.PUB"
      name: "Publisher"
      description: "Publisher/printer"
      ontology: "dcterms:publisher"
    - component: "TTP.DAT"
      name: "Date"
      description: "Publication date"
      ontology: "dcterms:date"
    - component: "TTP.PLC"
      name: "Place"
      description: "Publication place"
      ontology: "dcterms:spatial"
    - component: "TTP.EDT"
      name: "Edition"
      description: "Edition statement"
      ontology: "bibo:edition"
    # NOTE(review): bibo:Note is a class, whereas the sibling components map
    # to properties - confirm.
    - component: "TTP.IMP"
      name: "Imprint"
      description: "Full publication statement"
      ontology: "bibo:Note"

  entity_extraction_notes: |
    Title pages are AUTHORITATIVE entity sources:

    1. STRUCTURED CLAIMS about document:
       - Author -> AGENT entity with creator role
       - Publisher -> GROUP entity
       - Date -> TEMPORAL entity (publication date)
       - Place -> TOPONYM entity (publication place)

    2. HIGH CONFIDENCE: Title page claims are intentional,
       not incidental mentions - treat as authoritative.
|
|
|
|
# DOC.DED - dedication: dedicatory text, epigraph, or inscription, typically
# in front matter.
DOC.DED:
  name: "DEDICATION"
  code: "DOC.DED"
  definition: |
    Dedicatory text, epigraph, or inscription typically appearing
    in front matter. May honor a person, quote a source, or set
    thematic context for the work.

    Domain-agnostic applications:
    - Heritage: Manuscript dedications, donor inscriptions
    - Publishing: Book dedications, memorial pages
    - Archives: Gift acknowledgments, founding documents
    - Academic: Thesis acknowledgments, memorial lectures
    - Monuments: Dedicatory inscriptions, foundation stones

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "schema:Quotation"
      - "crm:E33_Linguistic_Object"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"

  # How a dedication region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - ".dedication"
        - ".epigraph"
        - "blockquote.epigraph"
    tei:
      element: "<div type='dedication'>, <epigraph>"

  # Dedication sub-kinds; `note` hints at the entities each tends to carry.
  dedication_types:
    - type: "dedication"
      description: "Work dedicated to a person or group"
      note: "Dedicatee is an AGENT entity"
    - type: "epigraph"
      description: "Quotation setting thematic context"
      note: "May contain WORK reference (source)"
    - type: "inscription"
      description: "Physical inscription text"
      note: "Common in monuments, buildings"
    - type: "acknowledgment"
      description: "Thanks to supporters/contributors"
      note: "Multiple AGENT entities"

  # Descriptive properties of a dedication.
  dedication_properties:
    - property: "dedicatee"
      uri: "schema:recipient"
      description: "Person/group to whom work is dedicated"
    - property: "quotationSource"
      uri: "schema:isBasedOn"
      description: "Source of epigraph quotation"
|
|
|
|
# DOC.COL - colophon: production statement (printer, date, place, technical
# specifications), typically at the end of a document.
DOC.COL:
  name: "COLOPHON"
  code: "DOC.COL"
  definition: |
    Production statement typically at end of document containing
    printing/publication details: printer, date, place, technical
    specifications. Critical for bibliographic identification.

    Domain-agnostic applications:
    - Heritage: Manuscript colophons, incunabula printer statements
    - Publishing: Printer's statements, production credits
    - Archives: Processing notes, digitization metadata
    - Legal: Document certification statements
    - Digital: Software version, generation metadata

    Colophons contain AUTHORITATIVE production claims.

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "crm:E65_Creation"
      - "prov:Activity"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"
      related_mappings:
        - "crm:E65_Creation"

  # How a colophon region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - ".colophon"
        - "#colophon"
        - "section[role='doc-colophon']"
    pagexml:
      type: "TextRegion[@type='colophon']"
    tei:
      element: "<colophon>"

  # Sub-component codes (COL.*); `ontology` is present only where a mapping
  # is declared.
  colophon_components:
    # NOTE(review): "schema:printer" does not appear to be a term in the
    # schema.org vocabulary - confirm the intended property.
    - component: "COL.PRN"
      name: "Printer"
      description: "Printing house or individual printer"
      ontology: "schema:printer"
    - component: "COL.DAT"
      name: "Date"
      description: "Printing/production date"
      ontology: "dcterms:created"
    - component: "COL.PLC"
      name: "Place"
      description: "Place of production"
      ontology: "dcterms:spatial"
    - component: "COL.TYP"
      name: "Typography"
      description: "Font/type information"
    - component: "COL.PAP"
      name: "Paper"
      description: "Paper/material specifications"
    - component: "COL.CPY"
      name: "Copyright"
      description: "Rights statement"
      ontology: "dcterms:rights"
    - component: "COL.EDN"
      name: "Edition"
      description: "Print run, edition number"
      ontology: "bibo:edition"

  entity_extraction_notes: |
    Colophons are CRITICAL for heritage identification:

    1. PRODUCTION AGENTS:
       - Printer -> GROUP or AGENT entity
       - Publisher -> GROUP entity
       - Scribe (manuscripts) -> AGENT entity

    2. PRODUCTION CONTEXT:
       - Date -> TEMPORAL entity
       - Place -> TOPONYM entity

    3. BIBLIOGRAPHIC VALUE:
       - Often only source for incunabula dating
       - Manuscript colophons name scribes
|
|
|
|
# -----------------------------------------------------------------------
|
|
# BACK MATTER REGIONS
|
|
# -----------------------------------------------------------------------
|
|
|
|
# DOC.BIB - bibliography: list of cited or referenced works, each entry
# representing a WORK entity.
DOC.BIB:
  name: "BIBLIOGRAPHY"
  code: "DOC.BIB"
  definition: |
    List of cited or referenced works. Each entry represents a
    WORK entity with structured bibliographic data.

    Domain-agnostic applications:
    - Heritage: Manuscript source lists, catalog references
    - Publishing: Book bibliographies, article references
    - Academic: Citation lists, literature reviews
    - Legal: Case citations, statute references
    - Archives: Related materials, provenance sources

    Bibliographies are STRUCTURED WORK entity lists.

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "schema:ItemList"
      - "dcterms:BibliographicResource"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"

  # How a bibliography region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - ".bibliography"
        - ".references"
        - "#refs"
        - "section[role='doc-bibliography']"
    tei:
      element: "<listBibl>, <div type='bibliography'>"

  # Bibliography sub-kinds.
  bibliography_types:
    - type: "works_cited"
      description: "Works directly cited in text"
    - type: "further_reading"
      description: "Recommended but not cited works"
    - type: "sources"
      description: "Primary source list"
    - type: "discography"
      description: "Music recording references"
    - type: "filmography"
      description: "Film/video references"
    - type: "webography"
      description: "Web resource references"

  # Per-entry component codes (BIB.*), each with its entity type and the
  # vocabulary term it maps to.
  bibliography_entry_components:
    - component: "BIB.AUT"
      name: "Author"
      entity_type: "AGT"
      ontology: "dcterms:creator"
    - component: "BIB.TTL"
      name: "Title"
      entity_type: "WRK"
      ontology: "dcterms:title"
    - component: "BIB.DAT"
      name: "Date"
      entity_type: "TMP"
      ontology: "dcterms:date"
    - component: "BIB.PUB"
      name: "Publisher"
      entity_type: "GRP"
      ontology: "dcterms:publisher"
    - component: "BIB.PLC"
      name: "Place"
      entity_type: "TOP"
      ontology: "dcterms:spatial"

  entity_extraction_notes: |
    Bibliographies are PRE-STRUCTURED entity sources:

    1. EACH ENTRY contains:
       - Author(s) -> AGENT entities
       - Title -> WORK entity
       - Date -> TEMPORAL entity
       - Publisher -> GROUP entity
       - Place -> TOPONYM entity

    2. CITATION PARSING:
       - Citation format indicates field boundaries
       - High confidence due to intentional structure
|
|
|
|
# DOC.APP - appendix: supplementary material at the end of a document.
DOC.APP:
  name: "APPENDIX"
  code: "DOC.APP"
  definition: |
    Supplementary material at document end containing supporting
    data, extended discussions, or reference material too detailed
    for main text.

    Domain-agnostic applications:
    - Heritage: Document transcriptions, provenance records
    - Publishing: Data tables, extended methods, source texts
    - Academic: Statistical data, interview transcripts
    - Legal: Exhibits, supporting documents
    - Archives: Finding aid supplements, accession lists

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "schema:CreativeWork"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"

  # How an appendix region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - ".appendix"
        - "section[role='doc-appendix']"
    tei:
      element: "<div type='appendix'>"

  # Appendix sub-kinds.
  appendix_types:
    - type: "data_appendix"
      description: "Statistical or tabular data"
    - type: "document_appendix"
      description: "Reproduced primary documents"
    - type: "technical_appendix"
      description: "Methods, algorithms, specifications"
    - type: "glossary_appendix"
      description: "Extended terminology (see also DOC.GLO)"

  # Descriptive properties of an appendix.
  appendix_properties:
    - property: "appendixLabel"
      uri: "schema:name"
      description: "Appendix letter/number (A, B, C or 1, 2, 3)"
    - property: "appendixTitle"
      uri: "dcterms:title"
      description: "Descriptive title"
|
|
|
|
# DOC.GLO - glossary: alphabetical list of terms with definitions.
DOC.GLO:
  name: "GLOSSARY"
  code: "DOC.GLO"
  definition: |
    Alphabetical list of terms with definitions. Each entry is a
    CONCEPT with term (label) and definition (description).

    Domain-agnostic applications:
    - Heritage: Manuscript terminology, archaic word lists
    - Publishing: Technical glossaries, foreign word lists
    - Academic: Disciplinary terminology
    - Legal: Legal terms, statutory definitions
    - Archives: Archival terminology, provenance terms

    Glossaries are CONTROLLED VOCABULARY sources.

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "skos:ConceptScheme"
    alternative_classes:
      - "schema:DefinedTermSet"
      - "bibo:DocumentPart"
    linkml_mapping:
      class_uri: "skos:ConceptScheme"

  # How a glossary region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - ".glossary"
        - "dl.glossary"
        - "section[role='doc-glossary']"
    tei:
      element: "<list type='gloss'>, <div type='glossary'>"

  # Per-entry component codes (GLO.*) with their SKOS mappings.
  glossary_entry_components:
    - component: "GLO.TRM"
      name: "Term"
      ontology: "skos:prefLabel"
    - component: "GLO.DEF"
      name: "Definition"
      ontology: "skos:definition"
    - component: "GLO.SYN"
      name: "Synonym"
      ontology: "skos:altLabel"
    - component: "GLO.REL"
      name: "Related Term"
      ontology: "skos:related"
|
|
|
|
# -----------------------------------------------------------------------
|
|
# COMMERCIAL AND BRANDING REGIONS
|
|
# -----------------------------------------------------------------------
|
|
|
|
# DOC.ADV - advertisement: commercial or promotional content, kept distinct
# from main editorial content.
DOC.ADV:
  name: "ADVERTISEMENT"
  code: "DOC.ADV"
  definition: |
    Commercial or promotional content within document. Important for
    historical documents where ads provide dating, pricing, business,
    and social context. Distinct from main editorial content.

    Domain-agnostic applications:
    - Heritage: Historical newspaper ads, trade catalog entries, broadside ads
    - Publishing: Book advertisements, periodical ads, classified sections
    - Archives: Commercial records, promotional materials
    - Web: Banner ads, sponsored content, promotional sections
    - Ephemera: Trade cards, handbills, promotional flyers

    Advertisements are RICH entity sources for historical research:
    business names, addresses, prices, products, and social attitudes.

  # Class-level alignment with external vocabularies.
  # NOTE(review): "schema:Advertisement" does not appear to be a term in the
  # schema.org vocabulary - confirm the intended class.
  ontology_mappings:
    primary_class: "schema:Advertisement"
    alternative_classes:
      - "crm:E73_Information_Object"
      - "bibo:DocumentPart"
    linkml_mapping:
      class_uri: "schema:Advertisement"

  # How an advertisement region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - ".ad"
        - ".advertisement"
        - "[role='complementary'][aria-label*='sponsor']"
        - "aside.ad"
    pagexml:
      type: "TextRegion[@type='advertisement']"
    newspaper:
      note: "Common in historical newspaper digitization"

  # Advertisement sub-kinds.
  advertisement_types:
    - type: "display_ad"
      description: "Large format advertisement with graphics"
    - type: "classified_ad"
      description: "Text-only small advertisement"
    - type: "trade_listing"
      description: "Business directory entry"
    - type: "prospectus"
      description: "Book/publication advertisement"
    - type: "patent_medicine"
      description: "Historical medical product ads"
      note: "Common in 19th century periodicals"
    - type: "auction_notice"
      description: "Sale or auction announcement"
    - type: "legal_notice"
      description: "Required public announcements"

  # Descriptive properties; `entity_type` names the entity class a value
  # yields, where one is declared.
  advertisement_properties:
    - property: "advertiser"
      uri: "schema:sponsor"
      description: "Business or person placing ad"
      entity_type: "GRP or AGT"
    # NOTE(review): "schema:itemAdvertised" does not appear to be a term in
    # the schema.org vocabulary - confirm the intended property.
    - property: "product"
      uri: "schema:itemAdvertised"
      description: "Product or service advertised"
    - property: "businessAddress"
      uri: "schema:address"
      description: "Advertiser's address"
      entity_type: "TOP"
    - property: "price"
      uri: "schema:price"
      description: "Advertised price"
      entity_type: "QTY"

  entity_extraction_notes: |
    Historical advertisements are TREASURE TROVES:

    1. BUSINESS ENTITIES:
       - Business names -> GROUP entities
       - Proprietor names -> AGENT entities
       - Business addresses -> TOPONYM entities

    2. HISTORICAL VALUE:
       - Dating evidence (product availability)
       - Pricing history
       - Business locations over time
       - Social/cultural attitudes

    3. PROVENANCE:
       - Distinguish ad claims from editorial claims
       - Ads have different authority level
|
|
|
|
# DOC.LOG - logo: visual identity marks (logos, mastheads, printer's devices,
# watermarks, seals, brand identifiers).
DOC.LOG:
  name: "LOGO"
  code: "DOC.LOG"
  definition: |
    Visual identity marks: logos, mastheads, colophon marks, printer's
    devices, watermarks, seals, and brand identifiers. Important for
    attribution and provenance.

    Domain-agnostic applications:
    - Heritage: Printer's marks, publisher devices, watermarks, seals
    - Publishing: Publisher logos, journal mastheads, imprint marks
    - Archives: Institutional seals, letterhead logos
    - Web: Site logos, brand marks, favicons
    - Legal: Notary seals, official stamps, certification marks

    Logos identify PRODUCING AGENTS and provide provenance evidence.

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "schema:ImageObject"
    alternative_classes:
      - "crm:E37_Mark"
      - "crm:E73_Information_Object"
    linkml_mapping:
      class_uri: "schema:ImageObject"
      close_mappings:
        - "crm:E37_Mark"

  # How a logo region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - ".logo"
        - "header img.logo"
        - "[role='banner'] img"
        - ".masthead img"
    pagexml:
      type: "GraphicRegion[@type='logo'], GraphicRegion[@type='decoration']"

  # Mark sub-kinds; `entity_type` is given where the mark identifies an entity.
  logo_types:
    - type: "publisher_logo"
      description: "Publisher's identifying mark"
    - type: "printer_device"
      description: "Historical printer's identifying mark"
      note: "Critical for incunabula identification"
    - type: "masthead"
      description: "Newspaper/periodical title banner"
    - type: "watermark"
      description: "Paper manufacturer's mark"
      note: "Used for paper dating and provenance"
    - type: "seal"
      description: "Official or personal seal impression"
      entity_type: "AGT or GRP"
    - type: "coat_of_arms"
      description: "Heraldic device"
      entity_type: "AGT or GRP"
    - type: "colophon_mark"
      description: "Decorative mark in colophon"
    - type: "ex_libris"
      description: "Bookplate or ownership mark"
      note: "Provenance evidence for ownership history"

  # Descriptive properties of a mark.
  logo_properties:
    - property: "logoOwner"
      uri: "schema:creator"
      description: "Entity identified by logo"
      entity_type: "GRP or AGT"
    - property: "logoDescription"
      uri: "schema:description"
      description: "Visual description of mark"
    - property: "logoReference"
      uri: "schema:isBasedOn"
      description: "Reference to mark catalog/database"

  entity_extraction_notes: |
    Logos provide PROVENANCE evidence:

    1. ATTRIBUTION:
       - Printer's devices identify producer
       - Publisher logos identify publisher
       - Watermarks date paper production

    2. OWNERSHIP HISTORY:
       - Ex libris marks trace ownership
       - Seals indicate institutional provenance

    3. VISUAL ANALYSIS required:
       - May need image matching to logo databases
       - Heraldic interpretation for coats of arms
|
|
|
|
# DOC.PGN - pagination: page numbers, folio numbers, and signature marks.
DOC.PGN:
  name: "PAGINATION"
  # Explicit code, mirroring the mapping key, as on the sibling region types
  # (DOC.LOG, DOC.ADV, ...).
  code: "DOC.PGN"
  definition: |
    Page numbers, folio numbers, signature marks in printed/manuscript works.

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "crm:E42_Identifier"

  # Pagination sub-kinds; `pagexml` holds the PAGE-XML selector where one is
  # declared (none is given for "folio").
  pagination_types:
    - type: "page_number"
      pagexml: "TextRegion[@type='page-number']"
      description: "Arabic or Roman numeral page number"
    - type: "folio"
      description: "Leaf number with recto/verso (e.g., 23r, 23v)"
    - type: "signature_mark"
      pagexml: "TextRegion[@type='signature-mark']"
      description: "Gathering/quire identifier in manuscripts"
    - type: "catch_word"
      pagexml: "TextRegion[@type='catch-word']"
      description: "Word at page bottom matching next page start"
|
|
|
|
# DOC.BLK - block quote: extended quotation from another source, set apart
# from the surrounding text.
DOC.BLK:
  name: "BLOCK_QUOTE"
  # Explicit code, mirroring the mapping key, as on the sibling region types
  # (DOC.LOG, DOC.ADV, ...).
  code: "DOC.BLK"
  definition: |
    Extended quotation from another source, typically indented or styled
    distinctly from surrounding text.

  # Class-level alignment with external vocabularies.
  ontology_mappings:
    primary_class: "schema:Quotation"
    alternative_classes:
      - "crm:E33_Linguistic_Object"

  # Descriptive properties; `owl_mapping` names the PROV term declared for
  # the project-local glam:quotedFrom property.
  quote_properties:
    - property: "quotedFrom"
      uri: "glam:quotedFrom"
      range: "xsd:anyURI"
      owl_mapping: "prov:wasDerivedFrom"
      description: "Source of the quotation"
|
|
|
|
# -----------------------------------------------------------------------
|
|
# METADATA AND ADMINISTRATIVE REGIONS
|
|
# -----------------------------------------------------------------------
|
|
|
|
DOC.MTD:
|
|
name: "METADATA_BLOCK"
|
|
definition: |
|
|
Region containing document metadata (author, date, keywords, etc.).
|
|
High-value for entity extraction as claims are typically structured.
|
|
|
|
Includes: HTML <head>, document properties, front matter, colophon.
|
|
|
|
ontology_mappings:
|
|
primary_class: "dcterms:BibliographicResource"
|
|
alternative_classes:
|
|
- "schema:CreativeWork"
|
|
|
|
metadata_block_types:
|
|
- type: "front_matter"
|
|
description: "Title page, copyright, dedication"
|
|
- type: "back_matter"
|
|
description: "Appendices, bibliography, colophon"
|
|
- type: "colophon"
|
|
description: "Production details (printer, date, place)"
|
|
pagexml: "TextRegion[@type='colophon']"
|
|
- type: "document_head"
|
|
html: "head"
|
|
description: "HTML metadata section"
|
|
|
|
DOC.ANN:
|
|
name: "ANNOTATION_REGION"
|
|
definition: |
|
|
Region containing annotations or markup added to document.
|
|
Distinguished from original content for provenance tracking.
|
|
|
|
Includes: Editorial additions, transcription notes, TEI annotations.
|
|
|
|
ontology_mappings:
|
|
primary_class: "oa:Annotation"
|
|
alternative_classes:
|
|
- "crm:E13_Attribute_Assignment"
|
|
|
|
annotation_properties:
|
|
- property: "annotationBody"
|
|
uri: "oa:hasBody"
|
|
description: "Content of the annotation"
|
|
- property: "annotationTarget"
|
|
uri: "oa:hasTarget"
|
|
description: "Region being annotated"
|
|
- property: "annotator"
|
|
uri: "dcterms:creator"
|
|
owl_mapping: "prov:wasAttributedTo"
|
|
description: "Agent who created annotation"
|
|
|
|
# =============================================================================
|
|
# SEMANTIC ROLE ENUMERATION
|
|
# =============================================================================
|
|
|
|
layout_semantic_roles:
|
|
description: |
|
|
Enumeration of semantic roles that document regions can play.
|
|
A single region may have multiple roles.
|
|
|
|
roles:
|
|
- role: "PRIMARY_CONTENT"
|
|
code: "PRIM"
|
|
description: "Main content bearing primary information"
|
|
typical_regions: ["DOC.PAR", "DOC.HDR", "DOC.LST"]
|
|
|
|
- role: "SUPPLEMENTARY"
|
|
code: "SUPP"
|
|
description: "Additional context or metadata"
|
|
typical_regions: ["DOC.SDB", "DOC.FTN", "DOC.CAP", "DOC.APP"]
|
|
|
|
- role: "NAVIGATIONAL"
|
|
code: "NAV"
|
|
description: "Aids document navigation"
|
|
typical_regions: ["DOC.NAV", "DOC.PGN", "DOC.TOC", "DOC.IDX"]
|
|
|
|
- role: "STRUCTURAL"
|
|
code: "STRC"
|
|
description: "Defines document structure"
|
|
typical_regions: ["DOC.HDR", "DOC.TTP"]
|
|
|
|
- role: "REFERENTIAL"
|
|
code: "REF"
|
|
description: "Points to other resources"
|
|
typical_regions: ["DOC.FTN", "DOC.BLK", "DOC.BIB"]
|
|
|
|
- role: "VISUAL"
|
|
code: "VIS"
|
|
description: "Non-textual visual content (images, maps, diagrams)"
|
|
typical_regions: ["DOC.FIG", "DOC.GAL", "DOC.MAP", "DOC.LOG"]
|
|
|
|
- role: "AUDIOVISUAL"
|
|
code: "AV"
|
|
description: "Time-based media content (audio, video)"
|
|
typical_regions: ["DOC.AUD", "DOC.VID"]
|
|
|
|
- role: "INTERACTIVE"
|
|
code: "INT"
|
|
description: "User-manipulable embedded content"
|
|
typical_regions: ["DOC.EMB", "DOC.MAP"]
|
|
note: "Interactive maps have both VIS and INT roles"
|
|
|
|
- role: "METADATA"
|
|
code: "META"
|
|
description: "Document-level metadata"
|
|
typical_regions: ["DOC.MTD", "DOC.COL", "DOC.TTP"]
|
|
|
|
- role: "SPATIAL"
|
|
code: "SPAT"
|
|
description: "Geographic or spatial representation"
|
|
typical_regions: ["DOC.MAP"]
|
|
note: "Distinct from VISUAL; emphasizes coordinate/location semantics"
|
|
|
|
- role: "FRONT_MATTER"
|
|
code: "FRNT"
|
|
description: "Preliminary material before main content"
|
|
typical_regions: ["DOC.TTP", "DOC.DED", "DOC.TOC"]
|
|
|
|
- role: "BACK_MATTER"
|
|
code: "BACK"
|
|
description: "Material following main content"
|
|
typical_regions: ["DOC.BIB", "DOC.IDX", "DOC.APP", "DOC.GLO", "DOC.COL"]
|
|
|
|
- role: "PARATEXTUAL"
|
|
code: "PARA"
|
|
description: "Content about the document itself (not subject matter)"
|
|
typical_regions: ["DOC.TTP", "DOC.COL", "DOC.DED", "DOC.ADV"]
|
|
note: "Genette's paratext concept - frames the main text"
|
|
|
|
- role: "COMMERCIAL"
|
|
code: "COMM"
|
|
description: "Commercial or promotional content"
|
|
typical_regions: ["DOC.ADV", "DOC.LOG"]
|
|
note: "Distinguish from editorial content for authority assessment"
|
|
|
|
- role: "LEXICAL"
|
|
code: "LEX"
|
|
description: "Vocabulary, terminology, and definitions"
|
|
typical_regions: ["DOC.GLO", "DOC.IDX"]
|
|
|
|
# =============================================================================
|
|
# STRUCTURAL CONTEXTS FOR CLUSTERING
|
|
# =============================================================================
|
|
|
|
structural_contexts:
|
|
description: |
|
|
Document regions provide semantic context for entity clustering and relationship
|
|
inference. Entities appearing in the same structural context are more likely
|
|
to be related or co-referential than entities in distant structures.
|
|
|
|
This section defines annotation rules for major structural context types.
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# List Contexts
|
|
# ---------------------------------------------------------------------------
|
|
list_contexts:
|
|
description: |
|
|
Lists (ul, ol, dl) have special semantics. Items in a list:
|
|
- Share a common parent context (the list introduction)
|
|
- Often have parallel structure (same entity types)
|
|
- May be implicitly related (all members of same group)
|
|
|
|
annotation_rules:
|
|
- rule: "LIST_ITEM_PARALLELISM"
|
|
description: |
|
|
Entities in sibling list items likely share:
|
|
- Same entity TYPE (all museums, all dates, all people)
|
|
- Same RELATIONSHIP to list parent (all "member of", all "located in")
|
|
- Parallel STRUCTURE (if first item has date, others likely do too)
|
|
|
|
- rule: "LIST_HEADER_INHERITANCE"
|
|
description: |
|
|
List items inherit context from the text introducing the list.
|
|
"The following museums are members:" + <li>Rijksmuseum</li>
|
|
implies Rijksmuseum has membership relationship.
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Sidebar Contexts
|
|
# ---------------------------------------------------------------------------
|
|
sidebar_contexts:
|
|
description: |
|
|
Sidebars, asides, marginalia, infoboxes contain SUPPLEMENTARY information.
|
|
Entities in sidebars:
|
|
- Provide metadata ABOUT the main content
|
|
- May contain structured data (birth dates, locations, identifiers)
|
|
- Have different reliability/authority than main narrative
|
|
|
|
annotation_rules:
|
|
- rule: "SIDEBAR_METADATA_EXTRACTION"
|
|
description: |
|
|
Sidebar content often contains STRUCTURED CLAIMS suitable for
|
|
direct property extraction:
|
|
- Infobox fields → claim properties
|
|
- Marginalia dates → temporal metadata
|
|
- Caption text → aboutness relationships
|
|
|
|
- rule: "SIDEBAR_MAIN_LINKING"
|
|
description: |
|
|
Link sidebar entities to main content entities when co-referential.
|
|
The sidebar "Born: 1606" links to the main text "Rembrandt".
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Caption Contexts
|
|
# ---------------------------------------------------------------------------
|
|
caption_contexts:
|
|
description: |
|
|
Captions (figcaption, TextRegion[@type='caption']) describe VISUAL content.
|
|
Entities in captions:
|
|
- Describe depicted subjects (people, places, objects)
|
|
- Provide dates/locations for the depicted scene
|
|
- May differ from main text (image of one thing, text about another)
|
|
|
|
annotation_rules:
|
|
- rule: "CAPTION_VISUAL_BINDING"
|
|
description: |
|
|
Caption entities are ABOUT the associated figure/image.
|
|
This is distinct from main text co-occurrence.
|
|
Use relationship: schema:about with target: figure URI.
|
|
|
|
- rule: "CAPTION_PROVENANCE"
|
|
description: |
|
|
Captions may have different authorship/dates than main text.
|
|
Track caption-specific provenance when available.
|
|
|
|
# =============================================================================
|
|
# FORMAT-SPECIFIC PATH CONVENTIONS
|
|
# =============================================================================
|
|
|
|
format_path_conventions:
|
|
description: |
|
|
Different document formats require different path syntaxes for locating
|
|
entities within document structure. This section defines conventions for
|
|
each supported format.
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# PAGE-XML Paths
|
|
# ---------------------------------------------------------------------------
|
|
page_xml:
|
|
description: |
|
|
PAGE-XML (used for historical manuscript transcription) organizes content
|
|
into TextRegions with type attributes. Critical for manuscript NER.
|
|
|
|
namespace: "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
|
|
prefix: "page"
|
|
|
|
text_region_types:
|
|
- type: "heading"
|
|
semantic_role: "section_heading"
|
|
doc_region: "DOC.HDR"
|
|
path_example: "//page:TextRegion[@type='heading'][1]"
|
|
|
|
- type: "paragraph"
|
|
semantic_role: "body_content"
|
|
doc_region: "DOC.PAR"
|
|
path_example: "//page:TextRegion[@type='paragraph'][3]"
|
|
|
|
- type: "marginalia-left"
|
|
semantic_role: "supplementary_note"
|
|
doc_region: "DOC.SDB.MRG"
|
|
path_example: "//page:TextRegion[@type='marginalia-left'][1]"
|
|
|
|
- type: "marginalia-right"
|
|
semantic_role: "supplementary_note"
|
|
doc_region: "DOC.SDB.MRG"
|
|
path_example: "//page:TextRegion[@type='marginalia-right'][1]"
|
|
|
|
- type: "caption"
|
|
semantic_role: "figure_description"
|
|
doc_region: "DOC.CAP"
|
|
path_example: "//page:TextRegion[@type='caption'][1]"
|
|
|
|
- type: "page-number"
|
|
semantic_role: "pagination"
|
|
doc_region: "DOC.PGN"
|
|
path_example: "//page:TextRegion[@type='page-number'][1]"
|
|
|
|
- type: "signature-mark"
|
|
semantic_role: "gathering_identifier"
|
|
doc_region: "DOC.PGN"
|
|
path_example: "//page:TextRegion[@type='signature-mark'][1]"
|
|
|
|
- type: "catch-word"
|
|
semantic_role: "gathering_continuity"
|
|
doc_region: "DOC.PGN"
|
|
path_example: "//page:TextRegion[@type='catch-word'][1]"
|
|
|
|
- type: "table"
|
|
semantic_role: "structured_data"
|
|
doc_region: "DOC.TBL"
|
|
path_example: "//page:TextRegion[@type='table'][1]"
|
|
|
|
- type: "footnote"
|
|
semantic_role: "reference_note"
|
|
doc_region: "DOC.FTN"
|
|
path_example: "//page:TextRegion[@type='footnote'][1]"
|
|
|
|
- type: "table-of-contents"
|
|
semantic_role: "navigation"
|
|
doc_region: "DOC.TOC"
|
|
path_example: "//page:TextRegion[@type='table-of-contents'][1]"
|
|
|
|
- type: "index"
|
|
semantic_role: "navigation"
|
|
doc_region: "DOC.IDX"
|
|
path_example: "//page:TextRegion[@type='index'][1]"
|
|
|
|
- type: "title-page"
|
|
semantic_role: "front_matter"
|
|
doc_region: "DOC.TTP"
|
|
path_example: "//page:TextRegion[@type='title-page'][1]"
|
|
|
|
- type: "colophon"
|
|
semantic_role: "production_statement"
|
|
doc_region: "DOC.COL"
|
|
path_example: "//page:TextRegion[@type='colophon'][1]"
|
|
|
|
- type: "advertisement"
|
|
semantic_role: "commercial_content"
|
|
doc_region: "DOC.ADV"
|
|
path_example: "//page:TextRegion[@type='advertisement'][1]"
|
|
|
|
annotation_output:
|
|
description: "Include PAGE-XML path in entity provenance"
|
|
example:
|
|
entity: "Rembrandt"
|
|
span_start: 145
|
|
span_end: 154
|
|
page_xml_path: "//page:Page[@imageFilename='folio_23r.jpg']/page:TextRegion[@id='r1']/page:TextLine[@id='l1']/page:Word[@id='w3']"
|
|
text_region_type: "paragraph"
|
|
text_region_id: "r1"
|
|
doc_region: "DOC.PAR"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HTML Paths
|
|
# ---------------------------------------------------------------------------
|
|
html:
|
|
description: |
|
|
HTML documents use semantic elements for structure. Critical for web NER.
|
|
|
|
semantic_elements:
|
|
headers:
|
|
elements: ["h1", "h2", "h3", "h4", "h5", "h6"]
|
|
semantic_role: "section_heading"
|
|
doc_region: "DOC.HDR"
|
|
hierarchy: "h1 > h2 > h3 > h4 > h5 > h6"
|
|
|
|
body_content:
|
|
elements: ["p", "div.content", "article", "section", "main"]
|
|
semantic_role: "primary_content"
|
|
doc_region: "DOC.PAR"
|
|
|
|
supplementary:
|
|
elements: ["aside", "nav", "footer"]
|
|
semantic_role: "supplementary_content"
|
|
doc_regions:
|
|
aside: "DOC.SDB"
|
|
nav: "DOC.NAV"
|
|
footer: "DOC.MTD"
|
|
|
|
figures:
|
|
elements: ["figure", "figcaption", "img"]
|
|
semantic_role: "visual_content"
|
|
doc_regions:
|
|
figure: "DOC.FIG"
|
|
figcaption: "DOC.CAP"
|
|
|
|
lists:
|
|
elements: ["ul", "ol", "li", "dl", "dt", "dd"]
|
|
semantic_role: "enumerated_content"
|
|
doc_region: "DOC.LST"
|
|
|
|
tables:
|
|
elements: ["table", "thead", "tbody", "tr", "th", "td"]
|
|
semantic_role: "structured_data"
|
|
doc_region: "DOC.TBL"
|
|
|
|
metadata:
|
|
elements: ["meta", "title", "head"]
|
|
semantic_role: "document_metadata"
|
|
doc_region: "DOC.MTD"
|
|
|
|
media:
|
|
audio:
|
|
elements: ["audio", "source[type^='audio']"]
|
|
doc_region: "DOC.AUD"
|
|
video:
|
|
elements: ["video", "source[type^='video']", "iframe[src*='youtube']", "iframe[src*='vimeo']"]
|
|
doc_region: "DOC.VID"
|
|
embed:
|
|
elements: ["iframe", "embed", "object"]
|
|
doc_region: "DOC.EMB"
|
|
|
|
xpath_conventions:
|
|
- pattern: "//article/section[2]/h2[1]"
|
|
description: "Second section's heading in article"
|
|
doc_region: "DOC.HDR"
|
|
|
|
- pattern: "//table[@class='infobox']//td[contains(text(),'Born')]"
|
|
description: "Birth date cell in Wikipedia-style infobox"
|
|
doc_region: "DOC.SDB.IBX"
|
|
|
|
- pattern: "//aside//a[@href]"
|
|
description: "Links in sidebar content"
|
|
doc_region: "DOC.SDB"
|
|
|
|
- pattern: "//figure/figcaption"
|
|
description: "Caption for figure"
|
|
doc_region: "DOC.CAP"
|
|
|
|
- pattern: "//nav[@aria-label='breadcrumb']//li"
|
|
description: "Breadcrumb navigation items"
|
|
doc_region: "DOC.NAV"
|
|
|
|
annotation_output:
|
|
example:
|
|
entity: "Rijksmuseum"
|
|
span_start: 2341
|
|
span_end: 2352
|
|
xpath: "/html/body/main/article/section[3]/p[2]"
|
|
css_selector: "article > section:nth-of-type(3) > p:nth-of-type(2)"
|
|
semantic_context: "body_content"
|
|
doc_region: "DOC.PAR"
|
|
parent_header: "Dutch Museums"
|
|
parent_header_path: "/html/body/main/article/section[3]/h2"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# JSON Paths
|
|
# ---------------------------------------------------------------------------
|
|
json:
|
|
description: |
|
|
JSON documents use key paths for structure. Critical for API response NER.
|
|
|
|
path_notation: "JSONPath (RFC 9535) or dot notation"
|
|
|
|
common_patterns:
|
|
- pattern: "$.results[*].name"
|
|
description: "Name field in array of results"
|
|
likely_entity: "any named entity"
|
|
|
|
- pattern: "$.data.institution.address.city"
|
|
description: "Nested city within institution data"
|
|
likely_entity: "TOP"
|
|
|
|
- pattern: "$['@context']"
|
|
description: "JSON-LD context (for namespace resolution)"
|
|
semantic_role: "metadata"
|
|
|
|
- pattern: "$.metadata.creator"
|
|
description: "Document creator in metadata block"
|
|
likely_entity: "AGT"
|
|
|
|
- pattern: "$.features[*].geometry"
|
|
description: "GeoJSON geometry objects"
|
|
likely_entity: "GEO"
|
|
|
|
- pattern: "$.features[*].properties.name"
|
|
description: "GeoJSON feature names"
|
|
likely_entity: "TOP"
|
|
|
|
semantic_inference:
|
|
description: |
|
|
JSON keys often encode semantic roles. Use key names to guide entity typing:
|
|
|
|
key_patterns:
|
|
- keys: ["name", "title", "label", "displayName"]
|
|
likely_content: "entity names"
|
|
|
|
- keys: ["date", "created", "founded", "birthDate", "deathDate", "startDate", "endDate"]
|
|
likely_entity: "TMP"
|
|
|
|
- keys: ["location", "address", "place", "city", "country", "coordinates"]
|
|
likely_entity: "TOP or GEO"
|
|
|
|
- keys: ["author", "creator", "owner", "contributor", "artist"]
|
|
likely_entity: "AGT"
|
|
|
|
- keys: ["type", "category", "class", "@type"]
|
|
semantic_role: "classification"
|
|
|
|
- keys: ["id", "identifier", "@id", "uri", "url"]
|
|
semantic_role: "identifier"
|
|
|
|
annotation_output:
|
|
example:
|
|
entity: "Amsterdam"
|
|
json_path: "$.data.museums[0].location.city"
|
|
array_index: 0
|
|
parent_key: "location"
|
|
root_key: "museums"
|
|
inferred_type: "TOP"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TEI-XML Paths
|
|
# ---------------------------------------------------------------------------
|
|
tei_xml:
|
|
description: |
|
|
TEI (Text Encoding Initiative) is the standard for scholarly text encoding.
|
|
Critical for digital humanities NER.
|
|
|
|
namespace: "http://www.tei-c.org/ns/1.0"
|
|
prefix: "tei"
|
|
|
|
structural_elements:
|
|
- element: "tei:front"
|
|
semantic_role: "front_matter"
|
|
doc_regions: ["DOC.TTP", "DOC.DED", "DOC.TOC"]
|
|
|
|
- element: "tei:body"
|
|
semantic_role: "main_content"
|
|
doc_region: "DOC.PAR"
|
|
|
|
- element: "tei:back"
|
|
semantic_role: "back_matter"
|
|
doc_regions: ["DOC.BIB", "DOC.IDX", "DOC.APP", "DOC.GLO"]
|
|
|
|
- element: "tei:div[@type='chapter']"
|
|
semantic_role: "major_section"
|
|
|
|
- element: "tei:p"
|
|
semantic_role: "paragraph"
|
|
doc_region: "DOC.PAR"
|
|
|
|
- element: "tei:head"
|
|
semantic_role: "heading"
|
|
doc_region: "DOC.HDR"
|
|
|
|
- element: "tei:note[@place='margin']"
|
|
semantic_role: "marginalia"
|
|
doc_region: "DOC.SDB.MRG"
|
|
|
|
- element: "tei:note[@type='footnote']"
|
|
semantic_role: "footnote"
|
|
doc_region: "DOC.FTN"
|
|
|
|
- element: "tei:figure"
|
|
semantic_role: "figure"
|
|
doc_region: "DOC.FIG"
|
|
|
|
- element: "tei:figDesc"
|
|
semantic_role: "figure_description"
|
|
doc_region: "DOC.CAP"
|
|
|
|
- element: "tei:table"
|
|
semantic_role: "table"
|
|
doc_region: "DOC.TBL"
|
|
|
|
- element: "tei:listBibl"
|
|
semantic_role: "bibliography"
|
|
doc_region: "DOC.BIB"
|
|
|
|
- element: "tei:list[@type='gloss']"
|
|
semantic_role: "glossary"
|
|
doc_region: "DOC.GLO"
|
|
|
|
named_entity_elements:
|
|
description: |
|
|
TEI provides dedicated elements for named entities. These should be
|
|
cross-referenced with GLAM-NER annotations.
|
|
|
|
elements:
|
|
- tei_element: "tei:persName"
|
|
glam_ner_type: "AGT.PER"
|
|
|
|
- tei_element: "tei:orgName"
|
|
glam_ner_type: "GRP.ORG"
|
|
|
|
- tei_element: "tei:placeName"
|
|
glam_ner_type: "TOP"
|
|
|
|
- tei_element: "tei:geogName"
|
|
glam_ner_type: "TOP.NAT"
|
|
|
|
- tei_element: "tei:date"
|
|
glam_ner_type: "TMP"
|
|
|
|
- tei_element: "tei:title"
|
|
glam_ner_type: "WRK"
|
|
|
|
annotation_output:
|
|
example:
|
|
entity: "Rembrandt van Rijn"
|
|
tei_path: "//tei:body/tei:div[@type='chapter'][2]/tei:p[3]/tei:persName[1]"
|
|
tei_element: "persName"
|
|
doc_region: "DOC.PAR"
|
|
existing_tei_annotation: true
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Plain Text Paths
|
|
# ---------------------------------------------------------------------------
|
|
plain_text:
|
|
description: |
|
|
Plain text lacks structural markup. Use line/paragraph detection heuristics.
|
|
|
|
structure_detection:
|
|
- heuristic: "BLANK_LINE_PARAGRAPH"
|
|
description: "Two or more consecutive newlines (one or more blank lines) indicate a paragraph break"
|
|
regex: "\\n\\n+"
|
|
|
|
- heuristic: "INDENTATION_STRUCTURE"
|
|
description: "Consistent indentation may indicate hierarchy"
|
|
regex: "^(\\s+)"
|
|
|
|
- heuristic: "CAPITALIZATION_HEADERS"
|
|
description: "ALL CAPS or Title Case lines may be headers"
|
|
patterns:
|
|
- all_caps: "^[A-Z][A-Z\\s]+$"
|
|
- title_case: "^[A-Z][a-z]+(\\s[A-Z][a-z]+)*\\s*$"
|
|
|
|
- heuristic: "ENUMERATION_LISTS"
|
|
description: "Lines starting with numbers/bullets are list items"
|
|
patterns:
|
|
- numbered: "^\\d+[.)\\s]"
|
|
- bulleted: "^[•\\-\\*]\\s"
|
|
- lettered: "^[a-z][.)]\\s"
|
|
|
|
offset_notation:
|
|
description: |
|
|
For plain text, use character offsets with optional line/paragraph indices.
|
|
|
|
format: "char:{start}-{end};line:{line};para:{paragraph}"
|
|
|
|
examples:
|
|
- "char:1456-1470;line:23;para:5"
|
|
- "char:0-15;line:1;para:1"
|
|
|
|
annotation_output:
|
|
description: "Use character offsets with paragraph/line indices"
|
|
example:
|
|
entity: "Dr. Jan de Wit"
|
|
char_start: 1456
|
|
char_end: 1470
|
|
line_number: 23
|
|
paragraph_index: 5
|
|
inferred_section: "unknown"
|
|
doc_region: "DOC.PAR"
|
|
|
|
|
|
# =============================================================================
|
|
# ENTITY CLUSTERING BY STRUCTURAL PATH
|
|
# =============================================================================
|
|
|
|
clustering_strategies:
|
|
description: |
|
|
Entities sharing structural context are more likely to be related or
|
|
co-referential. This section defines clustering algorithms that leverage
|
|
document structure for improved NER and relationship extraction.
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Path Prefix Clustering
|
|
# ---------------------------------------------------------------------------
|
|
path_prefix_clustering:
|
|
description: |
|
|
Entities sharing a common path prefix belong to the same structural unit
|
|
and should be clustered for co-reference resolution and relationship extraction.
|
|
|
|
algorithm:
|
|
- step: 1
|
|
action: "Extract path for each entity annotation"
|
|
output: "entity_path_map"
|
|
|
|
- step: 2
|
|
action: "Compute longest common prefix (LCP) for entity pairs"
|
|
output: "pairwise_lcp"
|
|
|
|
- step: 3
|
|
action: "Cluster entities where LCP depth >= threshold"
|
|
parameters:
|
|
threshold: 2
|
|
note: "Depth 2 = same section; adjust for document type"
|
|
|
|
- step: 4
|
|
action: "Within clusters, resolve co-references before cross-cluster"
|
|
priority: "intra-cluster > inter-cluster"
|
|
|
|
example:
|
|
entities:
|
|
- name: "Rembrandt"
|
|
path: "/article/section[2]/p[1]"
|
|
type: "AGT.PER"
|
|
|
|
- name: "Saskia"
|
|
path: "/article/section[2]/p[1]"
|
|
type: "AGT.PER"
|
|
|
|
- name: "the painter"
|
|
path: "/article/section[2]/p[2]"
|
|
type: "AGT.PER"
|
|
|
|
- name: "Vermeer"
|
|
path: "/article/section[3]/p[1]"
|
|
type: "AGT.PER"
|
|
|
|
clusters:
|
|
- cluster_id: 1
|
|
path_prefix: "/article/section[2]"
|
|
entities: ["Rembrandt", "Saskia", "the painter"]
|
|
coref_candidates:
|
|
- mention: "the painter"
|
|
antecedent: "Rembrandt"
|
|
confidence: 0.85
|
|
rationale: "Same section (adjacent paragraphs), definite description"
|
|
|
|
- cluster_id: 2
|
|
path_prefix: "/article/section[3]"
|
|
entities: ["Vermeer"]
|
|
note: "Separate section, no co-reference with cluster 1"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Hierarchical Context Inheritance
|
|
# ---------------------------------------------------------------------------
|
|
hierarchical_inheritance:
|
|
description: |
|
|
Properties from ancestor nodes propagate to descendant entities.
|
|
Header entities establish context inherited by paragraph entities.
|
|
|
|
inheritance_rules:
|
|
- rule: "TEMPORAL_SCOPE"
|
|
description: "Date range in header bounds dates in body"
|
|
example:
|
|
header: "The 17th Century (1600-1699)"
|
|
header_path: "/article/section[1]/h2"
|
|
body_date: "1642"
|
|
body_path: "/article/section[1]/p[3]"
|
|
inherited_context:
|
|
temporal_scope: "1600-1699"
|
|
validation: "1642 falls within 1600-1699"
|
|
|
|
- rule: "SPATIAL_SCOPE"
|
|
description: "Location in header provides default for body entities"
|
|
example:
|
|
header: "Museums in Amsterdam"
|
|
header_path: "/article/section[2]/h2"
|
|
body_org: "Rijksmuseum"
|
|
body_path: "/article/section[2]/p[1]"
|
|
inherited_context:
|
|
default_location: "Amsterdam"
|
|
relationship: "located_in"
|
|
|
|
- rule: "TOPIC_SCOPE"
|
|
description: "Subject in header provides aboutness for body"
|
|
example:
|
|
header: "Rembrandt van Rijn"
|
|
header_path: "/article/section[3]/h2"
|
|
body_text: "He painted The Night Watch"
|
|
body_path: "/article/section[3]/p[1]"
|
|
inherited_context:
|
|
pronoun_antecedent: "Rembrandt van Rijn"
|
|
aboutness: "Rembrandt van Rijn"
|
|
|
|
- rule: "ENTITY_TYPE_SCOPE"
|
|
description: "Entity type in header suggests types for body"
|
|
example:
|
|
header: "Notable Architects"
|
|
header_path: "/article/section[4]/h2"
|
|
body_names: ["Hendrik Petrus Berlage", "J.J.P. Oud"]
|
|
body_path: "/article/section[4]/ul/li"
|
|
inherited_context:
|
|
likely_type: "AGT.PER"
|
|
likely_role: "architect"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Cross-Reference Resolution
|
|
# ---------------------------------------------------------------------------
|
|
cross_reference_resolution:
|
|
description: |
|
|
Resolve pronouns and definite references using structural proximity.
|
|
Closer structural context = higher resolution priority.
|
|
|
|
resolution_priority:
|
|
- priority: 1
|
|
scope: "same_sentence"
|
|
description: "Check for antecedent in same sentence"
|
|
strategy: "Recency + syntactic constraints"
|
|
|
|
- priority: 2
|
|
scope: "same_paragraph"
|
|
description: "Check preceding sentences in paragraph"
|
|
strategy: "Recency + entity salience"
|
|
|
|
- priority: 3
|
|
scope: "same_section"
|
|
description: "Check preceding paragraphs in section"
|
|
strategy: "Topic coherence + entity prominence"
|
|
|
|
- priority: 4
|
|
scope: "section_header"
|
|
description: "Check governing section header"
|
|
strategy: "Header entities are topical anchors"
|
|
|
|
- priority: 5
|
|
scope: "document"
|
|
description: "Check document-level prominent entities"
|
|
strategy: "Title entities, frequently mentioned entities"
|
|
|
|
example:
|
|
text: |
|
|
## Rembrandt van Rijn
|
|
|
|
The Dutch painter was born in Leiden. He moved to Amsterdam in 1631.
|
|
His most famous work is The Night Watch.
|
|
|
|
resolutions:
|
|
- mention: "The Dutch painter"
|
|
type: "definite_description"
|
|
antecedent: "Rembrandt van Rijn"
|
|
resolution_scope: "section_header"
|
|
confidence: 0.95
|
|
|
|
- mention: "He"
|
|
type: "pronoun"
|
|
antecedent: "'The Dutch painter' -> 'Rembrandt van Rijn'"
|
|
resolution_scope: "same_paragraph"
|
|
confidence: 0.98
|
|
|
|
- mention: "His"
|
|
type: "possessive_pronoun"
|
|
antecedent: "'He' -> 'Rembrandt van Rijn'"
|
|
resolution_scope: "same_paragraph"
|
|
confidence: 0.98
|
|
|
|
- mention: "The Night Watch"
|
|
type: "named_entity"
|
|
antecedent: null
|
|
relationship_to: "Rembrandt van Rijn"
|
|
relationship_type: "crm:P108i_was_produced_by/crm:P14_carried_out_by"
|
|
note: "Work entity, not a co-reference"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Multi-Document Clustering
|
|
# ---------------------------------------------------------------------------
|
|
multi_document_clustering:
|
|
description: |
|
|
When annotating corpora with multiple documents, entities may appear
|
|
across documents. Cross-document entity linking uses:
|
|
- Shared identifiers (Wikidata, VIAF, etc.)
|
|
- Name matching with disambiguation
|
|
- Contextual similarity
|
|
|
|
strategies:
|
|
- strategy: "IDENTIFIER_LINKING"
|
|
description: "Same external identifier = same entity"
|
|
priority: 1
|
|
example:
|
|
doc1_entity: "Rijksmuseum"
|
|
doc1_wikidata: "Q190804"
|
|
doc2_entity: "Rijksmuseum Amsterdam"
|
|
doc2_wikidata: "Q190804"
|
|
result: "same_entity"
|
|
|
|
- strategy: "NAME_MATCHING"
|
|
description: "Similar names with compatible context"
|
|
priority: 2
|
|
methods:
|
|
- exact_match: "Identical strings"
|
|
- normalized_match: "Case/diacritic normalization"
|
|
- alias_match: "Known aliases (from knowledge base)"
|
|
- fuzzy_match: "Edit distance < threshold"
|
|
|
|
- strategy: "CONTEXT_SIMILARITY"
|
|
description: "Similar co-occurring entities suggest same referent"
|
|
priority: 3
|
|
example:
|
|
doc1: "Rembrandt painted in Amsterdam"
|
|
doc2: "The artist worked in the Dutch Republic's capital"
|
|
shared_context: "Amsterdam / Dutch capital"
|
|
inference: "Likely same person"
|
|
|
|
# =============================================================================
|
|
# PROVENANCE PATH REQUIREMENTS
|
|
# =============================================================================
|
|
|
|
provenance_requirements:
|
|
description: |
|
|
ALL entity annotations MUST include path information for provenance.
|
|
This enables verification, reproducibility, and precise citation.
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Mandatory Fields
|
|
# ---------------------------------------------------------------------------
|
|
mandatory_path_fields:
|
|
description: |
|
|
Required provenance fields for every entity annotation.
|
|
|
|
required_fields:
|
|
- field: "source_document_uri"
|
|
type: "URI"
|
|
required: true
|
|
description: "Identifier for the source document"
|
|
examples:
|
|
- "https://example.org/documents/manuscript_123.xml"
|
|
- "file:///data/archives/letter_001.tei"
|
|
- "urn:isbn:978-0-123456-78-9"
|
|
|
|
- field: "document_format"
|
|
type: "enum"
|
|
required: true
|
|
values: ["PAGE-XML", "HTML", "JSON", "TEI-XML", "PLAIN_TEXT", "PDF", "ALTO-XML"]
|
|
description: "Format of source document (determines path syntax)"
|
|
|
|
- field: "structural_path"
|
|
type: "string"
|
|
required: true
|
|
description: "XPath, JSONPath, or character offset path to entity"
|
|
format_examples:
|
|
PAGE-XML: "//page:TextRegion[@id='r5']/page:TextLine[@id='l3']"
|
|
HTML: "/html/body/article/section[2]/p[1]"
|
|
JSON: "$.data.institutions[3].name"
|
|
TEI-XML: "//tei:body/tei:div[2]/tei:p[1]/tei:persName[1]"
|
|
PLAIN_TEXT: "char:1456-1470;line:23;para:5"
|
|
|
|
- field: "doc_region"
|
|
type: "enum"
|
|
required: true
|
|
description: "GLAM-NER document region code"
|
|
values_reference: "See document_regions section"
|
|
examples: ["DOC.PAR", "DOC.HDR", "DOC.CAP", "DOC.SDB.IBX"]
|
|
|
|
- field: "structural_context"
|
|
type: "enum"
|
|
required: true
|
|
values:
|
|
- "header"
|
|
- "paragraph"
|
|
- "list_item"
|
|
- "table_cell"
|
|
- "caption"
|
|
- "sidebar"
|
|
- "marginalia"
|
|
- "footnote"
|
|
- "metadata"
|
|
- "index_entry"
|
|
- "bibliography_entry"
|
|
- "title_page"
|
|
description: "Semantic role of containing structure"
|
|
|
|
# Fields that may accompany an annotation; every entry is required: false.
optional_fields:
  # Header that governs the entity's section, given as path + text + level.
  - field: "governing_header"
    type: "object"
    required: false
    description: "Path and text of governing header (if applicable)"
    schema:
      path:
        type: "string"
        description: "XPath/JSONPath to header element"
      text:
        type: "string"
        description: "Header text content"
      level:
        type: "integer"
        description: "Header level (1=h1, 2=h2, etc.)"

  # Nearest enclosing container of the entity.
  - field: "parent_container"
    type: "object"
    required: false
    description: "Immediate parent container (list, table, figure, etc.)"
    schema:
      path:
        type: "string"
      # The outer "type" is the name of a sub-property of parent_container;
      # the inner "type" is that sub-property's data type.
      type:
        type: "string"
        description: "Container type (ul, table, figure, etc.)"

  # Physical location for paginated sources; all parts are strings.
  - field: "page_reference"
    type: "object"
    required: false
    description: "Physical page information (for paginated documents)"
    schema:
      page_number:
        type: "string"
      folio:
        type: "string"
        description: "Folio reference (e.g., '23r', '45v')"
      image_filename:
        type: "string"
|
|
|
|
# ---------------------------------------------------------------------------
# NIF Context Alignment
# ---------------------------------------------------------------------------
|
|
# Maps GLAM-NER annotation fields onto NIF 2.0 properties and declares the
# glam:* properties that carry structural provenance alongside them.
nif_context_alignment:
  description: |
    Path information aligns with NIF (NLP Interchange Format) Context
    for annotation interchange. GLAM-NER extends NIF with structural path.

  # Standard NIF properties and the GLAM-NER field each one corresponds to.
  nif_properties:
    - property: "nif:beginIndex"
      type: "xsd:nonNegativeInteger"
      maps_to: "span_start (character offset)"
      description: "Start offset within context string"

    - property: "nif:endIndex"
      type: "xsd:nonNegativeInteger"
      maps_to: "span_end (character offset)"
      description: "End offset within context string"

    - property: "nif:anchorOf"
      type: "xsd:string"
      maps_to: "surface_form"
      description: "Exact text of the annotation"

    - property: "nif:sourceUrl"
      type: "xsd:anyURI"
      maps_to: "source_document_uri"
      description: "Source document URL"

    - property: "nif:referenceContext"
      type: "nif:Context"
      maps_to: "document context"
      description: "Parent context containing this annotation"

  # Properties in the glam: namespace; all are typed xsd:string.
  glam_ner_extensions:
    description: |
      GLAM-NER extends NIF with structural provenance properties.

    properties:
      - property: "glam:structuralPath"
        type: "xsd:string"
        description: "XPath, JSONPath, or offset path to entity location"

      - property: "glam:docRegion"
        type: "xsd:string"
        description: "GLAM-NER document region code (DOC.*)"

      - property: "glam:structuralContext"
        type: "xsd:string"
        description: "Semantic role of containing structure"

      - property: "glam:governingHeader"
        type: "xsd:string"
        description: "Text of governing section header"

      - property: "glam:governingHeaderPath"
        type: "xsd:string"
        description: "Path to governing header element"

  # JSON-LD-shaped sample. The offsets are end-exclusive: 154 - 145 = 9, the
  # length of "Rembrandt", and the same pair appears in the @id fragment.
  example_nif_output:
    "@context":
      - "http://persistence.uni-leipzig.org/nlp2rdf/contexts/nif-2.0.json"
      - glam: "http://glam-ner.org/ns#"

    "@id": "https://example.org/doc#char=145,154"
    "@type": "nif:String"

    # Standard NIF properties
    "nif:anchorOf": "Rembrandt"
    "nif:beginIndex": 145
    "nif:endIndex": 154
    "nif:referenceContext": "https://example.org/doc#char=0,5000"

    # GLAM-NER extensions
    "glam:structuralPath": "//page:TextRegion[@id='r1']/page:TextLine[@id='l1']"
    "glam:docRegion": "DOC.PAR"
    "glam:structuralContext": "paragraph"
    "glam:governingHeader": "Dutch Golden Age Artists"
    "glam:governingHeaderPath": "//page:TextRegion[@id='header1']"
|
|
|
|
# =============================================================================
# NESTED PROVENANCE MODEL
# =============================================================================
|
|
|
|
# Two-layer provenance: a LayoutClaim describes a document region, and the
# EntityClaims nested inside it describe entity mentions within that region.
nested_provenance_model:
  description: |
    GLAM-NER uses a two-layer provenance model:

    1. LAYOUT CLAIM: Provenance for structural/path assertions
       - Where in the document is this region?
       - What type of region is it?
       - Who/what identified this region?

    2. ENTITY CLAIM: Provenance for entity annotations
       - What entity was recognized?
       - What type is it?
       - Who/what made this annotation?
       - What is the confidence?

    Layout claims contain entity claims, enabling:
    - Separate assessment of structural vs. semantic accuracy
    - Different annotators for layout vs. NER
    - Tracking provenance at appropriate granularity

  # ---------------------------------------------------------------------------
  # Layout Claim Schema
  # ---------------------------------------------------------------------------
  layout_claim_schema:
    description: |
      A LayoutClaim asserts that a document region exists at a specific path
      with a specific semantic type.

    properties:
      - property: "claim_id"
        type: "URI"
        required: true
        description: "Unique identifier for this claim"

      - property: "source_document"
        type: "URI"
        required: true
        description: "Document containing this region"

      - property: "structural_path"
        type: "string"
        required: true
        description: "Path to region (format-specific)"

      - property: "doc_region"
        type: "string"
        required: true
        description: "GLAM-NER region code (DOC.*)"

      - property: "region_text"
        type: "string"
        required: false
        description: "Full text content of region"

      - property: "char_offset_start"
        type: "integer"
        required: true
        description: "Character offset of region start"

      - property: "char_offset_end"
        type: "integer"
        required: true
        description: "Character offset of region end"

      # Who or what asserted the region, when, how, and with what confidence.
      - property: "provenance"
        type: "object"
        required: true
        schema:
          annotator:
            type: "string"
            description: "Human or system that made this claim"
          annotation_date:
            type: "datetime"
          method:
            type: "string"
            enum: ["manual", "automatic", "hybrid"]
          confidence:
            type: "float"
            range: [0.0, 1.0]
          tool_version:
            type: "string"
            description: "Version of annotation tool (if automatic)"

      # Explicitly optional, so that every property in this list carries a
      # "required" flag; a region may contain no entity mentions.
      - property: "entity_claims"
        type: "array"
        required: false
        items: "EntityClaim"
        description: "Entity annotations within this region"

    example:
      claim_id: "urn:glam:layout:doc123:r5"
      source_document: "https://example.org/manuscript_001.xml"
      structural_path: "//page:TextRegion[@id='r5']"
      doc_region: "DOC.PAR"
      region_text: "Rembrandt van Rijn was born in Leiden in 1606."
      char_offset_start: 1234
      # End-exclusive: 1234 + 46 characters of region_text = 1280.
      char_offset_end: 1280
      provenance:
        annotator: "Transkribus Layout Model v4.2"
        annotation_date: "2024-03-15T10:30:00Z"
        method: "automatic"
        confidence: 0.92
        tool_version: "4.2.1"
      # Items follow entity_claim_schema (see its example below). Written as an
      # empty list so the example parses to a list rather than to [null].
      entity_claims: []

  # ---------------------------------------------------------------------------
  # Entity Claim Schema
  # ---------------------------------------------------------------------------
  entity_claim_schema:
    description: |
      An EntityClaim asserts that an entity mention exists at a specific
      location within a layout region.

    properties:
      - property: "claim_id"
        type: "URI"
        required: true
        description: "Unique identifier for this claim"

      - property: "parent_layout_claim"
        type: "URI"
        required: true
        description: "Reference to containing LayoutClaim"

      - property: "surface_form"
        type: "string"
        required: true
        description: "Exact text of entity mention"

      # span_start / span_end are relative to the containing region. In the
      # example below they are end-exclusive (0-18 covers 18 characters).
      - property: "span_start"
        type: "integer"
        required: true
        description: "Character offset within region"

      - property: "span_end"
        type: "integer"
        required: true
        description: "Character offset end within region"

      - property: "entity_type"
        type: "string"
        required: true
        description: "GLAM-NER entity type code"
        examples: ["AGT.PER", "TOP.ADM", "TMP.DAT", "WRK.TXT"]

      - property: "normalized_value"
        type: "string"
        required: false
        description: "Normalized/canonical form of entity"

      - property: "linked_entity"
        type: "URI"
        required: false
        description: "Link to knowledge base entity"
        examples:
          - "http://www.wikidata.org/entity/Q5598"
          - "https://viaf.org/viaf/64013650"

      # Same shape as the layout-claim provenance, except that the tool is
      # recorded as model + model_version instead of tool_version.
      - property: "provenance"
        type: "object"
        required: true
        schema:
          annotator:
            type: "string"
          annotation_date:
            type: "datetime"
          method:
            type: "string"
            enum: ["manual", "automatic", "hybrid"]
          confidence:
            type: "float"
            range: [0.0, 1.0]
          model:
            type: "string"
            description: "NER model used (if automatic)"
          model_version:
            type: "string"

    example:
      claim_id: "urn:glam:entity:doc123:r5:e1"
      parent_layout_claim: "urn:glam:layout:doc123:r5"
      surface_form: "Rembrandt van Rijn"
      span_start: 0
      span_end: 18
      entity_type: "AGT.PER"
      normalized_value: "Rembrandt Harmenszoon van Rijn"
      linked_entity: "http://www.wikidata.org/entity/Q5598"
      provenance:
        annotator: "spaCy NER + GLAM fine-tuning"
        annotation_date: "2024-03-15T11:00:00Z"
        method: "automatic"
        confidence: 0.97
        model: "glam-ner-nl-lg"
        model_version: "1.2.0"
|
|
|
|
# =============================================================================
# END OF DOCUMENT STRUCTURE MODULE
# =============================================================================
|
|
|