11162 lines
419 KiB
YAML
11162 lines
419 KiB
YAML
# =============================================================================
|
||
# ⚠️ DEPRECATED - USE ch_annotator-v1_7_0.yaml INSTEAD ⚠️
|
||
# =============================================================================
|
||
# This file has been renamed to: ch_annotator-v1_7_0.yaml
|
||
# Deprecation date: 2025-12-06
|
||
# Reason: Naming standardization to "CH-Annotator" convention
|
||
#
|
||
# DO NOT USE THIS FILE. Use the new location:
|
||
# data/entity_annotation/ch_annotator-v1_7_0.yaml
|
||
# =============================================================================
|
||
|
||
# =============================================================================
|
||
# GLAM-NER: UNIFIED ENTITY ANNOTATION CONVENTION (DEPRECATED)
|
||
# =============================================================================
|
||
# Version: 1.7.0-unified
|
||
# Date: 2025-12-02
|
||
# Status: DEPRECATED - Replaced by ch_annotator-v1_7_0.yaml
|
||
#
|
||
# PURPOSE: Complete convention for Named Entity Recognition, Property Extraction,
|
||
# Entity Resolution, Entity Linking, and Claim Validation across ALL text sources.
|
||
# Domain-agnostic rules that apply universally.
|
||
#
|
||
# This file is COMPLETE and SELF-CONTAINED. No external files required.
|
||
#
|
||
# CHANGELOG (1.7.0): DIGITAL HUMANITIES ONTOLOGY ALIGNMENT + DOCUMENT STRUCTURE
|
||
#
|
||
# PART A: HYPERNYM RESTRUCTURING (Breaking Changes)
|
||
# - BREAKING: Renamed BEING hypernym to AGENT (code: AGT) - includes non-human agents
|
||
# following CIDOC-CRM E39_Actor (broader than E21_Person). Covers humans, animals,
|
||
# AI agents, fictional characters, robots, and mythological beings (TEI CHR concept).
|
||
# - BREAKING: Renamed ORGANISATION hypernym to GROUP (code: GRP) - all collectives
|
||
# following CIDOC-CRM E74_Group. Subcategories distinguish formal organizations
|
||
# (org:FormalOrganization) from informal collectives (foaf:Group).
|
||
# - BREAKING: Split PLACE into TOPONYM (nominal place references) and GEOMETRY
|
||
# (coordinate/shape data) following TEI/Pleiades/GeoSPARQL distinctions.
|
||
# - BREAKING: Restructured TEMPORAL_REFERENCE with TimeML/TIMEX3 type system:
|
||
# * TMP.DAB: Datable (absolute timestamps, fully resolved to calendar)
|
||
# * TMP.DRL: Deictic/Relative (require context for resolution)
|
||
# * TMP.DUR: Durations (time:Duration, time:TemporalDuration)
|
||
# * TMP.SET: Recurring/periodic times (time:Interval with frequency)
|
||
# * TMP.RNG: Ranges (explicit start-end pairs)
|
||
# - BREAKING: Replaced nerd:Product mapping for TEXTUAL_REFERENCE with FRBR-based
|
||
# model (frbr:Work, frbr:Expression, frbr:Manifestation, frbr:Item).
|
||
# - BREAKING: Added new hypernym ROLE (code: ROL) for positions, honorifics, titles,
|
||
# and occupational designations following TEI <roleName> and PiCO role concepts.
|
||
# - Deprecated NERD mappings where they introduce inappropriate biases - NERD is
|
||
# retained for cross-system interchange but is NOT authoritative for this convention.
|
||
# - Added Digital Humanities authority references: TEI P5 Chapter 14, TimeML/TIMEX3,
|
||
# CIDOC-CRM 7.1.3, PiCO v1.0, FRBR, Pleiades, GeoSPARQL.
|
||
#
|
||
# PART B: SECTION 15 - DOCUMENT STRUCTURE AND NAMESPACE PATHS (New)
|
||
# - NEW: Added DOCUMENT_REGION (DOC) hypernym with comprehensive layout semantic ontology
|
||
# for annotating document structure (headers, paragraphs, tables, sidebars, figures).
|
||
# - NEW: Primary content regions: DOC.HDR (6 heading levels), DOC.PAR, DOC.SEN, DOC.LST,
|
||
# DOC.LIT, DOC.TBL (with TBL.HDR, TBL.BDY, TBL.ROW, TBL.CEL sub-components).
|
||
# - NEW: Supplementary content regions: DOC.SDB (Sidebar) with hyponyms DOC.SDB.MRG
|
||
# (Marginalia with margin positions), DOC.SDB.IBX (Infobox), DOC.SDB.CLT (Callout).
|
||
# - NEW: Additional regions: DOC.CAP, DOC.FTN, DOC.FIG, DOC.NAV, DOC.PGN, DOC.BLK,
|
||
# DOC.MTD (Metadata Block), DOC.ANN (Annotation Region).
|
||
# - NEW: Media collection region: DOC.GAL (Gallery) for image carousels, plate sections,
|
||
# portfolio showcases - with gallery_types and item nesting.
|
||
# - NEW: Cartographic region: DOC.MAP (Map) for geographic visualizations with map_types
|
||
# (reference, thematic, historical, site_plan, route, cadastral, interactive) and
|
||
# map_components (MAP.BAS base layer, MAP.OVL overlay, MAP.MRK markers, MAP.SHP shapes,
|
||
# MAP.LEG legend, MAP.CTL controls).
|
||
# - NEW: Audiovisual regions: DOC.AUD (Audio) with audio_types and audio_segments
|
||
# (AUD.SPK speaker, AUD.MUS music, AUD.SIL silence, AUD.SFX effects); DOC.VID (Video)
|
||
# with video_types and video_segments (VID.SCN scene, VID.SHT shot, VID.TTL title,
|
||
# VID.CRD credits).
|
||
# - NEW: Embedded interactive region: DOC.EMB for iframes, widgets, IIIF viewers,
|
||
# 3D viewers, timelines, social embeds, data visualizations, virtual tours.
|
||
# - NEW: Navigation/reference regions promoted to full hypernyms: DOC.TOC (Table of
|
||
# Contents with toc_types: main, LOF, LOT, abbreviations, maps, plates), DOC.IDX
|
||
# (Index with index_types: name, place, subject, title, chronological - pre-annotated
|
||
# entity sources).
|
||
# - NEW: Front matter regions: DOC.TTP (Title Page with components: TTL, STL, AUT, PUB,
|
||
# DAT, PLC, EDT, IMP), DOC.DED (Dedication/Epigraph), DOC.COL (Colophon with components:
|
||
# PRN printer, DAT date, PLC place, TYP typography, PAP paper, CPY copyright, EDN edition).
|
||
# - NEW: Back matter regions: DOC.BIB (Bibliography with entry components: AUT, TTL, DAT,
|
||
# PUB, PLC), DOC.APP (Appendix), DOC.GLO (Glossary with components: TRM, DEF, SYN, REL).
|
||
# - NEW: Commercial/branding regions: DOC.ADV (Advertisement with types: display, classified,
|
||
# trade_listing, prospectus, patent_medicine, auction_notice, legal_notice), DOC.LOG
|
||
# (Logo with types: publisher_logo, printer_device, masthead, watermark, seal, coat_of_arms,
|
||
# colophon_mark, ex_libris).
|
||
# - NEW: Semantic role enumeration expanded: PRIM, SUPP, NAV, STRC, REF, VIS, AV, INT, META,
|
||
# SPAT, FRNT (front matter), BACK (back matter), PARA (paratextual), COMM (commercial),
|
||
# LEX (lexical/definitions).
|
||
# - NEW: Two-layer nested provenance model - layout claims vs entity claims with separate
|
||
# confidence/provenance chains. Entity claims reference parent layout claims.
|
||
# - NEW: Format-agnostic path conventions for PAGE-XML, HTML, JSON, Markdown, EPUB, PDF, TEI.
|
||
# - NEW: Clustering strategies for grouping entities by document region.
|
||
# - Added namespaces: premis:, bibo:, csvw:, html:, frbr:, frbre:, geosparql:, sf:,
|
||
# timeml:, tei:, as: (ActivityStreams), skos: for Section 15 ontology mappings.
|
||
# - Added comprehensive LinkML mapping variants (class_uri, exact_mappings,
|
||
# close_mappings, related_mappings, narrow_mappings, broad_mappings).
|
||
#
|
||
# CHANGELOG (1.6.3):
|
||
# - MAJOR: Expanded scope from NER-only to full extraction pipeline
|
||
# - Added Property Extraction Rules section (relationship detection)
|
||
# - Added Claim Validation Schema (LinkML-based provenance)
|
||
# - Added Entity Resolution and Linking Pipeline section
|
||
# - Added LinkML namespace and schema references (https://linkml.io/)
|
||
# - Added temporal property extraction patterns (founding dates, etc.)
|
||
# - All claims now require 5-component provenance:
|
||
# 1. namespace (ontology prefix)
|
||
# 2. path (xpath/jsonpath)
|
||
# 3. timestamp (ISO 8601)
|
||
# 4. agent (extraction model)
|
||
# 5. context_convention (this file version)
|
||
# - Aligned with WebClaim.yaml LinkML schema for claim instances
|
||
#
|
||
# CHANGELOG (1.6.2):
|
||
# - Added NIF (NLP Interchange Format) vocabulary for annotation interchange
|
||
# - Added NERD ontology core class mappings for cross-system interoperability
|
||
# - Added W3C Web Annotation Data Model for text span provenance
|
||
# - Added itsrdf namespace for entity linking (ITS 2.0)
|
||
# - New section: NIF/NERD/OA Integration Patterns
|
||
# - Enhanced output format with NIF-compliant properties
|
||
#
|
||
# CHANGELOG (1.6.1):
|
||
# - Added W3C Org Ontology mappings (org:FormalOrganization, org:OrganizationalUnit)
|
||
# - Added RegOrg vocabulary mappings (rov:RegisteredOrganization, rov:legalName)
|
||
# - Added new ORG.UNT subcategory for organizational units
|
||
# - Enhanced BEING.STF with org:Membership, org:Role, org:Post patterns
|
||
# - Added foaf:OnlineAccount for TXT.SOC social media handles
|
||
# - Added Europeana Data Model (EDM) mappings for THING subcategories
|
||
# - Expanded relationships section with 14 new org:, rov:, foaf: predicates
|
||
# =============================================================================
|
||
|
||
convention:
|
||
name: "GLAM-NER Unified Entity Annotation Convention"
|
||
version: "1.7.0-unified"
|
||
date: "2025-12-02"
|
||
status: "deprecated"
|
||
|
||
description: |
|
||
A unified, domain-agnostic convention for the complete entity extraction
|
||
pipeline: Named Entity Recognition (NER), Property Extraction, Entity
|
||
Resolution, Entity Linking, and Claim Validation.
|
||
|
||
This convention applies universally to any text source: historical
|
||
manuscripts, modern websites, archival documents, or digital platforms.
|
||
|
||
The convention identifies 9 hypernym entity types (expanded from 8 in v1.6.x)
|
||
with ontology mappings to CIDOC-CRM, RiC-O, Schema.org, PiCO, W3C Org,
|
||
RegOrg, FOAF, vCard, EDM, OWL-Time, GeoSPARQL, FRBR, TEI, and NIF 2.0.
|
||
|
||
CRITICAL DESIGN PRINCIPLES (v1.7.0):
|
||
1. AGENT vs BEING: "Being" is too narrow - AGENT covers humans, animals, AI,
|
||
fictional characters, robots. Uses CIDOC-CRM E39_Actor as primary class.
|
||
2. TOPONYM vs GEOMETRY: Place names (nominal references) are distinct from
|
||
coordinate data (geometric representations). Never conflate these.
|
||
3. GROUP vs ORGANISATION: "Organisation" implies formal structure - GROUP
|
||
covers all collectives from informal bands to registered corporations.
|
||
4. TEMPORAL distinctions: Absolute timestamps, relative expressions, durations,
|
||
and recurring periods have fundamentally different semantics (TimeML/TIMEX3).
|
||
5. ROLE is distinct from AGENT: "Director" is a role; "Dr. Jan de Wit" is an
|
||
agent. Roles can be filled by different agents over time.
|
||
6. TEXTUAL REFERENCE is not PRODUCT: Documents have Work/Expression/Manifestation
|
||
distinctions (FRBR) that product-centric models miss.
|
||
|
||
All extracted claims MUST include verifiable provenance following the LinkML
|
||
schema at https://linkml.io/ and this project's WebClaim.yaml schema.
|
||
|
||
digital_humanities_authorities:
|
||
- name: "TEI P5 Guidelines, Chapter 14: Names, Dates, People, Places"
|
||
url: "https://tei-c.org/release/doc/tei-p5-doc/en/html/ND.html"
|
||
note: "Authoritative for persName, placeName, roleName, orgName distinctions"
|
||
- name: "TimeML/TIMEX3 Specification"
|
||
url: "https://timeml.github.io/site/timebank/"
|
||
note: "Authoritative for temporal expression annotation types"
|
||
- name: "CIDOC-CRM 7.1.3"
|
||
url: "https://cidoc-crm.org/Version/version-7.1.3"
|
||
note: "Authoritative for heritage domain entity classes"
|
||
- name: "PiCO (Persons in Context) Ontology"
|
||
url: "https://personsincontext.org/"
|
||
note: "Authoritative for person observation/reconstruction pattern"
|
||
- name: "FRBR (Functional Requirements for Bibliographic Records)"
|
||
url: "https://www.ifla.org/publications/functional-requirements-for-bibliographic-records"
|
||
note: "Authoritative for Work/Expression/Manifestation/Item distinctions"
|
||
- name: "Pleiades Gazetteer"
|
||
url: "https://pleiades.stoa.org/"
|
||
note: "Model for place/name/location separation in historical geography"
|
||
- name: "GeoSPARQL"
|
||
url: "https://www.ogc.org/standard/geosparql/"
|
||
note: "Authoritative for spatial geometry representation"
|
||
|
||
nerd_deprecation_note: |
|
||
NERD (Named Entity Recognition and Disambiguation) ontology mappings are
|
||
RETAINED for cross-system NLP tool interchange but are NOT AUTHORITATIVE
|
||
for this convention. NERD has biases toward modern web/journalism contexts:
|
||
- nerd:Person excludes non-human agents (too narrow)
|
||
- nerd:Location conflates toponyms with geometry (imprecise)
|
||
- nerd:Organization excludes informal collectives (too narrow)
|
||
- nerd:Time conflates absolute/relative/duration (imprecise)
|
||
- nerd:Product misrepresents textual references (wrong semantics)
|
||
- NERD lacks role/title/honorific class (missing category)
|
||
|
||
Use NERD classes only for NIF export to downstream NLP tools that expect
|
||
NERD vocabulary. Internal processing uses CIDOC-CRM, TEI, and TimeML classes.
|
||
|
||
scope:
|
||
- entity_recognition: "Detect and classify named entities in text"
|
||
- property_extraction: "Extract relationships and attributes between entities"
|
||
- entity_resolution: "Disambiguate and merge entity mentions"
|
||
- entity_linking: "Link entities to knowledge bases (Wikidata, etc.)"
|
||
- claim_validation: "Verify and track provenance for all claims"
|
||
|
||
linkml_reference:
|
||
specification: "https://linkml.io/linkml/"
|
||
version: "1.8.x"
|
||
purpose: |
|
||
LinkML (Linked Data Modeling Language) is used for:
|
||
- Formal schema definitions (classes, slots, enums)
|
||
- Instance validation
|
||
- Multi-format serialization (YAML, JSON, RDF, etc.)
|
||
- Type-safe claim structures
|
||
|
||
# =============================================================================
|
||
# SECTION 1: ONTOLOGY NAMESPACES
|
||
# =============================================================================
|
||
# All namespace prefixes used in ontology mappings throughout this convention.
|
||
|
||
ontology:
|
||
namespaces:
|
||
# Core Heritage/Cultural Ontologies
|
||
rico: "https://www.ica.org/standards/RiC/ontology#"
|
||
crm: "http://www.cidoc-crm.org/cidoc-crm/"
|
||
edm: "http://www.europeana.eu/schemas/edm/"
|
||
|
||
# Web/General Ontologies
|
||
schema: "http://schema.org/"
|
||
foaf: "http://xmlns.com/foaf/0.1/"
|
||
|
||
# Organization Ontologies
|
||
org: "http://www.w3.org/ns/org#"
|
||
rov: "http://www.w3.org/ns/regorg#"
|
||
|
||
# Person/Identity Ontologies
|
||
picom: "https://personsincontext.org/model#"
|
||
pnv: "https://w3id.org/pnv#"
|
||
|
||
# Contact Information
|
||
vcard: "http://www.w3.org/2006/vcard/ns#"
|
||
|
||
# Provenance
|
||
prov: "http://www.w3.org/ns/prov#"
|
||
|
||
# Vocabularies & Classification
|
||
skos: "http://www.w3.org/2004/02/skos/core#"
|
||
|
||
# Core RDF/OWL
|
||
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||
rdfs: "http://www.w3.org/2000/01/rdf-schema#"
|
||
xsd: "http://www.w3.org/2001/XMLSchema#"
|
||
|
||
# Dublin Core
|
||
dc: "http://purl.org/dc/elements/1.1/"
|
||
dct: "http://purl.org/dc/terms/"
|
||
|
||
# Temporal & Spatial
|
||
time: "http://www.w3.org/2006/time#"
|
||
geo: "http://www.w3.org/2003/01/geo/wgs84_pos#"
|
||
|
||
# Measurement
|
||
ivoa: "http://www.ivoa.net/rdf/UCD1+#"
|
||
|
||
# Project-specific
|
||
glam: "https://w3id.org/glam/"
|
||
|
||
# NLP Interchange Format (NIF 2.0)
|
||
nif: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"
|
||
|
||
# Named Entity Recognition and Disambiguation (NERD)
|
||
nerd: "http://nerd.eurecom.fr/ontology#"
|
||
|
||
# W3C Web Annotation Data Model
|
||
oa: "http://www.w3.org/ns/oa#"
|
||
|
||
# Internationalization Tag Set (ITS 2.0) - Entity Linking
|
||
itsrdf: "http://www.w3.org/2005/11/its/rdf#"
|
||
|
||
# LinkML Schema Language
|
||
linkml: "https://w3id.org/linkml/"
|
||
|
||
# Heritage Custodian Ontology (project-specific LinkML schemas)
|
||
hc: "https://nde.nl/ontology/hc/"
|
||
|
||
# Provenance Authoring and Versioning
|
||
pav: "http://purl.org/pav/"
|
||
|
||
# Document Structure & Layout (Section 15)
|
||
premis: "http://www.loc.gov/premis/rdf/v3/"
|
||
bibo: "http://purl.org/ontology/bibo/"
|
||
csvw: "http://www.w3.org/ns/csvw#"
|
||
html: "http://www.w3.org/1999/xhtml/"
|
||
|
||
# Bibliographic/FRBR (for TEXTUAL_REFERENCE)
|
||
frbr: "http://purl.org/vocab/frbr/core#"
|
||
frbre: "http://purl.org/vocab/frbr/extended#"
|
||
|
||
# Spatial/Geometry (for GEOMETRY hypernym)
|
||
geosparql: "http://www.opengis.net/ont/geosparql#"
|
||
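sf: "http://www.opengis.net/ont/sf#"
|
||
|
||
# Gazetteers (for TOPONYM mappings: gn:Feature, gn:P.PPL, gn:A.ADM1, pleiades:Place)
|
||
gn: "http://www.geonames.org/ontology#"
|
||
pleiades: "https://pleiades.stoa.org/places/vocab#"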
|
||
|
||
# Temporal (TimeML/TIMEX3 extensions)
|
||
timeml: "http://timeml.org/timeml/"
|
||
|
||
# TEI (for CHR character model, roleName, etc.)
|
||
tei: "http://www.tei-c.org/ns/1.0/"
|
||
|
||
# ActivityStreams (for collections, media activities)
|
||
as: "https://www.w3.org/ns/activitystreams#"
|
||
|
||
# =============================================================================
|
||
# SECTION 2: ENTITY TYPE DEFINITIONS
|
||
# =============================================================================
|
||
# Complete definitions for all hypernym entity types with subcategories,
|
||
# inclusion/exclusion rules, and ontology mappings.
|
||
|
||
entity_types:
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# (Formerly BEING - persons, animals, mythological figures; renamed AGENT in v1.7.0)
|
||
# ---------------------------------------------------------------------------
|
||
# ---------------------------------------------------------------------------
|
||
# AGENT - Entities capable of intentional action (CIDOC-CRM E39_Actor)
|
||
# ---------------------------------------------------------------------------
|
||
# BREAKING CHANGE v1.7.0: Renamed from BEING (BEI) to AGENT (AGT)
|
||
# Rationale: "Being" implies human-centric ontology. CIDOC-CRM E39_Actor is
|
||
# the proper hypernym for ALL entities capable of intentional action:
|
||
# humans, animals, AI systems, fictional characters, collectives, and robots.
|
||
# ---------------------------------------------------------------------------
|
||
AGENT:
|
||
code: "AGT"
|
||
definition: |
|
||
Entities capable of intentional action. Includes humans (historical and
|
||
contemporary), animals, AI systems, fictional characters, mythological
|
||
figures, and collectives. The defining characteristic is AGENCY - the
|
||
capacity to act, make decisions, and bear responsibility.
|
||
|
||
This is the broadest actor class, encompassing all entities that can:
|
||
- Perform intentional actions (create, destroy, transfer, modify)
|
||
- Hold beliefs, desires, or goals
|
||
- Bear moral or legal responsibility
|
||
- Be attributed authorship or causation
|
||
|
||
design_rationale: |
|
||
CIDOC-CRM E39_Actor ("a persistent item that has the potential to perform
|
||
intentional actions") is the correct hypernym. The former "BEING" label
|
||
was anthropocentric, excluding valid actors like:
|
||
- AI systems creating art or making curatorial decisions
|
||
- Named animals with documented agency (working animals, famous pets)
|
||
- Fictional characters who are subjects of scholarly study
|
||
- Robots performing heritage conservation tasks
|
||
|
||
TEI P5 models fictional entities via <person> / <listPerson type="fictional">; <roleName> marks roles, not characters.
|
||
FOAF provides Agent as a superclass of Person and Group.
|
||
|
||
ontology_mappings:
|
||
primary_class: "crm:E39_Actor"
|
||
primary_class_definition: |
|
||
CIDOC-CRM E39 Actor: "This class comprises people, either individually or
|
||
in groups, who have the potential to perform intentional actions of kinds
|
||
for which someone may be held responsible." A gathering of E21 Persons
|
||
becomes an instance of E74 Group when it exhibits collective agency, that is, when it can perform actions as a unit.
|
||
alternative_classes:
|
||
- "foaf:Agent"
|
||
- "schema:Thing" # schema:Person and schema:Organization are subclasses
|
||
- "prov:Agent"
|
||
linkml_mapping:
|
||
class_uri: "crm:E39_Actor"
|
||
exact_mappings:
|
||
- "foaf:Agent"
|
||
- "prov:Agent"
|
||
close_mappings:
|
||
- "schema:Person" # More specific, human-only
|
||
related_mappings:
|
||
- "dct:Agent"
|
||
nerd_class: "nerd:Person"
|
||
nerd_deprecation_note: |
|
||
DEPRECATED: NERD's Person class is too narrow. NERD was designed for
|
||
news/journalism NER where non-human agents are rare. For Digital
|
||
Humanities, CIDOC-CRM E39_Actor is authoritative.
|
||
Retain NERD mapping ONLY for NLP pipeline interchange, NOT as semantic authority.
|
||
pico_class: "picom:PersonObservation"
|
||
pico_note: |
|
||
In PiCO, textual mentions create PersonObservation instances linked
|
||
to reconstructed Person entities via picom:isObservationOf. This
|
||
observation/reconstruction pattern applies to ALL agent subcategories.
|
||
|
||
subcategories:
|
||
# ----- HUMAN AGENTS -----
|
||
PERSON:
|
||
code: "AGT.PER"
|
||
definition: "Individual human beings, historical or contemporary"
|
||
examples:
|
||
- "Rembrandt van Rijn"
|
||
- "Queen Beatrix"
|
||
- "Jan de Bakker"
|
||
- "Marie Curie"
|
||
ontology_class: "crm:E21_Person"
|
||
linkml_mapping:
|
||
class_uri: "crm:E21_Person"
|
||
exact_mappings:
|
||
- "foaf:Person"
|
||
- "schema:Person"
|
||
- "rico:Person"
|
||
|
||
STAFF:
|
||
code: "AGT.STF"
|
||
definition: "Personnel of heritage institutions in professional roles"
|
||
examples:
|
||
- "Dr. Maria van den Berg, Director"
|
||
- "Jan Pietersen, Curator of Prints"
|
||
- "Chief Archivist Emma de Vries"
|
||
ontology_class: "picom:PersonObservation"
|
||
org_ontology_mapping:
|
||
membership: "org:Membership"
|
||
role: "org:Role"
|
||
post: "org:Post"
|
||
note: |
|
||
Links to institution via org:memberOf or org:holds (for Posts).
|
||
The org:Membership class represents the n-ary relationship between
|
||
an Agent, an Organization, and a Role. Use org:Post when the position
|
||
exists independently of the person filling it.
|
||
|
||
For the ROLE itself (e.g., "Director", "Curator"), see ROLE hypernym.
|
||
|
||
# ----- COLLECTIVE AGENTS -----
|
||
COLLECTIVE:
|
||
code: "AGT.COL"
|
||
definition: |
|
||
Named collectives of agents acting as a unit but WITHOUT formal
|
||
organizational structure. For formal organizations, use GROUP hypernym.
|
||
examples:
|
||
- "The Dutch Masters"
|
||
- "The Impressionists"
|
||
- "The Founding Fathers"
|
||
- "Anonymous (hacker collective)"
|
||
ontology_class: "crm:E74_Group"
|
||
alternative_classes:
|
||
- "foaf:Group"
|
||
note: |
|
||
Collectives exhibit collective agency but lack:
|
||
- Legal personality
|
||
- Formal membership rules
|
||
- Organizational hierarchy
|
||
|
||
For formal organizations (museums, companies), use GROUP hypernym.
|
||
For informal project collaborations, use org:OrganizationalCollaboration.
|
||
|
||
# ----- FICTIONAL/MYTHOLOGICAL AGENTS -----
|
||
FICTIONAL:
|
||
code: "AGT.FIC"
|
||
definition: |
|
||
Characters from fiction, mythology, legend, or religious traditions
|
||
who are subjects of scholarly study or cultural analysis.
|
||
examples:
|
||
- "Sherlock Holmes"
|
||
- "Harry Potter"
|
||
- "Hamlet"
|
||
- "Don Quixote"
|
||
ontology_class: "crm:E21_Person"
|
||
alternative_classes:
|
||
- "tei:character" # TEI P5 character element
|
||
linkml_mapping:
|
||
class_uri: "crm:E21_Person"
|
||
exact_mappings: []
|
||
close_mappings:
|
||
- "schema:Person"
|
||
note: "Use crm:P2_has_type with value 'fictional' to distinguish"
|
||
tei_note: |
|
||
TEI P5 uses <person> with @role="fictional" or nests within
|
||
<listPerson type="fictional">. The <character> element (from
|
||
TEI Drama module) is more specific for dramatic personae.
|
||
|
||
MYTHOLOGICAL:
|
||
code: "AGT.MYT"
|
||
definition: |
|
||
Gods, deities, legendary figures, and supernatural beings from
|
||
religious or mythological traditions.
|
||
examples:
|
||
- "Apollo"
|
||
- "Thor"
|
||
- "Vishnu"
|
||
- "Anansi"
|
||
- "King Arthur"
|
||
- "Siegfried"
|
||
ontology_class: "crm:E21_Person"
|
||
note: |
|
||
Use crm:P2_has_type to indicate mythological/divine status.
|
||
Mythological figures may have historical cult/worship data
|
||
(temples, festivals) even though the entity is non-physical.
|
||
|
||
# ----- NON-HUMAN AGENTS -----
|
||
ANIMAL:
|
||
code: "AGT.ANI"
|
||
definition: |
|
||
Named individual animals with documented agency or cultural
|
||
significance. NOT species names (use THING.TAX for taxonomy).
|
||
examples:
|
||
- "Dolly the sheep (first cloned mammal)"
|
||
- "Jumbo the elephant"
|
||
- "Hachiko (famous loyal dog)"
|
||
- "Wojtek the soldier bear"
|
||
- "Paul the Octopus (World Cup predictor)"
|
||
ontology_class: "crm:E39_Actor"
|
||
note: |
|
||
Animals qualify as agents when they:
|
||
- Have individual names (not just species)
|
||
- Performed documented actions
|
||
- Have cultural/historical significance
|
||
|
||
Generic animal mentions ("a cat", "the horses") are NOT agents.
|
||
|
||
ARTIFICIAL:
|
||
code: "AGT.ART"
|
||
definition: |
|
||
Artificial agents: AI systems, robots, software agents, and
|
||
automated systems capable of autonomous decision-making or action.
|
||
examples:
|
||
- "DALL-E (AI image generator)"
|
||
- "AlphaGo (game-playing AI)"
|
||
- "Sophia (humanoid robot)"
|
||
- "Watson (IBM's AI system)"
|
||
- "GPT-4 (language model)"
|
||
ontology_class: "crm:E39_Actor"
|
||
alternative_classes:
|
||
- "prov:SoftwareAgent"
|
||
linkml_mapping:
|
||
class_uri: "crm:E39_Actor"
|
||
exact_mappings:
|
||
- "prov:SoftwareAgent"
|
||
note: |
|
||
PROV-O defines prov:SoftwareAgent as "A software agent is running
|
||
software." Use when the AI/robot is the proximate cause of an action,
|
||
distinct from the human programmers or operators.
|
||
note: |
|
||
Artificial agents are increasingly relevant for heritage:
|
||
- AI systems making curatorial decisions
|
||
- Robots performing conservation tasks
|
||
- Automated digitization systems
|
||
- AI-generated art and authorship questions
|
||
|
||
Attribution of agency to AI is context-dependent and evolving.
|
||
|
||
inclusion_rules:
|
||
- id: "AGT_INC001"
|
||
rule: "Tag agent names even when only partial name appears"
|
||
examples:
|
||
- "Rembrandt (given name only)"
|
||
- "Van Gogh (surname only)"
|
||
- "GPT (abbreviated AI name)"
|
||
|
||
- id: "AGT_INC002"
|
||
rule: "Tag agents identified by title + name"
|
||
examples:
|
||
- "Professor Einstein"
|
||
- "Dr. Curie"
|
||
- "King Willem-Alexander"
|
||
|
||
- id: "AGT_INC003"
|
||
rule: "Tag staff members with their institutional context"
|
||
examples:
|
||
- "Director Jan de Wit"
|
||
- "Curator of Medieval Art at the Rijksmuseum"
|
||
|
||
- id: "AGT_INC004"
|
||
rule: "Tag named collectives acting as unified agents"
|
||
examples:
|
||
- "The Impressionists"
|
||
- "Anonymous"
|
||
- "The Beatles"
|
||
|
||
- id: "AGT_INC005"
|
||
rule: "Tag fictional/mythological characters when subjects of analysis"
|
||
examples:
|
||
- "Hamlet's soliloquy"
|
||
- "representations of Apollo"
|
||
- "Harry Potter merchandise"
|
||
|
||
- id: "AGT_INC006"
|
||
rule: "Tag AI/robot agents when attributed with actions"
|
||
examples:
|
||
- "art created by DALL-E"
|
||
- "AlphaGo defeated Lee Sedol"
|
||
|
||
exclusion_rules:
|
||
- id: "AGT_EXC001"
|
||
rule: "Do NOT tag generic role descriptions without names"
|
||
examples:
|
||
- "the curator (generic)"
|
||
- "a visitor (generic)"
|
||
- "staff members (generic plural)"
|
||
note: "For role terms themselves, see ROLE hypernym"
|
||
|
||
- id: "AGT_EXC002"
|
||
rule: "Do NOT tag pronouns"
|
||
examples:
|
||
- "he"
|
||
- "she"
|
||
- "they"
|
||
- "it"
|
||
|
||
- id: "AGT_EXC003"
|
||
rule: "Do NOT tag species names (use THING.TAX instead)"
|
||
examples:
|
||
- "elephants (species, not individual)"
|
||
- "Homo sapiens"
|
||
- "cats (generic)"
|
||
|
||
- id: "AGT_EXC004"
|
||
rule: "Do NOT tag tools or software without agency attribution"
|
||
examples:
|
||
- "Photoshop (tool, not agent)"
|
||
- "the database (system, not agent)"
|
||
note: "Only tag AI when it is the attributed actor, not just a tool used"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# TOPONYM - Named place references (nominal, not geometric)
|
||
# ---------------------------------------------------------------------------
|
||
# BREAKING CHANGE v1.7.0: PLACE (PLC) split into TOPONYM (TOP) and GEOMETRY
|
||
# Rationale: Distinguish between:
|
||
# - TOPONYM: Nominal place names ("Amsterdam", "the Alps") - textual references
|
||
# - GEOMETRY: Coordinates, polygons, spatial extents - see GEOMETRY hypernym
|
||
#
|
||
# This follows the Pleiades model: Place = conceptual entity, Location = geometry.
|
||
# TEI P5: <placeName> vs. <geo> elements.
|
||
# GeoSPARQL: geosparql:Feature vs. geosparql:Geometry.
|
||
# ---------------------------------------------------------------------------
|
||
TOPONYM:
|
||
code: "TOP"
|
||
definition: |
|
||
Named references to places in text. Toponyms are NOMINAL - they are
|
||
linguistic labels for places, not the places themselves. The same place
|
||
may have multiple toponyms (historical names, variant spellings, exonyms).
|
||
|
||
Key distinction:
|
||
- TOPONYM: "Amsterdam", "Constantinople", "the Netherlands" (names in text)
|
||
- GEOMETRY: "52.3676° N, 4.9041° E" (coordinates, see GEOMETRY hypernym)
|
||
|
||
A toponym can reference:
|
||
- A persistent place (conceptual entity with temporal extent)
|
||
- A location (specific geometry at a point in time)
|
||
- An uncertain or legendary place (may lack precise geometry)
|
||
|
||
design_rationale: |
|
||
The Pleiades gazetteer model distinguishes:
|
||
- Place: A conceptual geographic entity with persistent identity
|
||
- Name: A toponym (linguistic label) used for that place
|
||
- Location: A geometry (coordinates) for that place at a specific time
|
||
|
||
TEI P5 follows similar logic:
|
||
- <placeName>: Nominal reference to a place
|
||
- <geo>: Geographic coordinates (separate element)
|
||
|
||
This separation is essential for:
|
||
- Historical places with unknown geometry ("Atlantis", "El Dorado")
|
||
- Places with changing geometry (coastlines, borders)
|
||
- Places with multiple names over time (Constantinople→Istanbul)
|
||
- Linking textual mentions to gazetteers (GeoNames, Pleiades, Wikidata)
|
||
|
||
ontology_mappings:
|
||
primary_class: "crm:E53_Place"
|
||
primary_class_definition: |
|
||
CIDOC-CRM E53 Place: "This class comprises extents in space, in
|
||
particular on the surface of the earth, in the pure sense of physics:
|
||
independent from temporal phenomena and matter."
|
||
alternative_classes:
|
||
- "schema:Place"
|
||
- "rico:Place"
|
||
- "edm:Place"
|
||
- "pleiades:Place"
|
||
linkml_mapping:
|
||
class_uri: "crm:E53_Place"
|
||
exact_mappings:
|
||
- "edm:Place"
|
||
- "rico:Place"
|
||
close_mappings:
|
||
- "schema:Place"
|
||
related_mappings:
|
||
- "gn:Feature" # GeoNames
|
||
nerd_class: "nerd:Location"
|
||
nerd_deprecation_note: |
|
||
DEPRECATED: NERD's Location class conflates toponyms (names) with
|
||
geometry (coordinates). For Digital Humanities, use crm:E53_Place for
|
||
conceptual places and geosparql:Geometry for spatial data.
|
||
Retain NERD mapping ONLY for NLP pipeline interchange.
|
||
tei_mapping:
|
||
element: "placeName"
|
||
attributes:
|
||
ref: "URI reference to gazetteer entry"
|
||
type: "settlement|region|country|address|building|natural"
|
||
cert: "high|medium|low (certainty of identification)"
|
||
note: |
|
||
EDM Place is equivalent to CIDOC-CRM E53_Place and is used in
|
||
Europeana cultural heritage contexts. Pleiades provides the
|
||
Place/Name/Location model for historical geography.
|
||
|
||
subcategories:
|
||
# ----- ADMINISTRATIVE PLACES -----
|
||
SETTLEMENT:
|
||
code: "TOP.SET"
|
||
definition: "Cities, towns, villages, and other populated places"
|
||
examples:
|
||
- "Amsterdam"
|
||
- "New York City"
|
||
- "the village of Giethoorn"
|
||
- "古都京都 (Kyoto)"
|
||
ontology_class: "schema:City"
|
||
alternative_classes:
|
||
- "gn:P.PPL" # GeoNames populated place
|
||
linkml_mapping:
|
||
class_uri: "crm:E53_Place"
|
||
close_mappings:
|
||
- "schema:City"
|
||
- "gn:P.PPL"
|
||
|
||
REGION:
|
||
code: "TOP.REG"
|
||
definition: "Provinces, states, counties, and administrative regions"
|
||
examples:
|
||
- "North Holland"
|
||
- "Bavaria"
|
||
- "California"
|
||
- "Île-de-France"
|
||
ontology_class: "schema:AdministrativeArea"
|
||
alternative_classes:
|
||
- "gn:A.ADM1" # GeoNames first-order admin division
|
||
|
||
COUNTRY:
|
||
code: "TOP.CTY"
|
||
definition: "Nations and sovereign states (modern and historical)"
|
||
examples:
|
||
- "The Netherlands"
|
||
- "France"
|
||
- "Japan"
|
||
- "the Dutch Republic (historical)"
|
||
ontology_class: "schema:Country"
|
||
note: |
|
||
For historical polities (kingdoms, empires, republics), use
|
||
crm:P2_has_type to indicate temporal status. Link to Wikidata
|
||
for historical state succession chains.
|
||
|
||
# ----- PHYSICAL STRUCTURE PLACES -----
|
||
ADDRESS:
|
||
code: "TOP.ADR"
|
||
definition: "Street addresses and postal locations"
|
||
examples:
|
||
- "Museumstraat 1, Amsterdam"
|
||
- "1600 Pennsylvania Avenue"
|
||
- "Postbus 74888, 1070 DN Amsterdam"
|
||
ontology_class: "schema:PostalAddress"
|
||
vcard_mapping:
|
||
class: "vcard:Address"
|
||
properties:
|
||
street: "vcard:street-address"
|
||
locality: "vcard:locality"
|
||
region: "vcard:region"
|
||
postal_code: "vcard:postal-code"
|
||
country: "vcard:country-name"
|
||
note: |
|
||
Addresses are composite toponyms containing multiple components.
|
||
Parse into structured vCard properties when possible.
|
||
|
||
INSTADDR:
|
||
code: "TOP.IAD"
|
||
definition: "Full institutional addresses including building names"
|
||
examples:
|
||
- "Rijksmuseum, Museumstraat 1, 1071 XX Amsterdam"
|
||
- "British Museum, Great Russell St, London WC1B 3DG"
|
||
ontology_class: "schema:PostalAddress"
|
||
org_mapping: "org:Site"
|
||
note: |
|
||
Links to GROUP hypernym via schema:address or org:hasSite.
|
||
org:Site represents an office or premise at which the organization
|
||
is located - use for physical institutional locations.
|
||
|
||
BUILDING:
|
||
code: "TOP.BLD"
|
||
definition: "Named buildings, monuments, and architectural structures"
|
||
examples:
|
||
- "the Rijksmuseum building"
|
||
- "Anne Frank House"
|
||
- "Palace of Versailles"
|
||
- "the Parthenon"
|
||
ontology_class: "crm:E18_Physical_Thing"
|
||
alternative_classes:
|
||
- "edm:PhysicalThing"
|
||
- "schema:LandmarksOrHistoricalBuildings"
|
||
note: |
|
||
Buildings are physical things (E18) that occupy places (E53).
|
||
The building-as-place uses crm:P53_has_former_or_current_location.
|
||
|
||
# ----- NATURAL FEATURE PLACES -----
|
||
NATURAL:
|
||
code: "TOP.NAT"
|
||
definition: "Natural geographic features: mountains, rivers, lakes, etc."
|
||
examples:
|
||
- "the Alps"
|
||
- "Amazon River"
|
||
- "Mount Fuji"
|
||
- "Lake Baikal"
|
||
- "the Sahara Desert"
|
||
ontology_class: "crm:E53_Place"
|
||
alternative_classes:
|
||
- "gn:T" # GeoNames terrain features
|
||
- "gn:H" # GeoNames hydrographic features
|
||
|
||
# ----- TEMPORAL/UNCERTAIN PLACES -----
|
||
HISTORICAL:
|
||
code: "TOP.HIS"
|
||
definition: |
|
||
Historical toponyms: places that no longer exist, have changed
|
||
names, or have uncertain modern equivalents.
|
||
examples:
|
||
- "Constantinople (→ Istanbul)"
|
||
- "Batavia (→ Jakarta)"
|
||
- "New Amsterdam (→ New York)"
|
||
- "Babylon"
|
||
- "Tenochtitlan"
|
||
ontology_class: "crm:E53_Place"
|
||
pleiades_note: |
|
||
Pleiades is the authoritative gazetteer for ancient world places.
|
||
Use pleiades:Place identifiers for Greco-Roman and ancient Near
|
||
Eastern toponyms. Link via @ref in TEI.
|
||
note: |
|
||
Use crm:P2_has_type to indicate historical status.
|
||
Link historical toponyms to modern equivalents via owl:sameAs
|
||
when identity is certain, or via skos:closeMatch when the match is only approximate.
|
||
|
||
LEGENDARY:
|
||
code: "TOP.LEG"
|
||
definition: |
|
||
Legendary, mythological, or fictional places. These may have
|
||
cultural significance but lack verifiable geometry.
|
||
examples:
|
||
- "Atlantis"
|
||
- "El Dorado"
|
||
- "Avalon"
|
||
- "Middle-earth"
|
||
- "Narnia"
|
||
ontology_class: "crm:E53_Place"
|
||
tei_mapping:
|
||
element: "placeName"
|
||
attributes:
|
||
type: "mythological"
|
||
cert: "low"
|
||
note: |
|
||
Tag legendary places when they are subjects of scholarly analysis
|
||
(archaeology, literary studies, art history). Use crm:P2_has_type
|
||
to indicate legendary/fictional status.
|
||
|
||
inclusion_rules:
|
||
- id: "TOP_INC001"
|
||
rule: "Tag place names even with directional or temporal modifiers"
|
||
examples:
|
||
- "northern France"
|
||
- "East Berlin"
|
||
- "southern Netherlands"
|
||
- "medieval Paris"
|
||
|
||
- id: "TOP_INC002"
|
||
rule: "Tag complete addresses as single entities"
|
||
examples:
|
||
- "Prinsengracht 263, 1016 GV Amsterdam"
|
||
|
||
- id: "TOP_INC003"
|
||
rule: "Tag historical toponyms with their historical form"
|
||
examples:
|
||
- "Batavia (not Jakarta, unless both appear)"
|
||
- "Constantinople (not Istanbul)"
|
||
|
||
- id: "TOP_INC004"
|
||
rule: "Tag exonyms (foreign names for places)"
|
||
examples:
|
||
- "The Hague (English exonym for Den Haag)"
|
||
- "Florence (English for Firenze)"
|
||
- "Cologne (English for Köln)"
|
||
|
||
- id: "TOP_INC005"
|
||
rule: "Tag legendary places when subjects of scholarly analysis"
|
||
examples:
|
||
- "the search for Atlantis"
|
||
- "representations of Avalon in medieval art"
|
||
|
||
exclusion_rules:
|
||
- id: "TOP_EXC001"
|
||
rule: "Do NOT tag generic spatial references"
|
||
examples:
|
||
- "here, there, nearby"
|
||
- "the city (without name)"
|
||
- "the museum (unnamed; not a TOPONYM, and also excluded from GROUP by GRP_EXC001)"
|
||
|
||
- id: "TOP_EXC002"
|
||
rule: "Do NOT tag directional words alone"
|
||
examples:
|
||
- "north"
|
||
- "south"
|
||
- "east"
|
||
- "west"
|
||
|
||
- id: "TOP_EXC003"
|
||
rule: "Do NOT tag room names within buildings (unless historically significant)"
|
||
examples:
|
||
- "Room 5"
|
||
- "the lobby"
|
||
note: "Exception: 'the Sistine Chapel' (historically significant)"
|
||
|
||
- id: "TOP_EXC004"
|
||
rule: "Do NOT tag coordinates or geometries (use GEOMETRY hypernym)"
|
||
examples:
|
||
- "52.3676° N, 4.9041° E (use GEO.PNT)"
|
||
- "bounding box: 52.0, 4.5, 53.0, 5.0 (use GEO.BOX)"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# GEOMETRY - Spatial coordinates and geometric representations
|
||
# ---------------------------------------------------------------------------
|
||
# NEW in v1.7.0: Separated from PLACE to distinguish nominal (toponyms)
|
||
# from geometric (coordinates) representations of space.
|
||
# Authority: GeoSPARQL, OGC Simple Features, ISO 19107 (geographic info)
|
||
# ---------------------------------------------------------------------------
|
||
GEOMETRY:
|
||
code: "GEO"
|
||
definition: |
|
||
Geometric representations of spatial extent: coordinates, bounding boxes,
|
||
polygons, and other spatial primitives. Geometries are MATHEMATICAL
|
||
representations, distinct from nominal toponyms.
|
||
|
||
Key distinction:
|
||
- TOPONYM: "Amsterdam" (name in text)
|
||
- GEOMETRY: "52.3676° N, 4.9041° E" (coordinates)
|
||
|
||
Geometries may be:
|
||
- Extracted from text (coordinate mentions)
|
||
- Resolved from toponyms (geocoding)
|
||
- Imported from GIS data (shapefiles, GeoJSON)
|
||
|
||
design_rationale: |
|
||
GeoSPARQL (OGC standard) distinguishes:
|
||
- geo:Feature: An entity with spatial extent (≈ crm:E53_Place)
|
||
- geo:Geometry: A geometric representation of that extent
|
||
|
||
A single Feature may have multiple Geometries:
|
||
- Different precision levels
|
||
- Different time periods (historical boundaries)
|
||
- Different representations (point vs. polygon)
|
||
|
||
This separation enables:
|
||
- Linking toponyms to multiple coordinate systems
|
||
- Representing uncertainty in historical geography
|
||
- Integrating with GIS systems
|
||
|
||
ontology_mappings:
|
||
primary_class: "geo:Geometry"
|
||
primary_class_definition: |
|
||
GeoSPARQL geo:Geometry: "A coherent set of direct positions in space.
|
||
The basic geometries are Point, Curve, Surface, Solid."
|
||
alternative_classes:
|
||
- "sf:Geometry" # OGC Simple Features
|
||
- "crm:E94_Space_Primitive" # CIDOC-CRM spatial extension
|
||
linkml_mapping:
|
||
class_uri: "geo:Geometry"
|
||
exact_mappings:
|
||
- "sf:Geometry"
|
||
related_mappings:
|
||
- "schema:GeoCoordinates"
|
||
- "schema:GeoShape"
|
||
tei_mapping:
|
||
element: "geo"
|
||
note: |
|
||
TEI P5 <geo> contains coordinates in decimal degrees (WGS84).
|
||
Format: latitude,longitude or latitude longitude (space-separated).
|
||
note: |
|
||
GeoSPARQL is the OGC standard for representing geographic
|
||
information in RDF. Use geo:hasGeometry to link Features to Geometries.
|
||
|
||
subcategories:
|
||
POINT:
|
||
code: "GEO.PNT"
|
||
definition: "A single coordinate point (latitude/longitude)"
|
||
examples:
|
||
- "52.3676° N, 4.9041° E"
|
||
- "52.3676, 4.9041"
|
||
- "lat: 52.3676, lon: 4.9041"
|
||
- "N 52° 22' 3.36\", E 4° 54' 14.76\""
|
||
ontology_class: "sf:Point"
|
||
alternative_classes:
|
||
- "schema:GeoCoordinates"
|
||
geosparql_wkt: "POINT(4.9041 52.3676)"
|
||
note: |
|
||
Points may appear in various formats:
|
||
- Decimal degrees (DD): 52.3676, 4.9041
|
||
- Degrees minutes seconds (DMS): N 52° 22' 3.36"
|
||
- Signed decimal: +52.3676, +4.9041 (N/E positive, S/W negative)
|
||
|
||
Normalize to WGS84 decimal degrees for storage.
|
||
|
||
BOX:
|
||
code: "GEO.BOX"
|
||
definition: "A bounding box (minimum bounding rectangle)"
|
||
examples:
|
||
- "bounding box: 52.0, 4.5, 53.0, 5.0"
|
||
- "extent: SW 52.0, 4.5 to NE 53.0, 5.0"
|
||
ontology_class: "geo:Geometry"
|
||
alternative_classes:
|
||
- "schema:GeoShape"
|
||
geosparql_wkt: "POLYGON((4.5 52.0, 5.0 52.0, 5.0 53.0, 4.5 53.0, 4.5 52.0))"
|
||
note: |
|
||
Bounding boxes define rectangular extents.
|
||
Format conventions vary; normalize to: minLat, minLon, maxLat, maxLon.
|
||
|
||
POLYGON:
|
||
code: "GEO.PLY"
|
||
definition: "A closed polygon (administrative boundary, parcel)"
|
||
examples:
|
||
- "POLYGON((4.8 52.3, 4.9 52.3, 4.9 52.4, 4.8 52.4, 4.8 52.3))"
|
||
- "GeoJSON polygon for Noord-Holland province"
|
||
ontology_class: "sf:Polygon"
|
||
geosparql_wkt: "POLYGON((...))"
|
||
note: |
|
||
Polygons are typically imported from GIS data, not extracted from
|
||
natural language text. May appear in technical documentation.
|
||
|
||
LINE:
|
||
code: "GEO.LIN"
|
||
definition: "A line or path (route, river course, boundary segment)"
|
||
examples:
|
||
- "the route from Amsterdam to Rotterdam"
|
||
- "LINESTRING(4.9 52.4, 4.5 51.9)"
|
||
ontology_class: "sf:LineString"
|
||
note: |
|
||
Lines appear in route descriptions or boundary definitions.
|
||
Less common in heritage text; more typical in geographic data.
|
||
|
||
inclusion_rules:
|
||
- id: "GEO_INC001"
|
||
rule: "Tag coordinate mentions in any standard format"
|
||
examples:
|
||
- "located at 52.3676° N, 4.9041° E"
|
||
- "coordinates: 52.3676, 4.9041"
|
||
- "GPS: N 52° 22' 3.36\", E 4° 54' 14.76\""
|
||
|
||
- id: "GEO_INC002"
|
||
rule: "Tag WKT (Well-Known Text) geometry literals"
|
||
examples:
|
||
- "POINT(4.9041 52.3676)"
|
||
- "POLYGON((...))"
|
||
|
||
- id: "GEO_INC003"
|
||
rule: "Tag GeoJSON references when geometry is inline"
|
||
examples:
|
||
- '{"type": "Point", "coordinates": [4.9041, 52.3676]}'
|
||
|
||
exclusion_rules:
|
||
- id: "GEO_EXC001"
|
||
rule: "Do NOT tag place names (use TOPONYM hypernym)"
|
||
examples:
|
||
- "Amsterdam (use TOP.SET)"
|
||
- "the Alps (use TOP.NAT)"
|
||
|
||
- id: "GEO_EXC002"
|
||
rule: "Do NOT tag references to external GIS files"
|
||
examples:
|
||
- "see shapefile boundaries.shp (external reference)"
|
||
- "GeoJSON file: regions.geojson (external)"
|
||
note: "Only tag inline geometry, not file references"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# GROUP - Collectives of agents (formal and informal)
|
||
# ---------------------------------------------------------------------------
|
||
# BREAKING CHANGE v1.7.0: Renamed from ORGANISATION (ORG) to GROUP (GRP)
|
||
# Rationale: CIDOC-CRM E74_Group is the proper hypernym for ALL collectives:
|
||
# - Formal organizations (legal entities, corporations, governments)
|
||
# - Informal groups (movements, bands, collectives, families)
|
||
# - Historical entities (guilds, societies, courts)
|
||
#
|
||
# NERD's "Organization" is too narrow - it implies formal legal structure.
|
||
# E74_Group: "any gathering or organization of E39 Actors that acts collectively"
|
||
# ---------------------------------------------------------------------------
|
||
GROUP:
|
||
code: "GRP"
|
||
definition: |
|
||
Any collective of agents acting together. Groups range from formal
|
||
legal entities (corporations, governments, universities) to informal
|
||
collectives (bands, movements, families, friend groups).
|
||
|
||
The defining characteristic is COLLECTIVE AGENCY - the group can:
|
||
- Perform actions attributed to the group as a unit
|
||
- Hold collective identity distinct from individual members
|
||
- Persist through changes in membership
|
||
|
||
Groups are subclasses of AGENT (crm:E74_Group is a subclass of E39_Actor).
|
||
|
||
design_rationale: |
|
||
CIDOC-CRM E74_Group: "This class comprises any gatherings or organizations
|
||
of E39 Actors that act collectively or in a similar way due to any form
|
||
of unifying relationship."
|
||
|
||
This is broader than:
|
||
- org:Organization (W3C Org) - implies formal structure
|
||
- foaf:Organization - primarily corporate entities
|
||
- schema:Organization - web-focused commercial entities
|
||
|
||
E74_Group encompasses:
|
||
- Formal organizations (org:FormalOrganization)
|
||
- Informal groups (foaf:Group)
|
||
- Historical entities (guilds, courts, movements)
|
||
- Families and dynasties
|
||
- Artistic/cultural movements without formal structure
|
||
|
||
ontology_mappings:
|
||
primary_class: "crm:E74_Group"
|
||
primary_class_definition: |
|
||
CIDOC-CRM E74 Group: "This class comprises any gatherings or organizations
|
||
of E39 Actors that act collectively or in a similar way due to any form
|
||
of unifying relationship."
|
||
alternative_classes:
|
||
- "rico:CorporateBody" # For formal organizations
|
||
- "foaf:Group" # For informal groups
|
||
- "org:Organization" # W3C Org for structured organizations
|
||
linkml_mapping:
|
||
class_uri: "crm:E74_Group"
|
||
exact_mappings:
|
||
- "foaf:Group"
|
||
close_mappings:
|
||
- "org:Organization"
|
||
- "schema:Organization"
|
||
related_mappings:
|
||
- "rico:CorporateBody"
|
||
nerd_class: "nerd:Organization"
|
||
nerd_deprecation_note: |
|
||
DEPRECATED: NERD's Organization class implies formal legal structure,
|
||
excluding informal groups, movements, and historical collectives. For
|
||
Digital Humanities, CIDOC-CRM E74_Group is authoritative.
|
||
Retain NERD mapping ONLY for NLP pipeline interchange.
|
||
org_ontology:
|
||
formal_organization: "org:FormalOrganization"
|
||
organizational_unit: "org:OrganizationalUnit"
|
||
note: |
|
||
W3C Org Ontology provides precise organizational modeling:
|
||
- org:Organization: Any collection of people organized together
|
||
- org:FormalOrganization: Recognized legal entity (company, charity, government)
|
||
- org:OrganizationalUnit: Division or department within an organization
|
||
|
||
Use org:FormalOrganization for GRP.HER, GRP.COR, GRP.GOV, GRP.EDU, GRP.ASS.
|
||
Use foaf:Group or crm:E74_Group for GRP.INF.
|
||
rov_ontology:
|
||
registered_organization: "rov:RegisteredOrganization"
|
||
legal_name: "rov:legalName"
|
||
org_type: "rov:orgType"
|
||
org_status: "rov:orgStatus"
|
||
registration: "rov:registration"
|
||
note: |
|
||
Registered Organization Vocabulary (RegOrg) for legal entities:
|
||
- rov:RegisteredOrganization: Legal entity in a formal register
|
||
- rov:legalName: Registered legal name (vs. trading/brand names)
|
||
- Use ONLY for formal organizations with legal registration.
|
||
|
||
subcategories:
|
||
# ----- FORMAL ORGANIZATIONS -----
|
||
HERINST:
|
||
code: "GRP.HER"
|
||
definition: "Heritage institutions: museums, archives, libraries, galleries"
|
||
examples:
|
||
- "Rijksmuseum"
|
||
- "National Archives of the Netherlands"
|
||
- "British Library"
|
||
- "Louvre Museum"
|
||
ontology_class: "glam:HeritageCustodian"
|
||
alternative_classes:
|
||
- "org:FormalOrganization"
|
||
- "rov:RegisteredOrganization"
|
||
linkml_mapping:
|
||
class_uri: "glam:HeritageCustodian"
|
||
exact_mappings: []
|
||
close_mappings:
|
||
- "schema:Museum"
|
||
- "schema:Library"
|
||
- "schema:ArchiveOrganization"
|
||
note: |
|
||
Primary entity type for GLAM project extraction.
|
||
Most heritage institutions are formal organizations with legal registration.
|
||
Use rov:RegisteredOrganization when registration details (KvK, ISIL) are known.
|
||
|
||
PARBODY:
|
||
code: "GRP.PAR"
|
||
definition: "Parent or governing bodies of heritage institutions"
|
||
examples:
|
||
- "Ministry of Education, Culture and Science"
|
||
- "Smithsonian Institution"
|
||
- "City of Amsterdam"
|
||
ontology_class: "rico:CorporateBody"
|
||
alternative_classes:
|
||
- "org:Organization"
|
||
org_predicates:
|
||
- "org:hasSubOrganization"
|
||
- "org:hasUnit"
|
||
note: "Links to HERINST via org:hasSubOrganization or org:hasUnit"
|
||
|
||
UNIT:
|
||
code: "GRP.UNT"
|
||
definition: "Departments, divisions, or units within organizations"
|
||
examples:
|
||
- "Department of Prints and Drawings"
|
||
- "Conservation Laboratory"
|
||
- "Education Services"
|
||
- "Digital Collections Unit"
|
||
ontology_class: "org:OrganizationalUnit"
|
||
org_predicates:
|
||
- "org:unitOf"
|
||
- "org:hasUnit"
|
||
note: |
|
||
Use for named internal divisions when they function as distinct entities.
|
||
Links to parent via org:unitOf. Distinct from GRP_EXC003 generic mentions.
|
||
|
||
CORPORATION:
|
||
code: "GRP.COR"
|
||
definition: "Commercial companies and businesses"
|
||
examples:
|
||
- "Dutch East India Company (VOC)"
|
||
- "Philips"
|
||
- "Royal Dutch Shell"
|
||
- "Google"
|
||
ontology_class: "schema:Corporation"
|
||
alternative_classes:
|
||
- "org:FormalOrganization"
|
||
- "rov:RegisteredOrganization"
|
||
note: |
|
||
Historical trading companies (VOC, WIC, East India Company) are
|
||
GRP.COR even though defunct. Use crm:P2_has_type for historical status.
|
||
|
||
GOVERNMENT:
|
||
code: "GRP.GOV"
|
||
definition: "Government agencies, legislatures, and public bodies"
|
||
examples:
|
||
- "Dutch Parliament (Staten-Generaal)"
|
||
- "European Commission"
|
||
- "City Council of Amsterdam"
|
||
- "US National Endowment for the Humanities"
|
||
ontology_class: "schema:GovernmentOrganization"
|
||
alternative_classes:
|
||
- "org:FormalOrganization"
|
||
|
||
EDUCATIONAL:
|
||
code: "GRP.EDU"
|
||
definition: "Universities, schools, and educational institutions"
|
||
examples:
|
||
- "University of Amsterdam"
|
||
- "Leiden University"
|
||
- "Harvard University"
|
||
- "Royal Academy of Fine Arts"
|
||
ontology_class: "schema:EducationalOrganization"
|
||
alternative_classes:
|
||
- "org:FormalOrganization"
|
||
|
||
RELIGIOUS:
|
||
code: "GRP.REL"
|
||
definition: "Religious organizations, denominations, and congregations"
|
||
examples:
|
||
- "Roman Catholic Church"
|
||
- "Westerkerk congregation"
|
||
- "Buddhist Temple Foundation"
|
||
- "Franciscan Order"
|
||
ontology_class: "schema:ReligiousOrganization"
|
||
note: |
|
||
Religious organizations may also be heritage custodians (GRP.HER) when
|
||
they maintain archives, libraries, or collections. Use multiple types.
|
||
|
||
ASSOCIATION:
|
||
code: "GRP.ASS"
|
||
definition: "Associations, societies, and membership organizations"
|
||
examples:
|
||
- "Royal Netherlands Academy of Arts and Sciences"
|
||
- "Historical Society of Amsterdam"
|
||
- "Friends of the Rijksmuseum"
|
||
- "International Council of Museums (ICOM)"
|
||
ontology_class: "org:FormalOrganization"
|
||
alternative_classes:
|
||
- "schema:Organization"
|
||
note: |
|
||
Associations are formal groups with membership structures but may
|
||
not have commercial purposes. Distinguished from GRP.INF by having
|
||
formal bylaws, officers, and registered status.
|
||
|
||
# ----- INFORMAL GROUPS (NEW in v1.7.0) -----
|
||
INFORMAL:
|
||
code: "GRP.INF"
|
||
definition: |
|
||
Informal collectives without legal structure: artistic movements,
|
||
social groups, loose collaborations, families, dynasties.
|
||
examples:
|
||
- "The Impressionists"
|
||
- "De Stijl movement"
|
||
- "Anonymous (hacker collective)"
|
||
- "the Habsburg dynasty"
|
||
- "the Medici family"
|
||
- "the Beatles"
|
||
ontology_class: "crm:E74_Group"
|
||
alternative_classes:
|
||
- "foaf:Group"
|
||
linkml_mapping:
|
||
class_uri: "crm:E74_Group"
|
||
exact_mappings:
|
||
- "foaf:Group"
|
||
note: |
|
||
Informal groups exhibit collective agency but lack:
|
||
- Legal personality or registration
|
||
- Formal membership rules
|
||
- Organizational hierarchy
|
||
|
||
Artistic movements, dynasties, bands, and collectives are GRP.INF.
|
||
The "group" is often named retrospectively by historians/critics.
|
||
|
||
# ----- HISTORICAL GROUPS -----
|
||
HISTORICAL:
|
||
code: "GRP.HIS"
|
||
definition: "Historical organizations that no longer exist or have transformed"
|
||
examples:
|
||
- "Dutch East India Company (dissolved 1799)"
|
||
- "Guild of Saint Luke"
|
||
- "Knights Templar"
|
||
- "League of Nations"
|
||
- "the Hanseatic League"
|
||
ontology_class: "crm:E74_Group"
|
||
note: |
|
||
Use crm:P2_has_type to indicate historical/defunct status.
|
||
Historical groups may have successor organizations - link via
|
||
org:resultedFrom or crm:P151_was_formed_from (on the successor's E66_Formation).
|
||
|
||
inclusion_rules:
|
||
- id: "GRP_INC001"
|
||
rule: "Tag groups with their full official or common names"
|
||
examples:
|
||
- "Rijksmuseum Amsterdam"
|
||
- "The Metropolitan Museum of Art"
|
||
- "De Stijl"
|
||
|
||
- id: "GRP_INC002"
|
||
rule: "Tag abbreviated group names and acronyms"
|
||
examples:
|
||
- "VOC (Dutch East India Company)"
|
||
- "MoMA (Museum of Modern Art)"
|
||
- "KNAW (Royal Netherlands Academy)"
|
||
|
||
- id: "GRP_INC003"
|
||
rule: "Tag historical groups even if defunct"
|
||
examples:
|
||
- "Dutch East India Company"
|
||
- "League of Nations"
|
||
- "Guild of Saint Luke"
|
||
|
||
- id: "GRP_INC004"
|
||
rule: "Tag parent bodies and governing organizations"
|
||
examples:
|
||
- "under the Ministry of Culture"
|
||
- "part of the Smithsonian"
|
||
|
||
- id: "GRP_INC005"
|
||
rule: "Tag informal groups and movements with recognized names"
|
||
examples:
|
||
- "the Impressionists"
|
||
- "De Stijl movement"
|
||
- "the Habsburg dynasty"
|
||
|
||
- id: "GRP_INC006"
|
||
rule: "Tag families and dynasties when acting as collective agents"
|
||
examples:
|
||
- "the Medici family commissioned..."
|
||
- "Habsburg patronage of the arts"
|
||
|
||
exclusion_rules:
|
||
- id: "GRP_EXC001"
|
||
rule: "Do NOT tag generic group references"
|
||
examples:
|
||
- "the museum (without name)"
|
||
- "our organization"
|
||
- "the company"
|
||
- "a group of artists"
|
||
|
||
- id: "GRP_EXC002"
|
||
rule: "Do NOT tag group types without names"
|
||
examples:
|
||
- "museums (generic plural)"
|
||
- "archives (generic)"
|
||
- "families (generic)"
|
||
|
||
- id: "GRP_EXC003"
|
||
rule: "Do NOT tag departments unless they function as named units"
|
||
examples:
|
||
- "the prints department (use only if named entity)"
|
||
- "HR department (generic)"
|
||
|
||
- id: "GRP_EXC004"
|
||
rule: "Do NOT tag ad-hoc gatherings without persistent identity"
|
||
examples:
|
||
- "the crowd at the opening"
|
||
- "visitors to the museum"
|
||
- "the workshop participants"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# APPELLATION - Names and naming constructs (distinct from what they name)
|
||
# ---------------------------------------------------------------------------
|
||
# BREAKING CHANGE v1.7.0: Renamed from DENOMINATION (DEN) to APPELLATION (APP)
|
||
# and restructured to separate:
|
||
# - APPELLATION: Names as linguistic constructs (titles, collection names)
|
||
# - ROLE: Social/occupational positions (see new ROLE hypernym below)
|
||
#
|
||
# TEI P5 provides the authority: <name>, <title>, <roleName>, <occupation>
|
||
# are distinct element types, not conflated.
|
||
# ---------------------------------------------------------------------------
|
||
APPELLATION:
|
||
code: "APP"
|
||
definition: |
|
||
Naming constructs as linguistic entities, distinct from the things they
|
||
name. Appellations are linguistic labels that identify entities:
|
||
- Titles (artwork titles, document titles)
|
||
- Collection names (archival fonds, named collections)
|
||
- Exhibition names (named temporary displays)
|
||
- Awards (named honors and prizes)
|
||
- Structured person names (when analyzing name components)
|
||
|
||
For ROLES (positions, occupations, honorifics), see the ROLE hypernym.
|
||
|
||
design_rationale: |
|
||
CIDOC-CRM E41 Appellation: "This class comprises signs, either meaningful
|
||
or not, or arrangements of signs following a specific syntax, that are
|
||
used or can be used to refer to and identify a specific instance of some class or category within a certain context."
|
||
|
||
Appellations are DISTINCT from what they name:
|
||
- "The Night Watch" (appellation) vs. the painting (E22_Human-Made_Object)
|
||
- "Rembrandt van Rijn" (appellation) vs. the person (E21_Person)
|
||
|
||
TEI P5 uses:
|
||
- <name> for any naming string
|
||
- <persName> for person names (with sub-elements)
|
||
- <placeName> for place names
|
||
- <title> for work titles
|
||
- <roleName> for role designations (see ROLE hypernym)
|
||
|
||
ontology_mappings:
|
||
primary_class: "crm:E41_Appellation"
|
||
primary_class_definition: |
|
||
CIDOC-CRM E41 Appellation: "This class comprises signs, either meaningful
|
||
or not, or arrangements of signs following a specific syntax, that are
|
||
used or can be used to refer to and identify a specific instance of some class or category within a certain context."
|
||
alternative_classes:
|
||
- "skos:prefLabel"
|
||
- "rdfs:label"
|
||
- "rico:Name"
|
||
linkml_mapping:
|
||
class_uri: "crm:E41_Appellation"
|
||
exact_mappings: []
|
||
close_mappings:
|
||
- "skos:prefLabel"
|
||
- "rdfs:label"
|
||
nerd_class: null
|
||
nerd_note: |
|
||
NERD has no dedicated class for names/appellations. Appellation entities
|
||
are linguistic constructs that name other entities. In NIF export, these
|
||
become nif:Phrase instances without a specific NERD type.
|
||
pnv_class: "pnv:PersonName"
|
||
pnv_note: |
|
||
For person names, use PNV (Person Name Vocabulary) structured representation
|
||
with components: pnv:givenName, pnv:surnamePrefix, pnv:baseSurname, pnv:patronym
|
||
|
||
subcategories:
|
||
TITLE:
|
||
code: "APP.TIT"
|
||
definition: "Titles of artworks, books, documents, musical works"
|
||
examples:
|
||
- "The Night Watch"
|
||
- "Girl with a Pearl Earring"
|
||
- "On the Origin of Species"
|
||
- "Symphony No. 9"
|
||
ontology_class: "crm:E35_Title"
|
||
linkml_mapping:
|
||
class_uri: "crm:E35_Title"
|
||
note: |
|
||
Titles are appellations for Works (see WORK hypernym).
|
||
The same work may have multiple titles (original, translated, popular).
|
||
|
||
COLLECTION:
|
||
code: "APP.COL"
|
||
definition: "Named collections and archival fonds"
|
||
examples:
|
||
- "the Golden Age Collection"
|
||
- "Archief van de Familie Bicker"
|
||
- "Dutch Masters Collection"
|
||
- "Fonds Bicker"
|
||
ontology_class: "rico:RecordSet"
|
||
alternative_classes:
|
||
- "crm:E78_Curated_Holding"
|
||
note: |
|
||
Collection names identify aggregations of objects.
|
||
Use rico:RecordSet for archival fonds/series.
|
||
Use crm:E78_Curated_Holding for museum collections.
|
||
|
||
EXHIBITION:
|
||
code: "APP.EXH"
|
||
definition: "Named exhibitions and displays"
|
||
examples:
|
||
- "Rembrandt and the Golden Age"
|
||
- "Vermeer: Master of Light"
|
||
- "The Art of Living (2023)"
|
||
ontology_class: "crm:E7_Activity"
|
||
alternative_classes:
|
||
- "schema:ExhibitionEvent"
|
||
note: |
|
||
Exhibition names are appellations for events (E7_Activity).
|
||
Include dates when analyzing exhibition history.
|
||
|
||
AWARD:
|
||
code: "APP.AWD"
|
||
definition: "Named prizes, honors, and awards"
|
||
examples:
|
||
- "Nobel Prize in Physics"
|
||
- "Rijksmuseum Award"
|
||
- "Order of Orange-Nassau"
|
||
- "Pulitzer Prize"
|
||
ontology_class: "crm:E55_Type"
|
||
note: |
|
||
Awards are types that can be bestowed. The bestowal event
|
||
is an E7_Activity; the award type is the appellation.
|
||
|
||
PERSONNAME:
|
||
code: "APP.PNM"
|
||
definition: |
|
||
Structured person name components, for detailed name analysis.
|
||
Use when parsing name parts, not for simple person mentions.
|
||
examples:
|
||
- "van Rijn (surname prefix + surname)"
|
||
- "Rembrandt Harmenszoon (given name + patronym)"
|
||
- "Dr. Maria van den Berg, PhD"
|
||
ontology_class: "pnv:PersonName"
|
||
pnv_components:
|
||
givenName: "pnv:givenName"
|
||
patronym: "pnv:patronym"
|
||
surnamePrefix: "pnv:surnamePrefix"
|
||
baseSurname: "pnv:baseSurname"
|
||
honorificPrefix: "pnv:prefix (Dr., Prof.)"
|
||
honorificSuffix: "pnv:honorificSuffix (PhD, Jr.)"
|
||
note: |
|
||
Use APP.PNM when analyzing name structure (genealogy, prosopography).
|
||
For simple person mentions, use AGT.PER.
|
||
Honorifics in names are part of the appellation, not separate roles.
|
||
|
||
inclusion_rules:
|
||
- id: "APP_INC001"
|
||
rule: "Tag work titles in their original language when possible"
|
||
examples:
|
||
- "De Nachtwacht (Dutch)"
|
||
- "The Night Watch (English translation)"
|
||
|
||
- id: "APP_INC002"
|
||
rule: "Tag collection names including archival fonds designations"
|
||
examples:
|
||
- "Fonds Bicker"
|
||
- "Collection Fodor"
|
||
|
||
- id: "APP_INC003"
|
||
rule: "Tag exhibition titles with dates when given"
|
||
examples:
|
||
- "Vermeer (2023)"
|
||
- "Rembrandt: The Late Works (2015)"
|
||
|
||
exclusion_rules:
|
||
- id: "APP_EXC001"
|
||
rule: "Do NOT tag generic descriptive phrases"
|
||
examples:
|
||
- "the painting (not a title)"
|
||
- "the exhibition (not named)"
|
||
|
||
- id: "APP_EXC002"
|
||
rule: "Do NOT double-tag: if tagged as AGENT/GROUP, do not also tag as APPELLATION"
|
||
note: "Exception: when specifically analyzing name structure (APP.PNM)"
|
||
|
||
- id: "APP_EXC003"
|
||
rule: "Do NOT tag roles/occupations as appellations (use ROLE hypernym)"
|
||
examples:
|
||
- "Director (use ROL.OCC)"
|
||
- "Curator (use ROL.OCC)"
|
||
- "His Majesty (use ROL.HON)"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# ROLE - Social positions, occupations, and honorifics (TEI roleName model)
|
||
# ---------------------------------------------------------------------------
|
||
# NEW in v1.7.0: Separated from DENOMINATION to properly model:
|
||
# - Occupational roles (curator, archivist, director)
|
||
# - Honorific titles (Your Majesty, Professor, Dr.)
|
||
# - Social positions (president, secretary, member)
|
||
# - Relational roles (father of, student of)
|
||
#
|
||
# TEI P5: <roleName> is distinct from <persName> and <occupation>
|
||
# W3C Org: org:Role and org:Post model organizational positions
|
||
# ---------------------------------------------------------------------------
|
||
ROLE:
|
||
code: "ROL"
|
||
definition: |
|
||
Social, occupational, and ceremonial positions that agents hold or
|
||
have held. Roles are POSITIONS, not the persons who fill them.
|
||
|
||
Key distinction:
|
||
- AGENT: The person ("Jan de Wit")
|
||
- ROLE: The position ("Director", "Curator")
|
||
- APPELLATION: The name as linguistic construct ("Dr. Jan de Wit")
|
||
|
||
Roles include:
|
||
- Occupations: Professional positions (curator, archivist, librarian)
|
||
- Titles: Honorific designations (Professor, Dr., Sir)
|
||
- Offices: Formal positions in organizations (Director, President)
|
||
- Relational: Family/social relationships (father of, mentor to)
|
||
|
||
design_rationale: |
|
||
TEI P5 distinguishes:
|
||
- <roleName>: "contains a name component which indicates that the
|
||
referent has a particular role or position in society"
|
||
- <occupation>: "contains an informal description of a person's trade,
|
||
profession, or occupation"
|
||
|
||
W3C Org Ontology provides:
|
||
- org:Role: A role within an organization
|
||
- org:Post: A specific position that exists independently of holders
|
||
- org:Membership: The n-ary relation of agent+organization+role
|
||
|
||
CIDOC-CRM uses:
|
||
- E55_Type for role classification
|
||
- P14.1_in_the_role_of for role-in-activity
|
||
|
||
Roles are NOT the same as:
|
||
- AGENT (the person, not the position)
|
||
- APPELLATION (the name, not the social function)
|
||
- GROUP (the organization, not the position within it)
|
||
|
||
ontology_mappings:
|
||
primary_class: "org:Role"
|
||
primary_class_definition: |
|
||
W3C Org Ontology org:Role: "Denotes a role that a Person or other Agent
|
||
can take in an organization. Instances of this class describe the
|
||
abstract role; to denote a specific instance of a person playing that
|
||
role in a specific organization use a Membership."
|
||
alternative_classes:
|
||
- "org:Post" # Specific position (e.g., "Director of the Rijksmuseum")
|
||
- "crm:E55_Type" # CIDOC-CRM classification
|
||
- "schema:Role" # Schema.org
|
||
linkml_mapping:
|
||
class_uri: "org:Role"
|
||
exact_mappings:
|
||
- "schema:Role"
|
||
close_mappings:
|
||
- "crm:E55_Type"
|
||
tei_mapping:
|
||
element: "roleName"
|
||
attributes:
|
||
type: "honorific|official|occupation|civil|military|religious|nobility"
|
||
nymRef: "URI reference to controlled vocabulary"
|
||
related_elements:
|
||
- "occupation"
|
||
- "affiliation"
|
||
nerd_class: null
|
||
nerd_note: |
|
||
NERD has no dedicated class for roles/occupations. Most NER systems
|
||
conflate roles with persons. This convention separates them for
|
||
precision in heritage contexts (staff roles, historical titles).
|
||
|
||
subcategories:
|
||
# ----- OCCUPATIONAL ROLES -----
|
||
OCCUPATION:
|
||
code: "ROL.OCC"
|
||
definition: "Professional occupations and job titles"
|
||
examples:
|
||
- "curator"
|
||
- "archivist"
|
||
- "librarian"
|
||
- "conservator"
|
||
- "director"
|
||
- "registrar"
|
||
ontology_class: "schema:Occupation"
|
||
alternative_classes:
|
||
- "tei:occupation"
|
||
linkml_mapping:
|
||
class_uri: "schema:Occupation"
|
||
related_mappings:
|
||
- "schema:occupationalCategory"
|
||
isco_note: |
|
||
Link to ISCO-08 (International Standard Classification of Occupations)
|
||
for standardized occupation codes when possible.
|
||
note: |
|
||
Occupations are general categories of work.
|
||
For specific positions in organizations, use ROL.POS.
|
||
For the person holding the occupation, use AGENT.STF.
|
||
|
||
POSITION:
|
||
code: "ROL.POS"
|
||
definition: |
|
||
Specific positions within organizations, typically unique roles
|
||
that persist beyond the current holder.
|
||
examples:
|
||
- "Director of the Rijksmuseum"
|
||
- "Chief Curator of Prints"
|
||
- "Head of Conservation"
|
||
- "President of ICOM"
|
||
ontology_class: "org:Post"
|
||
org_properties:
|
||
postIn: "org:postIn (links to organization)"
|
||
heldBy: "org:heldBy (links to agent)"
|
||
note: |
|
||
Positions (Posts) are distinct from the people who hold them.
|
||
org:Post models positions that exist independently of holders.
|
||
Use when the position itself is referenced, not just the occupation.
|
||
|
||
# ----- HONORIFIC TITLES -----
|
||
HONORIFIC:
|
||
code: "ROL.HON"
|
||
definition: |
|
||
Honorific titles, forms of address, and ceremonial designations.
|
||
examples:
|
||
- "Your Majesty"
|
||
- "Professor"
|
||
- "Dr."
|
||
- "Sir"
|
||
- "Dame"
|
||
- "His Excellency"
|
||
- "Reverend"
|
||
ontology_class: "crm:E55_Type"
|
||
tei_mapping:
|
||
element: "roleName"
|
||
attribute_type: "honorific"
|
||
pnv_mapping: "pnv:honorificPrefix | pnv:honorificSuffix"
|
||
note: |
|
||
Honorifics often appear with names but are roles, not name parts:
|
||
- "Professor Einstein" → AGENT.PER with ROL.HON
|
||
- When analyzing name structure: APP.PNM with honorific component
|
||
|
||
Academic degrees (PhD, MA) after names are honorificSuffix.
|
||
|
||
# ----- NOBILITY AND RANK -----
|
||
NOBILITY:
|
||
code: "ROL.NOB"
|
||
definition: "Noble titles, aristocratic ranks, and royal designations"
|
||
examples:
|
||
- "Duke of Wellington"
|
||
- "Countess of Blessington"
|
||
- "Prince of Orange"
|
||
- "Baron van Pallandt"
|
||
- "King"
|
||
- "Queen"
|
||
ontology_class: "crm:E55_Type"
|
||
tei_mapping:
|
||
element: "roleName"
|
||
attribute_type: "nobility"
|
||
note: |
|
||
Nobility titles may be:
|
||
- Hereditary: Pass through family lineage
|
||
- Created: Bestowed as honor
|
||
- Extinct: Historical titles no longer in use
|
||
|
||
Link to authority files for genealogical precision.
|
||
|
||
# ----- RELIGIOUS TITLES -----
|
||
RELIGIOUS:
|
||
code: "ROL.REL"
|
||
definition: "Religious titles, orders, and ecclesiastical ranks"
|
||
examples:
|
||
- "Pope"
|
||
- "Cardinal"
|
||
- "Rabbi"
|
||
- "Imam"
|
||
- "Reverend"
|
||
- "Brother"
|
||
- "Sister"
|
||
- "Abbot"
|
||
ontology_class: "crm:E55_Type"
|
||
tei_mapping:
|
||
element: "roleName"
|
||
attribute_type: "religious"
|
||
note: |
|
||
Religious roles span traditions:
|
||
- Christian: Pope, Bishop, Deacon, Abbot, Prior
|
||
- Jewish: Rabbi, Cantor
|
||
- Islamic: Imam, Mufti, Ayatollah
|
||
- Buddhist: Monk, Lama, Rinpoche
|
||
|
||
# ----- RELATIONAL ROLES -----
|
||
RELATIONAL:
|
||
code: "ROL.RLT"
|
||
definition: |
|
||
Relational roles defined by relationship to another agent:
|
||
family relationships, apprenticeship, mentorship.
|
||
examples:
|
||
- "father of Rembrandt"
|
||
- "student of Frans Hals"
|
||
- "wife of"
|
||
- "heir to"
|
||
- "successor of"
|
||
ontology_class: "crm:E55_Type"
|
||
bio_relations:
|
||
family: "bio:parent, bio:child, bio:spouse, bio:sibling"
|
||
professional: "bio:student, bio:mentor"
|
||
note: |
|
||
Relational roles require a reference agent:
|
||
- "X's father" → the referent holds the role "father" relative to X
|
||
|
||
For family relationships, use BIO ontology properties.
|
||
For professional relationships, use org:memberOf context.
|
||
|
||
inclusion_rules:
|
||
- id: "ROL_INC001"
|
||
rule: "Tag occupation terms when they identify professional function"
|
||
examples:
|
||
- "the curator stated..."
|
||
- "as archivist, she organized..."
|
||
|
||
- id: "ROL_INC002"
|
||
rule: "Tag honorifics and role titles that precede or follow names"
|
||
examples:
|
||
- "Professor Einstein"
|
||
- "Dr. Marie Curie"
|
||
- "Jan de Wit, Director"
|
||
|
||
- id: "ROL_INC003"
|
||
rule: "Tag specific organizational positions"
|
||
examples:
|
||
- "Director of the Rijksmuseum"
|
||
- "Chief Curator of Medieval Art"
|
||
|
||
- id: "ROL_INC004"
|
||
rule: "Tag nobility and religious titles"
|
||
examples:
|
||
- "Duke of Wellington"
|
||
- "Cardinal Richelieu"
|
||
- "Rabbi Akiva"
|
||
|
||
exclusion_rules:
|
||
- id: "ROL_EXC001"
|
||
rule: "Do NOT tag generic person references as roles"
|
||
examples:
|
||
- "the man (not a role)"
|
||
- "a woman (not a role)"
|
||
- "someone (not a role)"
|
||
|
||
- id: "ROL_EXC002"
|
||
rule: "Do NOT tag organizations as roles"
|
||
examples:
|
||
- "the museum (use GROUP)"
|
||
- "the university (use GROUP)"
|
||
|
||
- id: "ROL_EXC003"
|
||
rule: "Do NOT double-tag one span: split a person with role context into separate AGENT + ROLE annotations"
|
||
examples:
|
||
- "'Director Jan de Wit' → AGENT.PER + ROL.POS"
|
||
note: "Create separate annotations with linking"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# QUANTITY - Numeric and measurable values
|
||
# ---------------------------------------------------------------------------
|
||
QUANTITY:
|
||
code: "QTY"
|
||
definition: |
|
||
Numerical quantities, measurements, counts, and ranges. Includes
|
||
dimensions, populations, statistics, and any quantifiable values
|
||
with or without units.
|
||
|
||
ontology_mappings:
|
||
primary_class: "crm:E54_Dimension"
|
||
alternative_classes:
|
||
- "schema:QuantitativeValue"
|
||
- "ivoa:Measure"
|
||
nerd_class: "nerd:Amount"
|
||
nerd_note: "NERD Amount class for all quantities and measurements."
|
||
|
||
subcategories:
|
||
COUNT:
|
||
code: "QTY.CNT"
|
||
definition: "Cardinal numbers and item counts"
|
||
examples:
|
||
- "8,000 artworks"
|
||
- "over 1 million visitors"
|
||
- "15 staff members"
|
||
ontology_class: "xsd:integer"
|
||
|
||
MEASUREMENT:
|
||
code: "QTY.MSR"
|
||
definition: "Physical measurements with units"
|
||
examples:
|
||
- "363 cm by 437 cm"
|
||
- "2.5 meters tall"
|
||
- "500 square meters"
|
||
ontology_class: "crm:E54_Dimension"
|
||
|
||
CURRENCY:
|
||
code: "QTY.CUR"
|
||
definition: "Monetary amounts"
|
||
examples:
|
||
- "15 euro admission"
|
||
- "5 million euros"
|
||
- "1.2 billion dollars"
|
||
ontology_class: "schema:MonetaryAmount"
|
||
|
||
PERCENTAGE:
|
||
code: "QTY.PCT"
|
||
definition: "Percentages and ratios"
|
||
examples:
|
||
- "75 percent of the collection"
|
||
- "one-third of visitors"
|
||
ontology_class: "xsd:decimal"
|
||
|
||
RANGE:
|
||
code: "QTY.RNG"
|
||
definition: "Numeric ranges"
|
||
examples:
|
||
- "between 50,000 and 100,000"
|
||
- "10-15 meters"
|
||
- "ages 6-12"
|
||
ontology_class: "schema:QuantitativeValue"
|
||
|
||
ORDINAL:
|
||
code: "QTY.ORD"
|
||
definition: "Ordinal numbers and rankings"
|
||
examples:
|
||
- "the 5th largest museum"
|
||
- "first place"
|
||
- "the 3rd edition"
|
||
ontology_class: "xsd:integer"
|
||
|
||
inclusion_rules:
|
||
- id: "QTY_INC001"
|
||
rule: "Tag quantities with their units as single entities"
|
||
examples:
|
||
- "8,000 paintings (single entity)"
|
||
- "2.5 million visitors annually"
|
||
|
||
- id: "QTY_INC002"
|
||
rule: "Tag approximate quantities"
|
||
examples:
|
||
- "approximately 500"
|
||
- "over 1 million"
|
||
- "nearly 10,000"
|
||
|
||
- id: "QTY_INC003"
|
||
rule: "Tag ranges as single entities"
|
||
examples:
|
||
- "10,000 to 15,000 items"
|
||
|
||
exclusion_rules:
|
||
- id: "QTY_EXC001"
|
||
rule: "Do NOT tag numbers that are part of proper names"
|
||
examples:
|
||
- "World War 2 (part of name, tag as THING.EVT)"
|
||
- "Henry VIII (part of name, tag as AGENT)"
|
||
|
||
- id: "QTY_EXC002"
|
||
rule: "Do NOT tag page numbers, reference numbers, or identifiers"
|
||
examples:
|
||
- "page 42"
|
||
- "inventory number 12345"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# TEMPORAL - Time references following TimeML/TIMEX3 model
|
||
# ---------------------------------------------------------------------------
|
||
# BREAKING CHANGE v1.7.0: Restructured from TEMPORAL_REFERENCE (TMP)
|
||
# Now follows TimeML/TIMEX3 typology:
|
||
# - DATE: Calendar dates (absolute)
|
||
# - TIME: Clock times
|
||
# - DURATION: Temporal spans
|
||
# - SET: Recurring/periodic times
|
||
# Plus absolute vs. relative distinction (relative expressions resolved via TIMEX3 @anchorTimeID)
|
||
# ---------------------------------------------------------------------------
|
||
TEMPORAL:
|
||
code: "TMP"
|
||
definition: |
|
||
Temporal expressions following the TimeML/TIMEX3 standard. Temporal
|
||
expressions denote points, intervals, or frequencies on the timeline.
|
||
|
||
Key distinctions:
|
||
- ABSOLUTE: Resolved to specific calendar/clock ("15 July 1606")
|
||
- RELATIVE: Requires context to resolve ("last year", "recently")
|
||
- DURATION: Temporal extent ("three years", "for a decade")
|
||
- SET: Recurring frequencies ("every Monday", "annually")
|
||
|
||
TimeML/TIMEX3 is the ISO standard for temporal expression markup,
|
||
widely used in NLP (TimeBank, TempEval). This provides interoperability
|
||
with temporal reasoning systems.
|
||
|
||
design_rationale: |
|
||
NERD's generic "Time" class lacks the semantic precision needed for:
|
||
- Temporal reasoning and event ordering
|
||
- Calendar normalization (Julian vs. Gregorian)
|
||
- Duration calculations
|
||
- Frequency detection (for opening hours, events)
|
||
|
||
TimeML/TIMEX3 (ISO-TimeML) provides:
|
||
- TYPE attribute: DATE | TIME | DURATION | SET
|
||
- VALUE attribute: ISO 8601 normalized value
|
||
- MOD attribute: START | END | MID | BEFORE | AFTER | APPROX
|
||
|
||
CIDOC-CRM provides complementary modeling:
|
||
- E52 Time-Span: Temporal extent with fuzzy boundaries
|
||
- E61 Time Primitive: Precise instant or interval
|
||
- E4 Period: Named historical periods
|
||
|
||
ontology_mappings:
|
||
primary_class: "crm:E52_Time-Span"
|
||
primary_class_definition: |
|
||
CIDOC-CRM E52 Time-Span: "This class comprises abstract temporal extents,
|
||
in the sense of Galilean physics, having a beginning, an end and a
|
||
duration."
|
||
alternative_classes:
|
||
- "time:TemporalEntity" # W3C Time Ontology
|
||
- "time:Instant" # Specific point
|
||
- "time:Interval" # Duration with bounds
|
||
linkml_mapping:
|
||
class_uri: "crm:E52_Time-Span"
|
||
exact_mappings:
|
||
- "time:TemporalEntity"
|
||
close_mappings:
|
||
- "dct:temporal"
|
||
nerd_class: "nerd:Time"
|
||
nerd_deprecation_note: |
|
||
DEPRECATED: NERD's Time class is too generic, lacking the DATE/TIME/
|
||
DURATION/SET typology essential for temporal reasoning. Use TimeML
|
||
TIMEX3 types for NLP annotation; map to CIDOC-CRM for semantics.
|
||
Retain NERD mapping ONLY for basic NLP pipeline interchange.
|
||
timeml_mapping:
|
||
element: "TIMEX3"
|
||
attributes:
|
||
type: "DATE | TIME | DURATION | SET"
|
||
value: "ISO 8601 normalized value"
|
||
mod: "BEFORE | AFTER | APPROX | START | END | MID"
|
||
anchorTimeID: "ID of anchoring time for relative expressions"
|
||
note: |
|
||
W3C Time Ontology (OWL-Time) provides formal semantics for temporal
|
||
entities compatible with CIDOC-CRM. TimeML provides NLP annotation
|
||
conventions. Both map to ISO 8601 for interchange.
|
||
|
||
subcategories:
|
||
# ----- ABSOLUTE TEMPORAL EXPRESSIONS -----
|
||
DATE_ABS:
|
||
code: "TMP.DAB"
|
||
definition: |
|
||
Absolute calendar dates that can be resolved without context.
|
||
Maps to TimeML TIMEX3 type="DATE" without relative modifiers.
|
||
examples:
|
||
- "15 July 1606"
|
||
- "March 2023"
|
||
- "1888"
|
||
- "12/04/1943"
|
||
- "the year 2000"
|
||
ontology_class: "time:Instant"
|
||
timeml_type: "DATE"
|
||
xsd_type: "xsd:date | xsd:gYear | xsd:gYearMonth"
|
||
linkml_mapping:
|
||
class_uri: "time:Instant"
|
||
exact_mappings:
|
||
- "xsd:date"
|
||
- "xsd:gYear"
|
||
note: |
|
||
Normalize to ISO 8601 format when possible:
|
||
- Full date: 1606-07-15
|
||
- Year-month: 2023-03
|
||
- Year only: 1888
|
||
|
||
Historical dates may require calendar specification (Julian/Gregorian).
|
||
|
||
DATE_REL:
|
||
code: "TMP.DRL"
|
||
definition: |
|
||
Relative dates requiring context (document date, speech time)
|
||
to resolve. Maps to TimeML TIMEX3 type="DATE" with anchorTimeID.
|
||
examples:
|
||
- "last year"
|
||
- "yesterday"
|
||
- "next month"
|
||
- "three years ago"
|
||
- "in recent decades"
|
||
- "since the war"
|
||
ontology_class: "crm:E52_Time-Span"
|
||
timeml_type: "DATE"
|
||
timeml_anchor: "Requires @anchorTimeID to resolve"
|
||
note: |
|
||
Relative dates are common in heritage texts but require context:
|
||
- "last year" → need document_date to resolve
|
||
- "since the war" → need historical context (which war?)
|
||
|
||
Mark with @certainty when resolution is uncertain.
|
||
|
||
TIME_ABS:
|
||
code: "TMP.TAB"
|
||
definition: |
|
||
Absolute clock times. Maps to TimeML TIMEX3 type="TIME".
|
||
examples:
|
||
- "10:00"
|
||
- "14:30"
|
||
- "midnight"
|
||
- "noon"
|
||
- "3 PM"
|
||
ontology_class: "time:Instant"
|
||
timeml_type: "TIME"
|
||
xsd_type: "xsd:time"
|
||
note: |
|
||
Normalize to 24-hour ISO 8601 format: HH:MM:SS
|
||
Named times: midnight → 00:00, noon → 12:00
|
||
|
||
TIME_REL:
|
||
code: "TMP.TRL"
|
||
definition: |
|
||
Relative clock times requiring context.
|
||
examples:
|
||
- "an hour later"
|
||
- "that morning"
|
||
- "in the evening"
|
||
ontology_class: "crm:E52_Time-Span"
|
||
timeml_type: "TIME"
|
||
|
||
# ----- DURATIONS -----
|
||
DURATION:
|
||
code: "TMP.DUR"
|
||
definition: |
|
||
Temporal durations: lengths of time without fixed endpoints.
|
||
Maps to TimeML TIMEX3 type="DURATION".
|
||
examples:
|
||
- "three years"
|
||
- "for a decade"
|
||
- "two hours"
|
||
- "the 17th century (100 years)"
|
||
- "a fortnight"
|
||
- "several months"
|
||
ontology_class: "time:Duration"
|
||
timeml_type: "DURATION"
|
||
xsd_type: "xsd:duration"
|
||
linkml_mapping:
|
||
class_uri: "time:Duration"
|
||
exact_mappings:
|
||
- "xsd:duration"
|
||
note: |
|
||
Normalize to ISO 8601 duration format: P[n]Y[n]M[n]DT[n]H[n]M[n]S
|
||
Examples:
|
||
- "three years" → P3Y
|
||
- "two hours" → PT2H
|
||
- "a decade" → P10Y
|
||
- "17th century" → P100Y (with temporal bounds for period)
|
||
|
||
# ----- RECURRING/PERIODIC TIMES -----
|
||
SET:
|
||
code: "TMP.SET"
|
||
definition: |
|
||
Recurring or periodic temporal expressions. Maps to TimeML
|
||
TIMEX3 type="SET". Common for opening hours, event schedules.
|
||
examples:
|
||
- "every Monday"
|
||
- "annually"
|
||
- "twice a week"
|
||
- "open Tuesday-Sunday"
|
||
- "each summer"
|
||
- "quarterly"
|
||
ontology_class: "schema:OpeningHoursSpecification"
|
||
timeml_type: "SET"
|
||
alternative_classes:
|
||
- "time:GeneralDateTimeDescription"
|
||
note: |
|
||
SET expressions describe recurring patterns:
|
||
- Frequency: "twice a week" → SET with value="P1W" @freq="2X"
|
||
- Schedule: "every Monday" → SET with value="XXXX-WXX-1"
|
||
|
||
For opening hours, schema:OpeningHoursSpecification provides
|
||
structured representation with dayOfWeek, opens, closes properties.
|
||
|
||
# ----- OPENING HOURS (specialized SET) -----
|
||
OPENHRS:
|
||
code: "TMP.OPH"
|
||
definition: |
|
||
Institutional opening hours and operational schedules. A specialized
|
||
form of SET expression.
|
||
examples:
|
||
- "open Tuesday-Sunday 10:00-17:00"
|
||
- "closed Mondays"
|
||
- "last entry at 16:30"
|
||
- "open daily except holidays"
|
||
ontology_class: "schema:OpeningHoursSpecification"
|
||
schema_properties:
|
||
dayOfWeek: "schema:DayOfWeek (Monday, Tuesday, etc.)"
|
||
opens: "xsd:time (opening time)"
|
||
closes: "xsd:time (closing time)"
|
||
validFrom: "xsd:date (seasonal start)"
|
||
validThrough: "xsd:date (seasonal end)"
|
||
note: |
|
||
Links to GROUP hypernym (heritage institutions) via schema:openingHours.
|
||
Use schema:specialOpeningHoursSpecification for holidays/exceptions.
|
||
|
||
# ----- DATE RANGES -----
|
||
RANGE:
|
||
code: "TMP.RNG"
|
||
definition: |
|
||
Date/time ranges with explicit start and end points.
|
||
examples:
|
||
- "1888-1890"
|
||
- "from March to June 2023"
|
||
- "10 February - 4 June 2023"
|
||
- "between 1650 and 1670"
|
||
ontology_class: "time:ProperInterval"
|
||
alternative_classes:
|
||
- "crm:E52_Time-Span"
|
||
edtf_note: |
|
||
Extended Date/Time Format (EDTF, ISO 8601-2) provides:
|
||
- Intervals: 1888/1890
|
||
- Open ranges: 1888/.. (from 1888 onwards)
|
||
- Uncertain: 1888?/1890
|
||
|
||
# ----- NAMED PERIODS -----
|
||
CENTURY:
|
||
code: "TMP.CEN"
|
||
definition: "Century references, a common periodization"
|
||
examples:
|
||
- "17th century"
|
||
- "the 1800s"
|
||
- "nineteenth century"
|
||
- "early 20th century"
|
||
ontology_class: "crm:E52_Time-Span"
|
||
timeml_type: "DATE"
|
||
note: |
|
||
Normalize centuries to date ranges:
|
||
- "17th century" → 1601-01-01/1700-12-31
|
||
- "the 1800s" → 1800-01-01/1899-12-31
|
||
|
||
Modifiers (early, mid, late) narrow the range.
|
||
|
||
ERA:
|
||
code: "TMP.ERA"
|
||
definition: |
|
||
Named historical periods, movements, and eras. These are cultural
|
||
periodizations, not calendar units.
|
||
examples:
|
||
- "the Golden Age"
|
||
- "the Renaissance"
|
||
- "Medieval period"
|
||
- "Edo period"
|
||
- "the Enlightenment"
|
||
- "Art Deco era"
|
||
ontology_class: "crm:E4_Period"
|
||
linkml_mapping:
|
||
class_uri: "crm:E4_Period"
|
||
close_mappings:
|
||
- "dct:PeriodOfTime"
|
||
note: |
|
||
Named periods have fuzzy boundaries and geographic variation:
|
||
- "Renaissance" varies by region (Italy vs. Northern Europe)
|
||
- "Golden Age" is culture-specific (Dutch vs. Spanish)
|
||
|
||
Use crm:E4_Period which explicitly allows fuzzy temporal boundaries.
|
||
Link to authority files (Getty AAT, Wikidata) for disambiguation.
|
||
|
||
EXHIBPER:
|
||
code: "TMP.EXP"
|
||
definition: "Exhibition periods and event dates"
|
||
examples:
|
||
- "10 February - 4 June 2023"
|
||
- "on view through December"
|
||
- "opening reception: May 5, 7-9 PM"
|
||
ontology_class: "crm:E52_Time-Span"
|
||
schema_mapping: "schema:Event with schema:startDate and schema:endDate"
|
||
note: "Use for temporally bounded institutional events."
|
||
|
||
inclusion_rules:
|
||
- id: "TMP_INC001"
|
||
rule: "Tag complete date/time expressions as single entities"
|
||
examples:
|
||
- "15 July 1606 (single entity)"
|
||
- "between 1888 and 1890 (range)"
|
||
- "every Monday at 10:00"
|
||
|
||
- id: "TMP_INC002"
|
||
rule: "Tag named periods and eras"
|
||
examples:
|
||
- "the Dutch Golden Age"
|
||
- "the Baroque period"
|
||
- "during the Renaissance"
|
||
|
||
- id: "TMP_INC003"
|
||
rule: "Tag opening hours as complete SET expressions"
|
||
examples:
|
||
- "Tuesday to Sunday, 10:00-17:00"
|
||
- "open daily except Mondays"
|
||
|
||
- id: "TMP_INC004"
|
||
rule: "Tag relative expressions with their anchor context"
|
||
examples:
|
||
- "last year (relative to document date)"
|
||
- "since the merger (relative to event)"
|
||
|
||
- id: "TMP_INC005"
|
||
rule: "Tag durations even without specific anchoring"
|
||
examples:
|
||
- "for three years"
|
||
- "a decade of research"
|
||
|
||
exclusion_rules:
|
||
- id: "TMP_EXC001"
|
||
rule: "Do NOT tag deictics without recoverable reference"
|
||
examples:
|
||
- "now (unanchored)"
|
||
- "today (unless document date known)"
|
||
note: "These require pragmatic resolution beyond text"
|
||
|
||
- id: "TMP_EXC002"
|
||
rule: "Do NOT tag ordinal centuries as QUANTITY"
|
||
examples:
|
||
- "17th century → TMP.CEN, not QTY.ORD"
|
||
|
||
- id: "TMP_EXC003"
|
||
rule: "Do NOT tag temporal prepositions alone"
|
||
examples:
|
||
- "before (preposition, not temporal reference)"
|
||
- "during (connector)"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# WORK - Intellectual/creative works following FRBR/LRM/FRBRoo model
|
||
# ---------------------------------------------------------------------------
|
||
# BREAKING CHANGE v1.7.0: Renamed from TEXTUAL_REFERENCE (TXT) to WORK (WRK)
|
||
# and restructured following FRBR/IFLA-LRM model:
|
||
# - WORK: Abstract intellectual creation (the concept)
|
||
# - EXPRESSION: Realization in a specific form (the text)
|
||
# - MANIFESTATION: Physical embodiment (the publication)
|
||
# - ITEM: Single exemplar (the specific copy)
|
||
#
|
||
# NERD's "Product" class is semantically wrong for intellectual works.
|
||
# FRBR provides the bibliographic and archival standard.
|
||
# ---------------------------------------------------------------------------
|
||
WORK:
|
||
code: "WRK"
|
||
definition: |
|
||
References to intellectual and creative works, spanning the FRBR/LRM
|
||
abstraction levels from abstract concept (Work) to physical copy (Item).
|
||
|
||
The FRBR model distinguishes:
|
||
- WORK: Distinct intellectual creation ("Hamlet" as a concept)
|
||
- EXPRESSION: Specific intellectual realization ("Hamlet in Dutch translation")
|
||
- MANIFESTATION: Physical embodiment ("2023 Penguin edition of Hamlet")
|
||
- ITEM: Single exemplar ("the copy in Amsterdam Public Library")
|
||
|
||
Most textual mentions reference WORK level (titles, concepts).
|
||
Archival contexts often reference MANIFESTATION or ITEM level.
|
||
|
||
design_rationale: |
|
||
NERD's "Product" class conflates:
|
||
- Creative works (books, music, films)
|
||
- Commercial products (software, merchandise)
|
||
- Services (subscriptions, access)
|
||
|
||
For heritage contexts, FRBR (now IFLA-LRM) is the bibliographic standard:
|
||
- Library catalogs use FRBR for bibliographic relationships
|
||
- FRBRoo integrates with CIDOC-CRM for museum contexts
|
||
- RiC-O (archival) aligns with FRBR for records
|
||
|
||
This enables:
|
||
- Linking editions, translations, and copies of the same work
|
||
- Tracking provenance of specific items
|
||
- Distinguishing conceptual references from physical objects
|
||
|
||
ontology_mappings:
|
||
primary_class: "frbroo:F1_Work"
|
||
primary_class_definition: |
|
||
FRBRoo F1 Work: "This class comprises distinct concepts or combination
|
||
of concepts identified in artistic and intellectual expressions."
|
||
Equivalent to IFLA-LRM Work.
|
||
alternative_classes:
|
||
- "lrm:Work" # IFLA Library Reference Model
|
||
- "crm:E89_Propositional_Object" # CIDOC-CRM intellectual content
|
||
- "schema:CreativeWork" # Schema.org
|
||
- "dct:BibliographicResource" # Dublin Core
|
||
linkml_mapping:
|
||
class_uri: "frbroo:F1_Work"
|
||
exact_mappings:
|
||
- "lrm:Work"
|
||
close_mappings:
|
||
- "schema:CreativeWork"
|
||
- "crm:E89_Propositional_Object"
|
||
related_mappings:
|
||
- "bf:Work" # BIBFRAME
|
||
nerd_class: "nerd:Product"
|
||
nerd_deprecation_note: |
|
||
DEPRECATED: NERD's Product class is semantically incorrect for
|
||
intellectual works. "Product" implies commercial goods, not creative
|
||
expressions. Use FRBR/FRBRoo/LRM for bibliographic precision.
|
||
Retain NERD mapping ONLY for basic NLP pipeline interchange.
|
||
frbr_mapping:
|
||
work: "frbroo:F1_Work | lrm:Work"
|
||
expression: "frbroo:F2_Expression | lrm:Expression"
|
||
manifestation: "frbroo:F3_Manifestation | lrm:Manifestation"
|
||
item: "frbroo:F5_Item | lrm:Item"
|
||
note: |
|
||
FRBRoo is the object-oriented version of FRBR, aligned with CIDOC-CRM.
|
||
IFLA-LRM (2017) is the consolidated successor to FRBR/FRAD/FRSAD.
|
||
BIBFRAME (bf:) is the Library of Congress RDF vocabulary for bibliographic data.
|
||
|
||
subcategories:
|
||
# ----- FRBR WORK LEVEL (abstract concept) -----
|
||
WORK_ABSTRACT:
|
||
code: "WRK.ABS"
|
||
definition: |
|
||
Abstract intellectual works referenced by title or concept,
|
||
independent of specific editions or copies.
|
||
examples:
|
||
- "Hamlet"
|
||
- "The Diary of Anne Frank"
|
||
- "Beethoven's Ninth Symphony"
|
||
- "the Quran"
|
||
ontology_class: "frbroo:F1_Work"
|
||
alternative_classes:
|
||
- "lrm:Work"
|
||
- "bf:Work"
|
||
linkml_mapping:
|
||
class_uri: "frbroo:F1_Work"
|
||
exact_mappings:
|
||
- "lrm:Work"
|
||
- "bf:Work"
|
||
note: |
|
||
Use for references to works as concepts, not specific editions:
|
||
- "Shakespeare's Hamlet" (the work, not a specific edition)
|
||
- "Anne Frank's diary" (the conceptual work)
|
||
|
||
Link to authority files (VIAF, Wikidata) for work identification.
|
||
|
||
# ----- FRBR EXPRESSION LEVEL (specific realization) -----
|
||
EXPRESSION:
|
||
code: "WRK.EXP"
|
||
definition: |
|
||
Specific realizations of works: translations, versions, arrangements,
|
||
performances captured as recordings.
|
||
examples:
|
||
- "the Dutch translation of Hamlet"
|
||
- "the 1603 quarto text of Hamlet"
|
||
- "Karajan's 1962 recording of Beethoven's Ninth"
|
||
- "the King James Bible"
|
||
ontology_class: "frbroo:F2_Expression"
|
||
alternative_classes:
|
||
- "lrm:Expression"
|
||
- "bf:Work" # BIBFRAME conflates Work/Expression
|
||
note: |
|
||
Expressions are realizations of Works:
|
||
- Same Work, different language → different Expression
|
||
- Same Work, different arrangement → different Expression
|
||
- Same musical Work, different performance → different Expression
|
||
|
||
# ----- FRBR MANIFESTATION LEVEL (publication/edition) -----
|
||
MANIFESTATION:
|
||
code: "WRK.MAN"
|
||
definition: |
|
||
Physical or digital embodiments: editions, publications, pressings,
|
||
broadcasts. Manifestations are producible in multiple copies.
|
||
examples:
|
||
- "the 2023 Penguin edition of Hamlet"
|
||
- "Museum Journal vol. 15, no. 2"
|
||
- "Nature magazine, March 2023"
|
||
- "the 1609 Quarto of Shakespeare's Sonnets"
|
||
ontology_class: "frbroo:F3_Manifestation"
|
||
alternative_classes:
|
||
- "lrm:Manifestation"
|
||
- "bf:Instance" # BIBFRAME equivalent
|
||
- "schema:Book"
|
||
- "schema:Periodical"
|
||
linkml_mapping:
|
||
class_uri: "frbroo:F3_Manifestation"
|
||
exact_mappings:
|
||
- "lrm:Manifestation"
|
||
- "bf:Instance"
|
||
close_mappings:
|
||
- "schema:Book"
|
||
note: |
|
||
Manifestations are what libraries catalog and acquire:
|
||
- Publisher, date, ISBN, format are Manifestation attributes
|
||
- Multiple identical copies = one Manifestation, many Items
|
||
|
||
# ----- FRBR ITEM LEVEL (specific copy) -----
|
||
ITEM:
|
||
code: "WRK.ITM"
|
||
definition: |
|
||
Single exemplars: a specific copy, with provenance, annotations,
|
||
damage. Items are unique physical objects.
|
||
examples:
|
||
- "the British Library's copy of the First Folio"
|
||
- "manuscript KB 128 A 14 in the Royal Library"
|
||
- "the annotated copy formerly owned by Van Gogh"
|
||
ontology_class: "frbroo:F5_Item"
|
||
alternative_classes:
|
||
- "lrm:Item"
|
||
- "bf:Item"
|
||
- "crm:E22_Human-Made_Object"
|
||
note: |
|
||
Items have individual provenance and condition:
|
||
- Ownership history, annotations, damage
|
||
- Physical location (shelf mark, repository)
|
||
- For manuscripts, each copy is a unique Item
|
||
|
||
# ----- MANUSCRIPT (always Item level) -----
|
||
MANUSCRIPT:
|
||
code: "WRK.MSS"
|
||
definition: |
|
||
Handwritten documents and manuscripts. Manuscripts are unique
|
||
Items (FRBR Item level), not reproducible Manifestations.
|
||
examples:
|
||
- "Codex Manesse"
|
||
- "the Book of Kells"
|
||
- "Leonardo's notebooks"
|
||
- "the Dead Sea Scrolls"
|
||
ontology_class: "crm:E22_Human-Made_Object"
|
||
alternative_classes:
|
||
- "frbroo:F5_Item"
|
||
- "rico:Record"
|
||
note: |
|
||
Manuscripts are physical objects that embody intellectual content:
|
||
- Use crm:E22_Human-Made_Object for the physical codex/scroll
|
||
- Use frbroo:F1_Work for the textual content
|
||
- Link via frbroo:R4_comprises_carriers_of
|
||
|
||
# ----- ARCHIVAL RECORDS -----
|
||
ARCHIVAL:
|
||
code: "WRK.ARC"
|
||
definition: |
|
||
Archival records, documents, and files. May be single items or
|
||
aggregations (fonds, series).
|
||
examples:
|
||
- "deed of sale dated 1650"
|
||
- "birth certificate of Rembrandt"
|
||
- "notarial act NL-SAA 5075/2135"
|
||
- "the VOC archives"
|
||
ontology_class: "rico:Record"
|
||
alternative_classes:
|
||
- "rico:RecordSet" # For aggregations (fonds, series)
|
||
- "crm:E31_Document"
|
||
rico_levels:
|
||
fonds: "rico:RecordSet with rico:hasRecordSetType 'fonds'"
|
||
series: "rico:RecordSet with rico:hasRecordSetType 'series'"
|
||
file: "rico:RecordSet with rico:hasRecordSetType 'file'"
|
||
item: "rico:Record"
|
||
note: |
|
||
RiC-O provides archival hierarchy:
|
||
- Fonds → Series → File → Item
|
||
- Use rico:isOrWasIncludedIn for containment
|
||
- Use rico:hasOrHadIdentifier for call numbers
|
||
|
||
# ----- DIGITAL RESOURCES -----
|
||
WEBSITE:
|
||
code: "WRK.WEB"
|
||
definition: "Web pages and online resources as intellectual works"
|
||
examples:
|
||
- "the museum's online collection"
|
||
- "the Wikipedia article on Rembrandt"
|
||
- "the Rijksmuseum website"
|
||
ontology_class: "schema:WebPage"
|
||
alternative_classes:
|
||
- "schema:WebSite"
|
||
- "fabio:WebPage" # FRBR-aligned Bibliographic Ontology
|
||
note: |
|
||
Websites are Manifestations of digital Works.
|
||
Archived versions (Internet Archive) are different Manifestations
|
||
of the same Expression.
|
||
|
||
URL:
|
||
code: "WRK.URL"
|
||
definition: "URLs as identifiers for digital resources"
|
||
examples:
|
||
- "www.rijksmuseum.nl"
|
||
- "https://www.britishmuseum.org"
|
||
- "https://doi.org/10.1000/xyz123"
|
||
ontology_class: "schema:URL"
|
||
note: |
|
||
URLs are locators, not works themselves.
|
||
Links to GROUP hypernym via schema:url.
|
||
DOIs are persistent identifiers for Works/Expressions.
|
||
|
||
# ----- CONTACT INFORMATION (not works, but often co-occur) -----
|
||
EMAIL:
|
||
code: "WRK.EML"
|
||
definition: "Email addresses as contact identifiers"
|
||
examples:
|
||
- "info@rijksmuseum.nl"
|
||
- "contact@museum.org"
|
||
ontology_class: "schema:email"
|
||
note: |
|
||
Email addresses are identifiers, not intellectual works.
|
||
Included here for practical extraction convenience.
|
||
Links to GROUP hypernym via schema:email.
|
||
|
||
SOCIAL:
|
||
code: "WRK.SOC"
|
||
definition: "Social media handles and profiles"
|
||
examples:
|
||
- "@rijksmuseum"
|
||
- "facebook.com/britishmuseum"
|
||
- "Instagram: @vangoghmuseum"
|
||
ontology_class: "foaf:OnlineAccount"
|
||
foaf_mapping:
|
||
class: "foaf:OnlineAccount"
|
||
properties:
|
||
account_name: "foaf:accountName"
|
||
service_homepage: "foaf:accountServiceHomepage"
|
||
note: |
|
||
Social accounts are identifiers for agent presence on platforms.
|
||
Links to GROUP hypernym via foaf:account.
|
||
|
||
# ----- CITATIONS -----
|
||
CITATION:
|
||
code: "WRK.CIT"
|
||
definition: "Bibliographic citations and references"
|
||
examples:
|
||
- "Smith (2020)"
|
||
- "cf. Van Gogh Letters, no. 532"
|
||
- "[ibid., p. 42]"
|
||
ontology_class: "dct:bibliographicCitation"
|
||
alternative_classes:
|
||
- "cito:Citation" # Citation Typing Ontology
|
||
note: |
|
||
Citations are references to Works/Expressions/Manifestations.
|
||
CiTO (Citation Typing Ontology) provides citation intent types:
|
||
- cito:cites, cito:critiques, cito:supports, etc.
|
||
|
||
inclusion_rules:
|
||
- id: "WRK_INC001"
|
||
rule: "Tag work titles at appropriate FRBR level"
|
||
examples:
|
||
- "'Hamlet' (Work level - the concept)"
|
||
- "'2023 Penguin edition' (Manifestation level)"
|
||
- "'the British Library's copy' (Item level)"
|
||
|
||
- id: "WRK_INC002"
|
||
rule: "Tag URLs and identifiers as complete strings"
|
||
examples:
|
||
- "info@rijksmuseum.nl"
|
||
- "https://www.rijksmuseum.nl"
|
||
- "doi:10.1000/xyz123"
|
||
|
||
- id: "WRK_INC003"
|
||
rule: "Tag archival references with call numbers when present"
|
||
examples:
|
||
- "NL-SAA 5075/2135"
|
||
- "KB 128 A 14"
|
||
|
||
- id: "WRK_INC004"
|
||
rule: "Tag manuscripts as Items (unique objects)"
|
||
examples:
|
||
- "Codex Manesse"
|
||
- "the Book of Kells"
|
||
|
||
exclusion_rules:
|
||
- id: "WRK_EXC001"
|
||
rule: "Do NOT tag generic work type references"
|
||
examples:
|
||
- "the document (no title)"
|
||
- "a manuscript (not named)"
|
||
- "some book"
|
||
|
||
- id: "WRK_EXC002"
|
||
rule: "Do NOT tag UI navigation elements"
|
||
examples:
|
||
- "Click here"
|
||
- "Read more"
|
||
- "Download PDF"
|
||
|
||
- id: "WRK_EXC003"
|
||
rule: "Do NOT confuse physical carrier with intellectual work"
|
||
examples:
|
||
- "the book (physical object → use THING if just the object)"
|
||
- "the paper (material, not work)"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# THING - Objects, concepts, and phenomena
|
||
# ---------------------------------------------------------------------------
|
||
THING:
|
||
code: "THG"
|
||
definition: |
|
||
Named objects, artifacts, concepts, events, and other entities that
|
||
do not fit the above categories. Broad category for tangible items
|
||
and abstract concepts.
|
||
|
||
# Ontology alignment for the THING hypernym: CIDOC-CRM is the primary
# class, with schema.org, EDM and NERD equivalents listed alongside.
ontology_mappings:
  primary_class: "crm:E70_Thing"
  alternative_classes:
    - "schema:Thing"
    # EDM names this class without an underscore.
    - "edm:NonInformationResource"
  nerd_class: "nerd:Thing"
  nerd_subclasses:
    - "nerd:Product"
    - "nerd:Event"
  nerd_note: |
    NERD Thing is the base class. Use nerd:Product for objects/artifacts,
    nerd:Event for THG.EVT subcategory.
  edm_note: |
    Europeana Data Model (EDM) provides cultural heritage-specific classes:
    - edm:ProvidedCHO: Cultural Heritage Object (the real-world thing)
    - edm:PhysicalThing: Physical/material objects
    - edm:Event: Historical or cultural events
    - edm:TimeSpan: Temporal extent of events/objects
|
||
|
||
subcategories:
|
||
# THG.ART - physical artworks, tagged as objects rather than as titles.
ARTWORK:
  code: "THG.ART"
  definition: "Physical artworks and art objects"
  examples:
    - "The Night Watch painting"
    - "Rodin's The Thinker"
    - "the Rosetta Stone"
  # Uses the current CIDOC-CRM label for E22 ("Human-Made Object"), the
  # same spelling the MANUSCRIPT subcategory uses for its ontology_class.
  ontology_class: "crm:E22_Human-Made_Object"
  alternative_classes:
    - "edm:PhysicalThing"
    - "edm:ProvidedCHO"
  edm_note: |
    For artworks in cultural heritage contexts:
    - edm:ProvidedCHO for the conceptual cultural object
    - edm:PhysicalThing for the material manifestation
|
||
|
||
# THG.AFT - historical objects and artifacts.
ARTIFACT:
  code: "THG.AFT"
  definition: "Historical objects and artifacts"
  examples:
    - "the crown jewels"
    - "Viking helmet"
    - "Roman coin"
  # Uses the current CIDOC-CRM label for E22 ("Human-Made Object"), the
  # same spelling the MANUSCRIPT subcategory uses for its ontology_class.
  ontology_class: "crm:E22_Human-Made_Object"
  alternative_classes:
    - "edm:PhysicalThing"
|
||
|
||
SPECIES:
|
||
code: "THG.SPC"
|
||
definition: "Biological species and taxa"
|
||
examples:
|
||
- "Tyrannosaurus rex"
|
||
- "Quercus robur"
|
||
- "dodo"
|
||
ontology_class: "schema:Taxon"
|
||
|
||
# THG.EVT - named historical events.
EVENT:
  code: "THG.EVT"
  definition: "Named historical events"
  examples:
    - "World War II"
    - "the French Revolution"
    - "the Great Fire of London"
  ontology_class: "crm:E5_Event"
  alternative_classes:
    - "edm:Event"
    - "schema:Event"
  # EDM property ranges (easy to confuse because of the names):
  #   edm:occurredAt -> edm:TimeSpan (when the event occurred)
  #   edm:happenedAt -> edm:Place    (where the event happened)
  edm_properties:
    has_time_span: "edm:occurredAt"
    occurred_at_place: "edm:happenedAt"
  note: |
    EDM Event extends CIDOC-CRM E5_Event for cultural heritage contexts.
    Use edm:occurredAt for temporal scope, edm:happenedAt for location.
|
||
|
||
CONCEPT:
|
||
code: "THG.CON"
|
||
definition: "Abstract concepts, movements, styles"
|
||
examples:
|
||
- "Impressionism"
|
||
- "the Baroque style"
|
||
- "human rights"
|
||
ontology_class: "crm:E55_Type"
|
||
alternative_classes:
|
||
- "skos:Concept"
|
||
|
||
LANGUAGE:
|
||
code: "THG.LNG"
|
||
definition: "Named languages"
|
||
examples:
|
||
- "Dutch"
|
||
- "Latin"
|
||
- "Old French"
|
||
ontology_class: "crm:E56_Language"
|
||
|
||
MATERIAL:
|
||
code: "THG.MAT"
|
||
definition: "Materials and substances"
|
||
examples:
|
||
- "oil on canvas"
|
||
- "bronze"
|
||
- "parchment"
|
||
ontology_class: "crm:E57_Material"
|
||
|
||
inclusion_rules:
|
||
- id: "THG_INC001"
|
||
rule: "Tag named artworks as physical objects (distinct from their titles)"
|
||
examples:
|
||
- "the painting known as The Night Watch"
|
||
- "Michelangelo's David"
|
||
|
||
- id: "THG_INC002"
|
||
rule: "Tag named historical events"
|
||
examples:
|
||
- "the Eighty Years War"
|
||
- "the Golden Age"
|
||
|
||
- id: "THG_INC003"
|
||
rule: "Tag art movements and styles when named"
|
||
examples:
|
||
- "Dutch Golden Age painting"
|
||
- "Art Nouveau"
|
||
|
||
exclusion_rules:
|
||
- id: "THG_EXC001"
|
||
rule: "Do NOT tag generic object references"
|
||
examples:
|
||
- "the object (not named)"
|
||
- "an artifact (not specific)"
|
||
|
||
- id: "THG_EXC002"
|
||
rule: "Do NOT tag common nouns unless part of a proper name"
|
||
examples:
|
||
- "painting (generic)"
|
||
- "sculpture (generic)"
|
||
|
||
# =============================================================================
|
||
# SECTION 3: UNIVERSAL EXCLUSION RULES
|
||
# =============================================================================
|
||
# These exclusion rules apply to ALL entity types across all text sources.
|
||
|
||
universal_exclusions:
|
||
|
||
navigation:
|
||
id: "UNI_EXC001"
|
||
description: "Navigation elements and menu items"
|
||
patterns:
|
||
- "Home"
|
||
- "About Us"
|
||
- "Contact"
|
||
- "Menu"
|
||
- "Search"
|
||
- "Login"
|
||
- "Sign Up"
|
||
- "Back to top"
|
||
- "Next"
|
||
- "Previous"
|
||
- "Read more"
|
||
- "Learn more"
|
||
- "Click here"
|
||
- "See all"
|
||
- "View more"
|
||
- "Show less"
|
||
note: "These are UI elements, not named entities"
|
||
|
||
calls_to_action:
|
||
id: "UNI_EXC002"
|
||
description: "Marketing and call-to-action phrases"
|
||
patterns:
|
||
- "Buy now"
|
||
- "Subscribe"
|
||
- "Book tickets"
|
||
- "Plan your visit"
|
||
- "Become a member"
|
||
- "Donate"
|
||
- "Shop"
|
||
- "Get tickets"
|
||
- "Reserve"
|
||
- "Join us"
|
||
note: "Action prompts, not entity references"
|
||
|
||
# UNI_EXC003 - generic social-media prompts that must never be tagged.
social_boilerplate:
  id: "UNI_EXC003"
  description: "Generic social media text"
  patterns:
    - "Follow us"
    - "Share"
    - "Like"
    - "Tweet"
    - "Pin it"
    - "Share on Facebook"
    - "Follow on Instagram"
  # WRK.SOC is the code of the SOCIAL subcategory (social media handles
  # and profiles) defined in this file.
  note: "Tag actual handles (WRK.SOC), not these generic phrases"
|
||
|
||
technical_artifacts:
|
||
id: "UNI_EXC004"
|
||
description: "Content management system artifacts"
|
||
patterns:
|
||
- "Posted by"
|
||
- "Last updated"
|
||
- "Tags:"
|
||
- "Categories:"
|
||
- "Comments"
|
||
- "Leave a reply"
|
||
- "Related posts"
|
||
- "Powered by"
|
||
- "Cookie settings"
|
||
- "Privacy policy"
|
||
- "Terms of use"
|
||
- "All rights reserved"
|
||
note: "Technical/legal boilerplate, not content entities"
|
||
|
||
generic_words:
|
||
id: "UNI_EXC005"
|
||
description: "Common words that should not be tagged"
|
||
patterns:
|
||
- "Welcome"
|
||
- "Information"
|
||
- "Details"
|
||
- "Overview"
|
||
- "Introduction"
|
||
- "Summary"
|
||
- "Description"
|
||
- "Features"
|
||
- "Highlights"
|
||
- "News"
|
||
- "Events"
|
||
- "Updates"
|
||
note: "Section headers and generic labels, not named entities"
|
||
|
||
pronouns:
|
||
id: "UNI_EXC006"
|
||
description: "Pronouns and demonstrative words"
|
||
patterns:
|
||
- "he"
|
||
- "she"
|
||
- "it"
|
||
- "they"
|
||
- "we"
|
||
- "you"
|
||
- "I"
|
||
- "him"
|
||
- "her"
|
||
- "them"
|
||
- "us"
|
||
- "this"
|
||
- "that"
|
||
- "these"
|
||
- "those"
|
||
- "here"
|
||
- "there"
|
||
- "where"
|
||
note: "Pronouns require coreference resolution, not NER"
|
||
|
||
# =============================================================================
|
||
# SECTION 4: DOUBLE-TAGGING PATTERNS
|
||
# =============================================================================
|
||
# Some text spans warrant multiple entity tags when they serve multiple
|
||
# semantic functions. Apply sparingly and document clearly.
|
||
|
||
double_tagging:
|
||
|
||
description: |
|
||
Double-tagging is permitted when a single text span genuinely serves
|
||
multiple semantic functions. Each tag must be independently justified.
|
||
Do NOT double-tag for redundancy.
|
||
|
||
permitted_patterns:
|
||
|
||
# Permitted double-tag: the same span is tagged once as the person and
# once as a structured personal name (PNV components).
# NOTE(review): the 1.7.0 changelog renames the BEING hypernym to AGENT
# (code: AGT), and the other tags in this section use codes (DEN.PNM,
# ORG.HER) rather than hypernym names. "BEING.PER" below looks like a
# legacy label - confirm the current code before relying on it.
- pattern: "BEING + DENOMINATION"
  description: "Person reference with structured name analysis"
  example:
    text: "Rembrandt Harmenszoon van Rijn"
    tags:
      - type: "BEING.PER"
        span: "Rembrandt Harmenszoon van Rijn"
        purpose: "Identifies the person"
      - type: "DEN.PNM"
        span: "Rembrandt Harmenszoon van Rijn"
        purpose: "Analyzes name structure"
        pnv_components:
          givenName: "Rembrandt"
          patronym: "Harmenszoon"
          surnamePrefix: "van"
          baseSurname: "Rijn"
|
||
|
||
- pattern: "ORGANISATION + PLACE"
|
||
description: "Institution that is also a place/building"
|
||
example:
|
||
text: "the Rijksmuseum"
|
||
tags:
|
||
- type: "ORG.HER"
|
||
span: "the Rijksmuseum"
|
||
purpose: "Identifies the organization"
|
||
- type: "PLC.BLD"
|
||
span: "the Rijksmuseum"
|
||
purpose: "Identifies the physical building"
|
||
note: "Apply when both organizational and locational aspects are relevant"
|
||
|
||
- pattern: "THING.ART + DENOMINATION"
|
||
description: "Artwork object with its title"
|
||
example:
|
||
text: "The Night Watch"
|
||
tags:
|
||
- type: "THG.ART"
|
||
span: "The Night Watch"
|
||
purpose: "The physical painting"
|
||
- type: "DEN.TIT"
|
||
span: "The Night Watch"
|
||
purpose: "The title of the work"
|
||
|
||
- pattern: "TEMPORAL + THING.EVT"
|
||
description: "Event that is also a time reference"
|
||
example:
|
||
text: "World War II"
|
||
tags:
|
||
- type: "THG.EVT"
|
||
span: "World War II"
|
||
purpose: "The historical event"
|
||
- type: "TMP.ERA"
|
||
span: "World War II"
|
||
purpose: "The time period (1939-1945)"
|
||
|
||
prohibited_patterns:
|
||
|
||
- pattern: "Same category double-tag"
|
||
description: "Never apply two tags from the same category"
|
||
example:
|
||
text: "Amsterdam"
|
||
wrong: "PLC.SET + PLC.CTY (Both are PLACE subcategories)"
|
||
correct: "PLC.SET (Choose the most specific applicable type)"
|
||
|
||
- pattern: "Redundant hierarchy"
|
||
description: "Do not tag both a whole and its parts"
|
||
example:
|
||
text: "Rijksmuseum, Museumstraat 1, Amsterdam"
|
||
wrong: "tag entire span as PLC.IAD AND also tag Amsterdam separately as PLC.SET"
|
||
correct: "tag entire span as PLC.IAD (includes city)"
|
||
note: "Subordinate elements are implicit in the containing entity"
|
||
|
||
# =============================================================================
|
||
# SECTION 5: ONTOLOGY RELATIONSHIP PATTERNS
|
||
# =============================================================================
|
||
# Bidirectional relationships between entity types, with ontology predicates.
|
||
|
||
relationships:
|
||
|
||
description: |
|
||
Relationships connect entities extracted from text. Each relationship
|
||
has a subject, predicate, and object, with ontology mappings for RDF
|
||
serialization. Relationships are bidirectional where applicable.
|
||
|
||
patterns:
|
||
|
||
- name: "staff_member_of"
|
||
subject_type: "BEING.STF"
|
||
predicate: "org:memberOf"
|
||
inverse_predicate: "org:hasMember"
|
||
object_type: "ORG.HER"
|
||
example:
|
||
text: "Dr. Maria van den Berg, Director of the Rijksmuseum"
|
||
subject: "Dr. Maria van den Berg"
|
||
object: "Rijksmuseum"
|
||
|
||
- name: "governed_by"
|
||
subject_type: "ORG.HER"
|
||
predicate: "org:subOrganizationOf"
|
||
inverse_predicate: "org:hasSubOrganization"
|
||
object_type: "ORG.PAR"
|
||
example:
|
||
text: "The Rijksmuseum, part of the Ministry of Culture"
|
||
subject: "Rijksmuseum"
|
||
object: "Ministry of Culture"
|
||
|
||
- name: "located_in"
|
||
subject_type: "ORG.HER"
|
||
predicate: "schema:location"
|
||
inverse_predicate: "schema:containsPlace"
|
||
object_type: "PLC"
|
||
example:
|
||
text: "The British Museum in London"
|
||
subject: "British Museum"
|
||
object: "London"
|
||
|
||
- name: "has_address"
|
||
subject_type: "ORG.HER"
|
||
predicate: "schema:address"
|
||
object_type: "PLC.IAD"
|
||
example:
|
||
text: "Rijksmuseum, Museumstraat 1, 1071 XX Amsterdam"
|
||
subject: "Rijksmuseum"
|
||
object: "Museumstraat 1, 1071 XX Amsterdam"
|
||
|
||
# Links a heritage organisation to a contact email address.
# object_type is the EMAIL subcategory code defined in this file (WRK.EML).
# NOTE(review): subject_type still uses an ORG.* code although the 1.7.0
# changelog renames ORGANISATION to GROUP (GRP) - confirm the current code.
- name: "has_email"
  subject_type: "ORG.HER"
  predicate: "schema:email"
  object_type: "WRK.EML"
  example:
    subject: "Rijksmuseum"
    object: "info@rijksmuseum.nl"
|
||
|
||
# Links a heritage organisation to the URL of its website.
# object_type is the URL subcategory code defined in this file (WRK.URL).
# NOTE(review): subject_type still uses an ORG.* code although the 1.7.0
# changelog renames ORGANISATION to GROUP (GRP) - confirm the current code.
- name: "has_website"
  subject_type: "ORG.HER"
  predicate: "schema:url"
  object_type: "WRK.URL"
  example:
    subject: "Rijksmuseum"
    object: "https://www.rijksmuseum.nl"
|
||
|
||
# Links a heritage organisation to a social media handle or profile.
# object_type is the SOCIAL subcategory code defined in this file (WRK.SOC).
# NOTE(review): subject_type still uses an ORG.* code although the 1.7.0
# changelog renames ORGANISATION to GROUP (GRP) - confirm the current code.
- name: "has_social_media"
  subject_type: "ORG.HER"
  predicate: "schema:sameAs"
  object_type: "WRK.SOC"
  example:
    subject: "Rijksmuseum"
    object: "@rijksmuseum"
|
||
|
||
- name: "has_collection"
|
||
subject_type: "ORG.HER"
|
||
predicate: "rico:hasOrHadAllMembersOf"
|
||
inverse_predicate: "rico:isOrWasIncludedIn"
|
||
object_type: "DEN.COL"
|
||
example:
|
||
text: "The Rijksmuseum Golden Age Collection"
|
||
subject: "Rijksmuseum"
|
||
object: "Golden Age Collection"
|
||
|
||
# Links an artwork to the person who made it.
# NOTE(review): in CIDOC-CRM, P14_carried_out_by is defined on activities
# (E7_Activity), not on objects. Applied to THG.ART it presumably acts as
# a shortcut for object -> P108i_was_produced_by -> E12_Production ->
# P14_carried_out_by -> actor. Confirm the RDF serializer expands it.
- name: "created_by"
  subject_type: "THG.ART"
  predicate: "crm:P14_carried_out_by"
  inverse_predicate: "crm:P14i_performed"
  object_type: "BEING.PER"
  example:
    text: "The Night Watch by Rembrandt"
    subject: "The Night Watch"
    object: "Rembrandt"
|
||
|
||
- name: "has_title"
|
||
subject_type: "THG.ART"
|
||
predicate: "crm:P102_has_title"
|
||
object_type: "DEN.TIT"
|
||
example:
|
||
subject: "the painting"
|
||
object: "The Night Watch"
|
||
|
||
- name: "occurred_during"
|
||
subject_type: "THG.EVT"
|
||
predicate: "crm:P4_has_time-span"
|
||
object_type: "TMP"
|
||
example:
|
||
text: "World War II (1939-1945)"
|
||
subject: "World War II"
|
||
object: "1939-1945"
|
||
|
||
- name: "exhibition_dates"
|
||
subject_type: "DEN.EXH"
|
||
predicate: "crm:P4_has_time-span"
|
||
object_type: "TMP.EXP"
|
||
example:
|
||
text: "Vermeer exhibition, 10 February - 4 June 2023"
|
||
subject: "Vermeer exhibition"
|
||
object: "10 February - 4 June 2023"
|
||
|
||
- name: "has_opening_hours"
|
||
subject_type: "ORG.HER"
|
||
predicate: "schema:openingHoursSpecification"
|
||
object_type: "TMP.OPH"
|
||
example:
|
||
text: "The museum is open Tuesday-Sunday, 10:00-17:00"
|
||
subject: "The museum"
|
||
object: "Tuesday-Sunday, 10:00-17:00"
|
||
|
||
# -------------------------------------------------------------------------
|
||
# Additional W3C Org Ontology Relationship Patterns
|
||
# -------------------------------------------------------------------------
|
||
|
||
- name: "has_site"
|
||
subject_type: "ORG.HER"
|
||
predicate: "org:hasSite"
|
||
inverse_predicate: "org:siteOf"
|
||
object_type: "PLC.IAD"
|
||
ontology: "org"
|
||
example:
|
||
text: "The Rijksmuseum has locations at Museumstraat and Schiphol Airport"
|
||
subject: "Rijksmuseum"
|
||
object: "Museumstraat location"
|
||
note: |
|
||
org:Site represents an office or premise at which the organization
|
||
is located. Use for physical institutional locations. Distinct from
|
||
schema:address which captures postal address data.
|
||
|
||
- name: "has_unit"
|
||
subject_type: "ORG.HER"
|
||
predicate: "org:hasUnit"
|
||
inverse_predicate: "org:unitOf"
|
||
object_type: "ORG.UNT"
|
||
ontology: "org"
|
||
example:
|
||
text: "The Rijksmuseum's Department of Prints and Drawings"
|
||
subject: "Rijksmuseum"
|
||
object: "Department of Prints and Drawings"
|
||
note: |
|
||
Links organizational units to their parent organization.
|
||
Use for named internal divisions that function as distinct entities.
|
||
|
||
- name: "head_of"
|
||
subject_type: "BEING.STF"
|
||
predicate: "org:headOf"
|
||
object_type: "ORG.HER"
|
||
ontology: "org"
|
||
example:
|
||
text: "Taco Dibbits, Director of the Rijksmuseum"
|
||
subject: "Taco Dibbits"
|
||
object: "Rijksmuseum"
|
||
note: "Use for leadership roles. More specific than org:memberOf."
|
||
|
||
- name: "holds_post"
|
||
subject_type: "BEING.STF"
|
||
predicate: "org:holds"
|
||
inverse_predicate: "org:heldBy"
|
||
object_type: "org:Post"
|
||
ontology: "org"
|
||
example:
|
||
text: "Dr. van den Berg holds the position of Chief Curator"
|
||
subject: "Dr. van den Berg"
|
||
object: "Chief Curator"
|
||
note: |
|
||
org:Post represents a position that exists independently of the
|
||
person filling it. Use when position titles are formally defined.
|
||
|
||
- name: "has_classification"
|
||
subject_type: "ORG.HER"
|
||
predicate: "org:classification"
|
||
object_type: "THG.CON"
|
||
ontology: "org"
|
||
example:
|
||
text: "The institution is classified as a national museum"
|
||
subject: "The institution"
|
||
object: "national museum"
|
||
note: |
|
||
org:classification links to a classification scheme category.
|
||
Use for institution type classifications (museum, archive, library).
|
||
|
||
- name: "member_during"
|
||
subject_type: "org:Membership"
|
||
predicate: "org:memberDuring"
|
||
object_type: "TMP"
|
||
ontology: "org"
|
||
example:
|
||
text: "She served as curator from 2010 to 2015"
|
||
subject: "membership"
|
||
object: "2010-2015"
|
||
note: |
|
||
Temporal scope of a membership/role. Uses org:Membership as
|
||
n-ary relationship between person, organization, and role.
|
||
|
||
- name: "collaborates_with"
|
||
subject_type: "ORG.HER"
|
||
predicate: "org:linkedTo"
|
||
inverse_predicate: "org:linkedTo"
|
||
object_type: "ORG.HER"
|
||
ontology: "org"
|
||
example:
|
||
text: "The Rijksmuseum collaborates with the Van Gogh Museum"
|
||
subject: "Rijksmuseum"
|
||
object: "Van Gogh Museum"
|
||
note: |
|
||
org:linkedTo indicates a general relationship between organizations.
|
||
Use for partnerships, collaborations, and informal associations.
|
||
|
||
- name: "resulted_from_change"
|
||
subject_type: "ORG.HER"
|
||
predicate: "org:resultedFrom"
|
||
inverse_predicate: "org:resultingOrganization"
|
||
object_type: "org:ChangeEvent"
|
||
ontology: "org"
|
||
example:
|
||
text: "The Noord-Hollands Archief was formed through a merger in 2001"
|
||
subject: "Noord-Hollands Archief"
|
||
object: "2001 merger event"
|
||
note: |
|
||
Tracks organizational change events (mergers, splits, reorganizations).
|
||
Use with org:ChangeEvent for provenance of institutional changes.
|
||
|
||
- name: "changed_by"
|
||
subject_type: "ORG.HER"
|
||
predicate: "org:changedBy"
|
||
object_type: "org:ChangeEvent"
|
||
ontology: "org"
|
||
example:
|
||
text: "The archive was restructured in 2018"
|
||
subject: "The archive"
|
||
object: "2018 restructuring"
|
||
note: "Use for change events that affected but didn't create the organization."
|
||
|
||
# -------------------------------------------------------------------------
|
||
# RegOrg (Registered Organization) Relationship Patterns
|
||
# -------------------------------------------------------------------------
|
||
|
||
- name: "has_registration"
|
||
subject_type: "ORG.HER"
|
||
predicate: "rov:registration"
|
||
object_type: "rov:Identifier"
|
||
ontology: "rov"
|
||
example:
|
||
text: "Rijksmuseum (KvK 41.203.583)"
|
||
subject: "Rijksmuseum"
|
||
object: "KvK 41.203.583"
|
||
note: |
|
||
Links to formal registration identifiers (KvK, ISIL, etc.).
|
||
The identifier should include the scheme and value.
|
||
|
||
- name: "has_legal_name"
|
||
subject_type: "ORG.HER"
|
||
predicate: "rov:legalName"
|
||
object_type: "DEN"
|
||
ontology: "rov"
|
||
example:
|
||
text: "Stichting Het Rijksmuseum (legally registered name)"
|
||
subject: "Rijksmuseum"
|
||
object: "Stichting Het Rijksmuseum"
|
||
note: |
|
||
rov:legalName is the registered legal name, which may differ
|
||
from trading names or common usage names (skos:prefLabel).
|
||
|
||
- name: "has_org_status"
|
||
subject_type: "ORG.HER"
|
||
predicate: "rov:orgStatus"
|
||
object_type: "skos:Concept"
|
||
ontology: "rov"
|
||
example:
|
||
text: "The museum is currently active"
|
||
subject: "The museum"
|
||
object: "active"
|
||
note: "Organization status: active, dissolved, merged, etc."
|
||
|
||
# -------------------------------------------------------------------------
|
||
# FOAF Relationship Patterns
|
||
# -------------------------------------------------------------------------
|
||
|
||
# FOAF variant of the social-media link: organisation -> online account.
# object_type is the SOCIAL subcategory code defined in this file (WRK.SOC).
# NOTE(review): subject_type still uses an ORG.* code although the 1.7.0
# changelog renames ORGANISATION to GROUP (GRP) - confirm the current code.
- name: "has_social_account"
  subject_type: "ORG.HER"
  predicate: "foaf:account"
  object_type: "WRK.SOC"
  ontology: "foaf"
  example:
    text: "Follow @rijksmuseum on Twitter"
    subject: "Rijksmuseum"
    object: "@rijksmuseum"
  note: |
    foaf:account links to foaf:OnlineAccount. More structured than
    schema:sameAs, allowing account name and service to be specified.
|
||
|
||
# FOAF variant of the website link: organisation -> main homepage URL.
# object_type is the URL subcategory code defined in this file (WRK.URL).
# NOTE(review): subject_type still uses an ORG.* code although the 1.7.0
# changelog renames ORGANISATION to GROUP (GRP) - confirm the current code.
- name: "has_homepage"
  subject_type: "ORG.HER"
  predicate: "foaf:homepage"
  object_type: "WRK.URL"
  ontology: "foaf"
  example:
    subject: "Rijksmuseum"
    object: "https://www.rijksmuseum.nl"
  note: "Alternative to schema:url. foaf:homepage is specifically the main website."
|
||
|
||
# =============================================================================
|
||
# SECTION 6: PiCO INTEGRATION PATTERNS
|
||
# =============================================================================
|
||
# Persons in Context (PiCo) ontology patterns for person observations.
|
||
|
||
pico_integration:
|
||
|
||
description: |
|
||
PiCO (Person in Context Ontology) models textual observations of persons
|
||
as distinct from reconstructed person entities. This enables:
|
||
- Tracking provenance of person mentions
|
||
- Handling name variations across sources
|
||
- Linking observations to formal person records
|
||
|
||
observation_pattern:
|
||
description: "Every person mention creates a PersonObservation"
|
||
class: "picom:PersonObservation"
|
||
properties:
|
||
- property: "picom:hasObservedName"
|
||
description: "The name string as it appears in text"
|
||
range: "pnv:PersonName"
|
||
- property: "picom:isObservationOf"
|
||
description: "Links to reconstructed Person entity"
|
||
range: "crm:E21_Person"
|
||
- property: "prov:hadPrimarySource"
|
||
description: "The source document/webpage"
|
||
range: "prov:Entity"
|
||
- property: "picom:observedAt"
|
||
description: "When the observation was made"
|
||
range: "xsd:dateTime"
|
||
|
||
pnv_name_structure:
|
||
description: "Person Name Vocabulary for structured names"
|
||
class: "pnv:PersonName"
|
||
components:
|
||
- property: "pnv:givenName"
|
||
examples:
|
||
- "Rembrandt"
|
||
- "Maria"
|
||
- "Jan"
|
||
- property: "pnv:patronym"
|
||
examples:
|
||
- "Harmenszoon"
|
||
- "Janszoon"
|
||
- property: "pnv:surnamePrefix"
|
||
examples:
|
||
- "van"
|
||
- "de"
|
||
- "van den"
|
||
- "op de"
|
||
- property: "pnv:baseSurname"
|
||
examples:
|
||
- "Rijn"
|
||
- "Berg"
|
||
- "Velde"
|
||
- property: "pnv:honorificPrefix"
|
||
examples:
|
||
- "Dr."
|
||
- "Prof."
|
||
- "Sir"
|
||
- "Queen"
|
||
- property: "pnv:honorificSuffix"
|
||
examples:
|
||
- "PhD"
|
||
- "Jr."
|
||
- "III"
|
||
|
||
example:
|
||
text: "Dr. Maria van den Berg, Director"
|
||
observation:
|
||
type: "picom:PersonObservation"
|
||
hasObservedName:
|
||
type: "pnv:PersonName"
|
||
honorificPrefix: "Dr."
|
||
givenName: "Maria"
|
||
surnamePrefix: "van den"
|
||
baseSurname: "Berg"
|
||
hasRole: "Director"
|
||
hadPrimarySource: "https://example.org/source-page"
|
||
|
||
# =============================================================================
|
||
# SECTION 7: NIF/NERD/OA INTEGRATION PATTERNS
|
||
# =============================================================================
|
||
# NLP Interchange Format (NIF), Named Entity Recognition and Disambiguation
|
||
# (NERD), and W3C Web Annotation (OA) patterns for cross-tool interoperability.
|
||
|
||
nif_nerd_integration:
|
||
|
||
description: |
|
||
This section defines how GLAM-NER annotations integrate with:
|
||
- NIF 2.0: Standard format for NLP tool interchange (string/offset addressing)
|
||
- NERD: Cross-system entity type mappings (10 core classes)
|
||
- W3C OA: Web Annotation Data Model for annotation provenance
|
||
- itsrdf: ITS 2.0 entity linking predicates
|
||
|
||
These standards enable GLAM-NER annotations to be consumed by external
|
||
NLP pipelines, linked data systems, and annotation aggregators.
|
||
|
||
nif_core_patterns:
|
||
description: |
|
||
NIF (NLP Interchange Format) provides URI-based addressing for text
|
||
spans. Every annotation creates a nif:Phrase linked to its context.
|
||
|
||
context_class: "nif:Context"
|
||
context_note: |
|
||
nif:Context represents the full text document. All annotations reference
|
||
this context via nif:referenceContext.
|
||
|
||
phrase_class: "nif:Phrase"
|
||
phrase_note: |
|
||
nif:Phrase represents extracted entity mentions. Each GLAM-NER entity
|
||
becomes a nif:Phrase with offset-based URI addressing.
|
||
|
||
uri_schemes:
|
||
offset_based:
|
||
pattern: "{source_url}#offset_{begin}_{end}"
|
||
example: "https://example.org/page#offset_42_58"
|
||
note: "Default scheme. Begin/end are character offsets (0-based)."
|
||
|
||
rfc5147:
|
||
pattern: "{source_url}#char={begin},{end}"
|
||
example: "https://example.org/page#char=42,58"
|
||
note: "RFC 5147 fragment identifiers for text/plain."
|
||
|
||
context_hash:
|
||
pattern: "{source_url}#hash_{context_length}_{hash}_{begin}_{end}"
|
||
note: "Hash-based URIs for content-addressing. More stable across edits."
|
||
|
||
core_properties:
|
||
- property: "nif:beginIndex"
|
||
range: "xsd:nonNegativeInteger"
|
||
description: "Character offset where entity span begins (0-based)"
|
||
|
||
- property: "nif:endIndex"
|
||
range: "xsd:nonNegativeInteger"
|
||
description: "Character offset where entity span ends (exclusive)"
|
||
|
||
- property: "nif:anchorOf"
|
||
range: "xsd:string"
|
||
description: "The exact text string of the entity mention"
|
||
|
||
- property: "nif:referenceContext"
|
||
range: "nif:Context"
|
||
description: "Link to the document context containing this phrase"
|
||
|
||
- property: "nif:sourceUrl"
|
||
range: "xsd:anyURI"
|
||
description: "Original URL of the source document"
|
||
|
||
nerd_class_mappings:
|
||
description: |
|
||
NERD (Named Entity Recognition and Disambiguation) defines 10 core
|
||
entity classes that map across multiple NER systems (DBpedia Spotlight,
|
||
AlchemyAPI, OpenCalais, Zemanta, etc.). GLAM-NER types map to NERD for
|
||
cross-system interoperability.
|
||
|
||
core_classes:
|
||
- nerd_class: "nerd:Thing"
|
||
description: "Generic entity (base class)"
|
||
glam_ner_types: ["THING"]
|
||
|
||
- nerd_class: "nerd:Person"
|
||
description: "Human beings"
|
||
glam_ner_types: ["BEING.PER", "BEING.STF"]
|
||
subclasses:
|
||
- "nerd:Astronaut"
|
||
- "nerd:Politician"
|
||
- "nerd:Artist"
|
||
|
||
- nerd_class: "nerd:Organization"
|
||
description: "Organizations, companies, institutions"
|
||
glam_ner_types: ["ORG", "ORG.HER", "ORG.COM", "ORG.GOV", "ORG.EDU", "ORG.REL", "ORG.UNT"]
|
||
subclasses:
|
||
- "nerd:Company"
|
||
- "nerd:SportsTeam"
|
||
- "nerd:Band"
|
||
- "nerd:University"
|
||
- "nerd:Museum"
|
||
|
||
- nerd_class: "nerd:Location"
|
||
description: "Geographic places and features"
|
||
glam_ner_types: ["PLACE", "PLACE.GEO", "PLACE.BLD", "PLACE.ADR"]
|
||
subclasses:
|
||
- "nerd:City"
|
||
- "nerd:Country"
|
||
- "nerd:Continent"
|
||
- "nerd:Region"
|
||
- "nerd:Facility"
|
||
|
||
- nerd_class: "nerd:Event"
|
||
description: "Named events"
|
||
glam_ner_types: ["THING.EVT"]
|
||
subclasses:
|
||
- "nerd:SportEvent"
|
||
- "nerd:MusicFestival"
|
||
|
||
- nerd_class: "nerd:Time"
|
||
description: "Temporal expressions"
|
||
glam_ner_types: ["TEMPORAL", "TEMPORAL.DAT", "TEMPORAL.PER", "TEMPORAL.DUR"]
|
||
|
||
- nerd_class: "nerd:Amount"
|
||
description: "Quantities and measurements"
|
||
glam_ner_types: ["QUANTITY", "QUANTITY.MON", "QUANTITY.CNT", "QUANTITY.DIM"]
|
||
|
||
- nerd_class: "nerd:Product"
|
||
description: "Products and creative works"
|
||
glam_ner_types: ["THING.OBJ", "THING.DOC", "THING.COL"]
|
||
subclasses:
|
||
- "nerd:Album"
|
||
- "nerd:Book"
|
||
- "nerd:Movie"
|
||
- "nerd:Software"
|
||
|
||
- nerd_class: "nerd:Animal"
|
||
description: "Animals"
|
||
glam_ner_types: ["BEING.ANI"]
|
||
|
||
- nerd_class: "nerd:Function"
|
||
description: "Roles, titles, occupations"
|
||
glam_ner_types: []
|
||
note: "GLAM-NER treats roles as attributes of BEING.STF, not separate entities"
|
||
|
||
# W3C Web Annotation (oa:) vocabulary used to wrap GLAM-NER classifications:
# the annotation class, its three core properties, and the selector kinds.
web_annotation_patterns:
  description: |
    W3C Web Annotation Data Model provides standard annotation structure
    with target selectors for precise text span identification.

  annotation_class: "oa:Annotation"

  structure:
    - property: "oa:hasBody"
      description: "The annotation content (entity type, confidence)"
      example: "The GLAM-NER entity classification"

    - property: "oa:hasTarget"
      description: "What is being annotated (text span)"
      example: "TextPositionSelector pointing to entity mention"

    - property: "oa:motivatedBy"
      description: "Why the annotation was created"
      value: "oa:classifying"
      note: "NER annotations are classification activities"

  target_selectors:
    # Character-offset selector: start is 0-based, end is exclusive.
    text_position:
      class: "oa:TextPositionSelector"
      properties: ["oa:start", "oa:end"]
      note: "Equivalent to NIF offset-based addressing"

    # Quote selector: exact matched text plus prefix/suffix context
    # before and after it, used for disambiguation.
    text_quote:
      class: "oa:TextQuoteSelector"
      properties: ["oa:exact", "oa:prefix", "oa:suffix"]
      note: "Provides context for robust text matching"

    # XPath selector: rdf:value holds the XPath expression to the DOM node.
    xpath:
      class: "oa:XPathSelector"
      properties: ["rdf:value"]
      note: "For HTML/XML sources with DOM structure"
|
||
|
||
# ITS 2.0 text-analysis predicates used to attach knowledge-base links
# (identity, source, confidence, class) to an entity mention.
itsrdf_entity_linking:
  description: |
    ITS 2.0 (Internationalization Tag Set) provides entity linking predicates
    for connecting mentions to knowledge bases.

  properties:
    - property: "itsrdf:taIdentRef"
      description: "URI reference to entity in knowledge base"
      example: "http://dbpedia.org/resource/Rijksmuseum"
      note: "Primary entity linking predicate"

    - property: "itsrdf:taSource"
      description: "Knowledge base source"
      examples: ["DBpedia", "Wikidata", "GeoNames"]

    - property: "itsrdf:taConfidence"
      description: "Linking confidence score (0.0-1.0)"
      range: "xsd:double"

    - property: "itsrdf:taClassRef"
      description: "URI of entity type in target ontology"
      example: "http://dbpedia.org/ontology/Museum"
|
||
|
||
# Per-type lookup table: GLAM-NER type (long name and short code) -> NERD class,
# with optional NERD subclasses and an explanatory note. Grouped by hypernym.
glam_ner_to_nerd_mapping:
  description: "Complete mapping table from GLAM-NER types to NERD classes"
  mappings:
    # --- BEING ---
    - glam_type: "BEING"
      glam_code: "BEI"
      nerd_class: "nerd:Person"
      nerd_subclasses: []

    - glam_type: "BEING.PER"
      glam_code: "BEI.PER"
      nerd_class: "nerd:Person"
      nerd_subclasses:
        - "nerd:Artist"
        - "nerd:Politician"
        - "nerd:Astronaut"

    - glam_type: "BEING.STF"
      glam_code: "BEI.STF"
      nerd_class: "nerd:Person"
      note: "Staff roles map to nerd:Function as secondary annotation"

    - glam_type: "BEING.ANI"
      glam_code: "BEI.ANI"
      nerd_class: "nerd:Animal"

    - glam_type: "BEING.MYT"
      glam_code: "BEI.MYT"
      nerd_class: "nerd:Person"
      note: "Mythological figures treated as Person in NERD"

    - glam_type: "BEING.GRP"
      glam_code: "BEI.GRP"
      nerd_class: "nerd:Organization"
      note: "Named groups map to Organization"

    # --- ORG ---
    - glam_type: "ORG"
      glam_code: "ORG"
      nerd_class: "nerd:Organization"

    - glam_type: "ORG.HER"
      glam_code: "ORG.HER"
      nerd_class: "nerd:Organization"
      nerd_subclasses:
        - "nerd:Museum"

    - glam_type: "ORG.COM"
      glam_code: "ORG.COM"
      nerd_class: "nerd:Organization"
      nerd_subclasses:
        - "nerd:Company"

    - glam_type: "ORG.GOV"
      glam_code: "ORG.GOV"
      nerd_class: "nerd:Organization"

    - glam_type: "ORG.EDU"
      glam_code: "ORG.EDU"
      nerd_class: "nerd:Organization"
      nerd_subclasses:
        - "nerd:University"

    - glam_type: "ORG.REL"
      glam_code: "ORG.REL"
      nerd_class: "nerd:Organization"

    - glam_type: "ORG.UNT"
      glam_code: "ORG.UNT"
      nerd_class: "nerd:Organization"
      note: "Organizational units are Organizations in NERD"

    # --- PLACE ---
    - glam_type: "PLACE"
      glam_code: "PLA"
      nerd_class: "nerd:Location"

    - glam_type: "PLACE.GEO"
      glam_code: "PLA.GEO"
      nerd_class: "nerd:Location"
      nerd_subclasses:
        - "nerd:City"
        - "nerd:Country"
        - "nerd:Region"
        - "nerd:Continent"

    - glam_type: "PLACE.BLD"
      glam_code: "PLA.BLD"
      nerd_class: "nerd:Location"
      nerd_subclasses:
        - "nerd:Facility"

    - glam_type: "PLACE.ADR"
      glam_code: "PLA.ADR"
      nerd_class: "nerd:Location"

    # --- TEMPORAL ---
    - glam_type: "TEMPORAL"
      glam_code: "TMP"
      nerd_class: "nerd:Time"

    - glam_type: "TEMPORAL.DAT"
      glam_code: "TMP.DAT"
      nerd_class: "nerd:Time"

    - glam_type: "TEMPORAL.PER"
      glam_code: "TMP.PER"
      nerd_class: "nerd:Time"

    - glam_type: "TEMPORAL.DUR"
      glam_code: "TMP.DUR"
      nerd_class: "nerd:Time"

    # --- QUANTITY ---
    - glam_type: "QUANTITY"
      glam_code: "QTY"
      nerd_class: "nerd:Amount"

    - glam_type: "QUANTITY.MON"
      glam_code: "QTY.MON"
      nerd_class: "nerd:Amount"

    - glam_type: "QUANTITY.CNT"
      glam_code: "QTY.CNT"
      nerd_class: "nerd:Amount"

    - glam_type: "QUANTITY.DIM"
      glam_code: "QTY.DIM"
      nerd_class: "nerd:Amount"

    # --- THING ---
    - glam_type: "THING"
      glam_code: "THI"
      nerd_class: "nerd:Thing"

    - glam_type: "THING.OBJ"
      glam_code: "THI.OBJ"
      nerd_class: "nerd:Product"

    - glam_type: "THING.DOC"
      glam_code: "THI.DOC"
      nerd_class: "nerd:Product"
      nerd_subclasses:
        - "nerd:Book"

    - glam_type: "THING.COL"
      glam_code: "THI.COL"
      nerd_class: "nerd:Thing"
      note: "Collections map to generic Thing (no NERD equivalent)"

    - glam_type: "THING.EVT"
      glam_code: "THI.EVT"
      nerd_class: "nerd:Event"
      nerd_subclasses:
        - "nerd:SportEvent"
        - "nerd:MusicFestival"

    # --- TXT ---
    - glam_type: "TXT"
      glam_code: "TXT"
      nerd_class: "nerd:Thing"
      note: "Text fragments have no direct NERD mapping"
|
||
|
||
# Worked NIF/OA example for one 60-character sentence.
# Offsets are 0-based with an exclusive end, so each fragment URI
# (#offset_B_E), nif:beginIndex and nif:endIndex must satisfy
# source_text[B:E] == nif:anchorOf:
#   "Rijksmuseum" -> 4..15, "Amsterdam" -> 19..28, "one million" -> 40..51
# (Entity 3 was previously given as 41..52, which selects "ne million ".)
example_nif_annotation:
  description: "Complete example of GLAM-NER annotation in NIF/OA format"
  source_text: "The Rijksmuseum in Amsterdam holds over one million objects."

  turtle_example: |
    @prefix nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#> .
    @prefix nerd: <http://nerd.eurecom.fr/ontology#> .
    @prefix oa: <http://www.w3.org/ns/oa#> .
    @prefix itsrdf: <http://www.w3.org/2005/11/its/rdf#> .
    @prefix glam: <https://w3id.org/glam/> .
    @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

    # Document context
    <https://example.org/doc1#offset_0_60> a nif:Context, nif:OffsetBasedString ;
        nif:isString "The Rijksmuseum in Amsterdam holds over one million objects." ;
        nif:sourceUrl <https://example.org/doc1> ;
        nif:beginIndex "0"^^xsd:nonNegativeInteger ;
        nif:endIndex "60"^^xsd:nonNegativeInteger .

    # Entity 1: Rijksmuseum (ORG.HER)
    <https://example.org/doc1#offset_4_15> a nif:Phrase, nif:OffsetBasedString ;
        nif:anchorOf "Rijksmuseum" ;
        nif:beginIndex "4"^^xsd:nonNegativeInteger ;
        nif:endIndex "15"^^xsd:nonNegativeInteger ;
        nif:referenceContext <https://example.org/doc1#offset_0_60> ;
        a nerd:Organization, nerd:Museum ;
        glam:entityType "ORG.HER" ;
        glam:confidence "0.95"^^xsd:double ;
        itsrdf:taIdentRef <http://www.wikidata.org/entity/Q190804> ;
        itsrdf:taSource "Wikidata" ;
        itsrdf:taConfidence "0.92"^^xsd:double .

    # Entity 2: Amsterdam (PLACE.GEO)
    <https://example.org/doc1#offset_19_28> a nif:Phrase, nif:OffsetBasedString ;
        nif:anchorOf "Amsterdam" ;
        nif:beginIndex "19"^^xsd:nonNegativeInteger ;
        nif:endIndex "28"^^xsd:nonNegativeInteger ;
        nif:referenceContext <https://example.org/doc1#offset_0_60> ;
        a nerd:Location, nerd:City ;
        glam:entityType "PLACE.GEO" ;
        glam:confidence "0.98"^^xsd:double ;
        itsrdf:taIdentRef <http://www.wikidata.org/entity/Q727> ;
        itsrdf:taSource "Wikidata" .

    # Entity 3: one million (QUANTITY.CNT)
    <https://example.org/doc1#offset_40_51> a nif:Phrase, nif:OffsetBasedString ;
        nif:anchorOf "one million" ;
        nif:beginIndex "40"^^xsd:nonNegativeInteger ;
        nif:endIndex "51"^^xsd:nonNegativeInteger ;
        nif:referenceContext <https://example.org/doc1#offset_0_60> ;
        a nerd:Amount ;
        glam:entityType "QUANTITY.CNT" ;
        glam:normalizedValue "1000000"^^xsd:integer ;
        glam:confidence "0.90"^^xsd:double .
|
||
|
||
# =============================================================================
|
||
# SECTION 8: PROPERTY EXTRACTION RULES
|
||
# =============================================================================
|
||
# Rules for extracting relationships and attributes between entities.
|
||
# Properties connect entities to each other and to literal values.
|
||
|
||
property_extraction:
|
||
|
||
description: |
|
||
Property extraction identifies relationships between detected entities
|
||
and extracts attributes with their values. Every property claim MUST
|
||
include verifiable provenance.
|
||
|
||
Properties are categorized by:
|
||
- Subject entity type (e.g., ORG.HER, BEING.PER)
|
||
- Property semantics (temporal, spatial, organizational, etc.)
|
||
- Value type (entity reference, literal, temporal range)
|
||
|
||
Each extracted property becomes a claim that requires:
|
||
1. Source path (XPath, JSONPath, or text offset)
|
||
2. Extraction timestamp
|
||
3. Agent identifier
|
||
4. Convention version reference
|
||
|
||
property_categories:
|
||
|
||
# Properties whose value is a date, time specification or temporal range.
# Every subject here is an organisation, building or event, so CIDOC-CRM
# alternatives must be the Group formation/dissolution properties
# (P95i / P99i), not the Person birth/death ones (P98i / P100i).
temporal_properties:
  description: "Properties with date/time values or temporal ranges"

  founding_date:
    property_uri: "schema:foundingDate"
    alternative_uris:
      - "crm:P95i_was_formed_by"
      - "rico:beginningDate"
    subject_types: ["ORG", "ORG.HER", "ORG.COM", "ORG.GOV"]
    value_type: "temporal_expression"
    patterns:
      - pattern: "founded in {YEAR}"
        example: "The museum was founded in 1885"
      - pattern: "established {YEAR}"
        example: "Established 1952"
      - pattern: "since {YEAR}"
        example: "Serving the community since 1923"
      # Dutch-language patterns
      - pattern: "opgericht in {YEAR}"
        example: "Opgericht in 1901"
      - pattern: "{YEAR} gesticht"
        example: "In 1850 gesticht"
    extraction_notes: |
      Temporal expressions may be:
      - Precise dates: "15 July 1885"
      - Year only: "1885"
      - Decade: "1880s"
      - Century: "19th century"
      - Relative: "over 100 years ago"

      Normalize to ISO 8601 where possible, preserve original text.

  dissolution_date:
    property_uri: "schema:dissolutionDate"
    alternative_uris:
      - "crm:P99i_was_dissolved_by"
      - "rico:endDate"
    subject_types: ["ORG", "ORG.HER"]
    value_type: "temporal_expression"
    patterns:
      - pattern: "closed in {YEAR}"
      - pattern: "dissolved {YEAR}"
      - pattern: "ceased operations {YEAR}"

  opening_hours:
    property_uri: "schema:openingHours"
    # NOTE(review): "PLC.BLD" here vs "PLACE.BLD" / code "PLA.BLD" in
    # glam_ner_to_nerd_mapping - confirm which spelling is canonical.
    subject_types: ["ORG.HER", "PLC.BLD"]
    value_type: "temporal_specification"
    patterns:
      - pattern: "open {DAYS} {TIME_RANGE}"
        example: "Open Tuesday-Sunday 10:00-17:00"
      - pattern: "geopend {DAYS} {TIME_RANGE}"
        example: "Geopend dinsdag t/m zondag van 10:00 tot 17:00"

  event_date:
    property_uri: "schema:startDate"
    alternative_uris:
      - "crm:P4_has_time-span"
    # NOTE(review): "THG.EVT" here vs "THING.EVT" / code "THI.EVT" in
    # glam_ner_to_nerd_mapping - confirm which spelling is canonical.
    subject_types: ["THG.EVT", "DEN.EXH"]
    value_type: "temporal_expression"
|
||
|
||
# Relationships between organisations, their units and their members.
# All four properties take an entity reference as value; target_types
# restricts which entity types may appear on the object side.
organizational_properties:
  description: "Properties describing organizational relationships"

  # Child -> parent.
  parent_organization:
    property_uri: "org:subOrganizationOf"
    alternative_uris: ["schema:parentOrganization", "rico:isOrWasSubordinateTo"]
    subject_types: ["ORG", "ORG.HER", "ORG.UNT"]
    value_type: "entity_reference"
    target_types: ["ORG", "ORG.PAR", "ORG.GOV"]
    patterns:
      - pattern: "part of {ORG}"
        example: "The archive is part of the National Library"
      - pattern: "department of {ORG}"
      - pattern: "onderdeel van {ORG}"
      - pattern: "valt onder {ORG}"

  # Parent -> organisational unit.
  has_unit:
    property_uri: "org:hasUnit"
    alternative_uris: ["schema:department"]
    subject_types: ["ORG", "ORG.HER"]
    value_type: "entity_reference"
    target_types: ["ORG.UNT"]
    patterns:
      - pattern: "{ORG} has a {UNIT}"
      - pattern: "includes the {UNIT}"

  # Member -> organisation.
  member_of:
    property_uri: "org:memberOf"
    alternative_uris: ["schema:memberOf"]
    subject_types: ["ORG.HER", "BEING.STF"]
    value_type: "entity_reference"
    target_types: ["ORG", "ORG.HER"]

  # Organisation -> member.
  has_member:
    property_uri: "org:hasMember"
    subject_types: ["ORG", "ORG.HER"]
    value_type: "entity_reference"
    target_types: ["BEING", "BEING.STF", "ORG.HER"]
|
||
|
||
# Contact details: e-mail and telephone are literal values, while address and
# location are references to place entities.
contact_properties:
  description: "Properties for contact information"

  email:
    property_uri: "schema:email"
    alternative_uris: ["vcard:hasEmail"]
    subject_types: ["ORG.HER", "BEING.STF"]
    value_type: "email_address"
    patterns:
      - pattern: "{EMAIL_REGEX}"
        regex: "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}"

  telephone:
    property_uri: "schema:telephone"
    alternative_uris: ["vcard:hasTelephone"]
    subject_types: ["ORG.HER"]
    value_type: "phone_number"
    patterns:
      - pattern: "+{COUNTRY_CODE} {NUMBER}"
      - pattern: "0{AREA_CODE}-{NUMBER}"
      - pattern: "({AREA_CODE}) {NUMBER}"

  address:
    property_uri: "schema:address"
    alternative_uris: ["vcard:hasAddress", "org:hasSite"]
    subject_types: ["ORG.HER"]
    value_type: "entity_reference"
    target_types: ["PLC.ADR", "PLC.IAD"]

  location:
    property_uri: "schema:location"
    alternative_uris: ["crm:P55_has_current_location", "rico:hasOrHadLocation"]
    subject_types: ["ORG.HER", "THG.EVT"]
    value_type: "entity_reference"
    target_types: ["PLC", "PLC.GEO", "PLC.BLD"]
|
||
|
||
# Formal identifiers. `pattern` is the regular expression an extracted
# identifier string is expected to match.
identifier_properties:
  description: "Properties for formal identifiers"

  # ISIL code of a heritage organisation.
  isil_code:
    property_uri: "glam:isilCode"
    alternative_uris: ["schema:identifier"]
    subject_types: ["ORG.HER"]
    value_type: "identifier"
    pattern: "[A-Z]{2}-[A-Za-z0-9]+"
    example: "NL-AsdRM"

  # Wikidata Q-identifier; uri_template expands the matched ID to an entity URI.
  wikidata_id:
    property_uri: "schema:sameAs"
    subject_types: ["ORG.HER", "BEING", "PLC"]
    value_type: "uri"
    pattern: "Q[0-9]+"
    uri_template: "http://www.wikidata.org/entity/{ID}"

  # Eight-digit registration number, scoped to jurisdiction NL.
  kvk_number:
    property_uri: "rov:registration"
    subject_types: ["ORG.HER", "ORG.COM"]
    value_type: "identifier"
    pattern: "[0-9]{8}"
    jurisdiction: "NL"
|
||
|
||
# Descriptive properties of collections and of the institutions holding them:
# extent, subject matter and the period covered.
collection_properties:
  description: "Properties for collection descriptions"

  collection_size:
    property_uri: "schema:size"
    alternative_uris: ["rico:recordSetExtent"]
    subject_types: ["ORG.HER", "DEN.COL"]
    value_type: "quantity"
    patterns:
      - pattern: "{NUMBER} {UNIT}"
        examples:
          - "over 1 million objects"
          - "8,000 artworks"
          - "15 kilometers of archives"

  subject_area:
    property_uri: "schema:about"
    alternative_uris: ["dct:subject"]
    subject_types: ["ORG.HER", "DEN.COL"]
    value_type: "concept"

  temporal_coverage:
    property_uri: "dct:temporal"
    subject_types: ["DEN.COL", "ORG.HER"]
    value_type: "temporal_range"
    patterns:
      - pattern: "from {START} to {END}"
      - pattern: "{CENTURY} century"
      - pattern: "{ERA} period"
|
||
|
||
# Ordered steps of property extraction. Each step declares what it produces
# and, from step 2 onward, what it consumes.
extraction_pipeline:
  description: "Steps for property extraction"

  steps:
    - step: 1
      name: "Entity Detection"
      description: "First identify all entities in the text"
      output: "List of entity mentions with spans and types"

    - step: 2
      name: "Relation Pattern Matching"
      description: "Apply relation patterns to detect property claims"
      input: "Entity pairs and intervening text"
      output: "Candidate property claims"

    - step: 3
      name: "Property Classification"
      description: "Classify property type using ontology mappings"
      input: "Candidate claims"
      output: "Typed property claims with URIs"

    - step: 4
      name: "Value Normalization"
      description: "Normalize values (dates, quantities, identifiers)"
      input: "Raw extracted values"
      output: "Normalized values with original text preserved"

    - step: 5
      name: "Provenance Attachment"
      description: "Add required provenance metadata to each claim"
      input: "Property claims"
      output: "Claims with full provenance (see claim_validation section)"
|
||
|
||
# =============================================================================
|
||
# SECTION 9: CLAIM VALIDATION SCHEMA
|
||
# =============================================================================
|
||
# LinkML-based schema for claim validation and provenance tracking.
|
||
# Every extracted claim (entity or property) MUST conform to this schema.
|
||
|
||
claim_validation:
|
||
|
||
description: |
|
||
All claims extracted by this convention MUST include complete provenance
|
||
metadata. This enables verification, reproducibility, and trust assessment.
|
||
|
||
A "claim" is any assertion extracted from a source:
|
||
- Entity existence ("Rijksmuseum" is an organization)
|
||
- Property value ("Rijksmuseum" was founded in 1885)
|
||
- Relationship ("Dr. Dibbits" is director of "Rijksmuseum")
|
||
|
||
Claims without verifiable provenance are considered FABRICATED and
|
||
MUST be rejected or flagged for manual verification.
|
||
|
||
# Pointer to the LinkML class that claim instances are validated against.
linkml_schema_reference:
  namespace: "https://nde.nl/ontology/hc/"
  schema_file: "schemas/20251121/linkml/modules/classes/WebClaim.yaml"
  description: |
    The WebClaim class defines the formal structure for web-extracted claims.
    All claim instances MUST validate against this LinkML schema.
|
||
|
||
# The five provenance components every claim must carry. `linkml_slot` names
# the slot intended to hold each component.
# NOTE(review): the slot names "extraction_method" and "extraction_context" do
# not match the claim_structure fields "extraction_agent" and
# "extraction_convention" - confirm against the WebClaim LinkML schema.
# NOTE(review): current_value still says v1.6.3 although the file header
# declares version 1.7.0 - confirm which convention version is intended.
required_provenance_components:
  description: |
    Every claim MUST include these 5 provenance components.
    Missing any component renders the claim unverifiable.

  components:
    # 1 - which ontology the claim predicate comes from
    - id: 1
      name: "namespace"
      description: "Ontology namespace for the claim predicate"
      examples: ["schema:", "rico:", "org:", "crm:"]
      linkml_slot: "claim_type"
      required: true

    # 2 - where in the source document the claim was found
    - id: 2
      name: "path"
      description: "Location path to the claim source in the document"
      path_types:
        xpath:
          description: "XPath expression for HTML/XML documents"
          pattern: "^/.*"
          example: "/html[1]/body[1]/div[6]/div[1]/h1[1]"
        jsonpath:
          description: "JSONPath expression for JSON documents"
          pattern: "^\\$.*"
          example: "$.organization.foundingDate"
        text_offset:
          description: "Character offset range for plain text"
          pattern: "offset_{begin}_{end}"
          example: "offset_42_58"
        css_selector:
          description: "CSS selector for HTML elements"
          example: "footer > div.contact > a.email"
      linkml_slot: "xpath"
      required: true

    # 3 - when the source was fetched
    - id: 3
      name: "timestamp"
      description: "When the source was retrieved/observed"
      format: "ISO 8601 with timezone"
      example: "2025-12-02T14:30:00Z"
      linkml_slot: "retrieved_on"
      required: true

    # 4 - who or what performed the extraction
    - id: 4
      name: "agent"
      description: "The extraction agent (model, script, human)"
      agent_types:
        llm_model:
          examples: ["glm4.6", "gpt-4-turbo", "claude-3.5-sonnet"]
          format: "{model_family}-{version}"
        script:
          examples: ["add_xpath_provenance.py", "extract_entities.py"]
          format: "{script_name}"
        human:
          examples: ["curator:john.doe@museum.nl"]
          format: "human:{identifier}"
      linkml_slot: "extraction_method"
      required: true

    # 5 - which version of this convention governed the extraction
    - id: 5
      name: "context_convention"
      description: "Version of this convention used for extraction"
      format: "entity_annotation_rules_v{VERSION}"
      current_value: "entity_annotation_rules_v1.6.3"
      linkml_slot: "extraction_context"
      required: true
|
||
|
||
# Field-by-field layout of a claim record. Each field gives its type, a
# description and whether it is required; some add a pattern, range or default.
claim_structure:
  description: "Complete claim structure conforming to LinkML WebClaim schema"

  claim_fields:
    # ---- identity ----
    claim_id:
      type: "uriorcurie"
      description: "Unique identifier for this claim"
      pattern: "urn:glam:claim:{uuid}"
      required: true

    # ---- what is being claimed ----
    claim_type:
      type: "ClaimTypeEnum"
      description: "Type of claim (from CanonicalClaimTypes enum)"
      required: true

    claim_value:
      type: "string"
      description: "The extracted value"
      required: true

    normalized_value:
      type: "varies"
      description: "Normalized/parsed value (dates, quantities)"
      required: false

    # ---- entities involved (object side only for relationship claims) ----
    subject_entity_id:
      type: "uriorcurie"
      description: "Entity this claim is about"
      required: true

    object_entity_id:
      type: "uriorcurie"
      description: "Target entity (for relationship claims)"
      required: false

    # ---- provenance ----
    source_url:
      type: "uri"
      description: "URL of source document"
      required: true

    xpath:
      type: "string"
      description: "Path to claim in source document"
      required: true

    html_file:
      type: "string"
      description: "Path to archived source file"
      required: true

    retrieved_on:
      type: "datetime"
      description: "When source was retrieved"
      required: true

    extraction_agent:
      type: "string"
      description: "Agent that extracted the claim"
      required: true

    extraction_convention:
      type: "string"
      description: "Convention version used"
      default: "entity_annotation_rules_v1.6.3"
      required: true

    # ---- verification against the source text ----
    xpath_match_score:
      type: "float"
      range: "0.0-1.0"
      description: "Match quality between claim and source text"
      required: true

    xpath_matched_text:
      type: "string"
      description: "Actual text found at xpath location"
      required: false

    # ---- confidence: derived by formula, never hand-assigned ----
    confidence:
      type: "float"
      range: "0.0-1.0"
      description: "Computed confidence based on xpath_match_score and extraction_method"
      computation: |
        confidence = xpath_match_score * method_weight
        where method_weight:
        - xpath_exact_match: 1.0
        - pattern_match: 0.9
        - nlp_extraction: 0.7
        - llm_extraction: 0.6
      required: true
|
||
|
||
# Reliability tiers, from most (1) to least (4) trustworthy. Each tier states
# its confidence band, typical examples and the verification it needs.
claim_tiers:
  description: "Claims are categorized by reliability tier"

  tiers:
    - tier: 1
      name: "STRUCTURAL"
      description: "Extracted from HTML structure with exact XPath"
      confidence_range: "0.95-1.0"
      examples:
        - "page_title from <title> tag"
        - "email from mailto: link href"
        - "isil_code from structured data"
      verification: "Automated - xpath resolves to exact text"

    - tier: 2
      name: "PATTERN"
      description: "Extracted via pattern matching with XPath"
      confidence_range: "0.80-0.95"
      examples:
        - "phone number matching +31 pattern"
        - "founding year from 'founded in YYYY' pattern"
      verification: "Automated with pattern validation"

    - tier: 3
      name: "NLP"
      description: "Extracted via NLP/LLM with XPath verification"
      confidence_range: "0.50-0.80"
      examples:
        - "organization description from about page"
        - "staff member name from team page"
      verification: "Requires XPath verification, may need human review"

    - tier: 4
      name: "INFERRED"
      description: "Inferred from multiple claims, not directly extracted"
      confidence_range: "0.30-0.50"
      examples:
        - "organization type inferred from name patterns"
        - "relationship inferred from context"
      verification: "Requires human review"
|
||
|
||
# Validation rules applied to every claim instance. `severity` is ERROR or
# WARNING; `applies_to` limits a rule to the named property claims.
validation_rules:
  description: "Rules for validating claim instances"

  rules:
    - id: "CV001"
      rule: "Every claim MUST have all 5 provenance components"
      severity: "ERROR"
      # NOTE(review): this check names source_url but not claim_type, whereas
      # required_provenance_components lists namespace (slot claim_type) as
      # component 1 - confirm which five fields are intended.
      check: "xpath AND retrieved_on AND extraction_agent AND extraction_convention AND source_url"

    - id: "CV002"
      rule: "xpath_match_score MUST be computed, not estimated"
      severity: "ERROR"
      check: "xpath_match_score derived from text comparison"

    - id: "CV003"
      rule: "Claims with xpath_match_score < 0.3 MUST be flagged"
      severity: "WARNING"
      action: "Add to manual_review queue"

    - id: "CV004"
      rule: "Temporal claims MUST include normalized ISO 8601 value"
      severity: "WARNING"
      applies_to: ["founding_date", "dissolution_date", "event_date"]

    - id: "CV005"
      rule: "Entity reference claims MUST include target entity_id"
      severity: "ERROR"
      # Covers every property declared with value_type "entity_reference" in
      # property_extraction; has_unit, has_member and address were missing.
      applies_to:
        - "parent_organization"
        - "has_unit"
        - "member_of"
        - "has_member"
        - "address"
        - "location"

    - id: "CV006"
      rule: "Extraction convention version MUST match current production version"
      severity: "WARNING"
      # NOTE(review): the file header declares version 1.7.0 - confirm that
      # v1.6.3 is still the intended production version.
      current_version: "entity_annotation_rules_v1.6.3"
|
||
|
||
# Worked example of one fully populated claim. The confidence value follows
# the claim_structure formula (xpath_match_score * method_weight): the agent
# "glm4.6" is listed under llm_model agents, so the llm_extraction weight
# (0.6) applies, not the nlp_extraction weight (0.7).
example_validated_claim:
  description: "Complete example of a validated property claim"

  claim:
    claim_id: "urn:glam:claim:550e8400-e29b-41d4-a716-446655440000"
    claim_type: "founding_date"
    claim_value: "1885"
    # NOTE(review): the source gives only a year; "-01-01" adds month/day
    # precision that is not in the text - confirm this is the intended form.
    normalized_value: "1885-01-01"

    subject_entity_id: "urn:glam:entity:rijksmuseum"

    # Provenance
    source_url: "https://www.rijksmuseum.nl/nl/over-ons/wat-we-doen/geschiedenis"
    xpath: "/html[1]/body[1]/main[1]/article[1]/div[2]/p[3]"
    html_file: "web/0001/rijksmuseum.nl/over-ons/geschiedenis.html"
    retrieved_on: "2025-12-02T10:30:00Z"
    extraction_agent: "glm4.6"
    extraction_convention: "entity_annotation_rules_v1.6.3"

    # Verification
    xpath_match_score: 0.92
    xpath_matched_text: "Het Rijksmuseum is in 1885 opgericht als 's Rijks Museum"

    # Computed confidence
    confidence: 0.552  # 0.92 * 0.6 (llm_extraction weight)
    tier: 3  # 0.552 lies in the tier-3 band 0.50-0.80
|
||
|
||
# =============================================================================
|
||
# SECTION 10: ENTITY RESOLUTION AND LINKING PIPELINE
|
||
# =============================================================================
|
||
# Complete pipeline for entity resolution (disambiguation) and entity linking
|
||
# (connecting to knowledge bases).
|
||
|
||
entity_resolution_linking:
|
||
|
||
description: |
|
||
After entity recognition and property extraction, entities must be:
|
||
1. RESOLVED: Disambiguated and merged (same entity, different mentions)
|
||
2. LINKED: Connected to authoritative knowledge bases (Wikidata, ISIL, etc.)
|
||
|
||
Both processes generate claims that require provenance tracking.
|
||
|
||
entity_resolution:
|
||
description: |
|
||
Entity resolution determines when two entity mentions refer to the
|
||
same real-world entity. This is critical for:
|
||
- Merging data from multiple sources
|
||
- Preventing duplicate records
|
||
- Building knowledge graphs
|
||
|
||
# Strategies for deciding that two mentions denote the same entity, listed
# from highest priority / confidence (1, 1.0) to lowest (4, 0.60).
resolution_strategies:

  # Priority 1: shared formal identifier.
  identifier_matching:
    description: "Exact match on formal identifiers"
    priority: 1
    confidence: 1.0
    identifiers:
      - type: "ISIL"
        pattern: "[A-Z]{2}-[A-Za-z0-9]+"
        example: "Both have ISIL NL-AsdRM → same entity"
      - type: "Wikidata"
        pattern: "Q[0-9]+"
        example: "Both link to Q190804 → same entity"
      - type: "KvK"
        pattern: "[0-9]{8}"
        example: "Both have KvK 12345678 → same entity"

  # Priority 2: names compared after the listed normalization steps.
  name_normalization:
    description: "Match after normalizing names"
    priority: 2
    confidence: 0.85
    normalization_steps:
      - step: "lowercase"
      - step: "remove_articles"
        articles:
          - "de"
          - "het"
          - "the"
          - "a"
          - "an"
          - "le"
          - "la"
          - "der"
          - "die"
          - "das"
      - step: "remove_legal_forms"
        forms:
          - "stichting"
          - "vereniging"
          - "foundation"
          - "inc"
          - "ltd"
          - "bv"
          - "nv"
      - step: "normalize_whitespace"
      - step: "remove_diacritics"
    example: |
      "Stichting Rijksmuseum" → "rijksmuseum"
      "The Rijksmuseum" → "rijksmuseum"
      → Same normalized form, likely same entity

  # Priority 3: same place combined with a matching name.
  location_name_combo:
    description: "Match on (location, normalized_name) pair"
    priority: 3
    confidence: 0.75
    example: |
      Entity A: city="Amsterdam", name="City Archive"
      Entity B: city="Amsterdam", name="Stadsarchief"
      → Same location + similar function, possible match (needs review)

  # Priority 4: string-similarity algorithms, each with its own threshold.
  fuzzy_matching:
    description: "Similarity-based matching for name variants"
    priority: 4
    confidence: 0.60
    algorithms:
      - name: "Levenshtein"
        threshold: 0.85
      - name: "Jaro-Winkler"
        threshold: 0.90
      - name: "Token Set Ratio"
        threshold: 0.85
    example: |
      "Rijksmuseum Amsterdam" vs "Amsterdam Rijksmuseum"
      Token Set Ratio: 1.0 → likely match
|
||
|
||
# Shape of the merge claim emitted when two entities are resolved to one,
# illustrated with an ISIL exact match.
resolution_output:
  description: "Resolution creates a merge claim with provenance"
  claim_type: "entity_resolution"
  structure:
    resolution_id: "urn:glam:resolution:{uuid}"
    # the two entities being compared
    entity_a_id: "urn:glam:entity:{id_a}"
    entity_b_id: "urn:glam:entity:{id_b}"
    # how they were matched and how strongly
    resolution_strategy: "identifier_matching"
    resolution_confidence: 1.0
    # the canonical entity the pair resolves to
    resolved_entity_id: "urn:glam:entity:{canonical_id}"
    provenance:
      resolved_on: "2025-12-02T10:30:00Z"
      resolution_agent: "entity_resolver.py"
      resolution_convention: "entity_annotation_rules_v1.6.3"
    # field-level evidence supporting the match
    matching_evidence:
      - field: "isil_code"
        value_a: "NL-AsdRM"
        value_b: "NL-AsdRM"
        match_type: "exact"
|
||
|
||
entity_linking:
|
||
description: |
|
||
Entity linking connects extracted entities to authoritative
|
||
knowledge bases. This enables:
|
||
- Enrichment with additional data
|
||
- Cross-dataset integration
|
||
- Semantic interoperability
|
||
|
||
# Knowledge bases that entities may be linked to, in priority order.
# `link_claim.property_uri` is the predicate used to assert the link;
# `applies_to` (where present) restricts the entity types that may be linked.
target_knowledge_bases:

  wikidata:
    priority: 1
    namespace: "http://www.wikidata.org/entity/"
    lookup_methods:
      - method: "SPARQL search"
        description: "Query Wikidata SPARQL endpoint by name"
      - method: "API search"
        description: "Use wbsearchentities API"
      - method: "exact_match"
        description: "Direct P-property lookup (e.g., P791 for ISIL)"
    properties_to_check:
      - "P791"  # ISIL code
      - "P214"  # VIAF ID
      - "P244"  # Library of Congress authority ID
      - "P227"  # GND ID
    link_claim:
      property_uri: "schema:sameAs"
      alternative_uri: "owl:sameAs"

  isil_registry:
    priority: 2
    namespace: "https://ld.zdb-services.de/resource/"
    lookup_methods:
      - method: "ISIL code lookup"
        description: "Direct lookup by ISIL code"
    link_claim:
      property_uri: "schema:identifier"

  geonames:
    priority: 3
    namespace: "http://sws.geonames.org/"
    applies_to: ["PLC", "PLC.GEO", "PLC.SET"]
    lookup_methods:
      - method: "Coordinate lookup"
        description: "Find nearest named place by lat/lon"
      - method: "Name search"
        description: "Search by place name + country"
    link_claim:
      # The link target is a GeoNames feature URI, i.e. an identity link like
      # the Wikidata and VIAF ones. schema:geo expects coordinates or a shape,
      # not a URI for the place itself, so schema:sameAs is used.
      property_uri: "schema:sameAs"

  viaf:
    priority: 4
    namespace: "http://viaf.org/viaf/"
    applies_to: ["ORG.HER", "BEING"]
    lookup_methods:
      - method: "Name search"
        description: "Search VIAF by organization/person name"
    link_claim:
      property_uri: "schema:sameAs"
|
||
|
||
# Shape of the link claim emitted when an entity is connected to a knowledge
# base, illustrated with a Wikidata link found through an ISIL match on P791.
linking_output:
  description: "Linking creates an entity link claim with provenance"
  claim_type: "entity_link"
  linkml_properties: ["itsrdf:taIdentRef", "itsrdf:taSource", "itsrdf:taConfidence"]
  structure:
    link_id: "urn:glam:link:{uuid}"
    # local entity and the external resource it is linked to
    source_entity_id: "urn:glam:entity:{id}"
    target_uri: "http://www.wikidata.org/entity/Q190804"
    target_source: "Wikidata"
    # strength of the link and how it was established
    link_confidence: 0.95
    link_method: "ISIL exact match via P791"
    provenance:
      linked_on: "2025-12-02T10:35:00Z"
      linking_agent: "entity_linker.py"
      linking_convention: "entity_annotation_rules_v1.6.3"
    # field-level evidence supporting the link
    matching_evidence:
      - source_field: "isil_code"
        source_value: "NL-AsdRM"
        target_property: "P791"
        target_value: "NL-AsdRM"
        match_type: "exact"
|
||
|
||
# End-to-end pipeline, from fetching a source to emitting linked output.
# `convention_section` is a cross-reference to a section banner in this file
# and is kept verbatim-equal to that banner so it can be found by search;
# stages 5 and 6 previously omitted the trailing "PIPELINE" of SECTION 10.
# NOTE(review): the SECTION 2 title is outside this excerpt - verify it too.
pipeline_stages:
  description: "Complete extraction-to-linking pipeline"

  stages:
    - stage: 1
      name: "Source Acquisition"
      input: "URL or document"
      output: "Archived source with timestamp"
      claims_generated: ["source_retrieval"]

    - stage: 2
      name: "Entity Recognition"
      input: "Archived source"
      output: "Entity mentions with spans and types"
      claims_generated: ["entity_mention"]
      convention_section: "SECTION 2: ENTITY TYPE DEFINITIONS"

    - stage: 3
      name: "Property Extraction"
      input: "Entity mentions + source text"
      output: "Property claims linking entities"
      claims_generated: ["property_claim"]
      convention_section: "SECTION 8: PROPERTY EXTRACTION RULES"

    - stage: 4
      name: "Claim Validation"
      input: "All claims"
      output: "Validated claims with provenance"
      claims_generated: ["validated_claim"]
      convention_section: "SECTION 9: CLAIM VALIDATION SCHEMA"

    - stage: 5
      name: "Entity Resolution"
      input: "Validated entity claims"
      output: "Resolved entities (merged duplicates)"
      claims_generated: ["resolution_claim"]
      convention_section: "SECTION 10: ENTITY RESOLUTION AND LINKING PIPELINE"

    - stage: 6
      name: "Entity Linking"
      input: "Resolved entities"
      output: "Linked entities with KB references"
      claims_generated: ["link_claim"]
      convention_section: "SECTION 10: ENTITY RESOLUTION AND LINKING PIPELINE"

    # Final stage only serializes; it has no claims_generated entry.
    - stage: 7
      name: "Output Generation"
      input: "All validated claims"
      output: "LinkML instances, RDF, JSON-LD"
      convention_section: "SECTION 11: EXTRACTION OUTPUT FORMAT"
|
||
|
||
# =============================================================================
|
||
# SECTION 11: EXTRACTION OUTPUT FORMAT
|
||
# =============================================================================
|
||
# Standard output format for NER extraction results.
|
||
|
||
output_format:
|
||
|
||
description: |
|
||
Extracted entities should be output in JSON format with the following
|
||
structure. Each entity includes its span, type, confidence, and any
|
||
relationships to other entities.
|
||
|
||
The schema supports both native JSON output and NIF/RDF serialization.
|
||
NIF properties are included to enable direct conversion to Turtle/JSON-LD
|
||
for NLP tool interchange.
|
||
|
||
schema:
|
||
document_context:
|
||
type: "object"
|
||
description: "Document-level context (maps to nif:Context)"
|
||
properties:
|
||
source_url:
|
||
type: "string"
|
||
format: "uri"
|
||
nif_property: "nif:sourceUrl"
|
||
full_text:
|
||
type: "string"
|
||
nif_property: "nif:isString"
|
||
text_length:
|
||
type: "integer"
|
||
description: "Length of full_text for nif:endIndex"
|
||
|
||
entities:
|
||
type: "array"
|
||
items:
|
||
entity_id:
|
||
type: "string"
|
||
description: "Unique identifier for this entity in the extraction"
|
||
text:
|
||
type: "string"
|
||
description: "The exact text span as it appears in source"
|
||
nif_property: "nif:anchorOf"
|
||
entity_type:
|
||
type: "string"
|
||
description: "Entity type code (e.g., ORG.HER, BEING.PER)"
|
||
glam_property: "glam:entityType"
|
||
start_offset:
|
||
type: "integer"
|
||
description: "Character offset where entity begins (0-based)"
|
||
nif_property: "nif:beginIndex"
|
||
end_offset:
|
||
type: "integer"
|
||
description: "Character offset where entity ends (exclusive)"
|
||
nif_property: "nif:endIndex"
|
||
confidence:
|
||
type: "number"
|
||
description: "Confidence score 0.0-1.0"
|
||
glam_property: "glam:confidence"
|
||
ontology_class:
|
||
type: "string"
|
||
description: "Primary ontology class URI"
|
||
nerd_class:
|
||
type: "string"
|
||
description: "NERD ontology class for NLP interchange"
|
||
example: "nerd:Organization"
|
||
entity_link:
|
||
type: "object"
|
||
description: "Entity linking to knowledge bases (maps to itsrdf properties)"
|
||
properties:
|
||
reference_uri:
|
||
type: "string"
|
||
format: "uri"
|
||
itsrdf_property: "itsrdf:taIdentRef"
|
||
example: "http://www.wikidata.org/entity/Q190804"
|
||
source:
|
||
type: "string"
|
||
itsrdf_property: "itsrdf:taSource"
|
||
example: "Wikidata"
|
||
linking_confidence:
|
||
type: "number"
|
||
itsrdf_property: "itsrdf:taConfidence"
|
||
normalized_value:
|
||
type: "string"
|
||
description: "Normalized/standardized value (for dates, quantities)"
|
||
glam_property: "glam:normalizedValue"
|
||
attributes:
|
||
type: "object"
|
||
description: "Type-specific attributes (e.g., PNV components for names)"
|
||
relationships:
|
||
type: "array"
|
||
description: "Relationships to other entities in extraction"
|
||
items:
|
||
predicate:
|
||
type: "string"
|
||
target_entity_id:
|
||
type: "string"
|
||
|
||
nif_uri_generation:
|
||
description: "How to generate NIF-compliant URIs for entities"
|
||
pattern: "{source_url}#offset_{start_offset}_{end_offset}"
|
||
example: "https://example.org/page#offset_4_15"
|
||
|
||
nif_serialization_note: |
|
||
To convert JSON output to NIF Turtle:
|
||
1. Create nif:Context from document_context
|
||
2. For each entity, create nif:Phrase with offset-based URI
|
||
3. Add nif:referenceContext linking to Context
|
||
4. Add nerd: class assertions based on nerd_class field
|
||
5. Add itsrdf: properties for entity linking
|
||
|
||
example:
|
||
# DOCUMENT-LEVEL EXTRACTION CONTEXT
|
||
# All 5 provenance components at document level
|
||
document_context:
|
||
source_url: "https://www.rijksmuseum.nl/en/about-us"
|
||
full_text: "The Rijksmuseum is located at Museumstraat 1, 1071 XX Amsterdam. Contact: info@rijksmuseum.nl. Director: Dr. Taco Dibbits."
|
||
text_length: 122
|
||
|
||
# EXTRACTION PROVENANCE (document level: components 3-5 of 5; components 1-2 are recorded per entity)
|
||
extraction_provenance:
|
||
retrieved_on: "2025-12-02T10:30:00Z" # Component 3: timestamp
|
||
extraction_agent: "glm4.6" # Component 4: agent
|
||
extraction_convention: "entity_annotation_rules_v1.6.3" # Component 5: context
|
||
html_archive_path: "web/0001/rijksmuseum.nl/en/about-us.html"
|
||
|
||
# EXTRACTED ENTITIES
|
||
entities:
|
||
- entity_id: "e1"
|
||
text: "Rijksmuseum"
|
||
entity_type: "ORG.HER"
|
||
start_offset: 4
|
||
end_offset: 15
|
||
confidence: 0.98
|
||
ontology_class: "glam:HeritageCustodian"
|
||
nerd_class: "nerd:Museum"
|
||
|
||
# ENTITY PROVENANCE (5 components)
|
||
provenance:
|
||
namespace: "schema:" # Component 1: namespace
|
||
xpath: "/html[1]/body[1]/main[1]/h1[1]" # Component 2: path
|
||
xpath_match_score: 1.0
|
||
xpath_matched_text: "Rijksmuseum"
|
||
|
||
entity_link:
|
||
reference_uri: "http://www.wikidata.org/entity/Q190804"
|
||
source: "Wikidata"
|
||
linking_confidence: 0.96
|
||
|
||
# PROPERTY CLAIMS with full 5-component provenance
|
||
property_claims:
|
||
- claim_id: "urn:glam:claim:e1-founding"
|
||
claim_type: "founding_date" # Component 1: namespace (rico:)
|
||
claim_value: "1885"
|
||
normalized_value: "1885-01-01"
|
||
provenance:
|
||
namespace: "rico:"
|
||
xpath: "/html[1]/body[1]/main[1]/article[1]/section[2]/p[1]"
|
||
retrieved_on: "2025-12-02T10:30:00Z"
|
||
extraction_agent: "glm4.6"
|
||
extraction_convention: "entity_annotation_rules_v1.6.3"
|
||
html_file: "web/0001/rijksmuseum.nl/en/about-us.html"
|
||
xpath_match_score: 0.85
|
||
xpath_matched_text: "The Rijksmuseum was founded in 1885"
|
||
confidence: 0.595 # 0.85 * 0.7 (NLP weight)
|
||
tier: 3
|
||
|
||
relationships:
|
||
- predicate: "schema:address"
|
||
target_entity_id: "e2"
|
||
- predicate: "schema:email"
|
||
target_entity_id: "e3"
|
||
|
||
- entity_id: "e2"
|
||
text: "Museumstraat 1, 1071 XX Amsterdam"
|
||
entity_type: "PLC.IAD"
|
||
start_offset: 30
|
||
end_offset: 63
|
||
confidence: 0.95
|
||
ontology_class: "schema:PostalAddress"
|
||
nerd_class: "nerd:Location"
|
||
|
||
# ENTITY PROVENANCE (5 components)
|
||
provenance:
|
||
namespace: "schema:"
|
||
xpath: "/html[1]/body[1]/main[1]/section[1]/address[1]"
|
||
xpath_match_score: 1.0
|
||
xpath_matched_text: "Museumstraat 1, 1071 XX Amsterdam"
|
||
|
||
- entity_id: "e3"
|
||
text: "info@rijksmuseum.nl"
|
||
entity_type: "TXT.IEM"
|
||
start_offset: 74
|
||
end_offset: 93
|
||
confidence: 0.99
|
||
ontology_class: "schema:email"
|
||
nerd_class: null
|
||
|
||
# ENTITY PROVENANCE (5 components) - email from mailto link
|
||
provenance:
|
||
namespace: "schema:"
|
||
xpath: "/html[1]/body[1]/footer[1]/a[@href='mailto:info@rijksmuseum.nl']"
|
||
xpath_match_score: 1.0
|
||
extraction_method: "structural" # Tier 1 - from HTML attribute
|
||
tier: 1
|
||
|
||
- entity_id: "e4"
|
||
text: "Dr. Taco Dibbits"
|
||
entity_type: "BEING.STF"
|
||
start_offset: 105
|
||
end_offset: 121
|
||
confidence: 0.92
|
||
ontology_class: "picom:PersonObservation"
|
||
nerd_class: "nerd:Person"
|
||
|
||
# ENTITY PROVENANCE (5 components)
|
||
provenance:
|
||
namespace: "pico:"
|
||
xpath: "/html[1]/body[1]/main[1]/section[3]/div[1]/h3[1]"
|
||
xpath_match_score: 0.92
|
||
xpath_matched_text: "Dr. Taco Dibbits, General Director"
|
||
|
||
entity_link:
|
||
reference_uri: "http://www.wikidata.org/entity/Q56314921"
|
||
source: "Wikidata"
|
||
linking_confidence: 0.88
|
||
|
||
attributes:
|
||
role: "General Director"
|
||
pnv_name:
|
||
honorificPrefix: "Dr."
|
||
givenName: "Taco"
|
||
baseSurname: "Dibbits"
|
||
|
||
# PROPERTY CLAIMS for staff relationships
|
||
property_claims:
|
||
- claim_id: "urn:glam:claim:e4-role"
|
||
claim_type: "staff_role"
|
||
claim_value: "General Director"
|
||
provenance:
|
||
namespace: "pico:"
|
||
xpath: "/html[1]/body[1]/main[1]/section[3]/div[1]/h3[1]"
|
||
retrieved_on: "2025-12-02T10:30:00Z"
|
||
extraction_agent: "glm4.6"
|
||
extraction_convention: "entity_annotation_rules_v1.6.3"
|
||
html_file: "web/0001/rijksmuseum.nl/en/about-us.html"
|
||
xpath_match_score: 0.75
|
||
xpath_matched_text: "Dr. Taco Dibbits, General Director"
|
||
confidence: 0.525 # 0.75 * 0.7 (NLP weight)
|
||
tier: 3
|
||
|
||
relationships:
|
||
- predicate: "org:memberOf"
|
||
target_entity_id: "e1"
|
||
|
||
# COMPLETE PROPERTY CLAIM EXAMPLE (standalone)
|
||
# Shows full structure for property extraction output
|
||
property_claim_example:
|
||
description: |
|
||
Complete example of a property claim with all 5 provenance components.
|
||
This structure is used for property extraction (Section 8) outputs.
|
||
|
||
claim:
|
||
claim_id: "urn:glam:claim:550e8400-e29b-41d4-a716-446655440000"
|
||
claim_type: "founding_date"
|
||
claim_value: "1885"
|
||
normalized_value: "1885-01-01"
|
||
|
||
subject_entity_id: "urn:glam:entity:rijksmuseum"
|
||
|
||
# === 5 PROVENANCE COMPONENTS (ALL REQUIRED) ===
|
||
#
|
||
# 1. NAMESPACE - ontology prefix for claim predicate
|
||
namespace: "rico:"
|
||
|
||
# 2. PATH - location in source document
|
||
xpath: "/html[1]/body[1]/main[1]/article[1]/div[2]/p[3]"
|
||
|
||
# 3. TIMESTAMP - when source was retrieved
|
||
retrieved_on: "2025-12-02T10:30:00Z"
|
||
|
||
# 4. AGENT - extraction model/script/human
|
||
extraction_agent: "glm4.6"
|
||
|
||
# 5. CONTEXT CONVENTION - version of extraction rules
|
||
extraction_convention: "entity_annotation_rules_v1.6.3"
|
||
# ==============================================
|
||
|
||
# Additional provenance fields
|
||
source_url: "https://www.rijksmuseum.nl/nl/over-ons/geschiedenis"
|
||
html_file: "web/0001/rijksmuseum.nl/over-ons/geschiedenis.html"
|
||
|
||
# Verification (computed, not estimated)
|
||
xpath_match_score: 0.92
|
||
xpath_matched_text: "Het Rijksmuseum is in 1885 opgericht"
|
||
|
||
# Computed confidence and tier
|
||
confidence: 0.644 # xpath_match_score * method_weight (0.92 * 0.7)
|
||
tier: 3 # NLP extraction
|
||
|
||
# =============================================================================
|
||
# SECTION 12: VALIDATION AND QUALITY
|
||
# =============================================================================
|
||
# Quality control rules for extraction validation.
|
||
|
||
validation:
|
||
|
||
required_fields:
|
||
- "text"
|
||
- "entity_type"
|
||
- "confidence"
|
||
|
||
confidence_thresholds:
|
||
high: 0.90
|
||
medium: 0.70
|
||
low: 0.50
|
||
minimum: 0.30
|
||
|
||
rules:
|
||
- id: "VAL001"
|
||
rule: "Entity text must not be empty"
|
||
|
||
- id: "VAL002"
|
||
rule: "Entity type must be a valid type code from this convention"
|
||
|
||
- id: "VAL003"
|
||
rule: "Confidence must be between 0.0 and 1.0"
|
||
|
||
- id: "VAL004"
|
||
rule: "Relationships must reference valid entity_ids in same extraction"
|
||
|
||
- id: "VAL005"
|
||
rule: "Heritage institution (ORG.HER) entities should have at least one contact relationship"
|
||
applies_to: "ORG.HER"
|
||
recommendation: "Link to address, email, website, or social media"
|
||
|
||
- id: "VAL006"
|
||
rule: "Staff entities should link to their institution"
|
||
applies_to: "BEING.STF"
|
||
required_relationship: "org:memberOf to ORG.HER"
|
||
|
||
# =============================================================================
|
||
# SECTION 13: PROVENANCE TRACKING
|
||
# =============================================================================
|
||
# Provenance metadata for extraction audit trail.
|
||
|
||
provenance:
|
||
|
||
description: |
|
||
Every extraction must include provenance metadata documenting the source,
|
||
method, and processing chain. This enables quality assessment and
|
||
reproducibility.
|
||
|
||
required_metadata:
|
||
source_url:
|
||
description: "URL of the source document/page"
|
||
type: "string"
|
||
format: "uri"
|
||
|
||
extraction_timestamp:
|
||
description: "ISO 8601 timestamp of extraction"
|
||
type: "string"
|
||
format: "date-time"
|
||
|
||
convention_version:
|
||
description: "Version of this NER convention used"
|
||
type: "string"
|
||
value: "1.7.0-unified"
|
||
|
||
extractor_id:
|
||
description: "Identifier for the extraction system/model"
|
||
type: "string"
|
||
examples:
|
||
- "claude-3.5-sonnet"
|
||
- "glam-ner-v1"
|
||
|
||
html_archive_path:
|
||
description: "Path to archived HTML source (for XPath verification)"
|
||
type: "string"
|
||
|
||
xpath_provenance:
|
||
description: |
|
||
For web extractions, each entity should include XPath to its location
|
||
in the archived HTML. This enables verification and quality control.
|
||
|
||
fields:
|
||
xpath:
|
||
description: "XPath expression locating the entity in HTML"
|
||
type: "string"
|
||
xpath_match_score:
|
||
description: "1.0 for exact match, less than 1.0 for fuzzy"
|
||
type: "number"
|
||
|
||
# =============================================================================
|
||
# SECTION 14: FRICTION POINTS AND EDGE CASES
|
||
# =============================================================================
|
||
# Known difficult cases and resolution strategies.
|
||
|
||
friction_points:
|
||
|
||
- id: "FP001"
|
||
title: "Organization vs. Building ambiguity"
|
||
description: |
|
||
Heritage institutions are both organizations and physical locations.
|
||
The Rijksmuseum can refer to the institution or the building.
|
||
resolution: |
|
||
Default to ORG.HER for heritage institutions. Apply double-tagging
|
||
(ORG.HER + PLC.BLD) only when the physical building is specifically
|
||
being discussed (e.g., the Rijksmuseum building was renovated).
|
||
|
||
- id: "FP002"
|
||
title: "Historical vs. current names"
|
||
description: |
|
||
Institutions may be referenced by historical names that differ from
|
||
current official names.
|
||
resolution: |
|
||
Tag the name as it appears in the text. Use the same entity type
|
||
regardless of whether historical or current. Note name changes in
|
||
relationship metadata if detectable.
|
||
|
||
- id: "FP003"
|
||
title: "Abbreviated institution names"
|
||
description: |
|
||
Common abbreviations like the Rijks or MoMA may not be explicit.
|
||
resolution: |
|
||
Tag abbreviations as ORG.HER when context makes the institution clear.
|
||
Lower confidence score if abbreviation is ambiguous.
|
||
|
||
- id: "FP004"
|
||
title: "Generic role vs. named staff"
|
||
description: |
|
||
Distinguishing between the director (generic) and Director Jan de Wit
|
||
(specific person in role).
|
||
resolution: |
|
||
Tag only when a proper name is present. The director alone is excluded
|
||
per BEI_EXC001. Director Jan de Wit is tagged as BEING.STF.
|
||
|
||
- id: "FP005"
|
||
title: "Nested locations"
|
||
description: |
|
||
Addresses contain multiple location levels: street, city, country.
|
||
resolution: |
|
||
Tag the complete address as a single PLC.IAD or PLC.ADR entity.
|
||
Do NOT separately tag city or country within an address span.
|
||
Tag cities/countries separately only when they appear independently.
|
||
|
||
- id: "FP006"
|
||
title: "Date formats across cultures"
|
||
description: |
|
||
Dates appear in many formats: 15 July 1606, 07/15/1606, 1606-07-15
|
||
resolution: |
|
||
Tag complete date expressions as single TMP.DAT entities regardless
|
||
of format. Include the full expression as it appears.
|
||
|
||
- id: "FP007"
|
||
title: "Collection names vs. general descriptions"
|
||
description: |
|
||
Distinguishing the Dutch Masters Collection (proper name) from
|
||
the collection of Dutch masters (description).
|
||
resolution: |
|
||
Tag capitalized, proper-noun collection names as DEN.COL.
|
||
Do not tag lowercase generic descriptions.
|
||
|
||
# =============================================================================
|
||
# SECTION 15: DOCUMENT STRUCTURE AND NAMESPACE PATHS
|
||
# =============================================================================
|
||
# CRITICAL: Namespaces and document paths are ESSENTIAL for clustering entities
|
||
# and distinguishing context-specific entity relationships.
|
||
|
||
document_structure:
|
||
|
||
purpose: |
|
||
Entity annotations occur within STRUCTURED DOCUMENTS - not flat text streams.
|
||
The LOCATION of an entity within document structure determines:
|
||
|
||
1. SEMANTIC SCOPE: An entity in a header governs entities in subsequent paragraphs
|
||
2. CO-REFERENCE CLUSTERS: Entities in the same structural unit likely co-refer
|
||
3. RELATIONSHIP CONTEXT: Header-paragraph relations differ from paragraph-paragraph
|
||
4. PROVENANCE PRECISION: XPath/JSONPath enables exact location for verification
|
||
|
||
Without namespace paths, entity extraction loses critical context that
|
||
distinguishes "the director" in a museum description from "the director"
|
||
in a film credits section of the same document.
|
||
|
||
design_principles:
|
||
- principle: "STRUCTURE IS MEANING"
|
||
description: |
|
||
Document structure (headers, sections, paragraphs, lists) carries semantic
|
||
information. A person mentioned in a "Board of Directors" section has a
|
||
different relationship to the organization than a person mentioned in a
|
||
"Historical Overview" section.
|
||
|
||
- principle: "PATHS ENABLE CLUSTERING"
|
||
description: |
|
||
Entities sharing a common path prefix belong to the same structural context.
|
||
Clustering by path enables:
|
||
- Scoped entity resolution (disambiguate within section before document)
|
||
- Contextual relationship inference (section membership implies relationship)
|
||
- Provenance aggregation (all claims from same region share reliability)
|
||
|
||
- principle: "NAMESPACES PREVENT COLLISION"
|
||
description: |
|
||
The same entity mention in different structural contexts may require
|
||
different annotations or link to different knowledge base entities.
|
||
Namespaces ensure annotations are addressable without ambiguity.
|
||
|
||
- principle: "LAYOUT INFORMS SEMANTICS"
|
||
description: |
|
||
Visual layout (sidebars, captions, footnotes, marginalia) carries meaning
|
||
distinct from main body text. PAGE-XML text regions, HTML semantic elements,
|
||
and JSON structural keys all encode layout that affects interpretation.
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# LAYOUT SEMANTIC ONTOLOGY
|
||
# ---------------------------------------------------------------------------
|
||
# Comprehensive hypernym classes and properties for document structure annotation.
|
||
# These classes are format-agnostic and apply to PAGE-XML, HTML, JSON, MD, EPUB, PDF.
|
||
#
|
||
# PRIMARY AUTHORITIES:
|
||
# - W3C Web Annotation Data Model (oa:) - annotation targeting and selectors
|
||
# - CIDOC-CRM (crm:) - information objects and carriers
|
||
# - RiC-O (rico:) - record parts and structure
|
||
# - PREMIS (premis:) - digital object hierarchy
|
||
# - Dublin Core (dcterms:) - part/whole relationships
|
||
# - NIF 2.0 (nif:) - text annotation interchange
|
||
# ---------------------------------------------------------------------------
|
||
|
||
layout_semantic_ontology:
|
||
|
||
purpose: |
|
||
This ontology defines semantic classes for document layout elements that
|
||
generative AI models can use to annotate/describe structural context.
|
||
|
||
CRITICAL: Layout claims are DISTINCT from entity claims. A complete annotation
|
||
requires TWO NESTED LAYERS of provenance:
|
||
|
||
1. LAYOUT CLAIM: "This text region is a sidebar"
|
||
- Has its own provenance (XPath, timestamp, confidence, agent)
|
||
- May be uncertain (is this a sidebar or marginalia?)
|
||
|
||
2. ENTITY CLAIM: "This sidebar contains person name 'Rembrandt'"
|
||
- Has its own provenance (span offsets, NER model, confidence)
|
||
- REFERENCES the layout claim as context
|
||
|
||
This separation enables:
|
||
- Independent validation of layout vs. entity extraction
|
||
- Different confidence levels for structure vs. content
|
||
- Reasoning about how layout affects entity interpretation
|
||
|
||
# -------------------------------------------------------------------------
|
||
# HYPERNYM: DOCUMENT_REGION (DOC)
|
||
# -------------------------------------------------------------------------
|
||
# The top-level class for any identifiable region within a document.
|
||
# Analogous to CIDOC-CRM E73 Information Object but focused on structure.
|
||
# -------------------------------------------------------------------------
|
||
|
||
DOCUMENT_REGION:
|
||
code: "DOC"
|
||
definition: |
|
||
A discrete, identifiable region within a document that contains content
|
||
and has structural relationships to other regions. Document regions are
|
||
the fundamental unit of layout annotation.
|
||
|
||
Every document region has:
|
||
- BOUNDARIES: Start/end positions (character offsets, coordinates, paths)
|
||
- CONTAINMENT: Parent regions that contain it, child regions it contains
|
||
- SEQUENCE: Position relative to sibling regions
|
||
- SEMANTIC ROLE: Function within the document (header, body, supplement)
|
||
|
||
ontology_mappings:
|
||
primary_class: "crm:E73_Information_Object"
|
||
primary_class_definition: |
|
||
CIDOC-CRM E73 Information Object: "This class comprises identifiable
|
||
immaterial items, such as poems, jokes, data sets, images, texts,
|
||
multimedia objects, procedural prescriptions, computer program code,
|
||
algorithm or mathematical formulae, that have an objectively
|
||
recognizable structure."
|
||
alternative_classes:
|
||
- class: "rico:RecordPart"
|
||
note: "RiC-O class for component parts of archival records"
|
||
- class: "premis:Bitstream"
|
||
note: "PREMIS class for meaningful data segment within file"
|
||
- class: "oa:TextualBody"
|
||
note: "Web Annotation class for textual content"
|
||
linkml_mapping:
|
||
class_uri: "glam:DocumentRegion"
|
||
exact_mappings:
|
||
- "crm:E73_Information_Object"
|
||
close_mappings:
|
||
- "rico:RecordPart"
|
||
- "premis:Bitstream"
|
||
related_mappings:
|
||
- "oa:TextualBody"
|
||
- "nif:Context"
|
||
|
||
properties:
|
||
- property: "hasParentRegion"
|
||
uri: "glam:hasParentRegion"
|
||
range: "DocumentRegion"
|
||
owl_mapping: "dcterms:isPartOf"
|
||
description: "The containing region (section contains paragraph)"
|
||
|
||
- property: "hasChildRegion"
|
||
uri: "glam:hasChildRegion"
|
||
range: "DocumentRegion"
|
||
owl_mapping: "dcterms:hasPart"
|
||
description: "Contained regions (paragraph contains sentences)"
|
||
|
||
- property: "hasNextSibling"
|
||
uri: "glam:hasNextSibling"
|
||
range: "DocumentRegion"
|
||
owl_mapping: "rico:isOrWasAdjacentTo"
|
||
description: "Next region in document order"
|
||
|
||
- property: "hasPreviousSibling"
|
||
uri: "glam:hasPreviousSibling"
|
||
range: "DocumentRegion"
|
||
description: "Previous region in document order"
|
||
|
||
- property: "hasSemanticRole"
|
||
uri: "glam:hasSemanticRole"
|
||
range: "LayoutSemanticRole"
|
||
description: "The semantic function of this region"
|
||
|
||
- property: "regionPath"
|
||
uri: "glam:regionPath"
|
||
range: "xsd:string"
|
||
owl_mapping: "oa:hasSelector"
|
||
description: "XPath, JSONPath, or other path expression"
|
||
|
||
- property: "regionStart"
|
||
uri: "glam:regionStart"
|
||
range: "xsd:integer"
|
||
owl_mapping: "nif:beginIndex"
|
||
description: "Character offset of region start"
|
||
|
||
- property: "regionEnd"
|
||
uri: "glam:regionEnd"
|
||
range: "xsd:integer"
|
||
owl_mapping: "nif:endIndex"
|
||
description: "Character offset of region end"
|
||
|
||
subcategories:
|
||
|
||
# ---------------------------------------------------------------------
|
||
# PRIMARY CONTENT REGIONS
|
||
# ---------------------------------------------------------------------
|
||
|
||
DOC.HDR:
|
||
name: "HEADER"
|
||
definition: |
|
||
Heading or title region that introduces and governs subsequent content.
|
||
Headers establish scope for entity interpretation and relationship inference.
|
||
|
||
Includes: h1-h6, chapter titles, section headings, running headers,
|
||
PAGE-XML TextRegion[@type='heading'] or TextRegion[@type='header'], JSON object keys.
|
||
|
||
ontology_mappings:
|
||
primary_class: "schema:headline"
|
||
alternative_classes:
|
||
- "bf:Title"
|
||
- "dcterms:title"
|
||
nif_mapping: "nif:Title"
|
||
|
||
header_levels:
|
||
- level: 1
|
||
scope: "document"
|
||
html: "h1"
|
||
pagexml: "TextRegion[@type='heading'][@level='1']"
|
||
semantic: "Document title or primary topic"
|
||
- level: 2
|
||
scope: "chapter"
|
||
html: "h2"
|
||
pagexml: "TextRegion[@type='heading'][@level='2']"
|
||
semantic: "Major section or chapter heading"
|
||
- level: 3
|
||
scope: "section"
|
||
html: "h3"
|
||
semantic: "Subsection heading"
|
||
- level: 4
|
||
scope: "subsection"
|
||
html: "h4"
|
||
semantic: "Sub-subsection heading"
|
||
- level: 5
|
||
scope: "paragraph_group"
|
||
html: "h5"
|
||
semantic: "Minor heading or label"
|
||
- level: 6
|
||
scope: "inline"
|
||
html: "h6"
|
||
semantic: "Inline heading or run-in head"
|
||
|
||
governing_properties:
|
||
- property: "governs"
|
||
uri: "glam:governs"
|
||
range: "DocumentRegion"
|
||
description: "Regions semantically governed by this header"
|
||
- property: "governsUntil"
|
||
uri: "glam:governsUntil"
|
||
range: "DocumentRegion"
|
||
description: "Region where governance ends (next same-level header)"
|
||
|
||
DOC.PAR:
|
||
name: "PARAGRAPH"
|
||
definition: |
|
||
Block of continuous prose text forming a logical unit of discourse.
|
||
The primary content-bearing unit in most documents.
|
||
|
||
Includes: HTML <p>, PAGE-XML TextRegion[@type='paragraph'],
|
||
text separated by blank lines, JSON string values.
|
||
|
||
ontology_mappings:
|
||
primary_class: "crm:E33_Linguistic_Object"
|
||
alternative_classes:
|
||
- "schema:Text"
|
||
- "nif:Paragraph"
|
||
|
||
paragraph_properties:
|
||
- property: "paragraphIndex"
|
||
uri: "glam:paragraphIndex"
|
||
range: "xsd:integer"
|
||
description: "Zero-based index within containing section"
|
||
- property: "sentenceCount"
|
||
uri: "glam:sentenceCount"
|
||
range: "xsd:integer"
|
||
description: "Number of sentences in paragraph"
|
||
|
||
DOC.SEN:
|
||
name: "SENTENCE"
|
||
definition: |
|
||
A grammatical sentence within a paragraph. The minimal unit for
|
||
syntactic analysis and relationship extraction.
|
||
|
||
ontology_mappings:
|
||
primary_class: "nif:Sentence"
|
||
alternative_classes:
|
||
- "crm:E33_Linguistic_Object"
|
||
|
||
sentence_properties:
|
||
- property: "sentenceIndex"
|
||
uri: "glam:sentenceIndex"
|
||
range: "xsd:integer"
|
||
description: "Zero-based index within containing paragraph"
|
||
|
||
DOC.LST:
|
||
name: "LIST"
|
||
definition: |
|
||
Ordered or unordered enumeration of items sharing parallel structure.
|
||
|
||
Includes: HTML <ul>/<ol>, bulleted/numbered lists, PAGE-XML lists,
|
||
JSON arrays, markdown lists.
|
||
|
||
ontology_mappings:
|
||
primary_class: "schema:ItemList"
|
||
alternative_classes:
|
||
- "rdf:List"
|
||
- "rdf:Seq"
|
||
|
||
list_types:
|
||
- type: "ordered"
|
||
html: "ol"
|
||
semantic: "Sequence with meaningful order"
|
||
- type: "unordered"
|
||
html: "ul"
|
||
semantic: "Set without meaningful order"
|
||
- type: "definition"
|
||
html: "dl"
|
||
semantic: "Term-definition pairs"
|
||
|
||
DOC.LIT:
|
||
name: "LIST_ITEM"
|
||
definition: |
|
||
Individual item within a list. Shares type/relationship with siblings.
|
||
|
||
ontology_mappings:
|
||
primary_class: "schema:ListItem"
|
||
|
||
list_item_properties:
|
||
- property: "itemIndex"
|
||
uri: "glam:itemIndex"
|
||
range: "xsd:integer"
|
||
description: "Position within list (0-based)"
|
||
- property: "itemLabel"
|
||
uri: "glam:itemLabel"
|
||
range: "xsd:string"
|
||
description: "Bullet, number, or marker text"
|
||
|
||
DOC.TBL:
|
||
name: "TABLE"
|
||
definition: |
|
||
Structured tabular data with rows and columns.
|
||
|
||
Includes: HTML <table>, PAGE-XML tables, markdown tables,
|
||
CSV/TSV content, JSON arrays of objects.
|
||
|
||
ontology_mappings:
|
||
primary_class: "schema:Table"
|
||
alternative_classes:
|
||
- "csvw:Table"
|
||
|
||
table_structure:
|
||
- component: "TABLE_HEADER"
|
||
code: "DOC.TBL.HDR"
|
||
html: "thead, th"
|
||
description: "Column headers defining semantics"
|
||
- component: "TABLE_BODY"
|
||
code: "DOC.TBL.BDY"
|
||
html: "tbody"
|
||
description: "Data rows"
|
||
- component: "TABLE_ROW"
|
||
code: "DOC.TBL.ROW"
|
||
html: "tr"
|
||
description: "Single row of cells"
|
||
- component: "TABLE_CELL"
|
||
code: "DOC.TBL.CEL"
|
||
html: "td"
|
||
description: "Individual data cell"
|
||
|
||
# ---------------------------------------------------------------------
|
||
# SUPPLEMENTARY CONTENT REGIONS
|
||
# ---------------------------------------------------------------------
|
||
|
||
# DOC.SDB — sidebar region. Parent (hypernym) of the marginalia, infobox
# and callout region types declared under `hyponyms`.
DOC.SDB:
  name: "SIDEBAR"
  definition: |
    Supplementary content presented alongside main text, typically
    providing context, metadata, or related information.

    HYPERNYM for: marginalia, infoboxes, callout boxes, pull quotes,
    asides, floating content.

    Includes: HTML <aside>, Wikipedia infoboxes, PAGE-XML marginal regions,
    floating boxes, pull quotes.

  ontology_mappings:
    primary_class: "schema:WPSideBar"
    alternative_classes:
      - "html:aside"
      - "crm:E73_Information_Object"

  # Narrower region types, keyed by their full dotted code.
  hyponyms:

    # Notes written in the page margin (or between lines).
    DOC.SDB.MRG:
      name: "MARGINALIA"
      definition: |
        Annotations or notes in the margin of a page. Common in historical
        manuscripts, early printed books, and academic texts.

        Marginalia often contain:
        - Reader annotations (comments, corrections)
        - Editorial marks (printer instructions)
        - Cross-references (citations, page numbers)
        - Dates or signatures

      ontology_mappings:
        primary_class: "crm:E34_Inscription"
        note: "Marginalia as inscribed marks on document"

      # Where the note sits relative to the main text block. Entries carry
      # a PAGE-XML selector, except "interlinear" which only has a description.
      margin_positions:
        - position: "left"
          pagexml: "TextRegion[@type='marginalia-left']"
        - position: "right"
          pagexml: "TextRegion[@type='marginalia-right']"
        - position: "top"
          pagexml: "TextRegion[@type='marginalia-top']"
        - position: "bottom"
          pagexml: "TextRegion[@type='marginalia-bottom']"
        - position: "interlinear"
          description: "Between lines of main text"

    # Key-fact summary boxes.
    DOC.SDB.IBX:
      name: "INFOBOX"
      definition: |
        Structured summary box containing key facts about a topic.
        Common in encyclopedias, Wikipedia, and reference works.

        Infoboxes contain structured claims with high extraction value.

      ontology_mappings:
        primary_class: "schema:Table"
        note: "Infoboxes are typically key-value tables"

      infobox_properties:
        - property: "infoboxType"
          range: "xsd:string"
          description: "Type of infobox (person, place, organization)"

    # Highlighted boxes; this hyponym declares no ontology mapping.
    DOC.SDB.CLT:
      name: "CALLOUT"
      definition: |
        Highlighted text box drawing attention to key information.
        Includes pull quotes, highlighted passages, tip boxes.

      callout_types:
        - type: "pull_quote"
          description: "Excerpt from main text displayed prominently"
        - type: "tip"
          description: "Advice or recommendation"
        - type: "warning"
          description: "Caution or alert"
        - type: "note"
          description: "Additional information"
|
||
|
||
# DOC.CAP — caption region: text attached to a figure, table or other
# visual element.
DOC.CAP:
  name: "CAPTION"
  definition: |
    Text describing or explaining a figure, table, or other visual element.
    Captions establish aboutness relationships with visual content.

    Includes: HTML <figcaption>, PAGE-XML caption regions, alt text,
    image descriptions.

  # NOTE(review): schema:caption, dcterms:description and crm:P3_has_note
  # look like property IRIs rather than class IRIs — confirm they are
  # intended under the *_class keys.
  ontology_mappings:
    primary_class: "schema:caption"
    alternative_classes:
      - "dcterms:description"
      - "crm:P3_has_note"

  # Link from the caption to the region it describes.
  caption_properties:
    - property: "captionOf"
      uri: "glam:captionOf"
      range: "DocumentRegion"
      owl_mapping: "schema:about"
      description: "The figure/table this caption describes"
|
||
|
||
# DOC.FTN — footnote region; also covers endnotes and sidenotes.
DOC.FTN:
  name: "FOOTNOTE"
  definition: |
    Reference note at bottom of page or end of section/document.
    Contains supplementary information, citations, or clarifications.

    Includes: Footnotes, endnotes, sidenotes, margin notes with references.

  ontology_mappings:
    primary_class: "bibo:Note"
    alternative_classes:
      - "schema:Comment"
      - "crm:E33_Linguistic_Object"

  # Note variants by placement. Only "footnote" declares a PAGE-XML selector.
  footnote_types:
    - type: "footnote"
      location: "bottom_of_page"
      pagexml: "TextRegion[@type='footnote']"
    - type: "endnote"
      location: "end_of_chapter"
    - type: "sidenote"
      location: "margin"

  # Marker symbol plus a back-reference to where the marker occurs.
  footnote_properties:
    - property: "footnoteMarker"
      uri: "glam:footnoteMarker"
      range: "xsd:string"
      description: "The marker symbol (*, 1, a, etc.)"
    - property: "referencesLocation"
      uri: "glam:referencesLocation"
      range: "DocumentRegion"
      description: "Location of footnote marker in main text"
|
||
|
||
# DOC.FIG — a single visual item. Collections go to DOC.GAL and
# geographic/spatial visualizations to DOC.MAP (see definition).
DOC.FIG:
  name: "FIGURE"
  definition: |
    Single visual content item (image, diagram, chart).
    For collections of images, use DOC.GAL (Gallery).
    For geographic/spatial visualizations, use DOC.MAP (Map).

    Entities mentioned in figures require visual analysis (OCR, object detection).

    Includes: HTML <figure>, embedded images, diagrams, charts, illustrations.

  ontology_mappings:
    primary_class: "schema:ImageObject"
    alternative_classes:
      - "crm:E38_Image"
      - "edm:WebResource"
      - "foaf:Image"
    # LinkML view of the same mapping, split by mapping strength.
    linkml_mapping:
      class_uri: "schema:ImageObject"
      exact_mappings:
        - "crm:E38_Image"
      close_mappings:
        - "edm:WebResource"

  # Figure sub-kinds.
  figure_types:
    - type: "photograph"
      description: "Photographic image"
    - type: "illustration"
      description: "Drawing, painting, or artwork"
    - type: "diagram"
      description: "Schematic, flowchart, or technical drawing"
    - type: "chart"
      description: "Data visualization (bar, line, pie, etc.)"
    - type: "scan"
      description: "Digitized physical document or object"
|
||
|
||
# ---------------------------------------------------------------------
|
||
# MEDIA COLLECTION REGIONS
|
||
# ---------------------------------------------------------------------
|
||
|
||
# DOC.GAL — gallery region: several visual/media items presented as one
# unit; individual items may carry nested DOC.FIG claims.
DOC.GAL:
  name: "GALLERY"
  code: "DOC.GAL"
  definition: |
    Collection of related visual or media items presented as a unit.
    Distinguished from single figures by containing MULTIPLE items with
    shared context, navigation, or thematic grouping.

    Domain-agnostic applications:
    - Heritage: Manuscript illuminations, artifact photo sets, exhibition views
    - Web: Image carousels, product galleries, portfolio showcases
    - Publishing: Photo essays, plate sections, illustration series
    - Archives: Document set scans, correspondence series images
    - Museums: Object photography series, conservation documentation

    Entities in galleries share contextual scope from gallery title/caption.
    Individual items may have their own DOC.FIG claims nested within.

  ontology_mappings:
    primary_class: "schema:ImageGallery"
    alternative_classes:
      - "schema:Collection"
      - "crm:E78_Curated_Holding"
      - "edm:Aggregation"
      - "as:Collection"  # ActivityStreams
    linkml_mapping:
      class_uri: "schema:ImageGallery"
      close_mappings:
        - "schema:Collection"
        - "crm:E78_Curated_Holding"
      related_mappings:
        - "edm:Aggregation"

  # How a gallery is recognised in each source format.
  format_mappings:
    html:
      elements:
        - "div[class*='gallery']"
        - "div[class*='carousel']"
        - "div[class*='slider']"
        - "ul[class*='gallery']"
      aria_roles:
        - "group"
        - "listbox"
    json:
      patterns:
        - "$.gallery"
        - "$.images[]"
        - "$.media.items[]"
    pagexml:
      note: "Rare in PAGE-XML; group of consecutive ImageRegions"

  # Presentation variants.
  gallery_types:
    - type: "image_gallery"
      description: "Collection of photographs or illustrations"
      examples:
        - "museum object photos"
        - "manuscript folios"
        - "exhibition views"
    - type: "carousel"
      description: "Horizontally/vertically scrolling media set"
    - type: "slideshow"
      description: "Sequential presentation with transitions"
    - type: "lightbox"
      description: "Thumbnail grid with modal expansion"
    - type: "filmstrip"
      description: "Linear sequence of video thumbnails or stills"
    - type: "plate_section"
      description: "Bound illustration pages in printed works"
      note: "Common in historical scientific/art publications"

  gallery_properties:
    - property: "galleryTitle"
      uri: "schema:name"
      description: "Title of the gallery collection"
    - property: "itemCount"
      uri: "schema:numberOfItems"
      description: "Number of items in gallery"
    - property: "galleryItems"
      uri: "schema:hasPart"
      range: "DocumentRegion"
      description: "Individual items (DOC.FIG) within gallery"
    # NOTE(review): verify that schema:curator is a defined schema.org
    # term; the owl_mapping below is the PROV attribution property.
    - property: "curatedBy"
      uri: "schema:curator"
      owl_mapping: "prov:wasAttributedTo"
      description: "Agent who assembled the gallery"

  # Guidance for annotators and extractors (free text).
  entity_extraction_notes: |
    Galleries present special extraction challenges:
    1. Gallery TITLE provides context for all contained items
    2. Individual CAPTIONS may override gallery-level context
    3. Items may be ORDERED (narrative sequence) or UNORDERED (thematic group)
    4. Navigation elements (prev/next, thumbnails) are structural, not content

    Recommended approach:
    - Create layout claim for gallery container (DOC.GAL)
    - Create nested layout claims for each item (DOC.FIG)
    - Entity claims reference their immediate container
    - Inherit gallery context when item lacks own caption
|
||
|
||
# DOC.MAP — document region holding a map visualization. This is a
# layout region type, distinct from the GEOMETRY (GEO) entity type
# (see the definition below).
DOC.MAP:
  name: "MAP"
  code: "DOC.MAP"
  definition: |
    Cartographic or spatial representation of geographic information.
    Distinguished from generic figures by explicit spatial semantics.

    CRITICAL DISTINCTION from GEOMETRY (GEO) entity type:
    - DOC.MAP is a DOCUMENT REGION containing a map visualization
    - GEO is an ENTITY TYPE for coordinate/shape data extracted from any region

    Domain-agnostic applications:
    - Heritage: Historical maps, archaeological site plans, building floorplans
    - Web: Interactive maps, location widgets, route displays
    - Publishing: Atlas plates, thematic maps, navigation charts
    - Archives: Survey maps, cadastral records, military charts
    - Urban planning: Zoning maps, infrastructure layouts

    Entities in maps require spatial reasoning for extraction.
    Place names (TOP) may appear as labels; geometry (GEO) as shapes/points.

  ontology_mappings:
    # primary_class agrees with linkml_mapping.class_uri, as it does in the
    # sibling DOC.* entries; the generic CRM class is kept as an alternative.
    primary_class: "schema:Map"
    alternative_classes:
      - "crm:E73_Information_Object"
      - "edm:WebResource"
      - "geosparql:SpatialObject"
      - "bibo:Map"
    linkml_mapping:
      class_uri: "schema:Map"
      exact_mappings:
        - "bibo:Map"
      close_mappings:
        - "crm:E73_Information_Object"
      related_mappings:
        - "geosparql:SpatialObject"

  # How a map region is recognised in each source format.
  format_mappings:
    html:
      # The last three entries are CSS *class* selectors: the libraries put
      # these class names on the map container, they are not element names.
      elements:
        - "div[class*='map']"
        - "div[id*='map']"
        - ".leaflet-container"
        - ".mapboxgl-map"
        - ".gm-style"
      note: "Interactive maps often use Leaflet, Mapbox, Google Maps containers"
    json:
      patterns:
        - "$.map"
        - "$.geojson"
        - "$.geometry"
        - "$.features[]"
      standards:
        - "GeoRSS-independent: GeoJSON (RFC 7946)"
        - "TopoJSON"
    image_formats:
      # ".tif" added: GeoTIFF files commonly use the short TIFF extension.
      extensions:
        - ".geotiff"
        - ".tiff"
        - ".tif"
        - ".jpg"
        - ".png"
      note: "Georeferenced images contain embedded coordinate metadata"
    pagexml:
      # PAGE-XML has a dedicated MapRegion element; GraphicRegion's type
      # enumeration has no "map" value.
      type: "MapRegion"

  # Map sub-kinds.
  map_types:
    - type: "reference_map"
      description: "General purpose geographic reference"
      examples:
        - "world map"
        - "country outline"
        - "street map"
    - type: "thematic_map"
      description: "Data visualization on geographic base"
      examples:
        - "choropleth"
        - "heat map"
        - "dot distribution"
    - type: "historical_map"
      description: "Map from historical period (primary source)"
      examples:
        - "17th century nautical chart"
        - "medieval mappa mundi"
    - type: "site_plan"
      description: "Architectural or archaeological layout"
      examples:
        - "floor plan"
        - "excavation grid"
        - "campus map"
    - type: "route_map"
      description: "Navigation or journey visualization"
      examples:
        - "transit map"
        - "pilgrimage route"
        - "trade route"
    - type: "cadastral_map"
      description: "Property boundary documentation"
      examples:
        - "land survey"
        - "parcel map"
        - "deed plat"
    - type: "interactive_map"
      description: "User-manipulable web map with layers/zoom"
      examples:
        - "Leaflet widget"
        - "Google Maps embed"
        - "IIIF geo extension"

  # Sub-parts of a map region.
  map_components:
    - component: "MAP.BAS"
      name: "Base Layer"
      description: "Background geographic reference (satellite, street, terrain)"
    - component: "MAP.OVL"
      name: "Overlay Layer"
      description: "Thematic data layer on top of base"
    - component: "MAP.MRK"
      name: "Markers"
      description: "Point features (pins, icons, labels)"
    - component: "MAP.SHP"
      name: "Shapes"
      description: "Polygon/polyline features (boundaries, routes)"
    - component: "MAP.LEG"
      name: "Legend"
      description: "Symbol key and scale information"
    - component: "MAP.CTL"
      name: "Controls"
      description: "Zoom, pan, layer toggles (structural, not content)"

  map_properties:
    - property: "mapTitle"
      uri: "schema:name"
      description: "Title of the map"
    - property: "spatialCoverage"
      uri: "schema:spatialCoverage"
      description: "Geographic extent represented"
    # Scale and projection use project-namespace properties:
    # schema:contentSize denotes file size and geosparql:hasSpatialAccuracy
    # denotes positional accuracy, neither of which matches the description.
    - property: "mapScale"
      uri: "glam:mapScale"
      description: "Representative fraction or verbal scale"
    - property: "projection"
      uri: "glam:projection"
      description: "Cartographic projection used"
    - property: "temporalCoverage"
      uri: "schema:temporalCoverage"
      description: "Time period depicted (for historical maps)"

  # Guidance for annotators and extractors (free text).
  entity_extraction_notes: |
    Maps present unique extraction challenges:

    1. TOPONYMS (TOP) appear as:
    - Labels on map face
    - Legend entries
    - Title/caption text
    - Popup/tooltip content (interactive maps)

    2. GEOMETRY (GEO) data includes:
    - Point coordinates (markers)
    - Bounding boxes
    - Polygon vertices (regions, buildings)
    - Polylines (routes, rivers, borders)

    3. TEMPORAL context:
    - Historical maps show past geography (not current!)
    - Map creation date ≠ depicted time period
    - Boundary changes over time

    4. PROVENANCE considerations:
    - Georeferenced scans have transformation accuracy
    - Interactive maps have tile source attribution
    - Derived maps inherit source map provenance

    Recommended approach:
    - Create layout claim for map container (DOC.MAP)
    - Extract toponyms with map-specific confidence (label legibility)
    - Extract geometry with coordinate reference system metadata
    - Track both map creation date AND depicted time period
|
||
|
||
# ---------------------------------------------------------------------
|
||
# AUDIOVISUAL CONTENT REGIONS
|
||
# ---------------------------------------------------------------------
|
||
|
||
# DOC.AUD — audio region. Per the definition, an associated transcript
# (DOC.PAR) is the primary NER source for audio.
DOC.AUD:
  name: "AUDIO"
  code: "DOC.AUD"
  definition: |
    Audio content region (sound recording, podcast, music, oral history).
    Entities require speech-to-text or audio analysis for extraction.

    Domain-agnostic applications:
    - Heritage: Oral history recordings, ethnographic field recordings
    - Web: Podcast episodes, music players, audio articles
    - Archives: Radio broadcasts, interview recordings, speeches
    - Museums: Audio guides, soundscapes, musical instrument recordings
    - Linguistics: Language documentation, dialect samples

    Audio content often has associated TRANSCRIPT (DOC.PAR) which is the
    primary source for NER; audio itself provides prosodic/speaker metadata.

  ontology_mappings:
    primary_class: "schema:AudioObject"
    alternative_classes:
      - "crm:E73_Information_Object"
      - "edm:WebResource"
      - "premis:IntellectualEntity"
    linkml_mapping:
      class_uri: "schema:AudioObject"
      close_mappings:
        - "crm:E73_Information_Object"

  # How an audio region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - "audio"
        - "div[class*='audio-player']"
        - "div[class*='podcast']"
    json:
      patterns:
        - "$.audio"
        - "$.podcast"
        - "$.episodes[]"
        - "$.tracks[]"
    file_formats:
      extensions:
        - ".mp3"
        - ".wav"
        - ".ogg"
        - ".flac"
        - ".m4a"
        - ".aac"

  # Audio sub-kinds.
  audio_types:
    - type: "speech"
      description: "Spoken word content"
      examples:
        - "interview"
        - "lecture"
        - "oral history"
        - "podcast"
    - type: "music"
      description: "Musical performance or composition"
      examples:
        - "concert recording"
        - "album track"
        - "folk song"
    - type: "soundscape"
      description: "Environmental or ambient audio"
      examples:
        - "field recording"
        - "museum ambiance"
        - "historic sound"
    - type: "narration"
      description: "Scripted audio guide or documentary"
      examples:
        - "museum audio guide"
        - "audiobook chapter"

  audio_properties:
    - property: "duration"
      uri: "schema:duration"
      description: "Length of audio content (ISO 8601 duration)"
    - property: "transcript"
      uri: "schema:transcript"
      range: "DocumentRegion"
      description: "Text transcription of audio (DOC.PAR)"
    - property: "speaker"
      uri: "schema:actor"
      description: "Person(s) speaking in recording"
    - property: "recordingDate"
      uri: "schema:dateCreated"
      description: "When audio was recorded"
    - property: "recordingLocation"
      uri: "schema:contentLocation"
      description: "Where audio was recorded"

  # Segment codes within an audio region.
  audio_segments:
    - segment: "AUD.SPK"
      name: "Speaker Turn"
      description: "Contiguous speech by single speaker"
    - segment: "AUD.MUS"
      name: "Music Segment"
      description: "Musical interlude or background"
    - segment: "AUD.SIL"
      name: "Silence"
      description: "Intentional pause or gap"
    - segment: "AUD.SFX"
      name: "Sound Effect"
      description: "Non-speech, non-music audio event"

  # Guidance for annotators and extractors (free text).
  entity_extraction_notes: |
    Audio requires multi-modal extraction:

    1. TRANSCRIPT-BASED (primary):
    - Extract entities from associated transcript (DOC.PAR)
    - Timestamps align text spans to audio segments
    - Speaker diarization links entities to speakers

    2. AUDIO-DIRECT (secondary):
    - Named entity recognition from ASR output
    - Speaker identification (voice biometrics)
    - Language/dialect detection
    - Prosodic analysis (emphasis, emotion)

    3. METADATA (tertiary):
    - ID3 tags (music files)
    - Episode metadata (podcasts)
    - Catalog records (archives)

    Provenance must track:
    - ASR model and confidence
    - Human transcription vs. automated
    - Timestamp precision
|
||
|
||
# DOC.VID — video region; combines visual (DOC.FIG), audio (DOC.AUD)
# and textual modalities according to the definition.
DOC.VID:
  name: "VIDEO"
  code: "DOC.VID"
  definition: |
    Video content region (moving image with or without audio).
    Entities require multimodal analysis: visual, audio, and text (captions/OCR).

    Domain-agnostic applications:
    - Heritage: Documentary footage, conservation documentation, exhibition videos
    - Web: Embedded videos, live streams, video articles
    - Archives: News broadcasts, home movies, surveillance footage
    - Museums: Artist interviews, performance recordings, virtual tours
    - Education: Lecture recordings, tutorials, demonstrations

    Videos combine visual (DOC.FIG), audio (DOC.AUD), and textual modalities.
    Entities may appear in any modality and require cross-modal linking.

  ontology_mappings:
    primary_class: "schema:VideoObject"
    alternative_classes:
      - "crm:E73_Information_Object"
      - "edm:WebResource"
      - "premis:IntellectualEntity"
    linkml_mapping:
      class_uri: "schema:VideoObject"
      close_mappings:
        - "crm:E73_Information_Object"

  # How a video region is recognised in each source format.
  format_mappings:
    html:
      elements:
        - "video"
        - "iframe[src*='youtube']"
        - "iframe[src*='vimeo']"
        - "div[class*='video-player']"
    json:
      patterns:
        - "$.video"
        - "$.videos[]"
        - "$.media.video"
    file_formats:
      extensions:
        - ".mp4"
        - ".webm"
        - ".mov"
        - ".avi"
        - ".mkv"

  # Video sub-kinds.
  video_types:
    - type: "documentary"
      description: "Non-fiction narrative video"
    - type: "interview"
      description: "Recorded conversation with subject"
    - type: "performance"
      description: "Recorded artistic performance"
    - type: "instructional"
      description: "Tutorial or demonstration"
    - type: "surveillance"
      description: "Continuous monitoring footage"
    - type: "news"
      description: "Broadcast journalism content"
    - type: "archival"
      description: "Historical footage or home movies"

  video_properties:
    - property: "duration"
      uri: "schema:duration"
      description: "Length of video (ISO 8601 duration)"
    - property: "transcript"
      uri: "schema:transcript"
      range: "DocumentRegion"
      description: "Text transcription of speech"
    - property: "caption"
      uri: "schema:caption"
      range: "DocumentRegion"
      description: "Closed captions or subtitles"
    - property: "thumbnail"
      uri: "schema:thumbnail"
      description: "Representative still image"

  # Segment codes within a video region.
  video_segments:
    - segment: "VID.SCN"
      name: "Scene"
      description: "Continuous action segment"
    - segment: "VID.SHT"
      name: "Shot"
      description: "Single camera take"
    - segment: "VID.TTL"
      name: "Title Card"
      description: "Text overlay or title screen"
    - segment: "VID.CRD"
      name: "Credits"
      description: "Attribution information"
|
||
|
||
# DOC.EMB — embedded interactive region (iframes, widgets, web
# components, embedded applications).
DOC.EMB:
  name: "EMBEDDED_INTERACTIVE"
  code: "DOC.EMB"
  definition: |
    Embedded interactive content from external source or rich application.
    Includes iframes, widgets, web components, and embedded applications.

    Domain-agnostic applications:
    - Heritage: IIIF viewers, 3D model viewers, timeline widgets
    - Web: Social media embeds, code playgrounds, data visualizations
    - Publishing: Interactive charts, explorable explanations
    - Museums: Virtual tours, interactive exhibits, AR/VR content
    - Archives: Document viewers, transcription interfaces

    Entities within embedded content may require API access or headless
    browser rendering to extract; simple iframe inspection is insufficient.

  ontology_mappings:
    primary_class: "schema:WebApplication"
    alternative_classes:
      - "schema:CreativeWork"
      - "as:Application"
    linkml_mapping:
      class_uri: "schema:WebApplication"

  # How an embed is recognised in each source format.
  format_mappings:
    html:
      elements:
        - "iframe"
        - "embed"
        - "object"
        - "web-component"
        - "[data-widget]"
    json:
      patterns:
        - "$.embed"
        - "$.widget"
        - "$.interactive"

  # Embed sub-kinds.
  embed_types:
    - type: "iiif_viewer"
      description: "IIIF-compliant image viewer"
      examples:
        - "Mirador"
        - "Universal Viewer"
        - "OpenSeadragon"
      note: "Extract manifest URL for entity-rich metadata"
    - type: "3d_viewer"
      description: "3D model visualization"
      examples:
        - "Sketchfab embed"
        - "Three.js scene"
        - "WebGL viewer"
    - type: "timeline"
      description: "Temporal data visualization"
      examples:
        - "TimelineJS"
        - "Chronoline"
        - "d3 timeline"
    - type: "social_embed"
      description: "Social media post or feed"
      examples:
        - "Twitter/X embed"
        - "Instagram post"
        - "YouTube video"
    - type: "data_viz"
      description: "Interactive data visualization"
      examples:
        - "D3.js chart"
        - "Plotly graph"
        - "Tableau embed"
    - type: "code_playground"
      description: "Executable code environment"
      examples:
        - "CodePen"
        - "JSFiddle"
        - "Observable notebook"
    - type: "form"
      description: "Interactive input form"
      examples:
        - "Survey"
        - "search widget"
        - "booking form"
    - type: "virtual_tour"
      description: "360° or spatial navigation experience"
      examples:
        - "Google Street View"
        - "Matterport"
        - "museum virtual tour"

  embed_properties:
    - property: "embedSource"
      uri: "schema:embedUrl"
      description: "URL of embedded content"
    - property: "embedProvider"
      uri: "schema:provider"
      description: "Service providing the embed"
    - property: "embedType"
      uri: "glam:embedType"
      description: "Category of interactive content"
    - property: "requiresInteraction"
      uri: "glam:requiresInteraction"
      description: "Whether user action needed to reveal content"

  # Guidance for annotators and extractors (free text).
  entity_extraction_notes: |
    Embedded content presents significant extraction challenges:

    1. ACCESSIBILITY:
    - Content may be behind authentication
    - May require JavaScript execution
    - May load asynchronously
    - Cross-origin policies may block access

    2. STRATEGIES by embed type:
    - IIIF: Fetch manifest JSON for rich metadata
    - Social: Use platform APIs (Twitter, Instagram)
    - Data viz: Extract from data source if accessible
    - 3D: Parse model metadata, texture labels

    3. PROVENANCE requirements:
    - Record embed source URL
    - Record extraction timestamp (content may change)
    - Record access method (API, headless browser, etc.)
    - Note if content was inaccessible

    4. FALLBACK strategies:
    - Use surrounding context (embed caption, link text)
    - Use embed URL structure for hints
    - Use embed provider metadata
|
||
|
||
# ---------------------------------------------------------------------
|
||
# NAVIGATION AND STRUCTURE REGIONS
|
||
# ---------------------------------------------------------------------
|
||
|
||
# DOC.NAV — generic navigation region. Tables of contents and indices
# have their own types (DOC.TOC, DOC.IDX), as the definition states.
DOC.NAV:
  name: "NAVIGATION"
  definition: |
    Generic navigational elements (breadcrumbs, menus, links).
    For specific navigation structures, use dedicated types:
    - DOC.TOC for tables of contents
    - DOC.IDX for indices

  ontology_mappings:
    primary_class: "schema:SiteNavigationElement"
    linkml_mapping:
      class_uri: "schema:SiteNavigationElement"

  # Navigation sub-kinds; `html` holds CSS selectors where one is declared.
  navigation_types:
    - type: "breadcrumb"
      name: "Breadcrumb Trail"
      html: "nav[aria-label='breadcrumb'], .breadcrumb"
    - type: "menu"
      name: "Navigation Menu"
      html: "nav, menu"
    - type: "sitemap"
      name: "Site Map"
      description: "Hierarchical site structure overview"
|
||
|
||
# DOC.TOC — table-of-contents region: a hierarchical listing of document
# sections with page/location references.
DOC.TOC:
  name: "TABLE_OF_CONTENTS"
  code: "DOC.TOC"
  definition: |
    Structured listing of document sections with page/location references.
    Distinguished from general navigation by its document-internal scope
    and hierarchical structure reflecting document organization.

    Domain-agnostic applications:
    - Heritage: Manuscript tables, register indices, finding aid outlines
    - Publishing: Book TOCs, journal issue contents, report outlines
    - Web: Article outlines, documentation navigation, wiki page contents
    - Archives: Folder listings, series descriptions, container lists
    - Legal: Statute tables, case indices, contract section lists

    High-value for entity extraction: section titles often contain
    key entities (names, places, dates, topics) that scope content.

  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "schema:ItemList"
      - "crm:E73_Information_Object"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"
      close_mappings:
        - "schema:ItemList"

  # How a TOC is recognised in each source format.
  format_mappings:
    html:
      elements:
        - "nav[role='doc-toc']"
        - ".toc"
        - "#table-of-contents"
        - "ol.toc"
    pagexml:
      type: "TextRegion[@type='table-of-contents']"
    tei:
      element: "<divGen type='toc'/>"
    epub:
      element: "nav[epub:type='toc']"

  # TOC variants, including the "list of ..." front-matter listings.
  toc_types:
    - type: "main_toc"
      description: "Primary document table of contents"
    - type: "list_of_figures"
      description: "Figure/illustration listing with page numbers"
      alias: "LOF"
    - type: "list_of_tables"
      description: "Table listing with page numbers"
      alias: "LOT"
    - type: "list_of_abbreviations"
      description: "Abbreviation/acronym listing"
    - type: "list_of_maps"
      description: "Map listing with page numbers"
    - type: "list_of_plates"
      description: "Plate/illustration listing (historical)"

  toc_properties:
    - property: "tocEntry"
      uri: "schema:itemListElement"
      description: "Individual TOC entry"
    - property: "sectionTitle"
      uri: "schema:name"
      description: "Title of referenced section"
    - property: "pageReference"
      uri: "bibo:pageStart"
      description: "Page number or location reference"
    - property: "nestingLevel"
      uri: "glam:hierarchyLevel"
      description: "Depth in TOC hierarchy (1=chapter, 2=section, etc.)"

  # Guidance for annotators and extractors (free text).
  entity_extraction_notes: |
    TOC entries are HIGH-VALUE for entity extraction:

    1. SECTION TITLES often contain:
    - Person names (biographical sections)
    - Place names (geographic chapters)
    - Date ranges (chronological sections)
    - Organization names (institutional histories)

    2. HIERARCHICAL CONTEXT:
    - Parent entries scope child entries
    - "Part I: The Netherlands" → child sections are Dutch-related

    3. PAGE REFERENCES enable:
    - Linking entities to page ranges
    - Validating entity locations in document
|
||
|
||
# DOC.IDX — index region: a term-based listing with page/location
# references (contrast with the section-based DOC.TOC).
DOC.IDX:
  name: "INDEX"
  code: "DOC.IDX"
  definition: |
    Alphabetical or systematic listing of terms, names, or subjects
    with page/location references. Distinguished from TOC by its
    alphabetical organization and term-based (not section-based) structure.

    Domain-agnostic applications:
    - Heritage: Name indices, place indices, subject indices
    - Publishing: Back-of-book indices, periodical indices
    - Archives: Finding aid indices, name authority files
    - Legal: Case citation indices, statute indices
    - Academic: Author indices, keyword indices

    EXTREMELY high-value for NER: indices are curated entity lists
    with location references, essentially pre-annotated entity data.

  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "schema:ItemList"
      - "skos:Collection"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"
      related_mappings:
        - "skos:Collection"

  # How an index is recognised in each source format.
  format_mappings:
    html:
      elements:
        - "div.index"
        - "#index"
        - "section[role='doc-index']"
    pagexml:
      type: "TextRegion[@type='index']"
    tei:
      element: "<divGen type='index'/>, <index>"

  # Index variants; `entity_type` names the entity code the entries map to
  # ("mixed" for subject indices).
  index_types:
    - type: "name_index"
      description: "Personal and corporate name index"
      entity_type: "AGT"
      note: "Pre-annotated AGENT entities"
    - type: "place_index"
      description: "Geographic and place name index"
      entity_type: "TOP"
      note: "Pre-annotated TOPONYM entities"
    - type: "subject_index"
      description: "Topic and subject index"
      entity_type: "mixed"
    - type: "title_index"
      description: "Work and publication title index"
      entity_type: "WRK"
      note: "Pre-annotated WORK entities"
    - type: "chronological_index"
      description: "Date and event index"
      entity_type: "TMP"
      note: "Pre-annotated TEMPORAL entities"

  index_properties:
    - property: "indexTerm"
      uri: "skos:prefLabel"
      description: "The indexed term or name"
    - property: "pageReferences"
      uri: "bibo:pages"
      description: "Page number(s) where term appears"
    - property: "seeAlso"
      uri: "skos:related"
      description: "Cross-reference to related terms"
    - property: "subentry"
      uri: "skos:narrower"
      description: "Nested sub-entries under main term"

  # Guidance for annotators and extractors (free text).
  entity_extraction_notes: |
    Indices are GOLD STANDARD entity sources:

    1. PRE-CURATED ENTITIES:
    - Name indices = curated person/org list
    - Place indices = curated gazetteer
    - Subject indices = controlled vocabulary

    2. EXTRACTION STRATEGY:
    - Parse index entries as entity mentions
    - Use index type to assign entity class
    - Page references locate entities in text

    3. CROSS-REFERENCE VALUE:
    - "See also" links indicate entity relationships
    - Sub-entries indicate hierarchical relationships
|
||
|
||
# ---------------------------------------------------------------------
|
||
# FRONT MATTER REGIONS
|
||
# ---------------------------------------------------------------------
|
||
|
||
# DOC.TTP — title-page region: the identifying page carrying title,
# author, publisher, date and related metadata about the document itself.
DOC.TTP:
  name: "TITLE_PAGE"
  code: "DOC.TTP"
  definition: |
    Primary identifying page of a document containing title, author,
    publisher, date, and other key metadata. High-value structured
    information source.

    Domain-agnostic applications:
    - Heritage: Manuscript title pages, incunabula colophons, broadside headers
    - Publishing: Book title pages, journal covers, report covers
    - Archives: Folder titles, series title sheets
    - Legal: Document covers, contract title pages
    - Academic: Thesis title pages, paper headers

    Title pages contain STRUCTURED CLAIMS about the document itself.

  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    # NOTE(review): schema:CoverPage does not appear to be a defined
    # schema.org type — confirm or replace.
    alternative_classes:
      - "schema:CoverPage"
      - "crm:E73_Information_Object"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"

  # How a title page is recognised in each source format.
  format_mappings:
    html:
      elements:
        - ".title-page"
        - "#cover"
        - "section.frontmatter"
    pagexml:
      type: "TextRegion[@type='title-page']"
    tei:
      element: "<titlePage>"

  # Sub-parts of the title page; `ontology` is the term each part maps to.
  title_page_components:
    - component: "TTP.TTL"
      name: "Title"
      description: "Main document title"
      ontology: "dcterms:title"
    # A subtitle is a secondary title, not an abbreviated one, so it maps to
    # schema:alternativeHeadline rather than bibo:shortTitle.
    - component: "TTP.STL"
      name: "Subtitle"
      description: "Secondary title"
      ontology: "schema:alternativeHeadline"
    - component: "TTP.AUT"
      name: "Author"
      description: "Creator attribution"
      ontology: "dcterms:creator"
    - component: "TTP.PUB"
      name: "Publisher"
      description: "Publisher/printer"
      ontology: "dcterms:publisher"
    - component: "TTP.DAT"
      name: "Date"
      description: "Publication date"
      ontology: "dcterms:date"
    - component: "TTP.PLC"
      name: "Place"
      description: "Publication place"
      ontology: "dcterms:spatial"
    - component: "TTP.EDT"
      name: "Edition"
      description: "Edition statement"
      ontology: "bibo:edition"
    - component: "TTP.IMP"
      name: "Imprint"
      description: "Full publication statement"
      ontology: "bibo:Note"

  # Guidance for annotators and extractors (free text).
  entity_extraction_notes: |
    Title pages are AUTHORITATIVE entity sources:

    1. STRUCTURED CLAIMS about document:
    - Author → AGENT entity with creator role
    - Publisher → GROUP entity
    - Date → TEMPORAL entity (publication date)
    - Place → TOPONYM entity (publication place)

    2. HIGH CONFIDENCE: Title page claims are intentional,
    not incidental mentions - treat as authoritative.
|
||
|
||
# DOC.DED: dedication / epigraph / inscription region and its properties.
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
DOC.DED:
  name: "DEDICATION"
  code: "DOC.DED"
  definition: |
    Dedicatory text, epigraph, or inscription typically appearing
    in front matter. May honor a person, quote a source, or set
    thematic context for the work.

    Domain-agnostic applications:
    - Heritage: Manuscript dedications, donor inscriptions
    - Publishing: Book dedications, memorial pages
    - Archives: Gift acknowledgments, founding documents
    - Academic: Thesis acknowledgments, memorial lectures
    - Monuments: Dedicatory inscriptions, foundation stones

  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "schema:Quotation"
      - "crm:E33_Linguistic_Object"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"

  format_mappings:
    html:
      elements: [".dedication", ".epigraph", "blockquote.epigraph"]
    tei:
      element: "<div type='dedication'>, <epigraph>"

  # Sub-types of dedicatory text; each note says which entity kinds to expect.
  dedication_types:
    - type: "dedication"
      description: "Work dedicated to a person or group"
      note: "Dedicatee is an AGENT entity"
    - type: "epigraph"
      description: "Quotation setting thematic context"
      note: "May contain WORK reference (source)"
    - type: "inscription"
      description: "Physical inscription text"
      note: "Common in monuments, buildings"
    - type: "acknowledgment"
      description: "Thanks to supporters/contributors"
      note: "Multiple AGENT entities"

  dedication_properties:
    - property: "dedicatee"
      uri: "schema:recipient"
      description: "Person/group to whom work is dedicated"
    - property: "quotationSource"
      uri: "schema:isBasedOn"
      description: "Source of epigraph quotation"

# DOC.COL: colophon (production statement) region, its components, and
# entity-extraction guidance.
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
DOC.COL:
  name: "COLOPHON"
  code: "DOC.COL"
  definition: |
    Production statement typically at end of document containing
    printing/publication details: printer, date, place, technical
    specifications. Critical for bibliographic identification.

    Domain-agnostic applications:
    - Heritage: Manuscript colophons, incunabula printer statements
    - Publishing: Printer's statements, production credits
    - Archives: Processing notes, digitization metadata
    - Legal: Document certification statements
    - Digital: Software version, generation metadata

    Colophons contain AUTHORITATIVE production claims.

  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "crm:E65_Creation"
      - "prov:Activity"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"
      related_mappings:
        - "crm:E65_Creation"

  format_mappings:
    html:
      elements: [".colophon", "#colophon", "section[role='doc-colophon']"]
    pagexml:
      type: "TextRegion[@type='colophon']"
    tei:
      element: "<colophon>"

  # COL.TYP and COL.PAP carry no ontology mapping in the source.
  colophon_components:
    - component: "COL.PRN"
      name: "Printer"
      description: "Printing house or individual printer"
      ontology: "schema:printer"
    - component: "COL.DAT"
      name: "Date"
      description: "Printing/production date"
      ontology: "dcterms:created"
    - component: "COL.PLC"
      name: "Place"
      description: "Place of production"
      ontology: "dcterms:spatial"
    - component: "COL.TYP"
      name: "Typography"
      description: "Font/type information"
    - component: "COL.PAP"
      name: "Paper"
      description: "Paper/material specifications"
    - component: "COL.CPY"
      name: "Copyright"
      description: "Rights statement"
      ontology: "dcterms:rights"
    - component: "COL.EDN"
      name: "Edition"
      description: "Print run, edition number"
      ontology: "bibo:edition"

  entity_extraction_notes: |
    Colophons are CRITICAL for heritage identification:

    1. PRODUCTION AGENTS:
    - Printer → GROUP or AGENT entity
    - Publisher → GROUP entity
    - Scribe (manuscripts) → AGENT entity

    2. PRODUCTION CONTEXT:
    - Date → TEMPORAL entity
    - Place → TOPONYM entity

    3. BIBLIOGRAPHIC VALUE:
    - Often only source for incunabula dating
    - Manuscript colophons name scribes

# ---------------------------------------------------------------------
# BACK MATTER REGIONS
# ---------------------------------------------------------------------

# DOC.BIB: bibliography region. Each entry is treated as a WORK with typed
# sub-components (author, title, date, publisher, place).
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
DOC.BIB:
  name: "BIBLIOGRAPHY"
  code: "DOC.BIB"
  definition: |
    List of cited or referenced works. Each entry represents a
    WORK entity with structured bibliographic data.

    Domain-agnostic applications:
    - Heritage: Manuscript source lists, catalog references
    - Publishing: Book bibliographies, article references
    - Academic: Citation lists, literature reviews
    - Legal: Case citations, statute references
    - Archives: Related materials, provenance sources

    Bibliographies are STRUCTURED WORK entity lists.

  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "schema:ItemList"
      - "dcterms:BibliographicResource"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"

  format_mappings:
    html:
      elements: [".bibliography", ".references", "#refs", "section[role='doc-bibliography']"]
    tei:
      element: "<listBibl>, <div type='bibliography'>"

  bibliography_types:
    - type: "works_cited"
      description: "Works directly cited in text"
    - type: "further_reading"
      description: "Recommended but not cited works"
    - type: "sources"
      description: "Primary source list"
    - type: "discography"
      description: "Music recording references"
    - type: "filmography"
      description: "Film/video references"
    - type: "webography"
      description: "Web resource references"

  # entity_type holds the hypernym code the component is extracted as.
  bibliography_entry_components:
    - component: "BIB.AUT"
      name: "Author"
      entity_type: "AGT"
      ontology: "dcterms:creator"
    - component: "BIB.TTL"
      name: "Title"
      entity_type: "WRK"
      ontology: "dcterms:title"
    - component: "BIB.DAT"
      name: "Date"
      entity_type: "TMP"
      ontology: "dcterms:date"
    - component: "BIB.PUB"
      name: "Publisher"
      entity_type: "GRP"
      ontology: "dcterms:publisher"
    - component: "BIB.PLC"
      name: "Place"
      entity_type: "TOP"
      ontology: "dcterms:spatial"

  entity_extraction_notes: |
    Bibliographies are PRE-STRUCTURED entity sources:

    1. EACH ENTRY contains:
    - Author(s) → AGENT entities
    - Title → WORK entity
    - Date → TEMPORAL entity
    - Publisher → GROUP entity
    - Place → TOPONYM entity

    2. CITATION PARSING:
    - Citation format indicates field boundaries
    - High confidence due to intentional structure

# DOC.APP: appendix region (supplementary material at document end).
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
DOC.APP:
  name: "APPENDIX"
  code: "DOC.APP"
  definition: |
    Supplementary material at document end containing supporting
    data, extended discussions, or reference material too detailed
    for main text.

    Domain-agnostic applications:
    - Heritage: Document transcriptions, provenance records
    - Publishing: Data tables, extended methods, source texts
    - Academic: Statistical data, interview transcripts
    - Legal: Exhibits, supporting documents
    - Archives: Finding aid supplements, accession lists

  ontology_mappings:
    primary_class: "bibo:DocumentPart"
    alternative_classes:
      - "schema:CreativeWork"
    linkml_mapping:
      class_uri: "bibo:DocumentPart"

  format_mappings:
    html:
      elements: [".appendix", "section[role='doc-appendix']"]
    tei:
      element: "<div type='appendix'>"

  appendix_types:
    - type: "data_appendix"
      description: "Statistical or tabular data"
    - type: "document_appendix"
      description: "Reproduced primary documents"
    - type: "technical_appendix"
      description: "Methods, algorithms, specifications"
    - type: "glossary_appendix"
      description: "Extended terminology (see also DOC.GLO)"

  appendix_properties:
    - property: "appendixLabel"
      uri: "schema:name"
      description: "Appendix letter/number (A, B, C or 1, 2, 3)"
    - property: "appendixTitle"
      uri: "dcterms:title"
      description: "Descriptive title"

# DOC.GLO: glossary region. Entries map onto SKOS labels and definitions.
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
DOC.GLO:
  name: "GLOSSARY"
  code: "DOC.GLO"
  definition: |
    Alphabetical list of terms with definitions. Each entry is a
    CONCEPT with term (label) and definition (description).

    Domain-agnostic applications:
    - Heritage: Manuscript terminology, archaic word lists
    - Publishing: Technical glossaries, foreign word lists
    - Academic: Disciplinary terminology
    - Legal: Legal terms, statutory definitions
    - Archives: Archival terminology, provenance terms

    Glossaries are CONTROLLED VOCABULARY sources.

  # Unlike most sibling regions, the primary class here is skos:ConceptScheme;
  # bibo:DocumentPart is listed only as an alternative.
  ontology_mappings:
    primary_class: "skos:ConceptScheme"
    alternative_classes:
      - "schema:DefinedTermSet"
      - "bibo:DocumentPart"
    linkml_mapping:
      class_uri: "skos:ConceptScheme"

  format_mappings:
    html:
      elements: [".glossary", "dl.glossary", "section[role='doc-glossary']"]
    tei:
      element: "<list type='gloss'>, <div type='glossary'>"

  glossary_entry_components:
    - component: "GLO.TRM"
      name: "Term"
      ontology: "skos:prefLabel"
    - component: "GLO.DEF"
      name: "Definition"
      ontology: "skos:definition"
    - component: "GLO.SYN"
      name: "Synonym"
      ontology: "skos:altLabel"
    - component: "GLO.REL"
      name: "Related Term"
      ontology: "skos:related"

# ---------------------------------------------------------------------
# COMMERCIAL AND BRANDING REGIONS
# ---------------------------------------------------------------------

# DOC.ADV: advertisement region (commercial/promotional content), its
# sub-types, properties, and entity-extraction guidance.
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
DOC.ADV:
  name: "ADVERTISEMENT"
  code: "DOC.ADV"
  definition: |
    Commercial or promotional content within document. Important for
    historical documents where ads provide dating, pricing, business,
    and social context. Distinct from main editorial content.

    Domain-agnostic applications:
    - Heritage: Historical newspaper ads, trade catalog entries, broadside ads
    - Publishing: Book advertisements, periodical ads, classified sections
    - Archives: Commercial records, promotional materials
    - Web: Banner ads, sponsored content, promotional sections
    - Ephemera: Trade cards, handbills, promotional flyers

    Advertisements are RICH entity sources for historical research:
    business names, addresses, prices, products, and social attitudes.

  ontology_mappings:
    primary_class: "schema:Advertisement"
    alternative_classes:
      - "crm:E73_Information_Object"
      - "bibo:DocumentPart"
    linkml_mapping:
      class_uri: "schema:Advertisement"

  format_mappings:
    html:
      elements: [".ad", ".advertisement", "[role='complementary'][aria-label*='sponsor']", "aside.ad"]
    pagexml:
      type: "TextRegion[@type='advertisement']"
    newspaper:
      note: "Common in historical newspaper digitization"

  advertisement_types:
    - type: "display_ad"
      description: "Large format advertisement with graphics"
    - type: "classified_ad"
      description: "Text-only small advertisement"
    - type: "trade_listing"
      description: "Business directory entry"
    - type: "prospectus"
      description: "Book/publication advertisement"
    - type: "patent_medicine"
      description: "Historical medical product ads"
      note: "Common in 19th century periodicals"
    - type: "auction_notice"
      description: "Sale or auction announcement"
    - type: "legal_notice"
      description: "Required public announcements"

  # entity_type, where present, gives the hypernym code(s) for the value.
  advertisement_properties:
    - property: "advertiser"
      uri: "schema:sponsor"
      description: "Business or person placing ad"
      entity_type: "GRP or AGT"
    - property: "product"
      uri: "schema:itemAdvertised"
      description: "Product or service advertised"
    - property: "businessAddress"
      uri: "schema:address"
      description: "Advertiser's address"
      entity_type: "TOP"
    - property: "price"
      uri: "schema:price"
      description: "Advertised price"
      entity_type: "QTY"

  entity_extraction_notes: |
    Historical advertisements are TREASURE TROVES:

    1. BUSINESS ENTITIES:
    - Business names → GROUP entities
    - Proprietor names → AGENT entities
    - Business addresses → TOPONYM entities

    2. HISTORICAL VALUE:
    - Dating evidence (product availability)
    - Pricing history
    - Business locations over time
    - Social/cultural attitudes

    3. PROVENANCE:
    - Distinguish ad claims from editorial claims
    - Ads have different authority level

# DOC.LOG: logo / visual-identity-mark region (logos, printer's devices,
# watermarks, seals), its sub-types, properties, and extraction guidance.
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
DOC.LOG:
  name: "LOGO"
  code: "DOC.LOG"
  definition: |
    Visual identity marks: logos, mastheads, colophon marks, printer's
    devices, watermarks, seals, and brand identifiers. Important for
    attribution and provenance.

    Domain-agnostic applications:
    - Heritage: Printer's marks, publisher devices, watermarks, seals
    - Publishing: Publisher logos, journal mastheads, imprint marks
    - Archives: Institutional seals, letterhead logos
    - Web: Site logos, brand marks, favicons
    - Legal: Notary seals, official stamps, certification marks

    Logos identify PRODUCING AGENTS and provide provenance evidence.

  ontology_mappings:
    primary_class: "schema:ImageObject"
    alternative_classes:
      - "crm:E37_Mark"
      - "crm:E73_Information_Object"
    linkml_mapping:
      class_uri: "schema:ImageObject"
      close_mappings:
        - "crm:E37_Mark"

  format_mappings:
    html:
      elements: [".logo", "header img.logo", "[role='banner'] img", ".masthead img"]
    pagexml:
      type: "GraphicRegion[@type='logo'], GraphicRegion[@type='decoration']"

  logo_types:
    - type: "publisher_logo"
      description: "Publisher's identifying mark"
    - type: "printer_device"
      description: "Historical printer's identifying mark"
      note: "Critical for incunabula identification"
    - type: "masthead"
      description: "Newspaper/periodical title banner"
    - type: "watermark"
      description: "Paper manufacturer's mark"
      note: "Used for paper dating and provenance"
    - type: "seal"
      description: "Official or personal seal impression"
      entity_type: "AGT or GRP"
    - type: "coat_of_arms"
      description: "Heraldic device"
      entity_type: "AGT or GRP"
    - type: "colophon_mark"
      description: "Decorative mark in colophon"
    - type: "ex_libris"
      description: "Bookplate or ownership mark"
      note: "Provenance evidence for ownership history"

  logo_properties:
    - property: "logoOwner"
      uri: "schema:creator"
      description: "Entity identified by logo"
      entity_type: "GRP or AGT"
    - property: "logoDescription"
      uri: "schema:description"
      description: "Visual description of mark"
    - property: "logoReference"
      uri: "schema:isBasedOn"
      description: "Reference to mark catalog/database"

  entity_extraction_notes: |
    Logos provide PROVENANCE evidence:

    1. ATTRIBUTION:
    - Printer's devices identify producer
    - Publisher logos identify publisher
    - Watermarks date paper production

    2. OWNERSHIP HISTORY:
    - Ex libris marks trace ownership
    - Seals indicate institutional provenance

    3. VISUAL ANALYSIS required:
    - May need image matching to logo databases
    - Heraldic interpretation for coats of arms

# DOC.PGN: pagination region (page numbers, folio numbers, signature marks,
# catch words).
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
DOC.PGN:
  name: "PAGINATION"
  # `code` added for consistency with sibling entries, which all repeat
  # their mapping key as `code`.
  code: "DOC.PGN"
  definition: |
    Page numbers, folio numbers, signature marks in printed/manuscript works.

  ontology_mappings:
    primary_class: "crm:E42_Identifier"

  # `pagexml` gives the PAGE-XML region selector; "folio" has none in the source.
  pagination_types:
    - type: "page_number"
      pagexml: "TextRegion[@type='page-number']"
      description: "Arabic or Roman numeral page number"
    - type: "folio"
      description: "Leaf number with recto/verso (e.g., 23r, 23v)"
    - type: "signature_mark"
      pagexml: "TextRegion[@type='signature-mark']"
      description: "Gathering/quire identifier in manuscripts"
    - type: "catch_word"
      pagexml: "TextRegion[@type='catch-word']"
      description: "Word at page bottom matching next page start"

# DOC.BLK: block-quote region (extended quotation from another source).
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
DOC.BLK:
  name: "BLOCK_QUOTE"
  # `code` added for consistency with sibling entries, which all repeat
  # their mapping key as `code`.
  code: "DOC.BLK"
  definition: |
    Extended quotation from another source, typically indented or styled
    distinctly from surrounding text.

  ontology_mappings:
    primary_class: "schema:Quotation"
    alternative_classes:
      - "crm:E33_Linguistic_Object"

  quote_properties:
    - property: "quotedFrom"
      uri: "glam:quotedFrom"
      range: "xsd:anyURI"
      owl_mapping: "prov:wasDerivedFrom"
      description: "Source of the quotation"

# ---------------------------------------------------------------------
# METADATA AND ADMINISTRATIVE REGIONS
# ---------------------------------------------------------------------

# DOC.MTD: metadata-block region (author, date, keywords, and similar
# document-level metadata).
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
DOC.MTD:
  name: "METADATA_BLOCK"
  # `code` added for consistency with sibling entries, which all repeat
  # their mapping key as `code`.
  code: "DOC.MTD"
  definition: |
    Region containing document metadata (author, date, keywords, etc.).
    High-value for entity extraction as claims are typically structured.

    Includes: HTML <head>, document properties, front matter, colophon.

  ontology_mappings:
    primary_class: "dcterms:BibliographicResource"
    alternative_classes:
      - "schema:CreativeWork"

  # Format-specific selectors (`pagexml`, `html`) appear only where the
  # source supplies one.
  metadata_block_types:
    - type: "front_matter"
      description: "Title page, copyright, dedication"
    - type: "back_matter"
      description: "Appendices, bibliography, colophon"
    - type: "colophon"
      description: "Production details (printer, date, place)"
      pagexml: "TextRegion[@type='colophon']"
    - type: "document_head"
      html: "head"
      description: "HTML metadata section"

# DOC.ANN: annotation region (editorial additions, transcription notes),
# kept distinct from original content for provenance tracking.
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
DOC.ANN:
  name: "ANNOTATION_REGION"
  # `code` added for consistency with sibling entries, which all repeat
  # their mapping key as `code`.
  code: "DOC.ANN"
  definition: |
    Region containing annotations or markup added to document.
    Distinguished from original content for provenance tracking.

    Includes: Editorial additions, transcription notes, TEI annotations.

  ontology_mappings:
    primary_class: "oa:Annotation"
    alternative_classes:
      - "crm:E13_Attribute_Assignment"

  annotation_properties:
    - property: "annotationBody"
      uri: "oa:hasBody"
      description: "Content of the annotation"
    - property: "annotationTarget"
      uri: "oa:hasTarget"
      description: "Region being annotated"
    - property: "annotator"
      # NOTE(review): oa:annotatedBy is from the 2013 Open Annotation draft;
      # the W3C Web Annotation Vocabulary uses dcterms:creator. Confirm which
      # vocabulary version consumers expect before changing.
      uri: "oa:annotatedBy"
      owl_mapping: "prov:wasAttributedTo"
      description: "Agent who created annotation"

# -------------------------------------------------------------------------
# SEMANTIC ROLE ENUMERATION
# -------------------------------------------------------------------------

# Enumeration of the semantic roles a document region can play. Each role
# has a short code and the DOC.* region codes that typically carry it.
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
layout_semantic_roles:
  description: |
    Enumeration of semantic roles that document regions can play.
    A single region may have multiple roles.

  roles:
    - role: "PRIMARY_CONTENT"
      code: "PRIM"
      description: "Main content bearing primary information"
      typical_regions: ["DOC.PAR", "DOC.HDR", "DOC.LST"]

    - role: "SUPPLEMENTARY"
      code: "SUPP"
      description: "Additional context or metadata"
      typical_regions: ["DOC.SDB", "DOC.FTN", "DOC.CAP", "DOC.APP"]

    - role: "NAVIGATIONAL"
      code: "NAV"
      description: "Aids document navigation"
      typical_regions: ["DOC.NAV", "DOC.PGN", "DOC.TOC", "DOC.IDX"]

    - role: "STRUCTURAL"
      code: "STRC"
      description: "Defines document structure"
      typical_regions: ["DOC.HDR", "DOC.TTP"]

    - role: "REFERENTIAL"
      code: "REF"
      description: "Points to other resources"
      typical_regions: ["DOC.FTN", "DOC.BLK", "DOC.BIB"]

    - role: "VISUAL"
      code: "VIS"
      description: "Non-textual visual content (images, maps, diagrams)"
      typical_regions: ["DOC.FIG", "DOC.GAL", "DOC.MAP", "DOC.LOG"]

    - role: "AUDIOVISUAL"
      code: "AV"
      description: "Time-based media content (audio, video)"
      typical_regions: ["DOC.AUD", "DOC.VID"]

    - role: "INTERACTIVE"
      code: "INT"
      description: "User-manipulable embedded content"
      typical_regions: ["DOC.EMB", "DOC.MAP"]
      note: "Interactive maps have both VIS and INT roles"

    - role: "METADATA"
      code: "META"
      description: "Document-level metadata"
      typical_regions: ["DOC.MTD", "DOC.COL", "DOC.TTP"]

    - role: "SPATIAL"
      code: "SPAT"
      description: "Geographic or spatial representation"
      typical_regions: ["DOC.MAP"]
      note: "Distinct from VISUAL; emphasizes coordinate/location semantics"

    - role: "FRONT_MATTER"
      code: "FRNT"
      description: "Preliminary material before main content"
      typical_regions: ["DOC.TTP", "DOC.DED", "DOC.TOC"]

    - role: "BACK_MATTER"
      code: "BACK"
      description: "Material following main content"
      typical_regions: ["DOC.BIB", "DOC.IDX", "DOC.APP", "DOC.GLO", "DOC.COL"]

    - role: "PARATEXTUAL"
      code: "PARA"
      description: "Content about the document itself (not subject matter)"
      typical_regions: ["DOC.TTP", "DOC.COL", "DOC.DED", "DOC.ADV"]
      note: "Genette's paratext concept - frames the main text"

    - role: "COMMERCIAL"
      code: "COMM"
      description: "Commercial or promotional content"
      typical_regions: ["DOC.ADV", "DOC.LOG"]
      note: "Distinguish from editorial content for authority assessment"

    - role: "LEXICAL"
      code: "LEX"
      description: "Vocabulary, terminology, and definitions"
      typical_regions: ["DOC.GLO", "DOC.IDX"]

# ---------------------------------------------------------------------------
# NESTED PROVENANCE MODEL
# ---------------------------------------------------------------------------
# Entity claims reference layout claims in a nested provenance structure.
# ---------------------------------------------------------------------------

# Two-layer provenance model: a layout claim (what a region is) and entity
# claims that reference it through `layout_context`. Contains both schemas
# and two worked examples.
# NOTE(review): nesting reconstructed from key order after indentation was
# lost in extraction; confirm against the canonical file.
nested_provenance_model:

  description: |
    Entity annotations within documents require TWO LAYERS of provenance:

    LAYER 1: LAYOUT CLAIM
    - Asserts that a document region has a particular semantic role
    - Has its own provenance chain (extraction method, confidence, timestamp)
    - May be uncertain (e.g., "80% confident this is marginalia, not sidebar")

    LAYER 2: ENTITY CLAIM (nested)
    - Asserts that an entity appears within the layout region
    - References Layer 1 layout claim as context
    - Has its own provenance chain for the entity extraction

    This separation enables:
    - Different models for layout vs. entity extraction
    - Independent validation of each layer
    - Reasoning about how layout uncertainty affects entity confidence
    - Aggregation of claims by layout context

  # Layer 1 schema.
  layout_claim_schema:
    description: "Schema for layout/structural claims about document regions"

    required_fields:
      - field: "layout_claim_id"
        type: "URI"
        description: "Unique identifier for this layout claim"
        example: "https://example.org/claims/layout/doc1-region-5"

      - field: "region_type"
        type: "LayoutRegionCode"
        description: "Code from layout semantic ontology (DOC.HDR, DOC.SDB.MRG, etc.)"
        example: "DOC.SDB.MRG"

      - field: "region_path"
        type: "string"
        description: "XPath, JSONPath, or path to region"
        example: "//page:TextRegion[@id='r5']"

      - field: "source_document"
        type: "URI"
        description: "Document containing this region"
        example: "https://example.org/documents/manuscript_123.xml"

      - field: "document_format"
        type: "enum"
        values: ["PAGE-XML", "HTML", "JSON", "TEI-XML", "MARKDOWN", "PDF", "EPUB"]

      - field: "extraction_timestamp"
        type: "xsd:dateTime"
        owl_mapping: "prov:generatedAtTime"

      - field: "extraction_agent"
        type: "URI"
        description: "Model or agent that identified this region"
        owl_mapping: "prov:wasAttributedTo"

      - field: "confidence"
        type: "xsd:float"
        range: "[0.0, 1.0]"
        description: "Confidence in layout classification"

    optional_fields:
      - field: "parent_region"
        type: "URI"
        description: "Reference to containing region's layout claim"

      - field: "semantic_roles"
        type: "list[SemanticRoleCode]"
        description: "Roles this region plays (PRIM, SUPP, NAV, etc.)"

      - field: "governing_header"
        type: "URI"
        description: "Reference to header region that governs this content"

      - field: "layout_model_version"
        type: "string"
        description: "Version of layout extraction model used"

  # Layer 2 schema; `layout_context` is the link back to Layer 1.
  entity_claim_with_layout_context:
    description: "Schema for entity claims that reference layout context"

    required_fields:
      - field: "entity_claim_id"
        type: "URI"
        description: "Unique identifier for this entity claim"

      - field: "entity_type"
        type: "EntityTypeCode"
        description: "Code from entity type ontology (AGT.PER, TOP.CTY, etc.)"

      - field: "entity_text"
        type: "string"
        description: "Surface form of entity mention"
        owl_mapping: "nif:anchorOf"

      - field: "span_start"
        type: "xsd:integer"
        owl_mapping: "nif:beginIndex"

      - field: "span_end"
        type: "xsd:integer"
        owl_mapping: "nif:endIndex"

      - field: "layout_context"
        type: "URI"
        description: "Reference to layout claim (Layer 1)"
        note: "CRITICAL: Links entity to its structural context"

      - field: "extraction_timestamp"
        type: "xsd:dateTime"
        owl_mapping: "prov:generatedAtTime"

      - field: "extraction_agent"
        type: "URI"
        owl_mapping: "prov:wasAttributedTo"

      - field: "confidence"
        type: "xsd:float"
        range: "[0.0, 1.0]"

    optional_fields:
      - field: "entity_link"
        type: "URI"
        description: "Link to knowledge base entity (Wikidata, etc.)"
        owl_mapping: "itsrdf:taIdentRef"

      - field: "entity_link_confidence"
        type: "xsd:float"
        # `range` added to match the two other confidence fields in this model.
        range: "[0.0, 1.0]"
        description: "Confidence in entity linking (separate from NER confidence)"

      - field: "inherited_context"
        type: "object"
        description: "Context inherited from governing header"
        schema:
          temporal_scope: "xsd:string"
          spatial_scope: "xsd:string"
          topic_scope: "xsd:string"

  example_nested_claims:
    description: "Example showing nested layout and entity claims"

    layout_claim:
      layout_claim_id: "https://example.org/claims/layout/ms123-marginalia-1"
      region_type: "DOC.SDB.MRG"
      region_path: "//page:TextRegion[@id='r5'][@type='marginalia-left']"
      source_document: "https://example.org/documents/manuscript_123.xml"
      document_format: "PAGE-XML"
      extraction_timestamp: "2025-12-02T14:30:00Z"
      extraction_agent: "https://example.org/agents/layout-model-v2.1"
      confidence: 0.92
      semantic_roles: ["SUPP", "META"]
      parent_region: "https://example.org/claims/layout/ms123-page-1"

    # span_end - span_start (21 - 12) equals the length of "anno 1642".
    entity_claim:
      entity_claim_id: "https://example.org/claims/entity/ms123-date-1"
      entity_type: "TMP.DAB"
      entity_text: "anno 1642"
      span_start: 12
      span_end: 21
      layout_context: "https://example.org/claims/layout/ms123-marginalia-1"
      extraction_timestamp: "2025-12-02T14:31:00Z"
      extraction_agent: "https://example.org/agents/ner-model-v3.0"
      confidence: 0.88
      inherited_context:
        note: "Date in marginalia - may indicate when reader annotated, not content date"

  example_wikipedia_infobox:
    description: |
      Complex real-world example: Extracting structured data from a Wikipedia
      infobox. Shows how layout uncertainty affects entity confidence and how
      multiple entities in one region share a common layout claim parent.

    # First, the layout claim about the infobox region itself
    layout_claim:
      layout_claim_id: "https://example.org/claims/layout/wiki-rijksmuseum-infobox"
      region_type: "DOC.SDB.IBX"
      region_path: "//table[@class='infobox vcard']"
      source_document: "https://en.wikipedia.org/wiki/Rijksmuseum"
      document_format: "HTML"
      extraction_timestamp: "2025-12-02T16:00:00Z"
      extraction_agent: "https://example.org/agents/html-layout-detector-v1.0"
      confidence: 0.98
      semantic_roles: ["SUPP", "META"]
      parent_region: null
      note: |
        High confidence because Wikipedia infoboxes have consistent class names.
        This is SUPPLEMENTARY content (not main article) providing METADATA.

    # Entity claims nested within this layout region
    entity_claims:
      - entity_claim_id: "https://example.org/claims/entity/rijksmuseum-founding-date"
        entity_type: "TMP.DAB"
        entity_text: "1800"
        span_start: 342
        span_end: 346
        layout_context: "https://example.org/claims/layout/wiki-rijksmuseum-infobox"
        extraction_timestamp: "2025-12-02T16:01:00Z"
        extraction_agent: "https://example.org/agents/ner-temporal-v2.0"
        confidence: 0.95
        entity_link: "https://www.wikidata.org/entity/Q190804"
        entity_link_confidence: 0.92
        property_path: "//table[@class='infobox vcard']//th[text()='Established']/following-sibling::td"
        property_label: "Established"
        note: |
          Property extracted from infobox row. The label "Established" provides
          semantic context that this is a founding date, not just any date.

      - entity_claim_id: "https://example.org/claims/entity/rijksmuseum-location"
        entity_type: "TOP.CTY"
        entity_text: "Amsterdam"
        span_start: 128
        span_end: 137
        layout_context: "https://example.org/claims/layout/wiki-rijksmuseum-infobox"
        extraction_timestamp: "2025-12-02T16:01:00Z"
        extraction_agent: "https://example.org/agents/ner-spatial-v2.0"
        confidence: 0.99
        entity_link: "https://www.wikidata.org/entity/Q727"
        entity_link_confidence: 0.99
        property_path: "//table[@class='infobox vcard']//th[text()='Location']/following-sibling::td"
        property_label: "Location"

      - entity_claim_id: "https://example.org/claims/entity/rijksmuseum-name"
        entity_type: "GRP.ORG"
        entity_text: "Rijksmuseum"
        span_start: 0
        span_end: 11
        layout_context: "https://example.org/claims/layout/wiki-rijksmuseum-infobox"
        extraction_timestamp: "2025-12-02T16:01:00Z"
        extraction_agent: "https://example.org/agents/ner-org-v2.0"
        confidence: 0.99
        entity_link: "https://www.wikidata.org/entity/Q190804"
        entity_link_confidence: 0.99
        property_path: "//table[@class='infobox vcard']//caption"
        property_label: "infobox_title"

    provenance_chain_explanation: |
      This example demonstrates the TWO-LAYER provenance model:

      LAYER 1 (Layout Claim):
      - Claims the table is an infobox (DOC.SDB.IBX)
      - Has its own confidence (0.98) and agent (html-layout-detector-v1.0)
      - If layout is misclassified, ALL nested entity claims are affected

      LAYER 2 (Entity Claims):
      - Each entity references the layout claim via layout_context
      - Each entity has its OWN confidence separate from layout confidence
      - Entity linking confidence is SEPARATE from NER confidence

      Combined confidence calculation:
      - P(entity correct) = P(layout correct) × P(NER correct | layout)
      - P("1800" is Rijksmuseum founding) ≈ 0.98 × 0.95 = 0.931

      If layout claim is later corrected (e.g., this wasn't actually an infobox),
      all entity claims that referenced it can be invalidated or reprocessed.

# ---------------------------------------------------------------------------
|
||
# Structural Context for Entity Clustering
|
||
# ---------------------------------------------------------------------------
|
||
structural_contexts:
|
||
|
||
header_contexts:
|
||
description: |
|
||
Headers (h1-h6, TextRegion[@type='header'], JSON keys) establish scope
|
||
for subsequent content. Entities in headers often:
|
||
- Name the TOPIC of the section
|
||
- Establish the SUBJECT of subsequent predicates
|
||
- Define the CONTEXT for entity resolution
|
||
|
||
examples:
|
||
- context: "## Board of Directors"
|
||
implication: "Subsequent person names are likely board members"
|
||
- context: "<heading>1580-1620: The Golden Age</heading>"
|
||
implication: "Subsequent dates are within this period context"
|
||
- context: "{ \"museums\": { \"amsterdam\": { ... } } }"
|
||
implication: "Nested entities are Amsterdam museums"
|
||
|
||
annotation_rules:
|
||
- rule: "TAG_HEADER_ENTITIES"
|
||
description: |
|
||
Entities in headers MUST be tagged with structural role metadata:
|
||
- header_level: 1-6 (depth in hierarchy)
|
||
- governs_region: XPath/JSONPath to governed content
|
||
- scope_type: TOPIC | TEMPORAL | SPATIAL | ORGANIZATIONAL
|
||
|
||
- rule: "PROPAGATE_HEADER_CONTEXT"
|
||
description: |
|
||
Claims about entities in governed content inherit header context:
|
||
- A person in "Board of Directors" section gets implicit org:memberOf
|
||
- A date in "1580-1620" section gets implicit temporal bounding
|
||
|
||
paragraph_contexts:
|
||
description: |
|
||
Paragraphs (p, TextRegion[@type='paragraph'], text values) contain
|
||
the primary content. Entities in paragraphs:
|
||
- Co-occur with other entities in same paragraph (potential relationships)
|
||
- Fall under scope of governing headers
|
||
- May reference entities from previous paragraphs (anaphora)
|
||
|
||
annotation_rules:
|
||
- rule: "PARAGRAPH_CLUSTERING"
|
||
description: |
|
||
Entities within the same paragraph form a CO-OCCURRENCE CLUSTER.
|
||
Co-occurrence is a weak signal of relationship, strengthened by:
|
||
- Syntactic proximity (same sentence > adjacent sentences)
|
||
- Shared predicate (both objects of same verb)
|
||
- List membership (items in same enumeration)
|
||
|
||
- rule: "CROSS_PARAGRAPH_REFERENCE"
|
||
description: |
|
||
Pronouns and definite references ("the museum", "he") resolve to
|
||
antecedents in prior paragraphs within the same section scope.
|
||
Track paragraph index for resolution distance weighting.
|
||
|
||
list_contexts:
|
||
description: |
|
||
Lists (ul/ol, enumerations, bullet points) indicate PARALLEL STRUCTURE.
|
||
List items share a common type or relationship to a parent concept.
|
||
|
||
annotation_rules:
|
||
- rule: "LIST_ITEM_PARALLELISM"
|
||
description: |
|
||
Entities in sibling list items likely share:
|
||
- Same entity TYPE (all museums, all dates, all people)
|
||
- Same RELATIONSHIP to list parent (all "member of", all "located in")
|
||
- Parallel STRUCTURE (if first item has date, others likely do too)
|
||
|
||
- rule: "LIST_HEADER_INHERITANCE"
|
||
description: |
|
||
List items inherit context from the text introducing the list.
|
||
"The following museums are members:" + <li>Rijksmuseum</li>
|
||
implies Rijksmuseum has membership relationship.
|
||
|
||
sidebar_contexts:
|
||
description: |
|
||
Sidebars, asides, marginalia, infoboxes contain SUPPLEMENTARY information.
|
||
Entities in sidebars:
|
||
- Provide metadata ABOUT the main content
|
||
- May contain structured data (birth dates, locations, identifiers)
|
||
- Have different reliability/authority than main narrative
|
||
|
||
annotation_rules:
|
||
- rule: "SIDEBAR_METADATA_EXTRACTION"
|
||
description: |
|
||
Sidebar content often contains STRUCTURED CLAIMS suitable for
|
||
direct property extraction:
|
||
- Infobox fields → claim properties
|
||
- Marginalia dates → temporal metadata
|
||
- Caption text → aboutness relationships
|
||
|
||
- rule: "SIDEBAR_MAIN_LINKING"
|
||
description: |
|
||
Link sidebar entities to main content entities when co-referential.
|
||
The sidebar "Born: 1606" links to the main text "Rembrandt".
|
||
|
||
caption_contexts:
|
||
description: |
|
||
Captions (figcaption, TextRegion[@type='caption']) describe VISUAL content.
|
||
Entities in captions:
|
||
- Describe depicted subjects (people, places, objects)
|
||
- Provide dates/locations for the depicted scene
|
||
- May differ from main text (image of one thing, text about another)
|
||
|
||
annotation_rules:
|
||
- rule: "CAPTION_VISUAL_BINDING"
|
||
description: |
|
||
Caption entities are ABOUT the associated figure/image.
|
||
This is distinct from main text co-occurrence.
|
||
Use relationship: schema:about with target: figure URI.
|
||
|
||
- rule: "CAPTION_PROVENANCE"
|
||
description: |
|
||
Captions may have different authorship/dates than main text.
|
||
Track caption-specific provenance when available.
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Format-Specific Path Conventions
|
||
# ---------------------------------------------------------------------------
|
||
format_path_conventions:
|
||
|
||
page_xml:
|
||
description: |
|
||
PAGE-XML (used for historical manuscript transcription) organizes content
|
||
into TextRegions with type attributes. Critical for manuscript NER.
|
||
|
||
namespace: "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
|
||
prefix: "page"
|
||
|
||
text_region_types:
|
||
# Maps the PAGE-XML TextRegion type "header" to the section-heading role.
# NOTE(review): in the PAGE 2019-07-15 schema, section headings appear to be
# TextRegion type "heading", while "header" denotes a running page header.
# Confirm which type is intended before relying on this mapping.
- type: "header"
|
||
semantic_role: "section_heading"
|
||
path_example: "//page:TextRegion[@type='header'][1]"
|
||
- type: "paragraph"
|
||
semantic_role: "body_content"
|
||
path_example: "//page:TextRegion[@type='paragraph'][3]"
|
||
- type: "marginalia-left"
|
||
semantic_role: "supplementary_note"
|
||
path_example: "//page:TextRegion[@type='marginalia-left'][1]"
|
||
- type: "marginalia-right"
|
||
semantic_role: "supplementary_note"
|
||
path_example: "//page:TextRegion[@type='marginalia-right'][1]"
|
||
- type: "caption"
|
||
semantic_role: "figure_description"
|
||
path_example: "//page:TextRegion[@type='caption'][1]"
|
||
- type: "page-number"
|
||
semantic_role: "pagination"
|
||
path_example: "//page:TextRegion[@type='page-number'][1]"
|
||
- type: "signature-mark"
|
||
semantic_role: "gathering_identifier"
|
||
path_example: "//page:TextRegion[@type='signature-mark'][1]"
|
||
- type: "catch-word"
|
||
semantic_role: "gathering_continuity"
|
||
path_example: "//page:TextRegion[@type='catch-word'][1]"
|
||
- type: "table"
|
||
semantic_role: "structured_data"
|
||
path_example: "//page:TextRegion[@type='table'][1]"
|
||
|
||
annotation_output:
|
||
description: "Include PAGE-XML path in entity provenance"
|
||
example:
|
||
entity: "Rembrandt"
|
||
span_start: 145
|
||
span_end: 154
|
||
page_xml_path: "//page:Page[@imageFilename='folio_23r.jpg']/page:TextRegion[@id='r1']/page:TextLine[@id='l1']/page:Word[@id='w3']"
|
||
text_region_type: "paragraph"
|
||
text_region_id: "r1"
|
||
|
||
html:
|
||
description: |
|
||
HTML documents use semantic elements for structure. Critical for web NER.
|
||
|
||
semantic_elements:
|
||
headers:
|
||
elements: ["h1", "h2", "h3", "h4", "h5", "h6"]
|
||
semantic_role: "section_heading"
|
||
hierarchy: "h1 > h2 > h3 > h4 > h5 > h6"
|
||
|
||
body_content:
|
||
elements: ["p", "div", "span", "article", "section"]
|
||
semantic_role: "primary_content"
|
||
|
||
supplementary:
|
||
elements: ["aside", "nav", "footer", "header"]
|
||
semantic_role: "supplementary_content"
|
||
|
||
figures:
|
||
elements: ["figure", "figcaption", "img"]
|
||
semantic_role: "visual_content"
|
||
|
||
lists:
|
||
elements: ["ul", "ol", "li", "dl", "dt", "dd"]
|
||
semantic_role: "enumerated_content"
|
||
|
||
tables:
|
||
elements: ["table", "thead", "tbody", "tr", "th", "td"]
|
||
semantic_role: "structured_data"
|
||
|
||
metadata:
|
||
elements: ["meta", "title", "head"]
|
||
semantic_role: "document_metadata"
|
||
|
||
# Illustrative XPath patterns for locating entity-bearing nodes in HTML.
xpath_conventions:
  - pattern: "//article/section[2]/h2[1]"
    description: "Second section's heading in article"
  # Infobox rows keep the label ("Born") in the <th> and the value in the
  # sibling <td>; select the value cell, not a cell containing the label.
  - pattern: "//table[@class='infobox']//th[contains(text(),'Born')]/following-sibling::td"
    description: "Birth date cell in Wikipedia-style infobox"
  - pattern: "//aside//a[@href]"
    description: "Links in sidebar content"
|
||
|
||
# Example provenance record for an entity located in HTML body content.
annotation_output:
  example:
    entity: "Rijksmuseum"
    span_start: 2341
    span_end: 2352
    xpath: "/html/body/main/article/section[3]/p[2]"
    # :nth-of-type mirrors XPath's per-element positional predicates
    # (section[3], p[2]). :nth-child would also count the sibling <h2>
    # referenced by parent_header_path, selecting a different node.
    css_selector: "article > section:nth-of-type(3) > p:nth-of-type(2)"
    semantic_context: "body_content"
    parent_header: "Dutch Museums"
    parent_header_path: "/html/body/main/article/section[3]/h2"
|
||
|
||
json:
|
||
description: |
|
||
JSON documents use key paths for structure. Critical for API response NER.
|
||
|
||
path_notation: "JSONPath (RFC 9535) or dot notation"
|
||
|
||
common_patterns:
|
||
- pattern: "$.results[*].name"
|
||
description: "Name field in array of results"
|
||
- pattern: "$.data.institution.address.city"
|
||
description: "Nested city within institution data"
|
||
- pattern: "$['@context']"
|
||
description: "JSON-LD context (for namespace resolution)"
|
||
- pattern: "$.metadata.creator"
|
||
description: "Document creator in metadata block"
|
||
|
||
semantic_inference:
|
||
description: |
|
||
JSON keys often encode semantic roles:
|
||
- "name", "title", "label" → entity names
|
||
- "date", "created", "founded" → temporal properties
|
||
- "location", "address", "place" → spatial properties
|
||
- "author", "creator", "owner" → agent relationships
|
||
- "type", "category", "class" → classification
|
||
|
||
annotation_output:
|
||
example:
|
||
entity: "Amsterdam"
|
||
json_path: "$.data.museums[0].location.city"
|
||
array_index: 0
|
||
parent_key: "location"
|
||
root_key: "museums"
|
||
|
||
plain_text:
|
||
description: |
|
||
Plain text lacks structural markup. Use line/paragraph detection heuristics.
|
||
|
||
structure_detection:
|
||
- heuristic: "BLANK_LINE_PARAGRAPH"
|
||
description: "Two consecutive newlines indicate paragraph break"
|
||
- heuristic: "INDENTATION_STRUCTURE"
|
||
description: "Consistent indentation may indicate hierarchy"
|
||
- heuristic: "CAPITALIZATION_HEADERS"
|
||
description: "ALL CAPS or Title Case lines may be headers"
|
||
- heuristic: "ENUMERATION_LISTS"
|
||
description: "Lines starting with numbers/bullets are list items"
|
||
|
||
annotation_output:
|
||
description: "Use character offsets with paragraph/line indices"
|
||
example:
|
||
entity: "Dr. Jan de Wit"
|
||
char_start: 1456
|
||
char_end: 1470
|
||
line_number: 23
|
||
paragraph_index: 5
|
||
inferred_section: "unknown"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Entity Clustering by Structural Path
|
||
# ---------------------------------------------------------------------------
|
||
clustering_strategies:
|
||
|
||
path_prefix_clustering:
|
||
description: |
|
||
Entities sharing a common path prefix belong to the same structural unit
|
||
and should be clustered for co-reference resolution and relationship extraction.
|
||
|
||
algorithm:
|
||
- step: 1
|
||
action: "Extract path for each entity annotation"
|
||
- step: 2
|
||
action: "Compute longest common prefix (LCP) for entity pairs"
|
||
- step: 3
|
||
action: "Cluster entities where LCP depth >= threshold"
|
||
- step: 4
|
||
action: "Within clusters, resolve co-references before cross-cluster"
|
||
|
||
example:
|
||
entities:
|
||
- name: "Rembrandt"
|
||
path: "/article/section[2]/p[1]"
|
||
- name: "Saskia"
|
||
path: "/article/section[2]/p[1]"
|
||
- name: "the painter"
|
||
path: "/article/section[2]/p[2]"
|
||
- name: "Vermeer"
|
||
path: "/article/section[3]/p[1]"
|
||
|
||
clusters:
|
||
- cluster_id: 1
|
||
path_prefix: "/article/section[2]"
|
||
entities: ["Rembrandt", "Saskia", "the painter"]
|
||
note: "Co-reference: 'the painter' likely refers to Rembrandt"
|
||
- cluster_id: 2
|
||
path_prefix: "/article/section[3]"
|
||
entities: ["Vermeer"]
|
||
note: "Separate section, no co-reference with cluster 1"
|
||
|
||
hierarchical_inheritance:
|
||
description: |
|
||
Properties from ancestor nodes propagate to descendant entities.
|
||
Header entities establish context inherited by paragraph entities.
|
||
|
||
inheritance_rules:
|
||
- rule: "TEMPORAL_SCOPE"
|
||
description: "Date range in header bounds dates in body"
|
||
example:
|
||
header: "The 17th Century (1600-1699)"
|
||
body_date: "1642"
|
||
inherited_context: "temporal_scope: 1600-1699"
|
||
|
||
- rule: "SPATIAL_SCOPE"
|
||
description: "Location in header provides default for body entities"
|
||
example:
|
||
header: "Museums in Amsterdam"
|
||
body_org: "Rijksmuseum"
|
||
inherited_context: "default_location: Amsterdam"
|
||
|
||
- rule: "TOPIC_SCOPE"
|
||
description: "Subject in header provides aboutness for body"
|
||
example:
|
||
header: "Rembrandt van Rijn"
|
||
body_text: "He painted The Night Watch"
|
||
inherited_context: "pronoun_antecedent: Rembrandt van Rijn"
|
||
|
||
cross_reference_resolution:
|
||
description: |
|
||
Resolve pronouns and definite references using structural proximity.
|
||
|
||
resolution_priority:
|
||
- priority: 1
|
||
scope: "same_sentence"
|
||
description: "Check for antecedent in same sentence"
|
||
- priority: 2
|
||
scope: "same_paragraph"
|
||
description: "Check preceding sentences in paragraph"
|
||
- priority: 3
|
||
scope: "same_section"
|
||
description: "Check preceding paragraphs in section"
|
||
- priority: 4
|
||
scope: "section_header"
|
||
description: "Check governing section header"
|
||
- priority: 5
|
||
scope: "document"
|
||
description: "Check document-level prominent entities"
|
||
|
||
example:
|
||
text: |
|
||
## Rembrandt van Rijn
|
||
|
||
The Dutch painter was born in Leiden. He moved to Amsterdam in 1631.
|
||
His most famous work is The Night Watch.
|
||
|
||
resolutions:
|
||
- pronoun: "The Dutch painter"
|
||
antecedent: "Rembrandt van Rijn"
|
||
resolution_scope: "section_header"
|
||
- pronoun: "He"
|
||
antecedent: "'The Dutch painter' -> 'Rembrandt van Rijn'"
|
||
resolution_scope: "same_paragraph"
|
||
- pronoun: "His"
|
||
antecedent: "'He' -> 'Rembrandt van Rijn'"
|
||
resolution_scope: "same_paragraph"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Provenance Path Requirements
|
||
# ---------------------------------------------------------------------------
|
||
provenance_requirements:
|
||
|
||
mandatory_path_fields:
|
||
description: |
|
||
ALL entity annotations MUST include path information for provenance.
|
||
This enables verification, reproducibility, and precise citation.
|
||
|
||
required_fields:
|
||
- field: "source_document_uri"
|
||
type: "URI"
|
||
description: "Identifier for the source document"
|
||
example: "https://example.org/documents/manuscript_123.xml"
|
||
|
||
- field: "document_format"
|
||
type: "enum"
|
||
values: ["PAGE-XML", "HTML", "JSON", "TEI-XML", "PLAIN_TEXT", "PDF"]
|
||
description: "Format of source document (determines path syntax)"
|
||
|
||
- field: "structural_path"
|
||
type: "string"
|
||
description: "XPath, JSONPath, or character offset path to entity"
|
||
examples:
|
||
- "//page:TextRegion[@id='r5']/page:TextLine[@id='l3']"
|
||
- "/html/body/article/section[2]/p[1]"
|
||
- "$.data.institutions[3].name"
|
||
- "char:1456-1470;line:23;para:5"
|
||
|
||
- field: "structural_context"
|
||
type: "enum"
|
||
values:
|
||
- "header"
|
||
- "paragraph"
|
||
- "list_item"
|
||
- "table_cell"
|
||
- "caption"
|
||
- "sidebar"
|
||
- "marginalia"
|
||
- "footnote"
|
||
- "metadata"
|
||
description: "Semantic role of containing structure"
|
||
|
||
- field: "governing_header"
|
||
type: "object"
|
||
optional: true
|
||
description: "Path and text of governing header (if applicable)"
|
||
schema:
|
||
path: "string"
|
||
text: "string"
|
||
level: "integer"
|
||
|
||
nif_context_alignment:
|
||
description: |
|
||
Path information aligns with NIF Context for annotation interchange.
|
||
|
||
nif_properties:
|
||
- property: "nif:beginIndex"
|
||
maps_to: "span_start (character offset)"
|
||
- property: "nif:endIndex"
|
||
maps_to: "span_end (character offset)"
|
||
- property: "nif:sourceUrl"
|
||
maps_to: "source_document_uri"
|
||
- property: "nif:wasConvertedFrom"
|
||
maps_to: "structural_path (as provenance)"
|
||
|
||
example_nif_output:
|
||
"@context": "http://persistence.uni-leipzig.org/nlp2rdf/contexts/nif-2.0.json"
|
||
"@id": "https://example.org/doc#char=145,154"
|
||
"@type": "nif:String"
|
||
"nif:anchorOf": "Rembrandt"
|
||
"nif:beginIndex": 145
|
||
"nif:endIndex": 154
|
||
"nif:referenceContext": "https://example.org/doc#char=0,5000"
|
||
"glam:structuralPath": "//page:TextRegion[@id='r1']/page:TextLine[@id='l1']"
|
||
"glam:structuralContext": "paragraph"
|
||
"glam:governingHeader":
|
||
path: "//page:TextRegion[@id='header1']"
|
||
text: "Dutch Golden Age Artists"
|
||
level: 2
|
||
|
||
# =============================================================================
|
||
# SECTION 16: RELATIONSHIP ANNOTATIONS
|
||
# =============================================================================
|
||
#
|
||
# This section defines how entities relate to each other through typed predicates.
|
||
# Relationships are first-class annotations with their own provenance, enabling
|
||
# knowledge graph construction from annotated documents.
|
||
#
|
||
# Ontological foundations:
|
||
# - CIDOC-CRM: CRMbase properties (P1-P198) for cultural heritage
|
||
# - RDF/OWL: Standard predicate semantics
|
||
# - Schema.org: Web-friendly relationship vocabulary
|
||
# - FRBR/FRBRoo: Work-Expression-Manifestation-Item relationships
|
||
# - RiC-O: Archival relationship properties
|
||
# - Bio CRM: Biographical relationships (birth, death, marriage, etc.)
|
||
# - Wikidata: P-properties for broad coverage
|
||
#
|
||
# Design principles:
|
||
# 1. Relationships are DIRECTIONAL (subject → predicate → object)
|
||
# 2. Relationships have PROVENANCE (who asserted, when, confidence)
|
||
# 3. Relationships can be TEMPORAL (valid during time period)
|
||
# 4. Relationships can be NESTED (relationship about relationship)
|
||
# 5. Relationships support REIFICATION (statements about statements)
|
||
# =============================================================================
|
||
|
||
relationship_annotations:
|
||
|
||
description: |
|
||
Relationship annotations capture semantic links between entities within and
|
||
across documents. Unlike entity annotations which identify spans of text,
|
||
relationship annotations connect two or more entity annotations via typed
|
||
predicates drawn from established ontologies.
|
||
|
||
Key distinction from coreference:
|
||
- COREFERENCE: Two mentions refer to the SAME entity (identity)
|
||
- RELATIONSHIP: Two DIFFERENT entities have a semantic connection
|
||
|
||
Example:
|
||
- Coreference: "Rembrandt" and "the painter" → same person
|
||
- Relationship: "Rembrandt" created "The Night Watch" → two entities linked
|
||
|
||
namespaces:
|
||
# Core relationship vocabularies
|
||
crm: "http://www.cidoc-crm.org/cidoc-crm/"
|
||
crmsci: "http://www.cidoc-crm.org/extensions/crmsci/"
|
||
frbroo: "http://iflastandards.info/ns/fr/frbr/frbroo/"
|
||
rico: "https://www.ica.org/standards/RiC/ontology#"
|
||
rel: "http://purl.org/vocab/relationship/"
|
||
bio: "http://purl.org/vocab/bio/0.1/"
|
||
schema: "http://schema.org/"
|
||
wdt: "http://www.wikidata.org/prop/direct/"
|
||
dbo: "http://dbpedia.org/ontology/"
|
||
gndo: "https://d-nb.info/standards/elementset/gnd#"
|
||
edm: "http://www.europeana.eu/schemas/edm/"
|
||
dc: "http://purl.org/dc/elements/1.1/"
|
||
dcterms: "http://purl.org/dc/terms/"
|
||
prov: "http://www.w3.org/ns/prov#"
|
||
oa: "http://www.w3.org/ns/oa#"
|
||
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||
rdfs: "http://www.w3.org/2000/01/rdf-schema#"
|
||
owl: "http://www.w3.org/2002/07/owl#"
|
||
skos: "http://www.w3.org/2004/02/skos/core#"
|
||
time: "http://www.w3.org/2006/time#"
|
||
geo: "http://www.opengis.net/ont/geosparql#"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# RELATIONSHIP HYPERNYM TAXONOMY
|
||
# ---------------------------------------------------------------------------
|
||
|
||
relationship_hypernyms:
|
||
description: |
|
||
Top-level categorization of relationship types. Each hypernym groups
|
||
semantically related predicates and maps to established ontologies.
|
||
|
||
hypernyms:
|
||
|
||
# -------------------------------------------------------------------------
|
||
# REL.CRE - Creation/Production Relationships
|
||
# -------------------------------------------------------------------------
|
||
- code: "REL.CRE"
|
||
label: "Creation"
|
||
description: |
|
||
Relationships involving the creation, production, or authorship of
|
||
works, objects, or expressions. Central to cultural heritage.
|
||
# Ontology predicates that the REL.CRE (Creation) hypernym aligns with.
ontology_mappings:
  - uri: "crm:P14_carried_out_by"
    label: "carried out by"
    inverse: "crm:P14i_performed"
  - uri: "crm:P94_has_created"
    label: "has created"
    inverse: "crm:P94i_was_created_by"
  - uri: "schema:creator"
    label: "creator"
  - uri: "schema:author"
    label: "author"
  - uri: "dc:creator"
    label: "creator"
  # FRBRoo R17 links an F28 Expression Creation to the F2 Expression it
  # produced, so the label names the Expression level, not the Work.
  - uri: "frbroo:R17_created"
    label: "created (FRBR Expression)"
|
||
|
||
hyponyms:
|
||
- code: "REL.CRE.AUT"
|
||
label: "Authored"
|
||
description: "Agent authored a textual work"
|
||
predicates:
|
||
- uri: "schema:author"
|
||
- uri: "dcterms:creator"
|
||
- uri: "wdt:P50"
|
||
wikidata_label: "author"
|
||
domain: ["AGT.PER", "AGT.GRP"]
|
||
range: ["WRK.TXT", "WRK.WRK"]
|
||
|
||
- code: "REL.CRE.ART"
|
||
label: "Created (Artistic)"
|
||
description: "Agent created an artistic work"
|
||
predicates:
|
||
- uri: "crm:P14_carried_out_by"
|
||
- uri: "wdt:P170"
|
||
wikidata_label: "creator"
|
||
domain: ["AGT.PER", "AGT.GRP"]
|
||
range: ["WRK.VIS", "WRK.WRK"]
|
||
|
||
- code: "REL.CRE.COM"
|
||
label: "Composed"
|
||
description: "Agent composed a musical work"
|
||
predicates:
|
||
- uri: "schema:composer"
|
||
- uri: "wdt:P86"
|
||
wikidata_label: "composer"
|
||
domain: ["AGT.PER"]
|
||
range: ["WRK.MUS"]
|
||
|
||
- code: "REL.CRE.DIR"
|
||
label: "Directed"
|
||
description: "Agent directed a performance or film"
|
||
predicates:
|
||
- uri: "schema:director"
|
||
- uri: "wdt:P57"
|
||
wikidata_label: "director"
|
||
domain: ["AGT.PER"]
|
||
range: ["WRK.PER", "WRK.CIN"]
|
||
|
||
- code: "REL.CRE.EDT"
|
||
label: "Edited"
|
||
description: "Agent edited a work"
|
||
predicates:
|
||
- uri: "schema:editor"
|
||
- uri: "wdt:P98"
|
||
wikidata_label: "editor"
|
||
domain: ["AGT.PER", "AGT.GRP"]
|
||
range: ["WRK.TXT", "WRK.WRK"]
|
||
|
||
- code: "REL.CRE.TRN"
|
||
label: "Translated"
|
||
description: "Agent translated a work"
|
||
predicates:
|
||
- uri: "schema:translator"
|
||
- uri: "wdt:P655"
|
||
wikidata_label: "translator"
|
||
domain: ["AGT.PER"]
|
||
range: ["WRK.TXT"]
|
||
|
||
- code: "REL.CRE.ILL"
|
||
label: "Illustrated"
|
||
description: "Agent illustrated a work"
|
||
predicates:
|
||
- uri: "schema:illustrator"
|
||
- uri: "wdt:P110"
|
||
wikidata_label: "illustrator"
|
||
domain: ["AGT.PER"]
|
||
range: ["WRK.TXT", "WRK.VIS"]
|
||
|
||
# Hyponym for photographic creation: a person agent captured a visual work.
- code: "REL.CRE.PHO"
|
||
label: "Photographed"
|
||
description: "Agent photographed (captured image)"
|
||
predicates:
|
||
- uri: "schema:photographer"
|
||
# NOTE(review): verify this property ID. wdt:P4876 may not be a
# "photographer" property on Wikidata; photographs are commonly
# attributed with wdt:P170 (creator). Confirm before use.
- uri: "wdt:P4876"
|
||
wikidata_label: "photographer"
|
||
domain: ["AGT.PER"]
|
||
range: ["WRK.VIS"]
|
||
|
||
- code: "REL.CRE.DES"
|
||
label: "Designed"
|
||
description: "Agent designed an object or building"
|
||
predicates:
|
||
- uri: "wdt:P287"
|
||
wikidata_label: "designed by"
|
||
domain: ["AGT.PER", "AGT.GRP"]
|
||
range: ["WRK.OBJ", "TOP.BLT"]
|
||
|
||
- code: "REL.CRE.INV"
|
||
label: "Invented"
|
||
description: "Agent invented something"
|
||
predicates:
|
||
- uri: "wdt:P61"
|
||
wikidata_label: "discoverer or inventor"
|
||
domain: ["AGT.PER", "AGT.GRP"]
|
||
range: ["WRK.OBJ", "WRK.WRK"]
|
||
|
||
# -------------------------------------------------------------------------
|
||
# REL.TMP - Temporal Relationships
|
||
# -------------------------------------------------------------------------
|
||
- code: "REL.TMP"
|
||
label: "Temporal"
|
||
description: |
|
||
Relationships with temporal semantics - events, periods, sequences.
|
||
Integrates with CIDOC-CRM temporal entities and TimeML.
|
||
# Ontology predicates that the REL.TMP (Temporal) hypernym aligns with.
ontology_mappings:
  - uri: "crm:P4_has_time-span"
    label: "has time-span"
  - uri: "time:hasTime"
    label: "has time"
  # EDM relates events to time spans via edm:occurredAt; edm:happenedAt
  # relates an event to a place and belongs with spatial mappings.
  - uri: "edm:occurredAt"
    label: "occurred at (time)"
|
||
|
||
hyponyms:
|
||
- code: "REL.TMP.DUR"
|
||
label: "During"
|
||
description: "Entity existed/occurred during time period"
|
||
predicates:
|
||
- uri: "crm:P4_has_time-span"
|
||
- uri: "time:inside"
|
||
domain: ["*"]
|
||
range: ["TMP.DUR", "TMP.RNG"]
|
||
|
||
- code: "REL.TMP.BEF"
|
||
label: "Before"
|
||
description: "Entity/event occurred before another"
|
||
predicates:
|
||
- uri: "time:before"
|
||
- uri: "crm:P120_occurs_before"
|
||
domain: ["TMP.*", "EVT.*"]
|
||
range: ["TMP.*", "EVT.*"]
|
||
|
||
- code: "REL.TMP.AFT"
|
||
label: "After"
|
||
description: "Entity/event occurred after another"
|
||
predicates:
|
||
- uri: "time:after"
|
||
- uri: "crm:P120i_occurs_after"
|
||
domain: ["TMP.*", "EVT.*"]
|
||
range: ["TMP.*", "EVT.*"]
|
||
|
||
- code: "REL.TMP.CON"
|
||
label: "Contemporaneous"
|
||
description: "Entities/events overlapped in time"
|
||
predicates:
|
||
- uri: "time:intervalOverlaps"
|
||
- uri: "edm:isRelatedTo"
|
||
domain: ["*"]
|
||
range: ["*"]
|
||
|
||
- code: "REL.TMP.BGN"
|
||
label: "Began"
|
||
description: "Entity/event started at time"
|
||
predicates:
|
||
- uri: "time:hasBeginning"
|
||
- uri: "crm:P82a_begin_of_the_begin"
|
||
domain: ["*"]
|
||
range: ["TMP.DAB", "TMP.RNG"]
|
||
|
||
- code: "REL.TMP.END"
|
||
label: "Ended"
|
||
description: "Entity/event ended at time"
|
||
predicates:
|
||
- uri: "time:hasEnd"
|
||
- uri: "crm:P82b_end_of_the_end"
|
||
domain: ["*"]
|
||
range: ["TMP.DAB", "TMP.RNG"]
|
||
|
||
# -------------------------------------------------------------------------
|
||
# REL.SPA - Spatial Relationships
|
||
# -------------------------------------------------------------------------
|
||
- code: "REL.SPA"
|
||
label: "Spatial"
|
||
description: |
|
||
Relationships with spatial/geographic semantics. Uses GeoSPARQL
|
||
topological relations and CIDOC-CRM place properties.
|
||
ontology_mappings:
|
||
- uri: "crm:P53_has_former_or_current_location"
|
||
label: "has former or current location"
|
||
- uri: "geo:sfWithin"
|
||
label: "within (spatial)"
|
||
- uri: "schema:location"
|
||
label: "location"
|
||
|
||
hyponyms:
|
||
- code: "REL.SPA.LOC"
|
||
label: "Located In"
|
||
description: "Entity is/was located in place"
|
||
predicates:
|
||
- uri: "crm:P53_has_former_or_current_location"
|
||
- uri: "schema:location"
|
||
- uri: "wdt:P276"
|
||
wikidata_label: "location"
|
||
domain: ["*"]
|
||
range: ["TOP.*", "GEO.*"]
|
||
|
||
- code: "REL.SPA.WTH"
|
||
label: "Within"
|
||
description: "Place is spatially within another place"
|
||
predicates:
|
||
- uri: "geo:sfWithin"
|
||
- uri: "crm:P89_falls_within"
|
||
- uri: "wdt:P131"
|
||
wikidata_label: "located in administrative entity"
|
||
domain: ["TOP.*", "GEO.*"]
|
||
range: ["TOP.*", "GEO.*"]
|
||
|
||
- code: "REL.SPA.CON"
|
||
label: "Contains"
|
||
description: "Place spatially contains another"
|
||
predicates:
|
||
- uri: "geo:sfContains"
|
||
- uri: "crm:P89i_contains"
|
||
domain: ["TOP.*", "GEO.*"]
|
||
range: ["TOP.*", "GEO.*"]
|
||
|
||
- code: "REL.SPA.ADJ"
|
||
label: "Adjacent To"
|
||
description: "Place is adjacent to another"
|
||
predicates:
|
||
- uri: "geo:sfTouches"
|
||
- uri: "wdt:P47"
|
||
wikidata_label: "shares border with"
|
||
domain: ["TOP.*", "GEO.*"]
|
||
range: ["TOP.*", "GEO.*"]
|
||
|
||
- code: "REL.SPA.NER"
|
||
label: "Near"
|
||
description: "Entity is near a place (fuzzy proximity)"
|
||
predicates:
|
||
- uri: "schema:geo"
|
||
- uri: "crm:P168_place_is_defined_by"
|
||
domain: ["*"]
|
||
range: ["TOP.*", "GEO.*"]
|
||
|
||
- code: "REL.SPA.ORG"
|
||
label: "Origin"
|
||
description: "Entity originates from place"
|
||
predicates:
|
||
- uri: "schema:birthPlace"
|
||
- uri: "wdt:P19"
|
||
wikidata_label: "place of birth"
|
||
- uri: "wdt:P495"
|
||
wikidata_label: "country of origin"
|
||
domain: ["AGT.PER", "WRK.*"]
|
||
range: ["TOP.*"]
|
||
|
||
# -------------------------------------------------------------------------
|
||
# REL.SOC - Social Relationships
|
||
# -------------------------------------------------------------------------
|
||
- code: "REL.SOC"
|
||
label: "Social"
|
||
description: |
|
||
Interpersonal and organizational relationships. Uses RELATIONSHIP
|
||
vocabulary, Bio ontology, and social network predicates.
|
||
ontology_mappings:
|
||
- uri: "rel:participantIn"
|
||
label: "participant in"
|
||
- uri: "bio:relationship"
|
||
label: "relationship (bio)"
|
||
- uri: "schema:knows"
|
||
label: "knows"
|
||
|
||
hyponyms:
|
||
- code: "REL.SOC.FAM"
|
||
label: "Family"
|
||
description: "Family relationship between persons"
|
||
predicates:
|
||
- uri: "rel:parentOf"
|
||
- uri: "rel:childOf"
|
||
- uri: "rel:siblingOf"
|
||
- uri: "rel:spouseOf"
|
||
- uri: "wdt:P22"
|
||
wikidata_label: "father"
|
||
- uri: "wdt:P25"
|
||
wikidata_label: "mother"
|
||
- uri: "wdt:P26"
|
||
wikidata_label: "spouse"
|
||
- uri: "wdt:P40"
|
||
wikidata_label: "child"
|
||
domain: ["AGT.PER"]
|
||
range: ["AGT.PER"]
|
||
# Directional refinements of the family relationship.
# Wikidata family properties read "subject has <label> object":
#   wdt:P40 (child)          => subject is the PARENT of the object
#   wdt:P22/P25 (father/mother) => subject is the CHILD of the object
sub_relations:
  - code: "REL.SOC.FAM.PAR"
    label: "Parent Of"
    predicates: ["rel:parentOf", "wdt:P40"]
  - code: "REL.SOC.FAM.CHD"
    label: "Child Of"
    predicates: ["rel:childOf", "wdt:P22", "wdt:P25"]
  - code: "REL.SOC.FAM.SIB"
    label: "Sibling Of"
    predicates: ["rel:siblingOf", "wdt:P3373"]
  - code: "REL.SOC.FAM.SPO"
    label: "Spouse Of"
    predicates: ["rel:spouseOf", "wdt:P26"]
  # NOTE(review): wdt:P1038 is Wikidata's generic "relative" property, not
  # grandparent-specific. Confirm whether a qualifier or a different
  # label is intended here.
  - code: "REL.SOC.FAM.GRP"
    label: "Grandparent Of"
    predicates: ["wdt:P1038"]
|
||
|
||
- code: "REL.SOC.PRO"
|
||
label: "Professional"
|
||
description: "Professional/occupational relationship"
|
||
predicates:
|
||
- uri: "rel:collaboratesWith"
|
||
- uri: "schema:colleague"
|
||
- uri: "wdt:P1066"
|
||
wikidata_label: "student of"
|
||
- uri: "wdt:P184"
|
||
wikidata_label: "doctoral advisor"
|
||
domain: ["AGT.PER"]
|
||
range: ["AGT.PER", "GRP.*"]
|
||
sub_relations:
|
||
- code: "REL.SOC.PRO.STU"
|
||
label: "Student Of"
|
||
predicates: ["wdt:P1066"]
|
||
- code: "REL.SOC.PRO.TCH"
|
||
label: "Teacher Of"
|
||
predicates: ["wdt:P802"]
|
||
- code: "REL.SOC.PRO.COL"
|
||
label: "Colleague Of"
|
||
predicates: ["schema:colleague"]
|
||
- code: "REL.SOC.PRO.MNT"
|
||
label: "Mentored By"
|
||
predicates: ["wdt:P184"]
|
||
|
||
# REL.SOC.MEM - membership of a person or group in a group.
- code: "REL.SOC.MEM"
  label: "Membership"
  description: "Agent is member of group"
  predicates:
    - uri: "schema:memberOf"
    - uri: "crm:P107i_is_current_or_former_member_of"
    - uri: "wdt:P463"
      wikidata_label: "member of"
  domain: ["AGT.PER", "GRP.*"]
  range: ["GRP.*"]

# REL.SOC.EMP - employment; range is limited to ORG, COR and GOV groups.
- code: "REL.SOC.EMP"
  label: "Employment"
  description: "Agent employed by organization"
  predicates:
    - uri: "schema:worksFor"
    - uri: "wdt:P108"
      wikidata_label: "employer"
  domain: ["AGT.PER"]
  range: ["GRP.ORG", "GRP.COR", "GRP.GOV"]

# REL.SOC.AFF - affiliation of a person with any group.
- code: "REL.SOC.AFF"
  label: "Affiliation"
  description: "Agent affiliated with organization"
  predicates:
    - uri: "schema:affiliation"
    - uri: "wdt:P1416"
      wikidata_label: "affiliation"
  domain: ["AGT.PER"]
  range: ["GRP.*"]

# REL.SOC.KNO - acquaintance between two persons.
- code: "REL.SOC.KNO"
  label: "Knows"
  description: "Agent knows another agent"
  predicates:
    - uri: "schema:knows"
    - uri: "rel:knowsOf"
  domain: ["AGT.PER"]
  range: ["AGT.PER"]

# REL.SOC.PAT - patronage; both ends may be a person or a group.
- code: "REL.SOC.PAT"
  label: "Patronage"
  description: "Patron-artist/client relationship"
  predicates:
    - uri: "wdt:P88"
      wikidata_label: "commissioned by"
  domain: ["AGT.PER", "GRP.*"]
  range: ["AGT.PER", "GRP.*"]
|
||
|
||
# -------------------------------------------------------------------------
|
||
# REL.ORG - Organizational Relationships
|
||
# -------------------------------------------------------------------------
|
||
# REL.ORG - structural and historical links between groups.
# Each hyponym's description and all of its predicates read subject -> object
# in the same direction.
- code: "REL.ORG"
  label: "Organizational"
  description: |
    Relationships between organizations, including hierarchy,
    succession, and structural connections.
  ontology_mappings:
    - uri: "schema:parentOrganization"
      label: "parent organization"
    - uri: "rico:isSuccessorOf"
      label: "is successor of"
    - uri: "crm:P107_has_current_or_former_member"
      label: "has member"

  hyponyms:
    # Subject is the smaller unit; object is its parent. Both predicates
    # point from an organization to the organization above it.
    - code: "REL.ORG.PAR"
      label: "Parent Organization"
      description: "Organization has another organization as its parent"
      predicates:
        - uri: "schema:parentOrganization"
        - uri: "wdt:P749"
          wikidata_label: "parent organization"
      domain: ["GRP.*"]
      range: ["GRP.*"]

    # Subject is the larger unit; object is its subsidiary. Both predicates
    # point from an organization to the organization below it.
    - code: "REL.ORG.SUB"
      label: "Subsidiary"
      description: "Organization has another organization as its subsidiary"
      predicates:
        - uri: "schema:subOrganization"
        - uri: "wdt:P355"
          wikidata_label: "subsidiary"
      domain: ["GRP.*"]
      range: ["GRP.*"]

    # Subject is the later organization. wdt:P1365 (replaces) points the
    # same way as rico:isSuccessorOf; wdt:P1366 would point the other way.
    - code: "REL.ORG.SUC"
      label: "Successor"
      description: "Organization succeeded another"
      predicates:
        - uri: "rico:isSuccessorOf"
        - uri: "wdt:P1365"
          wikidata_label: "replaces"
      domain: ["GRP.*"]
      range: ["GRP.*"]

    # Subject is the earlier organization (inverse of REL.ORG.SUC).
    # NOTE(review): rico:hasSuccessor is used as the inverse of
    # rico:isSuccessorOf - confirm against the RiC-O release in use.
    - code: "REL.ORG.PRE"
      label: "Predecessor"
      description: "Organization preceded another"
      predicates:
        - uri: "rico:hasSuccessor"
        - uri: "wdt:P1366"
          wikidata_label: "replaced by"
      domain: ["GRP.*"]
      range: ["GRP.*"]

    - code: "REL.ORG.MRG"
      label: "Merged Into"
      description: "Organization merged into another"
      predicates:
        - uri: "rico:isAssociatedWithEvent"
        - uri: "wdt:P7888"
          wikidata_label: "merged into"
      domain: ["GRP.*"]
      range: ["GRP.*"]

    - code: "REL.ORG.SPL"
      label: "Split From"
      description: "Organization split from another"
      predicates:
        - uri: "wdt:P807"
          wikidata_label: "separated from"
      domain: ["GRP.*"]
      range: ["GRP.*"]

    # The founder may be a person or another group.
    - code: "REL.ORG.FND"
      label: "Founded By"
      description: "Organization founded by agent"
      predicates:
        - uri: "schema:founder"
        - uri: "wdt:P112"
          wikidata_label: "founded by"
      domain: ["GRP.*"]
      range: ["AGT.PER", "GRP.*"]
|
||
|
||
# -------------------------------------------------------------------------
|
||
# REL.WRK - Work/Expression Relationships (FRBR)
|
||
# -------------------------------------------------------------------------
|
||
# REL.WRK - links between works and between the FRBR levels
# (Work, Expression, Manifestation, Item).
- code: "REL.WRK"
  label: "Work Relations"
  description: |
    Relationships between works, expressions, manifestations, and items
    following the FRBR model. Critical for bibliographic and archival data.
  ontology_mappings:
    - uri: "frbroo:R3_is_realised_in"
      label: "is realised in (Work→Expression)"
    - uri: "frbroo:R4_carriers_provided_by"
      label: "carriers provided by (Expression→Manifestation)"
    - uri: "schema:isBasedOn"
      label: "is based on"

  hyponyms:
    # The next three hyponyms walk the FRBR chain upward:
    # Item -> Manifestation -> Expression -> Work.
    - code: "REL.WRK.EXP"
      label: "Expression Of"
      description: "Expression realizes a Work"
      predicates:
        - uri: "frbroo:R3i_realises"
        - uri: "wdt:P629"
          wikidata_label: "edition or translation of"
      domain: ["WRK.EXP"]
      range: ["WRK.WRK"]

    - code: "REL.WRK.MAN"
      label: "Manifestation Of"
      description: "Manifestation embodies an Expression"
      predicates:
        - uri: "frbroo:R4i_comprises_carriers_of"
      domain: ["WRK.MAN"]
      range: ["WRK.EXP"]

    # "is example of" is the forward name of FRBRoo R7 (Item -> Manifestation);
    # the "i" suffix belongs to its inverse, "has example".
    - code: "REL.WRK.ITM"
      label: "Item Of"
      description: "Item exemplifies a Manifestation"
      predicates:
        - uri: "frbroo:R7_is_example_of"
      domain: ["WRK.ITM"]
      range: ["WRK.MAN"]

    - code: "REL.WRK.DRV"
      label: "Derived From"
      description: "Work derived from another work"
      predicates:
        - uri: "schema:isBasedOn"
        - uri: "wdt:P144"
          wikidata_label: "based on"
      domain: ["WRK.*"]
      range: ["WRK.*"]

    # Restricted to text-to-text; shares wdt:P629 with REL.WRK.EXP.
    - code: "REL.WRK.TRN"
      label: "Translation Of"
      description: "Work is translation of another"
      predicates:
        - uri: "wdt:P629"
          wikidata_label: "edition or translation of"
      domain: ["WRK.TXT"]
      range: ["WRK.TXT"]

    - code: "REL.WRK.ADP"
      label: "Adaptation Of"
      description: "Work is adaptation of another"
      predicates:
        - uri: "wdt:P144"
          wikidata_label: "based on"
      domain: ["WRK.*"]
      range: ["WRK.*"]

    - code: "REL.WRK.REF"
      label: "References"
      description: "Work references another work"
      predicates:
        - uri: "dcterms:references"
        - uri: "schema:citation"
      domain: ["WRK.*"]
      range: ["WRK.*"]

    - code: "REL.WRK.PRT"
      label: "Part Of"
      description: "Work is part of larger work"
      predicates:
        - uri: "dcterms:isPartOf"
        - uri: "schema:isPartOf"
        - uri: "wdt:P361"
          wikidata_label: "part of"
      domain: ["WRK.*"]
      range: ["WRK.*"]

    # Same shape as REL.WRK.PRT, but the object must be a series.
    - code: "REL.WRK.SER"
      label: "Part Of Series"
      description: "Work is part of a series"
      predicates:
        - uri: "schema:isPartOf"
        - uri: "wdt:P179"
          wikidata_label: "part of the series"
      domain: ["WRK.*"]
      range: ["WRK.SER"]
|
||
|
||
# -------------------------------------------------------------------------
|
||
# REL.CUS - Custodial/Ownership Relationships
|
||
# -------------------------------------------------------------------------
|
||
# REL.CUS - ownership, custody and provenance of objects and collections.
- code: "REL.CUS"
  label: "Custodial"
  description: |
    Relationships involving ownership, custody, and provenance of objects
    and collections. Critical for GLAM institutions.
  ontology_mappings:
    - uri: "crm:P50_has_current_keeper"
      label: "has current keeper"
    - uri: "crm:P51_has_former_or_current_owner"
      label: "has former or current owner"
    - uri: "rico:hasOrHadHolder"
      label: "has or had holder"

  hyponyms:
    - code: "REL.CUS.OWN"
      label: "Owned By"
      description: "Object/collection owned by agent"
      predicates:
        - uri: "crm:P51_has_former_or_current_owner"
        - uri: "schema:ownedBy"
        - uri: "wdt:P127"
          wikidata_label: "owned by"
      domain: ["WRK.*", "WRK.OBJ"]
      range: ["AGT.PER", "GRP.*"]

    # Custody rather than legal ownership; the keeper is always a group.
    - code: "REL.CUS.KEP"
      label: "Kept By"
      description: "Object/collection in custody of agent"
      predicates:
        - uri: "crm:P50_has_current_keeper"
        - uri: "rico:hasOrHadHolder"
        - uri: "wdt:P195"
          wikidata_label: "collection"
      domain: ["WRK.*", "WRK.OBJ"]
      range: ["GRP.*"]

    # dcterms:isPartOf expresses item-in-collection membership.
    # edm:isShownAt, used here previously, is the URL of an object's web
    # landing page and does not relate an item to a collection.
    - code: "REL.CUS.COL"
      label: "In Collection"
      description: "Item is in a collection"
      predicates:
        - uri: "dcterms:isPartOf"
        - uri: "wdt:P195"
          wikidata_label: "collection"
      domain: ["WRK.ITM", "WRK.OBJ"]
      range: ["GRP.*", "WRK.COL"]

    # Same predicates as REL.CUS.OWN, flagged temporal.
    - code: "REL.CUS.PRV"
      label: "Previous Owner"
      description: "Object was previously owned by agent"
      predicates:
        - uri: "crm:P51_has_former_or_current_owner"
        - uri: "wdt:P127"
          wikidata_label: "owned by"
      domain: ["WRK.*", "WRK.OBJ"]
      range: ["AGT.PER", "GRP.*"]
      temporal: true

    - code: "REL.CUS.ACQ"
      label: "Acquired From"
      description: "Object acquired from agent"
      predicates:
        - uri: "crm:P24i_changed_ownership_through"
      domain: ["WRK.*", "WRK.OBJ"]
      range: ["AGT.PER", "GRP.*"]

    - code: "REL.CUS.DNT"
      label: "Donated By"
      description: "Object donated by agent"
      predicates:
        - uri: "wdt:P1028"
          wikidata_label: "donated by"
      domain: ["WRK.*", "WRK.OBJ"]
      range: ["AGT.PER", "GRP.*"]
|
||
|
||
# -------------------------------------------------------------------------
|
||
# REL.SUB - Subject/Topic Relationships
|
||
# -------------------------------------------------------------------------
|
||
# REL.SUB - what a work is about, depicts, mentions, is set in, or its genre.
- code: "REL.SUB"
  label: "Subject"
  description: |
    Relationships indicating what a work is about, depicts, or discusses.
    Critical for subject indexing and discovery.
  ontology_mappings:
    - uri: "schema:about"
      label: "about"
    - uri: "dcterms:subject"
      label: "subject"
    - uri: "crm:P62_depicts"
      label: "depicts"

  hyponyms:
    # Range "*" - any entity type may be a subject.
    - code: "REL.SUB.ABT"
      label: "About"
      description: "Work is about a topic/entity"
      predicates:
        - uri: "schema:about"
        - uri: "dcterms:subject"
        - uri: "wdt:P921"
          wikidata_label: "main subject"
      domain: ["WRK.*"]
      range: ["*"]

    # Only visual works may be the subject of a depiction relation.
    - code: "REL.SUB.DEP"
      label: "Depicts"
      description: "Visual work depicts entity"
      predicates:
        - uri: "crm:P62_depicts"
        - uri: "wdt:P180"
          wikidata_label: "depicts"
      domain: ["WRK.VIS"]
      range: ["AGT.PER", "TOP.*", "WRK.OBJ"]

    - code: "REL.SUB.MNT"
      label: "Mentions"
      description: "Work mentions entity (not main subject)"
      predicates:
        - uri: "schema:mentions"
      domain: ["WRK.*"]
      range: ["*"]

    # The setting may be a place (TOP.*) or a time (TMP.*).
    - code: "REL.SUB.SET"
      label: "Set In"
      description: "Narrative work set in place/time"
      predicates:
        - uri: "schema:contentLocation"
        - uri: "wdt:P840"
          wikidata_label: "narrative location"
      domain: ["WRK.TXT", "WRK.CIN", "WRK.PER"]
      range: ["TOP.*", "TMP.*"]

    - code: "REL.SUB.GNR"
      label: "Genre"
      description: "Work belongs to genre"
      predicates:
        - uri: "schema:genre"
        - uri: "wdt:P136"
          wikidata_label: "genre"
      domain: ["WRK.*"]
      range: ["WRK.GNR"]
|
||
|
||
# -------------------------------------------------------------------------
|
||
# REL.EVT - Event Relationships
|
||
# -------------------------------------------------------------------------
|
||
# REL.EVT - participation in, organization of, and place/time/outcome of events.
- code: "REL.EVT"
  label: "Event"
  description: |
    Relationships involving participation in or connection to events.
    Events are temporal entities with participants and locations.
  ontology_mappings:
    - uri: "crm:P11_had_participant"
      label: "had participant"
    - uri: "crm:P7_took_place_at"
      label: "took place at"
    - uri: "schema:event"
      label: "event"

  hyponyms:
    # Agent -> event.
    - code: "REL.EVT.PAR"
      label: "Participated In"
      description: "Agent participated in event"
      predicates:
        - uri: "crm:P11i_participated_in"
        - uri: "wdt:P1344"
          wikidata_label: "participant in"
      domain: ["AGT.*"]
      range: ["EVT.*"]

    # Agent or group -> event.
    - code: "REL.EVT.ORG"
      label: "Organized"
      description: "Agent organized event"
      predicates:
        - uri: "schema:organizer"
        - uri: "wdt:P664"
          wikidata_label: "organizer"
      domain: ["AGT.*", "GRP.*"]
      range: ["EVT.*"]

    # Event -> place.
    - code: "REL.EVT.LOC"
      label: "Event Location"
      description: "Event took place at location"
      predicates:
        - uri: "crm:P7_took_place_at"
        - uri: "schema:location"
        - uri: "wdt:P276"
          wikidata_label: "location"
      domain: ["EVT.*"]
      range: ["TOP.*"]

    # Event -> temporal expression.
    - code: "REL.EVT.TIM"
      label: "Event Time"
      description: "Event occurred at time"
      predicates:
        - uri: "crm:P4_has_time-span"
        - uri: "schema:startDate"
        - uri: "wdt:P585"
          wikidata_label: "point in time"
      domain: ["EVT.*"]
      range: ["TMP.*"]

    # Event -> outcome of any entity type.
    - code: "REL.EVT.RSL"
      label: "Resulted In"
      description: "Event resulted in outcome"
      predicates:
        - uri: "prov:generated"
        - uri: "crm:P92_brought_into_existence"
      domain: ["EVT.*"]
      range: ["*"]
|
||
|
||
# -------------------------------------------------------------------------
|
||
# REL.IDT - Identity Relationships
|
||
# -------------------------------------------------------------------------
|
||
# REL.IDT - identity, equivalence, similarity and representation claims.
- code: "REL.IDT"
  label: "Identity"
  description: |
    Relationships expressing identity, equivalence, and representation.
    Distinct from coreference (same entity mentions) - these are
    ontological identity claims between different conceptualizations.
  ontology_mappings:
    - uri: "owl:sameAs"
      label: "same as"
    - uri: "skos:exactMatch"
      label: "exact match"
    - uri: "skos:closeMatch"
      label: "close match"

  hyponyms:
    # Strict identity between two identifiers of any entity type.
    - code: "REL.IDT.SAM"
      label: "Same As"
      description: "Entity is identical to another (different URIs)"
      predicates:
        - uri: "owl:sameAs"
        - uri: "skos:exactMatch"
      domain: ["*"]
      range: ["*"]
      notes: |
        Use for linking to external authority files (VIAF, Wikidata, etc.)
        Distinct from coreference which links mentions within documents.

    # A visual work or object standing for a person or place.
    - code: "REL.IDT.REP"
      label: "Represents"
      description: "Entity represents another (e.g., portrait of person)"
      predicates:
        - uri: "crm:P138_represents"
      domain: ["WRK.VIS", "WRK.OBJ"]
      range: ["AGT.PER", "TOP.*"]

    # Weaker than REL.IDT.SAM: near-match rather than identity.
    - code: "REL.IDT.SIM"
      label: "Similar To"
      description: "Entity is similar to another (not identical)"
      predicates:
        - uri: "skos:closeMatch"
        - uri: "skos:relatedMatch"
      domain: ["*"]
      range: ["*"]

    # Subject is an appellation (APP.*); object is the entity it names.
    - code: "REL.IDT.ALT"
      label: "Alternative Name For"
      description: "Appellation is alternative for entity"
      predicates:
        - uri: "skos:altLabel"
        - uri: "crm:P139_has_alternative_form"
      domain: ["APP.*"]
      range: ["*"]
|
||
|
||
# -------------------------------------------------------------------------
|
||
# REL.ROL - Role Relationships
|
||
# -------------------------------------------------------------------------
|
||
# REL.ROL - roles, titles and occupations held by persons, plus the context
# (group or event) in which a role was held.
- code: "REL.ROL"
  label: "Role"
  description: |
    Relationships expressing roles that agents play in contexts.
    Temporally-bound role assertions.
  ontology_mappings:
    - uri: "crm:P14.1_in_the_role_of"
      label: "in the role of"
    - uri: "schema:roleName"
      label: "role name"

  hyponyms:
    # The three person -> role hyponyms below are all flagged temporal.
    - code: "REL.ROL.HLD"
      label: "Held Role"
      description: "Agent held role/position"
      predicates:
        - uri: "schema:hasOccupation"
        - uri: "wdt:P39"
          wikidata_label: "position held"
      domain: ["AGT.PER"]
      range: ["ROL.*"]
      temporal: true

    - code: "REL.ROL.TTL"
      label: "Held Title"
      description: "Agent held title/rank"
      predicates:
        - uri: "wdt:P97"
          wikidata_label: "noble title"
        - uri: "wdt:P410"
          wikidata_label: "military rank"
      domain: ["AGT.PER"]
      range: ["ROL.TTL"]
      temporal: true

    - code: "REL.ROL.OCC"
      label: "Had Occupation"
      description: "Agent had occupation"
      predicates:
        - uri: "schema:hasOccupation"
        - uri: "wdt:P106"
          wikidata_label: "occupation"
      domain: ["AGT.PER"]
      range: ["ROL.OCC"]
      temporal: true

    # Role -> group or event; the subject here is the role, not the person.
    - code: "REL.ROL.CTX"
      label: "Role Context"
      description: "Role was held in context of organization/event"
      predicates:
        - uri: "schema:worksFor"
        - uri: "crm:P11i_participated_in"
      domain: ["ROL.*"]
      range: ["GRP.*", "EVT.*"]
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# RELATIONSHIP ANNOTATION SCHEMA
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Field-level schema of a single relationship annotation record.
annotation_schema:
  description: |
    Schema for encoding relationship annotations as structured data.
    Supports both inline document annotations and standalone relationship
    assertions for knowledge graph construction.

  # The subject/predicate/object triple plus its identifier and type code.
  required_fields:
    - field: "relationship_id"
      type: "string"
      format: "UUID or URI"
      description: "Unique identifier for this relationship annotation"

    - field: "relationship_type"
      type: "string"
      format: "REL.{HYPERNYM}.{HYPONYM}"
      description: "Relationship type code from taxonomy"
      examples:
        - "REL.CRE.AUT"
        - "REL.SOC.FAM.PAR"
        - "REL.WRK.DRV"

    - field: "subject"
      type: "object"
      description: "Source entity of the relationship"
      schema:
        entity_id: "string (reference to entity annotation)"
        entity_type: "string (entity type code)"
        span_text: "string (optional, surface form)"
        uri: "string (optional, external URI)"

    - field: "predicate"
      type: "object"
      description: "The relationship predicate"
      schema:
        uri: "string (ontology predicate URI)"
        label: "string (human-readable label)"
        direction: "enum (FORWARD, INVERSE)"

    # Same shape as "subject".
    - field: "object"
      type: "object"
      description: "Target entity of the relationship"
      schema:
        entity_id: "string (reference to entity annotation)"
        entity_type: "string (entity type code)"
        span_text: "string (optional, surface form)"
        uri: "string (optional, external URI)"

  # Scope, provenance, qualifiers and modality flags.
  optional_fields:
    - field: "temporal_scope"
      type: "object"
      description: "When the relationship held/holds"
      schema:
        start_date: "string (ISO 8601)"
        end_date: "string (ISO 8601 or null for ongoing)"
        temporal_modifier: "enum (CIRCA, BEFORE, AFTER, DURING)"
      examples:
        - start_date: "1632-01-01"
          end_date: "1669-10-04"
          temporal_modifier: null
        - start_date: "1890"
          end_date: null
          temporal_modifier: "CIRCA"

    - field: "spatial_scope"
      type: "object"
      description: "Where the relationship held/holds"
      schema:
        place_id: "string (reference to place entity)"
        place_name: "string"
        geo_uri: "string (GeoNames, Pleiades, etc.)"

    - field: "provenance"
      type: "object"
      description: "Source and confidence of relationship assertion"
      schema:
        source_document: "string (document ID)"
        source_span: "object (character offsets)"
        extraction_method: "enum (MANUAL, RULE_BASED, ML_EXTRACTED, EXTERNAL_KB)"
        extractor_id: "string (model or annotator ID)"
        extraction_date: "string (ISO 8601)"
        confidence: "float (0.0-1.0)"
        evidence_text: "string (supporting text)"

    - field: "qualifiers"
      type: "array"
      description: "Additional qualifications on the relationship"
      item_schema:
        qualifier_type: "string (e.g., 'role', 'capacity', 'manner')"
        qualifier_value: "string"
        qualifier_uri: "string (optional)"
      examples:
        - qualifier_type: "role"
          qualifier_value: "co-author"
        - qualifier_type: "capacity"
          qualifier_value: "as regent"

    # The three boolean flags below all default to false.
    - field: "negation"
      type: "boolean"
      default: false
      description: "True if relationship is explicitly negated"

    - field: "hypothetical"
      type: "boolean"
      default: false
      description: "True if relationship is hypothetical/uncertain"

    - field: "source_claim"
      type: "boolean"
      default: false
      description: "True if relationship is attributed to a source (not factual assertion)"

    # Companion to source_claim: identifies who made the claim.
    - field: "attributed_to"
      type: "object"
      description: "If source_claim=true, who made the claim"
      schema:
        agent_id: "string"
        agent_name: "string"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# RELATIONSHIP EXTRACTION PATTERNS
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Surface-pattern catalogue: generic pattern families first, then concrete
# per-language templates mapped to relationship type codes.
extraction_patterns:
  description: |
    Linguistic patterns for extracting relationships from text.
    Patterns are language-specific and map to relationship types.

  pattern_types:
    - type: "VERBAL"
      description: "Verb-mediated relationships"
      example: "Rembrandt painted The Night Watch"
      extraction_notes: |
        Parse verb as predicate, subject as source, object as target.
        Handle passive voice inversion.

    - type: "NOMINAL"
      description: "Noun phrase relationships"
      example: "Rembrandt's painting The Night Watch"
      extraction_notes: |
        Possessive constructions imply creation/ownership.
        Appositive constructions may indicate identity.

    - type: "PREPOSITIONAL"
      description: "Preposition-mediated relationships"
      example: "The Night Watch by Rembrandt"
      extraction_notes: |
        Prepositions indicate specific relationship types.
        "by" → creation, "in" → location, "of" → part-whole

    - type: "COPULAR"
      description: "Is-a relationships"
      example: "Rembrandt was a painter"
      extraction_notes: |
        Copular verbs indicate typing, role, identity.
        Handle tense for temporal scoping.

    - type: "COMPOUND"
      description: "Compound noun relationships"
      example: "the Rembrandt painting"
      extraction_notes: |
        Noun-noun compounds encode implicit relationships.
        Requires domain knowledge to disambiguate.

  # {PLACEHOLDER} tokens stand for entity slots; (a|b|c) lists alternatives.
  language_patterns:
    english:
      creation:
        - pattern: "{AGENT} (painted|wrote|composed|created|made|designed) {WORK}"
          relationship: "REL.CRE.*"
        - pattern: "{WORK} by {AGENT}"
          relationship: "REL.CRE.*"
        - pattern: "{AGENT}'s {WORK}"
          relationship: "REL.CRE.*"

      family:
        - pattern: "{PERSON} (son|daughter|child) of {PERSON}"
          relationship: "REL.SOC.FAM.CHD"
        - pattern: "{PERSON} (father|mother|parent) of {PERSON}"
          relationship: "REL.SOC.FAM.PAR"
        - pattern: "{PERSON} married {PERSON}"
          relationship: "REL.SOC.FAM.SPO"

      location:
        - pattern: "{ENTITY} in {PLACE}"
          relationship: "REL.SPA.LOC"
        - pattern: "{ENTITY} at {PLACE}"
          relationship: "REL.SPA.LOC"
        - pattern: "{ENTITY} from {PLACE}"
          relationship: "REL.SPA.ORG"

      temporal:
        - pattern: "{ENTITY} in {DATE}"
          relationship: "REL.TMP.DUR"
        - pattern: "{ENTITY} (during|throughout) {PERIOD}"
          relationship: "REL.TMP.DUR"
        - pattern: "{ENTITY} (before|prior to) {EVENT/DATE}"
          relationship: "REL.TMP.BEF"

    dutch:
      creation:
        - pattern: "{AGENT} (schilderde|schreef|componeerde|maakte) {WORK}"
          relationship: "REL.CRE.*"
        - pattern: "{WORK} van {AGENT}"
          relationship: "REL.CRE.*"

      family:
        - pattern: "{PERSON} (zoon|dochter|kind) van {PERSON}"
          relationship: "REL.SOC.FAM.CHD"
        - pattern: "{PERSON} (vader|moeder|ouder) van {PERSON}"
          relationship: "REL.SOC.FAM.PAR"
        - pattern: "{PERSON} trouwde met {PERSON}"
          relationship: "REL.SOC.FAM.SPO"

    german:
      creation:
        - pattern: "{AGENT} (malte|schrieb|komponierte|schuf) {WORK}"
          relationship: "REL.CRE.*"
        - pattern: "{WORK} von {AGENT}"
          relationship: "REL.CRE.*"

      family:
        - pattern: "{PERSON} (Sohn|Tochter|Kind) von {PERSON}"
          relationship: "REL.SOC.FAM.CHD"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# RELATIONSHIP REIFICATION
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Ways of attaching provenance, confidence and qualifiers to a relationship,
# the approach this convention recommends, and the graph naming it implies.
reification:
  description: |
    Mechanism for making statements about relationships themselves.
    Required for provenance, uncertainty, temporal qualification,
    and nested relationship structures.

  approaches:
    - name: "RDF Reification"
      standard: "W3C RDF"
      description: "Classic quad-based reification using rdf:Statement"
      example: |
        _:stmt1 a rdf:Statement ;
        rdf:subject :Rembrandt ;
        rdf:predicate :created ;
        rdf:object :NightWatch ;
        :confidence 0.95 ;
        :source :Document123 .
      limitations: "Verbose, query complexity"

    - name: "Named Graphs"
      standard: "W3C RDF 1.1"
      description: "Group triples in named graphs for provenance"
      example: |
        GRAPH :assertion123 {
        :Rembrandt :created :NightWatch .
        }
        :assertion123 :confidence 0.95 ;
        :source :Document123 .
      advantages: "Cleaner queries, SPARQL support"

    # The statement subject is the work (The Night Watch, wd:Q219831, as in
    # the annotation examples of this file) and the P170 "creator" value is
    # Rembrandt (wd:Q5598), so that the example matches the other approaches.
    - name: "Wikibase Qualifiers"
      standard: "Wikidata model"
      description: "Qualifiers attached to statement nodes"
      example: |
        wd:Q219831 p:P170 [
        ps:P170 wd:Q5598 ;
        pq:P580 "1642"^^xsd:gYear ;
        prov:wasDerivedFrom [ pr:P248 wd:Q12345 ]
        ] .
      advantages: "Rich qualification, widespread adoption"

    - name: "RDF-star"
      standard: "W3C RDF-star (emerging)"
      description: "Quoted triples as subjects/objects"
      example: |
        << :Rembrandt :created :NightWatch >> :confidence 0.95 .
      advantages: "Concise, intuitive"
      limitations: "Limited tooling support (as of 2025)"

  recommended_approach: "Named Graphs"
  rationale: |
    Named graphs provide the best balance of expressivity, query support,
    and tooling maturity. Each relationship annotation becomes a named
    graph containing the core triple, with provenance attached to the
    graph identifier.

  schema:
    graph_naming: |
      Graph URIs follow pattern:
      {document_uri}#rel-{relationship_id}

      Example:
      https://example.org/doc/123#rel-a1b2c3d4

    required_metadata:
      - "prov:generatedAtTime"
      - "prov:wasAttributedTo"
      - "oa:hasBody (the relationship triple)"
      - "oa:hasTarget (source document span)"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# RELATIONSHIP VALIDATION
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Validation rules for relationship annotations, grouped by constraint family:
# type (domain/range), logical, and cardinality.
validation:
  description: |
    Rules for validating relationship annotations. Ensures consistency
    with entity types, ontology constraints, and domain rules.

  # Subject type is checked against "domain", object type against "range".
  type_constraints:
    description: "Validate domain/range compatibility"
    rules:
      - rule: "Subject entity type must be in relationship domain"
        error_code: "REL_DOMAIN_VIOLATION"
        example: |
          INVALID: REL.CRE.AUT with subject type TOP.BLT (building can't author)
          VALID: REL.CRE.AUT with subject type AGT.PER (person can author)

      - rule: "Object entity type must be in relationship range"
        error_code: "REL_RANGE_VIOLATION"
        example: |
          INVALID: REL.CRE.AUT with object type AGT.PER (can't author a person)
          VALID: REL.CRE.AUT with object type WRK.TXT (can author a text)

  # Each rule names the relationship codes it applies to.
  logical_constraints:
    description: "Validate logical consistency"
    rules:
      - rule: "Symmetric relationships must have inverse annotation"
        applies_to: ["REL.SOC.FAM.SIB", "REL.SOC.FAM.SPO", "REL.SPA.ADJ"]

      - rule: "Asymmetric relationships cannot have same subject and object"
        applies_to: ["REL.SOC.FAM.PAR", "REL.ORG.PAR"]

      - rule: "Temporal relationships must have valid date ordering"
        applies_to: ["REL.TMP.BEF", "REL.TMP.AFT"]
        example: |
          INVALID: Event A REL.TMP.BEF Event B where A.date > B.date

  cardinality_constraints:
    description: "Validate multiplicity rules"
    rules:
      - rule: "Person can have at most 2 biological parents"
        relationship: "REL.SOC.FAM.PAR"
        max_cardinality: 2

      # No numeric bound is given here; the rule is stated in prose only.
      - rule: "Work has exactly one original creator (primary)"
        relationship: "REL.CRE.*"
        notes: "Secondary creators use qualifiers"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# EXAMPLES
|
||
# ---------------------------------------------------------------------------
|
||
|
||
examples:
|
||
# Worked example: passive-voice creation statement. The sentence names the
# work first, so the predicate is recorded with direction INVERSE.
- name: "Creation relationship"
  source_text: "The Night Watch was painted by Rembrandt in 1642."
  annotation:
    relationship_id: "rel-001"
    relationship_type: "REL.CRE.ART"
    subject:
      entity_id: "ent-rembrandt"
      entity_type: "AGT.PER"
      span_text: "Rembrandt"
      uri: "http://viaf.org/viaf/64013650"
    predicate:
      uri: "http://www.cidoc-crm.org/cidoc-crm/P14_carried_out_by"
      label: "carried out by"
      direction: "INVERSE"
    object:
      entity_id: "ent-nightwatch"
      entity_type: "WRK.VIS"
      span_text: "The Night Watch"
      uri: "http://www.wikidata.org/entity/Q219831"
    temporal_scope:
      start_date: "1642"
      end_date: "1642"
    provenance:
      source_document: "doc-123"
      # Offsets cover the whole 49-character sentence quoted in
      # evidence_text (end is exclusive).
      source_span:
        start: 0
        end: 49
      extraction_method: "RULE_BASED"
      confidence: 0.95
      evidence_text: "The Night Watch was painted by Rembrandt in 1642."
|
||
|
||
# Worked example: kinship with an open-ended temporal scope (end_date null).
- name: "Family relationship with temporal scope"
  source_text: "Willem-Alexander, son of Queen Beatrix, became king in 2013."
  annotation:
    relationship_id: "rel-002"
    relationship_type: "REL.SOC.FAM.CHD"
    subject:
      entity_id: "ent-willem"
      entity_type: "AGT.PER"
      span_text: "Willem-Alexander"
    predicate:
      uri: "http://purl.org/vocab/relationship/childOf"
      label: "child of"
      direction: "FORWARD"
    object:
      entity_id: "ent-beatrix"
      entity_type: "AGT.PER"
      span_text: "Queen Beatrix"
    temporal_scope:
      start_date: "1967-04-27"
      end_date: null
    provenance:
      extraction_method: "MANUAL"
      confidence: 1.0

# Worked example: custody of a collection by an institution.
- name: "Custodial relationship"
  source_text: "The Rijksmuseum houses over 8,000 objects from Rembrandt's era."
  annotation:
    relationship_id: "rel-003"
    relationship_type: "REL.CUS.KEP"
    subject:
      entity_id: "ent-rijks-collection"
      entity_type: "WRK.COL"
      span_text: "8,000 objects"
    predicate:
      uri: "http://www.cidoc-crm.org/cidoc-crm/P50_has_current_keeper"
      label: "has current keeper"
      direction: "FORWARD"
    object:
      entity_id: "ent-rijksmuseum"
      entity_type: "GRP.ORG"
      span_text: "The Rijksmuseum"
      uri: "http://www.wikidata.org/entity/Q190804"
    provenance:
      extraction_method: "ML_EXTRACTED"
      confidence: 0.88

# Worked example: a film adapted from a novel.
- name: "Work derivation (FRBR)"
  source_text: "The 2019 film 'Little Women' is based on Louisa May Alcott's 1868 novel."
  annotation:
    relationship_id: "rel-004"
    relationship_type: "REL.WRK.ADP"
    subject:
      entity_id: "ent-lw-film"
      entity_type: "WRK.CIN"
      span_text: "The 2019 film 'Little Women'"
    predicate:
      uri: "http://schema.org/isBasedOn"
      label: "is based on"
      direction: "FORWARD"
    object:
      entity_id: "ent-lw-novel"
      entity_type: "WRK.TXT"
      span_text: "Louisa May Alcott's 1868 novel"
    provenance:
      extraction_method: "RULE_BASED"
      confidence: 0.98
|
||
|
||
# Worked example: an explicitly negated assertion (negation: true).
# The popular legend is that Salieri poisoned Mozart, so the sentence and
# the subject/object spans name the participants in that order.
- name: "Negated relationship"
  source_text: "Despite popular belief, Salieri did not poison Mozart."
  annotation:
    relationship_id: "rel-005"
    relationship_type: "REL.EVT.PAR"
    subject:
      entity_id: "ent-salieri"
      entity_type: "AGT.PER"
      span_text: "Salieri"
    predicate:
      uri: "http://www.cidoc-crm.org/cidoc-crm/P11i_participated_in"
      label: "participated in"
      direction: "FORWARD"
    object:
      entity_id: "ent-poisoning-event"
      entity_type: "EVT.ACT"
      span_text: "poison Mozart"
    negation: true
    provenance:
      extraction_method: "MANUAL"
      confidence: 1.0
      evidence_text: "Despite popular belief, Salieri did not poison Mozart."
|
||
|
||
# =============================================================================
|
||
# SECTION 17: COREFERENCE RESOLUTION
|
||
# =============================================================================
|
||
#
|
||
# This section defines rules for identifying when multiple mentions in text
|
||
# refer to the same real-world entity. Distinct from relationship annotations
|
||
# which link DIFFERENT entities.
|
||
#
|
||
# Coreference is fundamental to:
|
||
# - Entity consolidation (grouping mentions into entity clusters)
|
||
# - Knowledge graph construction (single node per entity)
|
||
# - Information extraction (aggregating facts about an entity)
|
||
# - Document understanding (tracking entities through narrative)
|
||
#
|
||
# Ontological foundations:
|
||
# - OWL: owl:sameAs for identity assertions
|
||
# - SKOS: skos:exactMatch for vocabulary alignment
|
||
# - NIF: nif:referenceContext for text anchoring
|
||
# - CIDOC-CRM: E42 Identifier for naming
|
||
# - TEI: <rs> (referring string) with @ref and @corresp
|
||
#
|
||
# Linguistic foundations:
|
||
# - Centering Theory (Grosz, Joshi, Weinstein)
|
||
# - Binding Theory (Chomsky)
|
||
# - Discourse Representation Theory (Kamp)
|
||
# - ACE/OntoNotes coreference guidelines
|
||
# =============================================================================
|
||
|
||
coreference_resolution:
|
||
|
||
description: |
|
||
Coreference resolution identifies when two or more text spans refer to
|
||
the same entity in the real world (or fictional world for narratives).
|
||
|
||
KEY DISTINCTION:
|
||
- COREFERENCE: "Rembrandt" and "the Dutch master" → SAME entity
|
||
- RELATIONSHIP: "Rembrandt" and "The Night Watch" → DIFFERENT entities linked by creation
|
||
|
||
Coreference creates ENTITY CLUSTERS where each cluster represents one
|
||
real-world entity and contains all textual mentions of that entity.
|
||
|
||
namespaces:
|
||
nif: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"
|
||
itsrdf: "http://www.w3.org/2005/11/its/rdf#"
|
||
conll: "http://ufal.mff.cuni.cz/conll2009-st/task-description.html#"
|
||
ace: "https://www.ldc.upenn.edu/collaborations/past-projects/ace"
|
||
ontonotes: "https://catalog.ldc.upenn.edu/LDC2013T19"
|
||
owl: "http://www.w3.org/2002/07/owl#"
|
||
skos: "http://www.w3.org/2004/02/skos/core#"
|
||
crm: "http://www.cidoc-crm.org/cidoc-crm/"
|
||
tei: "http://www.tei-c.org/ns/1.0"
|
||
prov: "http://www.w3.org/ns/prov#"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# COREFERENCE TYPES
|
||
# ---------------------------------------------------------------------------
|
||
|
||
coreference_types:
|
||
description: |
|
||
Classification of coreference phenomena by linguistic mechanism.
|
||
Each type has distinct annotation requirements and resolution strategies.
|
||
|
||
types:
|
||
# -------------------------------------------------------------------------
|
||
# COREF.PRO - Pronominal Reference
|
||
# -------------------------------------------------------------------------
|
||
- code: "COREF.PRO"
|
||
label: "Pronominal"
|
||
description: |
|
||
Pronouns referring to antecedent noun phrases. The most common
|
||
coreference type. Requires grammatical agreement (person, number,
|
||
gender) and discourse salience.
|
||
|
||
subtypes:
|
||
- code: "COREF.PRO.PER"
|
||
label: "Personal Pronoun"
|
||
description: "he, she, they, it, him, her, them"
|
||
examples:
|
||
- text: "Rembrandt painted portraits. He was a master of light."
|
||
antecedent: "Rembrandt"
|
||
anaphor: "He"
|
||
|
||
- code: "COREF.PRO.POS"
|
||
label: "Possessive Pronoun"
|
||
description: "his, her, their, its"
|
||
examples:
|
||
- text: "The museum expanded its collection in 2020."
|
||
antecedent: "The museum"
|
||
anaphor: "its"
|
||
|
||
- code: "COREF.PRO.REF"
|
||
label: "Reflexive Pronoun"
|
||
description: "himself, herself, themselves, itself"
|
||
examples:
|
||
- text: "The artist painted himself in the corner."
|
||
antecedent: "The artist"
|
||
anaphor: "himself"
|
||
binding_constraint: "Must be in same clause as antecedent"
|
||
|
||
- code: "COREF.PRO.REC"
|
||
label: "Reciprocal Pronoun"
|
||
description: "each other, one another"
|
||
examples:
|
||
- text: "The artists influenced each other."
|
||
antecedent: "The artists"
|
||
anaphor: "each other"
|
||
|
||
- code: "COREF.PRO.DEM"
|
||
label: "Demonstrative Pronoun"
|
||
description: "this, that, these, those (standalone)"
|
||
examples:
|
||
- text: "The painting sold for millions. This surprised critics."
|
||
antecedent: "The painting sold for millions"
|
||
anaphor: "This"
|
||
notes: "Can refer to propositions/events, not just entities"
|
||
|
||
- code: "COREF.PRO.REL"
|
||
label: "Relative Pronoun"
|
||
description: "who, which, that, whose"
|
||
examples:
|
||
- text: "Vermeer, who lived in Delft, painted domestic scenes."
|
||
antecedent: "Vermeer"
|
||
anaphor: "who"
|
||
binding_constraint: "Immediate syntactic dependency on antecedent"
|
||
|
||
# -------------------------------------------------------------------------
|
||
# COREF.NOM - Nominal Reference
|
||
# -------------------------------------------------------------------------
|
||
- code: "COREF.NOM"
|
||
label: "Nominal"
|
||
description: |
|
||
Full noun phrases referring to previously mentioned entities.
|
||
Includes definite descriptions, epithets, and proper noun variants.
|
||
|
||
subtypes:
|
||
- code: "COREF.NOM.DEF"
|
||
label: "Definite Description"
|
||
description: "NP with definite article referring to known entity"
|
||
examples:
|
||
- text: "Rembrandt was born in Leiden. The painter moved to Amsterdam."
|
||
antecedent: "Rembrandt"
|
||
anaphor: "The painter"
|
||
- text: "The Rijksmuseum reopened in 2013. The museum attracted millions."
|
||
antecedent: "The Rijksmuseum"
|
||
anaphor: "The museum"
|
||
annotation_notes: |
|
||
Definite descriptions may be:
|
||
- Coreferent (same entity)
|
||
- Bridging (related entity, see COREF.BRD)
|
||
Disambiguation requires world knowledge.
|
||
|
||
- code: "COREF.NOM.EPI"
|
||
label: "Epithet"
|
||
description: "Descriptive phrase used as name substitute"
|
||
examples:
|
||
- text: "Vermeer captured light beautifully. The Sphinx of Delft remains mysterious."
|
||
antecedent: "Vermeer"
|
||
anaphor: "The Sphinx of Delft"
|
||
- text: "Napoleon dominated Europe. The Little Corporal met his end at Waterloo."
|
||
antecedent: "Napoleon"
|
||
anaphor: "The Little Corporal"
|
||
annotation_notes: |
|
||
Epithets are often culturally-specific and may require
|
||
domain knowledge to resolve. Flag for verification when
|
||
confidence is low.
|
||
|
||
- code: "COREF.NOM.NAM"
|
||
label: "Name Variant"
|
||
description: "Different forms of the same proper name"
|
||
examples:
|
||
- text: "Rembrandt Harmenszoon van Rijn was born in 1606. Rembrandt is known for self-portraits."
|
||
antecedent: "Rembrandt Harmenszoon van Rijn"
|
||
anaphor: "Rembrandt"
|
||
- text: "The Metropolitan Museum of Art opened in 1872. The Met now has 2 million works."
|
||
antecedent: "The Metropolitan Museum of Art"
|
||
anaphor: "The Met"
|
||
annotation_notes: |
|
||
Name variants include:
|
||
- Abbreviations (Metropolitan → Met)
|
||
- Nicknames (William → Bill)
|
||
- Formal/informal (Dr. Smith → John)
|
||
- Transliterations (Tchaikovsky / Čajkovskij)
|
||
|
||
- code: "COREF.NOM.REP"
|
||
label: "Repetition"
|
||
description: "Exact or near-exact repetition of noun phrase"
|
||
examples:
|
||
- text: "The Night Watch hangs in the Rijksmuseum. The Night Watch was restored in 2019."
|
||
antecedent: "The Night Watch"
|
||
anaphor: "The Night Watch"
|
||
annotation_notes: |
|
||
Repetitions are high-confidence coreferences but verify
|
||
for potential ambiguity (e.g., multiple paintings with
|
||
similar names).
|
||
|
||
# -------------------------------------------------------------------------
|
||
# COREF.ZER - Zero Anaphora
|
||
# -------------------------------------------------------------------------
|
||
- code: "COREF.ZER"
|
||
label: "Zero Anaphora"
|
||
description: |
|
||
Implicit reference where the anaphor is not realized in surface text.
|
||
Common in pro-drop languages (Spanish, Japanese, Chinese) and in
|
||
coordinate structures in English.
|
||
|
||
subtypes:
|
||
- code: "COREF.ZER.PRO"
|
||
label: "Pro-drop"
|
||
description: "Omitted subject pronoun (pro-drop languages)"
|
||
examples:
|
||
- text: "El artista pintó retratos. [Ø] Era un maestro."
|
||
language: "Spanish"
|
||
antecedent: "El artista"
|
||
anaphor: "[implicit subject]"
|
||
- text: "画家が肖像画を描いた。[Ø]光の達人だった。"
|
||
language: "Japanese"
|
||
antecedent: "画家"
|
||
anaphor: "[implicit subject]"
|
||
|
||
- code: "COREF.ZER.GAP"
|
||
label: "Gapping"
|
||
description: "Ellipsis in coordinate structures"
|
||
examples:
|
||
- text: "Vermeer painted interiors and Rembrandt [Ø] portraits."
|
||
antecedent: "painted"
|
||
anaphor: "[elided verb]"
|
||
notes: "This is verb gapping (ellipsis of the verb, distinct from VP ellipsis), not entity coreference"
|
||
|
||
- code: "COREF.ZER.SLU"
|
||
label: "Sluicing"
|
||
description: "Ellipsis in embedded questions"
|
||
examples:
|
||
- text: "Someone stole the painting, but we don't know who [Ø]."
|
||
antecedent: "stole the painting"
|
||
anaphor: "[elided VP]"
|
||
|
||
# -------------------------------------------------------------------------
|
||
# COREF.CAT - Cataphora
|
||
# -------------------------------------------------------------------------
|
||
- code: "COREF.CAT"
|
||
label: "Cataphora"
|
||
description: |
|
||
Forward-pointing reference where the pronoun PRECEDES its referent.
|
||
Less common than anaphora; creates suspense or emphasis.
|
||
|
||
examples:
|
||
- text: "When he arrived in Amsterdam, Rembrandt was only 25."
|
||
cataphor: "he"
|
||
postcedent: "Rembrandt"
|
||
- text: "Although it was controversial, The Night Watch became Rembrandt's most famous work."
|
||
cataphor: "it"
|
||
postcedent: "The Night Watch"
|
||
|
||
constraints:
|
||
- "Cataphor typically in subordinate clause"
|
||
- "Postcedent in main clause"
|
||
- "Binding theory: c-command constraints apply"
|
||
|
||
# -------------------------------------------------------------------------
|
||
# COREF.BRD - Bridging Reference
|
||
# -------------------------------------------------------------------------
|
||
- code: "COREF.BRD"
|
||
label: "Bridging"
|
||
description: |
|
||
Reference to an entity not explicitly mentioned but inferrable from
|
||
context via a relation to a mentioned entity. NOT strict coreference
|
||
but essential for discourse coherence.
|
||
|
||
subtypes:
|
||
- code: "COREF.BRD.PRT"
|
||
label: "Part-Whole Bridging"
|
||
description: "Reference to a part of a mentioned whole"
|
||
examples:
|
||
- text: "The museum was renovated. The roof was replaced."
|
||
anchor: "The museum"
|
||
bridged: "The roof"
|
||
relation: "part-of"
|
||
- text: "I bought a book. The first chapter was fascinating."
|
||
anchor: "a book"
|
||
bridged: "The first chapter"
|
||
relation: "part-of"
|
||
|
||
- code: "COREF.BRD.SET"
|
||
label: "Set-Member Bridging"
|
||
description: "Reference to a member of a mentioned set"
|
||
examples:
|
||
- text: "The paintings arrived. The largest was a Rubens."
|
||
anchor: "The paintings"
|
||
bridged: "The largest"
|
||
relation: "member-of"
|
||
|
||
- code: "COREF.BRD.EVT"
|
||
label: "Event Bridging"
|
||
description: "Reference to participant in mentioned event"
|
||
examples:
|
||
- text: "The auction concluded. The winning bidder remained anonymous."
|
||
anchor: "The auction"
|
||
bridged: "The winning bidder"
|
||
relation: "participant-in"
|
||
|
||
- code: "COREF.BRD.ATR"
|
||
label: "Attribute Bridging"
|
||
description: "Reference to attribute of mentioned entity"
|
||
examples:
|
||
- text: "The painting is impressive. The size overwhelms visitors."
|
||
anchor: "The painting"
|
||
bridged: "The size"
|
||
relation: "attribute-of"
|
||
|
||
- code: "COREF.BRD.PRD"
|
||
label: "Producer/Product Bridging"
|
||
description: "Reference via production relationship"
|
||
examples:
|
||
- text: "Rembrandt was prolific. The paintings number over 300."
|
||
anchor: "Rembrandt"
|
||
bridged: "The paintings"
|
||
relation: "produced-by"
|
||
|
||
annotation_notes: |
|
||
Bridging is DISTINCT from coreference:
|
||
- Coreference: same entity, different mentions
|
||
- Bridging: different entities, related by inference
|
||
|
||
Bridging annotations should specify the bridging RELATION
|
||
and the ANCHOR entity that enables the inference.
|
||
|
||
# -------------------------------------------------------------------------
|
||
# COREF.SPL - Split Antecedent
|
||
# -------------------------------------------------------------------------
|
||
- code: "COREF.SPL"
|
||
label: "Split Antecedent"
|
||
description: |
|
||
Plural pronoun referring to multiple previously mentioned entities
|
||
that together form the antecedent.
|
||
|
||
examples:
|
||
- text: "Rembrandt met Saskia in 1633. They married the following year."
|
||
antecedents: ["Rembrandt", "Saskia"]
|
||
anaphor: "They"
|
||
- text: "The curator argued with the director. They eventually compromised."
|
||
antecedents: ["The curator", "the director"]
|
||
anaphor: "They"
|
||
|
||
annotation_notes: |
|
||
Split antecedents create a GROUP entity from individual mentions.
|
||
The annotation should:
|
||
1. Link the anaphor to each antecedent mention
|
||
2. Create an implicit group entity
|
||
3. Record that this is split-antecedent resolution
|
||
|
||
# -------------------------------------------------------------------------
|
||
# COREF.GEN - Generic Reference
|
||
# -------------------------------------------------------------------------
|
||
- code: "COREF.GEN"
|
||
label: "Generic"
|
||
description: |
|
||
Reference to generic/kind-level entities rather than specific
|
||
individuals. Requires special handling as generic mentions don't
|
||
refer to token-level entities.
|
||
|
||
subtypes:
|
||
- code: "COREF.GEN.KND"
|
||
label: "Kind Reference"
|
||
description: "Reference to a kind or class"
|
||
examples:
|
||
- text: "The dodo is extinct. It was native to Mauritius."
|
||
kind: "dodo (species)"
|
||
notes: "Both mentions refer to the KIND, not an individual"
|
||
|
||
- code: "COREF.GEN.ARB"
|
||
label: "Arbitrary Reference"
|
||
description: "Reference to arbitrary/generic individual"
|
||
examples:
|
||
- text: "A museum curator must be knowledgeable. They work long hours."
|
||
generic_type: "arbitrary curator"
|
||
notes: "'They' refers to any curator, not a specific one"
|
||
|
||
annotation_notes: |
|
||
Generic coreference is often EXCLUDED from mention-level
|
||
coreference annotation (per ACE/OntoNotes guidelines) but
|
||
may be relevant for knowledge base population.
|
||
|
||
Flag generic mentions with is_generic=true.
|
||
|
||
# -------------------------------------------------------------------------
|
||
# COREF.APP - Appositive
|
||
# -------------------------------------------------------------------------
|
||
- code: "COREF.APP"
|
||
label: "Appositive"
|
||
description: |
|
||
Noun phrase immediately following another NP that renames or
|
||
describes the same entity. High-confidence coreference.
|
||
|
||
subtypes:
|
||
- code: "COREF.APP.TIT"
|
||
label: "Title/Role Appositive"
|
||
examples:
|
||
- text: "Rembrandt, the Dutch painter, was born in 1606."
|
||
head: "Rembrandt"
|
||
appositive: "the Dutch painter"
|
||
|
||
- code: "COREF.APP.ALT"
|
||
label: "Alternative Name Appositive"
|
||
examples:
|
||
- text: "The Night Watch, or De Nachtwacht, hangs in Amsterdam."
|
||
head: "The Night Watch"
|
||
appositive: "De Nachtwacht"
|
||
|
||
- code: "COREF.APP.REL"
|
||
label: "Relationship Appositive"
|
||
examples:
|
||
- text: "Saskia, Rembrandt's wife, died in 1642."
|
||
head: "Saskia"
|
||
appositive: "Rembrandt's wife"
|
||
|
||
syntactic_pattern: "NP1, NP2, ..."
|
||
annotation_notes: |
|
||
Appositives provide HIGH CONFIDENCE coreference signals.
|
||
They also often provide entity TYPE information
|
||
(e.g., "the Dutch painter" → type=AGT.PER, occupation=painter)
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# COREFERENCE ANNOTATION SCHEMA
|
||
# ---------------------------------------------------------------------------
|
||
|
||
annotation_schema:
|
||
description: |
|
||
Schema for encoding coreference annotations. Supports both mention-pair
|
||
and entity-cluster representations.
|
||
|
||
mention_annotation:
|
||
description: "Annotation for individual mentions"
|
||
required_fields:
|
||
- field: "mention_id"
|
||
type: "string"
|
||
format: "UUID or sequential ID"
|
||
description: "Unique identifier for this mention"
|
||
|
||
- field: "span"
|
||
type: "object"
|
||
schema:
|
||
start: "integer (0-based character offset, inclusive)"
|
||
end: "integer (0-based character offset, exclusive)"
|
||
text: "string (surface form)"
|
||
description: "Text span of the mention"
|
||
|
||
- field: "entity_type"
|
||
type: "string"
|
||
description: "Entity type code (AGT.PER, GRP.ORG, etc.)"
|
||
|
||
optional_fields:
|
||
- field: "head_span"
|
||
type: "object"
|
||
schema:
|
||
start: "integer"
|
||
end: "integer"
|
||
text: "string"
|
||
description: "Syntactic head of the mention (for complex NPs)"
|
||
|
||
- field: "mention_type"
|
||
type: "enum"
|
||
values:
|
||
- "PROPER" # Proper noun (Rembrandt, Amsterdam)
|
||
- "NOMINAL" # Common noun phrase (the painter, a museum)
|
||
- "PRONOMINAL" # Pronoun (he, she, it, they)
|
||
- "ZERO" # Implicit/zero anaphor
|
||
description: "Morphosyntactic type of mention"
|
||
|
||
- field: "is_generic"
|
||
type: "boolean"
|
||
default: false
|
||
description: "True if mention refers to a kind, not individual"
|
||
|
||
- field: "grammatical_role"
|
||
type: "enum"
|
||
values: ["SUBJECT", "OBJECT", "OBLIQUE", "POSSESSIVE", "APPOSITIVE"]
|
||
description: "Grammatical function in sentence"
|
||
|
||
- field: "number"
|
||
type: "enum"
|
||
values: ["SINGULAR", "PLURAL", "UNKNOWN"]
|
||
description: "Grammatical number"
|
||
|
||
- field: "gender"
|
||
type: "enum"
|
||
values: ["MASCULINE", "FEMININE", "NEUTER", "UNKNOWN"]
|
||
description: "Grammatical gender (language-dependent)"
|
||
|
||
- field: "animacy"
|
||
type: "enum"
|
||
values: ["ANIMATE", "INANIMATE", "UNKNOWN"]
|
||
description: "Animacy feature"
|
||
|
||
cluster_annotation:
|
||
description: |
|
||
Entity cluster grouping all mentions of the same entity.
|
||
The primary output of coreference resolution.
|
||
|
||
required_fields:
|
||
- field: "cluster_id"
|
||
type: "string"
|
||
description: "Unique identifier for the entity cluster"
|
||
|
||
- field: "mentions"
|
||
type: "array"
|
||
item_type: "mention_id (reference)"
|
||
description: "All mentions belonging to this cluster"
|
||
|
||
- field: "entity_type"
|
||
type: "string"
|
||
description: "Entity type of the cluster (most specific common type)"
|
||
|
||
optional_fields:
|
||
- field: "canonical_mention"
|
||
type: "mention_id"
|
||
description: "The 'best' mention for display (usually first proper noun)"
|
||
|
||
- field: "external_uri"
|
||
type: "string"
|
||
format: "URI"
|
||
description: "Link to external knowledge base (Wikidata, VIAF, etc.)"
|
||
|
||
- field: "confidence"
|
||
type: "float"
|
||
range: [0.0, 1.0]
|
||
description: "Confidence in cluster correctness"
|
||
|
||
- field: "provenance"
|
||
type: "object"
|
||
schema:
|
||
resolution_method: "enum (MANUAL, RULE_BASED, ML_NEURAL, HYBRID)"
|
||
resolver_id: "string"
|
||
resolution_date: "ISO 8601"
|
||
description: "How the cluster was formed"
|
||
|
||
link_annotation:
|
||
description: |
|
||
Explicit pairwise coreference links (alternative to cluster-first).
|
||
Useful for bridging and split antecedents.
|
||
|
||
required_fields:
|
||
- field: "link_id"
|
||
type: "string"
|
||
|
||
- field: "source_mention"
|
||
type: "mention_id"
|
||
description: "The anaphor (referring expression)"
|
||
|
||
- field: "target_mention"
|
||
type: "mention_id"
|
||
description: "The antecedent (referred-to expression)"
|
||
|
||
- field: "link_type"
|
||
type: "string"
|
||
format: "COREF.{TYPE}[.{SUBTYPE}]"
|
||
description: "Coreference type from taxonomy"
|
||
|
||
optional_fields:
|
||
- field: "bridging_relation"
|
||
type: "string"
|
||
description: "For COREF.BRD, the semantic relation"
|
||
values: ["part-of", "member-of", "attribute-of", "participant-in", "produced-by"]
|
||
|
||
- field: "confidence"
|
||
type: "float"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# RESOLUTION STRATEGIES
|
||
# ---------------------------------------------------------------------------
|
||
|
||
resolution_strategies:
|
||
description: |
|
||
Strategies for resolving coreference, from rule-based to neural approaches.
|
||
|
||
rule_based:
|
||
description: "Deterministic rules based on linguistic constraints"
|
||
|
||
rules:
|
||
- name: "Exact String Match"
|
||
description: "Identical strings likely corefer"
|
||
confidence: 0.95
|
||
exceptions:
|
||
- "Common nouns in different contexts"
|
||
- "Homonymous proper nouns (multiple people named 'John Smith')"
|
||
|
||
- name: "Acronym Expansion"
|
||
description: "Acronym matches expanded form"
|
||
pattern: "Match 'BBC' with 'British Broadcasting Corporation'"
|
||
confidence: 0.90
|
||
|
||
- name: "Appositive"
|
||
description: "NP in apposition corefers with head"
|
||
pattern: "NP1, NP2 → NP1 = NP2"
|
||
confidence: 0.98
|
||
|
||
- name: "Predicate Nominative"
|
||
description: "Subject and predicate nominative corefer"
|
||
pattern: "X is Y → X = Y (only when the clause asserts identity, not class membership)"
|
||
confidence: 0.85
|
||
|
||
- name: "Reflexive Binding"
|
||
description: "Reflexive binds to local subject"
|
||
pattern: "X ... himself/herself → reflexive = X"
|
||
confidence: 0.99
|
||
constraint: "Same clause"
|
||
|
||
- name: "Relative Pronoun"
|
||
description: "Relative pronoun binds to immediately preceding NP"
|
||
pattern: "NP who/which ... → relative = NP"
|
||
confidence: 0.99
|
||
|
||
- name: "Gender Agreement"
|
||
description: "Pronoun must match antecedent gender"
|
||
constraint: "FILTER: he → masculine, she → feminine, it → neuter"
|
||
|
||
- name: "Number Agreement"
|
||
description: "Pronoun must match antecedent number"
|
||
constraint: "FILTER: they → plural (or singular/generic 'they'), he/she/it → singular"
|
||
|
||
- name: "Recency Preference"
|
||
description: "More recent mentions preferred as antecedents"
|
||
notes: "All else equal, prefer closer antecedent"
|
||
|
||
- name: "Grammatical Role Preference"
|
||
description: "Subject > Object > Oblique for antecedent"
|
||
notes: "Per Centering Theory"
|
||
|
||
statistical:
|
||
description: "Machine learning approaches to coreference"
|
||
|
||
approaches:
|
||
- name: "Mention-Pair Model"
|
||
description: "Binary classification for mention pairs"
|
||
features:
|
||
- "String matching features"
|
||
- "Distance features (sentence, mention)"
|
||
- "Grammatical features"
|
||
- "Semantic features (WordNet, embeddings)"
|
||
limitations: "Doesn't model transitivity well"
|
||
|
||
- name: "Mention-Ranking Model"
|
||
description: "Rank antecedent candidates for each mention"
|
||
advantages: "Handles competition between candidates"
|
||
|
||
- name: "Entity-Centric Model"
|
||
description: "Model clusters directly, not pairs"
|
||
advantages: "Better global coherence"
|
||
|
||
- name: "End-to-End Neural"
|
||
description: "Jointly learn mention detection and coreference"
|
||
sota_models:
|
||
- "Lee et al. (2017) - span representations"
|
||
- "Joshi et al. (2020) - SpanBERT-based"
|
||
- "Kirstain et al. (2021) - s2e model"
|
||
advantages: "No pipeline errors, learns complex patterns"
|
||
limitations: "Requires large training data, less interpretable"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# DOMAIN-SPECIFIC CONSIDERATIONS
|
||
# ---------------------------------------------------------------------------
|
||
|
||
domain_considerations:
|
||
|
||
heritage_domain:
|
||
description: |
|
||
Special considerations for cultural heritage documents
|
||
(archival finding aids, museum catalogs, art history texts).
|
||
|
||
challenges:
|
||
- challenge: "Historical Name Variants"
|
||
description: "Same person may have multiple historical spellings"
|
||
examples:
|
||
- "Rembrandt / Rembrant / Rembrand"
|
||
- "Titian / Tiziano Vecelli / Tiziano Vecellio"
|
||
strategy: "Maintain authority file linkage; use VIAF/ULAN"
|
||
|
||
- challenge: "Institutional Name Changes"
|
||
description: "Organizations change names over time"
|
||
examples:
|
||
- "Mauritshuis / Koninklijk Kabinet van Schilderijen"
|
||
- "The Tate / Tate Gallery / Tate Britain"
|
||
strategy: "Track temporal validity of names; link to authority"
|
||
|
||
- challenge: "Anonymous/Unknown Creators"
|
||
description: "Attributions like 'Circle of Rembrandt', 'Follower of'"
|
||
examples:
|
||
- "Follower of Vermeer"
|
||
- "Workshop of Rubens"
|
||
- "Anonymous (Dutch, 17th century)"
|
||
strategy: |
|
||
Create placeholder entities for attribution groups.
|
||
Do NOT corefer with named artists.
|
||
|
||
- challenge: "Work Titles"
|
||
description: "Artworks may have multiple titles"
|
||
examples:
|
||
- "The Night Watch / De Nachtwacht / Militia Company of District II"
|
||
- "Mona Lisa / La Gioconda / Portrait of Lisa Gherardini"
|
||
strategy: "Canonical title in preferred language; track variants"
|
||
|
||
- challenge: "Archival Hierarchies"
|
||
description: "Nested references in finding aids"
|
||
examples:
|
||
- "The collection... this series... the folder..."
|
||
strategy: "Model archival hierarchy; bridging for containment"
|
||
|
||
entity_type_specifics:
|
||
- type: "AGT.PER (Persons)"
|
||
notes: |
|
||
- Artists often referred to by last name only after first mention
|
||
- Titles/honorifics may vary (van Gogh / Mr. van Gogh / Vincent)
|
||
- Biographical facts help disambiguation
|
||
|
||
- type: "GRP.ORG (Organizations)"
|
||
notes: |
|
||
- Museums, archives may merge/split
|
||
- Acronyms common (MoMA, V&A, BnF)
|
||
- Departmental references may need bridging
|
||
|
||
- type: "WRK.* (Works)"
|
||
notes: |
|
||
- Works may be referred to by title, artist, or description
|
||
- "The painting" may need context for resolution
|
||
- Distinguish work vs. physical object (FRBR)
|
||
|
||
- type: "TOP.* (Places)"
|
||
notes: |
|
||
- Historical place names may differ from modern
|
||
- "Delft" may be city or adjective ("Delft painter")
|
||
- Nested places (room in museum in city)
|
||
|
||
multilingual:
|
||
description: "Considerations for multilingual documents"
|
||
|
||
challenges:
|
||
- challenge: "Cross-lingual Coreference"
|
||
description: "Same entity mentioned in different languages"
|
||
examples:
|
||
- "Amsterdam (EN) / Amsterdam (NL) / Ámsterdam (ES)"
|
||
- "The Night Watch / De Nachtwacht"
|
||
strategy: "Normalize via external authority (Wikidata)"
|
||
|
||
- challenge: "Translation Variants"
|
||
description: "Translated names may not be exact"
|
||
examples:
|
||
- "Rijksmuseum / State Museum / Musée national"
|
||
strategy: "Fuzzy matching with translation knowledge"
|
||
|
||
- challenge: "Pro-drop Languages"
|
||
description: "Subject pronouns omitted in Spanish, Japanese, etc."
|
||
strategy: "Use zero anaphora annotation; infer from verb agreement"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# ANNOTATION GUIDELINES
|
||
# ---------------------------------------------------------------------------
|
||
|
||
annotation_guidelines:
|
||
|
||
markable_entities:
|
||
description: "What counts as a markable mention"
|
||
include:
|
||
- "Named entities (proper nouns)"
|
||
- "Definite noun phrases with specific referent"
|
||
- "Pronouns with recoverable antecedent"
|
||
- "Demonstrative NPs referring to specific entities"
|
||
- "Possessive NPs (his, their, the museum's)"
|
||
exclude:
|
||
- "Expletive pronouns (It is raining, There are problems)"
|
||
- "Idioms where reference is not compositional"
|
||
- "Generic/kind-level references (optional exclusion)"
|
||
- "Attributive predicative NPs (He is a painter - 'a painter' is not markable); identity statements fall under the Predicate Nominative rule"
|
||
- "Bound pronouns in quantified expressions (Every artist sells their work)"
|
||
|
||
span_conventions:
|
||
description: "How to delimit mention spans"
|
||
rules:
|
||
- rule: "Maximal NP Span"
|
||
description: "Include full NP including determiners and modifiers"
|
||
example: "[The famous Dutch painter Rembrandt] → entire span"
|
||
|
||
- rule: "Nested Mentions"
|
||
description: "Mark nested mentions separately"
|
||
example: "[[Rembrandt]'s wife] → two mentions: the full NP and the nested possessor (distinct entities, not coreferent)"
|
||
|
||
- rule: "Conjoined NPs"
|
||
description: "Mark entire conjunction and individual conjuncts"
|
||
example: "[[Rembrandt] and [Vermeer]] both painted → 3 mentions"
|
||
|
||
- rule: "Head Annotation"
|
||
description: "Additionally mark syntactic head within span"
|
||
example: "[The famous painter (head=painter)]"
|
||
|
||
edge_cases:
|
||
- case: "Singleton Entities"
|
||
description: "Entities mentioned only once"
|
||
guideline: "Include in annotation; mark as singleton cluster"
|
||
|
||
- case: "Discourse Deixis"
|
||
description: "Reference to propositions/events, not entities"
|
||
example: "'John left. That surprised me.' - 'That' refers to event"
|
||
guideline: "Annotate with special type COREF.PRO.DEM; link to event span"
|
||
|
||
- case: "Metonymy"
|
||
description: "Reference to related entity (e.g., 'Washington' for US government)"
|
||
guideline: "Create separate entity; mark metonymic relationship"
|
||
|
||
- case: "Generic Groups"
|
||
description: "Plural generics (Musicians are creative. They practice daily.)"
|
||
guideline: "Mark as generic; optionally exclude from scoring"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# EVALUATION METRICS
|
||
# ---------------------------------------------------------------------------
|
||
|
||
evaluation:
|
||
description: |
|
||
Metrics for evaluating coreference resolution quality.
|
||
Multiple metrics capture different aspects of performance.
|
||
|
||
metrics:
|
||
- name: "MUC"
|
||
description: "Link-based F1 (Vilain et al., 1995)"
|
||
formula: "Precision/Recall over minimum spanning tree links"
|
||
strengths: "Penalizes missing links"
|
||
weaknesses: "Doesn't penalize over-merging well"
|
||
|
||
- name: "B-CUBED"
|
||
description: "Mention-based precision/recall (Bagga & Baldwin, 1998)"
|
||
formula: "Average P/R per mention based on cluster overlap"
|
||
strengths: "Balanced for split/merge errors"
|
||
weaknesses: "Weights all mentions equally"
|
||
|
||
- name: "CEAF"
|
||
description: "Entity-based alignment (Luo, 2005)"
|
||
formula: "Optimal 1-1 alignment between gold and system clusters"
|
||
variants:
|
||
- "CEAF-e: entity-based (cluster-level)"
|
||
- "CEAF-m: mention-based"
|
||
strengths: "Directly evaluates entity-level accuracy"
|
||
|
||
- name: "LEA"
|
||
description: "Link-based entity-aware (Moosavi & Strube, 2016)"
|
||
formula: "Importance-weighted link evaluation"
|
||
strengths: "Balances entity and link perspectives"
|
||
|
||
- name: "CoNLL Average"
|
||
description: "Average of MUC, B-CUBED, CEAF-e"
|
||
standard: "Official CoNLL shared task metric"
|
||
formula: "(MUC_F1 + B3_F1 + CEAF_e_F1) / 3"
|
||
|
||
error_types:
|
||
- type: "False Link (Precision Error)"
|
||
description: "System links two mentions that shouldn't be linked"
|
||
impact: "Over-merged clusters"
|
||
|
||
- type: "Missing Link (Recall Error)"
|
||
description: "System fails to link two coreferent mentions"
|
||
impact: "Fragmented clusters"
|
||
|
||
- type: "Wrong Link"
|
||
description: "Mention linked to wrong antecedent (but still coreferent)"
|
||
impact: "May not affect cluster-level metrics"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# EXAMPLES
|
||
# ---------------------------------------------------------------------------
|
||
|
||
examples:
|
||
- name: "Pronominal coreference"
|
||
text: |
|
||
Rembrandt van Rijn was born in Leiden in 1606. He moved to Amsterdam
|
||
in 1631, where he established his studio. The Dutch master painted
|
||
over 300 works during his career. He died in 1669.
|
||
|
||
mentions:
|
||
- id: "m1"
|
||
span: {start: 0, end: 18, text: "Rembrandt van Rijn"}
|
||
type: "PROPER"
|
||
entity_type: "AGT.PER"
|
||
- id: "m2"
|
||
span: {start: 47, end: 49, text: "He"}
|
||
type: "PRONOMINAL"
|
||
entity_type: "AGT.PER"
|
||
- id: "m3"
|
||
span: {start: 99, end: 102, text: "his"}
|
||
type: "PRONOMINAL"
|
||
entity_type: "AGT.PER"
|
||
- id: "m4"
|
||
span: {start: 111, end: 127, text: "The Dutch master"}
|
||
type: "NOMINAL"
|
||
entity_type: "AGT.PER"
|
||
- id: "m5"
|
||
span: {start: 158, end: 161, text: "his"}
|
||
type: "PRONOMINAL"
|
||
entity_type: "AGT.PER"
|
||
- id: "m6"
|
||
span: {start: 170, end: 172, text: "He"}
|
||
type: "PRONOMINAL"
|
||
entity_type: "AGT.PER"
|
||
|
||
clusters:
|
||
- cluster_id: "e1"
|
||
entity_type: "AGT.PER"
|
||
canonical_mention: "m1"
|
||
mentions: ["m1", "m2", "m3", "m4", "m5", "m6"]
|
||
external_uri: "http://viaf.org/viaf/64013650"
|
||
|
||
links:
|
||
- {source: "m2", target: "m1", type: "COREF.PRO.PER"}
|
||
- {source: "m3", target: "m2", type: "COREF.PRO.POS"}
|
||
- {source: "m4", target: "m1", type: "COREF.NOM.EPI"}
|
||
- {source: "m5", target: "m4", type: "COREF.PRO.POS"}
|
||
- {source: "m6", target: "m4", type: "COREF.PRO.PER"}
|
||
|
||
- name: "Bridging reference"
|
||
text: |
|
||
The Rijksmuseum reopened in 2013 after a decade of renovation.
|
||
The main building houses the collection of Dutch masters.
|
||
The entrance on Museumplein welcomes 2.5 million visitors annually.
|
||
|
||
mentions:
|
||
- id: "m1"
|
||
span: {text: "The Rijksmuseum"}
|
||
entity_type: "GRP.ORG"
|
||
- id: "m2"
|
||
span: {text: "The main building"}
|
||
entity_type: "TOP.BLT"
|
||
- id: "m3"
|
||
span: {text: "The entrance"}
|
||
entity_type: "TOP.BLT"
|
||
|
||
annotations:
|
||
- type: "COREF.BRD.PRT"
|
||
source: "m2"
|
||
anchor: "m1"
|
||
relation: "part-of"
|
||
notes: "Building is part of museum complex"
|
||
- type: "COREF.BRD.PRT"
|
||
source: "m3"
|
||
anchor: "m2"
|
||
relation: "part-of"
|
||
notes: "Entrance is part of building"
|
||
|
||
- name: "Split antecedent"
|
||
text: |
|
||
Frans Hals and Judith Leyster both painted in Haarlem.
|
||
They influenced each other's work.
|
||
|
||
mentions:
|
||
- id: "m1"
|
||
span: {text: "Frans Hals"}
|
||
entity_type: "AGT.PER"
|
||
- id: "m2"
|
||
span: {text: "Judith Leyster"}
|
||
entity_type: "AGT.PER"
|
||
- id: "m3"
|
||
span: {text: "They"}
|
||
entity_type: "AGT.GRP"
|
||
- id: "m4"
|
||
span: {text: "each other's"}
|
||
entity_type: "AGT.PER"
|
||
|
||
annotations:
|
||
- type: "COREF.SPL"
|
||
source: "m3"
|
||
antecedents: ["m1", "m2"]
|
||
notes: "Plural pronoun with split antecedent"
|
||
- type: "COREF.PRO.REC"
|
||
source: "m4"
|
||
target: "m3"
|
||
notes: "Reciprocal within split-antecedent group"
|
||
|
||
# =============================================================================
|
||
# SECTION 18: UNCERTAINTY AND CONFIDENCE SCORING
|
||
# =============================================================================
|
||
#
|
||
# This section defines the framework for expressing and scoring uncertainty
|
||
# in entity annotations, relationships, and coreference decisions.
|
||
#
|
||
# Uncertainty is inherent in NLP/NER and must be explicitly modeled to:
|
||
# - Prioritize human review (low confidence → needs verification)
|
||
# - Enable probabilistic reasoning in downstream applications
|
||
# - Support provenance tracking (who asserted what, with what certainty)
|
||
# - Allow confidence-based filtering for different use cases
|
||
#
|
||
# Ontological foundations:
|
||
# - W3C PROV-O: Provenance tracking
|
||
# - OA (Open Annotation): Annotation confidence
|
||
# - CIDOC-CRM: E13 Attribute Assignment for epistemic claims
|
||
# - IAO (Information Artifact Ontology): Information quality
|
||
# - ISO 19157: Data quality for geographic information
|
||
# - TEI: @cert (certainty) and <certainty> element
|
||
# - TimeML: TIMEX3 @mod (e.g., APPROX) for temporal uncertainty
|
||
#
|
||
# Statistical foundations:
|
||
# - Bayesian probability for belief updating
|
||
# - Calibration theory for reliable confidence scores
|
||
# - Inter-annotator agreement metrics (Cohen's κ, Fleiss' κ)
|
||
# =============================================================================
|
||
|
||
uncertainty_and_confidence:
|
||
|
||
description: |
|
||
Uncertainty modeling captures the degree of confidence in annotations,
|
||
enabling consumers to filter, prioritize, and reason about data quality.
|
||
|
||
This framework addresses THREE types of uncertainty:
|
||
|
||
1. ANNOTATION UNCERTAINTY: Confidence in a specific annotation decision
|
||
- "How sure are we that 'Rembrandt' refers to AGT.PER?"
|
||
|
||
2. EPISTEMIC UNCERTAINTY: Uncertainty about facts in the world
|
||
- "Did Rembrandt actually paint this work?" (attribution uncertainty)
|
||
|
||
3. LINGUISTIC UNCERTAINTY: Ambiguity in the source text itself
|
||
- "The text says 'possibly by Rembrandt'" (hedged language)
|
||
|
||
These are distinct and must be tracked separately.
|
||
|
||
namespaces:
|
||
prov: "http://www.w3.org/ns/prov#"
|
||
oa: "http://www.w3.org/ns/oa#"
|
||
crm: "http://www.cidoc-crm.org/cidoc-crm/"
|
||
iao: "http://purl.obolibrary.org/obo/IAO_"
|
||
dqv: "http://www.w3.org/ns/dqv#"
|
||
tei: "http://www.tei-c.org/ns/1.0"
|
||
schema: "http://schema.org/"
|
||
xsd: "http://www.w3.org/2001/XMLSchema#"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CONFIDENCE SCORE FRAMEWORK
|
||
# ---------------------------------------------------------------------------
|
||
|
||
confidence_framework:
|
||
description: |
|
||
Numerical confidence scores on [0.0, 1.0] scale with defined semantics.
|
||
Scores should be CALIBRATED: a score of 0.8 means the annotation is
|
||
correct approximately 80% of the time across similar cases.
|
||
|
||
score_semantics:
|
||
- range: "[0.95, 1.0]"
|
||
range_min: 0.95
|
||
range_max: 1.0
|
||
inclusive: "both"
|
||
label: "CERTAIN"
|
||
tei_cert: "high"
|
||
description: |
|
||
Near-certain annotation. Human annotators would agree 95%+ of the time.
|
||
Examples: exact string matches, unambiguous proper nouns, rule-based
|
||
high-precision extractions.
|
||
action: "Accept automatically; no review needed"
|
||
color_code: "#228B22" # Forest green
|
||
|
||
- range: "[0.80, 0.95)"
|
||
range_min: 0.80
|
||
range_max: 0.95
|
||
inclusive: "left"
|
||
label: "HIGH_CONFIDENCE"
|
||
tei_cert: "high"
|
||
description: |
|
||
High confidence annotation. Strong signals but some ambiguity possible.
|
||
Examples: well-known entities with standard forms, clear context.
|
||
action: "Accept with spot-checking"
|
||
color_code: "#32CD32" # Lime green
|
||
|
||
- range: "[0.60, 0.80)"
|
||
range_min: 0.60
|
||
range_max: 0.80
|
||
inclusive: "left"
|
||
label: "MEDIUM_CONFIDENCE"
|
||
tei_cert: "medium"
|
||
description: |
|
||
Moderate confidence. Multiple interpretations possible; context helps.
|
||
Examples: common names with multiple referents, unclear boundaries.
|
||
action: "Review if resource-sensitive; accept for bulk processing"
|
||
color_code: "#FFD700" # Gold
|
||
|
||
- range: "[0.40, 0.60)"
|
||
range_min: 0.40
|
||
range_max: 0.60
|
||
inclusive: "left"
|
||
label: "LOW_CONFIDENCE"
|
||
tei_cert: "low"
|
||
description: |
|
||
Low confidence. Significant uncertainty; near chance for binary decision.
|
||
Examples: ambiguous pronouns, unknown entities, noisy text.
|
||
action: "Flag for human review"
|
||
color_code: "#FFA500" # Orange
|
||
|
||
- range: "[0.20, 0.40)"
|
||
range_min: 0.20
|
||
range_max: 0.40
|
||
inclusive: "left"
|
||
label: "VERY_LOW_CONFIDENCE"
|
||
tei_cert: "low"
|
||
description: |
|
||
Very low confidence. Model is guessing; alternative interpretations likely.
|
||
action: "Require human verification before use"
|
||
color_code: "#FF4500" # Orange-red
|
||
|
||
- range: "[0.0, 0.20)"
|
||
range_min: 0.0
|
||
range_max: 0.20
|
||
inclusive: "left"
|
||
label: "UNCERTAIN"
|
||
tei_cert: "unknown"
|
||
description: |
|
||
Near-zero confidence. Essentially no signal; included for completeness.
|
||
action: "Do not use without manual annotation"
|
||
color_code: "#DC143C" # Crimson
|
||
|
||
special_values:
|
||
- value: null
|
||
meaning: "Confidence not computed/available"
|
||
|
||
- value: 1.0
|
||
meaning: "Absolute certainty (use sparingly; only for definitional truths)"
|
||
|
||
- value: 0.0
|
||
meaning: "Definite negative (annotation is known to be incorrect)"
|
||
|
||
aggregation_rules:
|
||
description: "How to combine confidence scores"
|
||
|
||
rules:
|
||
- name: "Independent Conjunction"
|
||
formula: "P(A ∧ B) = P(A) × P(B)"
|
||
use_case: "Joint probability of independent annotations"
|
||
example: "Entity type AND entity boundaries both correct"
|
||
|
||
- name: "Conservative Minimum"
|
||
formula: "min(conf_1, conf_2, ...)"
|
||
use_case: "Weakest-link scenarios; overall quality limited by weakest component"
|
||
example: "Relationship confidence = min(subject_conf, object_conf, predicate_conf)"
|
||
|
||
- name: "Weighted Average"
|
||
formula: "Σ(weight_i × conf_i) / Σ(weight_i)"
|
||
use_case: "Combining scores from multiple sources/annotators"
|
||
example: "Ensemble model output"
|
||
|
||
- name: "Maximum"
|
||
formula: "max(conf_1, conf_2, ...)"
|
||
use_case: "Any-correct scenarios (at least one interpretation valid)"
|
||
example: "Multiple valid entity types"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# ANNOTATION UNCERTAINTY
|
||
# ---------------------------------------------------------------------------
|
||
|
||
annotation_uncertainty:
|
||
description: |
|
||
Uncertainty about the correctness of annotation DECISIONS.
|
||
This is process uncertainty, not world uncertainty.
|
||
|
||
dimensions:
|
||
- dimension: "boundary_confidence"
|
||
description: "Confidence in span boundaries (start/end offsets)"
|
||
factors:
|
||
- "Tokenization ambiguity"
|
||
- "Nested entity decisions"
|
||
- "Modifier attachment"
|
||
examples:
|
||
- text: "the Dutch painter Rembrandt"
|
||
issue: "Should span include 'the Dutch painter' or just 'Rembrandt'?"
|
||
resolution: "Mark 'Rembrandt' as head; broader span has lower confidence"
|
||
|
||
- dimension: "type_confidence"
|
||
description: "Confidence in entity type assignment"
|
||
factors:
|
||
- "Ambiguous entity types (organization vs. event: 'the conference')"
|
||
- "Metonymy (place for organization: 'Washington announced')"
|
||
- "Type granularity (museum subtype assignment)"
|
||
examples:
|
||
- text: "Apple announced new products"
|
||
issue: "AGT.PER (person)? GRP.COR (corporation)? WRK.OBJ (fruit)?"
|
||
resolution: "Context disambiguates; assign GRP.COR with high confidence"
|
||
|
||
- dimension: "referent_confidence"
|
||
description: "Confidence in identity of referred entity"
|
||
factors:
|
||
- "Ambiguous names (multiple people named 'John Smith')"
|
||
- "Unknown entities (not in knowledge base)"
|
||
- "Nickname/variant resolution"
|
||
examples:
|
||
- text: "Dr. Williams presented the findings"
|
||
issue: "Which Dr. Williams? No disambiguating context."
|
||
resolution: "Create entity with low referent_confidence; flag for KB linking"
|
||
|
||
- dimension: "extraction_confidence"
|
||
description: "Confidence from the extraction model/process"
|
||
factors:
|
||
- "Model probability/logits"
|
||
- "Rule match specificity"
|
||
- "OCR quality (for digitized documents)"
|
||
|
||
combined_score:
|
||
formula: |
|
||
annotation_confidence = min(
|
||
boundary_confidence,
|
||
type_confidence,
|
||
referent_confidence
|
||
) × extraction_confidence
|
||
|
||
rationale: |
|
||
Use minimum for boundary/type/referent (weakest link), then scale by
|
||
extraction quality. Low OCR confidence degrades all downstream scores.
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# EPISTEMIC UNCERTAINTY
|
||
# ---------------------------------------------------------------------------
|
||
|
||
epistemic_uncertainty:
|
||
description: |
|
||
Uncertainty about FACTS IN THE WORLD, not annotation process.
|
||
Captures disputed, unknown, or probabilistic claims about reality.
|
||
|
||
uncertainty_types:
|
||
- type: "DISPUTED"
|
||
description: "Multiple conflicting claims in sources"
|
||
examples:
|
||
- "Scholars dispute whether Vermeer used a camera obscura"
|
||
- "The painting's attribution is contested"
|
||
annotation_fields:
|
||
- field: "epistemic_status"
|
||
value: "DISPUTED"
|
||
- field: "competing_claims"
|
||
value: ["claim_1", "claim_2"]
|
||
|
||
- type: "UNKNOWN"
|
||
description: "No reliable information available"
|
||
examples:
|
||
- "The artist's birthdate is unknown"
|
||
- "The work's provenance before 1900 is unrecorded"
|
||
annotation_fields:
|
||
- field: "epistemic_status"
|
||
value: "UNKNOWN"
|
||
|
||
- type: "APPROXIMATE"
|
||
description: "Value is estimated or rounded"
|
||
examples:
|
||
- "The collection contains approximately 8,000 works"
|
||
- "Painted circa 1642"
|
||
annotation_fields:
|
||
- field: "epistemic_status"
|
||
value: "APPROXIMATE"
|
||
- field: "precision"
|
||
value: "circa_year | circa_decade | order_of_magnitude"
|
||
|
||
- type: "INFERRED"
|
||
description: "Derived from other facts, not directly stated"
|
||
examples:
|
||
- "If born in 1606 and died in 1669, he lived 63 years"
|
||
- "Based on style, attributed to the artist's late period"
|
||
annotation_fields:
|
||
- field: "epistemic_status"
|
||
value: "INFERRED"
|
||
- field: "inference_basis"
|
||
value: "description of reasoning"
|
||
|
||
- type: "HYPOTHETICAL"
|
||
description: "Conditional or speculative claim"
|
||
examples:
|
||
- "If the signature is authentic, the painting is by Vermeer"
|
||
- "The proposed identification remains unconfirmed"
|
||
annotation_fields:
|
||
- field: "epistemic_status"
|
||
value: "HYPOTHETICAL"
|
||
- field: "condition"
|
||
value: "description of condition"
|
||
|
||
source_reliability:
|
||
description: "Confidence based on source quality"
|
||
|
||
tiers:
|
||
- tier: 1
|
||
label: "AUTHORITATIVE"
|
||
sources:
|
||
- "Primary sources (original documents, eyewitness accounts)"
|
||
- "Official registries (ISIL, Wikidata with references)"
|
||
- "Peer-reviewed scholarship"
|
||
default_confidence: 0.95
|
||
|
||
- tier: 2
|
||
label: "RELIABLE"
|
||
sources:
|
||
- "Institutional websites"
|
||
- "Encyclopedia entries (Britannica, Wikipedia with citations)"
|
||
- "Expert secondary sources"
|
||
default_confidence: 0.85
|
||
|
||
- tier: 3
|
||
label: "CREDIBLE"
|
||
sources:
|
||
- "News media"
|
||
- "Wikipedia without citations"
|
||
- "Aggregated databases"
|
||
default_confidence: 0.70
|
||
|
||
- tier: 4
|
||
label: "UNVERIFIED"
|
||
sources:
|
||
- "User-generated content"
|
||
- "Social media"
|
||
- "NLP extraction without verification"
|
||
default_confidence: 0.50
|
||
|
||
- tier: 5
|
||
label: "SUSPECT"
|
||
sources:
|
||
- "Known unreliable sources"
|
||
- "Contradicted by authoritative sources"
|
||
- "Outdated information"
|
||
default_confidence: 0.20
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# LINGUISTIC UNCERTAINTY
|
||
# ---------------------------------------------------------------------------
|
||
|
||
linguistic_uncertainty:
|
||
description: |
|
||
Uncertainty encoded IN THE SOURCE TEXT itself through hedging,
|
||
modality, attribution, and evidentiality markers.
|
||
|
||
hedging_markers:
|
||
description: "Words/phrases indicating reduced certainty in source"
|
||
|
||
categories:
|
||
- category: "MODAL_VERBS"
|
||
markers:
|
||
- marker: "may/might"
|
||
uncertainty_reduction: 0.4
|
||
example: "The painting may be by Vermeer"
|
||
- marker: "could"
|
||
uncertainty_reduction: 0.3
|
||
example: "This could date to the 1640s"
|
||
- marker: "must (epistemic)"
|
||
uncertainty_reduction: 0.1
|
||
example: "He must have known the artist"
|
||
- marker: "should"
|
||
uncertainty_reduction: 0.2
|
||
example: "The document should contain the date"
|
||
|
||
- category: "HEDGING_ADVERBS"
|
||
markers:
|
||
- marker: "possibly/perhaps/maybe"
|
||
uncertainty_reduction: 0.4
|
||
example: "Possibly painted in Amsterdam"
|
||
- marker: "probably/likely"
|
||
uncertainty_reduction: 0.2
|
||
example: "Probably by Rembrandt's workshop"
|
||
- marker: "certainly/definitely"
|
||
uncertainty_reduction: 0.0
|
||
example: "Certainly authentic"
|
||
- marker: "apparently/seemingly"
|
||
uncertainty_reduction: 0.3
|
||
example: "Apparently a self-portrait"
|
||
|
||
- category: "HEDGING_ADJECTIVES"
|
||
markers:
|
||
- marker: "possible/potential"
|
||
uncertainty_reduction: 0.4
|
||
example: "A possible attribution to Hals"
|
||
- marker: "probable/likely"
|
||
uncertainty_reduction: 0.2
|
||
example: "The probable author"
|
||
- marker: "alleged/purported"
|
||
uncertainty_reduction: 0.5
|
||
example: "The alleged forgery"
|
||
- marker: "so-called"
|
||
uncertainty_reduction: 0.3
|
||
example: "The so-called 'Night Watch'"
|
||
|
||
- category: "ATTRIBUTION_PHRASES"
|
||
description: "Attribution to uncertain source"
|
||
markers:
|
||
- marker: "according to X"
|
||
uncertainty_reduction: "depends on X reliability"
|
||
example: "According to early sources..."
|
||
- marker: "it is said that"
|
||
uncertainty_reduction: 0.4
|
||
example: "It is said that Vermeer used..."
|
||
- marker: "traditionally attributed to"
|
||
uncertainty_reduction: 0.3
|
||
example: "Traditionally attributed to Leonardo"
|
||
- marker: "some scholars believe"
|
||
uncertainty_reduction: 0.3
|
||
example: "Some scholars believe this is..."
|
||
|
||
- category: "APPROXIMATION"
|
||
markers:
|
||
- marker: "circa/c./ca."
|
||
uncertainty_reduction: 0.2
|
||
example: "c. 1642"
|
||
- marker: "approximately/about/around"
|
||
uncertainty_reduction: 0.2
|
||
example: "Around 1,000 works"
|
||
- marker: "roughly/nearly"
|
||
uncertainty_reduction: 0.2
|
||
example: "Nearly 50 years old"
|
||
|
||
annotation_pattern:
|
||
description: "How to annotate linguistic uncertainty"
|
||
|
||
schema:
|
||
- field: "has_hedging"
|
||
type: "boolean"
|
||
description: "True if source text contains hedging"
|
||
|
||
- field: "hedging_markers"
|
||
type: "array[string]"
|
||
description: "List of hedging markers detected"
|
||
|
||
- field: "source_certainty"
|
||
type: "float"
|
||
description: "Certainty expressed in source (before extraction adjustment)"
|
||
formula: "1.0 - max(uncertainty_reductions)"
|
||
|
||
- field: "attributed_to"
|
||
type: "object"
|
||
description: "If claim is attributed, to whom"
|
||
schema:
|
||
source_name: "string"
|
||
source_reliability: "float"
|
||
is_author_endorsement: "boolean"
|
||
|
||
examples:
|
||
- text: "The painting is possibly by Rembrandt"
|
||
annotation:
|
||
has_hedging: true
|
||
hedging_markers: ["possibly"]
|
||
source_certainty: 0.6
|
||
notes: "Attribution uncertain in source text"
|
||
|
||
- text: "According to Houbraken, Rembrandt was born in Leiden"
|
||
annotation:
|
||
has_hedging: true
|
||
hedging_markers: ["according to"]
|
||
attributed_to:
|
||
source_name: "Arnold Houbraken"
|
||
source_reliability: 0.8
|
||
is_author_endorsement: false
|
||
source_certainty: 0.8
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# UNCERTAINTY ANNOTATION SCHEMA
|
||
# ---------------------------------------------------------------------------
|
||
|
||
annotation_schema:
|
||
description: "Complete schema for uncertainty annotation"
|
||
|
||
fields:
|
||
# Core confidence score
|
||
- field: "confidence"
|
||
type: "float"
|
||
range: [0.0, 1.0]
|
||
required: true
|
||
description: "Overall confidence in the annotation"
|
||
|
||
# Detailed confidence breakdown
|
||
- field: "confidence_breakdown"
|
||
type: "object"
|
||
optional: true
|
||
schema:
|
||
boundary_confidence: "float"
|
||
type_confidence: "float"
|
||
referent_confidence: "float"
|
||
extraction_confidence: "float"
|
||
description: "Component confidence scores"
|
||
|
||
# Confidence metadata
|
||
- field: "confidence_method"
|
||
type: "enum"
|
||
values:
|
||
- "MODEL_PROBABILITY" # From ML model output
|
||
- "RULE_CONFIDENCE" # Assigned by rule
|
||
- "HUMAN_JUDGMENT" # Manual annotation
|
||
- "ENSEMBLE_AGGREGATE" # Combined from multiple sources
|
||
- "HEURISTIC" # Based on heuristic rules
|
||
- "CALIBRATED" # Post-hoc calibrated
|
||
description: "How confidence was computed"
|
||
|
||
- field: "calibration_status"
|
||
type: "enum"
|
||
values:
|
||
- "UNCALIBRATED" # Raw model output
|
||
- "CALIBRATED" # Adjusted for reliability
|
||
- "VALIDATED" # Verified against ground truth
|
||
description: "Whether score has been calibrated"
|
||
|
||
# Epistemic uncertainty
|
||
- field: "epistemic_status"
|
||
type: "enum"
|
||
values:
|
||
- "ASSERTED" # Presented as fact
|
||
- "DISPUTED" # Multiple conflicting claims
|
||
- "UNKNOWN" # No reliable information
|
||
- "APPROXIMATE" # Estimated value
|
||
- "INFERRED" # Derived from other facts
|
||
- "HYPOTHETICAL" # Conditional/speculative
|
||
optional: true
|
||
default: "ASSERTED"
|
||
description: "Epistemic status of the claim"
|
||
|
||
- field: "competing_claims"
|
||
type: "array[object]"
|
||
optional: true
|
||
description: "If disputed, list of competing claims"
|
||
item_schema:
|
||
claim: "string"
|
||
source: "string"
|
||
confidence: "float"
|
||
|
||
# Linguistic uncertainty
|
||
- field: "source_hedging"
|
||
type: "object"
|
||
optional: true
|
||
schema:
|
||
has_hedging: "boolean"
|
||
markers: "array[string]"
|
||
source_certainty: "float"
|
||
description: "Hedging detected in source text"
|
||
|
||
# Attribution
|
||
- field: "attribution"
|
||
type: "object"
|
||
optional: true
|
||
schema:
|
||
attributed_to: "string (source/speaker)"
|
||
attribution_type: "enum (QUOTE, PARAPHRASE, CLAIM)"
|
||
endorsement: "boolean (does author endorse?)"
|
||
description: "If claim is attributed to another source"
|
||
|
||
# Review status
|
||
- field: "review_status"
|
||
type: "enum"
|
||
values:
|
||
- "UNREVIEWED" # Not yet reviewed
|
||
- "PENDING_REVIEW" # Flagged for review
|
||
- "REVIEWED" # Reviewed, accepted
|
||
- "DISPUTED" # Reviewer disagrees
|
||
- "CORRECTED" # Original was wrong, corrected
|
||
optional: true
|
||
description: "Human review status"
|
||
|
||
- field: "reviewed_by"
|
||
type: "string"
|
||
optional: true
|
||
description: "Reviewer identifier"
|
||
|
||
- field: "review_date"
|
||
type: "string"
|
||
format: "ISO 8601"
|
||
optional: true
|
||
description: "When reviewed"
|
||
|
||
- field: "review_notes"
|
||
type: "string"
|
||
optional: true
|
||
description: "Reviewer comments"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CALIBRATION AND VALIDATION
|
||
# ---------------------------------------------------------------------------
|
||
|
||
calibration:
|
||
description: |
|
||
Methods for ensuring confidence scores are RELIABLE (calibrated).
|
||
A calibrated score of 0.8 means 80% of annotations at that score are correct.
|
||
|
||
calibration_methods:
|
||
- method: "Temperature Scaling"
|
||
description: |
|
||
Post-hoc calibration using held-out validation set.
|
||
Learn temperature T such that softmax(logits/T) is calibrated.
|
||
suitable_for: "Neural model outputs"
|
||
|
||
- method: "Platt Scaling"
|
||
description: |
|
||
Fit sigmoid function to map raw scores to calibrated probabilities.
|
||
P(correct) = 1 / (1 + exp(A × score + B))
|
||
suitable_for: "Binary classification outputs"
|
||
|
||
- method: "Isotonic Regression"
|
||
description: |
|
||
Non-parametric calibration preserving score ordering.
|
||
Maps scores to calibrated values via piecewise constant function.
|
||
suitable_for: "When calibration curve is monotonic but not sigmoid-shaped"
|
||
|
||
- method: "Histogram Binning"
|
||
description: |
|
||
Bin predictions and assign calibrated score per bin.
|
||
suitable_for: "Simple, interpretable calibration"
|
||
|
||
calibration_metrics:
|
||
- metric: "Expected Calibration Error (ECE)"
|
||
description: "Average gap between confidence and accuracy per bin"
|
||
formula: "Σ |B_i|/n × |accuracy(B_i) - confidence(B_i)|"
|
||
target: "< 0.05"
|
||
|
||
- metric: "Maximum Calibration Error (MCE)"
|
||
description: "Largest gap across bins"
|
||
formula: "max_i |accuracy(B_i) - confidence(B_i)|"
|
||
target: "< 0.10"
|
||
|
||
- metric: "Brier Score"
|
||
description: "Mean squared error of probability estimates"
|
||
formula: "Σ (p_i - y_i)² / n"
|
||
target: "Lower is better"
|
||
|
||
validation_requirements:
|
||
description: "Requirements for validating confidence scores"
|
||
|
||
requirements:
|
||
- "Held-out test set with ground truth annotations"
|
||
- "Stratified sampling across entity types and confidence ranges"
|
||
- "Minimum 100 samples per calibration bin"
|
||
- "Regular recalibration as model/data changes"
|
||
|
||
monitoring:
|
||
- "Track calibration metrics over time"
|
||
- "Alert on calibration drift"
|
||
- "Retrain calibration when ECE exceeds threshold"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# INTER-ANNOTATOR AGREEMENT
|
||
# ---------------------------------------------------------------------------
|
||
|
||
inter_annotator_agreement:
|
||
description: |
|
||
Metrics for measuring agreement between annotators, which informs
|
||
confidence estimation and task difficulty assessment.
|
||
|
||
metrics:
|
||
- metric: "Cohen's Kappa (κ)"
|
||
description: "Agreement corrected for chance (2 annotators)"
|
||
formula: "κ = (P_o - P_e) / (1 - P_e)"
|
||
interpretation:
|
||
- range: [0.81, 1.0]
|
||
label: "Almost perfect"
|
||
- range: [0.61, 0.80]
|
||
label: "Substantial"
|
||
- range: [0.41, 0.60]
|
||
label: "Moderate"
|
||
- range: [0.21, 0.40]
|
||
label: "Fair"
|
||
- range: [0.0, 0.20]
|
||
label: "Slight"
|
||
|
||
- metric: "Fleiss' Kappa"
|
||
description: "Multi-annotator extension of Cohen's κ"
|
||
use_case: "3+ annotators on same items"
|
||
|
||
- metric: "Krippendorff's Alpha (α)"
|
||
description: "Handles missing data, any number of annotators"
|
||
use_case: "Production annotation with variable annotator coverage"
|
||
|
||
- metric: "F1 Agreement"
|
||
description: "Treat one annotator as gold, compute F1"
|
||
use_case: "When one annotator is more senior/authoritative"
|
||
|
||
agreement_to_confidence:
|
||
description: |
|
||
Use IAA to inform confidence scoring. Low agreement items
|
||
should have lower confidence bounds.
|
||
|
||
heuristic:
|
||
- agreement: "All annotators agree"
|
||
confidence_boost: 0.1
|
||
|
||
- agreement: "Majority agrees (>66%)"
|
||
confidence_boost: 0.0
|
||
|
||
- agreement: "Split decision (50%)"
|
||
confidence_reduction: 0.2
|
||
flag: "REQUIRES_ADJUDICATION"
|
||
|
||
- agreement: "No majority"
|
||
confidence_reduction: 0.4
|
||
flag: "HIGHLY_AMBIGUOUS"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# UNCERTAINTY PROPAGATION
|
||
# ---------------------------------------------------------------------------
|
||
|
||
uncertainty_propagation:
|
||
description: |
|
||
How uncertainty flows through annotation pipelines and affects
|
||
downstream tasks.
|
||
|
||
propagation_rules:
|
||
- stage: "Entity Extraction → Relationship Extraction"
|
||
rule: |
|
||
Relationship confidence ≤ min(subject_confidence, object_confidence)
|
||
rationale: "Can't have high-confidence relationship with low-confidence entities"
|
||
|
||
- stage: "Mention Detection → Coreference"
|
||
rule: |
|
||
Coreference confidence ≤ min(mention_1_confidence, mention_2_confidence)
|
||
rationale: "Coreference uncertain if mentions are uncertain"
|
||
|
||
- stage: "Individual Annotations → Aggregate Statistics"
|
||
rule: |
|
||
Report confidence intervals, not just point estimates.
|
||
E.g., "8,000 ± 500 entities extracted (95% CI)"
|
||
|
||
- stage: "Multiple Sources → Merged Entity"
|
||
rule: |
|
||
merged_confidence = f(source_confidences, agreement)
|
||
Agreement boosts; conflict reduces.
|
||
|
||
monte_carlo_simulation:
|
||
description: |
|
||
For complex pipelines, use Monte Carlo simulation:
|
||
1. Sample annotations according to confidence distributions
|
||
2. Run downstream pipeline
|
||
3. Aggregate to get output distribution
|
||
use_case: "Estimating uncertainty in knowledge graph population"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# PRESENTATION AND VISUALIZATION
|
||
# ---------------------------------------------------------------------------
|
||
|
||
presentation:
|
||
description: "How to present uncertainty to users"
|
||
|
||
ui_guidelines:
|
||
- guideline: "Color Coding"
|
||
description: "Use consistent color scale from green (certain) to red (uncertain)"
|
||
implementation:
|
||
- confidence_range: "[0.9, 1.0]"
|
||
confidence_min: 0.9
|
||
confidence_max: 1.0
|
||
color: "green"
|
||
icon: "✓"
|
||
- confidence_range: "[0.7, 0.9)"
|
||
confidence_min: 0.7
|
||
confidence_max: 0.9
|
||
color: "light-green"
|
||
icon: "○"
|
||
- confidence_range: "[0.5, 0.7)"
|
||
confidence_min: 0.5
|
||
confidence_max: 0.7
|
||
color: "yellow"
|
||
icon: "?"
|
||
- confidence_range: "[0.3, 0.5)"
|
||
confidence_min: 0.3
|
||
confidence_max: 0.5
|
||
color: "orange"
|
||
icon: "⚠"
|
||
- confidence_range: "[0.0, 0.3)"
|
||
confidence_min: 0.0
|
||
confidence_max: 0.3
|
||
color: "red"
|
||
icon: "✗"
|
||
|
||
- guideline: "Uncertainty Indicators"
|
||
description: "Visual indicators scaled to confidence"
|
||
options:
|
||
- "Border thickness (thicker = less certain)"
|
||
- "Opacity (more transparent = less certain)"
|
||
- "Hatching/patterns for uncertain regions"
|
||
- "Tooltip with detailed confidence breakdown"
|
||
|
||
- guideline: "Sortable/Filterable"
|
||
description: "Allow users to sort and filter by confidence"
|
||
features:
|
||
- "Slider to set confidence threshold"
|
||
- "Show only items needing review (conf < 0.8)"
|
||
- "Sort by confidence ascending (most uncertain first)"
|
||
|
||
- guideline: "Confidence Distribution"
|
||
description: "Show overall confidence distribution"
|
||
implementation: "Histogram of confidence scores across annotations"
|
||
|
||
- guideline: "Explain Uncertainty"
|
||
description: "On hover/click, explain WHY confidence is low"
|
||
example: |
|
||
"Low confidence (0.45) because:
|
||
- Ambiguous entity type (0.52)
|
||
- Multiple possible referents in KB (0.65)
|
||
- Source text contains hedging: 'possibly'"
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# EXAMPLES
|
||
# ---------------------------------------------------------------------------
|
||
|
||
examples:
|
||
- name: "High confidence entity"
|
||
text: "The Rijksmuseum in Amsterdam houses Rembrandt's Night Watch."
|
||
annotation:
|
||
entity: "Rijksmuseum"
|
||
entity_type: "GRP.ORG"
|
||
confidence: 0.94
|
||
confidence_breakdown:
|
||
boundary_confidence: 0.99
|
||
type_confidence: 0.98
|
||
referent_confidence: 0.96
|
||
extraction_confidence: 0.98
|
||
confidence_method: "MODEL_PROBABILITY"
|
||
epistemic_status: "ASSERTED"
|
||
external_uri: "https://www.wikidata.org/entity/Q190804"
|
||
|
||
- name: "Hedged attribution"
|
||
text: "The painting is possibly by Rembrandt or his workshop."
|
||
annotation:
|
||
entity: "Rembrandt or his workshop"
|
||
entity_type: "AGT.PER"
|
||
confidence: 0.75
|
||
confidence_breakdown:
|
||
boundary_confidence: 0.85
|
||
type_confidence: 0.90
|
||
referent_confidence: 0.60
|
||
extraction_confidence: 0.85
|
||
source_hedging:
|
||
has_hedging: true
|
||
markers: ["possibly", "or"]
|
||
source_certainty: 0.5
|
||
epistemic_status: "DISPUTED"
|
||
competing_claims:
|
||
- claim: "By Rembrandt himself"
|
||
confidence: 0.4
|
||
- claim: "By Rembrandt's workshop"
|
||
confidence: 0.5
|
||
- claim: "Later copy"
|
||
confidence: 0.1
|
||
|
||
- name: "Approximate temporal reference"
|
||
text: "The collection was established around 1800."
|
||
annotation:
|
||
entity: "around 1800"
|
||
entity_type: "TMP.DAB"
|
||
confidence: 0.85
|
||
epistemic_status: "APPROXIMATE"
|
||
source_hedging:
|
||
has_hedging: true
|
||
markers: ["around"]
|
||
source_certainty: 0.8
|
||
temporal_uncertainty:
|
||
point_estimate: "1800-01-01"
|
||
range_start: "1795-01-01"
|
||
range_end: "1805-12-31"
|
||
precision: "circa_decade"
|
||
|
||
- name: "Low confidence extraction needing review"
|
||
text: "Dr. J. van der Berg described the artifact."
|
||
annotation:
|
||
entity: "Dr. J. van der Berg"
|
||
entity_type: "AGT.PER"
|
||
confidence: 0.45
|
||
confidence_breakdown:
|
||
boundary_confidence: 0.90
|
||
type_confidence: 0.85
|
||
referent_confidence: 0.35
|
||
extraction_confidence: 0.80
|
||
confidence_method: "MODEL_PROBABILITY"
|
||
review_status: "PENDING_REVIEW"
|
||
review_notes: |
|
||
Multiple "J. van der Berg" in knowledge base.
|
||
Need additional context for disambiguation.
|
||
possible_referents:
|
||
- uri: "https://viaf.org/viaf/12345"
|
||
name: "Johan van der Berg (1890-1960)"
|
||
match_confidence: 0.35
|
||
- uri: "https://viaf.org/viaf/67890"
|
||
name: "Johannes van der Berg (1920-1995)"
|
||
match_confidence: 0.30
|
||
- uri: null
|
||
name: "(Unknown person)"
|
||
match_confidence: 0.35
|
||
|
||
# =============================================================================
|
||
# END OF CONVENTION
|
||
# =============================================================================
|
||
#
|
||
# This convention is COMPLETE and SELF-CONTAINED.
|
||
# Version: 1.7.0-unified (DEPRECATED - replaced by ch_annotator-v1_7_0.yaml)
|
||
# Date: 2025-12-02
|
||
#
|
||
# For questions or updates, see: docs/convention/README.md
|
||
# =============================================================================
|