# =============================================================================
# GLAM-NER: CONVENTION METADATA MODULE
# =============================================================================
# Module: core/convention.yaml
# Parent: entity_annotation_rules_v1.7.0_unified.yaml
# Purpose: Convention identity, versioning, scope, and authority references
# =============================================================================

# Identifier URI and short machine-readable name of this metadata module.
id: https://w3id.org/glam/ner/convention
name: glam-ner-convention-metadata

convention:
  # NOTE(review): the nesting of the sections below under `convention` was
  # reconstructed from a whitespace-mangled copy (sub-section banners are two
  # characters narrower than the file-level banners) - confirm against the
  # parent unified file.

  # Identity and release metadata. Version and date are quoted so they stay
  # strings instead of being implicitly typed by the YAML parser.
  name: "GLAM-NER Unified Entity Annotation Convention"
  version: "1.7.0-unified"
  date: "2025-12-02"
  status: "production"

  # Human-readable overview (literal block scalar: line breaks are preserved).
  description: |
    A unified, domain-agnostic convention for the complete entity extraction
    pipeline: Named Entity Recognition (NER), Property Extraction, Entity
    Resolution, Entity Linking, and Claim Validation.

    This convention applies universally to any text source: historical
    manuscripts, modern websites, archival documents, or digital platforms.

    The convention identifies 9 hypernym entity types (expanded from 8 in v1.6.x)
    with ontology mappings to CIDOC-CRM, RiC-O, Schema.org, PiCO, W3C Org,
    RegOrg, FOAF, vCard, EDM, OWL-Time, GeoSPARQL, FRBR, TEI, and NIF 2.0.

    CRITICAL DESIGN PRINCIPLES (v1.7.0):
    1. AGENT vs PERSON: "Being" is too narrow - AGENT covers humans, animals, AI,
    fictional characters, robots. Uses CIDOC-CRM E39_Actor as primary class.
    2. TOPONYM vs GEOMETRY: Place names (nominal references) are distinct from
    coordinate data (geometric representations). Never conflate these.
    3. GROUP vs ORGANISATION: "Organisation" implies formal structure - GROUP
    covers all collectives from informal bands to registered corporations.
    4. TEMPORAL distinctions: Absolute timestamps, relative expressions, durations,
    and recurring periods have fundamentally different semantics (TimeML/TIMEX3).
    5. ROLE is distinct from AGENT: "Director" is a role; "Dr. Jan de Wit" is an
    agent. Roles can be filled by different agents over time.
    6. TEXTUAL REFERENCE is not PRODUCT: Documents have Work/Expression/Manifestation
    distinctions (FRBR) that product-centric models miss.

    All extracted claims MUST include verifiable provenance following the LinkML
    schema at https://linkml.io/ and this project's WebClaim.yaml schema.

  # ---------------------------------------------------------------------------
  # HYPERNYM SUMMARY
  # ---------------------------------------------------------------------------
  # One entry per hypernym: short code, name, one-line description, and the
  # module file listed for it.
  hypernyms:
    - code: AGT
      name: AGENT
      description: "Entities capable of intentional action (humans, animals, AI, fictional)"
      module: "hypernyms/agt.yaml"
    - code: GRP
      name: GROUP
      description: "All collectives from informal bands to formal corporations"
      module: "hypernyms/grp.yaml"
    - code: TOP
      name: TOPONYM
      description: "Place names (nominal references to locations)"
      module: "hypernyms/top.yaml"
    - code: GEO
      name: GEOMETRY
      description: "Coordinate and shape data (geometric representations)"
      module: "hypernyms/geo.yaml"
    - code: TMP
      name: TEMPORAL_REFERENCE
      description: "Time expressions (absolute, relative, durations, recurring)"
      module: "hypernyms/tmp.yaml"
    - code: APP
      name: APPELLATION
      description: "Identifiers, codes, contact information"
      module: "hypernyms/app.yaml"
    - code: ROL
      name: ROLE
      description: "Positions, honorifics, titles, occupational designations"
      module: "hypernyms/rol.yaml"
    - code: WRK
      name: TEXTUAL_REFERENCE
      description: "References to documents, works, expressions (FRBR model)"
      module: "hypernyms/wrk.yaml"
    - code: DOC
      name: DOCUMENT_REGION
      description: "Document structure and layout regions"
      module: "hypernyms/doc.yaml"

  # ---------------------------------------------------------------------------
  # SCOPE DEFINITION
  # ---------------------------------------------------------------------------
  # Pipeline stages covered by the convention, each as a single-key mapping of
  # stage name to a short description.
  scope:
    - entity_recognition: "Detect and classify named entities in text"
    - property_extraction: "Extract relationships and attributes between entities"
    - entity_resolution: "Disambiguate and merge entity mentions"
    - entity_linking: "Link entities to knowledge bases (Wikidata, etc.)"
    - claim_validation: "Verify and track provenance for all claims"

  # ---------------------------------------------------------------------------
  # DIGITAL HUMANITIES AUTHORITIES
  # ---------------------------------------------------------------------------
  # External standards the convention defers to; `note` states what each one
  # is used for.
  digital_humanities_authorities:
    - name: "TEI P5 Guidelines, Chapter 14: Names, Dates, People, Places"
      url: "https://tei-c.org/release/doc/tei-p5-doc/en/html/ND.html"
      note: "Authoritative for persName, placeName, roleName, orgName distinctions"
    - name: "TimeML/TIMEX3 Specification"
      url: "https://timeml.github.io/site/timebank/"
      note: "Authoritative for temporal expression annotation types"
    - name: "CIDOC-CRM 7.1.3"
      url: "https://cidoc-crm.org/Version/version-7.1.3"
      note: "Authoritative for heritage domain entity classes"
    - name: "PiCO (Persons in Context) Ontology"
      url: "https://personsincontext.org/"
      note: "Authoritative for person observation/reconstruction pattern"
    - name: "FRBR (Functional Requirements for Bibliographic Records)"
      url: "https://www.ifla.org/publications/functional-requirements-for-bibliographic-records"
      note: "Authoritative for Work/Expression/Manifestation/Item distinctions"
    - name: "Pleiades Gazetteer"
      url: "https://pleiades.stoa.org/"
      note: "Model for place/name/location separation in historical geography"
    - name: "GeoSPARQL"
      url: "https://www.ogc.org/standard/geosparql/"
      note: "Authoritative for spatial geometry representation"

  # ---------------------------------------------------------------------------
  # NERD DEPRECATION NOTICE
  # ---------------------------------------------------------------------------
  # Literal block scalar: the leading "- " items below are text inside the
  # note, not a YAML sequence.
  nerd_deprecation_note: |
    NERD (Named Entity Recognition and Disambiguation) ontology mappings are
    RETAINED for cross-system NLP tool interchange but are NOT AUTHORITATIVE
    for this convention. NERD has biases toward modern web/journalism contexts:
    - nerd:Person excludes non-human agents (too narrow)
    - nerd:Location conflates toponyms with geometry (imprecise)
    - nerd:Organization excludes informal collectives (too narrow)
    - nerd:Time conflates absolute/relative/duration (imprecise)
    - nerd:Product misrepresents textual references (wrong semantics)
    - NERD lacks role/title/honorific class (missing category)

    Use NERD classes only for NIF export to downstream NLP tools that expect
    NERD vocabulary. Internal processing uses CIDOC-CRM, TEI, and TimeML classes.

  # ---------------------------------------------------------------------------
  # LINKML REFERENCE
  # ---------------------------------------------------------------------------
  # Specification URL, version line, and stated purpose of LinkML here.
  linkml_reference:
    specification: "https://linkml.io/linkml/"
    version: "1.8.x"
    purpose: |
      LinkML (Linked Data Modeling Language) is used for:
      - Formal schema definitions (classes, slots, enums)
      - Instance validation
      - Multi-format serialization (YAML, JSON, RDF, etc.)
      - Type-safe claim structures

# =============================================================================
# CHANGELOG (abbreviated - full changelog in main unified file)
# =============================================================================
# Entries are listed newest first. Each entry carries a quoted version and
# date plus its change lists; only 1.7.0 has `breaking_changes`.
changelog:
  - version: "1.7.0"
    date: "2025-12-02"
    breaking_changes:
      - "Renamed BEING hypernym to AGENT (code: AGT)"
      - "Renamed ORGANISATION hypernym to GROUP (code: GRP)"
      - "Split PLACE into TOPONYM and GEOMETRY"
      - "Restructured TEMPORAL_REFERENCE with TimeML/TIMEX3"
      - "Replaced nerd:Product with FRBR model for TEXTUAL_REFERENCE"
      - "Added new hypernym ROLE (code: ROL)"
    additions:
      - "Section 15: DOCUMENT_REGION hypernym with comprehensive layout ontology"
      - "Section 16: Relationship Annotations (11 hypernyms)"
      - "Section 17: Coreference Resolution"
      - "Section 18: Uncertainty and Confidence Scoring"
  - version: "1.6.3"
    date: "2025-11-30"
    additions:
      - "Expanded scope to full extraction pipeline"
      - "Property Extraction Rules section"
      - "Claim Validation Schema (LinkML-based provenance)"
      - "Entity Resolution and Linking Pipeline section"
  - version: "1.6.2"
    date: "2025-11-28"
    additions:
      - "NIF (NLP Interchange Format) vocabulary"
      - "NERD ontology core class mappings"
      - "W3C Web Annotation Data Model"
  - version: "1.6.1"
    date: "2025-11-25"
    additions:
      - "W3C Org Ontology mappings"
      - "RegOrg vocabulary mappings"
      - "Europeana Data Model (EDM) mappings"