# =============================================================================
# GLAM-NER: CONVENTION METADATA MODULE
# =============================================================================
# Module: core/convention.yaml
# Parent: entity_annotation_rules_v1.7.0_unified.yaml
# Purpose: Convention identity, versioning, scope, and authority references
# =============================================================================
#
# NOTE(review): this file was reconstructed from a copy whose line breaks and
# indentation had been collapsed. Keys and scalar text are unchanged, but two
# things should be confirmed against the upstream file:
#   1. that the sections below `convention` (hypernyms, scope, ...) are
#      top-level siblings of `convention` rather than nested inside it;
#   2. the exact line breaks inside the `|` literal block scalars, which are
#      part of the scalar value.

# Module identifier and short machine-readable name.
id: https://w3id.org/glam/ner/convention
name: glam-ner-convention-metadata

# Identity and release information for the convention itself.
# `version` and `date` are quoted so they always load as strings.
convention:
  name: "GLAM-NER Unified Entity Annotation Convention"
  version: "1.7.0-unified"
  date: "2025-12-02"
  status: "production"

  # Literal block scalar: line breaks below are preserved in the loaded value.
  description: |
    A unified, domain-agnostic convention for the complete entity extraction
    pipeline: Named Entity Recognition (NER), Property Extraction, Entity
    Resolution, Entity Linking, and Claim Validation.

    This convention applies universally to any text source: historical
    manuscripts, modern websites, archival documents, or digital platforms.

    The convention identifies 9 hypernym entity types (expanded from 8 in
    v1.6.x) with ontology mappings to CIDOC-CRM, RiC-O, Schema.org, PiCO,
    W3C Org, RegOrg, FOAF, vCard, EDM, OWL-Time, GeoSPARQL, FRBR, TEI, and
    NIF 2.0.

    CRITICAL DESIGN PRINCIPLES (v1.7.0):

    1. AGENT vs PERSON: "Being" is too narrow - AGENT covers humans, animals,
       AI, fictional characters, robots. Uses CIDOC-CRM E39_Actor as primary
       class.

    2. TOPONYM vs GEOMETRY: Place names (nominal references) are distinct
       from coordinate data (geometric representations). Never conflate
       these.

    3. GROUP vs ORGANISATION: "Organisation" implies formal structure - GROUP
       covers all collectives from informal bands to registered corporations.

    4. TEMPORAL distinctions: Absolute timestamps, relative expressions,
       durations, and recurring periods have fundamentally different
       semantics (TimeML/TIMEX3).

    5. ROLE is distinct from AGENT: "Director" is a role; "Dr. Jan de Wit" is
       an agent. Roles can be filled by different agents over time.

    6. TEXTUAL REFERENCE is not PRODUCT: Documents have
       Work/Expression/Manifestation distinctions (FRBR) that product-centric
       models miss.

    All extracted claims MUST include verifiable provenance following the
    LinkML schema at https://linkml.io/ and this project's WebClaim.yaml
    schema.

# ---------------------------------------------------------------------------
# HYPERNYM SUMMARY
# ---------------------------------------------------------------------------
# One entry per hypernym: three-letter code, name, one-line description, and
# the module path listed for it. The list has 9 entries, matching the count
# stated in `convention.description`.

hypernyms:
  - code: AGT
    name: AGENT
    description: "Entities capable of intentional action (humans, animals, AI, fictional)"
    module: "hypernyms/agt.yaml"

  - code: GRP
    name: GROUP
    description: "All collectives from informal bands to formal corporations"
    module: "hypernyms/grp.yaml"

  - code: TOP
    name: TOPONYM
    description: "Place names (nominal references to locations)"
    module: "hypernyms/top.yaml"

  - code: GEO
    name: GEOMETRY
    description: "Coordinate and shape data (geometric representations)"
    module: "hypernyms/geo.yaml"

  - code: TMP
    name: TEMPORAL_REFERENCE
    description: "Time expressions (absolute, relative, durations, recurring)"
    module: "hypernyms/tmp.yaml"

  - code: APP
    name: APPELLATION
    description: "Identifiers, codes, contact information"
    module: "hypernyms/app.yaml"

  - code: ROL
    name: ROLE
    description: "Positions, honorifics, titles, occupational designations"
    module: "hypernyms/rol.yaml"

  - code: WRK
    name: TEXTUAL_REFERENCE
    description: "References to documents, works, expressions (FRBR model)"
    module: "hypernyms/wrk.yaml"

  - code: DOC
    name: DOCUMENT_REGION
    description: "Document structure and layout regions"
    module: "hypernyms/doc.yaml"

# ---------------------------------------------------------------------------
# SCOPE DEFINITION
# ---------------------------------------------------------------------------
# A sequence of single-key mappings (pipeline stage -> what it does), kept in
# the order given by the source.

scope:
  - entity_recognition: "Detect and classify named entities in text"
  - property_extraction: "Extract relationships and attributes between entities"
  - entity_resolution: "Disambiguate and merge entity mentions"
  - entity_linking: "Link entities to knowledge bases (Wikidata, etc.)"
  - claim_validation: "Verify and track provenance for all claims"

# ---------------------------------------------------------------------------
# DIGITAL HUMANITIES AUTHORITIES
# ---------------------------------------------------------------------------
# External standards the convention defers to; `note` states what each one is
# treated as authoritative (or a model) for.

digital_humanities_authorities:
  - name: "TEI P5 Guidelines, Chapter 14: Names, Dates, People, Places"
    url: "https://tei-c.org/release/doc/tei-p5-doc/en/html/ND.html"
    note: "Authoritative for persName, placeName, roleName, orgName distinctions"

  - name: "TimeML/TIMEX3 Specification"
    url: "https://timeml.github.io/site/timebank/"
    note: "Authoritative for temporal expression annotation types"

  - name: "CIDOC-CRM 7.1.3"
    url: "https://cidoc-crm.org/Version/version-7.1.3"
    note: "Authoritative for heritage domain entity classes"

  - name: "PiCO (Persons in Context) Ontology"
    url: "https://personsincontext.org/"
    note: "Authoritative for person observation/reconstruction pattern"

  - name: "FRBR (Functional Requirements for Bibliographic Records)"
    url: "https://www.ifla.org/publications/functional-requirements-for-bibliographic-records"
    note: "Authoritative for Work/Expression/Manifestation/Item distinctions"

  - name: "Pleiades Gazetteer"
    url: "https://pleiades.stoa.org/"
    note: "Model for place/name/location separation in historical geography"

  - name: "GeoSPARQL"
    url: "https://www.ogc.org/standard/geosparql/"
    note: "Authoritative for spatial geometry representation"

# ---------------------------------------------------------------------------
# NERD DEPRECATION NOTICE
# ---------------------------------------------------------------------------
# Literal block scalar: the "- " lines below are text inside the string, not a
# YAML sequence.

nerd_deprecation_note: |
  NERD (Named Entity Recognition and Disambiguation) ontology mappings are
  RETAINED for cross-system NLP tool interchange but are NOT AUTHORITATIVE
  for this convention.

  NERD has biases toward modern web/journalism contexts:
  - nerd:Person excludes non-human agents (too narrow)
  - nerd:Location conflates toponyms with geometry (imprecise)
  - nerd:Organization excludes informal collectives (too narrow)
  - nerd:Time conflates absolute/relative/duration (imprecise)
  - nerd:Product misrepresents textual references (wrong semantics)
  - NERD lacks role/title/honorific class (missing category)

  Use NERD classes only for NIF export to downstream NLP tools that expect
  NERD vocabulary. Internal processing uses CIDOC-CRM, TEI, and TimeML
  classes.

# ---------------------------------------------------------------------------
# LINKML REFERENCE
# ---------------------------------------------------------------------------
# `version` is quoted so "1.8.x" stays a string.

linkml_reference:
  specification: "https://linkml.io/linkml/"
  version: "1.8.x"
  purpose: |
    LinkML (Linked Data Modeling Language) is used for:
    - Formal schema definitions (classes, slots, enums)
    - Instance validation
    - Multi-format serialization (YAML, JSON, RDF, etc.)
    - Type-safe claim structures

# =============================================================================
# CHANGELOG (abbreviated - full changelog in main unified file)
# =============================================================================
# Newest release first. Every entry has `version`, `date` and `additions`;
# only 1.7.0 also lists `breaking_changes`.

changelog:
  - version: "1.7.0"
    date: "2025-12-02"
    breaking_changes:
      - "Renamed BEING hypernym to AGENT (code: AGT)"
      - "Renamed ORGANISATION hypernym to GROUP (code: GRP)"
      - "Split PLACE into TOPONYM and GEOMETRY"
      - "Restructured TEMPORAL_REFERENCE with TimeML/TIMEX3"
      - "Replaced nerd:Product with FRBR model for TEXTUAL_REFERENCE"
      - "Added new hypernym ROLE (code: ROL)"
    additions:
      - "Section 15: DOCUMENT_REGION hypernym with comprehensive layout ontology"
      - "Section 16: Relationship Annotations (11 hypernyms)"
      - "Section 17: Coreference Resolution"
      - "Section 18: Uncertainty and Confidence Scoring"

  - version: "1.6.3"
    date: "2025-11-30"
    additions:
      - "Expanded scope to full extraction pipeline"
      - "Property Extraction Rules section"
      - "Claim Validation Schema (LinkML-based provenance)"
      - "Entity Resolution and Linking Pipeline section"

  - version: "1.6.2"
    date: "2025-11-28"
    additions:
      - "NIF (NLP Interchange Format) vocabulary"
      - "NERD ontology core class mappings"
      - "W3C Web Annotation Data Model"

  - version: "1.6.1"
    date: "2025-11-25"
    additions:
      - "W3C Org Ontology mappings"
      - "RegOrg vocabulary mappings"
      - "Europeana Data Model (EDM) mappings"