# NOTE(review): the lines below appear to be repository-page residue (a commit
# message and file statistics), not part of the schema itself — confirm before
# keeping them in the file.
#
# - Implemented a new script `test_pico_arabic_waqf.py` to test the GLM
#   annotator's ability to extract person observations from Arabic historical
#   documents.
# - The script includes environment variable handling for the API token,
#   structured prompts for the GLM API, and validation of extraction results.
# - Added comprehensive logging for API responses, extraction results, and
#   validation errors.
# - Included a sample Arabic waqf text for testing purposes, following the
#   PiCo ontology pattern.
#
# File stats: 301 lines, 11 KiB, YAML
# =============================================================================
# CH-Annotator Entity Annotation Convention - Modular Schema Index
# =============================================================================
# Convention ID: ch_annotator-v1_7_0
# Full Name: CH-Annotator (Cultural Heritage Annotator)
# Version: 1.7.0
# Date: 2025-12-02
# Renamed: 2025-12-06 (formerly GLAM-NER)
#
# This is the main entry point for the modular entity annotation convention.
# All modules are organized by category and can be imported individually
# or as a complete set.
#
# BREAKING CHANGES in v1.7.0:
# - BEING → AGENT (AGT)
# - PLACE → TOPONYM (TOP) + GEOMETRY (GEO)
# - ORGANISATION → GROUP (GRP)
# - TEMPORAL restructured with TimeML/TIMEX3
# - TEXTUAL_REFERENCE → WORK (WRK) with FRBR model
# - Added ROLE (ROL) hypernym
# =============================================================================

# Identity, version and human-readable summary of the convention.
schema:
  id: ch_annotator
  name: "CH-Annotator Entity Annotation Convention"
  # Version and date are quoted so they stay strings rather than being
  # implicitly typed as a float or a date.
  version: "1.7.0"
  version_date: "2025-12-02"
  status: "stable"
  formerly_known_as: "GLAM-NER"

  # Literal block scalar: line breaks below are preserved as written.
  description: |
    A comprehensive convention for annotating named entities in heritage,
    archival, library, and museum (GLAM) contexts. This convention prioritizes
    Digital Humanities standards (TEI, CIDOC-CRM, TimeML, FRBR, GeoSPARQL)
    over web-centric NER systems.

    The convention defines 10 hypernym categories with domain-agnostic
    subcategories suitable for:
    - Heritage institutions and collections
    - Web content and digital platforms
    - Publishing and scholarly communication
    - Archives and records management
    - Legal and governmental documents
    - Academic and research contexts

# =============================================================================
# MODULE IMPORTS
# =============================================================================

# Each category maps to a list of module entries; every entry carries a
# `path` plus descriptive metadata.
# NOTE(review): paths are presumably relative to this index file — confirm.
modules:

  # ---------------------------------------------------------------------------
  # CORE MODULES - Convention metadata and namespaces
  # ---------------------------------------------------------------------------
  core:
    - path: "core/convention.yaml"
      description: "Convention metadata, version, scope, DH authorities"
    - path: "core/namespaces.yaml"
      description: "All ontology namespace prefixes with categories"

  # ---------------------------------------------------------------------------
  # HYPERNYM MODULES - Entity type definitions
  # ---------------------------------------------------------------------------
  hypernyms:
    # Agents and Persons
    - path: "hypernyms/agt.yaml"
      id: "AGT"
      name: "AGENT"
      description: "Humans, AI agents, animals, fictional beings"
      primary_class: "crm:E39_Actor"

    # Collectives and Organizations
    - path: "hypernyms/grp.yaml"
      id: "GRP"
      name: "GROUP"
      description: "Formal and informal collectives of agents"
      primary_class: "crm:E74_Group"

    # Place Names
    - path: "hypernyms/top.yaml"
      id: "TOP"
      name: "TOPONYM"
      description: "Place names as nominal references"
      primary_class: "crm:E53_Place"

    # Spatial Geometry
    - path: "hypernyms/geo.yaml"
      id: "GEO"
      name: "GEOMETRY"
      description: "Coordinates, polygons, spatial primitives"
      primary_class: "geo:Geometry"

    # Temporal Expressions
    - path: "hypernyms/tmp.yaml"
      id: "TMP"
      name: "TEMPORAL"
      description: "TimeML/TIMEX3 temporal expressions"
      primary_class: "crm:E52_Time-Span"

    # Names and Titles
    - path: "hypernyms/app.yaml"
      id: "APP"
      name: "APPELLATION"
      description: "Titles, collection names, awards, structured names"
      primary_class: "crm:E41_Appellation"

    # Social Positions
    - path: "hypernyms/rol.yaml"
      id: "ROL"
      name: "ROLE"
      description: "Occupations, honorifics, positions"
      primary_class: "org:Role"

    # Intellectual Works
    - path: "hypernyms/wrk.yaml"
      id: "WRK"
      name: "WORK"
      description: "FRBR Work/Expression/Manifestation/Item"
      primary_class: "frbroo:F1_Work"

    # Quantities
    - path: "hypernyms/qty.yaml"
      id: "QTY"
      name: "QUANTITY"
      description: "Counts, measurements, currency, ranges"
      primary_class: "crm:E54_Dimension"

    # Objects and Concepts
    - path: "hypernyms/thg.yaml"
      id: "THG"
      name: "THING"
      description: "Artworks, artifacts, events, concepts"
      primary_class: "crm:E70_Thing"

  # ---------------------------------------------------------------------------
  # PROCESSING MODULES - Extraction and annotation rules
  # ---------------------------------------------------------------------------
  processing:
    - path: "processing/exclusions.yaml"
      description: "Universal exclusion rules for all entity types"
    - path: "processing/double_tagging.yaml"
      description: "Permitted and prohibited double-tagging patterns"
    - path: "processing/relationships.yaml"
      description: "Ontology relationship patterns between entities"

  # ---------------------------------------------------------------------------
  # INTEGRATION MODULES - External system mappings
  # ---------------------------------------------------------------------------
  integrations:
    - path: "integrations/pico.yaml"
      description: "PiCo ontology integration for person observations"
    - path: "integrations/nif_nerd.yaml"
      description: "NIF/NERD/Open Annotation compatibility layer with GLAM-NER mappings"

  # ---------------------------------------------------------------------------
  # RELATIONSHIP MODULES - Family and social relationship patterns
  # ---------------------------------------------------------------------------
  relationships:
    - path: "relationships/family.yaml"
      description: "Family relationship properties and historical source patterns (34 relationship types, 13 languages)"
      # NOTE(review): presumably the length of the referenced file — confirm.
      line_count: 1503
      languages:
        - "Dutch"
        - "Latin"
        - "German"
        - "Arabic"
        - "French"
        - "Ottoman Turkish"
        - "Hebrew"
        - "Persian/Farsi"
        - "Spanish"
        - "Portuguese"
        - "Italian"
        - "Greek"
        - "Russian"

  # ---------------------------------------------------------------------------
  # ADVANCED MODULES - Complex annotation patterns
  # ---------------------------------------------------------------------------
  advanced:
    - path: "advanced/document_structure.yaml"
      description: "DOC hypernym for layout semantic regions (30+ document region types)"
    - path: "advanced/relationship_annotations.yaml"
      description: "11 relationship hypernyms (REL.CRE, REL.TMP, REL.SPA, REL.SOC, etc.)"
    - path: "advanced/coreference.yaml"
      description: "Coreference resolution, mention types, entity linking, cross-document"
    - path: "advanced/uncertainty.yaml"
      description: "Confidence scoring, epistemic/linguistic uncertainty, calibration"

  # ---------------------------------------------------------------------------
  # TEI P5 MODULES - Text Encoding Initiative element schemas
  # ---------------------------------------------------------------------------
  # Unlike the other categories, `tei` is a mapping with its own index file
  # and a nested `modules` list.
  tei:
    index: "advanced/tei/index.yaml"
    # NOTE(review): presumably the TEI P5 release these schemas track — confirm.
    version: "4.10.2"
    modules:
      - path: "advanced/tei/core.yaml"
        description: "TEI P5 Chapter 3 - Core Elements (date, time, measure, bibl)"
        tei_module: "core"
        line_count: 1575
        status: "complete"

      - path: "advanced/tei/namesdates.yaml"
        description: "TEI P5 Chapter 14 - Names, Dates, People, Places (58 elements)"
        tei_module: "namesdates"
        line_count: 1962
        status: "complete"

      - path: "advanced/tei/msdescription.yaml"
        description: "TEI P5 Chapter 11 - Manuscript Description (58 elements)"
        tei_module: "msdescription"
        line_count: 1923
        status: "complete"

      - path: "advanced/tei/linking.yaml"
        description: "TEI P5 Chapter 17 - Linking, Segmentation, Alignment (20 elements)"
        tei_module: "linking"
        line_count: 1393
        status: "complete"

# =============================================================================
# HYPERNYM SUMMARY
# =============================================================================

# Three-letter hypernym code -> "NAME - short gloss".
hypernym_codes:
  AGT: "AGENT - Actors with agency (humans, AI, animals, fictional)"
  GRP: "GROUP - Collectives of agents (organizations, movements, families)"
  TOP: "TOPONYM - Place names as nominal references"
  GEO: "GEOMETRY - Spatial coordinates and geometric primitives"
  TMP: "TEMPORAL - Time expressions (TimeML/TIMEX3)"
  APP: "APPELLATION - Names and titles as linguistic constructs"
  ROL: "ROLE - Social positions and occupations"
  WRK: "WORK - Intellectual works (FRBR model)"
  QTY: "QUANTITY - Numeric values and measurements"
  THG: "THING - Objects, concepts, events"

# =============================================================================
# DIGITAL HUMANITIES AUTHORITIES
# =============================================================================

# External standards referenced by the convention, grouped into primary,
# secondary and deprecated tiers. Each entry is keyed by a short identifier.
authorities:
  primary:
    TEI_P5:
      name: "Text Encoding Initiative P5 Guidelines"
      url: "https://tei-c.org/guidelines/p5/"
      usage: "Document structure, person/place/org names, temporal expressions"

    CIDOC_CRM:
      name: "CIDOC Conceptual Reference Model"
      version: "7.1.3"
      url: "https://cidoc-crm.org/"
      usage: "Cultural heritage entity modeling, events, temporal entities"

    TimeML:
      name: "TimeML / TIMEX3"
      url: "https://www.timeml.org/"
      usage: "Temporal expression annotation (DATE, TIME, DURATION, SET)"

    FRBR_LRM:
      name: "IFLA Library Reference Model (FRBR successor)"
      url: "https://www.ifla.org/publications/ifla-library-reference-model/"
      usage: "Work/Expression/Manifestation/Item for bibliographic entities"

    GeoSPARQL:
      name: "OGC GeoSPARQL Standard"
      url: "https://www.ogc.org/standards/geosparql"
      usage: "Spatial geometry representation in RDF"

    Pleiades:
      name: "Pleiades Gazetteer of Ancient Places"
      url: "https://pleiades.stoa.org/"
      usage: "Historical and ancient world toponyms"

  secondary:
    W3C_Org:
      name: "W3C Organization Ontology"
      url: "https://www.w3.org/TR/vocab-org/"
      usage: "Organizational structure, roles, memberships"

    RiC_O:
      name: "Records in Contexts Ontology"
      url: "https://www.ica.org/standards/RiC/ontology"
      usage: "Archival description and record relationships"

    PNV:
      name: "Person Name Vocabulary"
      url: "https://w3id.org/pnv"
      usage: "Structured person name components"

    PiCo:
      name: "Person in Context Ontology"
      url: "https://w3id.org/pico"
      usage: "Person observations in historical sources"

  deprecated:
    # Unlike the other tiers, this entry has no url/usage; it carries a status
    # and an explanatory note instead.
    NERD:
      name: "Named Entity Recognition and Disambiguation"
      status: "DEPRECATED for DH use"
      note: |
        NERD classes are retained ONLY for NLP pipeline interchange.
        Use DH authorities (TEI, CIDOC-CRM, TimeML, FRBR) for semantic precision.