# Repository-viewer metadata captured with this file (not part of the schema):
# glam/data/entity_annotation/modules/index.yaml
# kempersc 505c12601a Add test script for PiCo extraction from Arabic waqf documents
# - Implemented a new script `test_pico_arabic_waqf.py` to test the GLM annotator's ability to extract person observations from Arabic historical documents.
# - The script includes environment variable handling for API token, structured prompts for the GLM API, and validation of extraction results.
# - Added comprehensive logging for API responses, extraction results, and validation errors.
# - Included a sample Arabic waqf text for testing purposes, following the PiCo ontology pattern.
# 2025-12-12 17:50:17 +01:00
#
# 301 lines
# 11 KiB
# YAML

# =============================================================================
# CH-Annotator Entity Annotation Convention - Modular Schema Index
# =============================================================================
# Convention ID: ch_annotator-v1_7_0
# Full Name: CH-Annotator (Cultural Heritage Annotator)
# Version: 1.7.0
# Date: 2025-12-02
# Renamed: 2025-12-06 (formerly GLAM-NER)
#
# This is the main entry point for the modular entity annotation convention.
# All modules are organized by category and can be imported individually
# or as a complete set.
#
# BREAKING CHANGES in v1.7.0:
# - BEING → AGENT (AGT)
# - PLACE → TOPONYM (TOP) + GEOMETRY (GEO)
# - ORGANISATION → GROUP (GRP)
# - TEMPORAL restructured with TimeML/TIMEX3
# - TEXTUAL_REFERENCE → WORK (WRK) with FRBR model
# - Added ROLE (ROL) hypernym
# =============================================================================
# Identity and release metadata for the convention described by this index.
schema:
  id: ch_annotator
  name: "CH-Annotator Entity Annotation Convention"
  version: "1.7.0"
  # Quoted so the date stays a string instead of being resolved as a timestamp.
  version_date: "2025-12-02"
  status: "stable"
  # Previous name of the convention, kept for traceability after the rename.
  formerly_known_as: "GLAM-NER"
  # Literal block scalar: line breaks and the bullet list are kept verbatim.
  description: |
    A comprehensive convention for annotating named entities in heritage,
    archival, library, and museum (GLAM) contexts. This convention prioritizes
    Digital Humanities standards (TEI, CIDOC-CRM, TimeML, FRBR, GeoSPARQL)
    over web-centric NER systems.
    The convention defines 10 hypernym categories with domain-agnostic
    subcategories suitable for:
    - Heritage institutions and collections
    - Web content and digital platforms
    - Publishing and scholarly communication
    - Archives and records management
    - Legal and governmental documents
    - Academic and research contexts
# =============================================================================
# MODULE IMPORTS
# =============================================================================
# Registry of the module files that make up the convention, grouped by
# category. Each entry names a module file and gives a short description.
# NOTE(review): `path` values look relative to this index file's directory —
# confirm against whatever loader consumes this index.
modules:
  # ---------------------------------------------------------------------------
  # CORE MODULES - Convention metadata and namespaces
  # ---------------------------------------------------------------------------
  core:
    - path: "core/convention.yaml"
      description: "Convention metadata, version, scope, DH authorities"
    - path: "core/namespaces.yaml"
      description: "All ontology namespace prefixes with categories"

  # ---------------------------------------------------------------------------
  # HYPERNYM MODULES - Entity type definitions
  # ---------------------------------------------------------------------------
  # Each entry carries the three-letter code (`id`), the hypernym `name`, and
  # the ontology class it is primarily mapped to (`primary_class`).
  hypernyms:
    # Agents and Persons
    - path: "hypernyms/agt.yaml"
      id: "AGT"
      name: "AGENT"
      description: "Humans, AI agents, animals, fictional beings"
      primary_class: "crm:E39_Actor"
    # Collectives and Organizations
    - path: "hypernyms/grp.yaml"
      id: "GRP"
      name: "GROUP"
      description: "Formal and informal collectives of agents"
      primary_class: "crm:E74_Group"
    # Place Names
    - path: "hypernyms/top.yaml"
      id: "TOP"
      name: "TOPONYM"
      description: "Place names as nominal references"
      primary_class: "crm:E53_Place"
    # Spatial Geometry
    - path: "hypernyms/geo.yaml"
      id: "GEO"
      name: "GEOMETRY"
      description: "Coordinates, polygons, spatial primitives"
      primary_class: "geo:Geometry"
    # Temporal Expressions
    - path: "hypernyms/tmp.yaml"
      id: "TMP"
      name: "TEMPORAL"
      description: "TimeML/TIMEX3 temporal expressions"
      primary_class: "crm:E52_Time-Span"
    # Names and Titles
    - path: "hypernyms/app.yaml"
      id: "APP"
      name: "APPELLATION"
      description: "Titles, collection names, awards, structured names"
      primary_class: "crm:E41_Appellation"
    # Social Positions
    - path: "hypernyms/rol.yaml"
      id: "ROL"
      name: "ROLE"
      description: "Occupations, honorifics, positions"
      primary_class: "org:Role"
    # Intellectual Works
    - path: "hypernyms/wrk.yaml"
      id: "WRK"
      name: "WORK"
      description: "FRBR Work/Expression/Manifestation/Item"
      primary_class: "frbroo:F1_Work"
    # Quantities
    - path: "hypernyms/qty.yaml"
      id: "QTY"
      name: "QUANTITY"
      description: "Counts, measurements, currency, ranges"
      primary_class: "crm:E54_Dimension"
    # Objects and Concepts
    - path: "hypernyms/thg.yaml"
      id: "THG"
      name: "THING"
      description: "Artworks, artifacts, events, concepts"
      primary_class: "crm:E70_Thing"

  # ---------------------------------------------------------------------------
  # PROCESSING MODULES - Extraction and annotation rules
  # ---------------------------------------------------------------------------
  processing:
    - path: "processing/exclusions.yaml"
      description: "Universal exclusion rules for all entity types"
    - path: "processing/double_tagging.yaml"
      description: "Permitted and prohibited double-tagging patterns"
    - path: "processing/relationships.yaml"
      description: "Ontology relationship patterns between entities"

  # ---------------------------------------------------------------------------
  # INTEGRATION MODULES - External system mappings
  # ---------------------------------------------------------------------------
  integrations:
    - path: "integrations/pico.yaml"
      description: "PiCo ontology integration for person observations"
    - path: "integrations/nif_nerd.yaml"
      description: "NIF/NERD/Open Annotation compatibility layer with GLAM-NER mappings"

  # ---------------------------------------------------------------------------
  # RELATIONSHIP MODULES - Family and social relationship patterns
  # ---------------------------------------------------------------------------
  relationships:
    - path: "relationships/family.yaml"
      description: "Family relationship properties and historical source patterns (34 relationship types, 13 languages)"
      # NOTE(review): line_count is static metadata about the referenced file
      # and can drift when that file changes — verify before relying on it.
      line_count: 1503
      # The 13 languages referred to in the description above.
      languages:
        - "Dutch"
        - "Latin"
        - "German"
        - "Arabic"
        - "French"
        - "Ottoman Turkish"
        - "Hebrew"
        - "Persian/Farsi"
        - "Spanish"
        - "Portuguese"
        - "Italian"
        - "Greek"
        - "Russian"

  # ---------------------------------------------------------------------------
  # ADVANCED MODULES - Complex annotation patterns
  # ---------------------------------------------------------------------------
  advanced:
    - path: "advanced/document_structure.yaml"
      description: "DOC hypernym for layout semantic regions (30+ document region types)"
    - path: "advanced/relationship_annotations.yaml"
      description: "11 relationship hypernyms (REL.CRE, REL.TMP, REL.SPA, REL.SOC, etc.)"
    - path: "advanced/coreference.yaml"
      description: "Coreference resolution, mention types, entity linking, cross-document"
    - path: "advanced/uncertainty.yaml"
      description: "Confidence scoring, epistemic/linguistic uncertainty, calibration"

  # ---------------------------------------------------------------------------
  # TEI P5 MODULES - Text Encoding Initiative element schemas
  # ---------------------------------------------------------------------------
  # Unlike the categories above, this one is a mapping: it points at its own
  # sub-index and lists the TEI module files beneath it.
  tei:
    index: "advanced/tei/index.yaml"
    # NOTE(review): presumably the TEI P5 Guidelines release the chapter
    # numbers below refer to — confirm.
    version: "4.10.2"
    modules:
      - path: "advanced/tei/core.yaml"
        description: "TEI P5 Chapter 3 - Core Elements (date, time, measure, bibl)"
        tei_module: "core"
        line_count: 1575
        status: "complete"
      - path: "advanced/tei/namesdates.yaml"
        description: "TEI P5 Chapter 14 - Names, Dates, People, Places (58 elements)"
        tei_module: "namesdates"
        line_count: 1962
        status: "complete"
      - path: "advanced/tei/msdescription.yaml"
        description: "TEI P5 Chapter 11 - Manuscript Description (58 elements)"
        tei_module: "msdescription"
        line_count: 1923
        status: "complete"
      - path: "advanced/tei/linking.yaml"
        description: "TEI P5 Chapter 17 - Linking, Segmentation, Alignment (20 elements)"
        tei_module: "linking"
        line_count: 1393
        status: "complete"
# =============================================================================
# HYPERNYM SUMMARY
# =============================================================================
# Quick-reference map from each three-letter hypernym code to its name and a
# one-line gloss. The codes are the same ten `id` values used by the hypernym
# module entries in this file.
hypernym_codes:
  AGT: "AGENT - Actors with agency (humans, AI, animals, fictional)"
  GRP: "GROUP - Collectives of agents (organizations, movements, families)"
  TOP: "TOPONYM - Place names as nominal references"
  GEO: "GEOMETRY - Spatial coordinates and geometric primitives"
  TMP: "TEMPORAL - Time expressions (TimeML/TIMEX3)"
  APP: "APPELLATION - Names and titles as linguistic constructs"
  ROL: "ROLE - Social positions and occupations"
  WRK: "WORK - Intellectual works (FRBR model)"
  QTY: "QUANTITY - Numeric values and measurements"
  THG: "THING - Objects, concepts, events"
# =============================================================================
# DIGITAL HUMANITIES AUTHORITIES
# =============================================================================
# External standards and vocabularies the convention refers to, grouped into
# three tiers: primary, secondary and deprecated. Every entry has a `name`;
# active entries also give a `url` and a `usage` summary.
authorities:
  primary:
    TEI_P5:
      name: "Text Encoding Initiative P5 Guidelines"
      url: "https://tei-c.org/guidelines/p5/"
      usage: "Document structure, person/place/org names, temporal expressions"
    CIDOC_CRM:
      name: "CIDOC Conceptual Reference Model"
      # Quoted so the three-part version is kept as a string.
      version: "7.1.3"
      url: "https://cidoc-crm.org/"
      usage: "Cultural heritage entity modeling, events, temporal entities"
    TimeML:
      name: "TimeML / TIMEX3"
      url: "https://www.timeml.org/"
      usage: "Temporal expression annotation (DATE, TIME, DURATION, SET)"
    FRBR_LRM:
      name: "IFLA Library Reference Model (FRBR successor)"
      url: "https://www.ifla.org/publications/ifla-library-reference-model/"
      usage: "Work/Expression/Manifestation/Item for bibliographic entities"
    GeoSPARQL:
      name: "OGC GeoSPARQL Standard"
      url: "https://www.ogc.org/standards/geosparql"
      usage: "Spatial geometry representation in RDF"
    Pleiades:
      name: "Pleiades Gazetteer of Ancient Places"
      url: "https://pleiades.stoa.org/"
      usage: "Historical and ancient world toponyms"

  secondary:
    W3C_Org:
      name: "W3C Organization Ontology"
      url: "https://www.w3.org/TR/vocab-org/"
      usage: "Organizational structure, roles, memberships"
    RiC_O:
      name: "Records in Contexts Ontology"
      url: "https://www.ica.org/standards/RiC/ontology"
      usage: "Archival description and record relationships"
    PNV:
      name: "Person Name Vocabulary"
      url: "https://w3id.org/pnv"
      usage: "Structured person name components"
    PiCo:
      name: "Person in Context Ontology"
      url: "https://w3id.org/pico"
      usage: "Person observations in historical sources"

  # Deprecated entries carry a `status` and an explanatory `note` instead of
  # the `url` / `usage` pair used by the active tiers.
  deprecated:
    NERD:
      name: "Named Entity Recognition and Disambiguation"
      status: "DEPRECATED for DH use"
      # Literal block scalar: both sentences keep their line break.
      note: |
        NERD classes are retained ONLY for NLP pipeline interchange.
        Use DH authorities (TEI, CIDOC-CRM, TimeML, FRBR) for semantic precision.