# glam/data/entity_annotation/modules/advanced/coreference.yaml
# 2025-12-05 15:30:23 +01:00
#
# 854 lines
# 27 KiB
# YAML

# =============================================================================
# GLAM-NER Entity Annotation Convention
# Module: Coreference Resolution
# Path: modules/advanced/coreference.yaml
# Version: 1.7.0
# =============================================================================
#
# This module defines coreference annotation for linking multiple mentions
# that refer to the same real-world entity within and across documents.
#
# Key distinction from relationships:
# - COREFERENCE: Two mentions refer to the SAME entity (identity)
# - RELATIONSHIP: Two DIFFERENT entities have a semantic connection
#
# Example:
# - Coreference: "Rembrandt" and "the Dutch master" → same person
# - Relationship: "Rembrandt" created "The Night Watch" → two entities linked
#
# Standards alignment:
# - CoNLL-2012 coreference format
# - OntoNotes coreference guidelines
# - ACE (Automatic Content Extraction)
# - TAC-KBP entity linking
#
# =============================================================================
# Module descriptor: identity, lifecycle fields, declared dependencies and a
# prose summary of what the module covers.
# NOTE(review): nesting was reconstructed from a flattened copy of this file;
# confirm the indentation against the canonical source.
module:
  id: coreference
  name: Coreference Resolution
  version: "1.7.0"  # quoted so the version is read as a string
  status: stable
  category: advanced
  # Convention modules this module declares as dependencies.
  dependencies:
    - core/convention
    - core/namespaces
    - hypernyms/agt
    - hypernyms/grp
    - hypernyms/top
    - hypernyms/wrk
    - hypernyms/app
  description: |
    Coreference resolution identifies when multiple textual mentions refer to
    the same real-world entity. This is essential for:
    - Entity consolidation across document spans
    - Knowledge base population (linking mentions to KB entities)
    - Cross-document entity tracking
    - Pronoun resolution and anaphora handling
    This module provides:
    - Coreference chain data model
    - Mention types taxonomy
    - Entity linking to external knowledge bases
    - Cross-document coreference handling
    - Singleton and split-antecedent patterns
# =============================================================================
# NAMESPACES
# =============================================================================
# Prefix -> base IRI table for the vocabularies and knowledge bases that the
# rest of this module refers to by short name.
# NOTE(review): nesting was reconstructed from a flattened copy of this file;
# confirm the indentation against the canonical source.
namespaces:
  nif: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"
  itsrdf: "http://www.w3.org/2005/11/its/rdf#"
  oa: "http://www.w3.org/ns/oa#"
  prov: "http://www.w3.org/ns/prov#"
  skos: "http://www.w3.org/2004/02/skos/core#"
  owl: "http://www.w3.org/2002/07/owl#"
  dcterms: "http://purl.org/dc/terms/"
  schema: "http://schema.org/"
  # Entity linking targets
  wd: "http://www.wikidata.org/entity/"
  viaf: "http://viaf.org/viaf/"
  # The "isni" prefix is used by the organizations list under
  # entity_linking.target_knowledge_bases; declared here so that every
  # `namespace:` value in this module resolves against this table.
  isni: "https://isni.org/isni/"
  gnd: "https://d-nb.info/gnd/"
  loc: "http://id.loc.gov/authorities/"
  geonames: "http://sws.geonames.org/"
  pleiades: "https://pleiades.stoa.org/places/"
  tgn: "http://vocab.getty.edu/tgn/"
  aat: "http://vocab.getty.edu/aat/"
  ulan: "http://vocab.getty.edu/ulan/"
# =============================================================================
# COREFERENCE MODEL
# =============================================================================
# Conceptual data model: mentions are grouped into chains, and chains may
# carry links to external knowledge-base entities.
# NOTE(review): nesting was reconstructed from a flattened copy of this file;
# confirm the indentation against the canonical source.
coreference_model:
  description: |
    The coreference model groups mentions into chains, where each chain
    represents a single real-world entity. Chains may link to external
    knowledge base entities.
  core_concepts:
    # --- Mention: one textual span -----------------------------------------
    - concept: "Mention"
      description: |
        A textual span that refers to an entity. Mentions are the atomic
        units of coreference - they are linked into chains.
      # Each entry maps a property name to its one-line meaning.
      properties:
        - mention_id: "Unique identifier"
        - span_start: "Character offset start"
        - span_end: "Character offset end"
        - span_text: "Surface form text"
        - mention_type: "Type of mention (see taxonomy)"
        - entity_type: "GLAM-NER entity type code"
        - head_span: "Optional head word offsets"
        - document_id: "Source document identifier"
        - sentence_id: "Sentence identifier within document"
    # --- Coreference Chain: cluster of co-referring mentions ---------------
    - concept: "Coreference Chain"
      description: |
        A cluster of mentions that all refer to the same entity.
        Chains have a canonical/representative mention and zero or more
        linked external identifiers.
      properties:
        - chain_id: "Unique identifier for the chain"
        - mentions: "List of mention_ids in this chain"
        - representative_mention: "Canonical mention (usually first full NP)"
        - entity_type: "Resolved entity type for the chain"
        - external_links: "Links to knowledge bases (VIAF, Wikidata, etc.)"
        - confidence: "Confidence score for chain (0.0-1.0)"
    # --- Entity Link: chain -> external KB entity --------------------------
    - concept: "Entity Link"
      description: |
        A link from a coreference chain to an external knowledge base entity.
        Enables entity disambiguation and knowledge graph integration.
      properties:
        - link_id: "Unique identifier"
        - chain_id: "Reference to coreference chain"
        - target_uri: "External KB URI"
        - target_source: "KB identifier (VIAF, Wikidata, GND, etc.)"
        - link_type: "Type of link (exact, close, related)"
        - confidence: "Linking confidence (0.0-1.0)"
        - linking_method: "How link was established"
# =============================================================================
# MENTION TYPES
# =============================================================================
# Taxonomy of mention types. Each entry has a short code, a label, a
# description, example surface forms and a small set of properties.
# NOTE(review): nesting was reconstructed from a flattened copy of this file;
# confirm the indentation against the canonical source.
mention_types:
  description: |
    Taxonomy of mention types for coreference. Based on linguistic form
    and referential properties.
  types:
    - code: "NAM"
      label: "Named"
      description: "Proper noun mention (names)"
      examples:
        - "Rembrandt van Rijn"
        - "The Rijksmuseum"
        - "Amsterdam"
      properties:
        canonical: true
        anaphoric: false
    - code: "NOM"
      label: "Nominal"
      description: "Common noun phrase mention"
      examples:
        - "the painter"
        - "the museum"
        - "the Dutch master"
      properties:
        canonical: false
        anaphoric: true
        requires_antecedent: "Often, but not always"
    - code: "PRO"
      label: "Pronominal"
      description: "Pronoun mention"
      # One string per list item. The previous form put several quoted
      # strings separated by commas on one sequence entry, which is not
      # valid YAML. The grouping by pronoun class is carried by `subtypes`.
      examples:
        - "he"
        - "she"
        - "it"
        - "they"
        - "his"
        - "her"
        - "its"
        - "their"
        - "himself"
        - "herself"
      properties:
        canonical: false
        anaphoric: true
        requires_antecedent: true
      subtypes:
        - code: "PRO.PER"
          label: "Personal Pronoun"
          examples: ["he", "she", "it", "they", "him", "her", "them"]
        - code: "PRO.POS"
          label: "Possessive Pronoun"
          examples: ["his", "her", "its", "their", "hers", "theirs"]
        - code: "PRO.REF"
          label: "Reflexive Pronoun"
          examples: ["himself", "herself", "itself", "themselves"]
        - code: "PRO.DEM"
          label: "Demonstrative Pronoun"
          examples: ["this", "that", "these", "those"]
        - code: "PRO.REL"
          label: "Relative Pronoun"
          examples: ["who", "which", "that", "whom", "whose"]
    - code: "DEF"
      label: "Definite Description"
      description: "Definite NP that uniquely identifies"
      examples:
        - "the author of Don Quixote"
        - "the current Dutch monarch"
        - "the capital of the Netherlands"
      properties:
        canonical: false
        anaphoric: "May be discourse-new"
    - code: "DEM"
      label: "Demonstrative NP"
      description: "NP with demonstrative determiner"
      examples:
        - "this painting"
        - "that museum"
        - "these documents"
      properties:
        canonical: false
        anaphoric: true
        deictic: true
    - code: "APP"
      label: "Appositive"
      description: "Appositive phrase providing additional info"
      examples:
        - "Rembrandt, the famous painter,"
        - "Amsterdam, capital of the Netherlands,"
      properties:
        canonical: false
        non_restrictive: true
        coreferent_with_head: true
    - code: "TTL"
      label: "Title/Role"
      description: "Title or role used to refer to person"
      examples:
        - "the King"
        - "the Director"
        - "Her Majesty"
      properties:
        canonical: false
        context_dependent: true
# =============================================================================
# COREFERENCE CHAIN SCHEMA
# =============================================================================
# Field-level schema for one coreference chain record, split into required
# and optional fields.
# NOTE(review): nesting was reconstructed from a flattened copy of this file;
# confirm the indentation against the canonical source.
chain_schema:
  description: "JSON/YAML schema for coreference chain annotations"

  required_fields:
    - field: "chain_id"
      type: "string"
      format: "UUID or URI"
      description: "Unique identifier for this coreference chain"
    - field: "entity_type"
      type: "string"
      format: "GLAM-NER type code"
      description: "Resolved entity type for all mentions in chain"
    - field: "mentions"
      type: "array"
      item_type: "Mention"
      min_items: 1  # a chain may consist of a single mention
      description: "All mentions belonging to this chain"

  optional_fields:
    - field: "representative_mention"
      type: "string"
      format: "mention_id reference"
      description: "The canonical/representative mention"
      selection_criteria: |
        1. Prefer NAM (named) mentions over NOM/PRO
        2. Prefer first occurrence if multiple NAM
        3. Prefer longest/most complete form
        4. Prefer mentions with external links
    - field: "external_links"
      type: "array"
      item_type: "EntityLink"
      description: "Links to external knowledge bases"
    - field: "attributes"
      type: "object"
      description: "Aggregated attributes from all mentions"
      schema:
        gender: "enum (MALE, FEMALE, NEUTRAL, UNKNOWN)"
        number: "enum (SINGULAR, PLURAL, UNKNOWN)"
        animacy: "enum (ANIMATE, INANIMATE, UNKNOWN)"
    - field: "confidence"
      type: "float"
      range: [0.0, 1.0]
      description: "Confidence in chain correctness"
    - field: "provenance"
      type: "object"
      description: "How chain was created"
      schema:
        method: "enum (MANUAL, RULE_BASED, ML, HYBRID)"
        annotator_id: "string"
        annotation_date: "ISO 8601"
        guidelines_version: "string"
# Field-level schema for one mention record, split into required and
# optional fields.
# NOTE(review): nesting was reconstructed from a flattened copy of this file.
# This key is assumed to sit at the same level as `chain_schema`; confirm
# against the canonical source.
mention_schema:
  required_fields:
    - field: "mention_id"
      type: "string"
    - field: "span_start"
      type: "integer"
      description: "Character offset start (0-indexed)"
    - field: "span_end"
      type: "integer"
      description: "Character offset end (exclusive)"
    - field: "span_text"
      type: "string"
      description: "Surface form text"
    - field: "mention_type"
      type: "string"
      format: "Mention type code"
  optional_fields:
    - field: "head_start"
      type: "integer"
      description: "Head word start offset"
    - field: "head_end"
      type: "integer"
      description: "Head word end offset"
    - field: "entity_type"
      type: "string"
      description: "Mention-level entity type (may differ from chain)"
    # Added for consistency with the "Mention" concept in coreference_model,
    # which lists document_id alongside sentence_id. Optional, so existing
    # records remain valid.
    - field: "document_id"
      type: "string"
      description: "Source document identifier"
    - field: "sentence_id"
      type: "string"
      description: "Sentence containing this mention"
# =============================================================================
# ENTITY LINKING
# =============================================================================
# Entity linking configuration: candidate knowledge bases per entity class,
# the linking methods with their confidence ranges, and the link types with
# the RDF predicate each one maps to.
# NOTE(review): nesting was reconstructed from a flattened copy of this file;
# confirm the indentation against the canonical source.
entity_linking:
  description: |
    Entity linking connects coreference chains to external knowledge bases,
    enabling disambiguation and knowledge graph integration.

  # Each KB entry names a namespace prefix and a URI template with an {id}
  # placeholder.
  # NOTE(review): presumably a lower `priority` number means "preferred";
  # several entries share a number - confirm how ties are resolved.
  target_knowledge_bases:
    persons:
      - name: "VIAF"
        namespace: "viaf"
        uri_pattern: "http://viaf.org/viaf/{id}"
        description: "Virtual International Authority File"
        priority: 1
      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        description: "Wikidata knowledge base"
        priority: 2
      - name: "GND"
        namespace: "gnd"
        uri_pattern: "https://d-nb.info/gnd/{id}"
        description: "German National Library authority"
        priority: 3
      - name: "ULAN"
        namespace: "ulan"
        uri_pattern: "http://vocab.getty.edu/ulan/{id}"
        description: "Getty Union List of Artist Names"
        priority: 2
        domain: "artists"
      - name: "LC Names"
        namespace: "loc"
        uri_pattern: "http://id.loc.gov/authorities/names/{id}"
        description: "Library of Congress Name Authority"
        priority: 3
    places:
      - name: "GeoNames"
        namespace: "geonames"
        uri_pattern: "http://sws.geonames.org/{id}/"
        description: "GeoNames geographical database"
        priority: 1
      - name: "Pleiades"
        namespace: "pleiades"
        uri_pattern: "https://pleiades.stoa.org/places/{id}"
        description: "Pleiades gazetteer of ancient places"
        priority: 1
        domain: "ancient/historical"
      - name: "TGN"
        namespace: "tgn"
        uri_pattern: "http://vocab.getty.edu/tgn/{id}"
        description: "Getty Thesaurus of Geographic Names"
        priority: 2
      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        priority: 3
    organizations:
      - name: "VIAF"
        namespace: "viaf"
        uri_pattern: "http://viaf.org/viaf/{id}"
        priority: 1
      - name: "ISNI"
        namespace: "isni"
        uri_pattern: "https://isni.org/isni/{id}"
        description: "International Standard Name Identifier"
        priority: 2
      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        priority: 3
    works:
      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        priority: 1
      - name: "VIAF"
        namespace: "viaf"
        uri_pattern: "http://viaf.org/viaf/{id}"
        priority: 2
      - name: "AAT"
        namespace: "aat"
        uri_pattern: "http://vocab.getty.edu/aat/{id}"
        description: "Getty Art & Architecture Thesaurus"
        priority: 2
        domain: "art objects"

  # confidence_range is a [low, high] pair for links produced by the method.
  linking_methods:
    - method: "EXACT_STRING_MATCH"
      description: "Exact match on entity label"
      confidence_range: [0.9, 1.0]
    - method: "FUZZY_STRING_MATCH"
      description: "Approximate string matching (Levenshtein, Jaro-Winkler)"
      confidence_range: [0.7, 0.9]
    - method: "CONTEXT_SIMILARITY"
      description: "Match based on surrounding context"
      confidence_range: [0.6, 0.85]
    - method: "EMBEDDING_SIMILARITY"
      description: "Neural embedding-based matching"
      confidence_range: [0.7, 0.95]
    - method: "STRUCTURED_ATTRIBUTES"
      description: "Match on dates, locations, relationships"
      confidence_range: [0.8, 1.0]
    - method: "MANUAL_CURATION"
      description: "Human-verified linking"
      confidence_range: [0.95, 1.0]
    - method: "CROSS_REFERENCE"
      description: "Match via owl:sameAs links in KB"
      confidence_range: [0.9, 1.0]

  # Link type -> RDF predicate used to express it. NIL has no predicate.
  link_types:
    - type: "EXACT"
      predicate: "owl:sameAs"
      description: "Chain refers to exactly this KB entity"
    - type: "CLOSE"
      predicate: "skos:closeMatch"
      description: "Chain is very similar to this KB entity"
    - type: "RELATED"
      predicate: "skos:relatedMatch"
      description: "Chain is related to this KB entity"
    - type: "NIL"
      predicate: null
      description: "No matching KB entity exists (NIL clustering)"
# =============================================================================
# CROSS-DOCUMENT COREFERENCE
# =============================================================================
# Cross-document coreference: three alternative approaches (each with a
# numbered workflow, advantages and challenges) and the schema of the
# resulting global entity record.
# NOTE(review): nesting was reconstructed from a flattened copy of this file.
# `global_entity_schema` is assumed to belong under this key; confirm against
# the canonical source.
cross_document:
  description: |
    Handling coreference across multiple documents. Essential for
    corpus-level entity consolidation and knowledge base construction.

  approaches:
    - approach: "Global Entity Clustering"
      description: |
        All mentions across all documents are clustered into global entities.
        Each global entity corresponds to one real-world referent.
      # Workflow steps are keyed by their step number.
      workflow:
        1: "Extract mentions from all documents"
        2: "Perform within-document coreference"
        3: "Extract features from document-level chains"
        4: "Cluster chains across documents by similarity"
        5: "Link clusters to KB entities"
      advantages:
        - "Unified entity inventory"
        - "Enables cross-document analysis"
      challenges:
        - "Scalability with large corpora"
        - "Error propagation from document-level"

    - approach: "KB-Anchored Linking"
      description: |
        Document-level chains are independently linked to KB entities.
        Cross-document coreference is implicit via shared KB links.
      workflow:
        1: "Perform within-document coreference"
        2: "Link each chain to KB independently"
        3: "Chains with same KB link are cross-doc coreferent"
      advantages:
        - "Scalable (document-independent)"
        - "Leverages KB structure"
      challenges:
        - "Depends on KB coverage"
        - "NIL entities not handled"

    - approach: "Incremental Clustering"
      description: |
        Documents processed incrementally, updating global entity inventory.
      workflow:
        1: "Process documents in order"
        2: "For each document chain, find matching global entity"
        3: "If match found, merge; else create new global entity"
        4: "Update global entity features"
      advantages:
        - "Streaming-compatible"
        - "Memory-efficient"
      challenges:
        - "Order-dependent results"
        - "May require periodic re-clustering"

  # Record describing one corpus-level entity.
  global_entity_schema:
    required_fields:
      - field: "global_entity_id"
        type: "string"
        format: "UUID or URI"
      - field: "entity_type"
        type: "string"
        format: "GLAM-NER type code"
      - field: "canonical_name"
        type: "string"
        description: "Best/canonical name for entity"
      - field: "document_chains"
        type: "array"
        description: "List of (document_id, chain_id) pairs"
    optional_fields:
      - field: "alternative_names"
        type: "array"
        item_type: "string"
        description: "All name variants observed"
      - field: "external_links"
        type: "array"
        item_type: "EntityLink"
      - field: "mention_count"
        type: "integer"
        description: "Total mentions across all documents"
      - field: "document_count"
        type: "integer"
        description: "Number of documents mentioning entity"
# =============================================================================
# SPECIAL CASES
# =============================================================================
# Annotation policy for phenomena that need explicit rules: singletons,
# split antecedents, generic mentions, cataphora and metonymy. Each case has
# a description, a handling policy and one or more examples.
# NOTE(review): nesting was reconstructed from a flattened copy of this file.
# In particular, which keys sit inside `handling` versus beside it is an
# assumption; confirm against the canonical source.
special_cases:
  singletons:
    description: |
      Singleton mentions are entity mentions that don't corefer with any
      other mention in the document. They form single-member chains.
    handling:
      policy: "INCLUDE"
      rationale: |
        Include singletons for completeness and entity linking.
        Many entity mentions in documents are singletons.
      annotation: |
        Create a chain with exactly one mention.
        Still attempt entity linking for singletons.
    example:
      text: "Van Gogh visited Arles in 1888."
      chains:
        - chain_id: "chain-001"
          mentions:
            - mention_id: "m1"
              span_text: "Van Gogh"
              mention_type: "NAM"
          # Singleton - no other mentions in document

  split_antecedents:
    description: |
      Split antecedents occur when a plural pronoun refers to multiple
      previously mentioned entities.
    handling:
      annotation_strategy: "MULTI_CHAIN_LINK"
      description: |
        Create a special "group mention" that links to multiple chains.
        The plural pronoun forms its own chain that references the component chains.
    example:
      text: "Rembrandt met Vermeer. They discussed technique."
      chains:
        - chain_id: "chain-rembrandt"
          mentions:
            - span_text: "Rembrandt"
        - chain_id: "chain-vermeer"
          mentions:
            - span_text: "Vermeer"
        # The pronoun's own chain points back at both antecedent chains.
        - chain_id: "chain-they"
          mentions:
            - span_text: "They"
              mention_type: "PRO"
          split_antecedent_refs:
            - "chain-rembrandt"
            - "chain-vermeer"

  generic_mentions:
    description: |
      Generic mentions refer to a class or type rather than a specific entity.
    handling:
      policy: "EXCLUDE_FROM_CHAINS"
      rationale: |
        Generic mentions don't refer to specific entities and should not
        be in coreference chains. They may still be annotated as entities.
    examples:
      - "A painter must understand light."  # Generic 'painter'
      - "Museums should preserve culture."  # Generic 'museums'
    annotation:
      entity_annotation: true
      coreference: false
      marker: "GENERIC"

  cataphora:
    description: |
      Cataphora is forward-pointing reference, where a pronoun precedes
      its referent.
    handling:
      same_as_anaphora: true
      note: "Chain ordering doesn't imply antecedent relationship"
    example:
      text: "When he arrived, Rembrandt found the studio empty."
      chain:
        mentions:
          - span_text: "he"  # Cataphoric
            mention_type: "PRO"
          - span_text: "Rembrandt"  # Referent
            mention_type: "NAM"

  metonymy:
    description: |
      Metonymic mentions use one entity to refer to a related entity.
    handling:
      policy: "ANNOTATE_BOTH"
      description: |
        Annotate the literal entity and create a separate chain for the
        metonymic referent with a metonymy marker.
    examples:
      - text: "The White House announced..."
        literal: "TOP.BLT (building)"
        metonymic: "GRP.GOV (US executive branch)"
      - text: "Amsterdam rejected the proposal."
        literal: "TOP.CTY (city)"
        metonymic: "GRP.GOV (city government)"
# =============================================================================
# OUTPUT FORMATS
# =============================================================================
# Serialization formats for coreference annotations: CoNLL-2012 columns,
# a JSON chain document, and NIF-based RDF (Turtle).
# NOTE(review): nesting was reconstructed from a flattened copy of this file,
# and the line-internal alignment of the embedded samples was lost with it;
# sample lines are reproduced as found. Confirm against the canonical source.
output_formats:
  conll_2012:
    description: "CoNLL-2012 shared task format (column-based)"
    specification: "https://conll.cemantix.org/2012/data.html"
    # NOTE(review): the official format description also has a variable
    # number of predicate-argument columns between "Named entities" and the
    # final "Coreference" column - confirm whether omitting them here is
    # intentional.
    columns:
      - "Document ID"
      - "Part number"
      - "Word number"
      - "Word"
      - "POS tag"
      - "Parse bit"
      - "Predicate lemma"
      - "Predicate frameset"
      - "Word sense"
      - "Speaker"
      - "Named entities"
      - "Coreference"
    coreference_notation: |
      Parentheses notation:
      - (X = start of chain X
      - X) = end of chain X
      - (X) = single-word chain X
    example: |
      doc1 0 0 Rembrandt NNP ... (1)
      doc1 0 1 painted VBD ... -
      doc1 0 2 the DT ... (2
      doc1 0 3 Night NNP ... -
      doc1 0 4 Watch NNP ... 2)
      doc1 0 5 . . ... -
      doc1 0 6 He PRP ... (1)
      doc1 0 7 completed VBD ... -
      doc1 0 8 it PRP ... (2)

  json_chains:
    description: "JSON format with explicit chain objects"
    schema: |
      {
      "document_id": "string",
      "chains": [
      {
      "chain_id": "string",
      "entity_type": "string",
      "mentions": [
      {
      "mention_id": "string",
      "span_start": int,
      "span_end": int,
      "span_text": "string",
      "mention_type": "string"
      }
      ],
      "external_links": [
      {
      "target_uri": "string",
      "target_source": "string",
      "confidence": float
      }
      ]
      }
      ]
      }

  nif_rdf:
    description: "NIF-based RDF format"
    # The sample uses owl:sameAs, so the owl prefix must be declared in the
    # Turtle itself; without it the sample does not parse.
    example: |
      @prefix nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#> .
      @prefix itsrdf: <http://www.w3.org/2005/11/its/rdf#> .
      @prefix owl: <http://www.w3.org/2002/07/owl#> .
      <http://example.org/doc1#char=0,9>
      a nif:Phrase ;
      nif:anchorOf "Rembrandt" ;
      nif:beginIndex 0 ;
      nif:endIndex 9 ;
      itsrdf:taIdentRef <http://viaf.org/viaf/64013650> ;
      nif:referenceContext <http://example.org/doc1#char=0,100> .
      <http://example.org/doc1#char=40,42>
      a nif:Phrase ;
      nif:anchorOf "He" ;
      nif:beginIndex 40 ;
      nif:endIndex 42 ;
      owl:sameAs <http://example.org/doc1#char=0,9> .
# =============================================================================
# EXAMPLES
# =============================================================================
# Worked annotation examples. Offsets are character positions into `text`,
# 0-indexed with an exclusive end, so span_end - span_start equals the
# length of span_text.
# NOTE(review): nesting was reconstructed from a flattened copy of this file;
# confirm the indentation against the canonical source.
examples:
  - name: "Simple pronominal coreference"
    text: "Rembrandt van Rijn was born in Leiden. He became Amsterdam's most famous painter."
    annotation:
      chains:
        - chain_id: "chain-rembrandt"
          entity_type: "AGT.PER"
          representative_mention: "m1"
          mentions:
            - mention_id: "m1"
              span_start: 0
              span_end: 18
              span_text: "Rembrandt van Rijn"
              mention_type: "NAM"
            - mention_id: "m2"
              span_start: 39
              span_end: 41
              span_text: "He"
              mention_type: "PRO.PER"
          external_links:
            - target_uri: "http://viaf.org/viaf/64013650"
              target_source: "VIAF"
              link_type: "EXACT"
              confidence: 0.98

  - name: "Definite description coreference"
    text: "The Rijksmuseum opened in 1885. The museum houses The Night Watch."
    annotation:
      chains:
        - chain_id: "chain-rijksmuseum"
          entity_type: "GRP.ORG"
          representative_mention: "m1"
          mentions:
            - mention_id: "m1"
              span_start: 0
              # "The Rijksmuseum" is 15 characters, so the exclusive end
              # offset is 15 (it was previously 14, cutting off the final "m").
              span_end: 15
              span_text: "The Rijksmuseum"
              mention_type: "NAM"
            - mention_id: "m2"
              span_start: 32
              span_end: 42
              span_text: "The museum"
              mention_type: "NOM"
          external_links:
            - target_uri: "http://www.wikidata.org/entity/Q190804"
              target_source: "Wikidata"
              link_type: "EXACT"
              confidence: 0.99

  - name: "Appositive coreference"
    text: "Vermeer, the Delft master, painted Girl with a Pearl Earring."
    annotation:
      chains:
        - chain_id: "chain-vermeer"
          entity_type: "AGT.PER"
          mentions:
            - mention_id: "m1"
              span_start: 0
              span_end: 7
              span_text: "Vermeer"
              mention_type: "NAM"
            - mention_id: "m2"
              span_start: 9
              span_end: 25
              span_text: "the Delft master"
              mention_type: "APP"

  - name: "NIL entity (no KB match)"
    text: "Local artist Jan de Vries exhibited at the town hall."
    annotation:
      chains:
        - chain_id: "chain-jdv"
          entity_type: "AGT.PER"
          mentions:
            - mention_id: "m1"
              span_start: 13
              span_end: 25
              span_text: "Jan de Vries"
              mention_type: "NAM"
          # A NIL link has no target; the cluster id groups unlinked chains.
          external_links:
            - target_uri: null
              target_source: null
              link_type: "NIL"
              confidence: 0.85
              nil_cluster_id: "nil-cluster-001"