# Listing metadata (extraction residue): 854 lines, 27 KiB, YAML
# =============================================================================
|
|
# GLAM-NER Entity Annotation Convention
|
|
# Module: Coreference Resolution
|
|
# Path: modules/advanced/coreference.yaml
|
|
# Version: 1.7.0
|
|
# =============================================================================
|
|
#
|
|
# This module defines coreference annotation for linking multiple mentions
|
|
# that refer to the same real-world entity within and across documents.
|
|
#
|
|
# Key distinction from relationships:
|
|
# - COREFERENCE: Two mentions refer to the SAME entity (identity)
|
|
# - RELATIONSHIP: Two DIFFERENT entities have a semantic connection
|
|
#
|
|
# Example:
|
|
# - Coreference: "Rembrandt" and "the Dutch master" → same person
|
|
# - Relationship: "Rembrandt" created "The Night Watch" → two entities linked
|
|
#
|
|
# Standards alignment:
|
|
# - CoNLL-2012 coreference format
|
|
# - OntoNotes coreference guidelines
|
|
# - ACE (Automatic Content Extraction)
|
|
# - TAC-KBP entity linking
|
|
#
|
|
# =============================================================================
|
|
|
|
# Module manifest: identity, release status, the other convention modules this
# module lists as dependencies, and a prose summary of what it provides.
module:
  id: coreference
  name: Coreference Resolution
  version: "1.7.0"
  status: stable
  category: advanced

  # Other convention modules this module depends on.
  dependencies:
    - core/convention
    - core/namespaces
    - hypernyms/agt
    - hypernyms/grp
    - hypernyms/top
    - hypernyms/wrk
    - hypernyms/app

  # Literal block scalar: the line breaks and blank lines are part of the value.
  description: |
    Coreference resolution identifies when multiple textual mentions refer to
    the same real-world entity. This is essential for:

    - Entity consolidation across document spans
    - Knowledge base population (linking mentions to KB entities)
    - Cross-document entity tracking
    - Pronoun resolution and anaphora handling

    This module provides:
    - Coreference chain data model
    - Mention types taxonomy
    - Entity linking to external knowledge bases
    - Cross-document coreference handling
    - Singleton and split-antecedent patterns
|
|
|
|
# =============================================================================
|
|
# NAMESPACES
|
|
# =============================================================================
|
|
|
|
# Prefix -> base IRI map. Every `namespace:` value used under
# entity_linking.target_knowledge_bases should resolve to a key declared here.
namespaces:
  # Annotation / provenance / modelling vocabularies
  nif: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"
  itsrdf: "http://www.w3.org/2005/11/its/rdf#"
  oa: "http://www.w3.org/ns/oa#"
  prov: "http://www.w3.org/ns/prov#"
  skos: "http://www.w3.org/2004/02/skos/core#"
  owl: "http://www.w3.org/2002/07/owl#"
  dcterms: "http://purl.org/dc/terms/"
  schema: "http://schema.org/"

  # Entity linking targets
  wd: "http://www.wikidata.org/entity/"
  viaf: "http://viaf.org/viaf/"
  # Added: the organizations knowledge-base list references namespace "isni"
  # (uri_pattern "https://isni.org/isni/{id}"), which was previously undeclared.
  isni: "https://isni.org/isni/"
  gnd: "https://d-nb.info/gnd/"
  loc: "http://id.loc.gov/authorities/"
  geonames: "http://sws.geonames.org/"
  pleiades: "https://pleiades.stoa.org/places/"
  tgn: "http://vocab.getty.edu/tgn/"
  aat: "http://vocab.getty.edu/aat/"
  ulan: "http://vocab.getty.edu/ulan/"
|
|
|
|
# =============================================================================
|
|
# COREFERENCE MODEL
|
|
# =============================================================================
|
|
|
|
# Conceptual data model: the three concepts (Mention, Coreference Chain,
# Entity Link) and the properties each one carries. Each entry under
# `properties` is a single-key mapping of property name -> human-readable
# description.
coreference_model:
  description: |
    The coreference model groups mentions into chains, where each chain
    represents a single real-world entity. Chains may link to external
    knowledge base entities.

  core_concepts:

    - concept: "Mention"
      description: |
        A textual span that refers to an entity. Mentions are the atomic
        units of coreference - they are linked into chains.
      properties:
        - mention_id: "Unique identifier"
        - span_start: "Character offset start"
        - span_end: "Character offset end"
        - span_text: "Surface form text"
        - mention_type: "Type of mention (see taxonomy)"
        - entity_type: "GLAM-NER entity type code"
        # Serialized as the head_start / head_end optional fields of
        # mention_schema.
        - head_span: "Optional head word offsets (head_start, head_end)"
        - document_id: "Source document identifier"
        - sentence_id: "Sentence identifier within document"

    - concept: "Coreference Chain"
      description: |
        A cluster of mentions that all refer to the same entity.
        Chains have a canonical/representative mention and zero or more
        linked external identifiers.
      properties:
        - chain_id: "Unique identifier for the chain"
        - mentions: "List of mention_ids in this chain"
        - representative_mention: "Canonical mention (usually first full NP)"
        - entity_type: "Resolved entity type for the chain"
        - external_links: "Links to knowledge bases (VIAF, Wikidata, etc.)"
        - confidence: "Confidence score for chain (0.0-1.0)"

    - concept: "Entity Link"
      description: |
        A link from a coreference chain to an external knowledge base entity.
        Enables entity disambiguation and knowledge graph integration.
      properties:
        - link_id: "Unique identifier"
        - chain_id: "Reference to coreference chain"
        - target_uri: "External KB URI"
        - target_source: "KB identifier (VIAF, Wikidata, GND, etc.)"
        # NIL is listed because entity_linking.link_types defines it and the
        # NIL example uses it.
        - link_type: "Type of link (exact, close, related, NIL)"
        - confidence: "Linking confidence (0.0-1.0)"
        - linking_method: "How link was established"
|
|
|
|
# =============================================================================
|
|
# MENTION TYPES
|
|
# =============================================================================
|
|
|
|
# Taxonomy of mention type codes. Each entry has a code, label, description,
# a list of example surface forms (one string per item), and a `properties`
# mapping; PRO additionally defines dotted subtypes (PRO.PER, PRO.POS, ...).
mention_types:
  description: |
    Taxonomy of mention types for coreference. Based on linguistic form
    and referential properties.

  types:
    - code: "NAM"
      label: "Named"
      description: "Proper noun mention (names)"
      examples:
        - "Rembrandt van Rijn"
        - "The Rijksmuseum"
        - "Amsterdam"
      properties:
        canonical: true
        anaphoric: false

    - code: "NOM"
      label: "Nominal"
      description: "Common noun phrase mention"
      examples:
        - "the painter"
        - "the museum"
        - "the Dutch master"
      properties:
        canonical: false
        anaphoric: true
        requires_antecedent: "Often, but not always"

    - code: "PRO"
      label: "Pronominal"
      description: "Pronoun mention"
      # Fixed: items were written as `- "he", "she", "it", "they"`, which is
      # not valid YAML (content after a closed double-quoted scalar). Each
      # pronoun is now its own item, like every other `examples` list here.
      examples:
        - "he"
        - "she"
        - "it"
        - "they"
        - "his"
        - "her"
        - "its"
        - "their"
        - "himself"
        - "herself"
      properties:
        canonical: false
        anaphoric: true
        requires_antecedent: true
      subtypes:
        - code: "PRO.PER"
          label: "Personal Pronoun"
          examples: ["he", "she", "it", "they", "him", "her", "them"]
        - code: "PRO.POS"
          label: "Possessive Pronoun"
          examples: ["his", "her", "its", "their", "hers", "theirs"]
        - code: "PRO.REF"
          label: "Reflexive Pronoun"
          examples: ["himself", "herself", "itself", "themselves"]
        - code: "PRO.DEM"
          label: "Demonstrative Pronoun"
          examples: ["this", "that", "these", "those"]
        - code: "PRO.REL"
          label: "Relative Pronoun"
          examples: ["who", "which", "that", "whom", "whose"]

    - code: "DEF"
      label: "Definite Description"
      description: "Definite NP that uniquely identifies"
      examples:
        - "the author of Don Quixote"
        - "the current Dutch monarch"
        - "the capital of the Netherlands"
      properties:
        canonical: false
        # Free-text value rather than a boolean, unlike the other types.
        anaphoric: "May be discourse-new"

    - code: "DEM"
      label: "Demonstrative NP"
      description: "NP with demonstrative determiner"
      examples:
        - "this painting"
        - "that museum"
        - "these documents"
      properties:
        canonical: false
        anaphoric: true
        deictic: true

    - code: "APP"
      label: "Appositive"
      description: "Appositive phrase providing additional info"
      examples:
        - "Rembrandt, the famous painter,"
        - "Amsterdam, capital of the Netherlands,"
      properties:
        canonical: false
        non_restrictive: true
        coreferent_with_head: true

    - code: "TTL"
      label: "Title/Role"
      description: "Title or role used to refer to person"
      examples:
        - "the King"
        - "the Director"
        - "Her Majesty"
      properties:
        canonical: false
        context_dependent: true
|
|
|
|
# =============================================================================
|
|
# COREFERENCE CHAIN SCHEMA
|
|
# =============================================================================
|
|
|
|
# Field-level schema for a serialized coreference chain, followed by the
# schema of the Mention items it contains. Every field entry names the field,
# its type, and (where given) a format and description.
chain_schema:
  description: "JSON/YAML schema for coreference chain annotations"

  # Fields every chain must carry.
  required_fields:
    - field: "chain_id"
      type: "string"
      format: "UUID or URI"
      description: "Unique identifier for this coreference chain"

    - field: "entity_type"
      type: "string"
      format: "GLAM-NER type code"
      description: "Resolved entity type for all mentions in chain"

    - field: "mentions"
      type: "array"
      item_type: "Mention"
      min_items: 1
      description: "All mentions belonging to this chain"

  # Fields a chain may carry.
  optional_fields:
    - field: "representative_mention"
      type: "string"
      format: "mention_id reference"
      description: "The canonical/representative mention"
      selection_criteria: |
        1. Prefer NAM (named) mentions over NOM/PRO
        2. Prefer first occurrence if multiple NAM
        3. Prefer longest/most complete form
        4. Prefer mentions with external links

    - field: "external_links"
      type: "array"
      item_type: "EntityLink"
      description: "Links to external knowledge bases"

    - field: "attributes"
      type: "object"
      description: "Aggregated attributes from all mentions"
      schema:
        gender: "enum (MALE, FEMALE, NEUTRAL, UNKNOWN)"
        number: "enum (SINGULAR, PLURAL, UNKNOWN)"
        animacy: "enum (ANIMATE, INANIMATE, UNKNOWN)"

    - field: "confidence"
      type: "float"
      range:
        - 0.0
        - 1.0
      description: "Confidence in chain correctness"

    - field: "provenance"
      type: "object"
      description: "How chain was created"
      schema:
        method: "enum (MANUAL, RULE_BASED, ML, HYBRID)"
        annotator_id: "string"
        annotation_date: "ISO 8601"
        guidelines_version: "string"

  # Schema of each item in `mentions`. Offsets are character offsets:
  # span_start is 0-indexed and span_end is exclusive.
  mention_schema:
    required_fields:
      - field: "mention_id"
        type: "string"

      - field: "span_start"
        type: "integer"
        description: "Character offset start (0-indexed)"

      - field: "span_end"
        type: "integer"
        description: "Character offset end (exclusive)"

      - field: "span_text"
        type: "string"
        description: "Surface form text"

      - field: "mention_type"
        type: "string"
        format: "Mention type code"

    optional_fields:
      - field: "head_start"
        type: "integer"
        description: "Head word start offset"

      - field: "head_end"
        type: "integer"
        description: "Head word end offset"

      - field: "entity_type"
        type: "string"
        description: "Mention-level entity type (may differ from chain)"

      - field: "sentence_id"
        type: "string"
        description: "Sentence containing this mention"
|
|
|
|
# =============================================================================
|
|
# ENTITY LINKING
|
|
# =============================================================================
|
|
|
|
# Entity linking configuration: which knowledge bases are targeted per entity
# category, which methods may establish a link (with the confidence band
# associated with each), and the link types with their RDF predicates.
entity_linking:
  description: |
    Entity linking connects coreference chains to external knowledge bases,
    enabling disambiguation and knowledge graph integration.

  # Per-category target lists. `namespace` is a prefix name and `uri_pattern`
  # contains an {id} placeholder.
  target_knowledge_bases:
    persons:
      - name: "VIAF"
        namespace: "viaf"
        uri_pattern: "http://viaf.org/viaf/{id}"
        description: "Virtual International Authority File"
        priority: 1

      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        description: "Wikidata knowledge base"
        priority: 2

      - name: "GND"
        namespace: "gnd"
        uri_pattern: "https://d-nb.info/gnd/{id}"
        description: "German National Library authority"
        priority: 3

      - name: "ULAN"
        namespace: "ulan"
        uri_pattern: "http://vocab.getty.edu/ulan/{id}"
        description: "Getty Union List of Artist Names"
        priority: 2
        domain: "artists"

      - name: "LC Names"
        namespace: "loc"
        uri_pattern: "http://id.loc.gov/authorities/names/{id}"
        description: "Library of Congress Name Authority"
        priority: 3

    places:
      - name: "GeoNames"
        namespace: "geonames"
        uri_pattern: "http://sws.geonames.org/{id}/"
        description: "GeoNames geographical database"
        priority: 1

      - name: "Pleiades"
        namespace: "pleiades"
        uri_pattern: "https://pleiades.stoa.org/places/{id}"
        description: "Pleiades gazetteer of ancient places"
        priority: 1
        domain: "ancient/historical"

      - name: "TGN"
        namespace: "tgn"
        uri_pattern: "http://vocab.getty.edu/tgn/{id}"
        description: "Getty Thesaurus of Geographic Names"
        priority: 2

      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        priority: 3

    organizations:
      - name: "VIAF"
        namespace: "viaf"
        uri_pattern: "http://viaf.org/viaf/{id}"
        priority: 1

      - name: "ISNI"
        namespace: "isni"
        uri_pattern: "https://isni.org/isni/{id}"
        description: "International Standard Name Identifier"
        priority: 2

      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        priority: 3

    works:
      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        priority: 1

      - name: "VIAF"
        namespace: "viaf"
        uri_pattern: "http://viaf.org/viaf/{id}"
        priority: 2

      - name: "AAT"
        namespace: "aat"
        uri_pattern: "http://vocab.getty.edu/aat/{id}"
        description: "Getty Art & Architecture Thesaurus"
        priority: 2
        domain: "art objects"

  # Each confidence_range is a two-element [low, high] pair.
  linking_methods:
    - method: "EXACT_STRING_MATCH"
      description: "Exact match on entity label"
      confidence_range: [0.9, 1.0]

    - method: "FUZZY_STRING_MATCH"
      description: "Approximate string matching (Levenshtein, Jaro-Winkler)"
      confidence_range: [0.7, 0.9]

    - method: "CONTEXT_SIMILARITY"
      description: "Match based on surrounding context"
      confidence_range: [0.6, 0.85]

    - method: "EMBEDDING_SIMILARITY"
      description: "Neural embedding-based matching"
      confidence_range: [0.7, 0.95]

    - method: "STRUCTURED_ATTRIBUTES"
      description: "Match on dates, locations, relationships"
      confidence_range: [0.8, 1.0]

    - method: "MANUAL_CURATION"
      description: "Human-verified linking"
      confidence_range: [0.95, 1.0]

    - method: "CROSS_REFERENCE"
      description: "Match via owl:sameAs links in KB"
      confidence_range: [0.9, 1.0]

  # Link type -> RDF predicate. NIL has no predicate (explicit null).
  link_types:
    - type: "EXACT"
      predicate: "owl:sameAs"
      description: "Chain refers to exactly this KB entity"

    - type: "CLOSE"
      predicate: "skos:closeMatch"
      description: "Chain is very similar to this KB entity"

    - type: "RELATED"
      predicate: "skos:relatedMatch"
      description: "Chain is related to this KB entity"

    - type: "NIL"
      predicate: null
      description: "No matching KB entity exists (NIL clustering)"
|
|
|
|
# =============================================================================
|
|
# CROSS-DOCUMENT COREFERENCE
|
|
# =============================================================================
|
|
|
|
# Cross-document coreference: three alternative approaches (each with a
# numbered workflow, advantages and challenges) and the schema of the global
# entity record that ties document-level chains together.
cross_document:
  description: |
    Handling coreference across multiple documents. Essential for
    corpus-level entity consolidation and knowledge base construction.

  # `workflow` is a mapping keyed by integer step number, not a sequence.
  approaches:
    - approach: "Global Entity Clustering"
      description: |
        All mentions across all documents are clustered into global entities.
        Each global entity corresponds to one real-world referent.
      workflow:
        1: "Extract mentions from all documents"
        2: "Perform within-document coreference"
        3: "Extract features from document-level chains"
        4: "Cluster chains across documents by similarity"
        5: "Link clusters to KB entities"
      advantages:
        - "Unified entity inventory"
        - "Enables cross-document analysis"
      challenges:
        - "Scalability with large corpora"
        - "Error propagation from document-level"

    - approach: "KB-Anchored Linking"
      description: |
        Document-level chains are independently linked to KB entities.
        Cross-document coreference is implicit via shared KB links.
      workflow:
        1: "Perform within-document coreference"
        2: "Link each chain to KB independently"
        3: "Chains with same KB link are cross-doc coreferent"
      advantages:
        - "Scalable (document-independent)"
        - "Leverages KB structure"
      challenges:
        - "Depends on KB coverage"
        - "NIL entities not handled"

    - approach: "Incremental Clustering"
      description: |
        Documents processed incrementally, updating global entity inventory.
      workflow:
        1: "Process documents in order"
        2: "For each document chain, find matching global entity"
        3: "If match found, merge; else create new global entity"
        4: "Update global entity features"
      advantages:
        - "Streaming-compatible"
        - "Memory-efficient"
      challenges:
        - "Order-dependent results"
        - "May require periodic re-clustering"

  # Record describing one corpus-level entity.
  global_entity_schema:
    required_fields:
      - field: "global_entity_id"
        type: "string"
        format: "UUID or URI"

      - field: "entity_type"
        type: "string"
        format: "GLAM-NER type code"

      - field: "canonical_name"
        type: "string"
        description: "Best/canonical name for entity"

      - field: "document_chains"
        type: "array"
        description: "List of (document_id, chain_id) pairs"

    optional_fields:
      - field: "alternative_names"
        type: "array"
        item_type: "string"
        description: "All name variants observed"

      - field: "external_links"
        type: "array"
        item_type: "EntityLink"

      - field: "mention_count"
        type: "integer"
        description: "Total mentions across all documents"

      - field: "document_count"
        type: "integer"
        description: "Number of documents mentioning entity"
|
|
|
|
# =============================================================================
|
|
# SPECIAL CASES
|
|
# =============================================================================
|
|
|
|
# Annotation policy for the non-standard cases: singletons, split
# antecedents, generic mentions, cataphora and metonymy. Each case has a
# description, a `handling` mapping, and one or more worked examples.
special_cases:

  singletons:
    description: |
      Singleton mentions are entity mentions that don't corefer with any
      other mention in the document. They form single-member chains.
    handling:
      policy: "INCLUDE"
      rationale: |
        Include singletons for completeness and entity linking.
        Many entity mentions in documents are singletons.
      annotation: |
        Create a chain with exactly one mention.
        Still attempt entity linking for singletons.
    example:
      text: "Van Gogh visited Arles in 1888."
      chains:
        - chain_id: "chain-001"
          mentions:
            - mention_id: "m1"
              span_text: "Van Gogh"
              mention_type: "NAM"
          # Singleton - no other mentions in document

  split_antecedents:
    description: |
      Split antecedents occur when a plural pronoun refers to multiple
      previously mentioned entities.
    handling:
      annotation_strategy: "MULTI_CHAIN_LINK"
      description: |
        Create a special "group mention" that links to multiple chains.
        The plural pronoun forms its own chain that references the component chains.
    example:
      text: "Rembrandt met Vermeer. They discussed technique."
      chains:
        - chain_id: "chain-rembrandt"
          mentions:
            - span_text: "Rembrandt"
        - chain_id: "chain-vermeer"
          mentions:
            - span_text: "Vermeer"
        # The pronoun's own chain points at both component chains.
        - chain_id: "chain-they"
          mentions:
            - span_text: "They"
              mention_type: "PRO"
          split_antecedent_refs:
            - "chain-rembrandt"
            - "chain-vermeer"

  generic_mentions:
    description: |
      Generic mentions refer to a class or type rather than a specific entity.
    handling:
      policy: "EXCLUDE_FROM_CHAINS"
      rationale: |
        Generic mentions don't refer to specific entities and should not
        be in coreference chains. They may still be annotated as entities.
    examples:
      - "A painter must understand light."  # Generic 'painter'
      - "Museums should preserve culture."  # Generic 'museums'
    annotation:
      entity_annotation: true
      coreference: false
      marker: "GENERIC"

  cataphora:
    description: |
      Cataphora is forward-pointing reference, where a pronoun precedes
      its referent.
    handling:
      same_as_anaphora: true
      note: "Chain ordering doesn't imply antecedent relationship"
    example:
      text: "When he arrived, Rembrandt found the studio empty."
      chain:
        mentions:
          # Cataphoric
          - span_text: "he"
            mention_type: "PRO"
          # Referent
          - span_text: "Rembrandt"
            mention_type: "NAM"

  metonymy:
    description: |
      Metonymic mentions use one entity to refer to a related entity.
    handling:
      policy: "ANNOTATE_BOTH"
      description: |
        Annotate the literal entity and create a separate chain for the
        metonymic referent with a metonymy marker.
    examples:
      - text: "The White House announced..."
        literal: "TOP.BLT (building)"
        metonymic: "GRP.GOV (US executive branch)"
      - text: "Amsterdam rejected the proposal."
        literal: "TOP.CTY (city)"
        metonymic: "GRP.GOV (city government)"
|
|
|
|
# =============================================================================
|
|
# OUTPUT FORMATS
|
|
# =============================================================================
|
|
|
|
# Serialization targets for coreference annotations: CoNLL-2012 columns,
# a JSON chain structure, and NIF-based RDF (Turtle). The `example` and
# `schema` values are literal block scalars holding text in the target format.
output_formats:

  conll_2012:
    description: "CoNLL-2012 shared task format (column-based)"
    specification: "https://conll.cemantix.org/2012/data.html"
    columns:
      - "Document ID"
      - "Part number"
      - "Word number"
      - "Word"
      - "POS tag"
      - "Parse bit"
      - "Predicate lemma"
      - "Predicate frameset"
      - "Word sense"
      - "Speaker"
      - "Named entities"
      - "Coreference"
    coreference_notation: |
      Parentheses notation:
      - (X = start of chain X
      - X) = end of chain X
      - (X) = single-word chain X
    example: |
      doc1 0 0 Rembrandt NNP ... (1)
      doc1 0 1 painted VBD ... -
      doc1 0 2 the DT ... (2
      doc1 0 3 Night NNP ... -
      doc1 0 4 Watch NNP ... 2)
      doc1 0 5 . . ... -
      doc1 0 6 He PRP ... (1)
      doc1 0 7 completed VBD ... -
      doc1 0 8 it PRP ... (2)

  json_chains:
    description: "JSON format with explicit chain objects"
    # Informal shape description: `int` and `float` are type placeholders,
    # so the text is not itself parseable JSON.
    schema: |
      {
        "document_id": "string",
        "chains": [
          {
            "chain_id": "string",
            "entity_type": "string",
            "mentions": [
              {
                "mention_id": "string",
                "span_start": int,
                "span_end": int,
                "span_text": "string",
                "mention_type": "string"
              }
            ],
            "external_links": [
              {
                "target_uri": "string",
                "target_source": "string",
                "confidence": float
              }
            ]
          }
        ]
      }

  nif_rdf:
    description: "NIF-based RDF format"
    # Fixed: the snippet uses owl:sameAs, so the owl prefix is now declared
    # alongside nif and itsrdf; without it the Turtle does not parse.
    example: |
      @prefix nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#> .
      @prefix itsrdf: <http://www.w3.org/2005/11/its/rdf#> .
      @prefix owl: <http://www.w3.org/2002/07/owl#> .

      <http://example.org/doc1#char=0,9>
        a nif:Phrase ;
        nif:anchorOf "Rembrandt" ;
        nif:beginIndex 0 ;
        nif:endIndex 9 ;
        itsrdf:taIdentRef <http://viaf.org/viaf/64013650> ;
        nif:referenceContext <http://example.org/doc1#char=0,100> .

      <http://example.org/doc1#char=40,42>
        a nif:Phrase ;
        nif:anchorOf "He" ;
        nif:beginIndex 40 ;
        nif:endIndex 42 ;
        owl:sameAs <http://example.org/doc1#char=0,9> .
|
|
|
|
# =============================================================================
|
|
# EXAMPLES
|
|
# =============================================================================
|
|
|
|
# Worked annotation examples. Offsets are character offsets into `text`:
# span_start is 0-indexed and span_end is exclusive, so
# span_end - span_start equals the length of span_text.
examples:

  - name: "Simple pronominal coreference"
    text: "Rembrandt van Rijn was born in Leiden. He became Amsterdam's most famous painter."
    annotation:
      chains:
        - chain_id: "chain-rembrandt"
          entity_type: "AGT.PER"
          representative_mention: "m1"
          mentions:
            - mention_id: "m1"
              span_start: 0
              span_end: 18
              span_text: "Rembrandt van Rijn"
              mention_type: "NAM"
            - mention_id: "m2"
              span_start: 39
              span_end: 41
              span_text: "He"
              mention_type: "PRO.PER"
          external_links:
            - target_uri: "http://viaf.org/viaf/64013650"
              target_source: "VIAF"
              link_type: "EXACT"
              confidence: 0.98

  - name: "Definite description coreference"
    text: "The Rijksmuseum opened in 1885. The museum houses The Night Watch."
    annotation:
      chains:
        - chain_id: "chain-rijksmuseum"
          entity_type: "GRP.ORG"
          representative_mention: "m1"
          mentions:
            - mention_id: "m1"
              span_start: 0
              # Fixed off-by-one: "The Rijksmuseum" is 15 characters, so the
              # exclusive end offset is 15 (was 14, which cut the final "m").
              span_end: 15
              span_text: "The Rijksmuseum"
              mention_type: "NAM"
            - mention_id: "m2"
              span_start: 32
              span_end: 42
              span_text: "The museum"
              mention_type: "NOM"
          external_links:
            - target_uri: "http://www.wikidata.org/entity/Q190804"
              target_source: "Wikidata"
              link_type: "EXACT"
              confidence: 0.99

  - name: "Appositive coreference"
    text: "Vermeer, the Delft master, painted Girl with a Pearl Earring."
    annotation:
      chains:
        - chain_id: "chain-vermeer"
          entity_type: "AGT.PER"
          mentions:
            - mention_id: "m1"
              span_start: 0
              span_end: 7
              span_text: "Vermeer"
              mention_type: "NAM"
            - mention_id: "m2"
              span_start: 9
              span_end: 25
              span_text: "the Delft master"
              mention_type: "APP"

  - name: "NIL entity (no KB match)"
    text: "Local artist Jan de Vries exhibited at the town hall."
    annotation:
      chains:
        - chain_id: "chain-jdv"
          entity_type: "AGT.PER"
          mentions:
            - mention_id: "m1"
              span_start: 13
              span_end: 25
              span_text: "Jan de Vries"
              mention_type: "NAM"
          # A NIL link has no target; the chain is tracked by a NIL cluster id.
          external_links:
            - target_uri: null
              target_source: null
              link_type: "NIL"
              confidence: 0.85
              nil_cluster_id: "nil-cluster-001"
|