# =============================================================================
# GLAM-NER Entity Annotation Convention
# Module: Coreference Resolution
# Path: modules/advanced/coreference.yaml
# Version: 1.7.0
# =============================================================================
#
# This module defines coreference annotation for linking multiple mentions
# that refer to the same real-world entity within and across documents.
#
# Key distinction from relationships:
#   - COREFERENCE: Two mentions refer to the SAME entity (identity)
#   - RELATIONSHIP: Two DIFFERENT entities have a semantic connection
#
# Example:
#   - Coreference: "Rembrandt" and "the Dutch master" → same person
#   - Relationship: "Rembrandt" created "The Night Watch" → two entities linked
#
# Standards alignment:
#   - CoNLL-2012 coreference format
#   - OntoNotes coreference guidelines
#   - ACE (Automatic Content Extraction)
#   - TAC-KBP entity linking
#
# =============================================================================

module:
  id: coreference
  name: Coreference Resolution
  version: "1.7.0"
  status: stable
  category: advanced

  dependencies:
    - core/convention
    - core/namespaces
    - hypernyms/agt
    - hypernyms/grp
    - hypernyms/top
    - hypernyms/wrk
    - hypernyms/app

  description: |
    Coreference resolution identifies when multiple textual mentions refer
    to the same real-world entity. This is essential for:

    - Entity consolidation across document spans
    - Knowledge base population (linking mentions to KB entities)
    - Cross-document entity tracking
    - Pronoun resolution and anaphora handling

    This module provides:

    - Coreference chain data model
    - Mention types taxonomy
    - Entity linking to external knowledge bases
    - Cross-document coreference handling
    - Singleton and split-antecedent patterns

# =============================================================================
# NAMESPACES
# =============================================================================

namespaces:
  nif: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"
  itsrdf: "http://www.w3.org/2005/11/its/rdf#"
  oa: "http://www.w3.org/ns/oa#"
  prov: "http://www.w3.org/ns/prov#"
  skos: "http://www.w3.org/2004/02/skos/core#"
  owl: "http://www.w3.org/2002/07/owl#"
  dcterms: "http://purl.org/dc/terms/"
  schema: "http://schema.org/"

  # Entity linking targets
  wd: "http://www.wikidata.org/entity/"
  viaf: "http://viaf.org/viaf/"
  gnd: "https://d-nb.info/gnd/"
  loc: "http://id.loc.gov/authorities/"
  geonames: "http://sws.geonames.org/"
  pleiades: "https://pleiades.stoa.org/places/"
  tgn: "http://vocab.getty.edu/tgn/"
  aat: "http://vocab.getty.edu/aat/"
  ulan: "http://vocab.getty.edu/ulan/"
  # Declared because entity_linking.target_knowledge_bases.organizations
  # references the "isni" namespace.
  isni: "https://isni.org/isni/"

# =============================================================================
# COREFERENCE MODEL
# =============================================================================

coreference_model:
  description: |
    The coreference model groups mentions into chains, where each chain
    represents a single real-world entity. Chains may link to external
    knowledge base entities.

  core_concepts:
    - concept: "Mention"
      description: |
        A textual span that refers to an entity. Mentions are the atomic
        units of coreference - they are linked into chains.
      properties:
        - mention_id: "Unique identifier"
        - span_start: "Character offset start"
        - span_end: "Character offset end"
        - span_text: "Surface form text"
        - mention_type: "Type of mention (see taxonomy)"
        - entity_type: "GLAM-NER entity type code"
        - head_span: "Optional head word offsets"
        - document_id: "Source document identifier"
        - sentence_id: "Sentence identifier within document"

    - concept: "Coreference Chain"
      description: |
        A cluster of mentions that all refer to the same entity.
        Chains have a canonical/representative mention and zero or more
        linked external identifiers.
      properties:
        - chain_id: "Unique identifier for the chain"
        - mentions: "List of mention_ids in this chain"
        - representative_mention: "Canonical mention (usually first full NP)"
        - entity_type: "Resolved entity type for the chain"
        - external_links: "Links to knowledge bases (VIAF, Wikidata, etc.)"
        - confidence: "Confidence score for chain (0.0-1.0)"

    - concept: "Entity Link"
      description: |
        A link from a coreference chain to an external knowledge base entity.
        Enables entity disambiguation and knowledge graph integration.
      properties:
        - link_id: "Unique identifier"
        - chain_id: "Reference to coreference chain"
        - target_uri: "External KB URI"
        - target_source: "KB identifier (VIAF, Wikidata, GND, etc.)"
        - link_type: "Type of link (exact, close, related)"
        - confidence: "Linking confidence (0.0-1.0)"
        - linking_method: "How link was established"

# =============================================================================
# MENTION TYPES
# =============================================================================

mention_types:
  description: |
    Taxonomy of mention types for coreference. Based on linguistic form
    and referential properties.

  types:
    - code: "NAM"
      label: "Named"
      description: "Proper noun mention (names)"
      examples:
        - "Rembrandt van Rijn"
        - "The Rijksmuseum"
        - "Amsterdam"
      properties:
        canonical: true
        anaphoric: false

    - code: "NOM"
      label: "Nominal"
      description: "Common noun phrase mention"
      examples:
        - "the painter"
        - "the museum"
        - "the Dutch master"
      properties:
        canonical: false
        anaphoric: true
        requires_antecedent: "Often, but not always"

    - code: "PRO"
      label: "Pronominal"
      description: "Pronoun mention"
      # One string per item, like every other mention type. Several quoted
      # strings on a single "- " line is not valid YAML.
      examples:
        - "he"
        - "she"
        - "it"
        - "they"
        - "his"
        - "her"
        - "its"
        - "their"
        - "himself"
        - "herself"
      properties:
        canonical: false
        anaphoric: true
        requires_antecedent: true
      subtypes:
        - code: "PRO.PER"
          label: "Personal Pronoun"
          examples: ["he", "she", "it", "they", "him", "her", "them"]
        - code: "PRO.POS"
          label: "Possessive Pronoun"
          examples: ["his", "her", "its", "their", "hers", "theirs"]
        - code: "PRO.REF"
          label: "Reflexive Pronoun"
          examples: ["himself", "herself", "itself", "themselves"]
        - code: "PRO.DEM"
          label: "Demonstrative Pronoun"
          examples: ["this", "that", "these", "those"]
        - code: "PRO.REL"
          label: "Relative Pronoun"
          examples: ["who", "which", "that", "whom", "whose"]

    - code: "DEF"
      label: "Definite Description"
      description: "Definite NP that uniquely identifies"
      examples:
        - "the author of Don Quixote"
        - "the current Dutch monarch"
        - "the capital of the Netherlands"
      properties:
        canonical: false
        anaphoric: "May be discourse-new"

    - code: "DEM"
      label: "Demonstrative NP"
      description: "NP with demonstrative determiner"
      examples:
        - "this painting"
        - "that museum"
        - "these documents"
      properties:
        canonical: false
        anaphoric: true
        deictic: true

    - code: "APP"
      label: "Appositive"
      description: "Appositive phrase providing additional info"
      examples:
        - "Rembrandt, the famous painter,"
        - "Amsterdam, capital of the Netherlands,"
      properties:
        canonical: false
        non_restrictive: true
        coreferent_with_head: true

    - code: "TTL"
      label: "Title/Role"
      description: "Title or role used to refer to person"
      examples:
        - "the King"
        - "the Director"
        - "Her Majesty"
      properties:
        canonical: false
        context_dependent: true

# =============================================================================
# COREFERENCE CHAIN SCHEMA
# =============================================================================

chain_schema:
  description: "JSON/YAML schema for coreference chain annotations"

  required_fields:
    - field: "chain_id"
      type: "string"
      format: "UUID or URI"
      description: "Unique identifier for this coreference chain"

    - field: "entity_type"
      type: "string"
      format: "GLAM-NER type code"
      description: "Resolved entity type for all mentions in chain"

    - field: "mentions"
      type: "array"
      item_type: "Mention"
      min_items: 1
      description: "All mentions belonging to this chain"

  optional_fields:
    - field: "representative_mention"
      type: "string"
      format: "mention_id reference"
      description: "The canonical/representative mention"
      selection_criteria: |
        1. Prefer NAM (named) mentions over NOM/PRO
        2. Prefer first occurrence if multiple NAM
        3. Prefer longest/most complete form
        4. Prefer mentions with external links

    - field: "external_links"
      type: "array"
      item_type: "EntityLink"
      description: "Links to external knowledge bases"

    - field: "attributes"
      type: "object"
      description: "Aggregated attributes from all mentions"
      schema:
        gender: "enum (MALE, FEMALE, NEUTRAL, UNKNOWN)"
        number: "enum (SINGULAR, PLURAL, UNKNOWN)"
        animacy: "enum (ANIMATE, INANIMATE, UNKNOWN)"

    - field: "confidence"
      type: "float"
      range: [0.0, 1.0]
      description: "Confidence in chain correctness"

    - field: "provenance"
      type: "object"
      description: "How chain was created"
      schema:
        method: "enum (MANUAL, RULE_BASED, ML, HYBRID)"
        annotator_id: "string"
        annotation_date: "ISO 8601"
        guidelines_version: "string"

mention_schema:
  required_fields:
    - field: "mention_id"
      type: "string"

    - field: "span_start"
      type: "integer"
      description: "Character offset start (0-indexed)"

    - field: "span_end"
      type: "integer"
      description: "Character offset end (exclusive)"

    - field: "span_text"
      type: "string"
      description: "Surface form text"

    - field: "mention_type"
      type: "string"
      format: "Mention type code"

  optional_fields:
    - field: "head_start"
      type: "integer"
      description: "Head word start offset"

    - field: "head_end"
      type: "integer"
      description: "Head word end offset"

    - field: "entity_type"
      type: "string"
      description: "Mention-level entity type (may differ from chain)"

    - field: "sentence_id"
      type: "string"
      description: "Sentence containing this mention"

# =============================================================================
# ENTITY LINKING
# =============================================================================

entity_linking:
  description: |
    Entity linking connects coreference chains to external knowledge bases,
    enabling disambiguation and knowledge graph integration.

  target_knowledge_bases:
    persons:
      - name: "VIAF"
        namespace: "viaf"
        uri_pattern: "http://viaf.org/viaf/{id}"
        description: "Virtual International Authority File"
        priority: 1

      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        description: "Wikidata knowledge base"
        priority: 2

      - name: "GND"
        namespace: "gnd"
        uri_pattern: "https://d-nb.info/gnd/{id}"
        description: "German National Library authority"
        priority: 3

      - name: "ULAN"
        namespace: "ulan"
        uri_pattern: "http://vocab.getty.edu/ulan/{id}"
        description: "Getty Union List of Artist Names"
        priority: 2
        domain: "artists"

      - name: "LC Names"
        namespace: "loc"
        uri_pattern: "http://id.loc.gov/authorities/names/{id}"
        description: "Library of Congress Name Authority"
        priority: 3

    places:
      - name: "GeoNames"
        namespace: "geonames"
        uri_pattern: "http://sws.geonames.org/{id}/"
        description: "GeoNames geographical database"
        priority: 1

      - name: "Pleiades"
        namespace: "pleiades"
        uri_pattern: "https://pleiades.stoa.org/places/{id}"
        description: "Pleiades gazetteer of ancient places"
        priority: 1
        domain: "ancient/historical"

      - name: "TGN"
        namespace: "tgn"
        uri_pattern: "http://vocab.getty.edu/tgn/{id}"
        description: "Getty Thesaurus of Geographic Names"
        priority: 2

      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        priority: 3

    organizations:
      - name: "VIAF"
        namespace: "viaf"
        uri_pattern: "http://viaf.org/viaf/{id}"
        priority: 1

      - name: "ISNI"
        namespace: "isni"
        uri_pattern: "https://isni.org/isni/{id}"
        description: "International Standard Name Identifier"
        priority: 2

      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        priority: 3

    works:
      - name: "Wikidata"
        namespace: "wd"
        uri_pattern: "http://www.wikidata.org/entity/{id}"
        priority: 1

      - name: "VIAF"
        namespace: "viaf"
        uri_pattern: "http://viaf.org/viaf/{id}"
        priority: 2

      - name: "AAT"
        namespace: "aat"
        uri_pattern: "http://vocab.getty.edu/aat/{id}"
        description: "Getty Art & Architecture Thesaurus"
        priority: 2
        domain: "art objects"

  linking_methods:
    - method: "EXACT_STRING_MATCH"
      description: "Exact match on entity label"
      confidence_range: [0.9, 1.0]

    - method: "FUZZY_STRING_MATCH"
      description: "Approximate string matching (Levenshtein, Jaro-Winkler)"
      confidence_range: [0.7, 0.9]

    - method: "CONTEXT_SIMILARITY"
      description: "Match based on surrounding context"
      confidence_range: [0.6, 0.85]

    - method: "EMBEDDING_SIMILARITY"
      description: "Neural embedding-based matching"
      confidence_range: [0.7, 0.95]

    - method: "STRUCTURED_ATTRIBUTES"
      description: "Match on dates, locations, relationships"
      confidence_range: [0.8, 1.0]

    - method: "MANUAL_CURATION"
      description: "Human-verified linking"
      confidence_range: [0.95, 1.0]

    - method: "CROSS_REFERENCE"
      description: "Match via owl:sameAs links in KB"
      confidence_range: [0.9, 1.0]

  link_types:
    - type: "EXACT"
      predicate: "owl:sameAs"
      description: "Chain refers to exactly this KB entity"

    - type: "CLOSE"
      predicate: "skos:closeMatch"
      description: "Chain is very similar to this KB entity"

    - type: "RELATED"
      predicate: "skos:relatedMatch"
      description: "Chain is related to this KB entity"

    - type: "NIL"
      predicate: null
      description: "No matching KB entity exists (NIL clustering)"

# =============================================================================
# CROSS-DOCUMENT COREFERENCE
# =============================================================================

cross_document:
  description: |
    Handling coreference across multiple documents. Essential for
    corpus-level entity consolidation and knowledge base construction.

  approaches:
    - approach: "Global Entity Clustering"
      description: |
        All mentions across all documents are clustered into global entities.
        Each global entity corresponds to one real-world referent.
      workflow:
        1: "Extract mentions from all documents"
        2: "Perform within-document coreference"
        3: "Extract features from document-level chains"
        4: "Cluster chains across documents by similarity"
        5: "Link clusters to KB entities"
      advantages:
        - "Unified entity inventory"
        - "Enables cross-document analysis"
      challenges:
        - "Scalability with large corpora"
        - "Error propagation from document-level"

    - approach: "KB-Anchored Linking"
      description: |
        Document-level chains are independently linked to KB entities.
        Cross-document coreference is implicit via shared KB links.
      workflow:
        1: "Perform within-document coreference"
        2: "Link each chain to KB independently"
        3: "Chains with same KB link are cross-doc coreferent"
      advantages:
        - "Scalable (document-independent)"
        - "Leverages KB structure"
      challenges:
        - "Depends on KB coverage"
        - "NIL entities not handled"

    - approach: "Incremental Clustering"
      description: |
        Documents processed incrementally, updating global entity inventory.
      workflow:
        1: "Process documents in order"
        2: "For each document chain, find matching global entity"
        3: "If match found, merge; else create new global entity"
        4: "Update global entity features"
      advantages:
        - "Streaming-compatible"
        - "Memory-efficient"
      challenges:
        - "Order-dependent results"
        - "May require periodic re-clustering"

  global_entity_schema:
    required_fields:
      - field: "global_entity_id"
        type: "string"
        format: "UUID or URI"

      - field: "entity_type"
        type: "string"
        format: "GLAM-NER type code"

      - field: "canonical_name"
        type: "string"
        description: "Best/canonical name for entity"

      - field: "document_chains"
        type: "array"
        description: "List of (document_id, chain_id) pairs"

    optional_fields:
      - field: "alternative_names"
        type: "array"
        item_type: "string"
        description: "All name variants observed"

      - field: "external_links"
        type: "array"
        item_type: "EntityLink"

      - field: "mention_count"
        type: "integer"
        description: "Total mentions across all documents"

      - field: "document_count"
        type: "integer"
        description: "Number of documents mentioning entity"

# =============================================================================
# SPECIAL CASES
# =============================================================================

special_cases:
  singletons:
    description: |
      Singleton mentions are entity mentions that don't corefer with any
      other mention in the document. They form single-member chains.
    handling:
      policy: "INCLUDE"
      rationale: |
        Include singletons for completeness and entity linking.
        Many entity mentions in documents are singletons.
      annotation: |
        Create a chain with exactly one mention.
        Still attempt entity linking for singletons.
    example:
      text: "Van Gogh visited Arles in 1888."
      chains:
        - chain_id: "chain-001"
          mentions:
            - mention_id: "m1"
              span_text: "Van Gogh"
              mention_type: "NAM"
          # Singleton - no other mentions in document

  split_antecedents:
    description: |
      Split antecedents occur when a plural pronoun refers to multiple
      previously mentioned entities.
    handling:
      annotation_strategy: "MULTI_CHAIN_LINK"
      description: |
        Create a special "group mention" that links to multiple chains.
        The plural pronoun forms its own chain that references the
        component chains.
    example:
      text: "Rembrandt met Vermeer. They discussed technique."
      chains:
        - chain_id: "chain-rembrandt"
          mentions: [{span_text: "Rembrandt"}]
        - chain_id: "chain-vermeer"
          mentions: [{span_text: "Vermeer"}]
        - chain_id: "chain-they"
          mentions: [{span_text: "They", mention_type: "PRO"}]
          split_antecedent_refs: ["chain-rembrandt", "chain-vermeer"]

  generic_mentions:
    description: |
      Generic mentions refer to a class or type rather than a specific entity.
    handling:
      policy: "EXCLUDE_FROM_CHAINS"
      rationale: |
        Generic mentions don't refer to specific entities and should not
        be in coreference chains. They may still be annotated as entities.
    examples:
      - "A painter must understand light."  # Generic 'painter'
      - "Museums should preserve culture."  # Generic 'museums'
    annotation:
      entity_annotation: true
      coreference: false
      marker: "GENERIC"

  cataphora:
    description: |
      Cataphora is forward-pointing reference, where a pronoun precedes
      its referent.
    handling:
      same_as_anaphora: true
      note: "Chain ordering doesn't imply antecedent relationship"
    example:
      text: "When he arrived, Rembrandt found the studio empty."
      chain:
        mentions:
          - {span_text: "he", mention_type: "PRO"}  # Cataphoric
          - {span_text: "Rembrandt", mention_type: "NAM"}  # Referent

  metonymy:
    description: |
      Metonymic mentions use one entity to refer to a related entity.
    handling:
      policy: "ANNOTATE_BOTH"
      description: |
        Annotate the literal entity and create a separate chain for
        the metonymic referent with a metonymy marker.
    examples:
      - text: "The White House announced..."
        literal: "TOP.BLT (building)"
        metonymic: "GRP.GOV (US executive branch)"
      - text: "Amsterdam rejected the proposal."
        literal: "TOP.CTY (city)"
        metonymic: "GRP.GOV (city government)"

# =============================================================================
# OUTPUT FORMATS
# =============================================================================

output_formats:
  conll_2012:
    description: "CoNLL-2012 shared task format (column-based)"
    specification: "https://conll.cemantix.org/2012/data.html"
    columns:
      - "Document ID"
      - "Part number"
      - "Word number"
      - "Word"
      - "POS tag"
      - "Parse bit"
      - "Predicate lemma"
      - "Predicate frameset"
      - "Word sense"
      - "Speaker"
      - "Named entities"
      - "Coreference"
    coreference_notation: |
      Parentheses notation:
      - (X = start of chain X
      - X) = end of chain X
      - (X) = single-word chain X
    example: |
      doc1  0  0  Rembrandt  NNP  ...  (1)
      doc1  0  1  painted    VBD  ...  -
      doc1  0  2  the        DT   ...  (2
      doc1  0  3  Night      NNP  ...  -
      doc1  0  4  Watch      NNP  ...  2)
      doc1  0  5  .          .    ...  -
      doc1  0  6  He         PRP  ...  (1)
      doc1  0  7  completed  VBD  ...  -
      doc1  0  8  it         PRP  ...  (2)

  json_chains:
    description: "JSON format with explicit chain objects"
    schema: |
      {
        "document_id": "string",
        "chains": [
          {
            "chain_id": "string",
            "entity_type": "string",
            "mentions": [
              {
                "mention_id": "string",
                "span_start": int,
                "span_end": int,
                "span_text": "string",
                "mention_type": "string"
              }
            ],
            "external_links": [
              {
                "target_uri": "string",
                "target_source": "string",
                "confidence": float
              }
            ]
          }
        ]
      }

  nif_rdf:
    description: "NIF-based RDF format"
    # NOTE(review): prefix IRIs are taken from the `namespaces` table above.
    # The example.org subject/context IRIs are illustrative placeholders —
    # confirm against the project's document URI scheme.
    example: |
      @prefix nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#> .
      @prefix itsrdf: <http://www.w3.org/2005/11/its/rdf#> .
      @prefix owl: <http://www.w3.org/2002/07/owl#> .

      <http://example.org/doc1#char=0,9>
          a nif:Phrase ;
          nif:anchorOf "Rembrandt" ;
          nif:beginIndex 0 ;
          nif:endIndex 9 ;
          itsrdf:taIdentRef <http://viaf.org/viaf/64013650> ;
          nif:referenceContext <http://example.org/doc1#context> .

      <http://example.org/doc1#char=40,42>
          a nif:Phrase ;
          nif:anchorOf "He" ;
          nif:beginIndex 40 ;
          nif:endIndex 42 ;
          owl:sameAs <http://example.org/doc1#char=0,9> .

# =============================================================================
# EXAMPLES
# =============================================================================

examples:
  - name: "Simple pronominal coreference"
    text: "Rembrandt van Rijn was born in Leiden. He became Amsterdam's most famous painter."
    annotation:
      chains:
        - chain_id: "chain-rembrandt"
          entity_type: "AGT.PER"
          representative_mention: "m1"
          mentions:
            - mention_id: "m1"
              span_start: 0
              span_end: 18
              span_text: "Rembrandt van Rijn"
              mention_type: "NAM"
            - mention_id: "m2"
              span_start: 39
              span_end: 41
              span_text: "He"
              mention_type: "PRO.PER"
          external_links:
            - target_uri: "http://viaf.org/viaf/64013650"
              target_source: "VIAF"
              link_type: "EXACT"
              confidence: 0.98

  - name: "Definite description coreference"
    text: "The Rijksmuseum opened in 1885. The museum houses The Night Watch."
    annotation:
      chains:
        - chain_id: "chain-rijksmuseum"
          entity_type: "GRP.ORG"
          representative_mention: "m1"
          mentions:
            - mention_id: "m1"
              span_start: 0
              # Exclusive end offset: "The Rijksmuseum" is 15 characters.
              span_end: 15
              span_text: "The Rijksmuseum"
              mention_type: "NAM"
            - mention_id: "m2"
              span_start: 32
              span_end: 42
              span_text: "The museum"
              mention_type: "NOM"
          external_links:
            - target_uri: "http://www.wikidata.org/entity/Q190804"
              target_source: "Wikidata"
              link_type: "EXACT"
              confidence: 0.99

  - name: "Appositive coreference"
    text: "Vermeer, the Delft master, painted Girl with a Pearl Earring."
    annotation:
      chains:
        - chain_id: "chain-vermeer"
          entity_type: "AGT.PER"
          mentions:
            - mention_id: "m1"
              span_start: 0
              span_end: 7
              span_text: "Vermeer"
              mention_type: "NAM"
            - mention_id: "m2"
              span_start: 9
              span_end: 25
              span_text: "the Delft master"
              mention_type: "APP"

  - name: "NIL entity (no KB match)"
    text: "Local artist Jan de Vries exhibited at the town hall."
    annotation:
      chains:
        - chain_id: "chain-jdv"
          entity_type: "AGT.PER"
          mentions:
            - mention_id: "m1"
              span_start: 13
              span_end: 25
              span_text: "Jan de Vries"
              mention_type: "NAM"
          external_links:
            - target_uri: null
              target_source: null
              link_type: "NIL"
              confidence: 0.85
              nil_cluster_id: "nil-cluster-001"