# File: glam/data/entity_annotation/modules/integrations/nif_nerd.yaml
# Repository-viewer metadata: 2025-12-05 15:30:23 +01:00; 725 lines; 25 KiB; YAML

# =============================================================================
# GLAM-NER Entity Annotation Convention v1.7.0
# Module: integrations/nif_nerd.yaml
# =============================================================================
# NLP Interchange Format (NIF), Named Entity Recognition and Disambiguation
# (NERD), and W3C Web Annotation (OA) patterns for cross-tool interoperability.
#
# Standards covered:
# - NIF 2.0: String/offset addressing for NLP tool interchange
# - NERD: Cross-system entity type mappings (10 core classes)
# - W3C OA: Web Annotation Data Model for annotation provenance
# - ITS 2.0 (itsrdf): Entity linking predicates
#
# References:
# - NIF: https://persistence.uni-leipzig.org/nlp2rdf/
# - NERD: http://nerd.eurecom.fr/
# - W3C OA: https://www.w3.org/TR/annotation-model/
# - ITS 2.0: https://www.w3.org/TR/its20/
# =============================================================================
nif_nerd_integration:
  # Module overview. Names the four interchange standards this module covers
  # and states the rule that NERD classes are emitted alongside, never instead
  # of, the native GLAM-NER types.
  description: |
    This module defines how GLAM-NER annotations integrate with:
    - NIF 2.0: Standard format for NLP tool interchange (string/offset addressing)
    - NERD: Cross-system entity type mappings (10 core classes)
    - W3C OA: Web Annotation Data Model for annotation provenance
    - itsrdf: ITS 2.0 entity linking predicates
    These standards enable GLAM-NER annotations to be consumed by external
    NLP pipelines, linked data systems, and annotation aggregators.
    IMPORTANT: NERD mappings are for INTEROPERABILITY only. GLAM-NER types
    provide richer semantics than NERD's 10 classes. Always preserve GLAM-NER
    types alongside NERD mappings.
# ---------------------------------------------------------------------------
# NIF Core Patterns
# ---------------------------------------------------------------------------
nif_core_patterns:
  description: |
    NIF (NLP Interchange Format) provides URI-based addressing for text
    spans. Every annotation creates a nif:Phrase linked to its context.

  # The two NIF classes used by this convention: one for the whole document
  # text, one for each extracted mention.
  context_class: "nif:Context"
  context_uri: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#Context"
  context_note: |
    nif:Context represents the full text document. All annotations reference
    this context via nif:referenceContext.
  phrase_class: "nif:Phrase"
  phrase_uri: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#Phrase"
  phrase_note: |
    nif:Phrase represents extracted entity mentions. Each GLAM-NER entity
    becomes a nif:Phrase with offset-based URI addressing.

  # -------------------------------------------------------------------------
  # URI Schemes
  # -------------------------------------------------------------------------
  # Fragment patterns for minting a URI per text span. Exactly one scheme is
  # flagged `preferred: true`; the others are documented alternatives.
  uri_schemes:
    offset_based:
      pattern: "{source_url}#offset_{begin}_{end}"
      example: "https://example.org/page#offset_42_58"
      note: "Default scheme. Begin/end are character offsets (0-based)."
      preferred: true
    rfc5147:
      pattern: "{source_url}#char={begin},{end}"
      example: "https://example.org/page#char=42,58"
      note: "RFC 5147 fragment identifiers for text/plain."
      preferred: false
    context_hash:
      pattern: "{source_url}#hash_{context_length}_{hash}_{begin}_{end}"
      example: "https://example.org/page#hash_1024_a1b2c3_42_58"
      note: "Hash-based URIs for content-addressing. More stable across edits."
      preferred: false
      use_case: "Long-term preservation where source may change"

  # -------------------------------------------------------------------------
  # Core Properties
  # -------------------------------------------------------------------------
  # NIF predicates attached to spans. Offsets are 0-based with an exclusive
  # end, so endIndex - beginIndex equals the length of the anchored text.
  core_properties:
    - property: "nif:beginIndex"
      uri: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#beginIndex"
      range: "xsd:nonNegativeInteger"
      description: "Character offset where entity span begins (0-based)"
      required: true
    - property: "nif:endIndex"
      uri: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#endIndex"
      range: "xsd:nonNegativeInteger"
      description: "Character offset where entity span ends (exclusive)"
      required: true
    - property: "nif:anchorOf"
      uri: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#anchorOf"
      range: "xsd:string"
      description: "The exact text string of the entity mention"
      required: true
    - property: "nif:referenceContext"
      uri: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#referenceContext"
      range: "nif:Context"
      description: "Link to the document context containing this phrase"
      required: true
    - property: "nif:sourceUrl"
      uri: "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#sourceUrl"
      range: "xsd:anyURI"
      description: "Original URL of the source document"
      required: false
# ---------------------------------------------------------------------------
# NERD Class Mappings
# ---------------------------------------------------------------------------
nerd_class_mappings:
  description: |
    NERD (Named Entity Recognition and Disambiguation) defines 10 core
    entity classes that map across multiple NER systems (DBpedia Spotlight,
    AlchemyAPI, OpenCalais, Zemanta, etc.). GLAM-NER types map to NERD for
    cross-system interoperability.
    NOTE: NERD classes are intentionally broad. GLAM-NER types provide
    finer granularity. Always output BOTH for maximum utility.

  # One entry per NERD core class. Each `glam_ner_types` list is the inverse
  # view of the `glam_ner_to_nerd_mapping` table in this file: it names every
  # GLAM-NER code that table assigns to the class, so a lookup in either
  # direction gives the same answer. Keep the two in sync when adding types.
  core_classes:
    - nerd_class: "nerd:Thing"
      nerd_uri: "http://nerd.eurecom.fr/ontology#Thing"
      description: "Generic entity (base class)"
      glam_ner_types:
        - "THG"
        - "THG.COL"
        - "THG.CON"
        - "AGT.AI"
        - "APP"
        - "APP.NAM"
      note: "Fallback for entities not matching other NERD classes"

    - nerd_class: "nerd:Person"
      nerd_uri: "http://nerd.eurecom.fr/ontology#Person"
      description: "Human beings"
      glam_ner_types:
        - "AGT"
        - "AGT.PER"
        - "AGT.STF"
        - "AGT.MYT"
      subclasses:
        - "nerd:Astronaut"
        - "nerd:Politician"
        - "nerd:Artist"
        - "nerd:Athlete"
        - "nerd:Actor"

    - nerd_class: "nerd:Organization"
      nerd_uri: "http://nerd.eurecom.fr/ontology#Organization"
      description: "Organizations, companies, institutions"
      glam_ner_types:
        - "GRP"
        - "GRP.HER"
        - "GRP.COM"
        - "GRP.GOV"
        - "GRP.EDU"
        - "GRP.REL"
        - "GRP.UNT"
        - "GRP.INF"
      subclasses:
        - "nerd:Company"
        - "nerd:SportsTeam"
        - "nerd:Band"
        - "nerd:University"
        - "nerd:Museum"

    - nerd_class: "nerd:Location"
      nerd_uri: "http://nerd.eurecom.fr/ontology#Location"
      description: "Geographic places and features"
      glam_ner_types:
        - "TOP"
        - "TOP.ADM"
        - "TOP.PPL"
        - "TOP.BLD"
        - "TOP.FAC"
        - "TOP.NAT"
        - "TOP.HIS"
        - "GEO"
        - "GEO.PNT"
        - "GEO.BOX"
      subclasses:
        - "nerd:City"
        - "nerd:Country"
        - "nerd:Continent"
        - "nerd:Region"
        - "nerd:Facility"

    - nerd_class: "nerd:Event"
      nerd_uri: "http://nerd.eurecom.fr/ontology#Event"
      description: "Named events"
      glam_ner_types:
        - "THG.EVT"
      subclasses:
        - "nerd:SportEvent"
        - "nerd:MusicFestival"
        - "nerd:Election"

    - nerd_class: "nerd:Time"
      nerd_uri: "http://nerd.eurecom.fr/ontology#Time"
      description: "Temporal expressions"
      glam_ner_types:
        - "TMP"
        - "TMP.DAT"
        - "TMP.TIM"
        - "TMP.DUR"
        - "TMP.SET"

    - nerd_class: "nerd:Amount"
      nerd_uri: "http://nerd.eurecom.fr/ontology#Amount"
      description: "Quantities and measurements"
      glam_ner_types:
        - "QTY"
        - "QTY.CNT"
        - "QTY.MSR"
        - "QTY.MON"
        - "QTY.PCT"
        - "QTY.ORD"

    - nerd_class: "nerd:Product"
      nerd_uri: "http://nerd.eurecom.fr/ontology#Product"
      description: "Products and creative works"
      glam_ner_types:
        - "THG.OBJ"
        - "WRK"
        - "WRK.WRK"
        - "WRK.EXP"
        - "WRK.MAN"
        - "WRK.ITM"
        - "APP.TIT"
      subclasses:
        - "nerd:Album"
        - "nerd:Book"
        - "nerd:Movie"
        - "nerd:Software"

    - nerd_class: "nerd:Animal"
      nerd_uri: "http://nerd.eurecom.fr/ontology#Animal"
      description: "Animals"
      glam_ner_types:
        - "AGT.ANI"

    - nerd_class: "nerd:Function"
      nerd_uri: "http://nerd.eurecom.fr/ontology#Function"
      description: "Roles, titles, occupations"
      glam_ner_types:
        - "ROL"
        - "ROL.OCC"
        - "ROL.TIT"
        - "ROL.HON"
      note: "GLAM-NER treats roles as separate entities; NERD uses as attribute"
# ---------------------------------------------------------------------------
# W3C Web Annotation Patterns
# ---------------------------------------------------------------------------
web_annotation_patterns:
  description: |
    W3C Web Annotation Data Model provides standard annotation structure
    with target selectors for precise text span identification.
  annotation_class: "oa:Annotation"
  annotation_uri: "http://www.w3.org/ns/oa#Annotation"

  # Predicates carried by every oa:Annotation produced from a GLAM-NER entity.
  # Provenance uses dcterms:creator / dcterms:created, as in the W3C Web
  # Annotation Data Model. oa:annotatedBy and oa:annotatedAt belonged to the
  # earlier Open Annotation community draft and are not part of that model.
  structure:
    - property: "oa:hasBody"
      uri: "http://www.w3.org/ns/oa#hasBody"
      description: "The annotation content (entity type, confidence)"
      example: "The GLAM-NER entity classification"
    - property: "oa:hasTarget"
      uri: "http://www.w3.org/ns/oa#hasTarget"
      description: "What is being annotated (text span)"
      example: "TextPositionSelector pointing to entity mention"
    - property: "oa:motivatedBy"
      uri: "http://www.w3.org/ns/oa#motivatedBy"
      description: "Why the annotation was created"
      value: "oa:classifying"
      note: "NER annotations are classification activities"
    - property: "dcterms:creator"
      uri: "http://purl.org/dc/terms/creator"
      description: "Agent that created the annotation"
      example: "Human curator, ML model, or pipeline"
      note: "Supersedes oa:annotatedBy from the pre-W3C Open Annotation draft"
    - property: "dcterms:created"
      uri: "http://purl.org/dc/terms/created"
      description: "When the annotation was created"
      range: "xsd:dateTime"
      note: "Supersedes oa:annotatedAt from the pre-W3C Open Annotation draft"

  # Selector types for pointing an annotation target at a span or DOM node.
  target_selectors:
    text_position:
      class: "oa:TextPositionSelector"
      uri: "http://www.w3.org/ns/oa#TextPositionSelector"
      properties:
        - property: "oa:start"
          description: "Start offset (0-based)"
        - property: "oa:end"
          description: "End offset (exclusive)"
      note: "Equivalent to NIF offset-based addressing"
    text_quote:
      class: "oa:TextQuoteSelector"
      uri: "http://www.w3.org/ns/oa#TextQuoteSelector"
      properties:
        - property: "oa:exact"
          description: "The exact matched text"
        - property: "oa:prefix"
          description: "Context before (for disambiguation)"
        - property: "oa:suffix"
          description: "Context after (for disambiguation)"
      note: "Provides context for robust text matching"
    xpath:
      class: "oa:XPathSelector"
      uri: "http://www.w3.org/ns/oa#XPathSelector"
      properties:
        - property: "rdf:value"
          description: "XPath expression to DOM node"
      note: "For HTML/XML sources with DOM structure"
    css:
      class: "oa:CssSelector"
      uri: "http://www.w3.org/ns/oa#CssSelector"
      properties:
        - property: "rdf:value"
          description: "CSS selector to DOM node"
      note: "Alternative to XPath for HTML sources"
# ---------------------------------------------------------------------------
# ITS 2.0 Entity Linking
# ---------------------------------------------------------------------------
itsrdf_entity_linking:
  description: |
    ITS 2.0 (Internationalization Tag Set) provides entity linking predicates
    for connecting mentions to knowledge bases.

  # Predicates describing the link from a mention to a knowledge-base entry.
  # Only taIdentRef is flagged as required for a link to exist.
  properties:
    - property: "itsrdf:taIdentRef"
      uri: "http://www.w3.org/2005/11/its/rdf#taIdentRef"
      description: "URI reference to entity in knowledge base"
      example: "http://www.wikidata.org/entity/Q190804"
      note: "Primary entity linking predicate"
      required_for_linking: true

    - property: "itsrdf:taSource"
      uri: "http://www.w3.org/2005/11/its/rdf#taSource"
      description: "Knowledge base source identifier"
      examples:
        - "Wikidata"
        - "DBpedia"
        - "GeoNames"
        - "VIAF"
        - "Getty AAT"
        - "Getty ULAN"
        - "Getty TGN"
      note: "Human-readable source name"

    # Scores the link itself; the detection score is a separate value.
    - property: "itsrdf:taConfidence"
      uri: "http://www.w3.org/2005/11/its/rdf#taConfidence"
      description: "Linking confidence score (0.0-1.0)"
      range: "xsd:double"
      note: "Different from entity detection confidence"

    - property: "itsrdf:taClassRef"
      uri: "http://www.w3.org/2005/11/its/rdf#taClassRef"
      description: "URI of entity type in target ontology"
      example: "http://dbpedia.org/ontology/Museum"
      note: "Type in linked KB, may differ from GLAM-NER type"
# ---------------------------------------------------------------------------
# Complete GLAM-NER to NERD Mapping Table
# ---------------------------------------------------------------------------
glam_ner_to_nerd_mapping:
  description: "Complete mapping table from GLAM-NER v1.7.0 types to NERD classes"

  # One row per GLAM-NER type code, grouped by hypernym. Every row carries
  # exactly one `nerd_class`; `nerd_subclasses` and `note` are optional.
  mappings:
    # --- AGT: agents -------------------------------------------------------
    - glam_type: "AGT"
      glam_code: "AGT"
      nerd_class: "nerd:Person"
      note: "Generic agent defaults to Person"
    - glam_type: "AGT.PER"
      glam_code: "AGT.PER"
      nerd_class: "nerd:Person"
      nerd_subclasses:
        - "nerd:Artist"
        - "nerd:Politician"
        - "nerd:Astronaut"
        - "nerd:Athlete"
    - glam_type: "AGT.STF"
      glam_code: "AGT.STF"
      nerd_class: "nerd:Person"
      note: "Staff roles map to nerd:Function as secondary annotation"
    - glam_type: "AGT.ANI"
      glam_code: "AGT.ANI"
      nerd_class: "nerd:Animal"
    - glam_type: "AGT.MYT"
      glam_code: "AGT.MYT"
      nerd_class: "nerd:Person"
      note: "Mythological/fictional figures treated as Person in NERD"
    - glam_type: "AGT.AI"
      glam_code: "AGT.AI"
      nerd_class: "nerd:Thing"
      note: "AI agents have no NERD equivalent - map to Thing"

    # --- GRP: groups -------------------------------------------------------
    - glam_type: "GRP"
      glam_code: "GRP"
      nerd_class: "nerd:Organization"
    - glam_type: "GRP.HER"
      glam_code: "GRP.HER"
      nerd_class: "nerd:Organization"
      nerd_subclasses:
        - "nerd:Museum"
    - glam_type: "GRP.COM"
      glam_code: "GRP.COM"
      nerd_class: "nerd:Organization"
      nerd_subclasses:
        - "nerd:Company"
    - glam_type: "GRP.GOV"
      glam_code: "GRP.GOV"
      nerd_class: "nerd:Organization"
    - glam_type: "GRP.EDU"
      glam_code: "GRP.EDU"
      nerd_class: "nerd:Organization"
      nerd_subclasses:
        - "nerd:University"
    - glam_type: "GRP.REL"
      glam_code: "GRP.REL"
      nerd_class: "nerd:Organization"
    - glam_type: "GRP.UNT"
      glam_code: "GRP.UNT"
      nerd_class: "nerd:Organization"
      note: "Organizational units are Organizations in NERD"
    - glam_type: "GRP.INF"
      glam_code: "GRP.INF"
      nerd_class: "nerd:Organization"
      note: "Informal groups still map to Organization"

    # --- TOP: toponyms -----------------------------------------------------
    - glam_type: "TOP"
      glam_code: "TOP"
      nerd_class: "nerd:Location"
    - glam_type: "TOP.ADM"
      glam_code: "TOP.ADM"
      nerd_class: "nerd:Location"
      nerd_subclasses:
        - "nerd:Country"
        - "nerd:Region"
    - glam_type: "TOP.PPL"
      glam_code: "TOP.PPL"
      nerd_class: "nerd:Location"
      nerd_subclasses:
        - "nerd:City"
    - glam_type: "TOP.BLD"
      glam_code: "TOP.BLD"
      nerd_class: "nerd:Location"
      nerd_subclasses:
        - "nerd:Facility"
    - glam_type: "TOP.FAC"
      glam_code: "TOP.FAC"
      nerd_class: "nerd:Location"
      nerd_subclasses:
        - "nerd:Facility"
    - glam_type: "TOP.NAT"
      glam_code: "TOP.NAT"
      nerd_class: "nerd:Location"
    - glam_type: "TOP.HIS"
      glam_code: "TOP.HIS"
      nerd_class: "nerd:Location"
      note: "Historical place names"

    # --- GEO: geometries ---------------------------------------------------
    - glam_type: "GEO"
      glam_code: "GEO"
      nerd_class: "nerd:Location"
      note: "Coordinates map to Location"
    - glam_type: "GEO.PNT"
      glam_code: "GEO.PNT"
      nerd_class: "nerd:Location"
    - glam_type: "GEO.BOX"
      glam_code: "GEO.BOX"
      nerd_class: "nerd:Location"

    # --- TMP: temporal expressions -----------------------------------------
    - glam_type: "TMP"
      glam_code: "TMP"
      nerd_class: "nerd:Time"
    - glam_type: "TMP.DAT"
      glam_code: "TMP.DAT"
      nerd_class: "nerd:Time"
    - glam_type: "TMP.TIM"
      glam_code: "TMP.TIM"
      nerd_class: "nerd:Time"
    - glam_type: "TMP.DUR"
      glam_code: "TMP.DUR"
      nerd_class: "nerd:Time"
    - glam_type: "TMP.SET"
      glam_code: "TMP.SET"
      nerd_class: "nerd:Time"

    # --- QTY: quantities ---------------------------------------------------
    - glam_type: "QTY"
      glam_code: "QTY"
      nerd_class: "nerd:Amount"
    - glam_type: "QTY.CNT"
      glam_code: "QTY.CNT"
      nerd_class: "nerd:Amount"
    - glam_type: "QTY.MSR"
      glam_code: "QTY.MSR"
      nerd_class: "nerd:Amount"
    - glam_type: "QTY.MON"
      glam_code: "QTY.MON"
      nerd_class: "nerd:Amount"
    - glam_type: "QTY.PCT"
      glam_code: "QTY.PCT"
      nerd_class: "nerd:Amount"
    - glam_type: "QTY.ORD"
      glam_code: "QTY.ORD"
      nerd_class: "nerd:Amount"

    # --- WRK: works (FRBR levels) ------------------------------------------
    - glam_type: "WRK"
      glam_code: "WRK"
      nerd_class: "nerd:Product"
    - glam_type: "WRK.WRK"
      glam_code: "WRK.WRK"
      nerd_class: "nerd:Product"
      note: "FRBR Work level"
    - glam_type: "WRK.EXP"
      glam_code: "WRK.EXP"
      nerd_class: "nerd:Product"
      note: "FRBR Expression level"
    - glam_type: "WRK.MAN"
      glam_code: "WRK.MAN"
      nerd_class: "nerd:Product"
      nerd_subclasses:
        - "nerd:Book"
        - "nerd:Album"
        - "nerd:Movie"
      note: "FRBR Manifestation level"
    - glam_type: "WRK.ITM"
      glam_code: "WRK.ITM"
      nerd_class: "nerd:Product"
      note: "FRBR Item level"

    # --- THG: things -------------------------------------------------------
    - glam_type: "THG"
      glam_code: "THG"
      nerd_class: "nerd:Thing"
    - glam_type: "THG.OBJ"
      glam_code: "THG.OBJ"
      nerd_class: "nerd:Product"
    - glam_type: "THG.COL"
      glam_code: "THG.COL"
      nerd_class: "nerd:Thing"
      note: "Collections map to generic Thing (no NERD equivalent)"
    - glam_type: "THG.EVT"
      glam_code: "THG.EVT"
      nerd_class: "nerd:Event"
      nerd_subclasses:
        - "nerd:SportEvent"
        - "nerd:MusicFestival"
    - glam_type: "THG.CON"
      glam_code: "THG.CON"
      nerd_class: "nerd:Thing"
      note: "Abstract concepts"

    # --- APP: appellations -------------------------------------------------
    - glam_type: "APP"
      glam_code: "APP"
      nerd_class: "nerd:Thing"
      note: "Appellations have no direct NERD mapping"
    - glam_type: "APP.NAM"
      glam_code: "APP.NAM"
      nerd_class: "nerd:Thing"
    - glam_type: "APP.TIT"
      glam_code: "APP.TIT"
      nerd_class: "nerd:Product"
      note: "Work titles map to Product"

    # --- ROL: roles --------------------------------------------------------
    - glam_type: "ROL"
      glam_code: "ROL"
      nerd_class: "nerd:Function"
    - glam_type: "ROL.OCC"
      glam_code: "ROL.OCC"
      nerd_class: "nerd:Function"
    - glam_type: "ROL.TIT"
      glam_code: "ROL.TIT"
      nerd_class: "nerd:Function"
    - glam_type: "ROL.HON"
      glam_code: "ROL.HON"
      nerd_class: "nerd:Function"
# ---------------------------------------------------------------------------
# Example NIF Annotation
# ---------------------------------------------------------------------------
example_nif_annotation:
  description: "Complete example of GLAM-NER annotation in NIF/OA format"
  source_text: "The Rijksmuseum in Amsterdam holds over one million objects."

  # Worked example for `source_text` (60 characters). Offsets are 0-based with
  # an exclusive end, so each span's end - begin equals the anchor length:
  #   context        [0, 60)
  #   "Rijksmuseum"  [4, 15)
  #   "Amsterdam"    [19, 28)
  #   "one million"  [40, 51)
  # Both linked entities carry all three itsrdf predicates listed under
  # output_guidelines.when_linked; the confidence values are illustrative.
  turtle_example: |
    @prefix nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#> .
    @prefix nerd: <http://nerd.eurecom.fr/ontology#> .
    @prefix oa: <http://www.w3.org/ns/oa#> .
    @prefix itsrdf: <http://www.w3.org/2005/11/its/rdf#> .
    @prefix glam: <https://w3id.org/glam-ner/> .
    @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

    # Document context
    <https://example.org/doc1#offset_0_60> a nif:Context, nif:OffsetBasedString ;
        nif:isString "The Rijksmuseum in Amsterdam holds over one million objects." ;
        nif:sourceUrl <https://example.org/doc1> ;
        nif:beginIndex "0"^^xsd:nonNegativeInteger ;
        nif:endIndex "60"^^xsd:nonNegativeInteger .

    # Entity 1: Rijksmuseum (GRP.HER)
    <https://example.org/doc1#offset_4_15> a nif:Phrase, nif:OffsetBasedString ;
        nif:anchorOf "Rijksmuseum" ;
        nif:beginIndex "4"^^xsd:nonNegativeInteger ;
        nif:endIndex "15"^^xsd:nonNegativeInteger ;
        nif:referenceContext <https://example.org/doc1#offset_0_60> ;
        a nerd:Organization, nerd:Museum ;
        glam:entityType "GRP.HER" ;
        glam:entityLabel "GROUP.HERITAGE" ;
        glam:confidence "0.95"^^xsd:double ;
        itsrdf:taIdentRef <http://www.wikidata.org/entity/Q190804> ;
        itsrdf:taSource "Wikidata" ;
        itsrdf:taConfidence "0.92"^^xsd:double .

    # Entity 2: Amsterdam (TOP.PPL)
    <https://example.org/doc1#offset_19_28> a nif:Phrase, nif:OffsetBasedString ;
        nif:anchorOf "Amsterdam" ;
        nif:beginIndex "19"^^xsd:nonNegativeInteger ;
        nif:endIndex "28"^^xsd:nonNegativeInteger ;
        nif:referenceContext <https://example.org/doc1#offset_0_60> ;
        a nerd:Location, nerd:City ;
        glam:entityType "TOP.PPL" ;
        glam:entityLabel "TOPONYM.POPULATED_PLACE" ;
        glam:confidence "0.98"^^xsd:double ;
        itsrdf:taIdentRef <http://www.wikidata.org/entity/Q727> ;
        itsrdf:taSource "Wikidata" ;
        itsrdf:taConfidence "0.97"^^xsd:double .

    # Entity 3: one million (QTY.CNT)
    <https://example.org/doc1#offset_40_51> a nif:Phrase, nif:OffsetBasedString ;
        nif:anchorOf "one million" ;
        nif:beginIndex "40"^^xsd:nonNegativeInteger ;
        nif:endIndex "51"^^xsd:nonNegativeInteger ;
        nif:referenceContext <https://example.org/doc1#offset_0_60> ;
        a nerd:Amount ;
        glam:entityType "QTY.CNT" ;
        glam:entityLabel "QUANTITY.COUNT" ;
        glam:normalizedValue "1000000"^^xsd:integer ;
        glam:confidence "0.90"^^xsd:double .
# ---------------------------------------------------------------------------
# Output Format Guidelines
# ---------------------------------------------------------------------------
output_guidelines:
  description: "How to serialize GLAM-NER annotations for interoperability"

  # Fields emitted for every annotation, linked or not.
  always_include:
    - "GLAM-NER type code (glam:entityType)"
    - "GLAM-NER type label (glam:entityLabel)"
    - "NERD class mapping (rdf:type nerd:*)"
    - "NIF offsets (nif:beginIndex, nif:endIndex)"
    - "Anchor text (nif:anchorOf)"
    - "Confidence score (glam:confidence)"

  # Additional fields emitted once a mention is linked to a knowledge base.
  when_linked:
    - "itsrdf:taIdentRef - Entity URI"
    - "itsrdf:taSource - Knowledge base name"
    - "itsrdf:taConfidence - Linking confidence"

  # Supported RDF serializations; two of the three are marked preferred.
  serialization_formats:
    - format: "Turtle"
      extension: ".ttl"
      mime_type: "text/turtle"
      preferred: true
    - format: "JSON-LD"
      extension: ".jsonld"
      mime_type: "application/ld+json"
      preferred: true
      note: "Use @context for namespace prefixes"
    - format: "N-Triples"
      extension: ".nt"
      mime_type: "application/n-triples"
      preferred: false
      note: "For streaming/bulk processing"
# =============================================================================
# END OF MODULE
# =============================================================================