# glam/data/entity_annotation/modules/integrations/pico.yaml
# 2025-12-05 15:30:23 +01:00
#
# 373 lines
# 12 KiB
# YAML

# =============================================================================
# GLAM-NER Entity Annotation Convention v1.7.0
# Module: integrations/pico.yaml
# =============================================================================
# PiCo (Persons in Context ontology) integration for person observation modeling.
# Enables tracking provenance of person mentions and linking to formal records.
#
# Key concepts:
# - PersonObservation: A textual mention of a person (source-bound)
# - PersonName (PNV): Structured name components
# - Person (CIDOC-CRM E21): Reconstructed person entity
#
# References:
# - PiCo Ontology: https://w3id.org/pico
# - Person Name Vocabulary (PNV): https://w3id.org/pnv
# - CIDOC-CRM: https://www.cidoc-crm.org/
# =============================================================================
# Root mapping of the PiCo integration module. Sections, in order:
# observation_pattern, pnv_name_structure, dutch_name_patterns,
# hypernym_mapping, examples, provenance_model.
# NOTE(review): the nesting below was reconstructed from a copy whose leading
# whitespace had been stripped. Keys, key order and scalar values are
# unchanged; confirm the hierarchy against the consumer's schema.
pico_integration:
  description: |
    PiCO (Person in Context Ontology) models textual observations of persons
    as distinct from reconstructed person entities. This enables:
    - Tracking provenance of person mentions
    - Handling name variations across sources
    - Linking observations to formal person records
    The observation/reconstruction pattern separates:
    1. What was OBSERVED in text (PersonObservation) - source-bound, exact
    2. What was RECONSTRUCTED as entity (E21_Person) - inferred, normalized
    This is critical for heritage data where the same person may appear with
    different name forms, titles, or spellings across sources.
  # ---------------------------------------------------------------------------
  # Core Observation Pattern
  # ---------------------------------------------------------------------------
  # One entry per property of the observation class: CURIE, human description,
  # value range, cardinality (kept as a quoted string) and a usage note.
  observation_pattern:
    description: "Every person mention creates a PersonObservation"
    class: "picom:PersonObservation"
    class_uri: "https://w3id.org/pico/PersonObservation"
    properties:
      - property: "picom:hasObservedName"
        description: "The name string as it appears in text"
        range: "pnv:PersonName"
        cardinality: "1"
        note: "Exact transcription of name from source"
      - property: "picom:isObservationOf"
        description: "Links to reconstructed Person entity"
        range: "crm:E21_Person"
        cardinality: "0..1"
        note: "May be null if person not yet identified"
      - property: "prov:hadPrimarySource"
        description: "The source document/webpage"
        range: "prov:Entity"
        cardinality: "1"
        note: "Required for provenance tracking"
      - property: "picom:observedAt"
        description: "When the observation was made"
        range: "xsd:dateTime"
        cardinality: "1"
        note: "Extraction timestamp, not document date"
      - property: "picom:observedInContext"
        description: "Surrounding text context"
        range: "xsd:string"
        cardinality: "0..1"
        note: "For disambiguation when reviewing"
      # NOTE(review): the note below says "ROLE", while hypernym_mapping in
      # this file uses the code "ROL" - confirm which spelling is intended.
      - property: "picom:hasRole"
        description: "Role/position observed with the person"
        range: "xsd:string"
        cardinality: "0..*"
        note: "Links to ROLE hypernym when extracted"
  # ---------------------------------------------------------------------------
  # Person Name Vocabulary (PNV)
  # ---------------------------------------------------------------------------
  # One entry per name component: CURIE, description, sample values and a note.
  pnv_name_structure:
    description: |
      Person Name Vocabulary (PNV) provides structured name components.
      This enables proper parsing of complex name structures across cultures.
    class: "pnv:PersonName"
    # NOTE(review): PNV appears to publish its terms under the hash namespace
    # https://w3id.org/pnv# - confirm this slash-style URI is what consumers
    # expect before relying on it for dereferencing.
    class_uri: "https://w3id.org/pnv/PersonName"
    components:
      - property: "pnv:literalName"
        description: "Full name as single string"
        examples:
          - "Dr. Maria van den Berg"
          - "Rembrandt Harmenszoon van Rijn"
          - "Queen Elizabeth II"
        note: "Original string before parsing"
      - property: "pnv:givenName"
        description: "First/given name"
        examples:
          - "Rembrandt"
          - "Maria"
          - "Jan"
          - "Elizabeth"
        note: "Personal name, not surname"
      - property: "pnv:patronym"
        description: "Patronymic name component"
        examples:
          - "Harmenszoon"
          - "Janszoon"
          - "Pietersdochter"
        note: "Common in Dutch, Scandinavian, Slavic names"
      - property: "pnv:surnamePrefix"
        description: "Prefix to surname (tussenvoegsel)"
        examples:
          - "van"
          - "de"
          - "van den"
          - "van der"
          - "op de"
          - "'t"
          - "von"
          - "di"
        note: "Language-specific, affects sorting"
      - property: "pnv:baseSurname"
        description: "Core surname without prefix"
        examples:
          - "Rijn"
          - "Berg"
          - "Velde"
          - "Gogh"
        note: "Primary sorting component in Dutch"
      # NOTE(review): verify this CURIE against the PNV specification; PNV
      # seems to name the leading-title property pnv:prefix - TODO confirm.
      - property: "pnv:honorificPrefix"
        description: "Title or honorific before name"
        examples:
          - "Dr."
          - "Prof."
          - "Prof. dr."
          - "Sir"
          - "Queen"
          - "Mr."
          - "Drs."
          - "Ir."
        note: "May indicate role - link to ROL"
      - property: "pnv:honorificSuffix"
        description: "Title or honorific after name"
        examples:
          - "PhD"
          - "Jr."
          - "III"
          - "MD"
          - "RA"
          - "MSc"
        note: "Credentials and generational markers"
      - property: "pnv:infixTitle"
        description: "Title within name structure"
        examples:
          - "graaf van"
          - "baron de"
          - "duke of"
        note: "Nobility titles embedded in name"
  # ---------------------------------------------------------------------------
  # Dutch Name Conventions (Project-Specific)
  # ---------------------------------------------------------------------------
  dutch_name_patterns:
    description: |
      Special handling for Dutch names with tussenvoegsels (surname prefixes).
      Dutch sorting rules differ from other languages.
    # Surname prefixes treated as tussenvoegsels; multi-word forms are listed
    # as single entries.
    tussenvoegsel_list:
      - "van"
      - "van de"
      - "van den"
      - "van der"
      - "de"
      - "den"
      - "het"
      - "'t"
      - "ter"
      - "ten"
      - "op de"
      - "op den"
      - "in 't"
      - "in de"
    sorting_rule: |
      In Dutch, surnames sort by baseSurname, ignoring tussenvoegsel.
      "Vincent van Gogh" sorts under "G" not "V".
      "Maria van den Berg" sorts under "B" not "V".
    capitalization_rule: |
      Tussenvoegsel lowercase when preceded by given name:
      - "Vincent van Gogh" (not "Vincent Van Gogh")
      - "Van Gogh" (surname alone, capitalized)
      - "de heer Van Gogh" (formal, capitalized)
  # ---------------------------------------------------------------------------
  # Integration with GLAM-NER Hypernyms
  # ---------------------------------------------------------------------------
  # picom:PersonObservation appears twice on purpose: the second entry carries
  # a `condition` that selects AGT.STF instead of AGT.PER.
  hypernym_mapping:
    description: "How PiCo concepts map to GLAM-NER v1.7.0 hypernyms"
    mappings:
      - pico_class: "picom:PersonObservation"
        glam_hypernym: "AGT.PER"
        glam_code: "AGT.PER"
        note: "Person observations create AGT.PER entities"
      - pico_class: "picom:PersonObservation"
        glam_hypernym: "AGT.STF"
        glam_code: "AGT.STF"
        condition: "When observed with organizational role"
        note: "Staff members with role context"
      - pico_class: "pnv:PersonName"
        glam_hypernym: "APP.NAM"
        glam_code: "APP.NAM"
        note: "Name strings as appellations"
      - pico_class: "picom:hasRole"
        glam_hypernym: "ROL"
        glam_code: "ROL"
        note: "Extracted roles link to ROL hypernym"
  # ---------------------------------------------------------------------------
  # Example Annotations
  # ---------------------------------------------------------------------------
  # Each example pairs a source text with the resulting observation and the
  # GLAM-NER span annotations derived from it.
  examples:
    - description: "Staff member with title and role"
      text: "Dr. Maria van den Berg, Director"
      observation:
        type: "picom:PersonObservation"
        id: "_:obs1"
        hasObservedName:
          type: "pnv:PersonName"
          literalName: "Dr. Maria van den Berg"
          honorificPrefix: "Dr."
          givenName: "Maria"
          surnamePrefix: "van den"
          baseSurname: "Berg"
        hasRole: "Director"
        hadPrimarySource: "https://example.org/staff-page"
        observedAt: "2025-12-02T10:30:00Z"
      glam_ner_annotations:
        - span: "Dr. Maria van den Berg"
          type: "AGT.STF"
          code: "AGT.STF"
          confidence: 0.95
        - span: "Director"
          type: "ROL.TIT"
          code: "ROL.TIT"
          confidence: 0.98
    - description: "Historical artist"
      text: "Rembrandt van Rijn painted this in 1642"
      observation:
        type: "picom:PersonObservation"
        id: "_:obs2"
        hasObservedName:
          type: "pnv:PersonName"
          literalName: "Rembrandt van Rijn"
          givenName: "Rembrandt"
          surnamePrefix: "van"
          baseSurname: "Rijn"
        isObservationOf: "wd:Q5598"  # Wikidata Rembrandt
        hadPrimarySource: "https://example.org/artwork-page"
        observedAt: "2025-12-02T10:35:00Z"
      glam_ner_annotations:
        - span: "Rembrandt van Rijn"
          type: "AGT.PER"
          code: "AGT.PER"
          confidence: 0.99
          # NOTE(review): `linking` is placed on the annotation item; the
          # flattened source would also allow it one level up - confirm.
          linking:
            wikidata: "Q5598"
            viaf: "64013650"
    - description: "Nobility title"
      text: "Count Willem van Loon"
      observation:
        type: "picom:PersonObservation"
        id: "_:obs3"
        hasObservedName:
          type: "pnv:PersonName"
          literalName: "Count Willem van Loon"
          honorificPrefix: "Count"
          givenName: "Willem"
          surnamePrefix: "van"
          baseSurname: "Loon"
        hadPrimarySource: "https://example.org/archive-doc"
        observedAt: "2025-12-02T10:40:00Z"
      glam_ner_annotations:
        - span: "Count Willem van Loon"
          type: "AGT.PER"
          code: "AGT.PER"
          confidence: 0.95
        # This annotation carries a note but no confidence value.
        - span: "Count"
          type: "ROL.HON"
          code: "ROL.HON"
          note: "Nobility title - honorific role"
  # ---------------------------------------------------------------------------
  # Provenance Chain
  # ---------------------------------------------------------------------------
  provenance_model:
    description: |
      PiCo observations maintain full provenance chain:
      Observation → Source Document → Extraction Activity → Agent
      This enables:
      - Tracking where each name form was found
      - Attributing extractions to human/ML agents
      - Maintaining audit trail for corrections
    # One entry per link of the chain, in the order given in the description;
    # the inline arrows name what each property points to.
    chain_structure:
      observation:
        class: "picom:PersonObservation"
        properties:
          - "prov:hadPrimarySource"  # → Source document
          - "prov:wasGeneratedBy"  # → Extraction activity
      source:
        class: "prov:Entity"
        properties:
          - "prov:wasAttributedTo"  # → Publisher/author
          - "dct:created"  # → Document date
      activity:
        class: "prov:Activity"
        properties:
          - "prov:wasAssociatedWith"  # → Extraction agent
          - "prov:used"  # → ML model or rules
          - "prov:startedAtTime"  # → Extraction timestamp
      agent:
        class: "prov:Agent"
        examples:
          - "Human curator"
          - "spaCy NER model"
          - "GLAM-NER extraction pipeline"
# =============================================================================
# END OF MODULE
# =============================================================================