glam/data/entity_annotation/modules/integrations/pico/schema/observation.yaml
kempersc 505c12601a Add test script for PiCo extraction from Arabic waqf documents
- Implemented a new script `test_pico_arabic_waqf.py` to test the GLM annotator's ability to extract person observations from Arabic historical documents.
- The script includes environment variable handling for API token, structured prompts for the GLM API, and validation of extraction results.
- Added comprehensive logging for API responses, extraction results, and validation errors.
- Included a sample Arabic waqf text for testing purposes, following the PiCo ontology pattern.
2025-12-12 17:50:17 +01:00

439 lines
14 KiB
YAML

# =============================================================================
# PiCo Integration Module: Observation Pattern
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/
# Parent: _index.yaml
#
# Description: Core PiCo observation pattern and PersonObservation class.
# Defines the source-bound observation layer that captures
# person mentions exactly as they appear in sources.
#
# Last Updated: 2025-01-13
# =============================================================================
# -----------------------------------------------------------------------------
# Core Observation Pattern
# -----------------------------------------------------------------------------
# Schema for a single source-bound person mention. Each entry under
# `properties` gives a property CURIE, its expected range, a cardinality
# string, and a short usage note.
observation_pattern:
  description: "Every person mention creates a PersonObservation"
  class: "picom:PersonObservation"
  # Uses the same PiCo model namespace as the other PiCo class/property URIs
  # in this file (https://personsincontext.org/model#).
  # NOTE(review): confirm the picom:/pico: prefix bindings in _index.yaml.
  class_uri: "https://personsincontext.org/model#PersonObservation"
  properties:
    # Required: the name exactly as transcribed from the source.
    - property: "picom:hasObservedName"
      description: "The name string as it appears in text"
      range: "pnv:PersonName"
      cardinality: "1"
      note: "Exact transcription of name from source"
    # Optional: link to the reconstructed person, once identified.
    - property: "picom:isObservationOf"
      description: "Links to reconstructed Person entity"
      range: "crm:E21_Person"
      cardinality: "0..1"
      note: "May be null if person not yet identified"
    # Required: provenance of the observation.
    - property: "prov:hadPrimarySource"
      description: "The source document/webpage"
      range: "prov:Entity"
      cardinality: "1"
      note: "Required for provenance tracking"
    # Required: when the extraction happened (not the document's own date).
    - property: "picom:observedAt"
      description: "When the observation was made"
      range: "xsd:dateTime"
      cardinality: "1"
      note: "Extraction timestamp, not document date"
    - property: "picom:observedInContext"
      description: "Surrounding text context"
      range: "xsd:string"
      cardinality: "0..1"
      note: "For disambiguation when reviewing"
    # Repeatable: a person may be observed with several roles.
    - property: "picom:hasRole"
      description: "Role/position observed with the person"
      range: "xsd:string"
      cardinality: "0..*"
      note: "Links to ROLE hypernym when extracted"
# -----------------------------------------------------------------------------
# Person Reconstruction Pattern
# -----------------------------------------------------------------------------
# Schema for the interpretation layer: a PersonReconstruction aggregates one
# or more PersonObservations. Properties are split into `required_properties`
# (provenance links) and `optional_properties` (normalized biographical data),
# followed by a worked Turtle example.
person_reconstruction_pattern:
  description: |
    A PersonReconstruction is created by linking one or more PersonObservations
    to form a unified person entity. This is the scholarly interpretation layer
    that connects source-bound observations to a conceptual person.
    Key distinction:
    - PersonObservation: What is OBSERVED in a specific source (exact transcription)
    - PersonReconstruction: What is INFERRED about the person (normalized, linked)
    A single PersonReconstruction may derive from observations across:
    - Multiple sources (birth record + marriage record + death record)
    - Different time periods (mentions across decades)
    - Various name forms ("Jan Jansz" + "Johannes Jansen" + "J. Jansen")
  class: "pico:PersonReconstruction"
  class_uri: "https://personsincontext.org/model#PersonReconstruction"
  superclass: "pico:Person"
  required_properties:
    - property: "prov:wasDerivedFrom"
      description: "Links to source PersonObservation(s)"
      range: "pico:PersonObservation"
      cardinality: "1..*"
      note: "Every reconstruction MUST link to at least one observation"
    - property: "prov:wasGeneratedBy"
      description: "Links to the reconstruction Activity"
      range: "prov:Activity"
      cardinality: "1"
      note: "Documents how/when/by whom reconstruction was created"
  optional_properties:
    - property: "prov:wasRevisionOf"
      description: "Links to previous version of this reconstruction"
      range: "pico:PersonReconstruction"
      cardinality: "0..1"
      note: "For tracking updates to reconstructions over time"
    - property: "sdo:name"
      description: "Normalized/preferred name form"
      range: "xsd:string"
      note: "The canonical name for this person"
    - property: "sdo:additionalName"
      description: "Structured name following PNV"
      range: "pnv:PersonName"
      note: "Full name breakdown using Person Name Vocabulary"
    - property: "sdo:givenName"
      description: "Given/first name"
      range: "xsd:string"
    - property: "sdo:familyName"
      description: "Family/surname"
      range: "xsd:string"
    - property: "sdo:gender"
      description: "Gender of the person"
      range: "sdo:GenderType"
      values: ["sdo:Male", "sdo:Female"]
    # Year-only or year-month values cannot be typed xsd:date; the Turtle
    # example below types a year-only death date as xsd:gYear.
    - property: "sdo:birthDate"
      description: "Birth date (ISO 8601)"
      range: "xsd:date"
      note: "May be incomplete: YYYY, YYYY-MM, or YYYY-MM-DD"
    - property: "sdo:birthPlace"
      description: "Place of birth"
      range: "xsd:string or xsd:anyURI"
      note: "Prefer linking to GeoNames or Wikidata"
    # Same partial-date allowance as sdo:birthDate; the example below uses a
    # year-only death date.
    - property: "sdo:deathDate"
      description: "Death date (ISO 8601)"
      range: "xsd:date"
      note: "May be incomplete: YYYY, YYYY-MM, or YYYY-MM-DD"
    - property: "sdo:deathPlace"
      description: "Place of death"
      range: "xsd:string or xsd:anyURI"
  example:
    description: "PersonReconstruction derived from multiple observations"
    # Continuation lines are indented for readability only; the statements
    # and their order are unchanged.
    turtle: |
      cbg:person_reconstruction_anna_koppen
          a pico:PersonReconstruction ;
          sdo:name "Anna Maria Koppen" ;
          sdo:familyName "Koppen" ;
          sdo:givenName "Anna Maria" ;
          sdo:gender sdo:Female ;
          sdo:birthPlace "Haarlem" ;
          sdo:birthDate "1860-03-31"^^xsd:date ;
          sdo:deathPlace "Detroit, USA" ;
          sdo:deathDate "1926"^^xsd:gYear ;
          prov:wasDerivedFrom nha:marriage_1885_po_1 ,
                              cbg:emigration_1887_po_1 ,
                              us:death_1926_po_1 ;
          prov:wasGeneratedBy cbg:reconstruction_activity_01 .
# -----------------------------------------------------------------------------
# Source and Scan Classes
# -----------------------------------------------------------------------------
# Minimal descriptions of the source document (`archive_component`) and its
# scans (`image_object`). Both are declared as subclasses of sdo:CreativeWork.
source_classes:
  archive_component:
    description: |
      A Source document from which PersonObservations are extracted.
      PiCo does not aim to fully describe archival sources (use RiC-O or DC for that),
      but requires minimal identification for provenance tracking.
    class: "sdo:ArchiveComponent"
    class_uri: "https://schema.org/ArchiveComponent"
    superclass: "sdo:CreativeWork"
    properties:
      # The only property here with cardinality "1".
      - property: "sdo:name"
        description: "Identifying name for the source"
        range: "xsd:string"
        cardinality: "1"
        note: "Combine title, date, archive location for identification"
        example: "BS Marriage Haarlem, November 11, 1885, certificate number 321"
      - property: "sdo:additionalType"
        description: "Type of source document"
        range: "picot_sourcetypes:Concept"
        note: "Use PiCo SourceType thesaurus"
      - property: "sdo:dateCreated"
        description: "Date the source was created"
        range: "xsd:date"
      - property: "sdo:holdingArchive"
        description: "Institution holding the source"
        range: "xsd:anyURI"
        note: "Link to heritage custodian (GHCID or Wikidata)"
      - property: "sdo:url"
        description: "Permalink to the source"
        range: "sdo:URL"
        note: "Preferably a persistent identifier"
      - property: "sdo:contentLocation"
        description: "Geographic coverage of the source"
        range: "xsd:string or xsd:anyURI"
      # Connects a source to its scan(s), described by `image_object` below.
      - property: "sdo:associatedMedia"
        description: "Link to scan(s) of the source"
        range: "sdo:ImageObject"
        cardinality: "0..*"
  image_object:
    description: |
      A Scan of a source document. Links to the digital image at the holding archive.
    class: "sdo:ImageObject"
    class_uri: "https://schema.org/ImageObject"
    superclass: "sdo:CreativeWork"
    properties:
      - property: "sdo:url"
        description: "URL to the full scan"
        range: "sdo:URL"
        note: "Preferably IIIF manifest"
      # NOTE(review): the description says "URL" but the declared range is
      # sdo:ImageObject — confirm which one consumers expect.
      - property: "sdo:thumbnail"
        description: "URL to thumbnail image"
        range: "sdo:ImageObject"
      - property: "sdo:embedUrl"
        description: "URL to image viewer"
        range: "sdo:URL"
      - property: "sdo:position"
        description: "Position in sequence of scans"
        range: "xsd:int"
        note: "For multi-page sources"
# -----------------------------------------------------------------------------
# Biographical Properties
# -----------------------------------------------------------------------------
# Catalogue of biographical properties. Each child key (age, religion, ...)
# is a mapping with the property CURIE, its full URI, a description, the
# expected range and domain, and optional notes, examples, or allowed values.
biographical_properties:
  description: |
    Biographical properties capture personal details as they appear in sources.
    These are used for both PersonObservation (source-bound) and
    PersonReconstruction (normalized).
  age:
    property: "pico:hasAge"
    property_uri: "https://personsincontext.org/model#hasAge"
    description: "Age of person as stated in source"
    # A string rather than a number, so that units and qualifiers such as
    # "4 months" or "about 25" can be kept (see examples).
    range: "xsd:string"
    domain: "pico:PersonObservation"
    note: |
      Used when birth date unknown but age is recorded.
      Age assumed in years unless specified ("4" = 4 years, "4 months" = 4 months).
      Numerical preferred over text ("4" not "four").
    examples:
      - "30"
      - "4 months"
      - "about 25"
  religion:
    property: "pico:hasReligion"
    property_uri: "https://personsincontext.org/model#hasReligion"
    description: "Religious affiliation as stated in source"
    range: "xsd:string or xsd:anyURI"
    domain: "pico:Person"
    note: "Can link to SKOS thesaurus for religions"
    examples:
      - "Catholic"
      - "Reformed"
      - "Jewish"
  deceased:
    property: "pico:deceased"
    property_uri: "https://personsincontext.org/model#deceased"
    description: "Indication that person is deceased (when death date unknown)"
    range: "xsd:boolean"
    domain: "pico:PersonObservation"
    note: |
      Only used when deathDate is unknown but death is indicated.
      A person without deathDate and without deceased:true is assumed alive.
      Important for privacy considerations in publishing person records.
  gender:
    property: "sdo:gender"
    property_uri: "https://schema.org/gender"
    description: "Gender of the person"
    range: "sdo:GenderType"
    domain: "pico:Person"
    # Allowed values, each given as a CURIE plus a display label.
    values:
      - uri: "sdo:Male"
        label: "Male"
      - uri: "sdo:Female"
        label: "Female"
  address:
    property: "sdo:address"
    property_uri: "https://schema.org/address"
    description: "Physical address as mentioned in source"
    range: "xsd:string"
    domain: "pico:PersonObservation"
    note: "Address exactly as recorded in source"
  initials:
    property: "pnv:initials"
    property_uri: "https://w3id.org/pnv#initials"
    description: "Initials of given name(s)"
    range: "xsd:string"
    # Unlike the other entries, this property is declared on the name
    # structure (pnv:PersonName), not on a person or observation class.
    domain: "pnv:PersonName"
    note: "Each initial followed by period (e.g., 'P.R.', 'H.A.F.M.O.')"
    examples:
      - "P.R."
      - "C.Joh."
      - "H.A.F.M.O."
# -----------------------------------------------------------------------------
# Hypernym Mapping (GLAM-NER v1.7.0)
# -----------------------------------------------------------------------------
# Lookup table from PiCo terms to GLAM-NER hypernym codes. In every entry
# shown here `glam_hypernym` and `glam_code` carry the same value.
hypernym_mapping:
  description: "How PiCo concepts map to GLAM-NER v1.7.0 hypernyms"
  mappings:
    # Default mapping for a person observation.
    - pico_class: "picom:PersonObservation"
      glam_hypernym: "AGT.PER"
      glam_code: "AGT.PER"
      note: "Person observations create AGT.PER entities"
    # Same PiCo class, but conditional on an organizational role being present.
    - pico_class: "picom:PersonObservation"
      glam_hypernym: "AGT.STF"
      glam_code: "AGT.STF"
      condition: "When observed with organizational role"
      note: "Staff members with role context"
    - pico_class: "pnv:PersonName"
      glam_hypernym: "APP.NAM"
      glam_code: "APP.NAM"
      note: "Name strings as appellations"
    # picom:hasRole is listed as a property in observation_pattern; it is
    # recorded under the `pico_class` key here like the class entries above.
    - pico_class: "picom:hasRole"
      glam_hypernym: "ROL"
      glam_code: "ROL"
      note: "Extracted roles link to ROL hypernym"
# -----------------------------------------------------------------------------
# Simple Examples
# -----------------------------------------------------------------------------
# Worked examples. Each entry pairs an input `text` with the resulting
# `observation` and the GLAM-NER span annotations for the same text.
# NOTE(review): SOURCE had no indentation; `glam_ner_annotations` is placed
# as a sibling of `observation`, and `linking` as a child of its annotation
# entry — confirm against the consumer's expected shape.
examples:
  # Example 1: title + given name + prefixed surname, with a role.
  - description: "Staff member with title and role"
    text: "Dr. Maria van den Berg, Director"
    observation:
      type: "picom:PersonObservation"
      id: "_:obs1"
      hasObservedName:
        type: "pnv:PersonName"
        literalName: "Dr. Maria van den Berg"
        honorificPrefix: "Dr."
        givenName: "Maria"
        surnamePrefix: "van den"
        baseSurname: "Berg"
      hasRole: "Director"
      hadPrimarySource: "https://example.org/staff-page"
      observedAt: "2025-12-02T10:30:00Z"
    glam_ner_annotations:
      - span: "Dr. Maria van den Berg"
        type: "AGT.STF"
        code: "AGT.STF"
        confidence: 0.95
      - span: "Director"
        type: "ROL.TIT"
        code: "ROL.TIT"
        confidence: 0.98
  # Example 2: an observation already linked to an external identity.
  - description: "Historical artist"
    text: "Rembrandt van Rijn painted this in 1642"
    observation:
      type: "picom:PersonObservation"
      id: "_:obs2"
      hasObservedName:
        type: "pnv:PersonName"
        literalName: "Rembrandt van Rijn"
        givenName: "Rembrandt"
        surnamePrefix: "van"
        baseSurname: "Rijn"
      isObservationOf: "wd:Q5598"  # Wikidata Rembrandt
      hadPrimarySource: "https://example.org/artwork-page"
      observedAt: "2025-12-02T10:35:00Z"
    glam_ner_annotations:
      - span: "Rembrandt van Rijn"
        type: "AGT.PER"
        code: "AGT.PER"
        confidence: 0.99
        # Identifiers are quoted so they stay strings.
        linking:
          wikidata: "Q5598"
          viaf: "64013650"
  # Example 3: a nobility title kept inside the observed name and also
  # annotated separately as an honorific role.
  - description: "Nobility title"
    text: "Count Willem van Loon"
    observation:
      type: "picom:PersonObservation"
      id: "_:obs3"
      hasObservedName:
        type: "pnv:PersonName"
        literalName: "Count Willem van Loon"
        honorificPrefix: "Count"
        givenName: "Willem"
        surnamePrefix: "van"
        baseSurname: "Loon"
      hadPrimarySource: "https://example.org/archive-doc"
      observedAt: "2025-12-02T10:40:00Z"
    glam_ner_annotations:
      - span: "Count Willem van Loon"
        type: "AGT.PER"
        code: "AGT.PER"
        confidence: 0.95
      # No confidence value is recorded for this span.
      - span: "Count"
        type: "ROL.HON"
        code: "ROL.HON"
        note: "Nobility title - honorific role"