- Implemented a new script `test_pico_arabic_waqf.py` to test the GLM annotator's ability to extract person observations from Arabic historical documents. - The script includes environment variable handling for API token, structured prompts for the GLM API, and validation of extraction results. - Added comprehensive logging for API responses, extraction results, and validation errors. - Included a sample Arabic waqf text for testing purposes, following the PiCo ontology pattern.
439 lines
14 KiB
YAML
439 lines
14 KiB
YAML
# =============================================================================
# PiCo Integration Module: Observation Pattern
# =============================================================================
# Part of: data/entity_annotation/modules/integrations/pico/
# Parent: _index.yaml
#
# Description: Core PiCo observation pattern and PersonObservation class.
#              Defines the source-bound observation layer that captures
#              person mentions exactly as they appear in sources.
#
# Last Updated: 2025-01-13
# =============================================================================

# -----------------------------------------------------------------------------
# Core Observation Pattern
# -----------------------------------------------------------------------------

# Source-bound layer: one PersonObservation is recorded for every person
# mention found in a source.
observation_pattern:
  description: "Every person mention creates a PersonObservation"
  class: "picom:PersonObservation"
  # PiCo-M namespace: the same base URI as the pico: class and property URIs
  # used elsewhere in this file.
  # NOTE(review): this section writes the prefix as picom: while other
  # sections write pico: — confirm both resolve to this namespace in
  # _index.yaml.
  class_uri: "https://personsincontext.org/model#PersonObservation"

  # Cardinality notation: "1" = exactly one (required), "0..1" = optional,
  # "0..*" = optional and repeatable.
  properties:
    - property: "picom:hasObservedName"
      description: "The name string as it appears in text"
      range: "pnv:PersonName"
      cardinality: "1"
      note: "Exact transcription of name from source"

    - property: "picom:isObservationOf"
      description: "Links to reconstructed Person entity"
      range: "crm:E21_Person"
      cardinality: "0..1"
      note: "May be null if person not yet identified"

    - property: "prov:hadPrimarySource"
      description: "The source document/webpage"
      range: "prov:Entity"
      cardinality: "1"
      note: "Required for provenance tracking"

    # Timestamp of the extraction run, not of the source document.
    - property: "picom:observedAt"
      description: "When the observation was made"
      range: "xsd:dateTime"
      cardinality: "1"
      note: "Extraction timestamp, not document date"

    - property: "picom:observedInContext"
      description: "Surrounding text context"
      range: "xsd:string"
      cardinality: "0..1"
      note: "For disambiguation when reviewing"

    - property: "picom:hasRole"
      description: "Role/position observed with the person"
      range: "xsd:string"
      cardinality: "0..*"
      note: "Links to ROLE hypernym when extracted"

# -----------------------------------------------------------------------------
# Person Reconstruction Pattern
# -----------------------------------------------------------------------------

# Interpretation layer: a PersonReconstruction unifies one or more
# PersonObservations into a single normalized person entity.
person_reconstruction_pattern:
  description: |
    A PersonReconstruction is created by linking one or more PersonObservations
    to form a unified person entity. This is the scholarly interpretation layer
    that connects source-bound observations to a conceptual person.

    Key distinction:
    - PersonObservation: What is OBSERVED in a specific source (exact transcription)
    - PersonReconstruction: What is INFERRED about the person (normalized, linked)

    A single PersonReconstruction may derive from observations across:
    - Multiple sources (birth record + marriage record + death record)
    - Different time periods (mentions across decades)
    - Various name forms ("Jan Jansz" + "Johannes Jansen" + "J. Jansen")

  class: "pico:PersonReconstruction"
  class_uri: "https://personsincontext.org/model#PersonReconstruction"
  superclass: "pico:Person"

  # Provenance links that every reconstruction must carry.
  required_properties:
    - property: "prov:wasDerivedFrom"
      description: "Links to source PersonObservation(s)"
      range: "pico:PersonObservation"
      cardinality: "1..*"
      note: "Every reconstruction MUST link to at least one observation"

    - property: "prov:wasGeneratedBy"
      description: "Links to the reconstruction Activity"
      range: "prov:Activity"
      cardinality: "1"
      note: "Documents how/when/by whom reconstruction was created"

  # Normalized biographical data; none of these is mandatory.
  optional_properties:
    - property: "prov:wasRevisionOf"
      description: "Links to previous version of this reconstruction"
      range: "pico:PersonReconstruction"
      cardinality: "0..1"
      note: "For tracking updates to reconstructions over time"

    - property: "sdo:name"
      description: "Normalized/preferred name form"
      range: "xsd:string"
      note: "The canonical name for this person"

    - property: "sdo:additionalName"
      description: "Structured name following PNV"
      range: "pnv:PersonName"
      note: "Full name breakdown using Person Name Vocabulary"

    - property: "sdo:givenName"
      description: "Given/first name"
      range: "xsd:string"

    - property: "sdo:familyName"
      description: "Family/surname"
      range: "xsd:string"

    - property: "sdo:gender"
      description: "Gender of the person"
      range: "sdo:GenderType"
      values: ["sdo:Male", "sdo:Female"]

    # The range lists every XSD datatype the value may carry: complete dates
    # are xsd:date, partial dates are xsd:gYearMonth (YYYY-MM) or xsd:gYear
    # (YYYY). A bare year is not a valid xsd:date lexical value.
    - property: "sdo:birthDate"
      description: "Birth date (ISO 8601)"
      range: "xsd:date or xsd:gYearMonth or xsd:gYear"
      note: "May be incomplete: YYYY, YYYY-MM, or YYYY-MM-DD"

    - property: "sdo:birthPlace"
      description: "Place of birth"
      range: "xsd:string or xsd:anyURI"
      note: "Prefer linking to GeoNames or Wikidata"

    # Same partial-date rule as sdo:birthDate; the Turtle example below types
    # a year-only death date as xsd:gYear.
    - property: "sdo:deathDate"
      description: "Death date (ISO 8601)"
      range: "xsd:date or xsd:gYearMonth or xsd:gYear"
      note: "May be incomplete: YYYY, YYYY-MM, or YYYY-MM-DD"

    - property: "sdo:deathPlace"
      description: "Place of death"
      range: "xsd:string or xsd:anyURI"

  example:
    description: "PersonReconstruction derived from multiple observations"
    turtle: |
      cbg:person_reconstruction_anna_koppen
        a pico:PersonReconstruction ;
        sdo:name "Anna Maria Koppen" ;
        sdo:familyName "Koppen" ;
        sdo:givenName "Anna Maria" ;
        sdo:gender sdo:Female ;
        sdo:birthPlace "Haarlem" ;
        sdo:birthDate "1860-03-31"^^xsd:date ;
        sdo:deathPlace "Detroit, USA" ;
        sdo:deathDate "1926"^^xsd:gYear ;
        prov:wasDerivedFrom nha:marriage_1885_po_1 ,
                            cbg:emigration_1887_po_1 ,
                            us:death_1926_po_1 ;
        prov:wasGeneratedBy cbg:reconstruction_activity_01 .

# -----------------------------------------------------------------------------
# Source and Scan Classes
# -----------------------------------------------------------------------------

# Minimal description of the sources (and their scans) that
# PersonObservations are extracted from.
source_classes:

  archive_component:
    description: |
      A Source document from which PersonObservations are extracted.
      PiCo does not aim to fully describe archival sources (use RiC-O or DC for that),
      but requires minimal identification for provenance tracking.

    class: "sdo:ArchiveComponent"
    class_uri: "https://schema.org/ArchiveComponent"
    superclass: "sdo:CreativeWork"

    properties:
      # The only property in this list declared with cardinality "1".
      - property: "sdo:name"
        description: "Identifying name for the source"
        range: "xsd:string"
        cardinality: "1"
        note: "Combine title, date, archive location for identification"
        example: "BS Marriage Haarlem, November 11, 1885, certificate number 321"

      - property: "sdo:additionalType"
        description: "Type of source document"
        range: "picot_sourcetypes:Concept"
        note: "Use PiCo SourceType thesaurus"

      - property: "sdo:dateCreated"
        description: "Date the source was created"
        range: "xsd:date"

      - property: "sdo:holdingArchive"
        description: "Institution holding the source"
        range: "xsd:anyURI"
        note: "Link to heritage custodian (GHCID or Wikidata)"

      - property: "sdo:url"
        description: "Permalink to the source"
        range: "sdo:URL"
        note: "Preferably a persistent identifier"

      - property: "sdo:contentLocation"
        description: "Geographic coverage of the source"
        range: "xsd:string or xsd:anyURI"

      # Points at image_object entries (defined below).
      - property: "sdo:associatedMedia"
        description: "Link to scan(s) of the source"
        range: "sdo:ImageObject"
        cardinality: "0..*"

  image_object:
    description: |
      A Scan of a source document. Links to the digital image at the holding archive.

    class: "sdo:ImageObject"
    class_uri: "https://schema.org/ImageObject"
    superclass: "sdo:CreativeWork"

    properties:
      - property: "sdo:url"
        description: "URL to the full scan"
        range: "sdo:URL"
        note: "Preferably IIIF manifest"

      # schema.org's thumbnail takes an ImageObject; the URL-valued sibling
      # property is thumbnailUrl. The description is worded to match the
      # declared range.
      - property: "sdo:thumbnail"
        description: "Thumbnail image of the scan"
        range: "sdo:ImageObject"
        note: "Value is an image object, not a bare URL"

      - property: "sdo:embedUrl"
        description: "URL to image viewer"
        range: "sdo:URL"

      - property: "sdo:position"
        description: "Position in sequence of scans"
        range: "xsd:int"
        note: "For multi-page sources"

# -----------------------------------------------------------------------------
# Biographical Properties
# -----------------------------------------------------------------------------

# Personal details attached to a person record. Each entry names the
# property, its full URI, the value range, and the class it applies to.
# NOTE(review): the description says these serve both PersonObservation and
# PersonReconstruction, yet age, deceased and address declare
# domain pico:PersonObservation only — confirm which is intended.
biographical_properties:
  description: |
    Biographical properties capture personal details as they appear in sources.
    These are used for both PersonObservation (source-bound) and
    PersonReconstruction (normalized).

  # Free-text age; kept as a string so qualifiers such as "about" or
  # "months" survive.
  age:
    property: "pico:hasAge"
    property_uri: "https://personsincontext.org/model#hasAge"
    description: "Age of person as stated in source"
    range: "xsd:string"
    domain: "pico:PersonObservation"
    note: |
      Used when birth date unknown but age is recorded.
      Age assumed in years unless specified ("4" = 4 years, "4 months" = 4 months).
      Numerical preferred over text ("4" not "four").
    examples:
      - "30"
      - "4 months"
      - "about 25"

  religion:
    property: "pico:hasReligion"
    property_uri: "https://personsincontext.org/model#hasReligion"
    description: "Religious affiliation as stated in source"
    range: "xsd:string or xsd:anyURI"
    domain: "pico:Person"
    note: "Can link to SKOS thesaurus for religions"
    examples:
      - "Catholic"
      - "Reformed"
      - "Jewish"

  # Boolean flag used only when no death date is available.
  deceased:
    property: "pico:deceased"
    property_uri: "https://personsincontext.org/model#deceased"
    description: "Indication that person is deceased (when death date unknown)"
    range: "xsd:boolean"
    domain: "pico:PersonObservation"
    note: |
      Only used when deathDate is unknown but death is indicated.
      A person without deathDate and without deceased:true is assumed alive.
      Important for privacy considerations in publishing person records.

  # Closed value list: only the two URIs below are enumerated.
  gender:
    property: "sdo:gender"
    property_uri: "https://schema.org/gender"
    description: "Gender of the person"
    range: "sdo:GenderType"
    domain: "pico:Person"
    values:
      - uri: "sdo:Male"
        label: "Male"
      - uri: "sdo:Female"
        label: "Female"

  address:
    property: "sdo:address"
    property_uri: "https://schema.org/address"
    description: "Physical address as mentioned in source"
    range: "xsd:string"
    domain: "pico:PersonObservation"
    note: "Address exactly as recorded in source"

  # Applies to the name node (pnv:PersonName), not to the person itself.
  initials:
    property: "pnv:initials"
    property_uri: "https://w3id.org/pnv#initials"
    description: "Initials of given name(s)"
    range: "xsd:string"
    domain: "pnv:PersonName"
    note: "Each initial followed by period (e.g., 'P.R.', 'H.A.F.M.O.')"
    examples:
      - "P.R."
      - "C.Joh."
      - "H.A.F.M.O."

# -----------------------------------------------------------------------------
# Hypernym Mapping (GLAM-NER v1.7.0)
# -----------------------------------------------------------------------------

# Lookup table from PiCo/PNV terms to GLAM-NER hypernym codes.
# In every entry below glam_hypernym and glam_code hold the same value.
hypernym_mapping:
  description: "How PiCo concepts map to GLAM-NER v1.7.0 hypernyms"

  mappings:
    # Default mapping for a person observation.
    - pico_class: "picom:PersonObservation"
      glam_hypernym: "AGT.PER"
      glam_code: "AGT.PER"
      note: "Person observations create AGT.PER entities"

    # Same class, conditional on an organizational role being observed.
    - pico_class: "picom:PersonObservation"
      glam_hypernym: "AGT.STF"
      glam_code: "AGT.STF"
      condition: "When observed with organizational role"
      note: "Staff members with role context"

    - pico_class: "pnv:PersonName"
      glam_hypernym: "APP.NAM"
      glam_code: "APP.NAM"
      note: "Name strings as appellations"

    # picom:hasRole is listed as a property in observation_pattern; it is
    # keyed under pico_class here like the class entries above.
    - pico_class: "picom:hasRole"
      glam_hypernym: "ROL"
      glam_code: "ROL"
      note: "Extracted roles link to ROL hypernym"

# -----------------------------------------------------------------------------
# Simple Examples
# -----------------------------------------------------------------------------

# Worked examples: each pairs an input text with the PersonObservation it
# yields and the GLAM-NER span annotations for that text.
# NOTE(review): honorificPrefix is used inside pnv:PersonName nodes below;
# verify it is an accepted key, since PNV itself names this field "prefix".
examples:
  # Example 1: person mentioned together with an organizational role.
  - description: "Staff member with title and role"
    text: "Dr. Maria van den Berg, Director"

    observation:
      type: "picom:PersonObservation"
      id: "_:obs1"

      hasObservedName:
        type: "pnv:PersonName"
        literalName: "Dr. Maria van den Berg"
        honorificPrefix: "Dr."
        givenName: "Maria"
        surnamePrefix: "van den"
        baseSurname: "Berg"

      hasRole: "Director"
      hadPrimarySource: "https://example.org/staff-page"
      observedAt: "2025-12-02T10:30:00Z"

    glam_ner_annotations:
      - span: "Dr. Maria van den Berg"
        type: "AGT.STF"
        code: "AGT.STF"
        confidence: 0.95

      - span: "Director"
        type: "ROL.TIT"
        code: "ROL.TIT"
        confidence: 0.98

  # Example 2: observation already linked to an identified person.
  - description: "Historical artist"
    text: "Rembrandt van Rijn painted this in 1642"

    observation:
      type: "picom:PersonObservation"
      id: "_:obs2"

      hasObservedName:
        type: "pnv:PersonName"
        literalName: "Rembrandt van Rijn"
        givenName: "Rembrandt"
        surnamePrefix: "van"
        baseSurname: "Rijn"

      isObservationOf: "wd:Q5598"  # Wikidata item for Rembrandt
      hadPrimarySource: "https://example.org/artwork-page"
      observedAt: "2025-12-02T10:35:00Z"

    glam_ner_annotations:
      - span: "Rembrandt van Rijn"
        type: "AGT.PER"
        code: "AGT.PER"
        confidence: 0.99
        linking:
          wikidata: "Q5598"
          viaf: "64013650"

  # Example 3: the title is kept in the name and also annotated as a role.
  - description: "Nobility title"
    text: "Count Willem van Loon"

    observation:
      type: "picom:PersonObservation"
      id: "_:obs3"

      hasObservedName:
        type: "pnv:PersonName"
        literalName: "Count Willem van Loon"
        honorificPrefix: "Count"
        givenName: "Willem"
        surnamePrefix: "van"
        baseSurname: "Loon"

      hadPrimarySource: "https://example.org/archive-doc"
      observedAt: "2025-12-02T10:40:00Z"

    glam_ner_annotations:
      - span: "Count Willem van Loon"
        type: "AGT.PER"
        code: "AGT.PER"
        confidence: 0.95

      # NOTE(review): unlike the other annotations, this one carries no
      # confidence value — confirm whether that is intentional.
      - span: "Count"
        type: "ROL.HON"
        code: "ROL.HON"
        note: "Nobility title - honorific role"