# Source path: glam/schemas/initial/provenance.yaml
# Listing metadata: 2025-12-07 00:26:01 +01:00 · 438 lines · 14 KiB · YAML

# LinkML schema module: provenance tracking and organizational change events.
# Schema identity and human-readable metadata.
id: https://w3id.org/heritage/custodian/provenance
name: heritage-custodian-provenance
title: Heritage Custodian Provenance Module
description: >-
  Provenance tracking and organizational change event classes for heritage custodians.
  Implements W3C PROV-O patterns for data quality tracking and TOOI Wijzigingsgebeurtenis
  patterns for institutional lifecycle events. Tracks GHCID history and data lineage.
license: https://creativecommons.org/publicdomain/zero/1.0/
# Quoted so the version is always loaded as a string, never a number.
version: "0.2.2"

# CURIE prefix expansions referenced by the class_uri / slot_uri mappings below.
prefixes:
  linkml: https://w3id.org/linkml/
  heritage: https://w3id.org/heritage/custodian/
  prov: http://www.w3.org/ns/prov#
  tooi: https://identifier.overheid.nl/tooi/def/ont/
  dcterms: http://purl.org/dc/terms/
  rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
  rdfs: http://www.w3.org/2000/01/rdf-schema#
  foaf: http://xmlns.com/foaf/0.1/
  adms: http://www.w3.org/ns/adms#

# Elements without an explicit URI are minted under the heritage: namespace;
# slots without an explicit range default to string.
default_prefix: heritage
default_range: string

# linkml:types provides the built-in scalar types. `enums` and `core` are
# other schema modules not shown here (presumably siblings of this file).
imports:
  - linkml:types
  - enums
  - core
# =============================================================================
# PROVENANCE CLASSES
# =============================================================================
classes:

  # PROV-O Proxy Classes
  # These define local equivalents of W3C PROV-O classes to avoid external dependencies
  # while maintaining RDF alignment via class_uri mappings
  ProvenanceEntity:
    description: >-
      Local proxy for prov:Entity. Represents a physical, digital, conceptual,
      or other kind of thing with fixed aspects. Used as mixin for classes that
      need PROV-O alignment (e.g., HeritageCustodian).
    class_uri: prov:Entity
    # NOTE(review): described as a mixin but not flagged `mixin: true` — confirm
    # how consumers outside this file reference it before changing.
    abstract: true
    notes: >-
      This is a mixin class providing PROV-O alignment. Do not instantiate directly.
      In RDF serialization, classes using this mixin will be typed as prov:Entity.

  ProvenanceActivity:
    description: >-
      Local proxy for prov:Activity. Represents something that occurs over a period
      of time and acts upon or with entities. Used as base for ChangeEvent.
    class_uri: prov:Activity
    abstract: true
    notes: >-
      This is an abstract base class providing PROV-O alignment.
      In RDF serialization, subclasses will be typed as prov:Activity.

  # TOOI Ontology Proxy Classes
  # Local proxy for Dutch government organizational ontology (TOOI)
  TOOIWijzigingsgebeurtenis:
    description: >-
      Local proxy for tooi:Wijzigingsgebeurtenis (Change Event in TOOI ontology).
      Represents organizational change events in Dutch government organizations.
      Used as mixin for ChangeEvent to align with TOOI patterns.
    class_uri: tooi:Wijzigingsgebeurtenis
    # NOTE(review): listed under `mixins:` by ChangeEvent but not flagged
    # `mixin: true` — confirm whether that is intentional.
    abstract: true
    notes: >-
      This is a mixin class providing TOOI ontology alignment for Dutch institutions.
      In RDF serialization, ChangeEvent will also be typed as tooi:Wijzigingsgebeurtenis.

  # Record-level provenance block: where the data came from, how it was
  # extracted, how it was verified, and its enrichment log.
  Provenance:
    description: Provenance information for data quality tracking
    slots:
      - data_source
      - data_tier
      - extraction_date
      - extraction_method
      - confidence_score
      - conversation_id
      - source_url
      - verified_date
      - verified_by
      - enrichment_history
      - provenance_notes

  # One entry of Provenance.enrichment_history. The slot_usage below restates
  # the requiredness of each slot and gives class-specific descriptions.
  EnrichmentHistoryEntry:
    description: >-
      Record of a data enrichment activity (e.g., Wikidata lookup, geocoding,
      identifier resolution). Tracks what was enriched, when, how, and with what
      quality metrics.
    slots:
      - enrichment_date
      - enrichment_method
      - enrichment_type
      - match_score
      - verified
      - enrichment_source
      - enrichment_notes
    slot_usage:
      enrichment_date:
        required: true
        description: When the enrichment was performed (ISO 8601 timestamp)
      enrichment_method:
        required: true
        description: >-
          Method used for enrichment (e.g., "Wikidata SPARQL fuzzy matching",
          "Nominatim geocoding API", "Manual identifier lookup")
      enrichment_type:
        required: true
        description: >-
          Type of enrichment performed (e.g., "wikidata_identifier",
          "geocoding", "viaf_identifier", "isil_code")
      match_score:
        required: false
        description: >-
          Fuzzy matching confidence score (0.0-1.0) for automated enrichments.
          Null for manual enrichments or non-matching operations.
      verified:
        required: true
        description: >-
          Whether the enrichment has been manually verified (true) or is
          automated/unverified (false)
      enrichment_source:
        required: false
        # The slot is declared with range uri, so every example is a URI.
        description: >-
          Source of the enrichment data (e.g., "https://www.wikidata.org",
          "https://nominatim.openstreetmap.org", "https://viaf.org")
      enrichment_notes:
        required: false
        description: >-
          Additional notes about the enrichment (e.g., "City verification passed",
          "False positive removed", "Matched alternative name")

  # One validity period of a GHCID. Only valid_to is optional, because the
  # current identifier has no end date.
  GHCIDHistoryEntry:
    description: >-
      Historical record tracking GHCID changes over time. Each entry represents
      a period when a specific GHCID was valid for this institution.
    slots:
      - ghcid
      # NOTE(review): ghcid_numeric is not defined among the slots visible in
      # this file; presumably it comes from an imported module — confirm.
      - ghcid_numeric
      - valid_from
      - valid_to
      - reason
      - institution_name
      - location_city
      - location_country
    slot_usage:
      ghcid:
        required: true
        description: The GHCID string that was valid during this period
      ghcid_numeric:
        required: true
        description: Numeric hash of the GHCID (persistent identifier)
      valid_from:
        required: true
        description: When this GHCID became valid (ISO 8601 timestamp)
      valid_to:
        required: false
        description: When this GHCID was superseded (null = still current)
      reason:
        required: true
        description: >-
          Reason for this identifier or change (e.g., "Initial identifier",
          "Relocated to Amsterdam", "Name change")
      institution_name:
        required: true
        description: Institution name during this period
      location_city:
        required: true
        description: City location during this period
      location_country:
        required: true
        description: Country location during this period

  # Lifecycle event. Inherits PROV-O typing through is_a and TOOI typing
  # through the mixin; event_id is the instance identifier.
  ChangeEvent:
    description: >-
      A significant organizational change event in an institution's lifecycle.
      Based on TOOI Wijzigingsgebeurtenis and W3C PROV-O Activity patterns.
      Tracks mergers, name changes, relocations, and other structural changes.
    is_a: ProvenanceActivity
    mixins:
      - TOOIWijzigingsgebeurtenis
    slots:
      - event_id
      - change_type
      - event_date
      - event_description
      - affected_organization
      - resulting_organization
      - related_organizations
      - source_documentation
    slot_usage:
      event_id:
        required: true
        identifier: true
      change_type:
        required: true
      event_date:
        required: true
# =============================================================================
# SLOTS
# =============================================================================
slots:

  # Provenance fields
  # DataSourceEnum / DataTierEnum are not defined in this file; presumably
  # they come from the imported `enums` module — confirm.
  data_source:
    description: Source of this data record
    range: DataSourceEnum
    required: true
  data_tier:
    description: Data quality tier (authority level)
    range: DataTierEnum
    required: true
  extraction_date:
    description: Date the data was extracted or created
    range: datetime
    required: true
  extraction_method:
    description: Method used to extract data (NLP model, manual, API, etc.)
    range: string
  confidence_score:
    description: Confidence score for NLP-extracted data (0.0-1.0)
    range: float
    minimum_value: 0.0
    maximum_value: 1.0
  conversation_id:
    description: UUID of conversation if extracted from Claude conversation
    range: string
  source_url:
    description: URL of the source
    range: uri
  verified_date:
    description: Date the data was verified
    range: datetime
  verified_by:
    description: Person or system that verified the data
    range: string
  provenance_notes:
    description: >-
      Additional notes about data quality, extraction issues, conflicts resolved,
      or other provenance metadata. Use for documenting uncertainties, data conflicts,
      manual corrections, or contextual information about the data source.
    range: string
    required: false
    slot_uri: rdfs:comment
    aliases:
      - notes
    comments:
      - "Uses RDF Schema comment property for annotation alignment"
      - "For observations about data reliability, use confidence_score instead"
      - "For enrichment details, use enrichment_history instead"
      - "Accepts 'notes' as alias for backward compatibility with extracted data"

  # GHCID history fields
  ghcid:
    description: |
      Human-readable GHCID string.
      Format: {Country}-{Region}-{City}-{Type}-{Abbreviation}[-{name_suffix}]
      The optional name_suffix is used for collision resolution and consists of
      the institution's full official name in native language, converted to
      snake_case format (lowercase, underscores for spaces, no diacritics/punctuation).
      Examples:
      - NL-NH-AMS-M-RM (Rijksmuseum, no collision)
      - NL-NH-AMS-M-SM-stedelijk_museum_amsterdam (Stedelijk Museum, with collision suffix)
      - FR-IDF-PAR-M-MO-musee_dorsay (Musée d'Orsay, with collision suffix)
    range: string
    # Five upper-case segments (country, region, city, type, abbreviation)
    # followed by an optional lower-case snake_case suffix.
    pattern: '^[A-Z]{2}-[A-Z0-9]{1,3}-[A-Z]{3}-[A-Z]-[A-Z0-9]{1,10}(-[a-z0-9_]+)?$'
  valid_from:
    description: Timestamp when this record/identifier became valid
    range: datetime
  valid_to:
    description: Timestamp when this record/identifier was superseded (null = still current)
    range: datetime
    required: false
  reason:
    description: Reason for identifier assignment or change
    range: string
  institution_name:
    description: Institution name at a specific point in time
    range: string
  location_city:
    description: City location at a specific point in time
    range: string
  location_country:
    description: Country location at a specific point in time
    range: string

  # ChangeEvent slots
  # HeritageCustodian is not defined in this file; presumably it comes from
  # the imported `core` module — confirm.
  event_id:
    description: Unique identifier for this change event
    range: uriorcurie
    identifier: true
    slot_uri: dcterms:identifier
  change_type:
    description: Type of organizational change
    range: ChangeTypeEnum
    required: true
    slot_uri: rdf:type
  event_date:
    description: Date when the change event occurred
    range: date
    required: true
    # NOTE(review): PROV-O scopes prov:atTime to instantaneous events, while
    # ChangeEvent is typed prov:Activity — confirm this mapping is intended.
    slot_uri: prov:atTime
  event_description:
    description: Textual description of the change event
    range: string
    slot_uri: dcterms:description
  affected_organization:
    description: >-
      The organization that was affected by this change event.
      For mergers/acquisitions, this is the organization being absorbed.
    range: HeritageCustodian
    # NOTE(review): in PROV-O, prov:entity belongs to qualified-influence
    # nodes; the direct activity-to-entity relation is prov:used — confirm.
    slot_uri: prov:entity
  resulting_organization:
    description: >-
      The organization resulting from this change event.
      For mergers, this is the surviving/new organization.
    range: HeritageCustodian
    slot_uri: prov:generated
  related_organizations:
    description: >-
      Other organizations involved in this change event.
      For mergers, these are the other merging parties.
    range: HeritageCustodian
    multivalued: true
    slot_uri: prov:wasAssociatedWith
  source_documentation:
    description: URL or reference to documentation of this change event
    range: uri
    slot_uri: dcterms:source

  # EnrichmentHistoryEntry slots
  enrichment_history:
    description: >-
      Chronological log of data enrichment activities performed on this record
      (e.g., Wikidata lookups, geocoding, identifier resolution)
    range: EnrichmentHistoryEntry
    multivalued: true
    # Entries are nested in place as a list rather than referenced by key.
    inlined_as_list: true
  enrichment_date:
    description: Timestamp when the enrichment was performed
    range: datetime
    required: true
    slot_uri: prov:atTime
    comments:
      - Maps to PROV-O timestamp for enrichment activity
  enrichment_method:
    description: >-
      Method used for enrichment (e.g., "Wikidata SPARQL fuzzy matching",
      "Nominatim geocoding API", "Manual identifier lookup")
    range: string
    required: true
    # NOTE(review): PROV-O attaches prov:hadPlan to prov:Association nodes and
    # expects a prov:Plan value, not a free-text string — confirm.
    slot_uri: prov:hadPlan
    comments:
      - Maps to PROV-O plan/method used for enrichment activity
  enrichment_type:
    description: >-
      Type of enrichment performed. Common values: "wikidata_identifier",
      "geocoding", "viaf_identifier", "isil_code", "ghcid_generation",
      "false_positive_removal"
    range: EnrichmentTypeEnum
    required: true
    slot_uri: rdf:type
    comments:
      - Controlled vocabulary for enrichment activity types
  match_score:
    description: >-
      Fuzzy matching confidence score (0.0-1.0) for automated enrichments.
      Null for manual enrichments or non-matching operations.
    range: float
    minimum_value: 0.0
    maximum_value: 1.0
    required: false
    # NOTE(review): adms:confidence does not appear to be a term in the
    # published ADMS vocabulary — confirm or pick another property.
    slot_uri: adms:confidence
    comments:
      - Maps to ADMS (Asset Description Metadata Schema) confidence score
  verified:
    description: >-
      Whether the enrichment has been manually verified (true) or is
      automated/unverified (false)
    range: boolean
    required: true
    # NOTE(review): adms:status normally points at a concept from a status
    # scheme, whereas this slot holds a boolean — confirm.
    slot_uri: adms:status
    comments:
      - Maps to ADMS verification status
  enrichment_source:
    # The range is uri, so every example in the description is a URI.
    description: >-
      Source of the enrichment data (e.g., "https://www.wikidata.org",
      "https://nominatim.openstreetmap.org", "https://viaf.org")
    range: uri
    required: false
    slot_uri: dcterms:source
    comments:
      - Uses Dublin Core Terms source property
  enrichment_notes:
    description: >-
      Additional notes about the enrichment (e.g., "City verification passed",
      "False positive removed", "Matched alternative name")
    range: string
    required: false
    slot_uri: dcterms:description
    comments:
      - Human-readable description of enrichment details