glam/schemas/web_enrichment_provenance.yaml

377 lines
9.8 KiB
YAML

# Web Enrichment Provenance Schema
# Extends heritage-custodian-provenance with claim-level traceability for web-extracted data
# Schema identity and human-readable metadata.
id: https://w3id.org/heritage/custodian/web-enrichment-provenance
name: web-enrichment-provenance
title: Web Enrichment Provenance Schema
# Literal block scalar so the bulleted list keeps one item per line
# (a folded scalar would join the bullets into a single run-on sentence).
description: |-
  Schema for tracking provenance of web-enriched data with claim-level traceability.
  Each factual claim extracted from web sources includes precise references to:
  - Source URL and fetch timestamp
  - Text excerpt with character offsets
  - Markdown heading path for structural context
  - Confidence and verification status
# Quoted so the version is always read as a string.
version: "1.0.0"
license: https://creativecommons.org/publicdomain/zero/1.0/

# CURIE prefixes used by class_uri / slot_uri mappings in this schema.
prefixes:
  linkml: https://w3id.org/linkml/
  heritage: https://w3id.org/heritage/custodian/
  prov: http://www.w3.org/ns/prov#
  dcterms: http://purl.org/dc/terms/
  oa: http://www.w3.org/ns/oa#  # Web Annotation Ontology
default_prefix: heritage

# The slot ranges used in this schema (string, datetime, uri, integer, float,
# boolean) are LinkML built-in types and must be imported to resolve.
imports:
  - linkml:types

# Slots that omit `range` fall back to string.
default_range: string
# =============================================================================
# CLASSES
# =============================================================================
classes:
  # Top-level record for one enrichment session: the search that was run,
  # the claims extracted from it, and the raw sources that were fetched.
  WebEnrichment:
    description: >-
      Container for all web-enriched data with full provenance tracking.
      Replaces legacy 'exa_enrichment' and 'website_enrichment' sections.
    slots:
      - enrichment_id
      - search_query
      - search_timestamp
      - search_engine
      - claims
      - raw_sources
      - enrichment_status
      - enrichment_notes

  # A fetched page, including its raw markdown and a hash of that markdown.
  WebSource:
    description: >-
      A web page that was fetched and used as source for claims.
      Stores the raw content for reproducibility and audit.
    slots:
      - source_id
      - url
      - fetch_timestamp
      - http_status
      - content_type
      - title
      - author
      - published_date
      - raw_markdown
      - raw_markdown_hash
      - exa_highlights
      - exa_highlight_scores

  # One extracted assertion, mapped to a Web Annotation (oa:Annotation).
  Claim:
    description: >-
      A single factual assertion extracted from web sources.
      Each claim has precise provenance linking to source text.
    class_uri: oa:Annotation
    slots:
      - claim_id
      - claim_type
      - field_path
      - value
      - value_type
      - source_references
      - confidence_score
      - verified
      - verified_by
      - verified_date
      - claim_notes

  # Pointer from a claim into a specific character range of a source.
  # NOTE(review): source_id is declared with `identifier: true` in the slot
  # definitions, so in this class it would also act as the class identifier
  # rather than only as a reference to a WebSource - confirm this is intended.
  SourceReference:
    description: >-
      Precise reference to source text supporting a claim.
      Uses Web Annotation Ontology (oa:) patterns for text selection.
    class_uri: oa:TextPositionSelector
    slots:
      - source_id
      - text_excerpt
      - char_start
      - char_end
      - markdown_heading_path
      - sentence_index
      - exa_highlight_index
      - relevance_score
# =============================================================================
# SLOTS
# =============================================================================
slots:
  # ---------------------------------------------------------------------------
  # WebEnrichment slots
  # ---------------------------------------------------------------------------
  enrichment_id:
    description: Unique identifier for this enrichment session
    range: string
    identifier: true
    # Shape: "enrich-" + 8 digits + "T" + 6 digits + "-" + 8 lowercase hex chars.
    pattern: '^enrich-[0-9]{8}T[0-9]{6}-[a-f0-9]{8}$'
    examples:
      - value: "enrich-20251129T143200-a1b2c3d4"

  search_query:
    description: The search query used to find sources
    range: string
    required: true

  search_timestamp:
    description: When the search was performed (ISO 8601)
    range: datetime
    required: true
    slot_uri: prov:atTime

  search_engine:
    description: Search engine used (e.g., "exa", "google", "bing")
    range: string
    required: true

  claims:
    description: List of factual claims extracted from sources
    range: Claim
    multivalued: true
    inlined_as_list: true

  raw_sources:
    description: Raw source documents fetched during enrichment
    range: WebSource
    multivalued: true
    inlined_as_list: true

  enrichment_status:
    description: Status of the enrichment process
    range: EnrichmentStatusEnum
    required: true

  enrichment_notes:
    description: Notes about the enrichment process
    range: string

  # ---------------------------------------------------------------------------
  # WebSource slots
  # ---------------------------------------------------------------------------
  # Also listed by SourceReference, which reuses this slot definition as-is.
  source_id:
    description: Unique identifier for this source (typically URL hash)
    range: string
    identifier: true

  url:
    description: Source URL
    range: uri
    required: true
    slot_uri: dcterms:source

  fetch_timestamp:
    description: When the page was fetched
    range: datetime
    required: true

  http_status:
    description: HTTP status code of fetch (200, 404, etc.)
    range: integer

  content_type:
    description: Content-Type header value
    range: string

  title:
    description: Page title
    range: string
    slot_uri: dcterms:title

  author:
    description: Page author if available
    range: string
    slot_uri: dcterms:creator

  # NOTE(review): dcterms:issued may be a closer match for a publication date
  # than dcterms:created - confirm before changing the mapping.
  published_date:
    description: Page publication date if available
    range: datetime
    slot_uri: dcterms:created

  raw_markdown:
    description: >-
      Full page content as markdown (for reproducibility).
      Used to validate char_start/char_end offsets.
    range: string

  raw_markdown_hash:
    description: SHA-256 hash of raw_markdown for integrity verification
    range: string
    # 64 lowercase hex characters.
    pattern: '^[a-f0-9]{64}$'

  exa_highlights:
    description: Exa-extracted highlight snippets
    range: string
    multivalued: true

  exa_highlight_scores:
    description: Relevance scores for each Exa highlight
    range: float
    multivalued: true

  # ---------------------------------------------------------------------------
  # Claim slots
  # ---------------------------------------------------------------------------
  claim_id:
    description: Unique identifier for this claim
    range: string
    identifier: true
    # Shape: "claim-" + lowercase/digit/underscore name + "-" + number.
    pattern: '^claim-[a-z0-9_]+-[0-9]+$'
    examples:
      - value: "claim-tree_heights-1"

  claim_type:
    description: Category of claim (descriptive, quantitative, temporal, etc.)
    range: ClaimTypeEnum
    required: true

  field_path:
    description: >-
      JSON path to the field this claim populates.
      Uses dot notation (e.g., "notable_features.green_library.tree_heights")
    range: string
    required: true
    examples:
      - value: "description"
      - value: "notable_features.green_library.tree_heights"
      - value: "services[0]"

  # Always stored as a string; value_type says how to interpret it.
  value:
    description: The actual value of the claim (as string, will be typed by value_type)
    range: string
    required: true

  value_type:
    description: Data type of the value (string, integer, float, boolean, list, object)
    range: ValueTypeEnum
    required: true

  source_references:
    description: References to source text supporting this claim
    range: SourceReference
    multivalued: true
    inlined_as_list: true
    required: true

  # Listed in the Claim class's slots, so it needs a definition here. Bounded
  # to the same 0.0-1.0 interval as relevance_score; left optional so that
  # existing data without a score remains valid.
  confidence_score:
    description: Confidence that the claim is supported by its sources (0.0-1.0)
    range: float
    minimum_value: 0.0
    maximum_value: 1.0

  verified:
    description: Whether this claim has been manually verified
    range: boolean
    required: true

  verified_by:
    description: Who verified this claim (person or system)
    range: string

  verified_date:
    description: When verification occurred
    range: datetime

  claim_notes:
    description: Notes about this specific claim
    range: string

  # ---------------------------------------------------------------------------
  # SourceReference slots
  # ---------------------------------------------------------------------------
  # NOTE(review): in the Web Annotation vocabulary oa:exact belongs to
  # oa:TextQuoteSelector, whereas SourceReference is mapped to
  # oa:TextPositionSelector - confirm the mixed mapping is intended.
  text_excerpt:
    description: Exact text from source supporting this claim
    range: string
    required: true
    slot_uri: oa:exact

  char_start:
    description: Character offset where excerpt starts in raw_markdown
    range: integer
    required: true
    minimum_value: 0
    slot_uri: oa:start

  char_end:
    description: Character offset where excerpt ends in raw_markdown
    range: integer
    required: true
    minimum_value: 0
    slot_uri: oa:end

  markdown_heading_path:
    description: >-
      Path of markdown headings to this content.
      Format: "# H1 > ## H2 > ### H3"
    range: string
    examples:
      - value: "# Library > ## The Netherlands 'Greenest' Library"

  sentence_index:
    description: Index of sentence in document (0-based)
    range: integer
    minimum_value: 0

  exa_highlight_index:
    description: Index in exa_highlights array if from Exa highlight
    range: integer
    minimum_value: 0

  relevance_score:
    description: Relevance score from search engine (0.0-1.0)
    range: float
    minimum_value: 0.0
    maximum_value: 1.0
# =============================================================================
# ENUMS
# =============================================================================
enums:
  # Outcome of an enrichment run; range of the enrichment_status slot.
  EnrichmentStatusEnum:
    permissible_values:
      SUCCESS:
        description: Enrichment completed successfully with claims extracted
      PARTIAL:
        description: Some claims extracted but source had issues
      NO_RESULTS:
        description: Search returned no usable results
      FETCH_ERROR:
        description: Failed to fetch source pages
      PARSE_ERROR:
        description: Failed to parse source content

  # Category of an extracted claim; range of the claim_type slot.
  ClaimTypeEnum:
    permissible_values:
      DESCRIPTIVE:
        description: General descriptive text about the institution
      QUANTITATIVE:
        description: Numeric values (counts, measurements, ratings)
      TEMPORAL:
        description: Dates, time periods, durations
      GEOGRAPHIC:
        description: Locations, addresses, coordinates
      IDENTIFIER:
        description: Codes, IDs, registration numbers
      ORGANIZATIONAL:
        description: Relationships, parent/child, affiliations
      SERVICE:
        description: Services offered by the institution
      COLLECTION:
        description: Collection information
      CONTACT:
        description: Contact information (phone, email, social media)
      ARCHITECTURAL:
        description: Building features, design, accessibility

  # How to interpret a claim's string-encoded value; range of the value_type slot.
  ValueTypeEnum:
    permissible_values:
      STRING:
        description: Text value
      INTEGER:
        description: Whole number
      FLOAT:
        description: Decimal number
      BOOLEAN:
        description: True/false
      DATE:
        description: Date value (ISO 8601)
      DATETIME:
        description: Date and time value (ISO 8601)
      LIST_STRING:
        description: List of strings
      LIST_OBJECT:
        description: List of objects
      OBJECT:
        description: Nested object