# Web Enrichment Provenance Schema
# Extends heritage-custodian-provenance with claim-level traceability for web-extracted data

# Schema identity: `id` is the schema URI, `name` its short machine name.
id: https://w3id.org/heritage/custodian/web-enrichment-provenance
name: web-enrichment-provenance
title: Web Enrichment Provenance Schema
# Literal block scalar so the bullet list keeps its line breaks
# instead of being folded into a single line.
description: |-
  Schema for tracking provenance of web-enriched data with claim-level traceability.
  Each factual claim extracted from web sources includes precise references to:
  - Source URL and fetch timestamp
  - Text excerpt with character offsets
  - Markdown heading path for structural context
  - Confidence and verification status

# Quoted so the version is always loaded as a string.
version: "1.0.0"
license: https://creativecommons.org/publicdomain/zero/1.0/

prefixes:
  linkml: https://w3id.org/linkml/
  heritage: https://w3id.org/heritage/custodian/
  prov: http://www.w3.org/ns/prov#
  dcterms: http://purl.org/dc/terms/
  oa: http://www.w3.org/ns/oa#  # Web Annotation Ontology

# The slot ranges used in this schema (string, integer, float, boolean,
# datetime, uri) are defined in linkml:types, so it must be imported.
imports:
  - linkml:types

default_prefix: heritage
default_range: string

# =============================================================================
# CLASSES
# =============================================================================

classes:

  # One enrichment session: the query that was run, the sources fetched,
  # the claims extracted from them, and the overall status.
  WebEnrichment:
    description: >-
      Container for all web-enriched data with full provenance
      tracking. Replaces legacy 'exa_enrichment' and
      'website_enrichment' sections.
    slots:
      - enrichment_id
      - search_query
      - search_timestamp
      - search_engine
      - claims
      - raw_sources
      - enrichment_status
      - enrichment_notes

  # A fetched page: fetch metadata, the stored markdown and its hash,
  # plus the Exa highlight data.
  WebSource:
    description: >-
      A web page that was fetched and used as source for
      claims. Stores the raw content for reproducibility and
      audit.
    slots:
      - source_id
      - url
      - fetch_timestamp
      - http_status
      - content_type
      - title
      - author
      - published_date
      - raw_markdown
      - raw_markdown_hash
      - exa_highlights
      - exa_highlight_scores

  # A single extracted fact: which field it populates, its value and
  # declared type, the supporting passages, and its verification state.
  Claim:
    class_uri: oa:Annotation
    description: >-
      A single factual assertion extracted from web sources. Each
      claim has precise provenance linking to source text.
    slots:
      - claim_id
      - claim_type
      - field_path
      - value
      - value_type
      - source_references
      - confidence_score
      - verified
      - verified_by
      - verified_date
      - claim_notes

  # Locates the supporting passage inside a source by excerpt text,
  # character offsets, heading path and sentence/highlight indexes.
  # NOTE(review): source_id is declared with `identifier: true` under
  # `slots:` and is shared with WebSource; here it appears to act as a
  # pointer to the source the passage came from. Confirm that identifier
  # semantics are intended for SourceReference as well.
  SourceReference:
    class_uri: oa:TextPositionSelector
    description: >-
      Precise reference to source text supporting a claim. Uses
      Web Annotation Ontology (oa:) patterns for text selection.
    slots:
      - source_id
      - text_excerpt
      - char_start
      - char_end
      - markdown_heading_path
      - sentence_index
      - exa_highlight_index
      - relevance_score

# =============================================================================
# SLOTS
# =============================================================================

slots:

# WebEnrichment slots

  # Session key. The pattern requires: "enrich-", 8 digits, "T", 6 digits,
  # "-", then 8 lowercase hex characters.
  enrichment_id:
    identifier: true
    range: string
    description: Unique identifier for this enrichment session
    pattern: '^enrich-[0-9]{8}T[0-9]{6}-[a-f0-9]{8}$'
    examples:
      - value: 'enrich-20251129T143200-a1b2c3d4'

  search_query:
    required: true
    range: string
    description: The search query used to find sources

  search_timestamp:
    required: true
    range: datetime
    slot_uri: prov:atTime
    description: When the search was performed (ISO 8601)

  search_engine:
    required: true
    range: string
    description: 'Search engine used (e.g., "exa", "google", "bing")'

  # Claim objects are embedded in place as a list.
  claims:
    range: Claim
    multivalued: true
    inlined_as_list: true
    description: List of factual claims extracted from sources

  # WebSource objects are embedded in place as a list.
  raw_sources:
    range: WebSource
    multivalued: true
    inlined_as_list: true
    description: Raw source documents fetched during enrichment

  enrichment_status:
    required: true
    range: EnrichmentStatusEnum
    description: Status of the enrichment process

  enrichment_notes:
    range: string
    description: Notes about the enrichment process

# WebSource slots

  source_id:
    identifier: true
    range: string
    description: Unique identifier for this source (typically URL hash)

  url:
    required: true
    range: uri
    slot_uri: dcterms:source
    description: Source URL

  fetch_timestamp:
    required: true
    range: datetime
    description: When the page was fetched

  http_status:
    range: integer
    description: HTTP status code of fetch (200, 404, etc.)

  content_type:
    range: string
    description: Content-Type header value

  title:
    range: string
    slot_uri: dcterms:title
    description: Page title

  author:
    range: string
    slot_uri: dcterms:creator
    description: Page author if available

  # NOTE(review): Dublin Core uses dcterms:issued for a publication date and
  # dcterms:created for a creation date. The mapping is left unchanged here
  # so emitted RDF stays the same; confirm which term is intended.
  published_date:
    range: datetime
    slot_uri: dcterms:created
    description: Page publication date if available

  # Stored copy of the page text; per the description it is what the
  # char_start/char_end offsets are validated against.
  raw_markdown:
    range: string
    description: >-
      Full page content as markdown (for reproducibility). Used to
      validate char_start/char_end offsets.

  # The pattern accepts exactly 64 lowercase hex characters.
  raw_markdown_hash:
    range: string
    pattern: '^[a-f0-9]{64}$'
    description: SHA-256 hash of raw_markdown for integrity verification

  exa_highlights:
    range: string
    multivalued: true
    description: Exa-extracted highlight snippets

  # Presumably index-aligned with exa_highlights; verify against the producer.
  exa_highlight_scores:
    range: float
    multivalued: true
    description: Relevance scores for each Exa highlight

# Claim slots

  # Claim key. The pattern requires: "claim-", one or more of [a-z0-9_],
  # "-", then one or more digits.
  claim_id:
    description: Unique identifier for this claim
    range: string
    identifier: true
    pattern: '^claim-[a-z0-9_]+-[0-9]+$'
    examples:
      - value: "claim-tree_heights-1"

  claim_type:
    description: Category of claim (descriptive, quantitative, temporal, etc.)
    range: ClaimTypeEnum
    required: true

  field_path:
    description: >-
      JSON path to the field this claim populates.
      Uses dot notation (e.g., "notable_features.green_library.tree_heights")
    range: string
    required: true
    examples:
      - value: "description"
      - value: "notable_features.green_library.tree_heights"
      - value: "services[0]"

  # Always carried as a string; value_type states how to interpret it.
  value:
    description: The actual value of the claim (as string, will be typed by value_type)
    range: string
    required: true

  # Description lists the members actually defined in ValueTypeEnum.
  value_type:
    description: >-
      Data type of the value (STRING, INTEGER, FLOAT, BOOLEAN, DATE,
      DATETIME, LIST_STRING, LIST_OBJECT, OBJECT)
    range: ValueTypeEnum
    required: true

  # SourceReference objects are embedded in place as a list.
  source_references:
    description: References to source text supporting this claim
    range: SourceReference
    multivalued: true
    inlined_as_list: true
    required: true

  # Listed in the Claim class but previously had no slot definition.
  # NOTE(review): the 0.0-1.0 bounds mirror relevance_score; confirm that
  # the producer emits confidence on the same scale.
  confidence_score:
    description: Confidence in the correctness of this claim (0.0-1.0)
    range: float
    minimum_value: 0.0
    maximum_value: 1.0

  verified:
    description: Whether this claim has been manually verified
    range: boolean
    required: true

  verified_by:
    description: Who verified this claim (person or system)
    range: string

  verified_date:
    description: When verification occurred
    range: datetime

  claim_notes:
    description: Notes about this specific claim
    range: string

# SourceReference slots

  text_excerpt:
    required: true
    range: string
    slot_uri: oa:exact
    description: Exact text from source supporting this claim

  # Offsets are positions within the source's raw_markdown.
  # NOTE(review): whether char_end is inclusive or exclusive is not stated
  # in this schema; confirm against the extractor.
  char_start:
    required: true
    range: integer
    minimum_value: 0
    slot_uri: oa:start
    description: Character offset where excerpt starts in raw_markdown

  char_end:
    required: true
    range: integer
    minimum_value: 0
    slot_uri: oa:end
    description: Character offset where excerpt ends in raw_markdown

  markdown_heading_path:
    range: string
    description: >-
      Path of markdown headings to this content. Format:
      "# H1 > ## H2 > ### H3"
    examples:
      - value: "# Library > ## The Netherlands 'Greenest' Library"

  sentence_index:
    range: integer
    minimum_value: 0
    description: Index of sentence in document (0-based)

  exa_highlight_index:
    range: integer
    minimum_value: 0
    description: Index in exa_highlights array if from Exa highlight

  relevance_score:
    range: float
    minimum_value: 0.0
    maximum_value: 1.0
    description: Relevance score from search engine (0.0-1.0)

# =============================================================================
# ENUMS
# =============================================================================

enums:

  # Range of the enrichment_status slot.
  EnrichmentStatusEnum:
    permissible_values:
      SUCCESS:
        description: Enrichment completed successfully with claims extracted
      PARTIAL:
        description: Some claims extracted but source had issues
      NO_RESULTS:
        description: Search returned no usable results
      FETCH_ERROR:
        description: Failed to fetch source pages
      PARSE_ERROR:
        description: Failed to parse source content

  # Range of the claim_type slot.
  ClaimTypeEnum:
    permissible_values:
      DESCRIPTIVE:
        description: General descriptive text about the institution
      QUANTITATIVE:
        description: Numeric values (counts, measurements, ratings)
      TEMPORAL:
        description: Dates, time periods, durations
      GEOGRAPHIC:
        description: Locations, addresses, coordinates
      IDENTIFIER:
        description: Codes, IDs, registration numbers
      ORGANIZATIONAL:
        description: Relationships, parent/child, affiliations
      SERVICE:
        description: Services offered by the institution
      COLLECTION:
        description: Collection information
      CONTACT:
        description: Contact information (phone, email, social media)
      ARCHITECTURAL:
        description: Building features, design, accessibility

  # Range of the value_type slot.
  ValueTypeEnum:
    permissible_values:
      STRING:
        description: Text value
      INTEGER:
        description: Whole number
      FLOAT:
        description: Decimal number
      BOOLEAN:
        description: True/false
      DATE:
        description: Date value (ISO 8601)
      DATETIME:
        description: Date and time value (ISO 8601)
      LIST_STRING:
        description: List of strings
      LIST_OBJECT:
        description: List of objects
      OBJECT:
        description: Nested object