# glam/schemas/web_enrichment_provenance.yaml

# Web Enrichment Provenance Schema
# Extends heritage-custodian-provenance with claim-level traceability for web-extracted data

# Schema identity: `id` is the schema's URI, `name` its short machine name.
id: https://w3id.org/heritage/custodian/web-enrichment-provenance
name: web-enrichment-provenance
title: Web Enrichment Provenance Schema
# Literal block scalar (|-) so each bullet stays on its own line; a folded
# scalar (>-) would join the bullets into one run-on sentence.
description: |-
  Schema for tracking provenance of web-enriched data with claim-level traceability.
  Each factual claim extracted from web sources includes precise references to:
  - Source URL and fetch timestamp
  - Text excerpt with character offsets
  - Markdown heading path for structural context
  - Confidence and verification status

# Quoted so every YAML loader reads the version as a string.
version: "1.0.0"
license: https://creativecommons.org/publicdomain/zero/1.0/

# CURIE prefixes used by class_uri / slot_uri mappings in this schema.
prefixes:
  linkml: https://w3id.org/linkml/
  heritage: https://w3id.org/heritage/custodian/
  prov: http://www.w3.org/ns/prov#
  dcterms: http://purl.org/dc/terms/
  oa: http://www.w3.org/ns/oa#  # Web Annotation Ontology

# Slots in this schema use the built-in ranges string, integer, float,
# boolean, datetime and uri; those types are declared in linkml:types and
# must be imported to be resolvable.
# NOTE(review): the file header says this schema extends
# heritage-custodian-provenance, but no such import is declared; add it here
# if that dependency is real.
imports:
  - linkml:types

default_prefix: heritage
default_range: string

# =============================================================================
# CLASSES
# =============================================================================

classes:

  # One enrichment run: the search that was issued, the pages it fetched
  # (raw_sources) and the claims extracted from them.
  WebEnrichment:
    description: >-
      Container for all web-enriched data with full
      provenance tracking. Replaces legacy 'exa_enrichment'
      and 'website_enrichment' sections.
    slots:
      # Identity and search context
      - enrichment_id
      - search_query
      - search_timestamp
      - search_engine
      # Extracted content
      - claims
      - raw_sources
      # Outcome
      - enrichment_status
      - enrichment_notes

  # A fetched page: fetch metadata, page metadata, the stored markdown body
  # with its hash, and any Exa highlight snippets.
  WebSource:
    description: >-
      A web page that was fetched and used as source
      for claims. Stores the raw content for
      reproducibility and audit.
    slots:
      # Identity and fetch metadata
      - source_id
      - url
      - fetch_timestamp
      - http_status
      - content_type
      # Page metadata
      - title
      - author
      - published_date
      # Stored content
      - raw_markdown
      - raw_markdown_hash
      # Exa-specific data
      - exa_highlights
      - exa_highlight_scores

  # A single extracted fact, mapped to the Web Annotation class oa:Annotation.
  Claim:
    class_uri: oa:Annotation
    description: >-
      A single factual assertion extracted from web
      sources. Each claim has precise provenance linking
      to source text.
    slots:
      # What is asserted and where it belongs
      - claim_id
      - claim_type
      - field_path
      - value
      - value_type
      # Evidence and confidence
      - source_references
      - confidence_score
      # Verification
      - verified
      - verified_by
      - verified_date
      - claim_notes

  # Pointer from a claim to the exact passage of a fetched page that supports
  # it, mapped to the Web Annotation selector class oa:TextPositionSelector.
  SourceReference:
    description: >-
      Precise reference to source text supporting a claim.
      Uses Web Annotation Ontology (oa:) patterns for text selection.
    class_uri: oa:TextPositionSelector
    slots:
      - source_id
      - text_excerpt
      - char_start
      - char_end
      - markdown_heading_path
      - sentence_index
      - exa_highlight_index
      - relevance_score
    slot_usage:
      # The shared `source_id` slot is declared `identifier: true` because it
      # is the key of WebSource. In this class it only points at a source, so
      # several references (different excerpts of the same page) must be able
      # to carry the same value. Without this override every reference into
      # one page would share a single identity.
      source_id:
        description: >-
          source_id of the WebSource whose content contains the referenced text
        identifier: false
        required: true

# =============================================================================
# SLOTS
# =============================================================================

slots:

  # WebEnrichment slots
  enrichment_id:
    # Key slot of WebEnrichment. Accepted shape per the pattern: "enrich-",
    # 8 digits, "T", 6 digits, "-", 8 lowercase hex characters.
    identifier: true
    range: string
    pattern: '^enrich-[0-9]{8}T[0-9]{6}-[a-f0-9]{8}$'
    description: Unique identifier for this enrichment session
    examples:
      - value: 'enrich-20251129T143200-a1b2c3d4'

  search_query:
    required: true
    range: string
    description: The search query used to find sources

  search_timestamp:
    # Mapped to the PROV-O property prov:atTime.
    slot_uri: prov:atTime
    required: true
    range: datetime
    description: When the search was performed (ISO 8601)

  search_engine:
    required: true
    range: string
    description: 'Search engine used (e.g., "exa", "google", "bing")'

  claims:
    # Claim objects are nested in place as a list.
    range: Claim
    multivalued: true
    inlined_as_list: true
    description: List of factual claims extracted from sources

  raw_sources:
    # WebSource objects are nested in place as a list.
    range: WebSource
    multivalued: true
    inlined_as_list: true
    description: Raw source documents fetched during enrichment

  enrichment_status:
    # Restricted to the permissible values of EnrichmentStatusEnum.
    required: true
    range: EnrichmentStatusEnum
    description: Status of the enrichment process

  enrichment_notes:
    # Optional free text.
    range: string
    description: Notes about the enrichment process

  # WebSource slots
  # Key slot of WebSource; SourceReference lists the same slot.
  source_id:
    description: Unique identifier for this source (typically URL hash)
    range: string
    identifier: true

  url:
    description: Source URL
    range: uri
    required: true
    slot_uri: dcterms:source

  fetch_timestamp:
    description: When the page was fetched
    range: datetime
    required: true

  http_status:
    description: HTTP status code of fetch (200, 404, etc.)
    range: integer

  content_type:
    description: Content-Type header value
    range: string

  title:
    description: Page title
    range: string
    slot_uri: dcterms:title

  author:
    description: Page author if available
    range: string
    slot_uri: dcterms:creator

  published_date:
    description: Page publication date if available
    range: datetime
    # dcterms:issued is the Dublin Core term for the date a resource was
    # formally issued (published); dcterms:created denotes its creation date,
    # which is a different fact from the one this slot records.
    slot_uri: dcterms:issued

  raw_markdown:
    description: >-
      Full page content as markdown (for reproducibility).
      Used to validate char_start/char_end offsets.
    range: string

  raw_markdown_hash:
    description: SHA-256 hash of raw_markdown for integrity verification
    range: string
    # 64 lowercase hexadecimal characters.
    pattern: '^[a-f0-9]{64}$'

  exa_highlights:
    description: Exa-extracted highlight snippets
    range: string
    multivalued: true

  # NOTE(review): presumably index-aligned with exa_highlights (one score per
  # snippet); the schema itself does not enforce equal lengths - confirm.
  exa_highlight_scores:
    description: Relevance scores for each Exa highlight
    range: float
    multivalued: true

  # Claim slots
  # Key slot of Claim.
  claim_id:
    description: Unique identifier for this claim
    range: string
    identifier: true
    # "claim-", a slug of lowercase letters/digits/underscores, "-", digits.
    pattern: '^claim-[a-z0-9_]+-[0-9]+$'
    examples:
      - value: "claim-tree_heights-1"

  claim_type:
    description: Category of claim (descriptive, quantitative, temporal, etc.)
    range: ClaimTypeEnum
    required: true

  field_path:
    # Description now mentions bracketed indices, which the third example
    # already uses.
    description: >-
      JSON path to the field this claim populates.
      Uses dot notation, with bracketed indices for list elements
      (e.g., "notable_features.green_library.tree_heights", "services[0]")
    range: string
    required: true
    examples:
      - value: "description"
      - value: "notable_features.green_library.tree_heights"
      - value: "services[0]"

  value:
    description: The actual value of the claim (as string, will be typed by value_type)
    range: string
    required: true

  value_type:
    # Description points at the enum instead of listing names that are not
    # among its permissible values.
    description: Data type of the value (one of the ValueTypeEnum permissible values)
    range: ValueTypeEnum
    required: true

  source_references:
    description: References to source text supporting this claim
    range: SourceReference
    multivalued: true
    inlined_as_list: true
    required: true

  # The Claim class lists this slot, so it must be defined here for the
  # schema to resolve. Left optional so existing records without a score
  # remain valid.
  # NOTE(review): the 0.0-1.0 bounds mirror relevance_score; confirm that the
  # extractor reports confidence on that scale.
  confidence_score:
    description: Confidence that this claim is correct (0.0-1.0)
    range: float
    minimum_value: 0.0
    maximum_value: 1.0

  verified:
    description: Whether this claim has been manually verified
    range: boolean
    required: true

  verified_by:
    description: Who verified this claim (person or system)
    range: string

  verified_date:
    description: When verification occurred
    range: datetime

  claim_notes:
    description: Notes about this specific claim
    range: string

  # SourceReference slots
  text_excerpt:
    # Mapped to oa:exact.
    slot_uri: oa:exact
    required: true
    range: string
    description: Exact text from source supporting this claim

  char_start:
    # Non-negative offset, mapped to oa:start.
    slot_uri: oa:start
    required: true
    range: integer
    minimum_value: 0
    description: Character offset where excerpt starts in raw_markdown

  char_end:
    # Non-negative offset, mapped to oa:end.
    # NOTE(review): the Web Annotation model defines oa:end as exclusive (the
    # character at that position is not part of the selection); confirm the
    # extractor writes char_end with the same convention.
    slot_uri: oa:end
    required: true
    range: integer
    minimum_value: 0
    description: Character offset where excerpt ends in raw_markdown

  markdown_heading_path:
    # Optional heading trail; headings are joined with " > ".
    range: string
    description: >-
      Path of markdown headings to this content.
      Format: "# H1 > ## H2 > ### H3"
    examples:
      - value: "# Library > ## The Netherlands 'Greenest' Library"

  sentence_index:
    # Optional, non-negative.
    range: integer
    minimum_value: 0
    description: Index of sentence in document (0-based)

  exa_highlight_index:
    # Optional, non-negative position in the source's exa_highlights list.
    range: integer
    minimum_value: 0
    description: Index in exa_highlights array if from Exa highlight

  relevance_score:
    # Optional, bounded to the closed interval [0.0, 1.0].
    range: float
    minimum_value: 0.0
    maximum_value: 1.0
    description: Relevance score from search engine (0.0-1.0)

# =============================================================================
# ENUMS
# =============================================================================

enums:

  # Range of the enrichment_status slot.
  EnrichmentStatusEnum:
    description: Outcome of a web enrichment process
    permissible_values:
      SUCCESS:
        description: Enrichment completed successfully with claims extracted
      PARTIAL:
        description: Some claims extracted but source had issues
      NO_RESULTS:
        description: Search returned no usable results
      FETCH_ERROR:
        description: Failed to fetch source pages
      PARSE_ERROR:
        description: Failed to parse source content

  # Range of the claim_type slot.
  ClaimTypeEnum:
    description: Category of information a claim asserts
    permissible_values:
      DESCRIPTIVE:
        description: General descriptive text about the institution
      QUANTITATIVE:
        description: Numeric values (counts, measurements, ratings)
      TEMPORAL:
        description: Dates, time periods, durations
      GEOGRAPHIC:
        description: Locations, addresses, coordinates
      IDENTIFIER:
        description: Codes, IDs, registration numbers
      ORGANIZATIONAL:
        description: Relationships, parent/child, affiliations
      SERVICE:
        description: Services offered by the institution
      COLLECTION:
        description: Collection information
      CONTACT:
        description: Contact information (phone, email, social media)
      ARCHITECTURAL:
        description: Building features, design, accessibility

  # Range of the value_type slot; tells consumers how to interpret the
  # string held in a claim's `value` slot.
  ValueTypeEnum:
    description: Data type of a claim value
    permissible_values:
      STRING:
        description: Text value
      INTEGER:
        description: Whole number
      FLOAT:
        description: Decimal number
      BOOLEAN:
        description: True/false
      DATE:
        description: Date value (ISO 8601)
      DATETIME:
        description: Date and time value (ISO 8601)
      LIST_STRING:
        description: List of strings
      LIST_OBJECT:
        description: List of objects
      OBJECT:
        description: Nested object