# glam/schemas/20251121/linkml/modules/classes/WebObservation.yaml

# WebObservation Class
# Models provenance information for web-extracted data following PROV-O and PAV patterns
#
# Created November 2025 to provide:
# - Transparent provenance for web-scraped heritage data
# - Integration with PROV-O (W3C Provenance Ontology)
# - Integration with PAV (Provenance, Authoring and Versioning)
# - Support for CallForApplication and other web-sourced entities
#
# Key relationships:
#   WebObservation --used--> SourceDocument (the web page)
#   WebObservation --generated--> Entity (extracted data)
#   CallForApplication --web_observations--> WebObservation[] (provenance chain)
#
# Examples:
#   - Observation of Horizon Europe call page on 2025-11-29
#   - Observation of heritage organisation website
#   - Observation of Wikidata SPARQL query results

# Schema identity: canonical IRI, machine-readable name, and display title.
id: 'https://nde.nl/ontology/hc/class/WebObservation'
name: WebObservation
title: WebObservation Class

# Namespace prefixes usable in CURIEs throughout this module.
# The IRIs are quoted so the trailing `#` of hash namespaces can never be
# mistaken for the start of a comment.
prefixes:
  linkml: 'https://w3id.org/linkml/'
  hc: 'https://nde.nl/ontology/hc/'
  schema: 'http://schema.org/'
  dcterms: 'http://purl.org/dc/terms/'
  prov: 'http://www.w3.org/ns/prov#'
  pav: 'http://purl.org/pav/'
  foaf: 'http://xmlns.com/foaf/0.1/'
  xsd: 'http://www.w3.org/2001/XMLSchema#'

imports:
  - linkml:types
  - ./WebClaim
  - ../slots/archived_at
  - ../slots/extraction_confidence
  - ../slots/extraction_notes
  - ../slots/source_url
  - ../slots/retrieved_on

# Prefix used for schema elements that carry no explicit prefix
# (`hc` is declared in the `prefixes` map of this file).
default_prefix: hc

# Slots declared locally by this module.
# Slots flagged "imported" in the NOTE comments below are defined under
# ../slots/ and are only referenced from the WebObservation class.
slots:
  observation_id:
    identifier: true
    range: uriorcurie
    description: Unique identifier for this web observation

  # NOTE: source_url imported from global slot ../slots/source_url.yaml
  # NOTE: retrieved_on imported from global slot ../slots/retrieved_on.yaml

  retrieved_by:
    range: string
    description: Agent (person, script, or system) that performed the retrieval
    # The value is a free-text agent label rather than an agent IRI, so the
    # PAV / PROV-O agent properties are recorded as close (not exact) matches.
    close_mappings:
      - pav:retrievedBy
      - prov:wasAssociatedWith

  retrieval_method:
    range: string
    description: Method used for retrieval (browser, API, scraper, etc.)

  content_hash:
    range: string
    description: Hash of retrieved content for integrity verification (SHA-256)

  http_status_code:
    range: integer
    description: HTTP status code received (200, 404, etc.)

  content_type:
    range: string
    description: MIME type of retrieved content (text/html, application/json, etc.)

  page_title:
    range: string
    description: Title of the web page as retrieved

  last_modified:
    range: datetime
    description: Last-Modified header value from HTTP response

  etag:
    range: string
    description: ETag header value for cache validation

  # NOTE: extraction_confidence imported from global slot ../slots/extraction_confidence.yaml
  # NOTE: extraction_notes imported from global slot ../slots/extraction_notes.yaml

  observed_entities:
    range: uriorcurie
    multivalued: true
    description: Entities extracted from this observation
    # Links the observation (an activity) to the data it produced.
    close_mappings:
      - prov:generated

  previous_observation:
    range: uriorcurie
    description: Previous observation of the same URL for change tracking

  content_changed:
    range: boolean
    description: Whether content changed since previous observation

  # NOTE: archived_at imported from global slot ../slots/archived_at.yaml

  # Inlined list of WebClaim objects (class imported via ./WebClaim).
  claims:
    range: WebClaim
    multivalued: true
    inlined_as_list: true
    description: |
      Individual claims extracted from this web observation.
      Each claim MUST have XPath provenance pointing to the exact
      location in archived HTML where the value appears.

      Claims without XPath are considered FABRICATED and must be removed.

      See WebClaim class for required fields:
      - claim_type, claim_value (what)
      - source_url, retrieved_on (when/where)
      - xpath, html_file, xpath_match_score (verifiable provenance)

# WebObservation: provenance record for one retrieval of web content.
# The description below is Markdown rendered into the generated schema docs.
# Two ontology references in it were corrected:
#   - activity timing uses prov:startedAtTime / prov:endedAtTime
#     (prov:atTime applies to instantaneous events, not activities);
#   - pav:sourceAccessedAt names the consulted source, while
#     pav:sourceAccessedOn carries the date it was consulted.
classes:
  WebObservation:
    class_uri: prov:Activity
    description: |
      A provenance record documenting the retrieval and observation of web content.
      Tracks when, where, and how web-based information was obtained.

      **PURPOSE**:

      WebObservation provides transparent provenance for web-extracted data in the
      heritage custodian ontology. When information about funding calls, institutions,
      or other entities is extracted from web sources, a WebObservation record
      documents:

      - **What**: The source URL and content
      - **When**: Timestamp of retrieval
      - **Who/What**: Agent performing retrieval
      - **How**: Method of extraction
      - **Quality**: Confidence scores and notes

      **PROVENANCE CHAIN**:

      ```
      WebObservation (Activity)
            │
            ├── prov:used ──→ SourceDocument (web page as Entity)
            │                     │
            │                     └── source_uri: https://example.org/call
            │
            ├── prov:generated ──→ CallForApplication (extracted Entity)
            │
            ├── pav:retrievedFrom ──→ URI of source
            ├── pav:retrievedOn ──→ datetime
            └── pav:retrievedBy ──→ agent identifier
      ```

      **PROV-O ALIGNMENT**:

      WebObservation is modelled as a `prov:Activity`:
      - Activities are "something that occurs over a period of time and acts upon
        or with entities"
      - The retrieval of a web page is an activity that uses a SourceDocument
        (the live web page) and generates extracted data

      Key PROV-O properties:
      - `prov:used` - The web page accessed
      - `prov:generated` - The extracted data entity
      - `prov:wasAssociatedWith` - The retrieval agent
      - `prov:startedAtTime` / `prov:endedAtTime` - When the activity occurred

      **PAV ALIGNMENT**:

      PAV (Provenance, Authoring and Versioning) provides more specific properties:
      - `pav:retrievedFrom` - Source URL
      - `pav:retrievedOn` - Retrieval timestamp
      - `pav:retrievedBy` - Retrieval agent
      - `pav:sourceAccessedAt` - Source that was consulted
      - `pav:sourceAccessedOn` - When source was consulted

      **CHANGE DETECTION**:

      WebObservation supports tracking changes over time:
      - Link to `previous_observation` for same URL
      - `content_changed` flag for quick change detection
      - `content_hash` for integrity verification
      - Compare `last_modified` and `etag` across observations

      **ARCHIVAL INTEGRATION**:

      For long-term preservation, link to archived copies:
      - `archived_at` can point to Wayback Machine, Archive.today, etc.
      - Ensures cited web content remains accessible

      **EXAMPLES**:

      1. **EU Funding Portal Observation**
         - source_url: https://ec.europa.eu/.../topic-details/horizon-cl2-2025-heritage-01
         - retrieved_on: 2025-11-29T10:30:00Z
         - retrieved_by: "glam-harvester/1.0"
         - extraction_confidence: 0.95

      2. **Heritage Organisation Website**
         - source_url: https://www.heritagefund.org.uk/funding/medium-grants
         - retrieved_on: 2025-11-28T14:00:00Z
         - content_type: text/html
         - page_title: "Medium grants - Heritage Fund"

      3. **Wikidata SPARQL Query**
         - source_url: https://query.wikidata.org/sparql?query=...
         - retrieval_method: SPARQL API
         - content_type: application/sparql-results+json
         - observed_entities: [Q131381572, Q1375245, ...]

    # Same term as the class_uri of this class, hence an exact match.
    exact_mappings:
      - prov:Activity

    # pav:Retrieval used to be listed here, but PAV defines no such term
    # (it models retrieval through properties such as pav:retrievedFrom),
    # so the dangling IRI has been dropped from the published mappings.
    close_mappings:
      - schema:Action

    # Loosely related terms; these are not claimed to be equivalent.
    related_mappings:
      - prov:Entity
      - pav:sourceAccessedAt
      - dcterms:source

    # Slots attached to WebObservation. The order is kept as-is because
    # generators may emit attributes in list order.
    slots:
      # Identity
      - observation_id
      # Retrieval provenance (source_url and retrieved_on are imported slots)
      - source_url
      - retrieved_on
      - retrieved_by
      - retrieval_method
      # Retrieved content and HTTP response metadata
      - content_hash
      - http_status_code
      - content_type
      - page_title
      - last_modified
      - etag
      # Extraction quality (imported slots)
      - extraction_confidence
      - extraction_notes
      # Extracted entities and change tracking
      - observed_entities
      - previous_observation
      - content_changed
      # Archived copy (imported slot) and inlined WebClaim records
      - archived_at
      - claims

    # Short editorial notes published with the class.
    comments:
      - 'WebObservation is a prov:Activity documenting web content retrieval'
      - 'Integrates PROV-O for provenance and PAV for retrieval-specific properties'
      - 'Supports change detection via content_hash, previous_observation, content_changed'
      - 'Links to archived copies via archived_at for long-term citation'
      - 'observed_entities links observation to extracted data (prov:generated)'

    # External specifications and services referenced by this class.
    see_also:
      - 'https://www.w3.org/TR/prov-o/'
      - 'http://purl.org/pav/'
      - 'https://www.w3.org/TR/prov-dm/'
      - 'https://web.archive.org/'

    # Worked instances of WebObservation. Timestamps stay quoted so YAML
    # loaders keep them as strings instead of native datetime objects.
    examples:
      # 1) EU funding portal page, with a Wayback Machine copy in archived_at.
      - value:
          observation_id: 'https://nde.nl/ontology/hc/observation/web/2025-11-29/eu-horizon-cl2-heritage'
          source_url: 'https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/topic-details/horizon-cl2-2025-heritage-01'
          retrieved_on: '2025-11-29T10:30:00Z'
          retrieved_by: 'claude-assistant'
          retrieval_method: 'exa-search'
          http_status_code: 200
          content_type: 'text/html'
          page_title: 'Horizon Europe - Cultural heritage, cultural and creative industries'
          extraction_confidence: 0.92
          extraction_notes: >-
            Extracted via Exa AI search. Call details structured and well-formatted.
            Budget and deadline clearly stated. Eligibility criteria parsed from
            HTML sections.
          observed_entities:
            - 'https://nde.nl/ontology/hc/call/ec/cl2-2025-heritage-01'
          archived_at: 'https://web.archive.org/web/20251129103000/https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/topic-details/horizon-cl2-2025-heritage-01'
        description: 'Web observation of Horizon Europe CL2 2025 heritage call'

      # 2) Scraped grants page that exercises the change-tracking slots
      #    (content_hash, last_modified, previous_observation, content_changed).
      - value:
          observation_id: 'https://nde.nl/ontology/hc/observation/web/2025-11-28/nlhf-medium-grants'
          source_url: 'https://www.heritagefund.org.uk/funding/medium-grants'
          retrieved_on: '2025-11-28T14:00:00Z'
          retrieved_by: 'glam-harvester/1.0'
          retrieval_method: 'playwright-scraper'
          http_status_code: 200
          content_type: 'text/html'
          page_title: 'Medium grants | The National Lottery Heritage Fund'
          content_hash: 'sha256:a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456'
          last_modified: '2025-11-15T09:00:00Z'
          extraction_confidence: 0.88
          extraction_notes: >-
            Extracted via Playwright scraper. Dynamic content fully rendered.
            Grant range and eligibility parsed from page sections.
          observed_entities:
            - 'https://nde.nl/ontology/hc/call/nlhf/medium-grants-2025-q4'
          previous_observation: 'https://nde.nl/ontology/hc/observation/web/2025-10-15/nlhf-medium-grants'
          content_changed: true
        description: 'Web observation of National Lottery Heritage Fund grants page'

      # 3) Structured API response (Wikidata SPARQL endpoint).
      - value:
          observation_id: 'https://nde.nl/ontology/hc/observation/web/2025-11-29/wikidata-echoes'
          source_url: 'https://query.wikidata.org/sparql'
          retrieved_on: '2025-11-29T09:00:00Z'
          retrieved_by: 'wikidata-mcp-server'
          retrieval_method: 'sparql-api'
          http_status_code: 200
          content_type: 'application/sparql-results+json'
          extraction_confidence: 1.0
          extraction_notes: >-
            SPARQL query for ECHOES/ECCCH Q-number (Q131381572).
            Structured API response with high confidence.
          observed_entities:
            - 'http://www.wikidata.org/entity/Q131381572'
        description: 'SPARQL query observation for Wikidata entity'