# glam/schemas/20251121/linkml/modules/classes/WebObservation.yaml

# LinkML schema module that declares the WebObservation class together with
# the slots that are defined locally in this file.
id: https://nde.nl/ontology/hc/class/WebObservation
name: WebObservation
title: WebObservation Class
# CURIE prefixes available to this module.
# - linkml: used by the `linkml:types` import below.
# - hc: the module's own namespace (also set as default_prefix).
# - prov, pav, schema, dcterms: referenced by the class_uri and the
#   *_mappings of the WebObservation class.
# - NOTE(review): foaf and xsd are not referenced anywhere in the visible part
#   of this file; confirm whether they are still needed.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  pav: http://purl.org/pav/
  foaf: http://xmlns.com/foaf/0.1/
  xsd: http://www.w3.org/2001/XMLSchema#
# Imported modules. Relative paths are sibling class modules (./) and shared
# slot modules (../slots/); the class below lists slots such as archived_at,
# source_url, retrieved_on and content_hash that are not defined in this file.
# NOTE(review): specificity_annotation and template_specificity are presumably
# supplied by ../slots/class_metadata_slots — verify.
imports:
- linkml:types
- ./WebClaim
- ../slots/archived_at
- ../slots/extraction_confidence
- ../slots/extraction_notes
- ../slots/source_url
- ../slots/retrieved_on
- ../slots/content_hash
- ../slots/class_metadata_slots
# Elements declared without an explicit prefix resolve into the hc: namespace.
default_prefix: hc
# Slots defined locally in this module (other slots used by the class are
# pulled in through `imports`).
# NOTE(review): none of these slots declares a slot_uri, so they are not
# explicitly mapped to the PROV-O / PAV properties named in the class
# description — confirm whether that is intended.
slots:
  # --- Identity -------------------------------------------------------------
  observation_id:
    description: Unique identifier for this web observation
    range: uriorcurie
    identifier: true

  # --- How the content was retrieved ------------------------------------------
  retrieved_by:
    description: Agent (person, script, or system) that performed the retrieval
    range: string
  retrieval_method:
    description: Method used for retrieval (browser, API, scraper, etc.)
    range: string

  # --- HTTP response metadata -------------------------------------------------
  http_status_code:
    description: HTTP status code received (200, 404, etc.)
    range: integer
  content_type:
    description: MIME type of retrieved content (text/html, application/json, etc.)
    range: string
  page_title:
    description: Title of the web page as retrieved
    range: string
  last_modified:
    description: Last-Modified header value from HTTP response
    range: datetime
  etag:
    description: ETag header value for cache validation
    range: string

  # --- Extraction results -----------------------------------------------------
  observed_entities:
    description: Entities extracted from this observation
    range: uriorcurie
    multivalued: true

  # --- Change tracking between observations of the same URL -------------------
  previous_observation:
    description: Previous observation of the same URL for change tracking
    range: uriorcurie
  content_changed:
    description: Whether content changed since previous observation
    range: boolean
  # Claims extracted from the observation, stored inline as a list of WebClaim
  # objects (WebClaim is imported from ./WebClaim).
  claims:
    range: WebClaim
    multivalued: true
    inlined_as_list: true
    description: |
      Individual claims extracted from this web observation.
      Each claim MUST have XPath provenance pointing to the exact
      location in archived HTML where the value appears.

      Claims without XPath are considered FABRICATED and must be removed.

      See WebClaim class for required fields:
      - claim_type, claim_value (what)
      - source_url, retrieved_on (where/when)
      - xpath, html_file, xpath_match_score (verifiable provenance)
classes:
  # WebObservation: provenance record for one retrieval of web content,
  # modelled as a prov:Activity (see class_uri).
  WebObservation:
    class_uri: prov:Activity
    # The description is Markdown, kept as a literal block scalar so it reads
    # as authored. Timing of an Activity is expressed with
    # prov:startedAtTime / prov:endedAtTime; prov:atTime belongs to
    # prov:InstantaneousEvent and is therefore not listed. Likewise
    # pav:sourceAccessedAt names the consulted source, while
    # pav:sourceAccessedOn carries the timestamp.
    description: |
      A provenance record documenting the retrieval and observation of web content.
      Tracks when, where, and how web-based information was obtained.

      **PURPOSE**:

      WebObservation provides transparent provenance for web-extracted data in the
      heritage custodian ontology. When information about funding calls, institutions,
      or other entities is extracted from web sources, a WebObservation record
      documents:

      - **What**: The source URL and content
      - **When**: Timestamp of retrieval
      - **Who/What**: Agent performing retrieval
      - **How**: Method of extraction
      - **Quality**: Confidence scores and notes

      **PROVENANCE CHAIN**:

      ```
      WebObservation (Activity)
            │
            ├── prov:used ──→ SourceDocument (web page as Entity)
            │                     │
            │                     └── source_uri: https://example.org/call
            │
            ├── prov:generated ──→ CallForApplication (extracted Entity)
            │
            ├── pav:retrievedFrom ──→ URI of source
            ├── pav:retrievedOn ──→ datetime
            └── pav:retrievedBy ──→ agent identifier
      ```

      **PROV-O ALIGNMENT**:

      WebObservation is modelled as a `prov:Activity`:
      - Activities are "something that occurs over a period of time and acts upon
        or with entities"
      - The retrieval of a web page is an activity that uses a SourceDocument
        (the live web page) and generates extracted data

      Key PROV-O properties:
      - `prov:used` - The web page accessed
      - `prov:generated` - The extracted data entity
      - `prov:wasAssociatedWith` - The retrieval agent
      - `prov:startedAtTime` / `prov:endedAtTime` - When the activity occurred

      **PAV ALIGNMENT**:

      PAV (Provenance, Authoring and Versioning) provides more specific properties:
      - `pav:retrievedFrom` - Source URL
      - `pav:retrievedOn` - Retrieval timestamp
      - `pav:retrievedBy` - Retrieval agent
      - `pav:sourceAccessedAt` - Source that was consulted (see `pav:sourceAccessedOn` for when)

      **CHANGE DETECTION**:

      WebObservation supports tracking changes over time:
      - Link to `previous_observation` for same URL
      - `content_changed` flag for quick change detection
      - `content_hash` for integrity verification
      - Compare `last_modified` and `etag` across observations

      **ARCHIVAL INTEGRATION**:

      For long-term preservation, link to archived copies:
      - `archived_at` can point to Wayback Machine, Archive.today, etc.
      - Ensures cited web content remains accessible

      **EXAMPLES**:

      1. **EU Funding Portal Observation**
         - source_url: https://ec.europa.eu/.../topic-details/horizon-cl2-2025-heritage-01
         - retrieved_on: 2025-11-29T10:30:00Z
         - retrieved_by: "glam-harvester/1.0"
         - extraction_confidence: 0.95

      2. **Heritage Organisation Website**
         - source_url: https://www.heritagefund.org.uk/funding/medium-grants
         - retrieved_on: 2025-11-28T14:00:00Z
         - content_type: text/html
         - page_title: "Medium grants - Heritage Fund"

      3. **Wikidata SPARQL Query**
         - source_url: https://query.wikidata.org/sparql?query=...
         - retrieval_method: SPARQL API
         - content_type: application/sparql-results+json
         - observed_entities: [Q131381572, Q1375245, ...]
    exact_mappings:
    - prov:Activity
    # NOTE(review): confirm that pav:Retrieval is actually defined by the PAV
    # vocabulary; PAV is primarily a set of properties.
    close_mappings:
    - pav:Retrieval
    - schema:Action
    # NOTE(review): pav:sourceAccessedAt and dcterms:source are properties,
    # whereas this element is a class — confirm they are intended here.
    related_mappings:
    - prov:Entity
    - pav:sourceAccessedAt
    - dcterms:source
    # Slots applicable to the class, in alphabetical order. Those not defined
    # in this file's `slots:` section come from the imported slot modules.
    slots:
    - archived_at
    - claims
    - content_changed
    - content_hash
    - content_type
    - etag
    - extraction_confidence
    - extraction_notes
    - http_status_code
    - last_modified
    - observation_id
    - observed_entities
    - page_title
    - previous_observation
    - retrieval_method
    - retrieved_by
    - retrieved_on
    - source_url
    - specificity_annotation
    - template_specificity
    comments:
    - WebObservation is a prov:Activity documenting web content retrieval
    - Integrates PROV-O for provenance and PAV for retrieval-specific properties
    - Supports change detection via content_hash, previous_observation, content_changed
    - Links to archived copies via archived_at for long-term citation
    - observed_entities links observation to extracted data (prov:generated)
    see_also:
    - https://www.w3.org/TR/prov-o/
    - http://purl.org/pav/
    - https://www.w3.org/TR/prov-dm/
    - https://web.archive.org/
    # Worked instances: a funding-portal page, a re-observed grants page with
    # change tracking filled in, and a SPARQL endpoint response.
    examples:
    - value:
        observation_id: https://nde.nl/ontology/hc/observation/web/2025-11-29/eu-horizon-cl2-heritage
        source_url: https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/topic-details/horizon-cl2-2025-heritage-01
        retrieved_on: '2025-11-29T10:30:00Z'
        retrieved_by: claude-assistant
        retrieval_method: exa-search
        http_status_code: 200
        content_type: text/html
        page_title: Horizon Europe - Cultural heritage, cultural and creative industries
        extraction_confidence: 0.92
        extraction_notes: Extracted via Exa AI search. Call details structured and
          well-formatted. Budget and deadline clearly stated. Eligibility criteria
          parsed from HTML sections.
        observed_entities:
        - https://nde.nl/ontology/hc/call/ec/cl2-2025-heritage-01
        archived_at: https://web.archive.org/web/20251129103000/https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/topic-details/horizon-cl2-2025-heritage-01
      description: Web observation of Horizon Europe CL2 2025 heritage call
    - value:
        observation_id: https://nde.nl/ontology/hc/observation/web/2025-11-28/nlhf-medium-grants
        source_url: https://www.heritagefund.org.uk/funding/medium-grants
        retrieved_on: '2025-11-28T14:00:00Z'
        retrieved_by: glam-harvester/1.0
        retrieval_method: playwright-scraper
        http_status_code: 200
        content_type: text/html
        page_title: Medium grants | The National Lottery Heritage Fund
        content_hash: sha256:a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456
        last_modified: '2025-11-15T09:00:00Z'
        extraction_confidence: 0.88
        extraction_notes: Extracted via Playwright scraper. Dynamic content fully
          rendered. Grant range and eligibility parsed from page sections.
        observed_entities:
        - https://nde.nl/ontology/hc/call/nlhf/medium-grants-2025-q4
        previous_observation: https://nde.nl/ontology/hc/observation/web/2025-10-15/nlhf-medium-grants
        content_changed: true
      description: Web observation of National Lottery Heritage Fund grants page
    - value:
        observation_id: https://nde.nl/ontology/hc/observation/web/2025-11-29/wikidata-echoes
        source_url: https://query.wikidata.org/sparql
        retrieved_on: '2025-11-29T09:00:00Z'
        retrieved_by: wikidata-mcp-server
        retrieval_method: sparql-api
        http_status_code: 200
        content_type: application/sparql-results+json
        extraction_confidence: 1.0
        extraction_notes: SPARQL query for ECHOES/ECCCH Q-number (Q131381572). Structured
          API response with high confidence.
        observed_entities:
        - http://www.wikidata.org/entity/Q131381572
      description: SPARQL query observation for Wikidata entity
    # Class-level refinements: both metadata slots get a class range and are
    # inlined.
    # NOTE(review): SpecificityAnnotation and TemplateSpecificityScores are not
    # imported directly in this file; presumably they arrive via
    # ../slots/class_metadata_slots — verify.
    slot_usage:
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true