# glam/schemas/20251121/linkml/modules/classes/WebClaim.yaml

# WebClaim Class
# Models individual claims extracted from web pages with XPath provenance
#
# Created November 2025 to provide:
# - Verifiable provenance for each extracted claim value
# - XPath pointers to exact locations in archived HTML
# - Rejection of claims without verifiable sources
#
# Key principle:
#   Every claim from a webpage MUST have an XPath pointer to the exact
#   location in archived HTML where that value appears.
#   Claims without XPath provenance are considered FABRICATED and must be removed.
#
# This is NOT about "confidence" - it's about VERIFIABILITY.
# Either the claim value exists in the HTML at a specific XPath, or it was
# hallucinated/fabricated by an LLM.
#
# Examples:
#   - full_name claim verified at /html/body/div[1]/h1
#   - email claim verified at /html/body/footer/a[@href="mailto:..."]
#   - description claim verified at /html/head/meta[@name="description"]

# Schema identity: the module's own URI, short name, and display title.
id: 'https://nde.nl/ontology/hc/class/WebClaim'
name: WebClaim
title: 'WebClaim Class - Verifiable Web-Extracted Claims'

# CURIE prefixes available to this module. In this file `prov`, `pav` and
# `schema` are referenced by class_uri / slot_uri / mappings; `dcterms` and
# `xsd` are declared but not referenced in the visible definitions.
prefixes:
  linkml: 'https://w3id.org/linkml/'
  hc: 'https://nde.nl/ontology/hc/'
  schema: 'http://schema.org/'
  dcterms: 'http://purl.org/dc/terms/'
  prov: 'http://www.w3.org/ns/prov#'
  pav: 'http://purl.org/pav/'
  xsd: 'http://www.w3.org/2001/XMLSchema#'

# Built-in LinkML scalar types (string, uri, datetime, float, ...).
imports:
  - linkml:types

# Elements without an explicit URI are minted under the `hc` namespace.
default_prefix: hc

enums:
  # Closed vocabulary for the `claim_type` slot. Values are grouped below by
  # theme for readability only; the order of the values is unchanged.
  ClaimTypeEnum:
    description: >-
      Types of claims that can be extracted from web pages. Each claim
      type has expected source locations in HTML.
    permissible_values:
      # --- Naming and description -------------------------------------
      full_name:
        description: Official full name of the organization
        comments:
          - 'Expected in: <title>, <h1>, logo text, meta og:title'
      short_name:
        description: Abbreviated or short name
        comments:
          - 'Expected in: <h1>, logo, header'
      description:
        description: Description or about text
        comments:
          - "Expected in: <meta name='description'>, about section"

      # --- Contact details --------------------------------------------
      email:
        description: Contact email address
        comments:
          - "Expected in: <a href='mailto:...'>, contact page, footer"
      phone:
        description: Contact phone number
        comments:
          - "Expected in: <a href='tel:...'>, contact page, footer"
      address:
        description: Physical address
        comments:
          - 'Expected in: contact page, footer, structured data'
      website:
        description: Website URL (may differ from source_url)
        comments:
          - 'Expected in: contact page, links'

      # --- Social media -----------------------------------------------
      social_media:
        description: Social media profile links
        comments:
          - 'Expected in: footer, header, contact page'
      facebook:
        description: Facebook page URL
      twitter:
        description: Twitter/X profile URL
      instagram:
        description: Instagram profile URL
      linkedin:
        description: LinkedIn page URL
      youtube:
        description: YouTube channel URL

      # --- Visitor information ----------------------------------------
      opening_hours:
        description: Opening hours / visiting times
        comments:
          - 'Expected in: visit page, contact page, structured data'
      admission_info:
        description: Admission prices or policies
      accessibility_info:
        description: Accessibility information

      # --- Collection statistics --------------------------------------
      collection_count:
        description: Number of items in collection
      beeldbank_total_photos:
        description: Total photos in image bank (beeldbank)
      beeldbank_described_photos:
        description: Number of described photos in image bank

      # --- Organizational facts and identifiers -----------------------
      founding_date:
        description: When organization was founded
      kvk_number:
        description: Dutch Chamber of Commerce number
      isil_code:
        description: ISIL identifier
      wikidata_id:
        description: Wikidata Q-number
      parent_organization:
        description: Parent organization reference

slots:
  # Key that uniquely names one extracted claim.
  claim_id:
    description: Unique identifier for this claim
    range: uriorcurie
    identifier: true

  # Which kind of fact the claim asserts; constrained to ClaimTypeEnum.
  claim_type:
    description: Type of claim being made
    range: ClaimTypeEnum
    required: true

  # The value that was extracted from the page.
  claim_value:
    description: The extracted value
    range: string
    required: true

  # Page the claim came from; mapped onto the PAV vocabulary.
  source_url:
    description: URL of the web page this claim was extracted from
    slot_uri: pav:retrievedFrom
    range: uri
    required: true

  # Archive time of the page; mapped onto the PAV vocabulary.
  retrieved_on:
    description: >-
      Timestamp when the web page was archived. ISO 8601 format
      with timezone (UTC preferred).
    slot_uri: pav:retrievedOn
    range: datetime
    required: true

  # Pointer into the archived HTML that makes the claim verifiable.
  xpath:
    range: string
    required: true
    # Literal block scalar (|-): a folded scalar (>-) would join the
    # "Format:" and "Example:" lines into a single run-on line.
    description: |-
      XPath to the element containing this claim value.
      This is the CRITICAL provenance field - without it, a claim is unverifiable.

      Format: Standard XPath 1.0 expression
      Example: /html[1]/body[1]/div[6]/div[1]/h1[1]
    # Only requires that the expression starts with "/".
    pattern: "^/.*"

  # Location of the archived HTML snapshot that contains the claim.
  html_file:
    description: >-
      Relative path to the archived HTML file containing this
      claim. Path is relative to the entry file.

      Example: web/0021/historischeverenigingnijeveen.nl/rendered.html
    range: string
    required: true

  # Similarity between claim_value and the text found at the XPath,
  # bounded to the closed interval [0.0, 1.0].
  xpath_match_score:
    range: float
    required: true
    minimum_value: 0.0
    maximum_value: 1.0
    # Literal block scalar (|-): a folded scalar (>-) would collapse the
    # bullet list below into one line. Bands are written as half-open
    # ranges so that every score falls into exactly one band.
    description: |-
      Match quality between claim_value and the text at the XPath location.

      - 1.0 = Exact match (claim_value appears verbatim)
      - 0.8 to <1.0 = Near match after normalization (whitespace, case)
      - 0.5 to <0.8 = Substring match (claim_value is part of element text)
      - <0.5 = Weak match (claim may need verification)

      Claims with score < 0.3 should be flagged for manual review.

  # Optional: the raw element text that the claim was compared against.
  xpath_matched_text:
    description: >-
      The actual text found at the XPath location. Useful when
      xpath_match_score < 1.0 to show what was matched.
    range: string

  # Optional: when extraction ran, as distinct from when the page was archived.
  extraction_timestamp:
    description: >-
      When the claim was extracted from the archived HTML. May differ
      from retrieved_on if extraction happens later.
    range: datetime

  # Optional: free-text label of the extraction technique.
  extraction_method:
    description: >-
      Method used to extract this claim. Examples:
      "xpath_exact_match", "text_search", "css_selector"
    range: string

  # Optional: free-text remarks attached to a single claim.
  claim_notes:
    description: Notes about this specific claim extraction
    range: string

classes:
  # One extracted value together with the XPath / archived-file provenance
  # that lets a reader check it against the stored HTML.
  WebClaim:
    class_uri: prov:Entity
    # Literal block scalar (|-) is required here: a folded scalar (>-) joins
    # the numbered/bulleted list items into single lines and merges each
    # opening ```yaml fence with the first line of its snippet, which breaks
    # the Markdown in generated documentation.
    description: |-
      A single verifiable claim extracted from a web page.

      **CORE PRINCIPLE: XPATH OR REMOVE**

      Every claim extracted from a webpage MUST have:

      1. `xpath` - pointing to exact element in archived HTML
      2. `html_file` - path to the archived HTML
      3. `xpath_match_score` - quality of the match

      Claims without these fields are FABRICATED and must be REMOVED.

      **WHY NOT CONFIDENCE SCORES?**

      Confidence scores like `0.95` are MEANINGLESS because:

      - There is NO methodology defining what these numbers mean
      - They cannot be verified or reproduced
      - They give false impression of rigor
      - They mask the fact that claims may be fabricated

      Instead, we use VERIFIABLE provenance:

      - XPath points to exact location
      - Archived HTML can be inspected
      - Match score is computed, not estimated

      **WORKFLOW**:

      1. Archive website using Playwright:
         `python scripts/fetch_website_playwright.py 0021 https://example.org/`

      2. Add XPath provenance to claims:
         `python scripts/add_xpath_provenance.py`

      3. Script REMOVES claims that cannot be verified
         (stores in `removed_unverified_claims` for audit)

      **EXAMPLES**:

      CORRECT (Verifiable):

      ```yaml
      - claim_type: full_name
        claim_value: Historische Vereniging Nijeveen
        source_url: https://historischeverenigingnijeveen.nl/
        retrieved_on: "2025-11-29T12:28:00Z"
        xpath: /html[1]/body[1]/div[6]/div[1]/h1[1]
        html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
        xpath_match_score: 1.0
      ```

      WRONG (Fabricated - Must Be Removed):

      ```yaml
      - claim_type: full_name
        claim_value: Historische Vereniging Nijeveen
        confidence: 0.95  # ← NO! This is meaningless without XPath
      ```

    # Ontology alignment: the class is mapped exactly to PROV's Entity and
    # closely to schema.org's PropertyValue.
    exact_mappings:
      - 'prov:Entity'

    close_mappings:
      - 'schema:PropertyValue'

    slots:
      # Identity and content of the claim
      - claim_id
      - claim_type
      - claim_value
      # Where and when the page was obtained
      - source_url
      - retrieved_on
      # XPath provenance
      - xpath
      - html_file
      - xpath_match_score
      - xpath_matched_text
      # Extraction bookkeeping
      - extraction_timestamp
      - extraction_method
      - claim_notes

    # Class-local refinements of the shared slot definitions. Multi-line
    # descriptions use literal block scalars (|-) so that bullet lists and
    # "Format:/Example:" lines keep their line breaks; folded scalars (>-)
    # would merge them into run-on lines.
    slot_usage:
      claim_type:
        required: true
        description: |-
          Type of claim. See ClaimTypeEnum for allowed values.

          Each claim type has expected source locations:

          - full_name: <title>, <h1>, logo, og:title
          - description: <meta name="description">, about section
          - email: <a href="mailto:...">, contact page
          - phone: <a href="tel:...">, contact page
          - address: footer, contact page, JSON-LD

      claim_value:
        required: true
        description: >-
          The extracted value. Must appear at the XPath location.

      xpath:
        required: true
        description: |-
          XPath to element containing claim_value.

          **THIS IS THE CRITICAL PROVENANCE FIELD.**

          Without an XPath, a claim is unverifiable and must be removed.

          Format: Standard XPath 1.0 expression
          Example: /html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]

      html_file:
        required: true
        description: |-
          Path to archived HTML file (relative to entry file).

          Standard structure:
          web/{entry_number}/{domain}/rendered.html

          Example: web/0021/historischeverenigingnijeveen.nl/rendered.html

      xpath_match_score:
        required: true
        description: |-
          Match quality between claim_value and text at XPath.

          This is COMPUTED, not estimated:

          - 1.0: claim_value == element_text (exact)
          - <1.0: len(claim_value) / len(element_text) for substrings

          Claims with score < 0.3 should be flagged for review.

    # Policy rule for claims that lack an XPath.
    # The precondition matches instances whose `xpath` slot is absent.
    # NOTE(review): `postconditions` carries only a description and no slot
    # conditions, so this rule appears to document the removal policy rather
    # than add a machine-checked constraint - confirm. `xpath` is also
    # declared `required: true` in this file.
    rules:
      - preconditions:
          slot_conditions:
            xpath:
              value_presence: ABSENT
        postconditions:
          description: "Claims without XPath must be removed as unverifiable"

    # Short reminders for schema users.
    comments:
      - 'WebClaim requires XPath provenance - claims without it are fabricated'
      - 'Match score is COMPUTED from actual text comparison, not estimated'
      - 'Archived HTML files are stored in web/{entry}/{domain}/ directories'
      - 'Use scripts/add_xpath_provenance.py to add XPath to existing claims'

    # Related rule document and helper scripts (paths as given in this file).
    see_also:
      - '.opencode/WEB_OBSERVATION_PROVENANCE_RULES.md'
      - 'scripts/fetch_website_playwright.py'
      - 'scripts/add_xpath_provenance.py'

    # Illustrative WebClaim instances.
    # NOTE(review): `claim_id` is declared `identifier: true` in this file but
    # none of these examples set it - confirm whether IDs are assigned elsewhere.
    examples:
      - value:
          claim_type: full_name
          claim_value: Historische Vereniging Nijeveen
          source_url: "https://historischeverenigingnijeveen.nl/"
          retrieved_on: "2025-11-29T12:28:00Z"
          xpath: "/html[1]/body[1]/div[6]/div[1]/h1[1]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 1.0
        description: "Exact match claim for organization name"

      - value:
          claim_type: beeldbank_total_photos
          claim_value: "6253"
          source_url: "https://historischeverenigingnijeveen.nl/nl/hvn"
          retrieved_on: "2025-11-29T12:28:00Z"
          xpath: "/html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[1]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 1.0
        description: "Collection count claim from image bank statistics"

      - value:
          claim_type: facebook
          claim_value: "https://www.facebook.com/HistorischeVerenigingNijeveen/"
          source_url: "https://historischeverenigingnijeveen.nl/"
          retrieved_on: "2025-11-29T12:28:00Z"
          xpath: "/html[1]/body[1]/footer[1]/div[1]/a[3]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 1.0
        description: "Social media link claim"

      # Substring case. The matched text must contain claim_value, and the
      # score follows the ratio documented for xpath_match_score in this
      # class's slot_usage: len(claim_value) / len(element_text) = 45 / 74.
      # retrieved_on is aligned with the other examples because they all
      # point at the same archived html_file.
      - value:
          claim_type: website
          claim_value: "https://www.historischeverenigingnijeveen.nl/"
          source_url: "https://historischeverenigingnijeveen.nl/nl/hvn"
          retrieved_on: "2025-11-29T12:28:00Z"
          xpath: "/html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 0.608
          xpath_matched_text: "Kijk voor meer informatie op https://www.historischeverenigingnijeveen.nl/"
        description: "Substring match - URL found within longer text"