# glam/schemas/20251121/linkml/modules/classes/WebClaim.yaml

# LinkML schema module that declares the WebClaim class.
# `id` is the module's URI; `name`/`title` are its short and display names.
id: https://nde.nl/ontology/hc/class/WebClaim
name: WebClaim
title: WebClaim Class - Verifiable Web-Extracted Claims
# Prefix map used to expand CURIEs such as `prov:Entity` in this module.
# NOTE(review): dcterms, pav, xsd and nif are not referenced by any CURIE in
# this file as reviewed — confirm whether they are still needed.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  pav: http://purl.org/pav/
  xsd: http://www.w3.org/2001/XMLSchema#
  oa: http://www.w3.org/ns/oa#
  nif: http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#
# Modules pulled in by this one. Paths are relative to this file; each slot
# listed under WebClaim.slots has a matching ../slots/ import here.
imports:
  - linkml:types
  # Slot imports
  - ../slots/source_url
  - ../slots/retrieved_on
  - ../slots/xpath
  - ../slots/html_file
  - ../slots/claim_id
  - ../slots/claim_type
  - ../slots/claim_value
  - ../slots/extraction_timestamp
  - ../slots/specificity_annotation
  - ../slots/template_specificity
  - ../slots/xpath_match_score
  - ../slots/xpath_matched_text
  - ../slots/claim_extraction_method
  - ../slots/pipeline_stage
  - ../slots/claim_note
  # Enum imports
  - ../enums/ClaimTypeEnum
  - ../enums/ExtractionPipelineStageEnum
  # Class imports (for slot ranges)
  - ./SpecificityAnnotation
  - ./TemplateSpecificityScores
# Prefix applied to element names that are declared without an explicit CURIE.
default_prefix: hc

classes:
  # WebClaim: a single claim extracted from an archived web page, modelled as
  # a prov:Entity. The description below is Markdown and is the authoritative
  # statement of the "XPath or remove" provenance policy.
  WebClaim:
    class_uri: prov:Entity
    # The description is a literal block scalar (`|`): line breaks are kept
    # verbatim and the value ends with exactly one newline. No line inside it
    # may carry trailing whitespace, otherwise YAML dumpers fall back to an
    # escaped double-quoted string.
    # NOTE(review): the text cites convention v1.7.0, while see_also points at
    # entity_annotation_rules_v1.6.0_unified.yaml — confirm which is current.
    # NOTE(review): Stage 4 says "Assign link confidence scores" although the
    # text above it rejects confidence scores — confirm the intended wording.
    description: |
      A single verifiable claim extracted from a web page.

      **CORE PRINCIPLE: XPATH OR REMOVE**

      Every claim extracted from a webpage MUST have:
      1. `xpath` - pointing to exact element in archived HTML
      2. `html_file` - path to the archived HTML (Playwright-rendered, NOT WARC)
      3. `xpath_match_score` - quality of the match

      Claims without these fields are FABRICATED and must be REMOVED.

      **ARCHIVE FORMAT: PLAYWRIGHT-RENDERED HTML**

      We use Playwright (headless browser) to:
      1. Navigate to the target URL
      2. Wait for JavaScript to fully render
      3. Save the complete DOM as an HTML file

      This differs from WARC archives which capture raw HTTP responses.
      Playwright rendering captures the final DOM state including:
      - JavaScript-rendered content
      - Dynamically loaded elements
      - Client-side state

      **WHY NOT CONFIDENCE SCORES?**

      Confidence scores like `0.95` are MEANINGLESS because:
      - There is NO methodology defining what these numbers mean
      - They cannot be verified or reproduced
      - They give false impression of rigor
      - They mask the fact that claims may be fabricated

      Instead, we use VERIFIABLE provenance:
      - XPath points to exact location
      - Archived HTML can be inspected
      - Match score is computed, not estimated

      **EXTRACTION PIPELINE (4 Stages)**

      Following the GLAM-NER Unified Entity Annotation Convention v1.7.0:

      1. **Entity Recognition** (Stage 1)
         - Detect named entities in text
         - Classify by hypernym type (AGT, GRP, TOP, TMP, etc.)
         - Methods: spaCy NER, transformer models, regex patterns

      2. **Layout Analysis** (Stage 2)
         - Analyze document structure (headers, paragraphs, tables)
         - Assign DOC hypernym types (DOC.HDR, DOC.PAR, DOC.TBL)
         - Generate XPath provenance for each claim location

      3. **Entity Resolution** (Stage 3)
         - Disambiguate entity mentions
         - Merge coreferences and name variants
         - Produce canonical entity clusters

      4. **Entity Linking** (Stage 4)
         - Link resolved entities to knowledge bases
         - Connect to Wikidata, ISIL, GeoNames, etc.
         - Assign link confidence scores

      **WORKFLOW**:

      1. Archive website using Playwright:
         `python scripts/fetch_website_playwright.py <entry_number> <url>`

         This saves: web/{entry_number}/{domain}/rendered.html

      2. Add XPath provenance to claims:
         `python scripts/add_xpath_provenance.py`

      3. Script REMOVES claims that cannot be verified
         (stores in `removed_unverified_claims` for audit)

      **EXAMPLES**:

      CORRECT (Verifiable):
      ```yaml
      - claim_type: full_name
        claim_value: Historische Vereniging Nijeveen
        source_url: https://historischeverenigingnijeveen.nl/
        retrieved_on: "2025-11-29T12:28:00Z"
        xpath: /html[1]/body[1]/div[6]/div[1]/h1[1]
        html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
        xpath_match_score: 1.0
        pipeline_stage: layout_analysis
      ```

      WRONG (Fabricated - Must Be Removed):
      ```yaml
      - claim_type: full_name
        claim_value: Historische Vereniging Nijeveen
        confidence: 0.95  # ← NO! This is meaningless without XPath
      ```
    # Mapping strength: identical to prov:Entity; close to, but not the same
    # as, schema:PropertyValue and oa:Annotation.
    exact_mappings:
      - prov:Entity
    close_mappings:
      - schema:PropertyValue
      - oa:Annotation
    # Slots carried by WebClaim, listed alphabetically. Each one is defined in
    # its own module under ../slots/ (see imports); class-specific constraints
    # are layered on in slot_usage.
    slots:
      - claim_extraction_method
      - claim_id
      - claim_note
      - claim_type
      - claim_value
      - extraction_timestamp
      - html_file
      - pipeline_stage
      - retrieved_on
      - source_url
      - specificity_annotation
      - template_specificity
      - xpath
      - xpath_match_score
      - xpath_matched_text
    # Class-specific refinements of the imported slots.
    # Marked required here: claim_type, claim_value, source_url, retrieved_on,
    # xpath, html_file, xpath_match_score.
    # Slots listed under `slots` but not refined here (claim_extraction_method,
    # claim_id, claim_note, extraction_timestamp, xpath_matched_text) keep
    # whatever their own slot modules declare.
    slot_usage:
      claim_type:
        required: true
        description: |
          Type of claim. See ClaimTypeEnum for allowed values.

          Each claim type has expected source locations:
          - full_name: <title>, <h1>, logo, og:title
          - description: <meta name="description">, about section
          - email: <a href="mailto:...">, contact page
          - phone: <a href="tel:...">, contact page
          - address: footer, contact page, JSON-LD
      claim_value:
        required: true
        description: |
          The extracted value. Must appear at the XPath location.
      source_url:
        required: true
        description: |
          URL of the web page this claim was extracted from.
          Required for all claims - enables verification of provenance.
      retrieved_on:
        required: true
        description: |
          Timestamp when the web page was archived using Playwright.
          ISO 8601 format with timezone (UTC preferred).
      # xpath, html_file and xpath_match_score are the three provenance fields
      # the class description requires on every claim.
      xpath:
        required: true
        description: |
          XPath to element containing claim_value.

          **THIS IS THE CRITICAL PROVENANCE FIELD.**

          Without an XPath, a claim is unverifiable and must be removed.

          Format: Standard XPath 1.0 expression
          Example: /html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]
      html_file:
        required: true
        description: |
          Path to archived HTML file (relative to entry file).

          Archive format: Playwright-rendered HTML (NOT WARC).
          Playwright captures the fully rendered DOM after JavaScript execution.

          Standard structure:
          web/{entry_number}/{domain}/rendered.html

          Example: web/0021/historischeverenigingnijeveen.nl/rendered.html
      # NOTE(review): no minimum_value/maximum_value is set here although the
      # description implies a score no greater than 1.0 — confirm whether the
      # slot module bounds it. The 0.3 review threshold is stated in prose only.
      xpath_match_score:
        required: true
        description: |
          Match quality between claim_value and text at XPath.

          This is COMPUTED, not estimated:
          - 1.0: claim_value == element_text (exact)
          - <1.0: len(claim_value) / len(element_text) for substrings

          Claims with score < 0.3 should be flagged for review.
      # Optional in this class: only the description is overridden.
      pipeline_stage:
        description: |
          The extraction pipeline stage that produced this claim.

          Most WebClaims are produced during:
          - layout_analysis (Stage 2): When XPath locations are identified
          - entity_linking (Stage 4): When claims link to knowledge bases
      # The two specificity slots take class-valued ranges and are embedded
      # inline in the claim rather than referenced.
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    # Conditional rule whose precondition matches claims with no xpath value.
    # The postcondition holds only a description (no slot_conditions), so as
    # written it records the removal policy in prose.
    # NOTE(review): presumably enforcement comes from `required: true` on
    # xpath in slot_usage and from the provenance script — confirm.
    rules:
      - preconditions:
          slot_conditions:
            xpath:
              value_presence: ABSENT
        postconditions:
          description: Claims without XPath must be removed as unverifiable
    # Short editorial notes that summarise the class description.
    comments:
      - WebClaim requires XPath provenance - claims without it are fabricated
      - Match score is COMPUTED from actual text comparison, not estimated
      - Archived HTML files are Playwright-rendered (NOT WARC format)
      - Use scripts/fetch_website_playwright.py to archive websites
      - Use scripts/add_xpath_provenance.py to add XPath to existing claims
      - 'Follows 4-stage GLAM-NER pipeline: recognition → layout → resolution → linking'
    # Related rules, scripts and convention documents, given as paths.
    # NOTE(review): the convention file below is v1.6.0, while the class
    # description cites convention v1.7.0 — confirm which version applies.
    see_also:
      - rules/WEB_OBSERVATION_PROVENANCE_RULES.md
      - scripts/fetch_website_playwright.py
      - scripts/add_xpath_provenance.py
      - docs/convention/schema/20251202/entity_annotation_rules_v1.6.0_unified.yaml
    # Worked example instances. All four set every slot that slot_usage marks
    # as required; timestamps and the numeric-looking claim_value are quoted so
    # they stay strings.
    examples:
      # Exact match: organization name taken from an <h1>.
      - value:
          claim_type: full_name
          claim_value: Historische Vereniging Nijeveen
          source_url: https://historischeverenigingnijeveen.nl/
          retrieved_on: '2025-11-29T12:28:00Z'
          xpath: /html[1]/body[1]/div[6]/div[1]/h1[1]
          html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
          xpath_match_score: 1.0
          pipeline_stage: layout_analysis
        description: Exact match claim for organization name
      # Exact match: a count kept as the string '6253'.
      - value:
          claim_type: beeldbank_total_photos
          claim_value: '6253'
          source_url: https://historischeverenigingnijeveen.nl/nl/hvn
          retrieved_on: '2025-11-29T12:28:00Z'
          xpath: /html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[1]
          html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
          xpath_match_score: 1.0
          pipeline_stage: layout_analysis
        description: Collection count claim from image bank statistics
      # Exact match attributed to the entity_linking stage.
      - value:
          claim_type: facebook
          claim_value: https://www.facebook.com/HistorischeVerenigingNijeveen/
          source_url: https://historischeverenigingnijeveen.nl/
          retrieved_on: '2025-11-29T12:28:00Z'
          xpath: /html[1]/body[1]/footer[1]/div[1]/a[3]
          html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
          xpath_match_score: 1.0
          pipeline_stage: entity_linking
        description: Social media link claim - entity linking stage
      # Partial match: the only example with a score below 1.0 and with
      # xpath_matched_text, which is a plain scalar folded over two lines.
      # NOTE(review): the figures do not appear to agree with the formula in
      # the xpath_match_score description: claim_value is not a substring of
      # the matched text shown, and the length ratio of the two strings is
      # roughly 0.71 rather than 0.561. retrieved_on is also a day earlier
      # than in the other examples for the same html_file — confirm the data.
      - value:
          claim_type: website
          claim_value: https://www.historischeverenigingnijeveen.nl/
          source_url: https://historischeverenigingnijeveen.nl/nl/hvn
          retrieved_on: '2025-11-28T12:00:00Z'
          xpath: /html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]
          html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
          xpath_match_score: 0.561
          xpath_matched_text: De Historische Vereniging Nijeveen is ook te vinden op
            Facebook
          pipeline_stage: layout_analysis
        description: Substring match - URL found within longer text