# glam/schemas/20251121/linkml/modules/classes/WebClaim.yaml

# Schema identity for the WebClaim LinkML module.
id: https://nde.nl/ontology/hc/class/WebClaim
name: WebClaim
title: WebClaim Class - Verifiable Web-Extracted Claims
# CURIE prefixes usable in this module.
# NOTE(review): dcterms, pav and xsd are declared but not referenced by any
# definition visible in this file - confirm whether they are still needed.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  pav: http://purl.org/pav/
  xsd: http://www.w3.org/2001/XMLSchema#
  oa: http://www.w3.org/ns/oa#
  nif: http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#
# Slots used by WebClaim that are defined in sibling modules.
# NOTE(review): specificity_annotation and template_specificity are not imported
# by name; presumably they come from class_metadata_slots - verify.
imports:
- linkml:types
- ../slots/source_url
- ../slots/retrieved_on
- ../slots/xpath
- ../slots/html_file
- ../slots/claim_id
- ../slots/claim_type
- ../slots/claim_value
- ../slots/extraction_timestamp
- ../slots/class_metadata_slots
# Prefix applied to elements of this module that do not declare their own URI.
default_prefix: hc
enums:
  # Vocabulary of claim kinds (referenced from the claim_type slot usage).
  # The grouping comments below are reader aids only; they carry no schema
  # meaning. Strings containing apostrophes are double-quoted so that no
  # doubled-quote escaping is needed.
  ClaimTypeEnum:
    description: |
      Types of claims that can be extracted from web pages.
      Each claim type has expected source locations in HTML.
    permissible_values:
      # --- Names and descriptive text ---
      full_name:
        description: Official full name of the organization
        comments:
        - 'Expected in: <title>, <h1>, logo text, meta og:title'
      short_name:
        description: Abbreviated or short name
        comments:
        - 'Expected in: <h1>, logo, header'
      description:
        description: Description or about text
        comments:
        - "Expected in: <meta name='description'>, about section"
      # --- Contact details ---
      email:
        description: Contact email address
        comments:
        - "Expected in: <a href='mailto:...'>, contact page, footer"
      phone:
        description: Contact phone number
        comments:
        - "Expected in: <a href='tel:...'>, contact page, footer"
      address:
        description: Physical address
        comments:
        - 'Expected in: contact page, footer, structured data'
      website:
        description: Website URL (may differ from source_url)
        comments:
        - 'Expected in: contact page, links'
      # --- Social media links ---
      social_media:
        description: Social media profile links
        comments:
        - 'Expected in: footer, header, contact page'
      facebook:
        description: Facebook page URL
      twitter:
        description: Twitter/X profile URL
      instagram:
        description: Instagram profile URL
      linkedin:
        description: LinkedIn page URL
      youtube:
        description: YouTube channel URL
      # --- Image URLs ---
      logo_url:
        description: Organization logo image URL
        comments:
        - "Expected in: <img> in header/nav, <link rel='icon'>, og:image meta tag"
        - 'Look for: logo, brand, emblem, icon in class/id/alt attributes'
        - 'Priority: SVG > PNG > JPG for quality'
      favicon_url:
        description: Website favicon URL
        comments:
        - "Expected in: <link rel='icon'>, <link rel='shortcut icon'>"
        - 'Usually: /favicon.ico, /favicon.png'
      og_image_url:
        description: Open Graph image URL (social sharing image)
        comments:
        - "Expected in: <meta property='og:image'>"
        - Often the logo or a representative image
      # --- Visitor information ---
      opening_hours:
        description: Opening hours / visiting times
        comments:
        - 'Expected in: visit page, contact page, structured data'
      admission_info:
        description: Admission prices or policies
      accessibility_info:
        description: Accessibility information
      # --- Collection statistics ---
      collection_count:
        description: Number of items in collection
      beeldbank_total_photos:
        description: Total photos in image bank (beeldbank)
      beeldbank_described_photos:
        description: Number of described photos in image bank
      # --- Organizational facts and identifiers ---
      founding_date:
        description: When organization was founded
      kvk_number:
        description: Dutch Chamber of Commerce number
      isil_code:
        description: ISIL identifier
      wikidata_id:
        description: Wikidata Q-number
      parent_organization:
        description: Parent organization reference
      # --- Reports, financial and policy documents ---
      annual_report_url:
        description: Link to annual report PDF (jaarverslag)
        comments:
        - 'Expected in: /over-ons/jaarverslagen/, /organisatie/jaarverslagen/'
        - 'Dutch: jaarverslag, publieksjaarverslag'
        - 'English: annual report'
      financial_statement_url:
        description: Link to annual financial statement document (jaarstukken, jaarrekening)
        comments:
        - 'Expected in: /organisatie/jaarverslagen/, /over-ons/documenten/'
        - 'Dutch: jaarstukken, jaarrekening'
        - 'English: financial statements, annual accounts'
      anbi_publication_url:
        description: ANBI publication link (Dutch charity tax status requirement)
        comments:
        - ANBI = Algemeen Nut Beogende Instelling (public benefit institution)
        - Required for Dutch non-profits to maintain tax-exempt status
        - 'Expected in: footer, /anbi/, /over-ons/'
      policy_document_url:
        description: Multi-year policy document link (meerjarenbeleid, beleidsplan)
        comments:
        - 'Dutch: meerjarenbeleid, beleidsplan, strategisch plan'
        - 'English: multi-year policy, strategic plan'
      financial_document_year:
        description: Fiscal year for a financial document (extracted from filename/context)
        comments:
        - Extracted from PDF filename or link text
        - 'Format: YYYY (e.g., 2024)'
  # Stage vocabulary used as the range of the pipeline_stage slot.
  # NOTE(review): the description cites convention v1.7.0 while the referenced
  # file name carries v1.6.0 - confirm which version is intended.
  # NOTE(review): the `meaning` mappings are heterogeneous (nif:String,
  # oa:Annotation, prov:Activity) - confirm these are the intended terms.
  ExtractionPipelineStageEnum:
    description: |
      The four stages of the entity extraction pipeline, following the
      GLAM-NER Unified Entity Annotation Convention v1.7.0.

      Reference: docs/convention/schema/20251202/entity_annotation_rules_v1.6.0_unified.yaml
    permissible_values:
      # Stage 1
      entity_recognition:
        description: |
          Stage 1: Detect and classify named entities in text.
          Identifies entity spans and assigns hypernym types (AGT, GRP, TOP, etc.)
        meaning: nif:String
        comments:
        - 'Input: Raw text or rendered HTML'
        - 'Output: Entity spans with type classifications'
        - 'Models: spaCy NER, transformer-based NER, pattern matching'
      # Stage 2
      layout_analysis:
        description: |
          Stage 2: Analyze document structure and region semantics.
          Identifies headers, paragraphs, tables, navigation, sidebars, etc.
          Uses DOC hypernym types (DOC.HDR, DOC.PAR, DOC.TBL, etc.)
        meaning: oa:Annotation
        comments:
        - 'Input: HTML DOM or PAGE-XML'
        - 'Output: Document region annotations with semantic roles'
        - XPath provenance links claims to specific DOM locations
      # Stage 3 (consumes the stage 1 output)
      entity_resolution:
        description: |
          Stage 3: Disambiguate and merge entity mentions.
          Resolves coreferences, handles name variants, deduplicates entities.
        meaning: prov:Activity
        comments:
        - 'Input: Entity spans from stage 1'
        - 'Output: Resolved entity clusters with canonical forms'
        - 'Methods: String similarity, context matching, ML classifiers'
      # Stage 4 (consumes the stage 3 output)
      entity_linking:
        description: |
          Stage 4: Link entities to knowledge bases.
          Connects resolved entities to Wikidata, ISIL registry, GeoNames, etc.
        meaning: oa:Annotation
        comments:
        - 'Input: Resolved entities from stage 3'
        - 'Output: Knowledge base URIs (Q-numbers, ISIL codes, GeoNames IDs)'
        - 'Confidence: Link confidence scores from disambiguation'
# Slots defined locally in this module (the remaining WebClaim slots are imported).
slots:
  # Required float constrained to the closed interval [0.0, 1.0].
  xpath_match_score:
    range: float
    required: true
    minimum_value: 0.0
    maximum_value: 1.0
    description: |
      Match quality between claim_value and the text at the XPath location.

      - 1.0 = Exact match (claim_value appears verbatim)
      - 0.8-0.99 = Near match after normalization (whitespace, case)
      - 0.5-0.79 = Substring match (claim_value is part of element text)
      - <0.5 = Weak match (claim may need verification)

      Claims with score < 0.3 should be flagged for manual review.
  # Optional companion to xpath_match_score: the text actually found.
  xpath_matched_text:
    range: string
    description: |
      The actual text found at the XPath location.
      Useful when xpath_match_score < 1.0 to show what was matched.
  # Free-text method label; the description lists example values only, so the
  # schema does not restrict this slot to them.
  claim_extraction_method:
    range: string
    description: |
      Method used to extract this claim.
      Examples: "xpath_exact_match", "text_search", "css_selector", "json_ld_parse"
  # Constrained to the values of ExtractionPipelineStageEnum.
  pipeline_stage:
    range: ExtractionPipelineStageEnum
    description: |
      Which stage of the extraction pipeline produced this claim.
      Following the 4-stage GLAM-NER pipeline:
      1. entity_recognition - NER detection
      2. layout_analysis - Document structure analysis
      3. entity_resolution - Disambiguation and merging
      4. entity_linking - Knowledge base linking
  # Optional free-text remarks.
  claim_notes:
    range: string
    description: Notes about this specific claim extraction
classes:
  # A claim extracted from a web page, mapped to prov:Entity.
  WebClaim:
    class_uri: prov:Entity
    # Stored as a literal block scalar so the Markdown reads as written.
    # The previous escaped double-quoted form had lost three line-continuation
    # backslashes (injecting double spaces into list items 1-3 of the core
    # principle) and embedded a whitespace-only line in the workflow section;
    # both are normalized here. The wording is otherwise unchanged.
    description: |
      A single verifiable claim extracted from a web page.

      **CORE PRINCIPLE: XPATH OR REMOVE**

      Every claim extracted from a webpage MUST have:
      1. `xpath` - pointing to exact element in archived HTML
      2. `html_file` - path to the archived HTML (Playwright-rendered, NOT WARC)
      3. `xpath_match_score` - quality of the match

      Claims without these fields are FABRICATED and must be REMOVED.

      **ARCHIVE FORMAT: PLAYWRIGHT-RENDERED HTML**

      We use Playwright (headless browser) to:
      1. Navigate to the target URL
      2. Wait for JavaScript to fully render
      3. Save the complete DOM as an HTML file

      This differs from WARC archives which capture raw HTTP responses.
      Playwright rendering captures the final DOM state including:
      - JavaScript-rendered content
      - Dynamically loaded elements
      - Client-side state

      **WHY NOT CONFIDENCE SCORES?**

      Confidence scores like `0.95` are MEANINGLESS because:
      - There is NO methodology defining what these numbers mean
      - They cannot be verified or reproduced
      - They give false impression of rigor
      - They mask the fact that claims may be fabricated

      Instead, we use VERIFIABLE provenance:
      - XPath points to exact location
      - Archived HTML can be inspected
      - Match score is computed, not estimated

      **EXTRACTION PIPELINE (4 Stages)**

      Following the GLAM-NER Unified Entity Annotation Convention v1.7.0:

      1. **Entity Recognition** (Stage 1)
         - Detect named entities in text
         - Classify by hypernym type (AGT, GRP, TOP, TMP, etc.)
         - Methods: spaCy NER, transformer models, regex patterns

      2. **Layout Analysis** (Stage 2)
         - Analyze document structure (headers, paragraphs, tables)
         - Assign DOC hypernym types (DOC.HDR, DOC.PAR, DOC.TBL)
         - Generate XPath provenance for each claim location

      3. **Entity Resolution** (Stage 3)
         - Disambiguate entity mentions
         - Merge coreferences and name variants
         - Produce canonical entity clusters

      4. **Entity Linking** (Stage 4)
         - Link resolved entities to knowledge bases
         - Connect to Wikidata, ISIL, GeoNames, etc.
         - Assign link confidence scores

      **WORKFLOW**:

      1. Archive website using Playwright:
         `python scripts/fetch_website_playwright.py <entry_number> <url>`

         This saves: web/{entry_number}/{domain}/rendered.html

      2. Add XPath provenance to claims:
         `python scripts/add_xpath_provenance.py`

      3. Script REMOVES claims that cannot be verified
         (stores in `removed_unverified_claims` for audit)

      **EXAMPLES**:

      CORRECT (Verifiable):
      ```yaml
      - claim_type: full_name
        claim_value: Historische Vereniging Nijeveen
        source_url: https://historischeverenigingnijeveen.nl/
        retrieved_on: "2025-11-29T12:28:00Z"
        xpath: /html[1]/body[1]/div[6]/div[1]/h1[1]
        html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
        xpath_match_score: 1.0
        pipeline_stage: layout_analysis
      ```

      WRONG (Fabricated - Must Be Removed):
      ```yaml
      - claim_type: full_name
        claim_value: Historische Vereniging Nijeveen
        confidence: 0.95  # ← NO! This is meaningless without XPath
      ```
    # Ontology alignments; the exact mapping repeats the class_uri term.
    exact_mappings:
    - prov:Entity
    close_mappings:
    - schema:PropertyValue
    - oa:Annotation
    # Slots attached to WebClaim, kept in alphabetical order.
    slots:
    - claim_extraction_method
    - claim_id
    - claim_notes
    - claim_type
    - claim_value
    - extraction_timestamp
    - html_file
    - pipeline_stage
    - retrieved_on
    - source_url
    - specificity_annotation
    - template_specificity
    - xpath
    - xpath_match_score
    - xpath_matched_text
    # Per-class refinements of the slots listed above. Seven slots are marked
    # required here: claim_type, claim_value, source_url, retrieved_on, xpath,
    # html_file and xpath_match_score.
    slot_usage:
      claim_type:
        required: true
        description: |
          Type of claim. See ClaimTypeEnum for allowed values.

          Each claim type has expected source locations:
          - full_name: <title>, <h1>, logo, og:title
          - description: <meta name="description">, about section
          - email: <a href="mailto:...">, contact page
          - phone: <a href="tel:...">, contact page
          - address: footer, contact page, JSON-LD
      claim_value:
        required: true
        description: |
          The extracted value. Must appear at the XPath location.
      source_url:
        required: true
        description: |
          URL of the web page this claim was extracted from.
          Required for all claims - enables verification of provenance.
      retrieved_on:
        required: true
        description: |
          Timestamp when the web page was archived using Playwright.
          ISO 8601 format with timezone (UTC preferred).
      xpath:
        required: true
        description: |
          XPath to element containing claim_value.

          **THIS IS THE CRITICAL PROVENANCE FIELD.**

          Without an XPath, a claim is unverifiable and must be removed.

          Format: Standard XPath 1.0 expression
          Example: /html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]
      html_file:
        required: true
        description: |
          Path to archived HTML file (relative to entry file).

          Archive format: Playwright-rendered HTML (NOT WARC).
          Playwright captures the fully rendered DOM after JavaScript execution.

          Standard structure:
          web/{entry_number}/{domain}/rendered.html

          Example: web/0021/historischeverenigingnijeveen.nl/rendered.html
      xpath_match_score:
        required: true
        description: |
          Match quality between claim_value and text at XPath.

          This is COMPUTED, not estimated:
          - 1.0: claim_value == element_text (exact)
          - <1.0: len(claim_value) / len(element_text) for substrings

          Claims with score < 0.3 should be flagged for review.
      # Not marked required here; only the description is refined.
      pipeline_stage:
        description: |
          The extraction pipeline stage that produced this claim.

          Most WebClaims are produced during:
          - layout_analysis (Stage 2): When XPath locations are identified
          - entity_linking (Stage 4): When claims link to knowledge bases
      # NOTE(review): SpecificityAnnotation and TemplateSpecificityScores are
      # neither defined nor directly imported in this file; presumably they are
      # reachable through an import - verify.
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    # NOTE(review): the postcondition below carries only a description, so this
    # rule appears to document the policy rather than add a checkable
    # constraint; xpath is separately marked required in slot_usage.
    rules:
    - preconditions:
        slot_conditions:
          xpath:
            value_presence: ABSENT
      postconditions:
        description: Claims without XPath must be removed as unverifiable
    # Short reminders summarizing the class policy and tooling.
    comments:
    - WebClaim requires XPath provenance - claims without it are fabricated
    - Match score is COMPUTED from actual text comparison, not estimated
    - Archived HTML files are Playwright-rendered (NOT WARC format)
    - Use scripts/fetch_website_playwright.py to archive websites
    - Use scripts/add_xpath_provenance.py to add XPath to existing claims
    - 'Follows 4-stage GLAM-NER pipeline: recognition → layout → resolution → linking'
    # NOTE(review): the convention file name below says v1.6.0 while the class
    # description cites convention v1.7.0 - confirm which is current.
    see_also:
    - rules/WEB_OBSERVATION_PROVENANCE_RULES.md
    - scripts/fetch_website_playwright.py
    - scripts/add_xpath_provenance.py
    - docs/convention/schema/20251202/entity_annotation_rules_v1.6.0_unified.yaml
    # Sample instances. Each supplies the seven slots that slot_usage marks as
    # required, plus pipeline_stage.
    examples:
    # 1. Organization name read from an <h1>; exact match.
    - value:
        claim_type: full_name
        claim_value: Historische Vereniging Nijeveen
        source_url: https://historischeverenigingnijeveen.nl/
        retrieved_on: '2025-11-29T12:28:00Z'
        xpath: /html[1]/body[1]/div[6]/div[1]/h1[1]
        html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
        xpath_match_score: 1.0
        pipeline_stage: layout_analysis
      description: Exact match claim for organization name
    # 2. Numeric value kept as a quoted string.
    - value:
        claim_type: beeldbank_total_photos
        claim_value: '6253'
        source_url: https://historischeverenigingnijeveen.nl/nl/hvn
        retrieved_on: '2025-11-29T12:28:00Z'
        xpath: /html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[1]
        html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
        xpath_match_score: 1.0
        pipeline_stage: layout_analysis
      description: Collection count claim from image bank statistics
    # 3. Footer link attributed to the entity_linking stage.
    - value:
        claim_type: facebook
        claim_value: https://www.facebook.com/HistorischeVerenigingNijeveen/
        source_url: https://historischeverenigingnijeveen.nl/
        retrieved_on: '2025-11-29T12:28:00Z'
        xpath: /html[1]/body[1]/footer[1]/div[1]/a[3]
        html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
        xpath_match_score: 1.0
        pipeline_stage: entity_linking
      description: Social media link claim - entity linking stage
    # 4. Partial match (score below 1.0) with the matched text recorded.
    # NOTE(review): the matched text shown does not literally contain
    # claim_value, although the description calls this a substring match -
    # confirm the example data.
    - value:
        claim_type: website
        claim_value: https://www.historischeverenigingnijeveen.nl/
        source_url: https://historischeverenigingnijeveen.nl/nl/hvn
        retrieved_on: '2025-11-28T12:00:00Z'
        xpath: /html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]
        html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
        xpath_match_score: 0.561
        xpath_matched_text: 'De Historische Vereniging Nijeveen is ook te vinden op Facebook'
        pipeline_stage: layout_analysis
      description: Substring match - URL found within longer text