glam/schemas/20251121/linkml/modules/classes/WebClaim.yaml
2025-11-29 18:05:16 +01:00

390 lines
13 KiB
YAML

# WebClaim Class
# Models individual claims extracted from web pages with XPath provenance
#
# Created November 2025 to provide:
# - Verifiable provenance for each extracted claim value
# - XPath pointers to exact locations in archived HTML
# - Rejection of claims without verifiable sources
#
# Key principle:
# Every claim from a webpage MUST have an XPath pointer to the exact
# location in archived HTML where that value appears.
# Claims without XPath provenance are considered FABRICATED and must be removed.
#
# This is NOT about "confidence" - it's about VERIFIABILITY.
# Either the claim value exists in the HTML at a specific XPath, or it was
# hallucinated/fabricated by an LLM.
#
# Examples:
# - full_name claim verified at /html/body/div[1]/h1
# - email claim verified at /html/body/footer/a[@href="mailto:..."]
# - description claim verified at /html/head/meta[@name="description"]
# Schema identity.
id: https://nde.nl/ontology/hc/class/WebClaim
name: WebClaim
title: WebClaim Class - Verifiable Web-Extracted Claims

# Namespace prefixes available for CURIEs in this module.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  pav: http://purl.org/pav/
  xsd: http://www.w3.org/2001/XMLSchema#

# Only the LinkML built-in types are imported here.
imports:
  - linkml:types

# Default prefix for elements declared in this schema.
default_prefix: hc
# Controlled vocabulary used as the range of the `claim_type` slot.
enums:
  ClaimTypeEnum:
    description: >-
      Types of claims that can be extracted from web pages.
      Each claim type has expected source locations in HTML.
    permissible_values:

      # Names and descriptive text
      full_name:
        description: Official full name of the organization
        comments:
          - "Expected in: <title>, <h1>, logo text, meta og:title"
      short_name:
        description: Abbreviated or short name
        comments:
          - "Expected in: <h1>, logo, header"
      description:
        description: Description or about text
        comments:
          - "Expected in: <meta name='description'>, about section"

      # Contact details
      email:
        description: Contact email address
        comments:
          - "Expected in: <a href='mailto:...'>, contact page, footer"
      phone:
        description: Contact phone number
        comments:
          - "Expected in: <a href='tel:...'>, contact page, footer"
      address:
        description: Physical address
        comments:
          - "Expected in: contact page, footer, structured data"
      website:
        description: Website URL (may differ from source_url)
        comments:
          - "Expected in: contact page, links"

      # Social media (generic value first, then one value per platform)
      social_media:
        description: Social media profile links
        comments:
          - "Expected in: footer, header, contact page"
      facebook:
        description: Facebook page URL
      twitter:
        description: Twitter/X profile URL
      instagram:
        description: Instagram profile URL
      linkedin:
        description: LinkedIn page URL
      youtube:
        description: YouTube channel URL

      # Visitor information
      opening_hours:
        description: Opening hours / visiting times
        comments:
          - "Expected in: visit page, contact page, structured data"
      admission_info:
        description: Admission prices or policies
      accessibility_info:
        description: Accessibility information

      # Collection statistics
      collection_count:
        description: Number of items in collection
      beeldbank_total_photos:
        description: Total photos in image bank (beeldbank)
      beeldbank_described_photos:
        description: Number of described photos in image bank

      # Organisational facts and external identifiers
      founding_date:
        description: When organization was founded
      kvk_number:
        description: Dutch Chamber of Commerce number
      isil_code:
        description: ISIL identifier
      wikidata_id:
        description: Wikidata Q-number
      parent_organization:
        description: Parent organization reference
# Slot definitions referenced by the WebClaim class.
#
# Descriptions that contain line-oriented content (bullet lists,
# "Format:" / "Example:" lines) use a literal block scalar (|-) so every
# line break survives; a folded scalar (>-) would merge those lines into
# one run-on sentence. Pure prose descriptions stay folded.
slots:

  # --- Claim identity and content ---------------------------------------
  claim_id:
    identifier: true
    range: uriorcurie
    description: Unique identifier for this claim

  claim_type:
    range: ClaimTypeEnum
    required: true
    description: Type of claim being made

  claim_value:
    range: string
    required: true
    description: The extracted value

  # --- Where and when the page was obtained -----------------------------
  source_url:
    range: uri
    required: true
    slot_uri: pav:retrievedFrom
    description: URL of the web page this claim was extracted from

  retrieved_on:
    range: datetime
    required: true
    slot_uri: pav:retrievedOn
    description: >-
      Timestamp when the web page was archived.
      ISO 8601 format with timezone (UTC preferred).

  # --- XPath verification -----------------------------------------------
  xpath:
    range: string
    required: true
    description: |-
      XPath to the element containing this claim value.
      This is the CRITICAL provenance field - without it, a claim is unverifiable.
      Format: Standard XPath 1.0 expression
      Example: /html[1]/body[1]/div[6]/div[1]/h1[1]
    # The value has to start with "/".
    pattern: "^/.*"

  html_file:
    range: string
    required: true
    description: >-
      Relative path to the archived HTML file containing this claim.
      Path is relative to the entry file.
      Example: web/0021/historischeverenigingnijeveen.nl/rendered.html

  xpath_match_score:
    range: float
    required: true
    minimum_value: 0.0
    maximum_value: 1.0
    description: |-
      Match quality between claim_value and the text at the XPath location.

      - 1.0 = Exact match (claim_value appears verbatim)
      - 0.8-0.99 = Near match after normalization (whitespace, case)
      - 0.5-0.79 = Substring match (claim_value is part of element text)
      - <0.5 = Weak match (claim may need verification)

      Claims with score < 0.3 should be flagged for manual review.

  xpath_matched_text:
    range: string
    description: >-
      The actual text found at the XPath location.
      Useful when xpath_match_score < 1.0 to show what was matched.

  # --- Extraction bookkeeping -------------------------------------------
  extraction_timestamp:
    range: datetime
    description: >-
      When the claim was extracted from the archived HTML.
      May differ from retrieved_on if extraction happens later.

  extraction_method:
    range: string
    description: >-
      Method used to extract this claim.
      Examples: "xpath_exact_match", "text_search", "css_selector"

  claim_notes:
    range: string
    description: Notes about this specific claim extraction
classes:
  WebClaim:
    class_uri: prov:Entity
    # Literal block scalar (|-): the Markdown headings, lists and fenced
    # YAML samples below must keep their line breaks. A folded scalar (>-)
    # would merge them into running text and break the code fences.
    description: |-
      A single verifiable claim extracted from a web page.

      **CORE PRINCIPLE: XPATH OR REMOVE**

      Every claim extracted from a webpage MUST have:

      1. `xpath` - pointing to exact element in archived HTML
      2. `html_file` - path to the archived HTML
      3. `xpath_match_score` - quality of the match

      Claims without these fields are FABRICATED and must be REMOVED.

      **WHY NOT CONFIDENCE SCORES?**

      Confidence scores like `0.95` are MEANINGLESS because:

      - There is NO methodology defining what these numbers mean
      - They cannot be verified or reproduced
      - They give false impression of rigor
      - They mask the fact that claims may be fabricated

      Instead, we use VERIFIABLE provenance:

      - XPath points to exact location
      - Archived HTML can be inspected
      - Match score is computed, not estimated

      **WORKFLOW**:

      1. Archive website using Playwright:
         `python scripts/fetch_website_playwright.py 0021 https://example.org/`
      2. Add XPath provenance to claims:
         `python scripts/add_xpath_provenance.py`
      3. Script REMOVES claims that cannot be verified
         (stores in `removed_unverified_claims` for audit)

      **EXAMPLES**:

      CORRECT (Verifiable):

      ```yaml
      - claim_type: full_name
        claim_value: Historische Vereniging Nijeveen
        source_url: https://historischeverenigingnijeveen.nl/
        retrieved_on: "2025-11-29T12:28:00Z"
        xpath: /html[1]/body[1]/div[6]/div[1]/h1[1]
        html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
        xpath_match_score: 1.0
      ```

      WRONG (Fabricated - Must Be Removed):

      ```yaml
      - claim_type: full_name
        claim_value: Historische Vereniging Nijeveen
        confidence: 0.95 # ← NO! This is meaningless without XPath
      ```
    exact_mappings:
      - prov:Entity
    close_mappings:
      - schema:PropertyValue

    # Slots carried by every WebClaim, in the order they are declared
    # in the top-level `slots` section.
    slots:
      - claim_id
      - claim_type
      - claim_value
      - source_url
      - retrieved_on
      - xpath
      - html_file
      - xpath_match_score
      - xpath_matched_text
      - extraction_timestamp
      - extraction_method
      - claim_notes

    # Class-specific refinements of the slot descriptions. Descriptions
    # with lists or "Example:" lines are literal (|-) so they stay
    # line-for-line; single-paragraph ones remain folded (>-).
    slot_usage:
      claim_type:
        required: true
        description: |-
          Type of claim. See ClaimTypeEnum for allowed values.
          Each claim type has expected source locations:

          - full_name: <title>, <h1>, logo, og:title
          - description: <meta name="description">, about section
          - email: <a href="mailto:...">, contact page
          - phone: <a href="tel:...">, contact page
          - address: footer, contact page, JSON-LD
      claim_value:
        required: true
        description: >-
          The extracted value. Must appear at the XPath location.
      xpath:
        required: true
        description: |-
          XPath to element containing claim_value.
          **THIS IS THE CRITICAL PROVENANCE FIELD.**
          Without an XPath, a claim is unverifiable and must be removed.
          Format: Standard XPath 1.0 expression
          Example: /html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]
      html_file:
        required: true
        description: |-
          Path to archived HTML file (relative to entry file).
          Standard structure:
          web/{entry_number}/{domain}/rendered.html
          Example: web/0021/historischeverenigingnijeveen.nl/rendered.html
      xpath_match_score:
        required: true
        description: |-
          Match quality between claim_value and text at XPath.
          This is COMPUTED, not estimated:

          - 1.0: claim_value == element_text (exact)
          - <1.0: len(claim_value) / len(element_text) for substrings

          Claims with score < 0.3 should be flagged for review.

    rules:
      # NOTE(review): the postcondition below carries only a description,
      # so this rule appears to document the removal policy rather than add
      # a machine-checkable constraint; `xpath` is already `required: true`
      # on this class. Confirm that this is the intent.
      - preconditions:
          slot_conditions:
            xpath:
              value_presence: ABSENT
        postconditions:
          description: "Claims without XPath must be removed as unverifiable"

    comments:
      - "WebClaim requires XPath provenance - claims without it are fabricated"
      - "Match score is COMPUTED from actual text comparison, not estimated"
      - "Archived HTML files are stored in web/{entry}/{domain}/ directories"
      - "Use scripts/add_xpath_provenance.py to add XPath to existing claims"
    see_also:
      - ".opencode/WEB_OBSERVATION_PROVENANCE_RULES.md"
      - "scripts/fetch_website_playwright.py"
      - "scripts/add_xpath_provenance.py"

    # NOTE(review): none of the examples sets `claim_id`, although that slot
    # is declared with `identifier: true` - confirm whether example objects
    # are expected to carry an ID and which ID scheme applies.
    examples:
      - value:
          claim_type: full_name
          claim_value: Historische Vereniging Nijeveen
          source_url: "https://historischeverenigingnijeveen.nl/"
          retrieved_on: "2025-11-29T12:28:00Z"
          xpath: "/html[1]/body[1]/div[6]/div[1]/h1[1]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 1.0
        description: "Exact match claim for organization name"
      - value:
          claim_type: beeldbank_total_photos
          claim_value: "6253"
          source_url: "https://historischeverenigingnijeveen.nl/nl/hvn"
          retrieved_on: "2025-11-29T12:28:00Z"
          xpath: "/html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[1]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 1.0
        description: "Collection count claim from image bank statistics"
      - value:
          claim_type: facebook
          claim_value: "https://www.facebook.com/HistorischeVerenigingNijeveen/"
          source_url: "https://historischeverenigingnijeveen.nl/"
          retrieved_on: "2025-11-29T12:28:00Z"
          xpath: "/html[1]/body[1]/footer[1]/div[1]/a[3]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 1.0
        description: "Social media link claim"
      # Substring case: the matched text must actually contain claim_value.
      # The 45-character URL sits inside a 72-character text, so the score
      # is 45 / 72 = 0.625, following the length-ratio rule in slot_usage.
      # NOTE(review): the matched text is illustrative - replace it with the
      # real element text from the archived page if this example is meant to
      # mirror actual data.
      - value:
          claim_type: website
          claim_value: "https://www.historischeverenigingnijeveen.nl/"
          source_url: "https://historischeverenigingnijeveen.nl/nl/hvn"
          retrieved_on: "2025-11-28T12:00:00Z"
          xpath: "/html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 0.625
          xpath_matched_text: "Meer informatie vindt u op https://www.historischeverenigingnijeveen.nl/"
        description: "Substring match - URL found within longer text"