390 lines
13 KiB
YAML
390 lines
13 KiB
YAML
# WebClaim Class
# Models individual claims extracted from web pages with XPath provenance
#
# Created November 2025 to provide:
# - Verifiable provenance for each extracted claim value
# - XPath pointers to exact locations in archived HTML
# - Rejection of claims without verifiable sources
#
# Key principle:
# Every claim from a webpage MUST have an XPath pointer to the exact
# location in archived HTML where that value appears.
# Claims without XPath provenance are considered FABRICATED and must be removed.
#
# This is NOT about "confidence" - it's about VERIFIABILITY.
# Either the claim value exists in the HTML at a specific XPath, or it was
# hallucinated/fabricated by an LLM.
#
# Examples:
# - full_name claim verified at /html/body/div[1]/h1
# - email claim verified at /html/body/footer/a[@href="mailto:..."]
# - description claim verified at /html/head/meta[@name="description"]

# Schema identity: canonical URI, machine name, and human-readable title.
id: 'https://nde.nl/ontology/hc/class/WebClaim'
name: WebClaim
title: 'WebClaim Class - Verifiable Web-Extracted Claims'

# CURIE prefixes declared for this schema.
prefixes:
  linkml: 'https://w3id.org/linkml/'
  hc: 'https://nde.nl/ontology/hc/'
  schema: 'http://schema.org/'
  dcterms: 'http://purl.org/dc/terms/'
  prov: 'http://www.w3.org/ns/prov#'
  pav: 'http://purl.org/pav/'
  xsd: 'http://www.w3.org/2001/XMLSchema#'

# LinkML built-in types, used as slot ranges further down
# (string, float, datetime, uri, uriorcurie).
imports:
  - 'linkml:types'

# Prefix applied to names in this schema that carry no explicit prefix.
default_prefix: hc

enums:
  # Closed vocabulary used as the range of the `claim_type` slot.
  # Where a value carries `comments`, they list the HTML locations in which
  # that kind of value is expected to be found.
  ClaimTypeEnum:
    description: >-
      Types of claims that can be extracted from web pages.
      Each claim type has expected source locations in HTML.
    permissible_values:
      # --- Names and descriptive text ---
      full_name:
        description: 'Official full name of the organization'
        comments:
          - "Expected in: <title>, <h1>, logo text, meta og:title"
      short_name:
        description: 'Abbreviated or short name'
        comments:
          - "Expected in: <h1>, logo, header"
      description:
        description: 'Description or about text'
        comments:
          - "Expected in: <meta name='description'>, about section"

      # --- Contact details ---
      email:
        description: 'Contact email address'
        comments:
          - "Expected in: <a href='mailto:...'>, contact page, footer"
      phone:
        description: 'Contact phone number'
        comments:
          - "Expected in: <a href='tel:...'>, contact page, footer"
      address:
        description: 'Physical address'
        comments:
          - "Expected in: contact page, footer, structured data"
      website:
        description: 'Website URL (may differ from source_url)'
        comments:
          - "Expected in: contact page, links"

      # --- Social media (generic value plus per-platform values) ---
      social_media:
        description: 'Social media profile links'
        comments:
          - "Expected in: footer, header, contact page"
      facebook:
        description: 'Facebook page URL'
      twitter:
        description: 'Twitter/X profile URL'
      instagram:
        description: 'Instagram profile URL'
      linkedin:
        description: 'LinkedIn page URL'
      youtube:
        description: 'YouTube channel URL'

      # --- Visitor information ---
      opening_hours:
        description: 'Opening hours / visiting times'
        comments:
          - "Expected in: visit page, contact page, structured data"
      admission_info:
        description: 'Admission prices or policies'
      accessibility_info:
        description: 'Accessibility information'

      # --- Collection statistics ---
      collection_count:
        description: 'Number of items in collection'
      beeldbank_total_photos:
        description: 'Total photos in image bank (beeldbank)'
      beeldbank_described_photos:
        description: 'Number of described photos in image bank'

      # --- Organizational facts and external identifiers ---
      founding_date:
        description: 'When organization was founded'
      kvk_number:
        description: 'Dutch Chamber of Commerce number'
      isil_code:
        description: 'ISIL identifier'
      wikidata_id:
        description: 'Wikidata Q-number'
      parent_organization:
        description: 'Parent organization reference'

# Slot definitions used by the WebClaim class.
# Descriptions that contain bullet lists or "Format:/Example:" lines use the
# literal block style (|-) so their line breaks are kept; the folded style
# (>-) would join those lines into a single run-on sentence.
slots:
  # --- Identity and content of the claim ---
  claim_id:
    # NOTE(review): in LinkML an identifier slot is implicitly required, yet
    # none of the WebClaim examples in this file set claim_id - confirm
    # whether every claim is meant to carry one.
    identifier: true
    range: uriorcurie
    description: Unique identifier for this claim

  claim_type:
    range: ClaimTypeEnum
    required: true
    description: Type of claim being made

  claim_value:
    range: string
    required: true
    description: The extracted value

  # --- Where and when the page was obtained ---
  source_url:
    range: uri
    required: true
    slot_uri: pav:retrievedFrom
    description: URL of the web page this claim was extracted from

  retrieved_on:
    range: datetime
    required: true
    slot_uri: pav:retrievedOn
    description: >-
      Timestamp when the web page was archived.
      ISO 8601 format with timezone (UTC preferred).

  # --- XPath provenance (the fields that make a claim verifiable) ---
  xpath:
    range: string
    required: true
    description: |-
      XPath to the element containing this claim value.
      This is the CRITICAL provenance field - without it, a claim is unverifiable.

      Format: Standard XPath 1.0 expression
      Example: /html[1]/body[1]/div[6]/div[1]/h1[1]
    # Only checks that the expression starts with "/" (an absolute path).
    pattern: "^/.*"

  html_file:
    range: string
    required: true
    description: |-
      Relative path to the archived HTML file containing this claim.
      Path is relative to the entry file.

      Example: web/0021/historischeverenigingnijeveen.nl/rendered.html

  xpath_match_score:
    range: float
    required: true
    minimum_value: 0.0
    maximum_value: 1.0
    # NOTE(review): the WebClaim slot_usage describes scores below 1.0 as a
    # length ratio, which does not obviously line up with the bands listed
    # here - confirm against the scoring script.
    description: |-
      Match quality between claim_value and the text at the XPath location.

      - 1.0 = Exact match (claim_value appears verbatim)
      - 0.8-0.99 = Near match after normalization (whitespace, case)
      - 0.5-0.79 = Substring match (claim_value is part of element text)
      - <0.5 = Weak match (claim may need verification)

      Claims with score < 0.3 should be flagged for manual review.

  xpath_matched_text:
    range: string
    description: >-
      The actual text found at the XPath location.
      Useful when xpath_match_score < 1.0 to show what was matched.

  # --- Extraction bookkeeping (all optional) ---
  extraction_timestamp:
    range: datetime
    description: >-
      When the claim was extracted from the archived HTML.
      May differ from retrieved_on if extraction happens later.

  extraction_method:
    range: string
    description: >-
      Method used to extract this claim.
      Examples: "xpath_exact_match", "text_search", "css_selector"

  claim_notes:
    range: string
    description: Notes about this specific claim extraction

classes:
  # A single web-extracted claim together with the XPath provenance that
  # makes it checkable against the archived HTML.
  # Markdown-bearing descriptions below use the literal block style (|-) so
  # headings, lists and fenced code keep their line breaks; the folded style
  # (>-) would collapse them into run-on text.
  WebClaim:
    class_uri: prov:Entity
    description: |-
      A single verifiable claim extracted from a web page.

      **CORE PRINCIPLE: XPATH OR REMOVE**

      Every claim extracted from a webpage MUST have:
      1. `xpath` - pointing to exact element in archived HTML
      2. `html_file` - path to the archived HTML
      3. `xpath_match_score` - quality of the match

      Claims without these fields are FABRICATED and must be REMOVED.

      **WHY NOT CONFIDENCE SCORES?**

      Confidence scores like `0.95` are MEANINGLESS because:
      - There is NO methodology defining what these numbers mean
      - They cannot be verified or reproduced
      - They give false impression of rigor
      - They mask the fact that claims may be fabricated

      Instead, we use VERIFIABLE provenance:
      - XPath points to exact location
      - Archived HTML can be inspected
      - Match score is computed, not estimated

      **WORKFLOW**:

      1. Archive website using Playwright:
         `python scripts/fetch_website_playwright.py 0021 https://example.org/`

      2. Add XPath provenance to claims:
         `python scripts/add_xpath_provenance.py`

      3. Script REMOVES claims that cannot be verified
         (stores in `removed_unverified_claims` for audit)

      **EXAMPLES**:

      CORRECT (Verifiable):
      ```yaml
      - claim_type: full_name
        claim_value: Historische Vereniging Nijeveen
        source_url: https://historischeverenigingnijeveen.nl/
        retrieved_on: "2025-11-29T12:28:00Z"
        xpath: /html[1]/body[1]/div[6]/div[1]/h1[1]
        html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
        xpath_match_score: 1.0
      ```

      WRONG (Fabricated - Must Be Removed):
      ```yaml
      - claim_type: full_name
        claim_value: Historische Vereniging Nijeveen
        confidence: 0.95  # ← NO! This is meaningless without XPath
      ```

    exact_mappings:
      - prov:Entity

    close_mappings:
      - schema:PropertyValue

    slots:
      - claim_id
      - claim_type
      - claim_value
      - source_url
      - retrieved_on
      - xpath
      - html_file
      - xpath_match_score
      - xpath_matched_text
      - extraction_timestamp
      - extraction_method
      - claim_notes

    # Class-specific refinements of the slot descriptions.
    slot_usage:
      claim_type:
        required: true
        description: |-
          Type of claim. See ClaimTypeEnum for allowed values.

          Each claim type has expected source locations:
          - full_name: <title>, <h1>, logo, og:title
          - description: <meta name="description">, about section
          - email: <a href="mailto:...">, contact page
          - phone: <a href="tel:...">, contact page
          - address: footer, contact page, JSON-LD

      claim_value:
        required: true
        description: >-
          The extracted value. Must appear at the XPath location.

      xpath:
        required: true
        description: |-
          XPath to element containing claim_value.

          **THIS IS THE CRITICAL PROVENANCE FIELD.**

          Without an XPath, a claim is unverifiable and must be removed.

          Format: Standard XPath 1.0 expression
          Example: /html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]

      html_file:
        required: true
        description: |-
          Path to archived HTML file (relative to entry file).

          Standard structure:
          web/{entry_number}/{domain}/rendered.html

          Example: web/0021/historischeverenigingnijeveen.nl/rendered.html

      xpath_match_score:
        required: true
        description: |-
          Match quality between claim_value and text at XPath.

          This is COMPUTED, not estimated:
          - 1.0: claim_value == element_text (exact)
          - <1.0: len(claim_value) / len(element_text) for substrings

          Claims with score < 0.3 should be flagged for review.

    # The postcondition below holds only a description and no slot
    # conditions, so this rule documents the policy rather than enforcing
    # it. A missing xpath is rejected by `required: true` on the slot.
    rules:
      - preconditions:
          slot_conditions:
            xpath:
              value_presence: ABSENT
        postconditions:
          description: "Claims without XPath must be removed as unverifiable"

    comments:
      - "WebClaim requires XPath provenance - claims without it are fabricated"
      - "Match score is COMPUTED from actual text comparison, not estimated"
      - "Archived HTML files are stored in web/{entry}/{domain}/ directories"
      - "Use scripts/add_xpath_provenance.py to add XPath to existing claims"

    see_also:
      - ".opencode/WEB_OBSERVATION_PROVENANCE_RULES.md"
      - "scripts/fetch_website_playwright.py"
      - "scripts/add_xpath_provenance.py"

    examples:
      # Exact match: organization name taken from the page heading.
      - value:
          claim_type: full_name
          claim_value: Historische Vereniging Nijeveen
          source_url: "https://historischeverenigingnijeveen.nl/"
          retrieved_on: "2025-11-29T12:28:00Z"
          xpath: "/html[1]/body[1]/div[6]/div[1]/h1[1]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 1.0
        description: "Exact match claim for organization name"

      # Exact match: numeric value kept as a quoted string.
      - value:
          claim_type: beeldbank_total_photos
          claim_value: "6253"
          source_url: "https://historischeverenigingnijeveen.nl/nl/hvn"
          retrieved_on: "2025-11-29T12:28:00Z"
          xpath: "/html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[1]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 1.0
        description: "Collection count claim from image bank statistics"

      # Exact match: social media link located in the page footer.
      - value:
          claim_type: facebook
          claim_value: "https://www.facebook.com/HistorischeVerenigingNijeveen/"
          source_url: "https://historischeverenigingnijeveen.nl/"
          retrieved_on: "2025-11-29T12:28:00Z"
          xpath: "/html[1]/body[1]/footer[1]/div[1]/a[3]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 1.0
        description: "Social media link claim"

      # Partial match (score below 1.0) with the matched text recorded.
      # NOTE(review): claim_value does not occur inside xpath_matched_text
      # even though the description calls this a substring match, and
      # retrieved_on differs from the other examples that share the same
      # html_file - confirm the example data.
      - value:
          claim_type: website
          claim_value: "https://www.historischeverenigingnijeveen.nl/"
          source_url: "https://historischeverenigingnijeveen.nl/nl/hvn"
          retrieved_on: "2025-11-28T12:00:00Z"
          xpath: "/html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]"
          html_file: "web/0021/historischeverenigingnijeveen.nl/rendered.html"
          xpath_match_score: 0.561
          xpath_matched_text: "De Historische Vereniging Nijeveen is ook te vinden op Facebook"
        description: "Substring match - URL found within longer text"