---
# PersonWebClaim Class
# Extends the WebClaim pattern for person-specific claims with XPath provenance.
#
# Created December 2025 to provide:
# - Verifiable provenance for staff/person data from web pages
# - Person-specific claim types (name, role, contact, etc.)
# - Integration with PersonObservation and StaffRole patterns
#
# Key principle (per Rule 26):
# All person/staff data associated with heritage custodians MUST have
# web claim provenance. Staff information without verifiable sources is unacceptable.
#
# This class follows the same XPATH OR REMOVE principle as WebClaim:
# Every claim from a webpage MUST have an XPath pointer to the exact
# location in archived HTML where that value appears.

# Schema identity: canonical URI, machine-readable name, human-readable title.
id: https://nde.nl/ontology/hc/class/PersonWebClaim
name: PersonWebClaim
title: Person Web Claim - Verifiable Person Data Claims

# CURIE prefix declarations. URIs are quoted so that a trailing '#'
# can never be mistaken for the start of a comment.
prefixes:
  linkml: "https://w3id.org/linkml/"
  hc: "https://nde.nl/ontology/hc/"
  schema: "http://schema.org/"
  dcterms: "http://purl.org/dc/terms/"
  prov: "http://www.w3.org/ns/prov#"
  pico: "https://personsincontext.org/model#"
  foaf: "http://xmlns.com/foaf/0.1/"

# Imported modules. The two relative paths are presumably where the
# source_url and retrieved_on slots used by PersonWebClaim are defined
# (they are not declared in this file) - verify against ../slots/.
imports:
  - linkml:types
  - ../slots/source_url
  - ../slots/retrieved_on

# Default prefix for elements declared in this schema.
default_prefix: hc
enums:
  # Controlled vocabulary for the person_claim_type slot.
  # Each value's comments record where on a page that kind of claim is
  # usually found, plus any format or privacy constraint.
  PersonClaimTypeEnum:
    description: |
      Types of claims that can be extracted about persons from web pages.
      Each claim type has expected source locations in institutional pages.
    permissible_values:
      # --- Names ---
      full_name:
        description: Full name of the person
        comments:
          - "Expected in: staff directory, about page, h2/h3 headings"
      given_name:
        description: First/given name only
        comments:
          - "May be parsed from full_name"
      family_name:
        description: Family/surname only
        comments:
          - "May be parsed from full_name"

      # --- Position within the organization ---
      role_title:
        description: Job title or role within organization
        comments:
          - "Expected in: staff directory, org chart, biography section"
      department:
        description: Department or division name
        comments:
          - "Expected in: staff directory, org chart"

      # --- Contact details (public listings only) ---
      email:
        description: Professional email address
        comments:
          - "Expected in: staff directory, contact section"
          - "Only include if publicly listed"
      phone:
        description: Professional phone number
        comments:
          - "Expected in: staff directory, contact section"
          - "Only include if publicly listed"

      # --- Professional profile ---
      biography:
        description: Professional biography text
        comments:
          - "Expected in: staff page, about section"
      specialization:
        description: Area of expertise or specialization
        comments:
          - "Expected in: biography, staff profile, research interests"
      education:
        description: Educational background or degrees
        comments:
          - "Expected in: biography, CV section"
          - "Example: 'PhD Art History, University of Amsterdam'"

      # --- Role tenure ---
      start_date:
        description: Date when person started current role
        comments:
          - "Expected in: biography, news announcements"
          - "Format: ISO 8601 date"
      end_date:
        description: Date when person ended role (if applicable)
        comments:
          - "Expected in: historical records, farewell announcements"
          # Same format constraint as start_date, stated explicitly so the
          # two tenure dates are documented consistently.
          - "Format: ISO 8601 date"

      # --- Links and external identifiers ---
      photo_url:
        description: URL to profile photo
        comments:
          - "Expected in: staff directory, biography page"
          - "For LinkedIn: use CDN URL (media.licdn.com), not overlay page"
      linkedin_url:
        description: LinkedIn profile URL
        comments:
          - "Expected in: staff page footer, social links"
      orcid:
        description: ORCID researcher identifier
        comments:
          - "Expected in: research publications, CV section"
          - "Format: 0000-0000-0000-0000"
      twitter_handle:
        description: Twitter/X handle
        comments:
          - "Expected in: social media section, footer"

      # --- Career history and achievements ---
      previous_employer:
        description: Previous organization where person worked
        comments:
          - "Expected in: biography, LinkedIn experience"
      publication:
        description: Citation or link to publication
        comments:
          - "Expected in: publications list, CV"
      award:
        description: Professional award or recognition
        comments:
          - "Expected in: biography, awards section"
      language_proficiency:
        description: Language the person speaks
        comments:
          - "Expected in: LinkedIn profile, CV"

  # Controlled vocabulary for the retrieval_agent slot: which tool
  # performed the extraction.
  RetrievalAgentEnum:
    description: |
      Tools/agents used to retrieve and extract person data.
      Must match the tool that was actually used for extraction.
    permissible_values:
      firecrawl:
        description: FireCrawl MCP tools for web scraping
        comments:
          - "Primary tool for institutional websites"
      playwright:
        description: Playwright browser automation
        comments:
          - "For JavaScript-heavy sites requiring browser rendering"
      exa_crawling_exa:
        description: Exa AI crawling with direct URL
        comments:
          - "Primary tool for LinkedIn profile extraction"
      exa_linkedin_search_exa:
        description: Exa AI LinkedIn search
        comments:
          - "For finding LinkedIn profiles when URL unknown"
      manual:
        description: Manual inspection and copy
        comments:
          - "Last resort - document why automated tools failed"
slots:
  # --- Core claim triple: id, type, value ---

  # Identifier slot for a claim record.
  person_claim_id:
    description: Unique identifier for this person claim
    range: uriorcurie
    identifier: true

  # Constrained to the PersonClaimTypeEnum vocabulary.
  person_claim_type:
    description: Type of claim being made about the person
    range: PersonClaimTypeEnum
    required: true

  # The claim value is always carried as a string, whatever the claim type.
  person_claim_value:
    description: The extracted value for this claim
    range: string
    required: true

  # --- XPath provenance - CRITICAL for verifiability ---

  # Optional at schema level because, per the description, API-sourced
  # claims may carry no XPath. The pattern only admits expressions that
  # begin with "/".
  person_xpath:
    description: |
      XPath to the element containing this claim value.
      REQUIRED for web page claims. May be null for API-sourced claims.

      Format: Standard XPath 1.0 expression
      Example: /html/body/main/section[2]/div[1]/h2
    range: string
    required: false
    pattern: "^/.*"

  # Location of the archived HTML snapshot that person_xpath is
  # evaluated against.
  person_html_file:
    description: |
      Relative path to archived HTML file containing this claim.
      Path is relative to the custodian data directory.

      Example: web/NL-NH-AMS-M-RM/rijksmuseum.nl/team.html
    range: string
    required: false

  # Score bounded to the closed interval [0.0, 1.0]; the bands are
  # spelled out in the description.
  person_xpath_match_score:
    description: |
      Match quality between claim value and text at XPath location.

      - 1.0 = Exact match
      - 0.8-0.99 = Near match after normalization
      - 0.5-0.79 = Substring match
      - <0.5 = Weak match (needs review)
    range: float
    required: false
    minimum_value: 0.0
    maximum_value: 1.0

  # --- Extraction metadata ---

  # Constrained to the RetrievalAgentEnum vocabulary.
  retrieval_agent:
    description: |
      Tool/agent used to retrieve this data.
      Must accurately reflect the extraction method.
    range: RetrievalAgentEnum
    required: true

  # Free-text notes; no `required` flag is set on this slot.
  person_claim_notes:
    description: |
      Notes about this claim extraction.
      Document any issues, conflicts, or special circumstances.
    range: string
classes:
  # PersonWebClaim: one extracted fact about a person together with the
  # provenance needed to verify it (source URL, retrieval time, tool, and
  # - for web pages - an XPath into the archived HTML).
  PersonWebClaim:
    class_uri: prov:Entity
    description: |
      A verifiable claim about a person extracted from a web page with provenance.

      **RULE 26 COMPLIANCE: Person Data Provenance**

      All person/staff data associated with heritage custodians MUST have
      web claim provenance. This includes:
      - Staff names, titles, and roles
      - Contact information (if publicly available)
      - Professional history and education
      - Affiliations and expertise areas

      **VERIFIABILITY PRINCIPLE**

      Like the base WebClaim class, PersonWebClaim follows the "XPath or Remove" principle:
      - Claims from web pages MUST have `person_xpath` pointing to the source element
      - Claims from APIs (LinkedIn via Exa) may have `person_xpath` = null but MUST have `source_url`
      - Claims without any verifiable source are FABRICATED and must be removed

      **SOURCE HIERARCHY**

      When multiple sources provide the same information:
      1. Official institutional website (highest reliability)
      2. LinkedIn profile (high reliability)
      3. News articles/press releases (medium-high)
      4. Conference programs (medium)
      5. Academic publications (medium)
      6. Third-party databases (lower)

      Document all sources when available; note conflicts.

      **CLAIM TYPES**

      Common claim types for heritage institution staff:
      - **full_name**: Complete name as displayed
      - **role_title**: Job title (e.g., "Senior Curator")
      - **department**: Organizational unit
      - **email**: Professional contact (only if public)
      - **biography**: Professional bio text
      - **specialization**: Expertise areas
      - **education**: Degrees and institutions

      **EXTRACTION WORKFLOW**

      1. SCRAPE institutional staff/team pages with FireCrawl
      2. EXTRACT names and roles with XPath locations
      3. SEARCH LinkedIn for additional profile data
      4. CREATE PersonWebClaim for each extracted fact
      5. LINK claims to PersonObservation records

      **INTEGRATION WITH PERSONOBSERVATION**

      PersonObservation (the PiCo-based staff role record) references
      PersonWebClaim instances via the `web_claims` slot:

      ```yaml
      PersonObservation:
        person_name: "Dr. Jane Smith"
        staff_role: CONSERVATOR
        web_claims:
          - person_claim_type: full_name
            person_claim_value: "Dr. Jane Smith"
            source_url: https://museum.org/team
            retrieved_on: "2025-01-15T10:30:00Z"
            person_xpath: /html/body/main/div[2]/h3
            retrieval_agent: firecrawl
      ```

      **LINKEDIN PROFILE HANDLING**

      For LinkedIn data, create separate profile files (per Rule 12, Rule 20)
      and reference them:

      ```yaml
      linkedin_claims:
        linkedin_url: https://www.linkedin.com/in/jane-smith
        profile_data_path: data/custodian/person/entity/jane-smith_20250115.json
        retrieved_on: "2025-01-15T10:30:00Z"
        retrieval_agent: exa_crawling_exa
      ```

    exact_mappings:
      - prov:Entity

    close_mappings:
      - schema:PropertyValue
      - foaf:Document

    # source_url and retrieved_on are not declared in this file's slots
    # section; presumably they come from the imported ../slots/ modules.
    slots:
      - person_claim_id
      - person_claim_type
      - person_claim_value
      - source_url
      - retrieved_on
      - person_xpath
      - person_html_file
      - person_xpath_match_score
      - retrieval_agent
      - person_claim_notes

    # Class-level refinements: every slot below is mandatory on a
    # PersonWebClaim and gets a class-specific description.
    slot_usage:
      person_claim_type:
        required: true
        description: |
          Type of person claim. See PersonClaimTypeEnum.

          Common claim types:
          - full_name: Complete name
          - role_title: Job title
          - department: Organizational unit
          - email: Contact email (if public)
          - biography: Professional bio
          - specialization: Expertise areas

      person_claim_value:
        required: true
        description: |
          The extracted value. Must be verifiable at the source.

      source_url:
        required: true
        description: |
          URL where this claim was found.
          Required for ALL claims - enables verification.

      retrieved_on:
        required: true
        description: |
          Timestamp when data was retrieved.
          ISO 8601 format with timezone (UTC preferred).

      # The list mirrors every permissible value of RetrievalAgentEnum.
      retrieval_agent:
        required: true
        description: |
          Tool used to extract this data.

          - firecrawl: Institutional websites (primary)
          - playwright: JS-heavy sites
          - exa_crawling_exa: LinkedIn profiles
          - exa_linkedin_search_exa: Finding LinkedIn profiles when URL unknown
          - manual: Last resort

    comments:
      - "PersonWebClaim extends WebClaim pattern for person-specific data"
      - "XPATH OR REMOVE: Claims without verifiable source must be removed"
      - "Links to PersonObservation via web_claims slot"
      - "LinkedIn data stored separately in person/entity/ files (Rule 12, 20)"
      - "See Rule 26 in AGENTS.md for complete documentation"

    # Documentation references (in comments to avoid CURIE validation):
    # - .opencode/PERSON_DATA_PROVENANCE_RULE.md
    # - modules/classes/WebClaim.yaml
    # - modules/classes/PersonObservation.yaml
    # - AGENTS.md Rule 26

    # NOTE(review): none of the examples carries person_claim_id, although
    # that slot is declared with identifier: true - confirm whether example
    # instances should include one.
    examples:
      # Web-page claim with an exact XPath match.
      - value:
          person_claim_type: full_name
          person_claim_value: "Taco Dibbits"
          source_url: "https://www.rijksmuseum.nl/en/about-us/organisation"
          retrieved_on: "2025-01-15T10:30:00Z"
          person_xpath: "/html/body/main/section[2]/div[1]/h2"
          person_html_file: "web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html"
          person_xpath_match_score: 1.0
          retrieval_agent: firecrawl
        description: "Exact match for museum director name"

      # Second claim from the same archived page.
      - value:
          person_claim_type: role_title
          person_claim_value: "General Director"
          source_url: "https://www.rijksmuseum.nl/en/about-us/organisation"
          retrieved_on: "2025-01-15T10:30:00Z"
          person_xpath: "/html/body/main/section[2]/div[1]/p[1]"
          person_html_file: "web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html"
          person_xpath_match_score: 1.0
          retrieval_agent: firecrawl
        description: "Role title from institutional page"

      # Match score below 1.0, with the reason recorded in the notes.
      - value:
          person_claim_type: biography
          person_claim_value: "Taco Dibbits has been General Director since 2016..."
          source_url: "https://www.rijksmuseum.nl/en/about-us/organisation"
          retrieved_on: "2025-01-15T10:30:00Z"
          person_xpath: "/html/body/main/section[2]/div[1]/div[2]"
          person_html_file: "web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html"
          person_xpath_match_score: 0.92
          retrieval_agent: firecrawl
          person_claim_notes: "Biography truncated from longer text on page"
        description: "Biography text with partial match score"

      # API-sourced claim: no archived HTML, so person_xpath is null.
      - value:
          person_claim_type: linkedin_url
          person_claim_value: "https://www.linkedin.com/in/taco-dibbits"
          source_url: "https://www.linkedin.com/in/taco-dibbits"
          retrieved_on: "2025-01-15T11:00:00Z"
          person_xpath: null
          retrieval_agent: exa_crawling_exa
          person_claim_notes: "Profile data stored in person/entity/taco-dibbits_20250115.json"
        description: "LinkedIn claim - XPath null for API extraction"