# glam/schemas/20251121/linkml/modules/classes/PersonWebClaim.yaml
# 2025-12-17 10:11:56 +01:00
#
# 418 lines
# 14 KiB
# YAML

# PersonWebClaim Class
# Extends WebClaim pattern for person-specific claims with XPath provenance
#
# Created December 2025 to provide:
# - Verifiable provenance for staff/person data from web pages
# - Person-specific claim types (name, role, contact, etc.)
# - Integration with PersonObservation and StaffRole patterns
#
# Key principle (per Rule 26):
# All person/staff data associated with heritage custodians MUST have
# web claim provenance. Staff information without verifiable sources is unacceptable.
#
# This class follows the same XPATH OR REMOVE principle as WebClaim:
# Every claim from a webpage MUST have an XPath pointer to the exact
# location in archived HTML where that value appears.
# Schema identity: canonical URI, short name, and display title of this module.
id: https://nde.nl/ontology/hc/class/PersonWebClaim
name: PersonWebClaim
title: Person Web Claim - Verifiable Person Data Claims

# CURIE prefixes available to class_uri and mapping values in this module.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  pico: https://personsincontext.org/model#
  foaf: http://xmlns.com/foaf/0.1/

# Built-in LinkML types plus the two shared slots (source_url, retrieved_on)
# that the PersonWebClaim class lists but does not define locally.
imports:
  - linkml:types
  - ../slots/source_url
  - ../slots/retrieved_on

# Prefix applied to element names that carry no explicit prefix.
default_prefix: hc
# Controlled vocabularies used by the slots of this module.
enums:
  # Vocabulary for person_claim_type: which fact about a person a claim states.
  PersonClaimTypeEnum:
    description: |
      Types of claims that can be extracted about persons from web pages.
      Each claim type has expected source locations in institutional pages.
    permissible_values:
      # --- Name ---
      full_name:
        description: Full name of the person
        comments:
          - "Expected in: staff directory, about page, h2/h3 headings"
      given_name:
        description: First/given name only
        comments:
          - "May be parsed from full_name"
      family_name:
        description: Family/surname only
        comments:
          - "May be parsed from full_name"

      # --- Role and organisational placement ---
      role_title:
        description: Job title or role within organization
        comments:
          - "Expected in: staff directory, org chart, biography section"
      department:
        description: Department or division name
        comments:
          - "Expected in: staff directory, org chart"

      # --- Contact details (public listings only, per the comments below) ---
      email:
        description: Professional email address
        comments:
          - "Expected in: staff directory, contact section"
          - "Only include if publicly listed"
      phone:
        description: Professional phone number
        comments:
          - "Expected in: staff directory, contact section"
          - "Only include if publicly listed"

      # --- Professional profile ---
      biography:
        description: Professional biography text
        comments:
          - "Expected in: staff page, about section"
      specialization:
        description: Area of expertise or specialization
        comments:
          - "Expected in: biography, staff profile, research interests"
      education:
        description: Educational background or degrees
        comments:
          - "Expected in: biography, CV section"
          - "Example: 'PhD Art History, University of Amsterdam'"

      # --- Tenure dates ---
      start_date:
        description: Date when person started current role
        comments:
          - "Expected in: biography, news announcements"
          - "Format: ISO 8601 date"
      end_date:
        description: Date when person ended role (if applicable)
        comments:
          - "Expected in: historical records, farewell announcements"

      # --- Links and online identifiers ---
      photo_url:
        description: URL to profile photo
        comments:
          - "Expected in: staff directory, biography page"
          - "For LinkedIn: use CDN URL (media.licdn.com), not overlay page"
      linkedin_url:
        description: LinkedIn profile URL
        comments:
          - "Expected in: staff page footer, social links"
      orcid:
        description: ORCID researcher identifier
        comments:
          - "Expected in: research publications, CV section"
          - "Format: 0000-0000-0000-0000"
      twitter_handle:
        description: Twitter/X handle
        comments:
          - "Expected in: social media section, footer"

      # --- Career history and output ---
      previous_employer:
        description: Previous organization where person worked
        comments:
          - "Expected in: biography, LinkedIn experience"
      publication:
        description: Citation or link to publication
        comments:
          - "Expected in: publications list, CV"
      award:
        description: Professional award or recognition
        comments:
          - "Expected in: biography, awards section"
      language_proficiency:
        description: Language the person speaks
        comments:
          - "Expected in: LinkedIn profile, CV"

  # Vocabulary for retrieval_agent: the tool that actually fetched the data.
  RetrievalAgentEnum:
    description: |
      Tools/agents used to retrieve and extract person data.
      Must match the tool that was actually used for extraction.
    permissible_values:
      firecrawl:
        description: FireCrawl MCP tools for web scraping
        comments:
          - "Primary tool for institutional websites"
      playwright:
        description: Playwright browser automation
        comments:
          - "For JavaScript-heavy sites requiring browser rendering"
      exa_crawling_exa:
        description: Exa AI crawling with direct URL
        comments:
          - "Primary tool for LinkedIn profile extraction"
      exa_linkedin_search_exa:
        description: Exa AI LinkedIn search
        comments:
          - "For finding LinkedIn profiles when URL unknown"
      manual:
        description: Manual inspection and copy
        comments:
          - "Last resort - document why automated tools failed"
# Slots defined locally by this module. source_url and retrieved_on are not
# defined here; they come in through the imports list.
slots:
  # Identifier slot for a claim instance.
  person_claim_id:
    identifier: true
    range: uriorcurie
    description: Unique identifier for this person claim

  # Which kind of fact is claimed; constrained to PersonClaimTypeEnum.
  person_claim_type:
    range: PersonClaimTypeEnum
    required: true
    description: Type of claim being made about the person

  # The extracted value itself, always stored as a string.
  person_claim_value:
    range: string
    required: true
    description: The extracted value for this claim

  # XPath provenance - CRITICAL for verifiability
  # Optional at the schema level so API-sourced claims can omit it; when a
  # value is present the pattern requires it to begin with "/".
  person_xpath:
    range: string
    required: false
    description: |
      XPath to the element containing this claim value.
      REQUIRED for web page claims. May be null for API-sourced claims.
      Format: Standard XPath 1.0 expression
      Example: /html/body/main/section[2]/div[1]/h2
    pattern: "^/.*"

  # Location of the archived HTML snapshot that the XPath refers to.
  person_html_file:
    range: string
    required: false
    description: |
      Relative path to archived HTML file containing this claim.
      Path is relative to the custodian data directory.
      Example: web/NL-NH-AMS-M-RM/rijksmuseum.nl/team.html

  # Score bounded to the closed interval [0.0, 1.0].
  person_xpath_match_score:
    range: float
    required: false
    minimum_value: 0.0
    maximum_value: 1.0
    description: |
      Match quality between claim value and text at XPath location.
      - 1.0 = Exact match
      - 0.8-0.99 = Near match after normalization
      - 0.5-0.79 = Substring match
      - <0.5 = Weak match (needs review)

  # Extraction tool; constrained to RetrievalAgentEnum.
  retrieval_agent:
    range: RetrievalAgentEnum
    required: true
    description: |
      Tool/agent used to retrieve this data.
      Must accurately reflect the extraction method.

  # Free-text remarks; no `required` flag is set on this slot.
  person_claim_notes:
    range: string
    description: |
      Notes about this claim extraction.
      Document any issues, conflicts, or special circumstances.
classes:
  # PersonWebClaim: a single provenance-carrying claim about a person,
  # mapped to prov:Entity. It combines the locally defined person_* slots
  # with the imported source_url / retrieved_on slots.
  PersonWebClaim:
    class_uri: prov:Entity
    # Markdown description. Blank lines separate headings, paragraphs and
    # lists so that they render as distinct blocks; the fenced YAML snippets
    # are indented so that they are well-formed on their own.
    description: |
      A verifiable claim about a person extracted from a web page with provenance.

      **RULE 26 COMPLIANCE: Person Data Provenance**

      All person/staff data associated with heritage custodians MUST have
      web claim provenance. This includes:

      - Staff names, titles, and roles
      - Contact information (if publicly available)
      - Professional history and education
      - Affiliations and expertise areas

      **VERIFIABILITY PRINCIPLE**

      Like the base WebClaim class, PersonWebClaim follows the "XPath or Remove" principle:

      - Claims from web pages MUST have xpath pointing to source element
      - Claims from APIs (LinkedIn via Exa) may have xpath=null but MUST have source_url
      - Claims without any verifiable source are FABRICATED and must be removed

      **SOURCE HIERARCHY**

      When multiple sources provide the same information:

      1. Official institutional website (highest reliability)
      2. LinkedIn profile (high reliability)
      3. News articles/press releases (medium-high)
      4. Conference programs (medium)
      5. Academic publications (medium)
      6. Third-party databases (lower)

      Document all sources when available; note conflicts.

      **CLAIM TYPES**

      Common claim types for heritage institution staff:

      - **full_name**: Complete name as displayed
      - **role_title**: Job title (e.g., "Senior Curator")
      - **department**: Organizational unit
      - **email**: Professional contact (only if public)
      - **biography**: Professional bio text
      - **specialization**: Expertise areas
      - **education**: Degrees and institutions

      **EXTRACTION WORKFLOW**

      1. SCRAPE institutional staff/team pages with FireCrawl
      2. EXTRACT names and roles with XPath locations
      3. SEARCH LinkedIn for additional profile data
      4. CREATE PersonWebClaim for each extracted fact
      5. LINK claims to PersonObservation records

      **INTEGRATION WITH PERSONOBSERVATION**

      PersonObservation (the PiCo-based staff role record) references
      PersonWebClaim instances via the `web_claims` slot:

      ```yaml
      PersonObservation:
        person_name: "Dr. Jane Smith"
        staff_role: CONSERVATOR
        web_claims:
          - person_claim_type: full_name
            person_claim_value: "Dr. Jane Smith"
            source_url: https://museum.org/team
            retrieved_on: "2025-01-15T10:30:00Z"
            person_xpath: /html/body/main/div[2]/h3
            retrieval_agent: firecrawl
      ```

      **LINKEDIN PROFILE HANDLING**

      For LinkedIn data, create separate profile files (per Rule 12, Rule 20)
      and reference them:

      ```yaml
      linkedin_claims:
        linkedin_url: https://www.linkedin.com/in/jane-smith
        profile_data_path: data/custodian/person/entity/jane-smith_20250115.json
        retrieved_on: "2025-01-15T10:30:00Z"
        retrieval_agent: exa_crawling_exa
      ```
    exact_mappings:
      - prov:Entity
    close_mappings:
      - schema:PropertyValue
      - foaf:Document
    # Slots carried by every PersonWebClaim. source_url and retrieved_on are
    # the imported slots; the rest are defined in this module.
    slots:
      - person_claim_id
      - person_claim_type
      - person_claim_value
      - source_url
      - retrieved_on
      - person_xpath
      - person_html_file
      - person_xpath_match_score
      - retrieval_agent
      - person_claim_notes
    # Class-level refinements: each slot below is marked required in the
    # context of this class and given a class-specific description.
    slot_usage:
      person_claim_type:
        required: true
        description: |
          Type of person claim. See PersonClaimTypeEnum.
          Common claim types:
          - full_name: Complete name
          - role_title: Job title
          - department: Organizational unit
          - email: Contact email (if public)
          - biography: Professional bio
          - specialization: Expertise areas
      person_claim_value:
        required: true
        description: |
          The extracted value. Must be verifiable at the source.
      source_url:
        required: true
        description: |
          URL where this claim was found.
          Required for ALL claims - enables verification.
      retrieved_on:
        required: true
        description: |
          Timestamp when data was retrieved.
          ISO 8601 format with timezone (UTC preferred).
      retrieval_agent:
        required: true
        # Lists every RetrievalAgentEnum value so this summary and the enum
        # stay in agreement.
        description: |
          Tool used to extract this data.
          - firecrawl: Institutional websites (primary)
          - playwright: JS-heavy sites
          - exa_crawling_exa: LinkedIn profiles
          - exa_linkedin_search_exa: Finding LinkedIn profiles when URL unknown
          - manual: Last resort
    comments:
      - "PersonWebClaim extends WebClaim pattern for person-specific data"
      - "XPATH OR REMOVE: Claims without verifiable source must be removed"
      - "Links to PersonObservation via web_claims slot"
      - "LinkedIn data stored separately in person/entity/ files (Rule 12, 20)"
      - "See Rule 26 in AGENTS.md for complete documentation"
    # Documentation references (in comments to avoid CURIE validation):
    # - .opencode/PERSON_DATA_PROVENANCE_RULE.md
    # - modules/classes/WebClaim.yaml
    # - modules/classes/PersonObservation.yaml
    # - AGENTS.md Rule 26
    # NOTE(review): these examples omit person_claim_id even though it is the
    # identifier slot of this class - confirm whether example instances should
    # carry one.
    examples:
      # Web page claim with an exact XPath match (score 1.0).
      - value:
          person_claim_type: full_name
          person_claim_value: "Taco Dibbits"
          source_url: "https://www.rijksmuseum.nl/en/about-us/organisation"
          retrieved_on: "2025-01-15T10:30:00Z"
          person_xpath: "/html/body/main/section[2]/div[1]/h2"
          person_html_file: "web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html"
          person_xpath_match_score: 1.0
          retrieval_agent: firecrawl
        description: "Exact match for museum director name"
      # Role title taken from the same archived page.
      - value:
          person_claim_type: role_title
          person_claim_value: "General Director"
          source_url: "https://www.rijksmuseum.nl/en/about-us/organisation"
          retrieved_on: "2025-01-15T10:30:00Z"
          person_xpath: "/html/body/main/section[2]/div[1]/p[1]"
          person_html_file: "web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html"
          person_xpath_match_score: 1.0
          retrieval_agent: firecrawl
        description: "Role title from institutional page"
      # Truncated biography, hence a match score below 1.0 and a note.
      - value:
          person_claim_type: biography
          person_claim_value: "Taco Dibbits has been General Director since 2016..."
          source_url: "https://www.rijksmuseum.nl/en/about-us/organisation"
          retrieved_on: "2025-01-15T10:30:00Z"
          person_xpath: "/html/body/main/section[2]/div[1]/div[2]"
          person_html_file: "web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html"
          person_xpath_match_score: 0.92
          retrieval_agent: firecrawl
          person_claim_notes: "Biography truncated from longer text on page"
        description: "Biography text with partial match score"
      # API-sourced claim: person_xpath is explicitly null and no HTML file
      # or match score is given.
      - value:
          person_claim_type: linkedin_url
          person_claim_value: "https://www.linkedin.com/in/taco-dibbits"
          source_url: "https://www.linkedin.com/in/taco-dibbits"
          retrieved_on: "2025-01-15T11:00:00Z"
          person_xpath: null
          retrieval_agent: exa_crawling_exa
          person_claim_notes: "Profile data stored in person/entity/taco-dibbits_20250115.json"
        description: "LinkedIn claim - XPath null for API extraction"