glam/schemas/20251121/linkml/modules/classes/PersonWebClaim.yaml
kempersc 626bd3a095 refactor(schemas): apply naming conventions to 261 class files
- Apply Rule 39: RiC-O style hasOrHad*/isOrWas* for temporal slots
- Apply Rule 43: Singular noun convention (keywords → keyword)
- Update slot references to match renamed slot files
- Maintain schema integrity across all class definitions
2026-01-10 15:36:33 +01:00

383 lines
13 KiB
YAML

# Schema identity: canonical URI, machine-readable name, and display title.
id: 'https://nde.nl/ontology/hc/class/PersonWebClaim'
name: PersonWebClaim
title: 'Person Web Claim - Verifiable Person Data Claims'

# CURIE prefix -> namespace URI expansions.
prefixes:
  linkml: 'https://w3id.org/linkml/'
  hc: 'https://nde.nl/ontology/hc/'
  schema: 'http://schema.org/'
  dcterms: 'http://purl.org/dc/terms/'
  prov: 'http://www.w3.org/ns/prov#'
  pico: 'https://personsincontext.org/model#'
  foaf: 'http://xmlns.com/foaf/0.1/'
imports:
- linkml:types
- ../slots/source_url
- ../slots/retrieved_on
- ../slots/retrieval_agent
- ../slots/person_claim_id
- ../slots/person_claim_note
- ../slots/person_claim_type
- ../slots/person_claim_value
- ../slots/person_html_file
- ../slots/person_xpath
- ../slots/person_xpath_match_score
- ../slots/specificity_annotation
- ../slots/template_specificity
- ./SpecificityAnnotation
- ./TemplateSpecificityScores
# Default namespace prefix for this schema; `hc` is declared under `prefixes`.
default_prefix: hc
enums:
  # Controlled vocabulary used as the range of the person_claim_type slot:
  # one permissible value per kind of fact that can be claimed about a person.
  PersonClaimTypeEnum:
    description: |
      Types of claims that can be extracted about persons from web pages.
      Each claim type has expected source locations in institutional pages.
    permissible_values:
      # --- Names ---
      full_name:
        description: Full name of the person
        comments:
          - 'Expected in: staff directory, about page, h2/h3 headings'
      given_name:
        description: First/given name only
        comments:
          - May be parsed from full_name
      family_name:
        description: Family/surname only
        comments:
          - May be parsed from full_name

      # --- Position within the organization ---
      role_title:
        description: Job title or role within organization
        comments:
          - 'Expected in: staff directory, org chart, biography section'
      department:
        description: Department or division name
        comments:
          - 'Expected in: staff directory, org chart'

      # --- Contact details (public listings only) ---
      email:
        description: Professional email address
        comments:
          - 'Expected in: staff directory, contact section'
          - Only include if publicly listed
      phone:
        description: Professional phone number
        comments:
          - 'Expected in: staff directory, contact section'
          - Only include if publicly listed

      # --- Professional profile ---
      biography:
        description: Professional biography text
        comments:
          - 'Expected in: staff page, about section'
      specialization:
        description: Area of expertise or specialization
        comments:
          - 'Expected in: biography, staff profile, research interests'
      education:
        description: Educational background or degrees
        comments:
          - 'Expected in: biography, CV section'
          - "Example: 'PhD Art History, University of Amsterdam'"

      # --- Role period ---
      start_date:
        description: Date when person started current role
        comments:
          - 'Expected in: biography, news announcements'
          - 'Format: ISO 8601 date'
      end_date:
        description: Date when person ended role (if applicable)
        comments:
          - 'Expected in: historical records, farewell announcements'

      # --- Online identifiers and media ---
      photo_url:
        description: URL to profile photo
        comments:
          - 'Expected in: staff directory, biography page'
          - 'For LinkedIn: use CDN URL (media.licdn.com), not overlay page'
      linkedin_url:
        description: LinkedIn profile URL
        comments:
          - 'Expected in: staff page footer, social links'
      orcid:
        description: ORCID researcher identifier
        comments:
          - 'Expected in: research publications, CV section'
          - 'Format: 0000-0000-0000-0000'
      twitter_handle:
        description: Twitter/X handle
        comments:
          - 'Expected in: social media section, footer'

      # --- Career history and achievements ---
      previous_employer:
        description: Previous organization where person worked
        comments:
          - 'Expected in: biography, LinkedIn experience'
      publication:
        description: Citation or link to publication
        comments:
          - 'Expected in: publications list, CV'
      award:
        description: Professional award or recognition
        comments:
          - 'Expected in: biography, awards section'
      language_proficiency:
        description: Language the person speaks
        comments:
          - 'Expected in: LinkedIn profile, CV'

  # Controlled vocabulary naming the tool that performed the retrieval;
  # PersonWebClaim's slot_usage sets it as the range of retrieval_agent.
  RetrievalAgentEnum:
    description: |
      Tools/agents used to retrieve and extract person data.
      Must match the tool that was actually used for extraction.
    permissible_values:
      firecrawl:
        description: FireCrawl MCP tools for web scraping
        comments:
          - Primary tool for institutional websites
      playwright:
        description: Playwright browser automation
        comments:
          - For JavaScript-heavy sites requiring browser rendering
      exa_crawling_exa:
        description: Exa AI crawling with direct URL
        comments:
          - Primary tool for LinkedIn profile extraction
      exa_linkedin_search_exa:
        description: Exa AI LinkedIn search
        comments:
          - For finding LinkedIn profiles when URL unknown
      manual:
        description: Manual inspection and copy
        comments:
          - 'Last resort - document why automated tools failed'
# Slot definitions for the person-claim fields.
# NOTE(review): the imports list also pulls in ../slots/person_claim_id,
# person_claim_note, person_claim_type, person_claim_value, person_html_file,
# person_xpath and person_xpath_match_score - confirm these inline
# definitions are intended alongside those modules.
slots:
  person_claim_id:
    description: Unique identifier for this person claim
    range: uriorcurie
    identifier: true

  person_claim_type:
    description: Type of claim being made about the person
    range: PersonClaimTypeEnum
    required: true

  person_claim_value:
    description: The extracted value for this claim
    range: string
    required: true

  person_xpath:
    description: |
      XPath to the element containing this claim value.
      REQUIRED for web page claims. May be null for API-sourced claims.
      Format: Standard XPath 1.0 expression
      Example: /html/body/main/section[2]/div[1]/h2
    range: string
    # Optional at schema level so API-sourced claims can omit it.
    required: false
    # Any provided value must begin with "/".
    pattern: '^/.*'

  person_html_file:
    description: |
      Relative path to archived HTML file containing this claim.
      Path is relative to the custodian data directory.
      Example: web/NL-NH-AMS-M-RM/rijksmuseum.nl/team.html
    range: string
    required: false

  person_xpath_match_score:
    description: |
      Match quality between claim value and text at XPath location.
      - 1.0 = Exact match
      - 0.8-0.99 = Near match after normalization
      - 0.5-0.79 = Substring match
      - <0.5 = Weak match (needs review)
    range: float
    required: false
    # Score is bounded to the closed interval [0.0, 1.0].
    minimum_value: 0.0
    maximum_value: 1.0

  person_claim_note:
    description: |
      Notes about this claim extraction.
      Document any issues, conflicts, or special circumstances.
    range: string
classes:
  # PersonWebClaim: one extracted fact about a person plus the provenance
  # needed to re-verify it (source URL, retrieval time, agent, and - for web
  # pages - the XPath and archived HTML file).
  PersonWebClaim:
    class_uri: prov:Entity
    description: |
      A verifiable claim about a person extracted from a web page with provenance.
      **RULE 26 COMPLIANCE: Person Data Provenance**
      All person/staff data associated with heritage custodians MUST have
      web claim provenance. This includes:
      - Staff names, titles, and roles
      - Contact information (if publicly available)
      - Professional history and education
      - Affiliations and expertise areas
      **VERIFIABILITY PRINCIPLE**
      Like the base WebClaim class, PersonWebClaim follows the "XPath or Remove" principle:
      - Claims from web pages MUST have xpath pointing to source element
      - Claims from APIs (LinkedIn via Exa) may have xpath=null but MUST have source_url
      - Claims without any verifiable source are FABRICATED and must be removed
      **SOURCE HIERARCHY**
      When multiple sources provide the same information:
      1. Official institutional website (highest reliability)
      2. LinkedIn profile (high reliability)
      3. News articles/press releases (medium-high)
      4. Conference programs (medium)
      5. Academic publications (medium)
      6. Third-party databases (lower)
      Document all sources when available; note conflicts.
      **CLAIM TYPES**
      Common claim types for heritage institution staff:
      - **full_name**: Complete name as displayed
      - **role_title**: Job title (e.g., "Senior Curator")
      - **department**: Organizational unit
      - **email**: Professional contact (only if public)
      - **biography**: Professional bio text
      - **specialization**: Expertise areas
      - **education**: Degrees and institutions
      **EXTRACTION WORKFLOW**
      1. SCRAPE institutional staff/team pages with FireCrawl
      2. EXTRACT names and roles with XPath locations
      3. SEARCH LinkedIn for additional profile data
      4. CREATE PersonWebClaim for each extracted fact
      5. LINK claims to PersonObservation records
      **INTEGRATION WITH PERSONOBSERVATION**
      PersonObservation (the PiCo-based staff role record) references
      PersonWebClaim instances via the `has_or_had_web_claim` slot:
      ```yaml
      PersonObservation:
        person_name: "Dr. Jane Smith"
        staff_role: CONSERVATOR
        has_or_had_web_claim:
          - person_claim_type: full_name
            person_claim_value: "Dr. Jane Smith"
            source_url: https://museum.org/team
            person_xpath: /html/body/main/div[2]/h3
            retrieval_agent: firecrawl
      ```
      **LINKEDIN PROFILE HANDLING**
      For LinkedIn data, create separate profile files (per Rule 12, Rule 20)
      and reference them:
      ```yaml
      linkedin_claims:
        linkedin_url: https://www.linkedin.com/in/jane-smith
        profile_data_path: data/custodian/person/entity/jane-smith_20250115.json
        retrieved_on: "2025-01-15T10:30:00Z"
        retrieval_agent: exa_crawling_exa
      ```
    exact_mappings:
      - prov:Entity
    close_mappings:
      - schema:PropertyValue
      - foaf:Document
    slots:
      - person_claim_id
      - person_claim_note
      - person_claim_type
      - person_claim_value
      - person_html_file
      - person_xpath
      - person_xpath_match_score
      - retrieval_agent
      - retrieved_on
      - source_url
      - specificity_annotation
      - template_specificity
    # Class-level refinements of the slots listed above.
    slot_usage:
      person_claim_type:
        required: true
        description: |
          Type of person claim. See PersonClaimTypeEnum.
          Common claim types:
          - full_name: Complete name
          - role_title: Job title
          - department: Organizational unit
          - email: Contact email (if public)
          - biography: Professional bio
          - specialization: Expertise areas
      person_claim_value:
        required: true
        description: |
          The extracted value. Must be verifiable at the source.
      source_url:
        required: true
        description: |
          URL where this claim was found.
          Required for ALL claims - enables verification.
      retrieved_on:
        required: true
        description: |
          Timestamp when data was retrieved.
          ISO 8601 format with timezone (UTC preferred).
      retrieval_agent:
        required: true
        # Restricts the imported slot to the tools enumerated in this schema.
        range: RetrievalAgentEnum
        description: |
          Tool used to extract this data.
          - firecrawl: Institutional websites (primary)
          - playwright: JS-heavy sites
          - exa_crawling_exa: LinkedIn profiles
          - exa_linkedin_search_exa: LinkedIn search when URL unknown
          - manual: Last resort
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    comments:
      - PersonWebClaim extends WebClaim pattern for person-specific data
      - 'XPATH OR REMOVE: Claims without verifiable source must be removed'
      - Links to PersonObservation via has_or_had_web_claim slot
      - LinkedIn data stored separately in person/entity/ files (Rule 12, 20)
      - See Rule 26 in AGENTS.md for complete documentation
    # Example instances; keys must be slots declared on this class
    # (note the singular person_claim_note).
    examples:
      - value:
          person_claim_type: full_name
          person_claim_value: Taco Dibbits
          source_url: https://www.rijksmuseum.nl/en/about-us/organisation
          retrieved_on: '2025-01-15T10:30:00Z'
          person_xpath: /html/body/main/section[2]/div[1]/h2
          person_html_file: web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html
          person_xpath_match_score: 1.0
          retrieval_agent: firecrawl
        description: Exact match for museum director name
      - value:
          person_claim_type: role_title
          person_claim_value: General Director
          source_url: https://www.rijksmuseum.nl/en/about-us/organisation
          retrieved_on: '2025-01-15T10:30:00Z'
          person_xpath: /html/body/main/section[2]/div[1]/p[1]
          person_html_file: web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html
          person_xpath_match_score: 1.0
          retrieval_agent: firecrawl
        description: Role title from institutional page
      - value:
          person_claim_type: biography
          person_claim_value: Taco Dibbits has been General Director since 2016...
          source_url: https://www.rijksmuseum.nl/en/about-us/organisation
          retrieved_on: '2025-01-15T10:30:00Z'
          person_xpath: /html/body/main/section[2]/div[1]/div[2]
          person_html_file: web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html
          person_xpath_match_score: 0.92
          retrieval_agent: firecrawl
          person_claim_note: Biography truncated from longer text on page
        description: Biography text with partial match score
      - value:
          person_claim_type: linkedin_url
          person_claim_value: https://www.linkedin.com/in/taco-dibbits
          source_url: https://www.linkedin.com/in/taco-dibbits
          retrieved_on: '2025-01-15T11:00:00Z'
          # API-sourced claim: no XPath; source_url carries the provenance.
          person_xpath: null
          retrieval_agent: exa_crawling_exa
          person_claim_note: Profile data stored in person/entity/taco-dibbits_20250115.json
        description: LinkedIn claim - XPath null for API extraction