# Change notes:
# - Apply Rule 39: RiC-O style hasOrHad*/isOrWas* for temporal slots
# - Apply Rule 43: Singular noun convention (keywords → keyword)
# - Update slot references to match renamed slot files
# - Maintain schema integrity across all class definitions
# File info: 383 lines, 13 KiB, YAML
# Schema module header: identity, CURIE prefixes, and imports for the
# PersonWebClaim class definition.
id: https://nde.nl/ontology/hc/class/PersonWebClaim
name: PersonWebClaim
title: Person Web Claim - Verifiable Person Data Claims

# Prefix map; `hc` is also the default prefix (see default_prefix below).
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  pico: https://personsincontext.org/model#
  foaf: http://xmlns.com/foaf/0.1/

# NOTE(review): the seven person_* slots imported here are also defined inline
# under the top-level `slots:` key of this file — confirm which definition is
# meant to be authoritative.
imports:
  - linkml:types
  - ../slots/source_url
  - ../slots/retrieved_on
  - ../slots/retrieval_agent
  - ../slots/person_claim_id
  - ../slots/person_claim_note
  - ../slots/person_claim_type
  - ../slots/person_claim_value
  - ../slots/person_html_file
  - ../slots/person_xpath
  - ../slots/person_xpath_match_score
  - ../slots/specificity_annotation
  - ../slots/template_specificity
  - ./SpecificityAnnotation
  - ./TemplateSpecificityScores

default_prefix: hc
# Enumerations used as slot ranges by PersonWebClaim.
enums:
  # Range of the person_claim_type slot.
  PersonClaimTypeEnum:
    description: |
      Types of claims that can be extracted about persons from web pages.
      Each claim type has expected source locations in institutional pages.
    permissible_values:
      full_name:
        description: Full name of the person
        comments:
          - 'Expected in: staff directory, about page, h2/h3 headings'
      given_name:
        description: First/given name only
        comments:
          - May be parsed from full_name
      family_name:
        description: Family/surname only
        comments:
          - May be parsed from full_name
      role_title:
        description: Job title or role within organization
        comments:
          - 'Expected in: staff directory, org chart, biography section'
      department:
        description: Department or division name
        comments:
          - 'Expected in: staff directory, org chart'
      email:
        description: Professional email address
        comments:
          - 'Expected in: staff directory, contact section'
          - Only include if publicly listed
      phone:
        description: Professional phone number
        comments:
          - 'Expected in: staff directory, contact section'
          - Only include if publicly listed
      biography:
        description: Professional biography text
        comments:
          - 'Expected in: staff page, about section'
      specialization:
        description: Area of expertise or specialization
        comments:
          - 'Expected in: biography, staff profile, research interests'
      education:
        description: Educational background or degrees
        comments:
          - 'Expected in: biography, CV section'
          - 'Example: ''PhD Art History, University of Amsterdam'''
      start_date:
        description: Date when person started current role
        comments:
          - 'Expected in: biography, news announcements'
          - 'Format: ISO 8601 date'
      end_date:
        description: Date when person ended role (if applicable)
        comments:
          - 'Expected in: historical records, farewell announcements'
      photo_url:
        description: URL to profile photo
        comments:
          - 'Expected in: staff directory, biography page'
          - 'For LinkedIn: use CDN URL (media.licdn.com), not overlay page'
      linkedin_url:
        description: LinkedIn profile URL
        comments:
          - 'Expected in: staff page footer, social links'
      orcid:
        description: ORCID researcher identifier
        comments:
          - 'Expected in: research publications, CV section'
          - 'Format: 0000-0000-0000-0000'
      twitter_handle:
        description: Twitter/X handle
        comments:
          - 'Expected in: social media section, footer'
      previous_employer:
        description: Previous organization where person worked
        comments:
          - 'Expected in: biography, LinkedIn experience'
      publication:
        description: Citation or link to publication
        comments:
          - 'Expected in: publications list, CV'
      award:
        description: Professional award or recognition
        comments:
          - 'Expected in: biography, awards section'
      language_proficiency:
        description: Language the person speaks
        comments:
          - 'Expected in: LinkedIn profile, CV'

  # Range applied to retrieval_agent in PersonWebClaim's slot_usage.
  RetrievalAgentEnum:
    description: |
      Tools/agents used to retrieve and extract person data.
      Must match the tool that was actually used for extraction.
    permissible_values:
      firecrawl:
        description: FireCrawl MCP tools for web scraping
        comments:
          - Primary tool for institutional websites
      playwright:
        description: Playwright browser automation
        comments:
          - For JavaScript-heavy sites requiring browser rendering
      exa_crawling_exa:
        description: Exa AI crawling with direct URL
        comments:
          - Primary tool for LinkedIn profile extraction
      exa_linkedin_search_exa:
        description: Exa AI LinkedIn search
        comments:
          - For finding LinkedIn profiles when URL unknown
      manual:
        description: Manual inspection and copy
        comments:
          - Last resort - document why automated tools failed
# Inline slot definitions for the person-claim slots.
# NOTE(review): each slot below is also pulled in through an `../slots/<name>`
# entry in this file's `imports:` list — confirm the inline copies are intended
# and do not conflict with the imported definitions.
slots:
  person_claim_id:
    identifier: true
    range: uriorcurie
    description: Unique identifier for this person claim

  person_claim_type:
    range: PersonClaimTypeEnum
    required: true
    description: Type of claim being made about the person

  person_claim_value:
    range: string
    required: true
    description: The extracted value for this claim

  person_xpath:
    range: string
    # Not schema-required: the description allows null for API-sourced claims.
    required: false
    description: |
      XPath to the element containing this claim value.
      REQUIRED for web page claims. May be null for API-sourced claims.

      Format: Standard XPath 1.0 expression
      Example: /html/body/main/section[2]/div[1]/h2
    # Any value starting with "/" matches; quoted so the regex stays a string.
    pattern: '^/.*'

  person_html_file:
    range: string
    required: false
    description: |
      Relative path to archived HTML file containing this claim.
      Path is relative to the custodian data directory.

      Example: web/NL-NH-AMS-M-RM/rijksmuseum.nl/team.html

  person_xpath_match_score:
    range: float
    required: false
    minimum_value: 0.0
    maximum_value: 1.0
    description: |
      Match quality between claim value and text at XPath location.

      - 1.0 = Exact match
      - 0.8-0.99 = Near match after normalization
      - 0.5-0.79 = Substring match
      - <0.5 = Weak match (needs review)

  person_claim_note:
    range: string
    description: |
      Notes about this claim extraction.
      Document any issues, conflicts, or special circumstances.
# Class definition for a single provenance-backed claim about a person.
classes:
  PersonWebClaim:
    class_uri: prov:Entity
    description: |
      A verifiable claim about a person extracted from a web page with provenance.

      **RULE 26 COMPLIANCE: Person Data Provenance**

      All person/staff data associated with heritage custodians MUST have
      web claim provenance. This includes:
      - Staff names, titles, and roles
      - Contact information (if publicly available)
      - Professional history and education
      - Affiliations and expertise areas

      **VERIFIABILITY PRINCIPLE**

      Like the base WebClaim class, PersonWebClaim follows the "XPath or Remove" principle:
      - Claims from web pages MUST have xpath pointing to source element
      - Claims from APIs (LinkedIn via Exa) may have xpath=null but MUST have source_url
      - Claims without any verifiable source are FABRICATED and must be removed

      **SOURCE HIERARCHY**

      When multiple sources provide the same information:
      1. Official institutional website (highest reliability)
      2. LinkedIn profile (high reliability)
      3. News articles/press releases (medium-high)
      4. Conference programs (medium)
      5. Academic publications (medium)
      6. Third-party databases (lower)

      Document all sources when available; note conflicts.

      **CLAIM TYPES**

      Common claim types for heritage institution staff:
      - **full_name**: Complete name as displayed
      - **role_title**: Job title (e.g., "Senior Curator")
      - **department**: Organizational unit
      - **email**: Professional contact (only if public)
      - **biography**: Professional bio text
      - **specialization**: Expertise areas
      - **education**: Degrees and institutions

      **EXTRACTION WORKFLOW**

      1. SCRAPE institutional staff/team pages with FireCrawl
      2. EXTRACT names and roles with XPath locations
      3. SEARCH LinkedIn for additional profile data
      4. CREATE PersonWebClaim for each extracted fact
      5. LINK claims to PersonObservation records

      **INTEGRATION WITH PERSONOBSERVATION**

      PersonObservation (the PiCo-based staff role record) references
      PersonWebClaim instances via the `has_or_had_web_claim` slot:

      ```yaml
      PersonObservation:
        person_name: "Dr. Jane Smith"
        staff_role: CONSERVATOR
        has_or_had_web_claim:
          - person_claim_type: full_name
            person_claim_value: "Dr. Jane Smith"
            source_url: https://museum.org/team
            person_xpath: /html/body/main/div[2]/h3
            retrieval_agent: firecrawl
      ```

      **LINKEDIN PROFILE HANDLING**

      For LinkedIn data, create separate profile files (per Rule 12, Rule 20)
      and reference them:

      ```yaml
      linkedin_claims:
        linkedin_url: https://www.linkedin.com/in/jane-smith
        profile_data_path: data/custodian/person/entity/jane-smith_20250115.json
        retrieved_on: "2025-01-15T10:30:00Z"
        retrieval_agent: exa_crawling_exa
      ```
    exact_mappings:
      - prov:Entity
    close_mappings:
      - schema:PropertyValue
      - foaf:Document
    slots:
      - person_claim_id
      - person_claim_note
      - person_claim_type
      - person_claim_value
      - person_html_file
      - person_xpath
      - person_xpath_match_score
      - retrieval_agent
      - retrieved_on
      - source_url
      - specificity_annotation
      - template_specificity
    # Class-local refinements of the slots listed above.
    slot_usage:
      person_claim_type:
        required: true
        description: |
          Type of person claim. See PersonClaimTypeEnum.

          Common claim types:
          - full_name: Complete name
          - role_title: Job title
          - department: Organizational unit
          - email: Contact email (if public)
          - biography: Professional bio
          - specialization: Expertise areas
      person_claim_value:
        required: true
        description: |
          The extracted value. Must be verifiable at the source.
      source_url:
        required: true
        description: |
          URL where this claim was found.
          Required for ALL claims - enables verification.
      retrieved_on:
        required: true
        description: |
          Timestamp when data was retrieved.
          ISO 8601 format with timezone (UTC preferred).
      retrieval_agent:
        required: true
        # Narrows the imported slot to the enum declared in this file.
        range: RetrievalAgentEnum
        description: |
          Tool used to extract this data.

          - firecrawl: Institutional websites (primary)
          - playwright: JS-heavy sites
          - exa_crawling_exa: LinkedIn profiles
          - exa_linkedin_search_exa: Finding LinkedIn profiles when URL unknown
          - manual: Last resort
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    comments:
      - PersonWebClaim extends WebClaim pattern for person-specific data
      - 'XPATH OR REMOVE: Claims without verifiable source must be removed'
      - Links to PersonObservation via has_or_had_web_claim slot
      - LinkedIn data stored separately in person/entity/ files (Rule 12, 20)
      - See Rule 26 in AGENTS.md for complete documentation
    # NOTE(review): none of the examples sets person_claim_id, although that
    # slot is declared `identifier: true` — confirm whether ids should be added.
    examples:
      - value:
          person_claim_type: full_name
          person_claim_value: Taco Dibbits
          source_url: https://www.rijksmuseum.nl/en/about-us/organisation
          retrieved_on: '2025-01-15T10:30:00Z'
          person_xpath: /html/body/main/section[2]/div[1]/h2
          person_html_file: web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html
          person_xpath_match_score: 1.0
          retrieval_agent: firecrawl
        description: Exact match for museum director name
      - value:
          person_claim_type: role_title
          person_claim_value: General Director
          source_url: https://www.rijksmuseum.nl/en/about-us/organisation
          retrieved_on: '2025-01-15T10:30:00Z'
          person_xpath: /html/body/main/section[2]/div[1]/p[1]
          person_html_file: web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html
          person_xpath_match_score: 1.0
          retrieval_agent: firecrawl
        description: Role title from institutional page
      - value:
          person_claim_type: biography
          person_claim_value: Taco Dibbits has been General Director since 2016...
          source_url: https://www.rijksmuseum.nl/en/about-us/organisation
          retrieved_on: '2025-01-15T10:30:00Z'
          person_xpath: /html/body/main/section[2]/div[1]/div[2]
          person_html_file: web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html
          person_xpath_match_score: 0.92
          retrieval_agent: firecrawl
          # Key matches the declared slot name (singular).
          person_claim_note: Biography truncated from longer text on page
        description: Biography text with partial match score
      - value:
          person_claim_type: linkedin_url
          person_claim_value: https://www.linkedin.com/in/taco-dibbits
          source_url: https://www.linkedin.com/in/taco-dibbits
          retrieved_on: '2025-01-15T11:00:00Z'
          person_xpath: null
          retrieval_agent: exa_crawling_exa
          person_claim_note: Profile data stored in person/entity/taco-dibbits_20250115.json
        description: LinkedIn claim - XPath null for API extraction