glam/schemas/20251121/linkml/modules/classes/PersonWebClaim.yaml

# Schema identity: canonical URI, machine-readable name, and display title.
id: https://nde.nl/ontology/hc/class/PersonWebClaim
name: PersonWebClaim
title: Person Web Claim - Verifiable Person Data Claims
# Namespace prefixes available for CURIEs in this module.
# In the visible part of this file only linkml, hc, prov, schema and foaf are
# referenced; dcterms and pico are declared but not used here.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  dcterms: http://purl.org/dc/terms/
  prov: http://www.w3.org/ns/prov#
  pico: https://personsincontext.org/model#
  foaf: http://xmlns.com/foaf/0.1/
# Imported modules: the LinkML built-in types, one slot module per slot listed
# on the PersonWebClaim class, and the two classes used as inlined slot ranges
# (SpecificityAnnotation, TemplateSpecificityScores).
imports:
- linkml:types
- ../slots/source_url
- ../slots/retrieved_on
- ../slots/retrieval_agent
- ../slots/person_claim_id
- ../slots/person_claim_note
- ../slots/person_claim_type
- ../slots/person_claim_value
- ../slots/person_html_file
- ../slots/person_xpath
- ../slots/person_xpath_match_score
- ../slots/specificity_annotation
- ../slots/template_specificity
- ./SpecificityAnnotation
- ./TemplateSpecificityScores
# Prefix used for elements of this schema that do not carry an explicit one.
default_prefix: hc
enums:
  # Closed vocabulary for the person_claim_type slot. Each permissible value
  # carries a description plus comments naming where on a page the value is
  # typically found and any formatting or publication constraints.
  PersonClaimTypeEnum:
    description: |
      Types of claims that can be extracted about persons from web pages.
      Each claim type has expected source locations in institutional pages.
    permissible_values:
      # --- Name components ---
      full_name:
        description: Full name of the person
        comments:
        - 'Expected in: staff directory, about page, h2/h3 headings'
      given_name:
        description: First/given name only
        comments:
        - May be parsed from full_name
      family_name:
        description: Family/surname only
        comments:
        - May be parsed from full_name
      # --- Position within the organization ---
      role_title:
        description: Job title or role within organization
        comments:
        - 'Expected in: staff directory, org chart, biography section'
      department:
        description: Department or division name
        comments:
        - 'Expected in: staff directory, org chart'
      # --- Contact details (only when publicly listed) ---
      email:
        description: Professional email address
        comments:
        - 'Expected in: staff directory, contact section'
        - Only include if publicly listed
      phone:
        description: Professional phone number
        comments:
        - 'Expected in: staff directory, contact section'
        - Only include if publicly listed
      # --- Professional background ---
      biography:
        description: Professional biography text
        comments:
        - 'Expected in: staff page, about section'
      specialization:
        description: Area of expertise or specialization
        comments:
        - 'Expected in: biography, staff profile, research interests'
      education:
        description: Educational background or degrees
        comments:
        - 'Expected in: biography, CV section'
        - 'Example: ''PhD Art History, University of Amsterdam'''
      # --- Role tenure ---
      start_date:
        description: Date when person started current role
        comments:
        - 'Expected in: biography, news announcements'
        - 'Format: ISO 8601 date'
      end_date:
        description: Date when person ended role (if applicable)
        comments:
        - 'Expected in: historical records, farewell announcements'
      # --- Online identifiers and profiles ---
      photo_url:
        description: URL to profile photo
        comments:
        - 'Expected in: staff directory, biography page'
        - 'For LinkedIn: use CDN URL (media.licdn.com), not overlay page'
      linkedin_url:
        description: LinkedIn profile URL
        comments:
        - 'Expected in: staff page footer, social links'
      orcid:
        description: ORCID researcher identifier
        comments:
        - 'Expected in: research publications, CV section'
        - 'Format: 0000-0000-0000-0000'
      twitter_handle:
        description: Twitter/X handle
        comments:
        - 'Expected in: social media section, footer'
      # --- Career history and recognition ---
      previous_employer:
        description: Previous organization where person worked
        comments:
        - 'Expected in: biography, LinkedIn experience'
      publication:
        description: Citation or link to publication
        comments:
        - 'Expected in: publications list, CV'
      award:
        description: Professional award or recognition
        comments:
        - 'Expected in: biography, awards section'
      language_proficiency:
        description: Language the person speaks
        comments:
        - 'Expected in: LinkedIn profile, CV'
  # Closed vocabulary of retrieval tools. The PersonWebClaim class narrows the
  # retrieval_agent slot to this enum through its slot_usage.
  RetrievalAgentEnum:
    description: |
      Tools/agents used to retrieve and extract person data.
      Must match the tool that was actually used for extraction.
    permissible_values:
      # --- Web page scraping ---
      firecrawl:
        description: FireCrawl MCP tools for web scraping
        comments:
        - Primary tool for institutional websites
      playwright:
        description: Playwright browser automation
        comments:
        - For JavaScript-heavy sites requiring browser rendering
      # --- LinkedIn retrieval through Exa ---
      exa_crawling_exa:
        description: Exa AI crawling with direct URL
        comments:
        - Primary tool for LinkedIn profile extraction
      exa_linkedin_search_exa:
        description: Exa AI LinkedIn search
        comments:
        - For finding LinkedIn profiles when URL unknown
      # --- Human fallback ---
      manual:
        description: Manual inspection and copy
        comments:
        - Last resort - document why automated tools failed
# Slot definitions for the person-specific claim fields.
# NOTE(review): each slot below is also named in the imports list
# (../slots/person_claim_id, ../slots/person_xpath, ...). The imported files
# are not visible here - confirm that these local definitions are meant to
# coexist with (or replace) the imported ones.
slots:
  # Identifier of a claim instance; values are URIs or CURIEs.
  person_claim_id:
    identifier: true
    range: uriorcurie
    description: Unique identifier for this person claim
  # Which kind of fact the claim states; restricted to PersonClaimTypeEnum.
  person_claim_type:
    range: PersonClaimTypeEnum
    required: true
    description: Type of claim being made about the person
  # The extracted value itself, always stored as a string.
  person_claim_value:
    range: string
    required: true
    description: The extracted value for this claim
  # Location of the value inside the archived page. Optional at schema level
  # (required: false); the pattern accepts any value that begins with '/'.
  person_xpath:
    range: string
    required: false
    description: |
      XPath to the element containing this claim value.
      REQUIRED for web page claims. May be null for API-sourced claims.

      Format: Standard XPath 1.0 expression
      Example: /html/body/main/section[2]/div[1]/h2
    pattern: ^/.*
  # Path of the archived HTML snapshot the XPath refers to.
  person_html_file:
    range: string
    required: false
    description: |
      Relative path to archived HTML file containing this claim.
      Path is relative to the custodian data directory.

      Example: web/NL-NH-AMS-M-RM/rijksmuseum.nl/team.html
  # Match quality score, bounded to the closed interval [0.0, 1.0].
  person_xpath_match_score:
    range: float
    required: false
    minimum_value: 0.0
    maximum_value: 1.0
    description: |
      Match quality between claim value and text at XPath location.

      - 1.0 = Exact match
      - 0.8-0.99 = Near match after normalization
      - 0.5-0.79 = Substring match
      - <0.5 = Weak match (needs review)
  # Free-text remarks about the extraction; no `required` flag is declared.
  person_claim_note:
    range: string
    description: |
      Notes about this claim extraction.
      Document any issues, conflicts, or special circumstances.
classes:
  # PersonWebClaim: one extracted fact about a person plus the provenance
  # needed to verify it. slot_usage below makes source_url, retrieved_on and
  # retrieval_agent mandatory and narrows retrieval_agent to RetrievalAgentEnum.
  PersonWebClaim:
    class_uri: prov:Entity
    # NOTE(review): the linking slot on PersonObservation is written here as
    # `has_or_had_web_claim`, the name used by the embedded YAML example; the
    # prose previously said `web_claims`. PersonObservation is not visible from
    # this file - confirm the slot name against its definition.
    description: |
      A verifiable claim about a person extracted from a web page with provenance.

      **RULE 26 COMPLIANCE: Person Data Provenance**

      All person/staff data associated with heritage custodians MUST have
      web claim provenance. This includes:
      - Staff names, titles, and roles
      - Contact information (if publicly available)
      - Professional history and education
      - Affiliations and expertise areas

      **VERIFIABILITY PRINCIPLE**

      Like the base WebClaim class, PersonWebClaim follows the "XPath or Remove" principle:
      - Claims from web pages MUST have xpath pointing to source element
      - Claims from APIs (LinkedIn via Exa) may have xpath=null but MUST have source_url
      - Claims without any verifiable source are FABRICATED and must be removed

      **SOURCE HIERARCHY**

      When multiple sources provide the same information:
      1. Official institutional website (highest reliability)
      2. LinkedIn profile (high reliability)
      3. News articles/press releases (medium-high)
      4. Conference programs (medium)
      5. Academic publications (medium)
      6. Third-party databases (lower)

      Document all sources when available; note conflicts.

      **CLAIM TYPES**

      Common claim types for heritage institution staff:
      - **full_name**: Complete name as displayed
      - **role_title**: Job title (e.g., "Senior Curator")
      - **department**: Organizational unit
      - **email**: Professional contact (only if public)
      - **biography**: Professional bio text
      - **specialization**: Expertise areas
      - **education**: Degrees and institutions

      **EXTRACTION WORKFLOW**

      1. SCRAPE institutional staff/team pages with FireCrawl
      2. EXTRACT names and roles with XPath locations
      3. SEARCH LinkedIn for additional profile data
      4. CREATE PersonWebClaim for each extracted fact
      5. LINK claims to PersonObservation records

      **INTEGRATION WITH PERSONOBSERVATION**

      PersonObservation (the PiCo-based staff role record) references
      PersonWebClaim instances via the `has_or_had_web_claim` slot:

      ```yaml
      PersonObservation:
        person_name: "Dr. Jane Smith"
        staff_role: CONSERVATOR
        has_or_had_web_claim:
          - person_claim_type: full_name
            person_claim_value: "Dr. Jane Smith"
            source_url: https://museum.org/team
            person_xpath: /html/body/main/div[2]/h3
            retrieval_agent: firecrawl
      ```

      **LINKEDIN PROFILE HANDLING**

      For LinkedIn data, create separate profile files (per Rule 12, Rule 20)
      and reference them:

      ```yaml
      linkedin_claims:
        linkedin_url: https://www.linkedin.com/in/jane-smith
        profile_data_path: data/custodian/person/entity/jane-smith_20250115.json
        retrieved_on: "2025-01-15T10:30:00Z"
        retrieval_agent: exa_crawling_exa
      ```
    # Ontology alignment: identical to prov:Entity (also the class_uri), and
    # loosely comparable to schema:PropertyValue and foaf:Document.
    exact_mappings:
    - prov:Entity
    close_mappings:
    - schema:PropertyValue
    - foaf:Document
    # Slots carried by every claim: the person_* claim fields, the shared
    # provenance slots, and the two specificity annotation slots.
    slots:
    - person_claim_id
    - person_claim_note
    - person_claim_type
    - person_claim_value
    - person_html_file
    - person_xpath
    - person_xpath_match_score
    - retrieval_agent
    - retrieved_on
    - source_url
    - specificity_annotation
    - template_specificity
    # Class-level refinements of the slots above.
    slot_usage:
      person_claim_type:
        required: true
        description: |
          Type of person claim. See PersonClaimTypeEnum.

          Common claim types:
          - full_name: Complete name
          - role_title: Job title
          - department: Organizational unit
          - email: Contact email (if public)
          - biography: Professional bio
          - specialization: Expertise areas
      person_claim_value:
        required: true
        description: |
          The extracted value. Must be verifiable at the source.
      # The three provenance slots are mandatory on every claim.
      source_url:
        required: true
        description: |
          URL where this claim was found.
          Required for ALL claims - enables verification.
      retrieved_on:
        required: true
        description: |
          Timestamp when data was retrieved.
          ISO 8601 format with timezone (UTC preferred).
      retrieval_agent:
        required: true
        range: RetrievalAgentEnum
        description: |
          Tool used to extract this data.

          - firecrawl: Institutional websites (primary)
          - playwright: JS-heavy sites
          - exa_crawling_exa: LinkedIn profiles
          - exa_linkedin_search_exa: Finding LinkedIn profiles when URL unknown
          - manual: Last resort
      # Both specificity slots hold nested objects rather than references.
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    comments:
    - PersonWebClaim extends WebClaim pattern for person-specific data
    - 'XPATH OR REMOVE: Claims without verifiable source must be removed'
    - Links to PersonObservation via has_or_had_web_claim slot
    - LinkedIn data stored separately in person/entity/ files (Rule 12, 20)
    - See Rule 26 in AGENTS.md for complete documentation
    # Example instances. Keys must be slot names declared on this class; the
    # note field is `person_claim_note` (singular).
    # NOTE(review): none of the examples sets person_claim_id although that
    # slot is declared `identifier: true`; add IDs once the ID scheme for
    # claims is confirmed.
    examples:
    - value:
        person_claim_type: full_name
        person_claim_value: Taco Dibbits
        source_url: https://www.rijksmuseum.nl/en/about-us/organisation
        retrieved_on: '2025-01-15T10:30:00Z'
        person_xpath: /html/body/main/section[2]/div[1]/h2
        person_html_file: web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html
        person_xpath_match_score: 1.0
        retrieval_agent: firecrawl
      description: Exact match for museum director name
    - value:
        person_claim_type: role_title
        person_claim_value: General Director
        source_url: https://www.rijksmuseum.nl/en/about-us/organisation
        retrieved_on: '2025-01-15T10:30:00Z'
        person_xpath: /html/body/main/section[2]/div[1]/p[1]
        person_html_file: web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html
        person_xpath_match_score: 1.0
        retrieval_agent: firecrawl
      description: Role title from institutional page
    - value:
        person_claim_type: biography
        person_claim_value: Taco Dibbits has been General Director since 2016...
        source_url: https://www.rijksmuseum.nl/en/about-us/organisation
        retrieved_on: '2025-01-15T10:30:00Z'
        person_xpath: /html/body/main/section[2]/div[1]/div[2]
        person_html_file: web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html
        person_xpath_match_score: 0.92
        retrieval_agent: firecrawl
        person_claim_note: Biography truncated from longer text on page
      description: Biography text with partial match score
    - value:
        person_claim_type: linkedin_url
        person_claim_value: https://www.linkedin.com/in/taco-dibbits
        source_url: https://www.linkedin.com/in/taco-dibbits
        retrieved_on: '2025-01-15T11:00:00Z'
        # Explicit null: no XPath exists for API-sourced claims.
        person_xpath: null
        retrieval_agent: exa_crawling_exa
        person_claim_note: Profile data stored in person/entity/taco-dibbits_20250115.json
      description: LinkedIn claim - XPath null for API extraction