# glam/schemas/20251121/linkml/modules/classes/PersonWebClaim.yaml

# PersonWebClaim Class
# Extends WebClaim pattern for person-specific claims with XPath provenance
#
# Created December 2025 to provide:
# - Verifiable provenance for staff/person data from web pages
# - Person-specific claim types (name, role, contact, etc.)
# - Integration with PersonObservation and StaffRole patterns
#
# Key principle (per Rule 26):
#   All person/staff data associated with heritage custodians MUST have
#   web claim provenance. Staff information without verifiable sources is unacceptable.
#
# This class follows the same XPATH OR REMOVE principle as WebClaim:
#   Every claim from a webpage MUST have an XPath pointer to the exact
#   location in archived HTML where that value appears.

# Schema identity: canonical URI, machine-readable name, and display title.
id: 'https://nde.nl/ontology/hc/class/PersonWebClaim'
name: 'PersonWebClaim'
title: 'Person Web Claim - Verifiable Person Data Claims'

# Namespace prefixes available for CURIEs in this schema.
# URIs are quoted so that a trailing '#' can never be read as a comment.
prefixes:
  linkml: 'https://w3id.org/linkml/'  # used by the `linkml:types` import
  hc: 'https://nde.nl/ontology/hc/'  # this schema's own namespace (default_prefix)
  schema: 'http://schema.org/'  # used in close_mappings
  # NOTE(review): dcterms is not referenced in the visible schema body.
  dcterms: 'http://purl.org/dc/terms/'
  prov: 'http://www.w3.org/ns/prov#'  # used for class_uri / exact_mappings
  # NOTE(review): pico is not referenced in the visible schema body.
  pico: 'https://personsincontext.org/model#'
  foaf: 'http://xmlns.com/foaf/0.1/'  # used in close_mappings

imports:
  - linkml:types
  - ../slots/source_url
  - ../slots/retrieved_on

# Elements declared here without an explicit prefix resolve against `hc`.
default_prefix: hc

enums:
  # Closed vocabulary for the `person_claim_type` slot.
  # Each value's `comments` record where on a page the fact is usually found
  # and, where relevant, the expected value format.
  PersonClaimTypeEnum:
    description: |
      Types of claims that can be extracted about persons from web pages.
      Each claim type has expected source locations in institutional pages.
    permissible_values:
      # --- Name ---
      full_name:
        description: Full name of the person
        comments:
          - "Expected in: staff directory, about page, h2/h3 headings"
      given_name:
        description: First/given name only
        comments:
          - "May be parsed from full_name"
      family_name:
        description: Family/surname only
        comments:
          - "May be parsed from full_name"
      # --- Position within the organization ---
      role_title:
        description: Job title or role within organization
        comments:
          - "Expected in: staff directory, org chart, biography section"
      department:
        description: Department or division name
        comments:
          - "Expected in: staff directory, org chart"
      # --- Contact details (public listings only) ---
      email:
        description: Professional email address
        comments:
          - "Expected in: staff directory, contact section"
          - "Only include if publicly listed"
      phone:
        description: Professional phone number
        comments:
          - "Expected in: staff directory, contact section"
          - "Only include if publicly listed"
      # --- Profile content ---
      biography:
        description: Professional biography text
        comments:
          - "Expected in: staff page, about section"
      specialization:
        description: Area of expertise or specialization
        comments:
          - "Expected in: biography, staff profile, research interests"
      education:
        description: Educational background or degrees
        comments:
          - "Expected in: biography, CV section"
          - "Example: 'PhD Art History, University of Amsterdam'"
      # --- Role tenure ---
      start_date:
        description: Date when person started current role
        comments:
          - "Expected in: biography, news announcements"
          - "Format: ISO 8601 date"
      end_date:
        description: Date when person ended role (if applicable)
        comments:
          - "Expected in: historical records, farewell announcements"
          # Same format as start_date so the two values stay comparable.
          - "Format: ISO 8601 date"
      # --- Online identities and media ---
      photo_url:
        description: URL to profile photo
        comments:
          - "Expected in: staff directory, biography page"
          - "For LinkedIn: use CDN URL (media.licdn.com), not overlay page"
      linkedin_url:
        description: LinkedIn profile URL
        comments:
          - "Expected in: staff page footer, social links"
      orcid:
        description: ORCID researcher identifier
        comments:
          - "Expected in: research publications, CV section"
          # The last character of an ORCID iD is a checksum and may be 'X',
          # so a digits-only format hint would wrongly exclude valid iDs.
          - "Format: 0000-0000-0000-000X (final check character is 0-9 or X)"
      twitter_handle:
        description: Twitter/X handle
        comments:
          - "Expected in: social media section, footer"
      # --- Career history and achievements ---
      previous_employer:
        description: Previous organization where person worked
        comments:
          - "Expected in: biography, LinkedIn experience"
      publication:
        description: Citation or link to publication
        comments:
          - "Expected in: publications list, CV"
      award:
        description: Professional award or recognition
        comments:
          - "Expected in: biography, awards section"
      language_proficiency:
        description: Language the person speaks
        comments:
          - "Expected in: LinkedIn profile, CV"

  # Closed vocabulary for the `retrieval_agent` slot: the tool that fetched
  # and extracted the claim.
  RetrievalAgentEnum:
    description: |
      Tools/agents used to retrieve and extract person data.
      Must match the tool that was actually used for extraction.
    permissible_values:
      # --- Web page scraping / rendering ---
      firecrawl:
        description: 'FireCrawl MCP tools for web scraping'
        comments:
          - 'Primary tool for institutional websites'
      playwright:
        description: 'Playwright browser automation'
        comments:
          - 'For JavaScript-heavy sites requiring browser rendering'
      # --- Exa-based LinkedIn retrieval ---
      exa_crawling_exa:
        description: 'Exa AI crawling with direct URL'
        comments:
          - 'Primary tool for LinkedIn profile extraction'
      exa_linkedin_search_exa:
        description: 'Exa AI LinkedIn search'
        comments:
          - 'For finding LinkedIn profiles when URL unknown'
      # --- Human fallback ---
      manual:
        description: 'Manual inspection and copy'
        comments:
          - 'Last resort - document why automated tools failed'

slots:
  # Identifier slot: the key that distinguishes one person claim from another.
  person_claim_id:
    description: 'Unique identifier for this person claim'
    range: uriorcurie
    identifier: true

  # Kind of fact being asserted; values are limited to PersonClaimTypeEnum.
  person_claim_type:
    description: 'Type of claim being made about the person'
    range: PersonClaimTypeEnum
    required: true

  # The extracted value itself, always stored as a string.
  person_claim_value:
    description: 'The extracted value for this claim'
    range: string
    required: true

  # XPath provenance - CRITICAL for verifiability
  person_xpath:
    range: string
    # Optional at the slot level because API-sourced claims carry no XPath.
    required: false
    description: |
      XPath to the element containing this claim value.
      REQUIRED for web page claims. May be null for API-sourced claims.

      Format: Absolute XPath 1.0 location path (must start with "/")
      Example: /html/body/main/section[2]/div[1]/h2
    # Only values beginning with "/" are accepted, so XPath expressions that
    # start otherwise (e.g. with "(" or a function call) do not validate.
    pattern: "^/.*"

  # Pointer to the archived HTML snapshot in which the claim value appears.
  person_html_file:
    description: |
      Relative path to archived HTML file containing this claim.
      Path is relative to the custodian data directory.

      Example: web/NL-NH-AMS-M-RM/rijksmuseum.nl/team.html
    range: string
    required: false

  # Confidence that the text found at `person_xpath` equals the claim value.
  person_xpath_match_score:
    range: float
    required: false
    # Scores are normalized to the closed interval [0.0, 1.0].
    minimum_value: 0.0
    maximum_value: 1.0
    description: |
      Match quality between claim value and text at XPath location.

      Bands are contiguous so every score falls in exactly one:

      - 1.0 = Exact match
      - 0.8 to <1.0 = Near match after normalization
      - 0.5 to <0.8 = Substring match
      - <0.5 = Weak match (needs review)

  # Records which tool produced the claim; values come from RetrievalAgentEnum.
  retrieval_agent:
    description: |
      Tool/agent used to retrieve this data.
      Must accurately reflect the extraction method.
    range: RetrievalAgentEnum
    required: true

  # Optional free-text annotation attached to a single claim.
  person_claim_notes:
    description: |
      Notes about this claim extraction.
      Document any issues, conflicts, or special circumstances.
    range: string

classes:
  # A single, source-backed fact about a person. Every instance carries the
  # source URL, retrieval timestamp and retrieval tool; web-page claims also
  # carry the XPath and archived HTML file that let the value be re-verified.
  PersonWebClaim:
    class_uri: prov:Entity
    description: |
      A verifiable claim about a person extracted from a web page with provenance.

      **RULE 26 COMPLIANCE: Person Data Provenance**

      All person/staff data associated with heritage custodians MUST have
      web claim provenance. This includes:
      - Staff names, titles, and roles
      - Contact information (if publicly available)
      - Professional history and education
      - Affiliations and expertise areas

      **VERIFIABILITY PRINCIPLE**

      Like the base WebClaim class, PersonWebClaim follows the "XPath or Remove" principle:
      - Claims from web pages MUST have person_xpath pointing to source element
      - Claims from APIs (LinkedIn via Exa) may have person_xpath=null but MUST have source_url
      - Claims without any verifiable source are FABRICATED and must be removed

      **SOURCE HIERARCHY**

      When multiple sources provide the same information:
      1. Official institutional website (highest reliability)
      2. LinkedIn profile (high reliability)
      3. News articles/press releases (medium-high)
      4. Conference programs (medium)
      5. Academic publications (medium)
      6. Third-party databases (lower)

      Document all sources when available; note conflicts.

      **CLAIM TYPES**

      Common claim types for heritage institution staff:
      - **full_name**: Complete name as displayed
      - **role_title**: Job title (e.g., "Senior Curator")
      - **department**: Organizational unit
      - **email**: Professional contact (only if public)
      - **biography**: Professional bio text
      - **specialization**: Expertise areas
      - **education**: Degrees and institutions

      **EXTRACTION WORKFLOW**

      1. SCRAPE institutional staff/team pages with FireCrawl
      2. EXTRACT names and roles with XPath locations
      3. SEARCH LinkedIn for additional profile data
      4. CREATE PersonWebClaim for each extracted fact
      5. LINK claims to PersonObservation records

      **INTEGRATION WITH PERSONOBSERVATION**

      PersonObservation (the PiCo-based staff role record) references
      PersonWebClaim instances via the `web_claims` slot:

      ```yaml
      PersonObservation:
        person_name: "Dr. Jane Smith"
        staff_role: CONSERVATOR
        web_claims:
          - person_claim_id: hc:person-claim-jane-smith-full-name
            person_claim_type: full_name
            person_claim_value: "Dr. Jane Smith"
            source_url: https://museum.org/team
            retrieved_on: "2025-01-15T10:30:00Z"
            person_xpath: /html/body/main/div[2]/h3
            retrieval_agent: firecrawl
      ```

      **LINKEDIN PROFILE HANDLING**

      For LinkedIn data, create separate profile files (per Rule 12, Rule 20)
      and reference them:

      ```yaml
      linkedin_claims:
        linkedin_url: https://www.linkedin.com/in/jane-smith
        profile_data_path: data/custodian/person/entity/jane-smith_20250115.json
        retrieved_on: "2025-01-15T10:30:00Z"
        retrieval_agent: exa_crawling_exa
      ```

    exact_mappings:
      - prov:Entity

    close_mappings:
      - schema:PropertyValue
      - foaf:Document

    # Slot order: identity, claim content, source/time, XPath provenance,
    # retrieval tool, free-text notes.
    slots:
      - person_claim_id
      - person_claim_type
      - person_claim_value
      - source_url
      - retrieved_on
      - person_xpath
      - person_html_file
      - person_xpath_match_score
      - retrieval_agent
      - person_claim_notes

    # Class-local refinements: the imported source_url / retrieved_on slots
    # are made mandatory here, and descriptions are specialised for persons.
    slot_usage:
      person_claim_type:
        required: true
        description: |
          Type of person claim. See PersonClaimTypeEnum.

          Common claim types:
          - full_name: Complete name
          - role_title: Job title
          - department: Organizational unit
          - email: Contact email (if public)
          - biography: Professional bio
          - specialization: Expertise areas

      person_claim_value:
        required: true
        description: |
          The extracted value. Must be verifiable at the source.

      source_url:
        required: true
        description: |
          URL where this claim was found.
          Required for ALL claims - enables verification.

      retrieved_on:
        required: true
        description: |
          Timestamp when data was retrieved.
          ISO 8601 format with timezone (UTC preferred).

      retrieval_agent:
        required: true
        description: |
          Tool used to extract this data.

          - firecrawl: Institutional websites (primary)
          - playwright: JS-heavy sites
          - exa_crawling_exa: LinkedIn profiles
          - exa_linkedin_search_exa: LinkedIn profile discovery when URL unknown
          - manual: Last resort

    comments:
      - "PersonWebClaim extends WebClaim pattern for person-specific data"
      - "XPATH OR REMOVE: Claims without verifiable source must be removed"
      - "Links to PersonObservation via web_claims slot"
      - "LinkedIn data stored separately in person/entity/ files (Rule 12, 20)"
      - "See Rule 26 in AGENTS.md for complete documentation"

    # Documentation references (in comments to avoid CURIE validation):
    # - .opencode/PERSON_DATA_PROVENANCE_RULE.md
    # - modules/classes/WebClaim.yaml
    # - modules/classes/PersonObservation.yaml
    # - AGENTS.md Rule 26

    # NOTE(review): the person_claim_id values below are illustrative
    # placeholders added so each example carries its identifier slot;
    # replace them with ids minted per the project's convention.
    examples:
      - value:
          person_claim_id: hc:person-claim-taco-dibbits-full-name
          person_claim_type: full_name
          person_claim_value: "Taco Dibbits"
          source_url: "https://www.rijksmuseum.nl/en/about-us/organisation"
          retrieved_on: "2025-01-15T10:30:00Z"
          person_xpath: "/html/body/main/section[2]/div[1]/h2"
          person_html_file: "web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html"
          person_xpath_match_score: 1.0
          retrieval_agent: firecrawl
        description: "Exact match for museum director name"

      - value:
          person_claim_id: hc:person-claim-taco-dibbits-role-title
          person_claim_type: role_title
          person_claim_value: "General Director"
          source_url: "https://www.rijksmuseum.nl/en/about-us/organisation"
          retrieved_on: "2025-01-15T10:30:00Z"
          person_xpath: "/html/body/main/section[2]/div[1]/p[1]"
          person_html_file: "web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html"
          person_xpath_match_score: 1.0
          retrieval_agent: firecrawl
        description: "Role title from institutional page"

      - value:
          person_claim_id: hc:person-claim-taco-dibbits-biography
          person_claim_type: biography
          person_claim_value: "Taco Dibbits has been General Director since 2016..."
          source_url: "https://www.rijksmuseum.nl/en/about-us/organisation"
          retrieved_on: "2025-01-15T10:30:00Z"
          person_xpath: "/html/body/main/section[2]/div[1]/div[2]"
          person_html_file: "web/NL-NH-AMS-M-RM/rijksmuseum.nl/organisation.html"
          person_xpath_match_score: 0.92
          retrieval_agent: firecrawl
          person_claim_notes: "Biography truncated from longer text on page"
        description: "Biography text with partial match score"

      # API-sourced claim: no archived HTML, so the XPath fields are absent/null.
      - value:
          person_claim_id: hc:person-claim-taco-dibbits-linkedin-url
          person_claim_type: linkedin_url
          person_claim_value: "https://www.linkedin.com/in/taco-dibbits"
          source_url: "https://www.linkedin.com/in/taco-dibbits"
          retrieved_on: "2025-01-15T11:00:00Z"
          person_xpath: null
          retrieval_agent: exa_crawling_exa
          person_claim_notes: "Profile data stored in person/entity/taco-dibbits_20250115.json"
        description: "LinkedIn claim - XPath null for API extraction"