# glam/schemas/20251121/linkml/modules/classes/ExtractionMetadata.yaml

# Extraction Metadata Class
# Provenance for LinkedIn/web extractions with PROV-O alignment

# Schema identity: canonical URI, machine name, human-readable title, version.
id: 'https://nde.nl/ontology/hc/class/ExtractionMetadata'
name: 'extraction_metadata_class'
title: 'Extraction Metadata Class'
version: '1.0.0'

# CURIE prefixes available to class_uri / slot_uri / mappings in this module.
prefixes:
  linkml: 'https://w3id.org/linkml/'
  hc: 'https://nde.nl/ontology/hc/'
  schema: 'http://schema.org/'
  prov: 'http://www.w3.org/ns/prov#'
  dct: 'http://purl.org/dc/terms/'
  xsd: 'http://www.w3.org/2001/XMLSchema#'

# Modules pulled in: the LinkML built-in types plus the sibling metadata module.
imports:
  - 'linkml:types'
  - '../metadata'

# Slots that do not declare a range fall back to string.
default_range: 'string'

classes:

  # ExtractionMetadata: provenance record for a single extraction run,
  # typed as a prov:Activity. Lists eight slots and refines each of them
  # (URI mapping, range, pattern, required-ness, examples) in slot_usage.
  ExtractionMetadata:
    class_uri: prov:Activity
    description: |
      Provenance metadata for data extraction activities.

      Records how, when, and by what agent data was extracted from
      external sources (LinkedIn, web scraping, APIs).

      **PROV-O Alignment**:
      - ExtractionMetadata IS a prov:Activity (the extraction process)
      - The extracted data IS the prov:Entity (output of the activity)
      - extraction_agent IS the prov:Agent (software/AI that performed extraction)
      - source_file/linkedin_url IS prov:used (input to the activity)

      **Use Cases**:
      - LinkedIn profile extractions via Exa API
      - Web scraping provenance
      - Staff list parsing provenance
      - Connection network extraction

      **Example JSON Structure**:
      ```json
      {
        "extraction_metadata": {
          "source_file": "/path/to/source.json",
          "staff_id": "org_staff_0001_name",
          "extraction_date": "2025-12-12T22:00:00Z",
          "extraction_method": "exa_crawling_exa",
          "extraction_agent": "claude-opus-4.5",
          "linkedin_url": "https://www.linkedin.com/in/...",
          "cost_usd": 0.001
        }
      }
      ```

    exact_mappings:
      - prov:Activity
    close_mappings:
      - schema:Action
      - dct:ProvenanceStatement

    # Only extraction_date and extraction_method are marked required below.
    slots:
      - source_file
      - staff_id
      - extraction_date
      - extraction_method
      - extraction_agent
      - linkedin_url
      - cost_usd
      - request_id

    slot_usage:
      source_file:
        description: |
          Path to the source file from which data was derived.
          PROV-O: prov:used - the entity that was used as input.
        slot_uri: prov:used
        range: string
        examples:
          - value: "/data/custodian/person/affiliated/parsed/rijksmuseum_staff_20251210T155416Z.json"
            description: "Path to parsed staff list JSON"

      staff_id:
        description: |
          Unique identifier for the staff member within the source organization.
          Format: {org_slug}_staff_{index}_{name_slug}
        # NOTE(review): request_id below maps to dct:identifier as well, so the
        # two slots are indistinguishable by URI in RDF output - confirm intent.
        slot_uri: dct:identifier
        range: string
        # The hyphen is placed last in the second character class so that every
        # regex engine reads it as a literal rather than as part of a range.
        # The set of accepted characters is unchanged: a-z, 0-9, "_" and "-".
        pattern: "^[a-z0-9-]+_staff_[a-z0-9_-]+$"
        examples:
          - value: "rijksmuseum_staff_0042_jan_van_der_berg"
            description: "Staff ID with org prefix, index, and name slug"

      extraction_date:
        description: |
          ISO 8601 timestamp when the extraction was performed.
          PROV-O: prov:endedAtTime - when the activity completed.
        slot_uri: prov:endedAtTime
        range: datetime
        required: true
        examples:
          - value: "2025-12-12T22:00:00Z"
            description: "UTC timestamp of extraction"

      extraction_method:
        description: |
          The method/tool used to extract the data.
          PROV-O: prov:wasAssociatedWith via software agent.

          **Common Values**:
          - exa_crawling_exa: Exa AI crawling API
          - exa_contents: Exa contents endpoint
          - exa_crawling_glm46: Exa + GLM 4.6 extraction
          - linkedin_html_parser: Local HTML parsing
          - manual: Manual data entry
          - firecrawl: Firecrawl web scraping
          - playwright: Playwright browser automation
          - web_archive: Internet Archive Wayback Machine
          - institutional_website: Direct scraping from institutional website
        # NOTE(review): in PROV-O, prov:wasGeneratedBy links an Entity to the
        # Activity that produced it, but this slot sits on the Activity itself
        # and the description above cites prov:wasAssociatedWith. The URI is
        # left unchanged to keep serialized output stable - confirm the mapping.
        slot_uri: prov:wasGeneratedBy
        range: ExtractionMethodEnum
        required: true
        examples:
          - value: "exa_crawling_exa"
            description: "Extracted via Exa AI crawling API"

      extraction_agent:
        description: |
          The AI agent or software that performed the extraction.
          PROV-O: prov:wasAssociatedWith - agent associated with the activity.

          **Common Values**:
          - claude-opus-4.5: Claude Opus 4.5 (manual extraction)
          - glm-4.6: ZhipuAI GLM 4.6
          - automated: Fully automated script (no LLM)
        slot_uri: prov:wasAssociatedWith
        range: string
        # NOTE(review): the second example uses an empty string for automated
        # runs while the description lists the literal "automated" - confirm
        # which convention the data files actually follow.
        examples:
          - value: "claude-opus-4.5"
            description: "Extracted by Claude Opus 4.5"
          - value: ""
            description: "Empty string for fully automated extraction"

      linkedin_url:
        description: |
          LinkedIn profile URL that was extracted.
          PROV-O: prov:used - the source entity.
        slot_uri: schema:url
        range: uri
        # Accepts only https://www.linkedin.com/in/<slug> with an optional
        # trailing slash, where <slug> is lowercase letters, digits and hyphens.
        pattern: "^https://www\\.linkedin\\.com/in/[a-z0-9-]+/?$"
        examples:
          - value: "https://www.linkedin.com/in/jan-van-der-berg-12345"
            description: "LinkedIn profile URL"

      cost_usd:
        description: |
          API cost in USD for the extraction operation.
          Used for tracking extraction costs (Exa API, etc.).
        slot_uri: schema:price
        range: float
        # Zero is allowed (see the second example); negative costs are not.
        minimum_value: 0.0
        examples:
          - value: 0.001
            description: "Exa API call cost"
          - value: 0.0
            description: "Free extraction (cached/local)"

      request_id:
        description: |
          Unique request ID from the extraction service (for tracing).
        # NOTE(review): shares dct:identifier with staff_id above - confirm.
        slot_uri: dct:identifier
        range: string
        examples:
          - value: "exa_12345678-abcd-efgh-ijkl-mnopqrstuv"
            description: "Exa API request ID"

    comments:
      - "Every person entity file MUST have extraction_metadata"
      - "See AGENTS.md Rule 20 for required fields"
      - "extraction_agent should be 'claude-opus-4.5' for manual extraction"
      - "cost_usd enables budget tracking for API-heavy extractions"

    see_also:
      - "https://www.w3.org/TR/prov-o/"
      - "https://docs.exa.ai/"

# Controlled vocabulary used as the range of the extraction_method slot.
enums:
  ExtractionMethodEnum:
    description: >
      Enumeration of extraction methods/tools
      used for person data extraction.
    permissible_values:
      # Exa-based methods
      exa_crawling_exa:
        meaning: schema:SoftwareApplication
        description: 'Exa AI crawling API - primary LinkedIn extraction'
      exa_contents:
        meaning: schema:SoftwareApplication
        description: 'Exa contents endpoint - cached content retrieval'
      exa_crawling_glm46:
        meaning: schema:SoftwareApplication
        description: 'Exa crawling with GLM 4.6 processing'
      # Local parsing of saved pages
      linkedin_html_parser:
        meaning: schema:SoftwareApplication
        description: 'Local HTML parsing of saved LinkedIn pages'
      # The only value whose meaning is a person rather than software
      manual:
        meaning: prov:Person
        description: 'Manual data entry by human operator'
      # Other scraping / automation / archive sources
      firecrawl:
        meaning: schema:SoftwareApplication
        description: 'Firecrawl web scraping service'
      playwright:
        meaning: schema:SoftwareApplication
        description: 'Playwright browser automation'
      web_archive:
        meaning: schema:SoftwareApplication
        description: 'Internet Archive Wayback Machine'
      institutional_website:
        meaning: schema:SoftwareApplication
        description: 'Direct scraping from institutional website'

# Module-level slot declarations: each entry gives only a range and a short
# description. URI mappings, patterns and required flags are not set here.
slots:
  # Free-text (string) slots
  source_file:
    range: string
    description: 'Path to the source file from which data was derived'

  staff_id:
    range: string
    description: 'Unique identifier for staff member within source organization'

  # Timestamp of the extraction
  extraction_date:
    range: datetime
    description: 'ISO 8601 timestamp when extraction was performed'

  # Constrained to the values of ExtractionMethodEnum
  extraction_method:
    range: ExtractionMethodEnum
    description: 'Method/tool used to extract the data'

  extraction_agent:
    range: string
    description: 'AI agent or software that performed extraction'

  # URI-typed slot
  linkedin_url:
    range: uri
    description: 'LinkedIn profile URL that was extracted'

  # Numeric slot
  cost_usd:
    range: float
    description: 'API cost in USD for the extraction operation'

  request_id:
    range: string
    description: 'Unique request ID from extraction service'