# glam/schemas/20251121/linkml/modules/classes/ExtractionMetadata.yaml

# LinkML schema module for the ExtractionMetadata class, plus the enum and the
# slot declarations that appear further down in this file.
id: https://nde.nl/ontology/hc/class/ExtractionMetadata
name: extraction_metadata_class
title: Extraction Metadata Class
version: 1.0.0
# CURIE prefixes available to class_uri / slot_uri / mapping / meaning values.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  prov: http://www.w3.org/ns/prov#
  dct: http://purl.org/dc/terms/
  xsd: http://www.w3.org/2001/XMLSchema#
# Modules pulled in by this schema. cost_usd, extraction_agent and
# extraction_method are imported here and are not declared in the visible
# `slots:` section of this file.
imports:
- linkml:types
- ../metadata
- ./LLMResponse
- ../slots/extraction_agent
- ../slots/extraction_method
- ../slots/cost_usd
- ../slots/class_metadata_slots
# Range used for any slot that does not declare its own.
default_range: string
classes:
  ExtractionMetadata:
    # Provenance record for one extraction run, typed as a PROV-O activity.
    class_uri: prov:Activity
    # Literal block scalar: keeps the Markdown (including the fenced JSON
    # sample) readable in source and free of trailing whitespace.
    description: |
      Provenance metadata for data extraction activities.

      Records how, when, and by what agent data was extracted from
      external sources (LinkedIn, web scraping, APIs).

      **PROV-O Alignment**:
      - ExtractionMetadata IS a prov:Activity (the extraction process)
      - The extracted data IS the prov:Entity (output of the activity)
      - extraction_agent IS the prov:Agent (software/AI that performed extraction)
      - source_file/linkedin_url IS prov:used (input to the activity)

      **Use Cases**:
      - LinkedIn profile extractions via Exa API
      - Web scraping provenance
      - Staff list parsing provenance
      - Connection network extraction

      **Example JSON Structure**:
      ```json
      {
        "extraction_metadata": {
          "source_file": "/path/to/source.json",
          "staff_id": "org_staff_0001_name",
          "extraction_date": "2025-12-12T22:00:00Z",
          "extraction_method": "exa_crawling_exa",
          "extraction_agent": "claude-opus-4.5",
          "linkedin_url": "https://www.linkedin.com/in/...",
          "cost_usd": 0.001
        }
      }
      ```
    exact_mappings:
    - prov:Activity
    close_mappings:
    - schema:Action
    - dct:ProvenanceStatement
    # Slots carried by this class; the slot_usage section of this class refines
    # each of them.
    slots:
    - cost_usd
    - extraction_agent
    - extraction_date
    - extraction_method
    - linkedin_url
    - llm_response
    - request_id
    - source_file
    - specificity_annotation
    - staff_id
    - template_specificity
    slot_usage:
      # source_file: string path of the input file, serialized as prov:used.
      source_file:
        slot_uri: prov:used
        range: string
        description: |
          Path to the source file from which data was derived.
          PROV-O: prov:used - the entity that was used as input.
        examples:
        - description: Path to parsed staff list JSON
          value: /data/custodian/person/affiliated/parsed/rijksmuseum_staff_20251210T155416Z.json
      # staff_id: per-organization staff identifier, constrained by a regex.
      staff_id:
        description: |
          Unique identifier for the staff member within the source organization.
          Format: {org_slug}_staff_{index}_{name_slug}
        slot_uri: dct:identifier
        range: string
        # The literal hyphen is placed last in each character class. Some regex
        # dialects (e.g. XSD/XPath) only accept an unescaped "-" at the start
        # or end of a class, so "[a-z0-9-_]" is not portable. The set of
        # accepted strings is unchanged. Single quotes keep the regex a literal
        # string (no escape processing) for every YAML parser.
        pattern: '^[a-z0-9-]+_staff_[a-z0-9_-]+$'
        examples:
        - value: rijksmuseum_staff_0042_jan_van_der_berg
          description: Staff ID with org prefix, index, and name slug
      # extraction_date: mandatory datetime, serialized as prov:endedAtTime.
      extraction_date:
        slot_uri: prov:endedAtTime
        range: datetime
        required: true
        description: |
          ISO 8601 timestamp when the extraction was performed.
          PROV-O: prov:endedAtTime - when the activity completed.
        examples:
        - description: UTC timestamp of extraction
          value: '2025-12-12T22:00:00Z'
      # extraction_method: mandatory enum value naming the tool/method used.
      extraction_method:
        # The description names the predicate actually declared in slot_uri and
        # lists every permissible value of ProfileExtractionMethodEnum.
        description: |
          The method/tool used to extract the data.
          PROV-O: serialized as prov:wasGeneratedBy (see slot_uri); the agent
          that ran the method is recorded separately in extraction_agent.

          **Common Values**:
          - exa_crawling_exa: Exa AI crawling API
          - exa_contents: Exa contents endpoint
          - exa_crawling_glm47: Exa + GLM 4.7 extraction
          - linkedin_html_parser: Local HTML parsing
          - manual: Manual data entry
          - firecrawl: Firecrawl web scraping
          - playwright: Playwright browser automation
          - web_archive: Internet Archive Wayback Machine
          - institutional_website: Direct scraping from institutional website
        # NOTE(review): PROV-O declares prov:wasGeneratedBy with domain
        # prov:Entity, while this class is a prov:Activity - confirm that this
        # mapping is intended. Left unchanged so existing RDF output is stable.
        slot_uri: prov:wasGeneratedBy
        range: ProfileExtractionMethodEnum
        required: true
        examples:
        - value: exa_crawling_exa
          description: Extracted via Exa AI crawling API
      # extraction_agent: free-text name of the AI agent or software that ran
      # the extraction, serialized as prov:wasAssociatedWith.
      extraction_agent:
        description: |
          The AI agent or software that performed the extraction.
          PROV-O: prov:wasAssociatedWith - agent associated with the activity.

          **Common Values**:
          - claude-opus-4.5: Claude Opus 4.5 (manual extraction)
          - glm-4.7: ZhipuAI GLM 4.7
          - automated: Fully automated script (no LLM)
        slot_uri: prov:wasAssociatedWith
        range: string
        examples:
        - value: claude-opus-4.5
          description: Extracted by Claude Opus 4.5
        # Added so the documented "automated" value has a matching example.
        - value: automated
          description: Fully automated script (no LLM)
        # NOTE(review): the empty-string example is kept as-is, but it overlaps
        # with the documented "automated" value - confirm which convention
        # producers should use.
        - value: ''
          description: Empty string for fully automated extraction
      # linkedin_url: URI of the extracted LinkedIn profile, validated by regex.
      linkedin_url:
        description: |
          LinkedIn profile URL that was extracted.
          PROV-O: prov:used - the source entity.
        slot_uri: schema:url
        range: uri
        # The slug class also admits "%" and upper-case letters so that
        # percent-encoded profile slugs (non-ASCII names such as "ren%C3%A9e")
        # validate. Every URL accepted by the previous lower-case-only pattern
        # is still accepted. Single quotes keep the backslashes literal.
        pattern: '^https://www\.linkedin\.com/in/[A-Za-z0-9%-]+/?$'
        examples:
        - value: https://www.linkedin.com/in/jan-van-der-berg-12345
          description: LinkedIn profile URL
        - value: https://www.linkedin.com/in/ren%C3%A9e-de-vries-12345
          description: LinkedIn profile URL with a percent-encoded slug
      # cost_usd: non-negative float cost of the extraction call, in USD.
      cost_usd:
        slot_uri: schema:price
        range: float
        minimum_value: 0.0
        description: |
          API cost in USD for the extraction operation.
          Used for tracking extraction costs (Exa API, etc.).
        examples:
        - description: Exa API call cost
          value: 0.001
        - description: Free extraction (cached/local)
          value: 0.0
      # request_id: string identifier returned by the extraction service.
      # NOTE(review): staff_id in this class is also mapped to dct:identifier;
      # confirm that sharing the predicate between the two slots is intended.
      request_id:
        slot_uri: dct:identifier
        range: string
        description: |
          Unique request ID from the extraction service (for tracing).
        examples:
        - description: Exa API request ID
          value: exa_12345678-abcd-efgh-ijkl-mnopqrstuv
      # llm_response: optional, inlined LLMResponse object describing the LLM
      # call behind the extraction.
      llm_response:
        slot_uri: prov:qualifiedGeneration
        range: LLMResponse
        inlined: true
        required: false
        description: |
          Full LLM response provenance including reasoning_content.

          Captures GLM 4.7 Thinking Modes (Interleaved, Preserved, Turn-level)
          for extractions that use LLM processing.

          **PROV-O Alignment**:
          - The LLMResponse IS a sub-activity (prov:qualifiedGeneration)
          - Provides detailed audit trail of LLM inference for the extraction

          **Use Cases**:
          - LinkedIn profile extraction with GLM 4.7 reasoning
          - Web content extraction with chain-of-thought logging
          - Staff list processing with schema conformity validation

          **When to Include**:
          - extraction_method is exa_crawling_glm47
          - Any extraction involving LLM processing
          - When reasoning_content provides valuable audit trail
        examples:
        # The example value is a JSON document carried as a literal string.
        - description: GLM 4.7 response with Preserved Thinking for extraction
          value: |
            {
              "content": "Extracted institution data...",
              "reasoning_content": "Analyzing the input for LinkML schema conformity...",
              "thinking_mode": "preserved",
              "clear_thinking": false,
              "model": "glm-4.7",
              "provider": "zai",
              "created": "2025-12-23T10:30:00Z",
              "prompt_tokens": 150,
              "completion_tokens": 450,
              "total_tokens": 600,
              "finish_reason": "stop",
              "cost_usd": 0.0
            }
      # The two specificity slots are embedded inline as objects of the named
      # classes; those classes are not declared in the visible part of this file.
      specificity_annotation:
        inlined: true
        range: SpecificityAnnotation
      template_specificity:
        inlined: true
        range: TemplateSpecificityScores
    # Editorial notes attached to the class.
    comments:
    - Every person entity file MUST have extraction_metadata
    - See AGENTS.md Rule 20 for required fields
    - extraction_agent should be 'claude-opus-4.5' for manual extraction
    - cost_usd enables budget tracking for API-heavy extractions
    # External references for the vocabulary and the API mentioned above.
    see_also:
    - https://www.w3.org/TR/prov-o/
    - https://docs.exa.ai/
enums:
  # Closed vocabulary used as the range of extraction_method in the
  # ExtractionMetadata slot_usage.
  ProfileExtractionMethodEnum:
    description: |
      Enumeration of extraction methods/tools used for person profile data extraction.
      Used for LinkedIn profiles, web scraping of staff pages, and similar person data sources.
    permissible_values:
      # Exa-based methods
      exa_crawling_exa:
        meaning: schema:SoftwareApplication
        description: Exa AI crawling API - primary LinkedIn extraction
      exa_contents:
        meaning: schema:SoftwareApplication
        description: Exa contents endpoint - cached content retrieval
      exa_crawling_glm47:
        meaning: schema:SoftwareApplication
        description: Exa crawling with GLM 4.7 processing
      # Local parsing and manual entry
      linkedin_html_parser:
        meaning: schema:SoftwareApplication
        description: Local HTML parsing of saved LinkedIn pages
      # The only value whose meaning is prov:Person rather than a software application.
      manual:
        meaning: prov:Person
        description: Manual data entry by human operator
      # Other scraping tools and sources
      firecrawl:
        meaning: schema:SoftwareApplication
        description: Firecrawl web scraping service
      playwright:
        meaning: schema:SoftwareApplication
        description: Playwright browser automation
      web_archive:
        meaning: schema:SoftwareApplication
        description: Internet Archive Wayback Machine
      institutional_website:
        meaning: schema:SoftwareApplication
        description: Direct scraping from institutional website
# Base declarations for the slots defined in this module; the ExtractionMetadata
# class narrows them further through its slot_usage section.
slots:
  source_file:
    range: string
    description: Path to the source file from which data was derived
  staff_id:
    range: string
    description: Unique identifier for staff member within source organization
  extraction_date:
    range: datetime
    description: ISO 8601 timestamp when extraction was performed
  linkedin_url:
    range: uri
    description: LinkedIn profile URL that was extracted
  request_id:
    range: string
    description: Unique request ID from extraction service
  llm_response:
    range: LLMResponse
    description: Full LLM response with reasoning provenance (GLM 4.7 Thinking Modes)