# LinkML schema module that defines the ExtractionMetadata class.
id: https://nde.nl/ontology/hc/class/ExtractionMetadata
name: extraction_metadata_class
title: Extraction Metadata Class
version: 1.0.0
# CURIE prefixes available to class_uri, slot_uri, mapping and import values.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  prov: http://www.w3.org/ns/prov#
  dct: http://purl.org/dc/terms/
  xsd: http://www.w3.org/2001/XMLSchema#
imports:
  - linkml:types
  - ../metadata
  - ./LLMResponse
  - ./SpecificityAnnotation
  - ./TemplateSpecificityScores
  - ../enums/ProfileExtractionMethodEnum
  - ../slots/extraction_agent
  - ../slots/extraction_method
  - ../slots/cost_usd
  - ../slots/source_file
  - ../slots/staff_id
  - ../slots/extraction_date
  - ../slots/linkedin_url
  - ../slots/request_id
  - ../slots/llm_response
  - ../slots/specificity_annotation
  - ../slots/template_specificity
# Range applied to any slot that does not declare its own `range`.
default_range: string

classes:
  # Provenance record for one extraction run, modelled as a prov:Activity.
  ExtractionMetadata:
    class_uri: prov:Activity
    # Literal block scalar: the Markdown below is stored verbatim, newlines kept,
    # with a single trailing newline.
    description: |
      Provenance metadata for data extraction activities.

      Records how, when, and by what agent data was extracted from
      external sources (LinkedIn, web scraping, APIs).

      **PROV-O Alignment**:
      - ExtractionMetadata IS a prov:Activity (the extraction process)
      - The extracted data IS the prov:Entity (output of the activity)
      - extraction_agent IS the prov:Agent (software/AI that performed extraction)
      - source_file/linkedin_url IS prov:used (input to the activity)

      **Use Cases**:
      - LinkedIn profile extractions via Exa API
      - Web scraping provenance
      - Staff list parsing provenance
      - Connection network extraction

      **Example JSON Structure**:
      ```json
      {
        "extraction_metadata": {
          "source_file": "/path/to/source.json",
          "staff_id": "org_staff_0001_name",
          "extraction_date": "2025-12-12T22:00:00Z",
          "extraction_method": "exa_crawling_exa",
          "extraction_agent": "claude-opus-4.5",
          "linkedin_url": "https://www.linkedin.com/in/...",
          "cost_usd": 0.001
        }
      }
      ```
    # Exact ontology match for this class (same term as class_uri).
    exact_mappings:
      - prov:Activity
    # Related, non-equivalent terms from schema.org and Dublin Core Terms.
    close_mappings:
      - schema:Action
      - dct:ProvenanceStatement
    # Slots attached to this class; each one has an entry under slot_usage.
    slots:
      - cost_usd
      - extraction_agent
      - extraction_date
      - extraction_method
      - linkedin_url
      - llm_response
      - request_id
      - source_file
      - specificity_annotation
      - staff_id
      - template_specificity
    # Class-specific refinements of the imported slot definitions.
    slot_usage:
      # Path of the input file the extraction read from; serialized as prov:used.
      source_file:
        description: |
          Path to the source file from which data was derived.
          PROV-O: prov:used - the entity that was used as input.
        slot_uri: prov:used
        range: string
        examples:
          - value: /data/custodian/person/affiliated/parsed/rijksmuseum_staff_20251210T155416Z.json
            description: Path to parsed staff list JSON
      # Identifier of the staff member inside the source organization.
      staff_id:
        description: |
          Unique identifier for the staff member within the source organization.
          Format: {org_slug}_staff_{index}_{name_slug}
        slot_uri: dct:identifier
        range: string
        # The hyphen is kept last in each character class so every regex dialect
        # reads it as a literal instead of a range operator; quoted so the
        # leading `^` and the brackets are never reinterpreted by a YAML loader.
        pattern: '^[a-z0-9-]+_staff_[a-z0-9_-]+$'
        examples:
          - value: rijksmuseum_staff_0042_jan_van_der_berg
            description: Staff ID with org prefix, index, and name slug
      # Required completion timestamp of the extraction (prov:endedAtTime).
      extraction_date:
        description: |
          ISO 8601 timestamp when the extraction was performed.
          PROV-O: prov:endedAtTime - when the activity completed.
        slot_uri: prov:endedAtTime
        range: datetime
        required: true
        examples:
          # Quoted so the loader keeps the timestamp as a string.
          - value: '2025-12-12T22:00:00Z'
            description: UTC timestamp of extraction
      # Required method/tool identifier, constrained to ProfileExtractionMethodEnum.
      # NOTE(review): the description cites prov:wasAssociatedWith while slot_uri
      # is prov:wasGeneratedBy (the same URI family as extraction_agent vs. an
      # entity-to-activity property) — confirm which mapping is intended.
      extraction_method:
        description: |
          The method/tool used to extract the data.
          PROV-O: prov:wasAssociatedWith via software agent.

          **Common Values**:
          - exa_crawling_exa: Exa AI crawling API
          - exa_contents: Exa contents endpoint
          - exa_crawling_glm47: Exa + GLM 4.7 extraction
          - linkedin_html_parser: Local HTML parsing
          - manual: Manual data entry
          - firecrawl: Firecrawl web scraping
          - playwright: Playwright browser automation
        slot_uri: prov:wasGeneratedBy
        range: ProfileExtractionMethodEnum
        required: true
        examples:
          - value: exa_crawling_exa
            description: Extracted via Exa AI crawling API
      # Free-text name of the AI agent or software that ran the extraction;
      # serialized as prov:wasAssociatedWith.
      extraction_agent:
        description: |
          The AI agent or software that performed the extraction.
          PROV-O: prov:wasAssociatedWith - agent associated with the activity.

          **Common Values**:
          - claude-opus-4.5: Claude Opus 4.5 (manual extraction)
          - glm-4.7: ZhipuAI GLM 4.7
          - automated: Fully automated script (no LLM)
        slot_uri: prov:wasAssociatedWith
        range: string
        examples:
          - value: claude-opus-4.5
            description: Extracted by Claude Opus 4.5
          # Uses the documented `automated` value rather than an empty string, so
          # the example agrees with the Common Values list in the description.
          - value: automated
            description: Fully automated extraction (no LLM)
      # LinkedIn profile URL; must match the lowercase `/in/<slug>` pattern below.
      # NOTE(review): the description cites prov:used while slot_uri is
      # schema:url — confirm which mapping is intended.
      linkedin_url:
        description: |
          LinkedIn profile URL that was extracted.
          PROV-O: prov:used - the source entity.
        slot_uri: schema:url
        range: uri
        # Accepts only https://www.linkedin.com/in/ followed by a slug of
        # lowercase letters, digits and hyphens, with an optional trailing slash.
        pattern: ^https://www\.linkedin\.com/in/[a-z0-9-]+/?$
        examples:
          - value: https://www.linkedin.com/in/jan-van-der-berg-12345
            description: LinkedIn profile URL
      # Non-negative API cost of the extraction in USD; 0.0 is allowed.
      cost_usd:
        description: |
          API cost in USD for the extraction operation.
          Used for tracking extraction costs (Exa API, etc.).
        slot_uri: schema:price
        range: float
        minimum_value: 0.0
        examples:
          - value: 0.001
            description: Exa API call cost
          - value: 0.0
            description: Free extraction (cached/local)
      # Request identifier returned by the extraction service, kept for tracing.
      # NOTE(review): shares slot_uri dct:identifier with staff_id, so the two
      # slots map to the same predicate — confirm this is intended.
      request_id:
        description: |
          Unique request ID from the extraction service (for tracing).
        slot_uri: dct:identifier
        range: string
        examples:
          - value: exa_12345678-abcd-efgh-ijkl-mnopqrstuv
            description: Exa API request ID
      # Optional inlined LLMResponse object recording the LLM call behind an
      # extraction; serialized as prov:qualifiedGeneration.
      llm_response:
        description: |
          Full LLM response provenance including reasoning_content.

          Captures GLM 4.7 Thinking Modes (Interleaved, Preserved, Turn-level)
          for extractions that use LLM processing.

          **PROV-O Alignment**:
          - The LLMResponse IS a sub-activity (prov:qualifiedGeneration)
          - Provides detailed audit trail of LLM inference for the extraction

          **Use Cases**:
          - LinkedIn profile extraction with GLM 4.7 reasoning
          - Web content extraction with chain-of-thought logging
          - Staff list processing with schema conformity validation

          **When to Include**:
          - extraction_method is exa_crawling_glm47
          - Any extraction involving LLM processing
          - When reasoning_content provides valuable audit trail
        slot_uri: prov:qualifiedGeneration
        range: LLMResponse
        required: false
        inlined: true
        # The example value is a JSON document kept as a literal string.
        examples:
          - value: |
              {
                "content": "Extracted institution data...",
                "reasoning_content": "Analyzing the input for LinkML schema conformity...",
                "thinking_mode": "preserved",
                "clear_thinking": false,
                "model": "glm-4.7",
                "provider": "zai",
                "created": "2025-12-23T10:30:00Z",
                "prompt_tokens": 150,
                "completion_tokens": 450,
                "total_tokens": 600,
                "finish_reason": "stop",
                "cost_usd": 0.0
              }
            description: GLM 4.7 response with Preserved Thinking for extraction
      # Inlined SpecificityAnnotation object.
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      # Inlined TemplateSpecificityScores object.
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    # Free-text usage rules for people and agents producing this record.
    comments:
      - Every person entity file MUST have extraction_metadata
      - See AGENTS.md Rule 20 for required fields
      - extraction_agent should be 'claude-opus-4.5' for manual extraction
      - cost_usd enables budget tracking for API-heavy extractions
    # External references: the PROV-O specification and the Exa API docs.
    see_also:
      - https://www.w3.org/TR/prov-o/
      - https://docs.exa.ai/