# glam/schemas/20251121/linkml/modules/classes/ExtractionMetadata.yaml

# Schema identity: IRI, machine-readable name, display title, and version.
id: 'https://nde.nl/ontology/hc/class/ExtractionMetadata'
name: extraction_metadata_class
title: Extraction Metadata Class
version: '1.0.0'

# CURIE prefix -> namespace IRI expansions declared for this schema.
# IRIs are quoted so that characters such as '#' and ':' can never be
# interpreted as YAML syntax.
prefixes:
  linkml: 'https://w3id.org/linkml/'
  hc: 'https://nde.nl/ontology/hc/'
  schema: 'http://schema.org/'
  prov: 'http://www.w3.org/ns/prov#'
  dct: 'http://purl.org/dc/terms/'
  xsd: 'http://www.w3.org/2001/XMLSchema#'
imports:
- linkml:types
- ../metadata
- ./LLMResponse
- ./SpecificityAnnotation
- ./TemplateSpecificityScores
- ../enums/ProfileExtractionMethodEnum
- ../slots/extraction_agent
- ../slots/extraction_method
- ../slots/cost_usd
- ../slots/source_file
- ../slots/staff_id
- ../slots/extraction_date
- ../slots/linkedin_url
- ../slots/request_id
- ../slots/llm_response
- ../slots/specificity_annotation
- ../slots/template_specificity
# Range applied to any slot that does not declare its own `range`.
default_range: string
classes:
  # Provenance record describing a single data-extraction run, modelled as a
  # PROV-O activity. The slots themselves are defined in the imported slot
  # modules; `slot_usage` below narrows them for this class.
  ExtractionMetadata:
    class_uri: prov:Activity
    description: |
      Provenance metadata for data extraction activities.

      Records how, when, and by what agent data was extracted from
      external sources (LinkedIn, web scraping, APIs).

      **PROV-O Alignment**:
      - ExtractionMetadata IS a prov:Activity (the extraction process)
      - The extracted data IS the prov:Entity (output of the activity)
      - extraction_agent IS the prov:Agent (software/AI that performed extraction)
      - source_file/linkedin_url IS prov:used (input to the activity)

      **Use Cases**:
      - LinkedIn profile extractions via Exa API
      - Web scraping provenance
      - Staff list parsing provenance
      - Connection network extraction

      **Example JSON Structure**:
      ```json
      {
        "extraction_metadata": {
          "source_file": "/path/to/source.json",
          "staff_id": "org_staff_0001_name",
          "extraction_date": "2025-12-12T22:00:00Z",
          "extraction_method": "exa_crawling_exa",
          "extraction_agent": "claude-opus-4.5",
          "linkedin_url": "https://www.linkedin.com/in/...",
          "cost_usd": 0.001
        }
      }
      ```
    exact_mappings:
    - prov:Activity
    close_mappings:
    - schema:Action
    - dct:ProvenanceStatement
    slots:
    - cost_usd
    - extraction_agent
    - extraction_date
    - extraction_method
    - linkedin_url
    - llm_response
    - request_id
    - source_file
    - specificity_annotation
    - staff_id
    - template_specificity
    slot_usage:
      # Input file the extraction read from.
      source_file:
        range: string
        examples:
        - value: /data/custodian/person/affiliated/parsed/rijksmuseum_staff_20251210T155416Z.json
          description: Path to parsed staff list JSON
      # Identifier of the form <org>_staff_<index-and-name-slug>.
      staff_id:
        range: string
        # The hyphen is placed last inside each character class so it is
        # always a literal: a mid-class '-' (as in `[a-z0-9-_]`) is rejected
        # or read as a range by some regex dialects (e.g. XSD patterns).
        # The set of accepted strings is unchanged.
        pattern: '^[a-z0-9-]+_staff_[a-z0-9_-]+$'
        examples:
        - value: rijksmuseum_staff_0042_jan_van_der_berg
          description: Staff ID with org prefix, index, and name slug
      # Mandatory timestamp of the extraction.
      extraction_date:
        range: datetime
        required: true
        examples:
        - value: '2025-12-12T22:00:00Z'
          description: UTC timestamp of extraction
      # Mandatory; must be a permissible value of ProfileExtractionMethodEnum.
      extraction_method:
        range: ProfileExtractionMethodEnum
        required: true
        examples:
        - value: exa_crawling_exa
          description: Extracted via Exa AI crawling API
      # Free-text name of the agent that performed the extraction.
      extraction_agent:
        range: string
        examples:
        - value: claude-opus-4.5
          description: Extracted by Claude Opus 4.5
        - value: ''
          description: Empty string for fully automated extraction
      # Profile URL; only https://www.linkedin.com/in/<lowercase-slug> with an
      # optional trailing slash is accepted.
      linkedin_url:
        range: uri
        pattern: '^https://www\.linkedin\.com/in/[a-z0-9-]+/?$'
        examples:
        - value: https://www.linkedin.com/in/jan-van-der-berg-12345
          description: LinkedIn profile URL
      # Non-negative cost of the extraction call, in US dollars.
      cost_usd:
        range: float
        minimum_value: 0.0
        examples:
        - value: 0.001
          description: Exa API call cost
        - value: 0.0
          description: Free extraction (cached/local)
      request_id:
        range: string
        examples:
        - value: exa_12345678-abcd-efgh-ijkl-mnopqrstuv
          description: Exa API request ID
      # Optional nested LLMResponse object, stored inline.
      llm_response:
        range: LLMResponse
        required: false
        inlined: true
        examples:
        - value: |
            {
              "content": "Extracted institution data...",
              "reasoning_content": "Analyzing the input for LinkML schema conformity...",
              "thinking_mode": "preserved",
              "clear_thinking": false,
              "model": "glm-4.7",
              "provider": "zai",
              "created": "2025-12-23T10:30:00Z",
              "prompt_tokens": 150,
              "completion_tokens": 450,
              "total_tokens": 600,
              "finish_reason": "stop",
              "cost_usd": 0.0
            }
          description: GLM 4.7 response with Preserved Thinking for extraction
      specificity_annotation:
        range: SpecificityAnnotation
        inlined: true
      template_specificity:
        range: TemplateSpecificityScores
        inlined: true
    comments:
    - Every person entity file MUST have extraction_metadata
    - See AGENTS.md Rule 20 for required fields
    - extraction_agent should be 'claude-opus-4.5' for manual extraction
    - cost_usd enables budget tracking for API-heavy extractions
    see_also:
    - https://www.w3.org/TR/prov-o/
    - https://docs.exa.ai/