glam/schemas/20251121/linkml/modules/classes/LLMResponse.yaml
feat(schema): complete multiple slot_fixes.yaml migrations
Session 2026-01-19: Completed remaining migrations per Rules 53/56/60.

Major migrations:
1. claim_type → has_or_had_type + ClaimType/ClaimTypes (60+ concrete types in 11 categories)
2. circumstances_of_death → is_deceased + DeceasedStatus + CauseOfDeath
3. claims_count → has_or_had_quantity + Quantity (with based_on_claim for provenance)
4. classification_status → has_or_had_type + ClassificationStatusType

Created files:
- ClaimType.yaml, ClaimTypes.yaml (abstract base + 60+ concrete subclasses)
- DeceasedStatus.yaml, CauseOfDeath.yaml, CauseOfDeathTypeEnum.yaml
- ClassificationStatus.yaml, ClassificationStatusType.yaml, ClassificationStatusTypes.yaml
- CITESAppendix.yaml, City.yaml, CertaintyLevel.yaml
- is_deceased.yaml, is_or_was_caused_by.yaml, based_on_claim.yaml

Archived slots:
- claim_type, circumstances_of_death, claims_count, classification_status

Added Rule 60 (No Migration Deferral) to AGENTS.md: agents MUST execute all migrations rather than defer them.

All 527 slot_fixes.yaml entries now complete (100%).
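Example (claims_count migration): the flat count becomes a structured quantity
with provenance. Sketch only; field names follow this repo's has_or_had_*
convention, and the claim reference is hypothetical:

    # before
    claims_count: 3
    # after
    has_or_had_quantity:
      quantity_value: 3
      based_on_claim: hc:Claim/example  # hypothetical claim identifier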

id: https://nde.nl/ontology/hc/class/LLMResponse
name: llm_response_class
title: LLM Response Class
version: 1.0.0
prefixes:
linkml: https://w3id.org/linkml/
hc: https://nde.nl/ontology/hc/
schema: http://schema.org/
prov: http://www.w3.org/ns/prov#
dct: http://purl.org/dc/terms/
xsd: http://www.w3.org/2001/XMLSchema#
imports:
- linkml:types
- ../metadata
- ./SpecificityAnnotation
- ./TemplateSpecificityScore # was: TemplateSpecificityScores - migrated per Rule 53 (2026-01-17)
- ./TemplateSpecificityType
- ./TemplateSpecificityTypes
- ../enums/LLMProviderEnum
- ../enums/FinishReasonEnum
- ../enums/ThinkingModeEnum
- ../slots/content
- ../slots/reasoning_content
- ../slots/model
- ../slots/provider
- ../slots/prompt_token
# completion_token migrated to has_or_had_token + Token with OutputTokenType per Rule 53 (2026-01-19)
- ../slots/consumes_or_consumed # was: total_token - migrated per Rule 53 (2026-01-15)
- ../slots/has_or_had_token # was: cached_token - migrated per Rule 53/56 (2026-01-17)
- ./Token # for has_or_had_token range
- ../slots/finish_reason
- ../slots/latency_ms
- ../slots/has_or_had_mode # was: thinking_mode - migrated per Rule 53/56 (2026-01-16)
- ./ThinkingMode # for has_or_had_mode range
- ../slots/clear_thinking
- ../slots/created
- ../slots/cost_usd
- ../slots/request_id
- ../slots/specificity_annotation
- ../slots/has_or_had_score # was: template_specificity - migrated per Rule 53 (2026-01-17)
default_range: string
classes:
LLMResponse:
class_uri: prov:Activity
description: "Provenance metadata for LLM API responses, including GLM 4.7 Thinking Modes.\n\nCaptures complete response\
\ metadata from LLM providers (ZhipuAI GLM, Anthropic,\nOpenAI, etc.) for traceability and analysis. The key innovation\
\ is capturing\n`reasoning_content` - the chain-of-thought reasoning that GLM 4.7 exposes\nthrough its three thinking\
\ modes.\n\n**GLM 4.7 Thinking Modes** (https://docs.z.ai/guides/capabilities/thinking-mode):\n\n1. **Interleaved Thinking**\
\ (default, since GLM-4.5):\n - Model thinks between tool calls and after receiving tool results\n - Enables complex,\
\ step-by-step reasoning with tool chaining\n - Returns `reasoning_content` alongside `content` in every response\n\
\n2. **Preserved Thinking** (new in GLM-4.7):\n - Retains reasoning_content from previous assistant turns in context\n\
\ - Preserves reasoning continuity across multi-turn conversations\n - Improves model performance and increases\
\ cache hit rates\n - **Enabled by default on Coding Plan endpoint**\n - Requires returning EXACT, UNMODIFIED reasoning_content\
\ back to API\n - Set via: `\"clear_thinking\": false` (do NOT clear previous reasoning)\n\n3. **Turn-level Thinking**\
\ (new in GLM-4.7):\n - Control reasoning computation on a per-turn basis\n - Enable/disable thinking independently\
\ for each request in a session\n - Useful for balancing speed (simple queries) vs accuracy (complex tasks)\n -\
\ Set via: `\"thinking\": {\"type\": \"enabled\"}` or `\"thinking\": {\"type\": \"disabled\"}`\n\n**Critical Implementation\
\ Note for Preserved Thinking**:\nWhen using Preserved Thinking with tool calls, thinking blocks MUST be:\n1. Explicitly\
\ preserved in the messages array\n2. Returned together with tool results\n3. Kept in EXACT original sequence (no reordering/editing)\n\
\n**PROV-O Alignment**:\n- LLMResponse IS a prov:Activity (the inference process)\n- content IS prov:Entity (the generated\
\ output)\n- model/provider IS prov:Agent (the AI system)\n- reasoning_content documents the prov:Plan (how the agent\
\ reasoned)\n- prompt (input) IS prov:used (input to the activity)\n\n**Use Cases**:\n- DSPy RAG responses with reasoning\
\ traces\n- Heritage institution extraction provenance\n- LinkML schema conformity validation\n- Ontology mapping decision\
\ logs\n- Multi-turn agent conversations with preserved context\n"
exact_mappings:
- prov:Activity
close_mappings:
- schema:Action
- schema:CreativeWork
slots:
- has_or_had_token # was: cached_token AND completion_token - migrated per Rule 53/56 (2026-01-17, 2026-01-19)
- clear_thinking
# completion_token removed - now use has_or_had_token with OutputTokenType
- content
- cost_usd
- created
- finish_reason
- latency_ms
- model
- prompt_token
- provider
- reasoning_content
- request_id
- specificity_annotation
- has_or_had_score # was: template_specificity - migrated per Rule 53 (2026-01-17)
- has_or_had_mode # was: thinking_mode - migrated per Rule 53/56 (2026-01-16)
- consumes_or_consumed # was: total_token - migrated per Rule 53 (2026-01-15)
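    # Taken together, a minimal conforming instance might look like this
    # (hypothetical values; see slot_usage below for per-slot examples):
    #
    #   model: glm-4.7
    #   provider: zai
    #   content: "The Rijksmuseum is a national museum in Amsterdam."
    #   created: "2026-01-19T12:00:00Z"
    #   prompt_token: 150
    #   consumes_or_consumed: 600
    #   finish_reason: stop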
slot_usage:
content:
range: string
required: true
examples:
- value: The Rijksmuseum is a national museum in Amsterdam dedicated to Dutch arts and history.
description: Extracted heritage institution description
reasoning_content:
range: string
required: false
examples:
- value: 'The user is asking about Dutch heritage institutions. I need to identify: 1) Institution name: Rijksmuseum,
2) Type: Museum (maps to InstitutionTypeEnum.MUSEUM), 3) Location: Amsterdam (city in Noord-Holland province)...'
description: GLM 4.7 interleaved thinking showing explicit schema reasoning
model:
range: string
required: true
examples:
- value: glm-4.7
description: ZhipuAI GLM 4.7 with Interleaved Thinking
provider:
range: LLMProviderEnum
required: true
examples:
- value: zai
description: ZhipuAI (Z.AI) - GLM models
request_id:
range: string
required: false
examples:
- value: req_8f3a2b1c4d5e6f7g
description: Provider-assigned request identifier
created:
range: datetime
required: true
examples:
- value: '2025-12-23T10:30:00Z'
description: UTC timestamp of response generation
prompt_token:
range: integer
minimum_value: 0
examples:
- value: 150
description: 150 tokens in the input prompt
# completion_token slot_usage removed - now covered by has_or_had_token with OutputTokenType (2026-01-19)
consumes_or_consumed: # was: total_token - migrated per Rule 53 (2026-01-15)
description: |
Total tokens consumed by this LLM response (prompt + completion).
MIGRATED from total_token per slot_fixes.yaml (Rule 53, 2026-01-15).
From API response: usage.total_tokens
range: integer
minimum_value: 0
examples:
- value: 600
description: 600 total tokens (150 prompt + 450 completion)
has_or_had_token: # was: cached_token AND completion_token - migrated per Rule 53/56 (2026-01-17, 2026-01-19)
description: |
Token data for this LLM response.
Multivalued list capturing different token types (cached, completion, reasoning, etc.).
**Token Types** (from TokenTypes.yaml):
- CachedTokenType: Tokens served from provider cache (reduced cost)
- OutputTokenType: Completion/output tokens (content + reasoning_content)
- ReasoningTokenType: Chain-of-thought reasoning tokens
- InputTokenType: Prompt tokens
**API Mapping**:
- Cached: usage.prompt_tokens_details.cached_tokens
- Completion: usage.completion_tokens
MIGRATED from cached_token (2026-01-17) and completion_token (2026-01-19) per Rule 53/56.
range: Token
multivalued: true
inlined: true
inlined_as_list: true
required: false
examples:
- value:
- has_or_had_type:
has_or_had_identifier: hc:TokenType/CACHED
has_or_had_label: Cached Token
has_or_had_quantity:
quantity_value: 50
has_or_had_description: Tokens from provider KV cache
- has_or_had_type:
has_or_had_identifier: hc:TokenType/OUTPUT
has_or_had_label: Output Token
has_or_had_quantity:
quantity_value: 450
has_or_had_description: Completion tokens (content + reasoning)
description: Both cached (50) and completion (450) tokens
- value:
- has_or_had_type:
has_or_had_identifier: hc:TokenType/OUTPUT
has_or_had_label: Output Token
has_or_had_quantity:
quantity_value: 200
description: Simple completion token count (no caching)
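      # Sketch of the assumed mapping from an OpenAI-style usage block to Token
      # entries (field paths per the API Mapping above; values hypothetical):
      #
      #   usage:
      #     completion_tokens: 450      # -> Token with hc:TokenType/OUTPUT
      #     prompt_tokens_details:
      #       cached_tokens: 50         # -> Token with hc:TokenType/CACHED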
finish_reason:
range: FinishReasonEnum
required: false
examples:
- value: stop
description: Model completed naturally
latency_ms:
range: integer
minimum_value: 0
required: false
examples:
- value: 1250
description: 1.25 seconds total response time
cost_usd:
range: float
minimum_value: 0.0
required: false
examples:
- value: 0.0
description: Free (Z.AI Coding Plan)
- value: 0.015
description: OpenAI GPT-4 Turbo cost estimate
has_or_had_mode: # was: thinking_mode - migrated per Rule 53/56 (2026-01-16)
description: |
The GLM 4.7 thinking mode configuration for this request.
MIGRATED from thinking_mode to has_or_had_mode with ThinkingMode class.
range: ThinkingMode
required: false
examples:
- value:
mode_value: preserved
has_or_had_label: Preserved Thinking
description: Preserved thinking for multi-turn agent conversations
- value:
mode_value: interleaved
has_or_had_label: Interleaved Thinking
description: Default interleaved thinking between tool calls
- value:
mode_value: disabled
has_or_had_label: Disabled
description: Disabled for fast, simple queries
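      # Assumed correspondence between mode_value and the request settings from
      # the class description (sketch, not an exhaustive mapping):
      #
      #   interleaved -> default behaviour, no extra request fields needed
      #   preserved   -> send "clear_thinking": false and echo reasoning_content back
      #   disabled    -> send "thinking": {"type": "disabled"}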
clear_thinking:
range: boolean
required: false
examples:
- value: false
description: Keep reasoning for Preserved Thinking (recommended)
- value: true
description: Clear previous reasoning (fresh context each turn)
comments:
- reasoning_content is the key field for Interleaved Thinking (GLM 4.7)
- Store reasoning_content for debugging, auditing, and DSPy optimization
- 'Z.AI Coding Plan endpoint: https://api.z.ai/api/coding/paas/v4/chat/completions'
- 'For DSPy: use LLMResponse to track all LLM calls in the pipeline'
- See AGENTS.md Rule 11 for Z.AI API configuration
see_also:
- https://www.w3.org/TR/prov-o/
- https://api.z.ai/docs
- https://dspy-docs.vercel.app/