# glam/schemas/20251121/linkml/modules/classes/LLMResponse.yaml
# LLM Response Class
# Provenance for LLM API responses with GLM 4.7 Thinking Modes
# Captures reasoning_content for Interleaved, Preserved, and Turn-level Thinking
id: https://nde.nl/ontology/hc/class/LLMResponse
name: llm_response_class
title: LLM Response Class
version: 1.0.0
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  prov: http://www.w3.org/ns/prov#
  dct: http://purl.org/dc/terms/
  xsd: http://www.w3.org/2001/XMLSchema#
imports:
  - linkml:types
  - ../metadata
default_range: string
classes:
  LLMResponse:
    class_uri: prov:Activity
    description: |
      Provenance metadata for LLM API responses, including GLM 4.7 Thinking Modes.
      Captures complete response metadata from LLM providers (ZhipuAI GLM, Anthropic,
      OpenAI, etc.) for traceability and analysis. The key innovation is capturing
      `reasoning_content`, the chain-of-thought reasoning that GLM 4.7 exposes
      through its three thinking modes.

      **GLM 4.7 Thinking Modes** (https://docs.z.ai/guides/capabilities/thinking-mode):

      1. **Interleaved Thinking** (default, since GLM-4.5):
         - Model thinks between tool calls and after receiving tool results
         - Enables complex, step-by-step reasoning with tool chaining
         - Returns `reasoning_content` alongside `content` in every response
      2. **Preserved Thinking** (new in GLM-4.7):
         - Retains reasoning_content from previous assistant turns in context
         - Preserves reasoning continuity across multi-turn conversations
         - Improves model performance and increases cache hit rates
         - **Enabled by default on the Coding Plan endpoint**
         - Requires returning the EXACT, UNMODIFIED reasoning_content back to the API
         - Set via: `"clear_thinking": false` (do NOT clear previous reasoning)
      3. **Turn-level Thinking** (new in GLM-4.7):
         - Controls reasoning computation on a per-turn basis
         - Enables/disables thinking independently for each request in a session
         - Useful for balancing speed (simple queries) vs. accuracy (complex tasks)
         - Set via: `"thinking": {"type": "enabled"}` or `"thinking": {"type": "disabled"}`

      **Critical Implementation Note for Preserved Thinking**:
      When using Preserved Thinking with tool calls, thinking blocks MUST be:
      1. Explicitly preserved in the messages array
      2. Returned together with tool results
      3. Kept in EXACT original sequence (no reordering or editing)

      **PROV-O Alignment**:
      - LLMResponse IS a prov:Activity (the inference process)
      - content IS a prov:Entity (the generated output)
      - model/provider IS a prov:Agent (the AI system)
      - reasoning_content documents the prov:Plan (how the agent reasoned)
      - the prompt is linked via prov:used (the input to the activity)

      **Use Cases**:
      - DSPy RAG responses with reasoning traces
      - Heritage institution extraction provenance
      - LinkML schema conformity validation
      - Ontology mapping decision logs
      - Multi-turn agent conversations with preserved context

      **Example JSON Structure (GLM 4.7 with Preserved Thinking)**:

      ```json
      {
        "llm_response": {
          "content": "The Rijksmuseum is a museum in Amsterdam...",
          "reasoning_content": "The user is asking about heritage institutions. Let me identify the key entities: 1) Rijksmuseum is the institution name, 2) It's a museum (institution_type: MUSEUM), 3) Located in Amsterdam (city)...",
          "thinking_mode": "preserved",
          "clear_thinking": false,
          "model": "glm-4.7",
          "provider": "zai",
          "request_id": "req_abc123",
          "created": "2025-12-23T10:30:00Z",
          "prompt_tokens": 150,
          "completion_tokens": 450,
          "total_tokens": 600,
          "cached_tokens": 50,
          "finish_reason": "stop",
          "latency_ms": 1250,
          "cost_usd": 0.0
        }
      }
      ```
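
      The request side of these settings can be sketched in Python. The endpoint
      URL and the `model`, `thinking`, and `clear_thinking` fields come from this
      schema's notes; transport details (headers, auth) are provider-specific and
      omitted. This is an illustrative payload builder, not an official client.

      ```python
      # Sketch: build a GLM 4.7 chat request with Preserved Thinking
      # enabled, using fields documented in this schema.
      ZAI_CODING_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"

      def build_payload(messages: list) -> dict:
          return {
              "model": "glm-4.7",
              "messages": messages,
              "thinking": {"type": "enabled"},  # turn-level: reasoning on
              "clear_thinking": False,          # preserved: keep prior reasoning
          }

      payload = build_payload([{"role": "user", "content": "Describe the Rijksmuseum."}])
      ```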
    exact_mappings:
      - prov:Activity
    close_mappings:
      - schema:Action
      - schema:CreativeWork
    slots:
      - content
      - reasoning_content
      - thinking_mode
      - clear_thinking
      - model
      - provider
      - request_id
      - created
      - prompt_tokens
      - completion_tokens
      - total_tokens
      - cached_tokens
      - finish_reason
      - latency_ms
      - cost_usd
    slot_usage:
      content:
        description: |
          The final LLM response text (message.content from API response).
          PROV-O: prov:generated - the entity produced by this activity.
          This is the primary output shown to users and used for downstream processing.
        slot_uri: prov:generated
        range: string
        required: true
        examples:
          - value: "The Rijksmuseum is a national museum in Amsterdam dedicated to Dutch arts and history."
            description: "Extracted heritage institution description"
      reasoning_content:
        description: |
          Interleaved Thinking - the model's chain-of-thought reasoning.
          PROV-O: prov:hadPlan - documents HOW the agent reasoned.

          **GLM 4.7 Interleaved Thinking**:
          GLM 4.7 returns `reasoning_content` in every response, exposing the
          model's step-by-step reasoning process. This enables:
          1. **Schema Validation**: Model reasons about LinkML constraints before generating output
          2. **Ontology Mapping**: Explicit reasoning about CIDOC-CRM, CPOV, TOOI class mappings
          3. **RDF Quality**: Chain-of-thought validates triple construction
          4. **Transparency**: Full audit trail of extraction decisions

          **DSPy Integration**:
          When using DSPy, reasoning_content can be used to:
          - Validate signature conformity
          - Debug failed extractions
          - Improve prompt engineering
          - Train on successful reasoning patterns

          May be null for providers that don't expose reasoning (Claude, GPT-4).
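
          A minimal Python sketch of extracting both fields, assuming an
          OpenAI-compatible response shape (as in the JSON example above);
          the exact shape varies by provider:

          ```python
          # Sketch: pull content and optional reasoning from an
          # OpenAI-compatible chat completion dict.
          def split_response(resp: dict):
              msg = resp["choices"][0]["message"]
              # reasoning_content is absent for providers without thinking modes
              return msg["content"], msg.get("reasoning_content")

          content, reasoning = split_response({
              "choices": [{"message": {
                  "content": "The Rijksmuseum is a museum in Amsterdam...",
                  "reasoning_content": "Identify entities: Rijksmuseum, museum, Amsterdam...",
              }}]
          })
          ```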
        slot_uri: prov:hadPlan
        range: string
        required: false
        examples:
          - value: "The user is asking about Dutch heritage institutions. I need to identify: 1) Institution name: Rijksmuseum, 2) Type: Museum (maps to InstitutionTypeEnum.MUSEUM), 3) Location: Amsterdam (city in Noord-Holland province)..."
            description: "GLM 4.7 interleaved thinking showing explicit schema reasoning"
      model:
        description: |
          The LLM model identifier from the API response.
          PROV-O: Part of prov:wasAssociatedWith - identifies the specific model version.
          Common values:
          - glm-4.7: ZhipuAI GLM 4.7 (with Interleaved Thinking)
          - glm-4.6: ZhipuAI GLM 4.6
          - claude-3-opus-20240229: Anthropic Claude Opus
          - gpt-4-turbo: OpenAI GPT-4 Turbo
        slot_uri: schema:softwareVersion
        range: string
        required: true
        examples:
          - value: "glm-4.7"
            description: "ZhipuAI GLM 4.7 with Interleaved Thinking"
      provider:
        description: |
          The LLM provider/platform.
          PROV-O: prov:wasAssociatedWith - the agent (organization) providing the model.
          Used by DSPy to route requests and track provider-specific behavior.
        slot_uri: prov:wasAssociatedWith
        range: LLMProviderEnum
        required: true
        examples:
          - value: "zai"
            description: "ZhipuAI (Z.AI) - GLM models"
      request_id:
        description: |
          Unique request ID from the LLM provider API (for tracing/debugging).
          Enables correlation with provider logs for troubleshooting.
        slot_uri: dct:identifier
        range: string
        required: false
        examples:
          - value: "req_8f3a2b1c4d5e6f7g"
            description: "Provider-assigned request identifier"
      created:
        description: |
          Timestamp when the LLM response was generated (from API response).
          PROV-O: prov:endedAtTime - when the inference activity completed.
        slot_uri: prov:endedAtTime
        range: datetime
        required: true
        examples:
          - value: "2025-12-23T10:30:00Z"
            description: "UTC timestamp of response generation"
      prompt_tokens:
        description: |
          Number of tokens in the input prompt.
          From API response: usage.prompt_tokens
        slot_uri: schema:value
        range: integer
        minimum_value: 0
        examples:
          - value: 150
            description: "150 tokens in the input prompt"
      completion_tokens:
        description: |
          Number of tokens in the model's response (content + reasoning_content).
          From API response: usage.completion_tokens
          Note: For GLM 4.7, this includes tokens from both content and reasoning_content.
        slot_uri: schema:value
        range: integer
        minimum_value: 0
        examples:
          - value: 450
            description: "450 tokens in the completion (content + reasoning)"
      total_tokens:
        description: |
          Total tokens used (prompt + completion).
          From API response: usage.total_tokens
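
          A quick consistency check over the usage block; the field names
          assume an OpenAI-compatible usage object, as used throughout
          this schema:

          ```python
          # Sketch: verify total_tokens = prompt_tokens + completion_tokens.
          def check_usage(usage: dict) -> bool:
              return usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]

          ok = check_usage({"prompt_tokens": 150, "completion_tokens": 450, "total_tokens": 600})
          ```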
        slot_uri: schema:value
        range: integer
        minimum_value: 0
        examples:
          - value: 600
            description: "600 total tokens (150 prompt + 450 completion)"
      cached_tokens:
        description: |
          Number of prompt tokens served from cache (if provider supports caching).
          From API response: usage.prompt_tokens_details.cached_tokens
          Cached tokens typically have reduced cost and latency.
        slot_uri: schema:value
        range: integer
        minimum_value: 0
        required: false
        examples:
          - value: 50
            description: "50 tokens served from provider's prompt cache"
      finish_reason:
        description: |
          Why the model stopped generating (from API response).
          Common values:
          - stop: Natural completion (hit stop token)
          - length: Hit max_tokens limit
          - tool_calls: Model invoked a tool (function calling)
          - content_filter: Response filtered for safety
        slot_uri: schema:status
        range: FinishReasonEnum
        required: false
        examples:
          - value: "stop"
            description: "Model completed naturally"
      latency_ms:
        description: |
          Response latency in milliseconds (time from request to response).
          Measured client-side (includes network time).
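
          A minimal client-side measurement sketch; `call_llm` is a
          placeholder for the actual request function:

          ```python
          import time

          # Sketch: wrap any LLM call and record wall-clock latency in ms.
          def timed_call(call_llm, *args):
              start = time.perf_counter()
              result = call_llm(*args)
              latency_ms = int((time.perf_counter() - start) * 1000)
              return result, latency_ms

          result, latency_ms = timed_call(lambda prompt: "ok", "hello")
          ```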
        slot_uri: schema:duration
        range: integer
        minimum_value: 0
        required: false
        examples:
          - value: 1250
            description: "1.25 seconds total response time"
      cost_usd:
        description: |
          Estimated cost in USD for this LLM call.
          For Z.AI Coding Plan: $0.00 (free tier for GLM models)
          For other providers: calculated from token counts and pricing
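
          A sketch of the token-based estimate; the per-million-token
          prices below are placeholders, not real provider pricing:

          ```python
          # Sketch: cost = input tokens * input price + output tokens * output price,
          # with prices quoted per million tokens.
          def estimate_cost_usd(prompt_tokens: int, completion_tokens: int,
                                in_per_m: float, out_per_m: float) -> float:
              return (prompt_tokens * in_per_m + completion_tokens * out_per_m) / 1_000_000

          cost = estimate_cost_usd(150, 450, in_per_m=10.0, out_per_m=30.0)
          ```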
        slot_uri: schema:price
        range: float
        minimum_value: 0.0
        required: false
        examples:
          - value: 0.0
            description: "Free (Z.AI Coding Plan)"
          - value: 0.015
            description: "OpenAI GPT-4 Turbo cost estimate"
      thinking_mode:
        description: |
          The GLM 4.7 thinking mode used for this request.

          **Available Modes**:
          - **enabled**: Thinking enabled (default) - model reasons before responding
          - **disabled**: Thinking disabled - faster responses, no reasoning_content
          - **interleaved**: Interleaved thinking - think between tool calls (default behavior)
          - **preserved**: Preserved thinking - retain reasoning across turns (Coding Plan default)

          **Configuration**:
          - Interleaved: Default behavior, no config needed
          - Preserved: Set `"clear_thinking": false`
          - Turn-level: Set `"thinking": {"type": "enabled"}` or `"thinking": {"type": "disabled"}`
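
          Turn-level selection can be sketched as follows; the `thinking`
          field is documented above, while the routing heuristic is
          purely illustrative:

          ```python
          # Sketch: pick the per-turn thinking setting based on task complexity.
          def thinking_config(complex_task: bool) -> dict:
              return {"thinking": {"type": "enabled" if complex_task else "disabled"}}

          fast = thinking_config(complex_task=False)  # speed for simple queries
          deep = thinking_config(complex_task=True)   # accuracy for complex tasks
          ```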
        slot_uri: schema:actionOption
        range: ThinkingModeEnum
        required: false
        examples:
          - value: "preserved"
            description: "Preserved thinking for multi-turn agent conversations"
          - value: "interleaved"
            description: "Default interleaved thinking between tool calls"
          - value: "disabled"
            description: "Disabled for fast, simple queries"
      clear_thinking:
        description: |
          Whether to clear previous reasoning_content from context.

          **Preserved Thinking Control**:
          - **false**: Preserved Thinking enabled (keep reasoning, better cache hits)
          - **true**: Clear previous reasoning (default for standard API)

          **Z.AI Coding Plan**: Default is `false` (Preserved Thinking enabled)

          **Critical Implementation Note**:
          When clear_thinking is false, you MUST return the EXACT, UNMODIFIED
          reasoning_content back to the API in subsequent turns. Any modification
          (reordering, editing, truncating) will degrade performance and cache hits.
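
          A sketch of appending the assistant turn back into the message
          history with its reasoning_content untouched; the message shape
          assumes an OpenAI-compatible history:

          ```python
          # Sketch: re-attach reasoning_content verbatim when extending history.
          def append_assistant_turn(messages: list, msg: dict) -> list:
              turn = {"role": "assistant", "content": msg["content"]}
              if msg.get("reasoning_content") is not None:
                  # pass back verbatim - no reordering, editing, or truncating
                  turn["reasoning_content"] = msg["reasoning_content"]
              messages.append(turn)
              return messages

          history = append_assistant_turn(
              [{"role": "user", "content": "Describe the Rijksmuseum."}],
              {"content": "A museum in Amsterdam.", "reasoning_content": "Entity: Rijksmuseum..."},
          )
          ```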
        slot_uri: schema:Boolean
        range: boolean
        required: false
        examples:
          - value: false
            description: "Keep reasoning for Preserved Thinking (recommended)"
          - value: true
            description: "Clear previous reasoning (fresh context each turn)"
    comments:
      - "reasoning_content is the key field for Interleaved Thinking (GLM 4.7)"
      - "Store reasoning_content for debugging, auditing, and DSPy optimization"
      - "Z.AI Coding Plan endpoint: https://api.z.ai/api/coding/paas/v4/chat/completions"
      - "For DSPy: use LLMResponse to track all LLM calls in the pipeline"
      - "See AGENTS.md Rule 11 for Z.AI API configuration"
    see_also:
      - "https://www.w3.org/TR/prov-o/"
      - "https://api.z.ai/docs"
      - "https://dspy-docs.vercel.app/"
enums:
  LLMProviderEnum:
    description: |
      Enumeration of LLM providers/platforms supported by DSPy integration.
      Used for routing, cost tracking, and provider-specific behavior.
    permissible_values:
      zai:
        description: |
          ZhipuAI (Z.AI) - Chinese AI provider offering GLM models.
          Primary provider for this project via Z.AI Coding Plan.
          Endpoint: https://api.z.ai/api/coding/paas/v4/chat/completions
          Models: glm-4.5, glm-4.6, glm-4.7 (with Interleaved Thinking)
        meaning: schema:Organization
      anthropic:
        description: |
          Anthropic - Provider of Claude models.
          Models: claude-3-opus, claude-3-sonnet, claude-3-haiku
        meaning: schema:Organization
      openai:
        description: |
          OpenAI - Provider of GPT models.
          Models: gpt-4-turbo, gpt-4o, gpt-3.5-turbo
        meaning: schema:Organization
      huggingface:
        description: |
          HuggingFace - Open model hosting and inference.
          Models: Various open-source models via Inference API
        meaning: schema:Organization
      groq:
        description: |
          Groq - High-speed inference provider.
          Models: llama, mixtral, gemma via Groq hardware
        meaning: schema:Organization
      together:
        description: |
          Together AI - Open model inference platform.
          Models: Various open-source models
        meaning: schema:Organization
      local:
        description: |
          Local inference (Ollama, llama.cpp, vLLM).
          No external API calls; runs on local hardware.
        meaning: schema:SoftwareApplication
  FinishReasonEnum:
    description: |
      Reasons why the LLM stopped generating output.
      Standardized across providers.
    permissible_values:
      stop:
        description: "Natural completion - model hit a stop token or finished"
      length:
        description: "Hit max_tokens limit - response was truncated"
      tool_calls:
        description: "Model invoked a tool/function (function calling)"
      content_filter:
        description: "Response was filtered for safety/content policy"
      error:
        description: "Generation failed due to an error"
  ThinkingModeEnum:
    description: |
      GLM 4.7 thinking mode configuration.
      Controls how the model reasons during inference.
      **Reference**: https://docs.z.ai/guides/capabilities/thinking-mode

      GLM 4.7 introduces three distinct thinking modes that can be combined:
      1. Interleaved Thinking (between tool calls)
      2. Preserved Thinking (across conversation turns)
      3. Turn-level Thinking (enable/disable per request)
    permissible_values:
      enabled:
        description: |
          Thinking enabled (turn-level setting).
          Model reasons before responding, returns reasoning_content.
          Set via: `"thinking": {"type": "enabled"}`
        meaning: schema:ActivateAction
      disabled:
        description: |
          Thinking disabled (turn-level setting).
          Faster responses, no reasoning_content returned.
          Useful for simple queries where speed matters more than accuracy.
          Set via: `"thinking": {"type": "disabled"}`
        meaning: schema:DeactivateAction
      interleaved:
        description: |
          Interleaved thinking mode (default since GLM-4.5).
          Model thinks between tool calls and after receiving tool results.
          Enables complex, step-by-step reasoning with tool chaining.
          No special configuration needed - this is the default behavior.
        meaning: schema:Action
      preserved:
        description: |
          Preserved thinking mode (new in GLM-4.7).
          Retains reasoning_content from previous assistant turns in context.
          Improves model performance and increases cache hit rates.
          **Enabled by default on the Z.AI Coding Plan endpoint**.
          Set via: `"clear_thinking": false`
          CRITICAL: Must return the EXACT, UNMODIFIED reasoning_content back to the API.
        meaning: schema:Action
slots:
  content:
    description: "The final LLM response text"
    range: string
  reasoning_content:
    description: "Interleaved Thinking - chain-of-thought reasoning from GLM 4.7"
    range: string
  model:
    description: "LLM model identifier"
    range: string
  provider:
    description: "LLM provider/platform"
    range: LLMProviderEnum
  request_id:
    description: "Unique request ID from the LLM provider API"
    range: string
  created:
    description: "Timestamp when response was generated"
    range: datetime
  prompt_tokens:
    description: "Number of tokens in input prompt"
    range: integer
  completion_tokens:
    description: "Number of tokens in response"
    range: integer
  total_tokens:
    description: "Total tokens used"
    range: integer
  cached_tokens:
    description: "Number of tokens served from cache"
    range: integer
  finish_reason:
    description: "Why the model stopped generating"
    range: FinishReasonEnum
  latency_ms:
    description: "Response latency in milliseconds"
    range: integer
  cost_usd:
    description: "API cost in USD for this LLM call"
    range: float
  thinking_mode:
    description: "GLM 4.7 thinking mode configuration"
    range: ThinkingModeEnum
  clear_thinking:
    description: "Whether to clear previous reasoning from context (false = Preserved Thinking)"
    range: boolean