# glam/schemas/initial/sparql_query_log.yaml

# Schema-level metadata: identity, human-readable summary, licence, version.
id: 'https://w3id.org/heritage/custodian/sparql-query-log'
name: heritage-custodian-sparql-query-log
title: Heritage Custodian SPARQL Query Log Schema
version: '0.1.0'
license: 'https://creativecommons.org/publicdomain/zero/1.0/'
description: >-
  Schema for documenting and tracking SPARQL queries executed against
  Wikidata and other knowledge bases. Captures query text, execution
  metadata, results, and provenance for reproducibility and audit trail
  purposes.

# CURIE prefixes usable in the class_uri / slot_uri declarations of this schema.
prefixes:
  # LinkML metamodel and built-in types
  linkml: 'https://w3id.org/linkml/'
  # Namespace for terms defined by this schema
  heritage: 'https://w3id.org/heritage/custodian/'
  # External vocabularies
  prov: 'http://www.w3.org/ns/prov#'
  dcterms: 'http://purl.org/dc/terms/'
  schema: 'http://schema.org/'
  rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
  # NOTE(review): this IRI is the W3C SPARQL Service Description namespace,
  # conventionally abbreviated `sd:` — confirm the `sparql:` alias is intended.
  sparql: 'http://www.w3.org/ns/sparql-service-description#'

# Elements without an explicit URI fall under `heritage:`; slots that declare
# no range are treated as strings.
default_prefix: heritage
default_range: string

imports:
  - linkml:types

# =============================================================================
# CORE CLASSES
# =============================================================================

classes:
  # One logged execution of a SPARQL query, keyed by query_id.
  SPARQLQueryLog:
    # NOTE(review): the namespace bound to `sparql:` (SPARQL Service
    # Description) does not appear to define a `Query` class — confirm mapping.
    class_uri: sparql:Query
    description: >-
      A complete record of a SPARQL query execution, including the query
      text, execution parameters, results metadata, and provenance
      information. Used for documenting research queries, tracking data
      extraction workflows, and enabling reproducibility of results.
    # Class-specific constraints layered on top of the global slot definitions.
    slot_usage:
      query_id: {required: true, identifier: true}
      query_name: {required: true}
      query_date: {required: true}
      sparql_endpoint: {required: true}
      query_text: {required: true}
      execution_status: {required: true}
    slots:
      # identification and provenance
      - query_id
      - query_name
      - query_purpose
      - query_date
      - executed_by
      # query definition
      - sparql_endpoint
      - query_text
      - query_method
      - query_timeout
      # execution outcome
      - execution_status
      - execution_time_seconds
      - error_message
      # results
      - result_file_path
      - result_count
      - result_statistics
      - unique_entity_count
      # exclusions
      - exclusion_count
      - exclusion_source
      # cross-references and annotations
      - related_queries
      - tags
      - notes

  # Side-by-side record of a baseline query and its revised version.
  QueryComparison:
    description: >-
      Comparative analysis between two related SPARQL queries.
      Documents differences in query logic, results, and performance
      to track refinements and corrections over time.
    # Only the identifier and the two compared queries are mandatory.
    slot_usage:
      comparison_id: {required: true, identifier: true}
      baseline_query: {required: true}
      updated_query: {required: true}
    slots:
      # identification
      - comparison_id
      - comparison_date
      # the two queries being compared
      - baseline_query
      - updated_query
      # findings
      - changes_description
      - baseline_result_count
      - updated_result_count
      - result_difference
      - performance_impact
      - recommendation

  # Aggregate figures describing one result set; used as the range of the
  # result_statistics slot.
  ResultStatistics:
    description: >-
      Statistical summary of SPARQL query results, including entity
      counts, language coverage, duplicates, and data quality metrics.
    # total_bindings is the only figure that must always be present.
    slot_usage:
      total_bindings: {required: true}
    slots:
      - total_bindings
      - unique_entities
      - entities_with_english_labels
      - entities_without_english_labels
      - duplicate_count
      - language_distribution
      - excluded_entities
      - file_size_mb

# =============================================================================
# SLOTS
# =============================================================================

slots:
  # SPARQLQueryLog fields
  # Identifier slot of SPARQLQueryLog; validated by the regex in `pattern`.
  query_id:
    description: >-
      Unique identifier for this query execution. Format: lowercase class
      prefix + descriptive name + timestamp (e.g., "archive_query_corrected_20251113")
    range: string
    identifier: true
    slot_uri: dcterms:identifier
    # The regex is looser than the two documented timestamp lengths; the
    # comments below spell out what is actually enforced.
    pattern: '^[a-z_]+_[0-9]{8,14}$'
    comments:
      - "Format: {class}_{description}_{YYYYMMDD} or {class}_{description}_{YYYYMMDDHHmmss}"
      - "Example: archive_query_corrected_20251113"
      - "The pattern permits only lowercase letters and underscores before the timestamp"
      - "The pattern accepts any run of 8 to 14 trailing digits, not only the two documented lengths"

  # Short display title of the logged query (mandatory).
  query_name:
    slot_uri: dcterms:title
    range: string
    required: true
    description: >-
      Human-readable name for this query.
      Should describe what the query does in a concise way.
    comments:
      - "Example: 'Archive Hyponym Extraction with 318 Exclusions'"

  # Why the query was run, as opposed to what it does.
  query_purpose:
    slot_uri: dcterms:description
    range: string
    description: >-
      Detailed explanation of why this query was executed and what
      research question it addresses. Include context about the data
      extraction workflow.
    comments:
      - 'Document the research goal, not just the technical implementation'

  # Execution timestamp (mandatory).
  query_date:
    slot_uri: prov:atTime
    range: datetime
    required: true
    description: >-
      Timestamp when the query was executed
      (ISO 8601 format with timezone).
    comments:
      - 'Use UTC timezone or explicit timezone offset'
      - 'Format: YYYY-MM-DDTHH:mm:ss+00:00'

  # Free-text name of whoever or whatever ran the query.
  executed_by:
    slot_uri: prov:wasAssociatedWith
    range: string
    description: Person, script, or system that executed the query.
    comments:
      - "Example: 'Scott Kemper', 'execute_archive_query_corrected.py', 'OpenCODE agent'"

  # Target service of the query (mandatory, URI-typed).
  sparql_endpoint:
    slot_uri: sparql:endpoint
    range: uri
    required: true
    description: URL of the SPARQL endpoint queried.
    comments:
      - 'Example: https://query.wikidata.org/sparql'
      - 'Example: https://dbpedia.org/sparql'

  # Verbatim SPARQL source of the executed query (mandatory).
  query_text:
    # NOTE(review): the SPARQL Service Description vocabulary bound to
    # `sparql:` does not appear to define a `text` property — confirm mapping.
    slot_uri: sparql:text
    range: string
    required: true
    description: >-
      Complete SPARQL query text including all prefixes,
      SELECT/CONSTRUCT, WHERE clause, filters, and LIMIT/OFFSET.
      Should be reproducible.
    comments:
      - 'Include all comments from original query'
      - 'Preserve indentation for readability'
      - 'Should be copy-paste executable'

  # HTTP verb used to submit the query; values come from QueryMethodEnum.
  query_method:
    slot_uri: schema:httpMethod
    range: QueryMethodEnum
    description: HTTP method used for query execution (GET or POST).
    comments:
      - 'GET for small queries (<6KB typical limit)'
      - 'POST for large queries with many exclusions'

  # Optional, non-negative timeout in whole seconds.
  query_timeout:
    range: integer
    minimum_value: 0
    description: Timeout value in seconds (if specified).
    comments:
      - 'Null if no timeout specified'
      - 'Wikidata default: 60 seconds'

  # Outcome of the run (mandatory); values come from ExecutionStatusEnum.
  execution_status:
    description: >-
      Status of query execution.
    range: ExecutionStatusEnum
    required: true
    # NOTE(review): PROV-O does not appear to define `statusInfo` — confirm
    # this predicate or replace it with a term that exists.
    slot_uri: prov:statusInfo
    # One entry per permissible value of ExecutionStatusEnum.
    comments:
      - "SUCCESS: Query completed without errors"
      - "FAILED: Query failed with error"
      - "TIMEOUT: Query exceeded time limit"
      - "CANCELLED: Query was manually cancelled"
      - "PARTIAL: Query returned partial results due to limit"

  # Non-negative elapsed time of the run, as a float number of seconds.
  execution_time_seconds:
    slot_uri: schema:duration
    range: float
    minimum_value: 0.0
    description: >-
      Time taken to execute query in seconds
      (wall clock time).
    comments:
      - 'Measured from query submission to result retrieval'

  # Optional failure text accompanying a non-successful execution_status.
  error_message:
    slot_uri: schema:error
    range: string
    description: Error message if query failed.
    comments:
      - 'Null if execution_status is SUCCESS'
      - 'Include full error text for debugging'

  # Location of the saved result file, stored as a plain string.
  result_file_path:
    slot_uri: schema:contentUrl
    range: string
    description: >-
      Absolute path to the file where query results
      were saved.
    comments:
      - 'Use absolute path for reproducibility'
      - 'Typical format: JSON for structured results'

  # Raw row count of the result set; see unique_entity_count for the
  # deduplicated figure.
  result_count:
    range: integer
    minimum_value: 0
    description: >-
      Total number of result bindings (rows) returned by the query. May
      include multiple language variants of the same entity.
    comments:
      - 'Includes all language variants'
      - 'For unique entity count, see unique_entity_count'

  # Nested ResultStatistics object, embedded inline in the log record.
  result_statistics:
    slot_uri: schema:summary
    range: ResultStatistics
    inlined: true
    description: Detailed statistical analysis of query results.

  # Deduplicated counterpart of result_count.
  unique_entity_count:
    range: integer
    minimum_value: 0
    description: >-
      Number of unique entities (Q-IDs) in results,
      excluding language variants.
    comments:
      - 'Deduplicated count of distinct Q-IDs'
      - 'More meaningful than result_count for multi-language queries'

  # Size of the exclusion list applied by the query.
  exclusion_count:
    range: integer
    minimum_value: 0
    description: >-
      Number of entities explicitly excluded via
      FILTER NOT IN clauses.
    comments:
      - 'Count of Q-IDs in exclusion list'
      - 'Reference exclusion_source for provenance'

  # Where the exclusion list counted by exclusion_count came from.
  exclusion_source:
    range: string
    description: >-
      Path to file containing excluded entity IDs
      (e.g., curated vocabulary file).
    comments:
      - 'Typically a YAML or JSON file with Q-numbers'
      - 'Example: data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml'

  # Multivalued list of query IDs, held as plain strings.
  related_queries:
    range: string
    multivalued: true
    description: >-
      List of related query IDs
      (e.g., previous version, comparison baseline).
    comments:
      - 'Use for tracking query refinement chains'
      - "Example: ['archive_query_raw_20251112']"

  # Multivalued free-form keywords.
  tags:
    slot_uri: schema:keywords
    range: string
    multivalued: true
    description: Keywords for categorizing and searching queries.
    comments:
      - "Example: ['archive', 'GLAMORCUBEPSXHFN', 'hyponym_extraction', 'corrected']"

  # Free-form remarks attached to a logged execution.
  notes:
    description: >-
      Free-text notes about this query execution, including observations,
      issues encountered, or follow-up actions needed.
    range: string
    # No slot_uri here: query_purpose already maps to dcterms:description and
    # both slots belong to SPARQLQueryLog, so sharing that predicate would make
    # the two values indistinguishable in RDF. The slot therefore falls back to
    # the schema's default prefix; the looser relationship is kept as a mapping.
    close_mappings:
      - dcterms:description

  # QueryComparison fields
  # Identifier slot of QueryComparison; validated by the regex in `pattern`.
  comparison_id:
    range: string
    identifier: true
    pattern: '^[a-z_]+_vs_[a-z_]+_[0-9]{8}$'
    description: 'Unique identifier for this comparison. Format: baseline_vs_updated_{date}'
    comments:
      - 'Example: archive_raw_vs_corrected_20251113'

  # Calendar date of the comparison (date, not datetime).
  comparison_date:
    slot_uri: dcterms:created
    range: date
    description: Date when the comparison was performed.

  # The "before" side of a comparison; points at a SPARQLQueryLog (mandatory).
  baseline_query:
    range: SPARQLQueryLog
    required: true
    description: Reference to the baseline query ID (original query).

  # The "after" side of a comparison; points at a SPARQLQueryLog (mandatory).
  updated_query:
    range: SPARQLQueryLog
    required: true
    description: Reference to the updated query ID (corrected/refined query).

  # Prose summary of what differs between the two compared queries.
  changes_description:
    slot_uri: dcterms:description
    range: string
    description: >-
      Summary of changes made between baseline
      and updated query.
    comments:
      - 'Document query logic changes, exclusion list updates, etc.'

  # Non-negative entity count for the baseline side of a comparison.
  baseline_result_count:
    range: integer
    minimum_value: 0
    description: Number of unique entities in baseline query results.

  # Non-negative entity count for the updated side of a comparison.
  updated_result_count:
    range: integer
    minimum_value: 0
    description: Number of unique entities in updated query results.

  # Signed delta between the two result counts; no minimum because it may be
  # negative.
  result_difference:
    range: integer
    description: >-
      Difference in result counts (updated - baseline).
      Positive means more results, negative means fewer results.
    comments:
      - 'Calculated as updated_result_count - baseline_result_count'

  # Signed runtime delta between the two compared queries; no minimum because
  # a faster updated query yields a negative value.
  performance_impact:
    description: >-
      Change in query execution time (seconds). Negative is faster.
    range: float
    comments:
      # Names the real slot (execution_time_seconds) on each referenced query.
      - "Calculated as the updated query's execution_time_seconds minus the baseline query's execution_time_seconds"

  # Free-text conclusion of a comparison.
  recommendation:
    range: string
    description: >-
      Recommendation on which query version to use
      going forward.
    comments:
      - "Example: 'Use updated query - better precision and performance'"

  # ResultStatistics fields
  # Mandatory, non-negative binding count for a ResultStatistics record.
  total_bindings:
    range: integer
    minimum_value: 0
    required: true
    description: >-
      Total number of result bindings including
      language variants.

  # Non-negative deduplicated entity count.
  unique_entities:
    range: integer
    minimum_value: 0
    description: Count of unique Q-IDs (deduplicated).

  # Non-negative count of entities carrying an `en` label.
  entities_with_english_labels:
    range: integer
    minimum_value: 0
    description: Number of entities that have English (en) labels.
    comments:
      - 'Useful for assessing multilingual coverage'

  # Non-negative count of entities that have no English label.
  entities_without_english_labels:
    range: integer
    minimum_value: 0
    description: Number of entities lacking English labels.
    comments:
      - 'These may need label enrichment'

  # Non-negative count of repeated Q-IDs in the result set.
  duplicate_count:
    range: integer
    minimum_value: 0
    description: >-
      Number of Q-IDs appearing multiple times in results
      (should be 0 with DISTINCT).
    comments:
      - 'Non-zero indicates query needs DISTINCT clause'

  # Per-language entity counts, serialized into a single string value.
  language_distribution:
    description: >-
      JSON object mapping language codes to entity counts.
    range: string
    comments:
      # JSON requires double-quoted keys, so the example uses them.
      - 'Example: {"en": 61, "es": 45, "fr": 38, "de": 32}'
      - "Store as JSON string for flexibility"

  # Non-negative count of entities filtered out of the result set.
  excluded_entities:
    range: integer
    minimum_value: 0
    description: Number of entities excluded via FILTER clauses.

  # Non-negative size of the result file, as a float number of megabytes.
  file_size_mb:
    range: float
    minimum_value: 0.0
    description: Size of result file in megabytes.
    comments:
      - 'Useful for tracking data volume'

# =============================================================================
# ENUMERATIONS
# =============================================================================

enums:
  # Closed value set for the query_method slot.
  QueryMethodEnum:
    description: 'HTTP method for SPARQL query submission'
    permissible_values:
      # Query sent with an HTTP GET request
      GET:
        description: 'HTTP GET request (typical for small queries)'
      # Query sent with an HTTP POST request
      POST:
        description: 'HTTP POST request (required for large queries >6KB)'

  # Closed value set for the execution_status slot.
  ExecutionStatusEnum:
    description: 'Status of query execution'
    permissible_values:
      # Completed normally
      SUCCESS:
        description: 'Query executed successfully and returned results'
      # Ended without a complete result set
      FAILED:
        description: 'Query failed with error'
      TIMEOUT:
        description: 'Query exceeded timeout limit'
      CANCELLED:
        description: 'Query was manually cancelled'
      PARTIAL:
        description: 'Query returned partial results due to limit'