# glam/schemas/initial/sparql_query_log.yaml
# 2025-11-21 22:12:33 +01:00
# 474 lines, 13 KiB, YAML

# Schema identity and human-readable summary.
id: 'https://w3id.org/heritage/custodian/sparql-query-log'
name: heritage-custodian-sparql-query-log
title: Heritage Custodian SPARQL Query Log Schema
description: >-
  Schema for documenting and tracking SPARQL queries executed against
  Wikidata and other knowledge bases. Captures query text, execution
  metadata, results, and provenance for reproducibility and audit trail
  purposes.
license: 'https://creativecommons.org/publicdomain/zero/1.0/'
version: '0.1.0'

# CURIE prefix -> namespace expansions used by class_uri / slot_uri values.
# Quoted because several namespaces end in '#', which is easy to misread as
# the start of a comment.
prefixes:
  linkml: 'https://w3id.org/linkml/'
  heritage: 'https://w3id.org/heritage/custodian/'
  prov: 'http://www.w3.org/ns/prov#'
  dcterms: 'http://purl.org/dc/terms/'
  schema: 'http://schema.org/'
  rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
  # Note: `sparql` is bound to the SPARQL Service Description namespace.
  sparql: 'http://www.w3.org/ns/sparql-service-description#'

# Defaults applied where an element does not state its own prefix / range.
default_prefix: heritage
default_range: string

imports:
  - linkml:types
# =============================================================================
# CORE CLASSES
# =============================================================================
classes:
  # One logged execution of a SPARQL query: the query itself, how it was
  # run, what came back, and where the results were stored.
  # NOTE(review): class_uri uses the `sparql:` prefix, which this schema
  # binds to the SPARQL Service Description namespace; confirm that
  # vocabulary actually defines a `Query` term.
  SPARQLQueryLog:
    description: >-
      A complete record of a SPARQL query execution, including the query
      text, execution parameters, results metadata, and provenance
      information. Used for documenting research queries, tracking data
      extraction workflows, and enabling reproducibility of results.
    class_uri: sparql:Query
    slots:
      # Identification and intent
      - query_id
      - query_name
      - query_purpose
      - query_date
      - executed_by
      # What was run, and how
      - sparql_endpoint
      - query_text
      - query_method
      - query_timeout
      # Outcome of the run
      - execution_status
      - execution_time_seconds
      - error_message
      # Results and their summary figures
      - result_file_path
      - result_count
      - result_statistics
      - unique_entity_count
      # Exclusion list bookkeeping
      - exclusion_count
      - exclusion_source
      # Cross-references and free-form annotation
      - related_queries
      - tags
      - notes
    slot_usage:
      # query_id is both mandatory and the identifier of each log entry.
      query_id:
        required: true
        identifier: true
      # The remaining entries only mark the slot as mandatory.
      query_name:
        required: true
      query_date:
        required: true
      sparql_endpoint:
        required: true
      query_text:
        required: true
      execution_status:
        required: true

  # Side-by-side record of a baseline query and its revised version.
  QueryComparison:
    description: >-
      Comparative analysis between two related SPARQL queries. Documents
      differences in query logic, results, and performance to track
      refinements and corrections over time.
    slots:
      - comparison_id
      - comparison_date
      - baseline_query
      - updated_query
      - changes_description
      - baseline_result_count
      - updated_result_count
      - result_difference
      - performance_impact
      - recommendation
    slot_usage:
      # comparison_id is both mandatory and the identifier of the record.
      comparison_id:
        required: true
        identifier: true
      # Both sides of the comparison must be supplied.
      baseline_query:
        required: true
      updated_query:
        required: true

  # Summary figures for one result set; used as the range of the
  # result_statistics slot.
  ResultStatistics:
    description: >-
      Statistical summary of SPARQL query results, including entity
      counts, language coverage, duplicates, and data quality metrics.
    slots:
      - total_bindings
      - unique_entities
      - entities_with_english_labels
      - entities_without_english_labels
      - duplicate_count
      - language_distribution
      - excluded_entities
      - file_size_mb
    slot_usage:
      # Only the overall binding count is mandatory.
      total_bindings:
        required: true
# =============================================================================
# SLOTS
# =============================================================================
# Slot definitions shared by the classes of this schema. Each slot states its
# range, optional constraints (required / pattern / minimum_value), and, where
# given, the vocabulary term it maps to via slot_uri.
slots:
  # Core identification
  query_id:
    description: >-
      Unique identifier for this query execution. Format: lowercase class
      prefix + descriptive name + timestamp (e.g., "archive_query_corrected_20251113")
    range: string
    identifier: true
    slot_uri: dcterms:identifier
    # The suffix is exactly 8 digits (YYYYMMDD) or 14 digits
    # (YYYYMMDDHHmmss), matching the two documented formats below. A bare
    # {8,14} quantifier would also accept 9-13 digit suffixes, which
    # correspond to neither format.
    pattern: '^[a-z_]+_[0-9]{8}([0-9]{6})?$'
    comments:
      - "Format: {class}_{description}_{YYYYMMDD} or {class}_{description}_{YYYYMMDDHHmmss}"
      - "Example: archive_query_corrected_20251113"
  query_name:
    description: >-
      Human-readable name for this query. Should describe what the query does
      in a concise way.
    range: string
    required: true
    slot_uri: dcterms:title
    comments:
      - "Example: 'Archive Hyponym Extraction with 318 Exclusions'"
  query_purpose:
    description: >-
      Detailed explanation of why this query was executed and what research
      question it addresses. Include context about the data extraction workflow.
    range: string
    slot_uri: dcterms:description
    comments:
      - "Document the research goal, not just the technical implementation"
  query_date:
    description: >-
      Timestamp when the query was executed (ISO 8601 format with timezone).
    range: datetime
    required: true
    slot_uri: prov:atTime
    comments:
      - "Use UTC timezone or explicit timezone offset"
      - "Format: YYYY-MM-DDTHH:mm:ss+00:00"
  executed_by:
    description: >-
      Person, script, or system that executed the query.
    range: string
    slot_uri: prov:wasAssociatedWith
    comments:
      - "Example: 'Scott Kemper', 'execute_archive_query_corrected.py', 'OpenCODE agent'"
  sparql_endpoint:
    description: >-
      URL of the SPARQL endpoint queried.
    range: uri
    required: true
    slot_uri: sparql:endpoint
    comments:
      - "Example: https://query.wikidata.org/sparql"
      - "Example: https://dbpedia.org/sparql"
  query_text:
    description: >-
      Complete SPARQL query text including all prefixes, SELECT/CONSTRUCT,
      WHERE clause, filters, and LIMIT/OFFSET. Should be reproducible.
    range: string
    required: true
    # NOTE(review): `sparql:` expands to the SPARQL Service Description
    # namespace; confirm that vocabulary defines a `text` property.
    slot_uri: sparql:text
    comments:
      - "Include all comments from original query"
      - "Preserve indentation for readability"
      - "Should be copy-paste executable"
  query_method:
    description: >-
      HTTP method used for query execution (GET or POST).
    range: QueryMethodEnum
    slot_uri: schema:httpMethod
    comments:
      - "GET for small queries (<6KB typical limit)"
      - "POST for large queries with many exclusions"
  query_timeout:
    description: >-
      Timeout value in seconds (if specified).
    range: integer
    minimum_value: 0
    comments:
      - "Null if no timeout specified"
      - "Wikidata default: 60 seconds"
  execution_status:
    description: >-
      Status of query execution.
    range: ExecutionStatusEnum
    required: true
    # NOTE(review): `statusInfo` does not look like a PROV-O term; verify
    # this mapping against the PROV-O specification.
    slot_uri: prov:statusInfo
    # One entry per ExecutionStatusEnum permissible value.
    comments:
      - "SUCCESS: Query completed without errors"
      - "FAILED: Query failed with error"
      - "TIMEOUT: Query exceeded time limit"
      - "CANCELLED: Query was manually cancelled"
      - "PARTIAL: Query returned partial results due to limit"
  execution_time_seconds:
    description: >-
      Time taken to execute query in seconds (wall clock time).
    range: float
    minimum_value: 0.0
    slot_uri: schema:duration
    comments:
      - "Measured from query submission to result retrieval"
  error_message:
    description: >-
      Error message if query failed.
    range: string
    slot_uri: schema:error
    comments:
      - "Null if execution_status is SUCCESS"
      - "Include full error text for debugging"
  result_file_path:
    description: >-
      Absolute path to the file where query results were saved.
    range: string
    slot_uri: schema:contentUrl
    comments:
      - "Use absolute path for reproducibility"
      - "Typical format: JSON for structured results"
  result_count:
    description: >-
      Total number of result bindings (rows) returned by the query.
      May include multiple language variants of the same entity.
    range: integer
    minimum_value: 0
    comments:
      - "Includes all language variants"
      - "For unique entity count, see unique_entity_count"
  result_statistics:
    description: >-
      Detailed statistical analysis of query results.
    range: ResultStatistics
    inlined: true
    slot_uri: schema:summary
  unique_entity_count:
    description: >-
      Number of unique entities (Q-IDs) in results, excluding language variants.
    range: integer
    minimum_value: 0
    comments:
      - "Deduplicated count of distinct Q-IDs"
      - "More meaningful than result_count for multi-language queries"
  exclusion_count:
    description: >-
      Number of entities explicitly excluded via FILTER NOT IN clauses.
    range: integer
    minimum_value: 0
    comments:
      - "Count of Q-IDs in exclusion list"
      - "Reference exclusion_source for provenance"
  exclusion_source:
    description: >-
      Path to file containing excluded entity IDs (e.g., curated vocabulary file).
    range: string
    comments:
      - "Typically a YAML or JSON file with Q-numbers"
      - "Example: data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"
  related_queries:
    description: >-
      List of related query IDs (e.g., previous version, comparison baseline).
    range: string
    multivalued: true
    comments:
      - "Use for tracking query refinement chains"
      - "Example: ['archive_query_raw_20251112']"
  tags:
    description: >-
      Keywords for categorizing and searching queries.
    range: string
    multivalued: true
    slot_uri: schema:keywords
    comments:
      - "Example: ['archive', 'GLAMORCUBEPSXHFN', 'hyponym_extraction', 'corrected']"
  notes:
    description: >-
      Free-text notes about this query execution, including observations,
      issues encountered, or follow-up actions needed.
    range: string
    # NOTE(review): query_purpose maps to this same URI and both slots belong
    # to SPARQLQueryLog, so the two values are likely indistinguishable once
    # serialized to RDF. Consider giving one of them a distinct slot_uri.
    slot_uri: dcterms:description

  # QueryComparison fields
  comparison_id:
    description: >-
      Unique identifier for this comparison. Format: baseline_vs_updated_{date}
    range: string
    identifier: true
    pattern: '^[a-z_]+_vs_[a-z_]+_[0-9]{8}$'
    comments:
      - "Example: archive_raw_vs_corrected_20251113"
  comparison_date:
    description: >-
      Date when the comparison was performed.
    range: date
    slot_uri: dcterms:created
  baseline_query:
    description: >-
      Reference to the baseline query ID (original query).
    range: SPARQLQueryLog
    required: true
  updated_query:
    description: >-
      Reference to the updated query ID (corrected/refined query).
    range: SPARQLQueryLog
    required: true
  changes_description:
    description: >-
      Summary of changes made between baseline and updated query.
    range: string
    slot_uri: dcterms:description
    comments:
      - "Document query logic changes, exclusion list updates, etc."
  baseline_result_count:
    description: >-
      Number of unique entities in baseline query results.
    range: integer
    minimum_value: 0
  updated_result_count:
    description: >-
      Number of unique entities in updated query results.
    range: integer
    minimum_value: 0
  result_difference:
    description: >-
      Difference in result counts (updated - baseline). Positive means more
      results, negative means fewer results.
    range: integer
    # Deliberately has no minimum_value: the difference may be negative.
    comments:
      - "Calculated as updated_result_count - baseline_result_count"
  performance_impact:
    description: >-
      Change in query execution time (seconds). Negative is faster.
    range: float
    # Deliberately has no minimum_value: a speed-up is a negative number.
    comments:
      - "Calculated as updated_execution_time - baseline_execution_time"
  recommendation:
    description: >-
      Recommendation on which query version to use going forward.
    range: string
    comments:
      - "Example: 'Use updated query - better precision and performance'"

  # ResultStatistics fields
  total_bindings:
    description: >-
      Total number of result bindings including language variants.
    range: integer
    minimum_value: 0
    required: true
  unique_entities:
    description: >-
      Count of unique Q-IDs (deduplicated).
    range: integer
    minimum_value: 0
  entities_with_english_labels:
    description: >-
      Number of entities that have English (en) labels.
    range: integer
    minimum_value: 0
    comments:
      - "Useful for assessing multilingual coverage"
  entities_without_english_labels:
    description: >-
      Number of entities lacking English labels.
    range: integer
    minimum_value: 0
    comments:
      - "These may need label enrichment"
  duplicate_count:
    description: >-
      Number of Q-IDs appearing multiple times in results (should be 0 with DISTINCT).
    range: integer
    minimum_value: 0
    comments:
      - "Non-zero indicates query needs DISTINCT clause"
  language_distribution:
    description: >-
      JSON object mapping language codes to entity counts.
    range: string
    comments:
      # The example uses double-quoted keys so that it is itself valid JSON.
      - 'Example: {"en": 61, "es": 45, "fr": 38, "de": 32}'
      - "Store as JSON string for flexibility"
  excluded_entities:
    description: >-
      Number of entities excluded via FILTER clauses.
    range: integer
    minimum_value: 0
  file_size_mb:
    description: >-
      Size of result file in megabytes.
    range: float
    minimum_value: 0.0
    comments:
      - "Useful for tracking data volume"
# =============================================================================
# ENUMERATIONS
# =============================================================================
enums:
  # Range of the query_method slot.
  QueryMethodEnum:
    description: 'HTTP method for SPARQL query submission'
    permissible_values:
      GET:
        description: 'HTTP GET request (typical for small queries)'
      POST:
        description: 'HTTP POST request (required for large queries >6KB)'

  # Range of the execution_status slot.
  ExecutionStatusEnum:
    description: 'Status of query execution'
    permissible_values:
      SUCCESS:
        description: 'Query executed successfully and returned results'
      FAILED:
        description: 'Query failed with error'
      TIMEOUT:
        description: 'Query exceeded timeout limit'
      CANCELLED:
        description: 'Query was manually cancelled'
      PARTIAL:
        description: 'Query returned partial results due to limit'