# LinkML schema definition (YAML).
---
# Schema identity and descriptive metadata.
id: https://w3id.org/heritage/custodian/sparql-query-log
name: heritage-custodian-sparql-query-log
title: Heritage Custodian SPARQL Query Log Schema
description: >-
  Schema for documenting and tracking SPARQL queries executed against Wikidata
  and other knowledge bases. Captures query text, execution metadata, results,
  and provenance for reproducibility and audit trail purposes.

license: https://creativecommons.org/publicdomain/zero/1.0/
# Quoted so the version always loads as a string, never as a number.
version: '0.1.0'

# CURIE prefix expansions referenced by the class_uri / slot_uri values and
# the imports of this schema.
prefixes:
  linkml: https://w3id.org/linkml/
  heritage: https://w3id.org/heritage/custodian/
  prov: http://www.w3.org/ns/prov#
  dcterms: http://purl.org/dc/terms/
  schema: http://schema.org/
  rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
  sparql: http://www.w3.org/ns/sparql-service-description#

# Schema-wide defaults: elements without an explicit URI fall under the
# `heritage` prefix, and slots without an explicit range are strings.
default_prefix: heritage
default_range: string

# Built-in LinkML types (string, integer, float, date, datetime, uri, ...).
imports:
  - linkml:types

# =============================================================================
# CORE CLASSES
# =============================================================================

classes:
  SPARQLQueryLog:
    description: >-
      A complete record of a SPARQL query execution, including the query text,
      execution parameters, results metadata, and provenance information.
      Used for documenting research queries, tracking data extraction workflows,
      and enabling reproducibility of results.
    class_uri: sparql:Query
    # Every name listed here must be defined under the top-level `slots:` key.
    slots:
      - query_id
      - query_name
      - query_purpose
      - query_date
      - executed_by
      - sparql_endpoint
      - query_text
      - query_method
      - query_timeout
      - execution_status
      - execution_time_seconds
      - error_message
      - result_file_path
      - result_count
      - result_statistics
      - unique_entity_count
      - exclusion_count
      - exclusion_source
      - related_queries
      - tags
      - notes
    # Class-level constraints: the fields a log entry cannot omit.
    slot_usage:
      query_id:
        required: true
        identifier: true
      query_name:
        required: true
      query_date:
        required: true
      sparql_endpoint:
        required: true
      query_text:
        required: true
      execution_status:
        required: true

# --- QueryComparison: pairs a baseline query log with its refinement ---
  QueryComparison:
    description: >-
      Comparative analysis between two related SPARQL queries. Documents
      differences in query logic, results, and performance to track
      refinements and corrections over time.
    slots:
      - comparison_id
      - comparison_date
      - baseline_query
      - updated_query
      - changes_description
      - baseline_result_count
      - updated_result_count
      - result_difference
      - performance_impact
      - recommendation
    # A comparison needs its own identifier and both queries being compared.
    slot_usage:
      comparison_id:
        required: true
        identifier: true
      baseline_query:
        required: true
      updated_query:
        required: true

# --- ResultStatistics: summary used as the range of result_statistics ---
  ResultStatistics:
    description: >-
      Statistical summary of SPARQL query results, including entity counts,
      language coverage, duplicates, and data quality metrics.
    slots:
      - total_bindings
      - unique_entities
      - entities_with_english_labels
      - entities_without_english_labels
      - duplicate_count
      - language_distribution
      - excluded_entities
      - file_size_mb
    # Only the raw binding count is mandatory; the other metrics are optional.
    slot_usage:
      total_bindings:
        required: true

# =============================================================================
# SLOTS
# =============================================================================

slots:
  # Core identification
  query_id:
    description: >-
      Unique identifier for this query execution. Format: lowercase class
      prefix + descriptive name + timestamp (e.g., "archive_query_corrected_20251113")
    range: string
    identifier: true
    slot_uri: dcterms:identifier
    # NOTE(review): {8,14} also admits 9-13 digit suffixes, while the formats
    # documented below are exactly 8 or 14 digits — confirm before tightening.
    pattern: '^[a-z_]+_[0-9]{8,14}$'
    comments:
      - "Format: {class}_{description}_{YYYYMMDD} or {class}_{description}_{YYYYMMDDHHmmss}"
      - "Example: archive_query_corrected_20251113"

  query_name:
    description: >-
      Human-readable name for this query. Should describe what the query does
      in a concise way.
    range: string
    required: true
    slot_uri: dcterms:title
    comments:
      - "Example: 'Archive Hyponym Extraction with 318 Exclusions'"

  query_purpose:
    description: >-
      Detailed explanation of why this query was executed and what research
      question it addresses. Include context about the data extraction workflow.
    range: string
    slot_uri: dcterms:description
    comments:
      - "Document the research goal, not just the technical implementation"

  query_date:
    description: >-
      Timestamp when the query was executed (ISO 8601 format with timezone).
    range: datetime
    required: true
    slot_uri: prov:atTime
    comments:
      - "Use UTC timezone or explicit timezone offset"
      - "Format: YYYY-MM-DDTHH:mm:ss+00:00"

  executed_by:
    description: >-
      Person, script, or system that executed the query.
    range: string
    slot_uri: prov:wasAssociatedWith
    comments:
      - "Example: 'Scott Kemper', 'execute_archive_query_corrected.py', 'OpenCODE agent'"

# --- Endpoint and query definition slots ---
  sparql_endpoint:
    description: >-
      URL of the SPARQL endpoint queried.
    range: uri
    required: true
    slot_uri: sparql:endpoint
    comments:
      - "Example: https://query.wikidata.org/sparql"
      - "Example: https://dbpedia.org/sparql"

  query_text:
    description: >-
      Complete SPARQL query text including all prefixes, SELECT/CONSTRUCT,
      WHERE clause, filters, and LIMIT/OFFSET. Should be reproducible.
    range: string
    required: true
    slot_uri: sparql:text
    comments:
      - "Include all comments from original query"
      - "Preserve indentation for readability"
      - "Should be copy-paste executable"

  query_method:
    description: >-
      HTTP method used for query execution (GET or POST).
    # Permissible values are declared in QueryMethodEnum under `enums:`.
    range: QueryMethodEnum
    slot_uri: schema:httpMethod
    comments:
      - "GET for small queries (<6KB typical limit)"
      - "POST for large queries with many exclusions"

  query_timeout:
    description: >-
      Timeout value in seconds (if specified).
    range: integer
    minimum_value: 0
    comments:
      - "Null if no timeout specified"
      - "Wikidata default: 60 seconds"

# --- Execution outcome slots ---
  execution_status:
    description: >-
      Status of query execution.
    # Permissible values are declared in ExecutionStatusEnum under `enums:`;
    # the comments below list every value of that enum.
    range: ExecutionStatusEnum
    required: true
    slot_uri: prov:statusInfo
    comments:
      - "SUCCESS: Query completed without errors"
      - "FAILED: Query failed with error"
      - "TIMEOUT: Query exceeded time limit"
      - "CANCELLED: Query was manually cancelled"
      - "PARTIAL: Query returned partial results due to limit"

  execution_time_seconds:
    description: >-
      Time taken to execute query in seconds (wall clock time).
    range: float
    minimum_value: 0.0
    slot_uri: schema:duration
    comments:
      - "Measured from query submission to result retrieval"

  error_message:
    description: >-
      Error message if query failed.
    range: string
    slot_uri: schema:error
    comments:
      - "Null if execution_status is SUCCESS"
      - "Include full error text for debugging"

# --- Result and exclusion slots ---
  result_file_path:
    description: >-
      Absolute path to the file where query results were saved.
    range: string
    slot_uri: schema:contentUrl
    comments:
      - "Use absolute path for reproducibility"
      - "Typical format: JSON for structured results"

  result_count:
    description: >-
      Total number of result bindings (rows) returned by the query.
      May include multiple language variants of the same entity.
    range: integer
    minimum_value: 0
    comments:
      - "Includes all language variants"
      - "For unique entity count, see unique_entity_count"

  result_statistics:
    description: >-
      Detailed statistical analysis of query results.
    # Nested object: the ResultStatistics instance is embedded in the log
    # entry rather than referenced.
    range: ResultStatistics
    inlined: true
    slot_uri: schema:summary

  unique_entity_count:
    description: >-
      Number of unique entities (Q-IDs) in results, excluding language variants.
    range: integer
    minimum_value: 0
    comments:
      - "Deduplicated count of distinct Q-IDs"
      - "More meaningful than result_count for multi-language queries"

  exclusion_count:
    description: >-
      Number of entities explicitly excluded via FILTER NOT IN clauses.
    range: integer
    minimum_value: 0
    comments:
      - "Count of Q-IDs in exclusion list"
      - "Reference exclusion_source for provenance"

  exclusion_source:
    description: >-
      Path to file containing excluded entity IDs (e.g., curated vocabulary file).
    range: string
    comments:
      - "Typically a YAML or JSON file with Q-numbers"
      - "Example: data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"

# --- Cross-reference and annotation slots ---
  related_queries:
    description: >-
      List of related query IDs (e.g., previous version, comparison baseline).
    # Stored as plain string IDs, not as references to SPARQLQueryLog objects.
    range: string
    multivalued: true
    comments:
      - "Use for tracking query refinement chains"
      - "Example: ['archive_query_raw_20251112']"

  tags:
    description: >-
      Keywords for categorizing and searching queries.
    range: string
    multivalued: true
    slot_uri: schema:keywords
    comments:
      - "Example: ['archive', 'GLAMORCUBEPSXHFN', 'hyponym_extraction', 'corrected']"

  notes:
    description: >-
      Free-text notes about this query execution, including observations,
      issues encountered, or follow-up actions needed.
    range: string
    slot_uri: dcterms:description

# QueryComparison fields
  comparison_id:
    description: >-
      Unique identifier for this comparison. Format: baseline_vs_updated_{date}
    range: string
    identifier: true
    # Suffix is a date only (8 digits).
    pattern: '^[a-z_]+_vs_[a-z_]+_[0-9]{8}$'
    comments:
      - "Example: archive_raw_vs_corrected_20251113"

  comparison_date:
    description: >-
      Date when the comparison was performed.
    range: date
    slot_uri: dcterms:created

  baseline_query:
    description: >-
      Reference to the baseline query ID (original query).
    range: SPARQLQueryLog
    required: true

  updated_query:
    description: >-
      Reference to the updated query ID (corrected/refined query).
    range: SPARQLQueryLog
    required: true

  changes_description:
    description: >-
      Summary of changes made between baseline and updated query.
    range: string
    slot_uri: dcterms:description
    comments:
      - "Document query logic changes, exclusion list updates, etc."

  baseline_result_count:
    description: >-
      Number of unique entities in baseline query results.
    range: integer
    minimum_value: 0

  updated_result_count:
    description: >-
      Number of unique entities in updated query results.
    range: integer
    minimum_value: 0

  # Signed value: deliberately has no minimum_value.
  result_difference:
    description: >-
      Difference in result counts (updated - baseline). Positive means more
      results, negative means fewer results.
    range: integer
    comments:
      - "Calculated as updated_result_count - baseline_result_count"

  # Signed value: deliberately has no minimum_value.
  performance_impact:
    description: >-
      Change in query execution time (seconds). Negative is faster.
    range: float
    comments:
      - "Calculated as updated_execution_time - baseline_execution_time"

  recommendation:
    description: >-
      Recommendation on which query version to use going forward.
    range: string
    comments:
      - "Example: 'Use updated query - better precision and performance'"

# ResultStatistics fields
  total_bindings:
    description: >-
      Total number of result bindings including language variants.
    range: integer
    minimum_value: 0
    required: true

  unique_entities:
    description: >-
      Count of unique Q-IDs (deduplicated).
    range: integer
    minimum_value: 0

  entities_with_english_labels:
    description: >-
      Number of entities that have English (en) labels.
    range: integer
    minimum_value: 0
    comments:
      - "Useful for assessing multilingual coverage"

  entities_without_english_labels:
    description: >-
      Number of entities lacking English labels.
    range: integer
    minimum_value: 0
    comments:
      - "These may need label enrichment"

  duplicate_count:
    description: >-
      Number of Q-IDs appearing multiple times in results (should be 0 with DISTINCT).
    range: integer
    minimum_value: 0
    comments:
      - "Non-zero indicates query needs DISTINCT clause"

  language_distribution:
    description: >-
      JSON object mapping language codes to entity counts.
    # Kept as a string holding serialized JSON; the example uses double-quoted
    # keys so that it is itself valid JSON.
    range: string
    comments:
      - 'Example: {"en": 61, "es": 45, "fr": 38, "de": 32}'
      - "Store as JSON string for flexibility"

  excluded_entities:
    description: >-
      Number of entities excluded via FILTER clauses.
    range: integer
    minimum_value: 0

  file_size_mb:
    description: >-
      Size of result file in megabytes.
    range: float
    minimum_value: 0.0
    comments:
      - "Useful for tracking data volume"

# =============================================================================
# ENUMERATIONS
# =============================================================================

enums:
  # Range of the query_method slot.
  QueryMethodEnum:
    description: HTTP method for SPARQL query submission
    permissible_values:
      GET:
        description: HTTP GET request (typical for small queries)
      POST:
        description: HTTP POST request (required for large queries >6KB)

  # Range of the execution_status slot.
  ExecutionStatusEnum:
    description: Status of query execution
    permissible_values:
      SUCCESS:
        description: Query executed successfully and returned results
      FAILED:
        description: Query failed with error
      TIMEOUT:
        description: Query exceeded timeout limit
      CANCELLED:
        description: Query was manually cancelled
      PARTIAL:
        description: Query returned partial results due to limit