---
# LinkML schema for logging SPARQL query executions.
#
# Contents:
#   * classes - SPARQLQueryLog (one record per query execution),
#               QueryComparison (baseline vs. updated query),
#               ResultStatistics (summary inlined into a SPARQLQueryLog)
#   * slots   - every attribute used by the three classes above
#   * enums   - QueryMethodEnum and ExecutionStatusEnum
#
# Slots are declared once, globally, under `slots:` and attached to classes
# by name; per-class constraints live under each class's `slot_usage`.

id: https://w3id.org/heritage/custodian/sparql-query-log
name: heritage-custodian-sparql-query-log
title: Heritage Custodian SPARQL Query Log Schema
description: >-
  Schema for documenting and tracking SPARQL queries executed against Wikidata
  and other knowledge bases. Captures query text, execution metadata, results,
  and provenance for reproducibility and audit trail purposes.
license: https://creativecommons.org/publicdomain/zero/1.0/
# Quoted so the version is always read as a string.
version: "0.1.0"

prefixes:
  linkml: https://w3id.org/linkml/
  heritage: https://w3id.org/heritage/custodian/
  prov: http://www.w3.org/ns/prov#
  dcterms: http://purl.org/dc/terms/
  schema: http://schema.org/
  rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
  sparql: http://www.w3.org/ns/sparql-service-description#

default_prefix: heritage
# Slots that do not declare a range fall back to string.
default_range: string

imports:
  - linkml:types

# =============================================================================
# CORE CLASSES
# =============================================================================

classes:
  SPARQLQueryLog:
    description: >-
      A complete record of a SPARQL query execution, including the query text,
      execution parameters, results metadata, and provenance information.
      Used for documenting research queries, tracking data extraction
      workflows, and enabling reproducibility of results.
    # NOTE(review): confirm that the sparql-service-description vocabulary
    # actually defines a `Query` term before publishing RDF with this URI.
    class_uri: sparql:Query
    slots:
      - query_id
      - query_name
      - query_purpose
      - query_date
      - executed_by
      - sparql_endpoint
      - query_text
      - query_method
      - query_timeout
      - execution_status
      - execution_time_seconds
      - error_message
      - result_file_path
      - result_count
      - result_statistics
      - unique_entity_count
      - exclusion_count
      - exclusion_source
      - related_queries
      - tags
      - notes
    # These constraints repeat what the global slot definitions below already
    # declare (required / identifier); they are kept for explicitness.
    slot_usage:
      query_id:
        required: true
        identifier: true
      query_name:
        required: true
      query_date:
        required: true
      sparql_endpoint:
        required: true
      query_text:
        required: true
      execution_status:
        required: true

  QueryComparison:
    description: >-
      Comparative analysis between two related SPARQL queries. Documents
      differences in query logic, results, and performance to track
      refinements and corrections over time.
    slots:
      - comparison_id
      - comparison_date
      - baseline_query
      - updated_query
      - changes_description
      - baseline_result_count
      - updated_result_count
      - result_difference
      - performance_impact
      - recommendation
    slot_usage:
      comparison_id:
        required: true
        identifier: true
      baseline_query:
        required: true
      updated_query:
        required: true

  ResultStatistics:
    description: >-
      Statistical summary of SPARQL query results, including entity counts,
      language coverage, duplicates, and data quality metrics.
    slots:
      - total_bindings
      - unique_entities
      - entities_with_english_labels
      - entities_without_english_labels
      - duplicate_count
      - language_distribution
      - excluded_entities
      - file_size_mb
    slot_usage:
      total_bindings:
        required: true

# =============================================================================
# SLOTS
# =============================================================================

slots:
  # Core identification
  query_id:
    description: >-
      Unique identifier for this query execution. Format: lowercase class
      prefix + descriptive name + timestamp
      (e.g., "archive_query_corrected_20251113")
    range: string
    identifier: true
    slot_uri: dcterms:identifier
    # NOTE(review): `{8,14}` accepts any suffix of 8 to 14 digits, whereas the
    # comments below document only the 8-digit and 14-digit forms. The prefix
    # part `[a-z_]+` also rejects digits (e.g. "v2"). Tighten or relax only
    # after checking existing identifiers.
    pattern: '^[a-z_]+_[0-9]{8,14}$'
    comments:
      - "Format: {class}_{description}_{YYYYMMDD} or {class}_{description}_{YYYYMMDDHHmmss}"
      - "Example: archive_query_corrected_20251113"

  query_name:
    description: >-
      Human-readable name for this query. Should describe what the query does
      in a concise way.
    range: string
    required: true
    slot_uri: dcterms:title
    comments:
      - "Example: 'Archive Hyponym Extraction with 318 Exclusions'"

  # NOTE(review): query_purpose, notes and changes_description all map to
  # dcterms:description, so they are indistinguishable by predicate in RDF
  # output. Confirm this is intended.
  query_purpose:
    description: >-
      Detailed explanation of why this query was executed and what research
      question it addresses. Include context about the data extraction
      workflow.
    range: string
    slot_uri: dcterms:description
    comments:
      - "Document the research goal, not just the technical implementation"

  query_date:
    description: >-
      Timestamp when the query was executed (ISO 8601 format with timezone).
    range: datetime
    required: true
    slot_uri: prov:atTime
    comments:
      - "Use UTC timezone or explicit timezone offset"
      - "Format: YYYY-MM-DDTHH:mm:ss+00:00"

  executed_by:
    description: >-
      Person, script, or system that executed the query.
    range: string
    slot_uri: prov:wasAssociatedWith
    comments:
      - "Example: 'Scott Kemper', 'execute_archive_query_corrected.py', 'OpenCODE agent'"

  sparql_endpoint:
    description: >-
      URL of the SPARQL endpoint queried.
    range: uri
    required: true
    slot_uri: sparql:endpoint
    comments:
      - "Example: https://query.wikidata.org/sparql"
      - "Example: https://dbpedia.org/sparql"

  query_text:
    description: >-
      Complete SPARQL query text including all prefixes, SELECT/CONSTRUCT,
      WHERE clause, filters, and LIMIT/OFFSET. Should be reproducible.
    range: string
    required: true
    # NOTE(review): confirm that the sparql-service-description vocabulary
    # defines a `text` term.
    slot_uri: sparql:text
    comments:
      - "Include all comments from original query"
      - "Preserve indentation for readability"
      - "Should be copy-paste executable"

  query_method:
    description: >-
      HTTP method used for query execution (GET or POST).
    range: QueryMethodEnum
    slot_uri: schema:httpMethod
    comments:
      - "GET for small queries (<6KB typical limit)"
      - "POST for large queries with many exclusions"

  query_timeout:
    description: >-
      Timeout value in seconds (if specified).
    range: integer
    minimum_value: 0
    comments:
      - "Null if no timeout specified"
      - "Wikidata default: 60 seconds"

  execution_status:
    description: >-
      Status of query execution.
    range: ExecutionStatusEnum
    required: true
    # NOTE(review): confirm that PROV defines `statusInfo`.
    slot_uri: prov:statusInfo
    # One entry per permissible value of ExecutionStatusEnum.
    comments:
      - "SUCCESS: Query completed without errors"
      - "FAILED: Query failed with error"
      - "TIMEOUT: Query exceeded time limit"
      - "CANCELLED: Query was manually cancelled"
      - "PARTIAL: Query returned partial results due to limit"

  execution_time_seconds:
    description: >-
      Time taken to execute query in seconds (wall clock time).
    range: float
    minimum_value: 0.0
    slot_uri: schema:duration
    comments:
      - "Measured from query submission to result retrieval"

  error_message:
    description: >-
      Error message if query failed.
    range: string
    slot_uri: schema:error
    comments:
      - "Null if execution_status is SUCCESS"
      - "Include full error text for debugging"

  result_file_path:
    description: >-
      Absolute path to the file where query results were saved.
    range: string
    slot_uri: schema:contentUrl
    comments:
      - "Use absolute path for reproducibility"
      - "Typical format: JSON for structured results"

  result_count:
    description: >-
      Total number of result bindings (rows) returned by the query. May
      include multiple language variants of the same entity.
    range: integer
    minimum_value: 0
    comments:
      - "Includes all language variants"
      - "For unique entity count, see unique_entity_count"

  result_statistics:
    description: >-
      Detailed statistical analysis of query results.
    range: ResultStatistics
    # The statistics object is nested inside the log record rather than
    # referenced.
    inlined: true
    slot_uri: schema:summary

  unique_entity_count:
    description: >-
      Number of unique entities (Q-IDs) in results, excluding language
      variants.
    range: integer
    minimum_value: 0
    comments:
      - "Deduplicated count of distinct Q-IDs"
      - "More meaningful than result_count for multi-language queries"

  exclusion_count:
    description: >-
      Number of entities explicitly excluded via FILTER NOT IN clauses.
    range: integer
    minimum_value: 0
    comments:
      - "Count of Q-IDs in exclusion list"
      - "Reference exclusion_source for provenance"

  exclusion_source:
    description: >-
      Path to file containing excluded entity IDs (e.g., curated vocabulary
      file).
    range: string
    comments:
      - "Typically a YAML or JSON file with Q-numbers"
      - "Example: data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"

  related_queries:
    description: >-
      List of related query IDs (e.g., previous version, comparison baseline).
    range: string
    multivalued: true
    comments:
      - "Use for tracking query refinement chains"
      - "Example: ['archive_query_raw_20251112']"

  tags:
    description: >-
      Keywords for categorizing and searching queries.
    range: string
    multivalued: true
    slot_uri: schema:keywords
    comments:
      - "Example: ['archive', 'GLAMORCUBEPSXHFN', 'hyponym_extraction', 'corrected']"

  notes:
    description: >-
      Free-text notes about this query execution, including observations,
      issues encountered, or follow-up actions needed.
    range: string
    slot_uri: dcterms:description

  # QueryComparison fields
  comparison_id:
    description: >-
      Unique identifier for this comparison. Format:
      baseline_vs_updated_{date}
    range: string
    identifier: true
    # Two lowercase/underscore names joined by "_vs_", then an 8-digit date.
    pattern: '^[a-z_]+_vs_[a-z_]+_[0-9]{8}$'
    comments:
      - "Example: archive_raw_vs_corrected_20251113"

  comparison_date:
    description: >-
      Date when the comparison was performed.
    range: date
    slot_uri: dcterms:created

  baseline_query:
    description: >-
      Reference to the baseline query ID (original query).
    range: SPARQLQueryLog
    required: true

  updated_query:
    description: >-
      Reference to the updated query ID (corrected/refined query).
    range: SPARQLQueryLog
    required: true

  changes_description:
    description: >-
      Summary of changes made between baseline and updated query.
    range: string
    slot_uri: dcterms:description
    comments:
      - "Document query logic changes, exclusion list updates, etc."

  baseline_result_count:
    description: >-
      Number of unique entities in baseline query results.
    range: integer
    minimum_value: 0

  updated_result_count:
    description: >-
      Number of unique entities in updated query results.
    range: integer
    minimum_value: 0

  # No minimum_value here: the difference is negative when the updated query
  # returns fewer results.
  result_difference:
    description: >-
      Difference in result counts (updated - baseline). Positive means more
      results, negative means fewer results.
    range: integer
    comments:
      - "Calculated as updated_result_count - baseline_result_count"

  # No minimum_value here: a negative value means the updated query is faster.
  performance_impact:
    description: >-
      Change in query execution time (seconds). Negative is faster.
    range: float
    comments:
      - "Calculated as updated_execution_time - baseline_execution_time"

  recommendation:
    description: >-
      Recommendation on which query version to use going forward.
    range: string
    comments:
      - "Example: 'Use updated query - better precision and performance'"

  # ResultStatistics fields
  total_bindings:
    description: >-
      Total number of result bindings including language variants.
    range: integer
    minimum_value: 0
    required: true

  unique_entities:
    description: >-
      Count of unique Q-IDs (deduplicated).
    range: integer
    minimum_value: 0

  entities_with_english_labels:
    description: >-
      Number of entities that have English (en) labels.
    range: integer
    minimum_value: 0
    comments:
      - "Useful for assessing multilingual coverage"

  entities_without_english_labels:
    description: >-
      Number of entities lacking English labels.
    range: integer
    minimum_value: 0
    comments:
      - "These may need label enrichment"

  duplicate_count:
    description: >-
      Number of Q-IDs appearing multiple times in results (should be 0 with
      DISTINCT).
    range: integer
    minimum_value: 0
    comments:
      - "Non-zero indicates query needs DISTINCT clause"

  language_distribution:
    description: >-
      JSON object mapping language codes to entity counts.
    range: string
    comments:
      # JSON requires double-quoted keys, so the example uses them.
      - 'Example: {"en": 61, "es": 45, "fr": 38, "de": 32}'
      - "Store as JSON string for flexibility"

  excluded_entities:
    description: >-
      Number of entities excluded via FILTER clauses.
    range: integer
    minimum_value: 0

  file_size_mb:
    description: >-
      Size of result file in megabytes.
    range: float
    minimum_value: 0.0
    comments:
      - "Useful for tracking data volume"

# =============================================================================
# ENUMERATIONS
# =============================================================================

enums:
  # Range of the query_method slot.
  QueryMethodEnum:
    description: HTTP method for SPARQL query submission
    permissible_values:
      GET:
        description: HTTP GET request (typical for small queries)
      POST:
        description: HTTP POST request (required for large queries >6KB)

  # Range of the execution_status slot.
  ExecutionStatusEnum:
    description: Status of query execution
    permissible_values:
      SUCCESS:
        description: Query executed successfully and returned results
      FAILED:
        description: Query failed with error
      TIMEOUT:
        description: Query exceeded timeout limit
      CANCELLED:
        description: Query was manually cancelled
      PARTIAL:
        description: Query returned partial results due to limit