34 KiB
34 KiB
SPARQL Template Schema Definition
Overview
This document defines the schema for SPARQL query templates used in the template-based query generation system. Templates are defined in YAML format and validated against this schema using Pydantic models.
JSON Schema
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://bronhouder.nl/schemas/sparql-template/v1",
"title": "SPARQL Template Collection",
"description": "Collection of pre-defined SPARQL query templates for heritage institutions",
"type": "object",
"required": ["version", "templates"],
"properties": {
"version": {
"type": "string",
"description": "Schema version for migration support",
"pattern": "^\\d+\\.\\d+\\.\\d+$",
"examples": ["1.0.0"]
},
"templates": {
"type": "object",
"description": "Map of template ID to template definition",
"additionalProperties": {
"$ref": "#/$defs/Template"
}
},
"slot_sources": {
"type": "object",
"description": "External sources for slot enum values",
"additionalProperties": {
"$ref": "#/$defs/SlotSource"
}
}
},
"$defs": {
"Template": {
"type": "object",
"required": ["id", "question_patterns", "sparql_template", "slots"],
"properties": {
"id": {
"type": "string",
"description": "Unique template identifier",
"pattern": "^[a-z][a-z0-9_]*$"
},
"description": {
"type": "string",
"description": "Human-readable template description"
},
"intent": {
"type": "string",
"description": "Query intent this template handles",
"enum": ["geographic", "statistical", "relational", "temporal", "entity_lookup", "comparative", "exploration"]
},
"entity_type": {
"type": "string",
"description": "Entity type this template queries",
"enum": ["institution", "person", "both"]
},
"question_patterns": {
"type": "array",
"description": "Natural language patterns that match this template",
"items": {
"type": "string"
},
"minItems": 1
},
"sparql_template": {
"type": "string",
"description": "Jinja2 template for SPARQL query with {{slot_name}} placeholders"
},
"slots": {
"type": "object",
"description": "Slot definitions for template variables",
"additionalProperties": {
"$ref": "#/$defs/Slot"
}
},
"examples": {
"type": "array",
"description": "Example question -> instantiated query pairs for testing",
"items": {
"$ref": "#/$defs/Example"
}
},
"fallback_template": {
"type": "string",
"description": "Alternative template ID to use if this one fails"
},
"priority": {
"type": "integer",
"description": "Template priority for matching (higher = preferred)",
"default": 0
},
"tags": {
"type": "array",
"description": "Tags for template categorization",
"items": {
"type": "string"
}
}
}
},
"Slot": {
"type": "object",
"required": ["type"],
"properties": {
"type": {
"type": "string",
"description": "Slot value type",
"enum": ["enum", "string", "integer", "float", "uri", "date"]
},
"description": {
"type": "string",
"description": "Human-readable slot description"
},
"required": {
"type": "boolean",
"description": "Whether this slot must be filled",
"default": true
},
"default": {
"description": "Default value if slot not extracted"
},
"source": {
"type": "string",
"description": "Reference to external enum source (e.g., sparql_validation_rules.json#institution_types)"
},
"enum_values": {
"type": "array",
"description": "Inline enum values (if not using external source)",
"items": {
"type": "string"
}
},
"synonyms": {
"type": "object",
"description": "Map of user-facing terms to canonical values",
"additionalProperties": {
"type": "string"
}
},
"validation_pattern": {
"type": "string",
"description": "Regex pattern for validation"
},
"transform": {
"type": "string",
"description": "Transformation to apply (uppercase, lowercase, slugify, etc.)",
"enum": ["uppercase", "lowercase", "slugify", "uri_encode", "escape_sparql"]
}
}
},
"Example": {
"type": "object",
"required": ["question", "slots", "expected_sparql"],
"properties": {
"question": {
"type": "string",
"description": "Example user question"
},
"slots": {
"type": "object",
"description": "Expected extracted slot values"
},
"expected_sparql": {
"type": "string",
"description": "Expected instantiated SPARQL query"
}
}
},
"SlotSource": {
"type": "object",
"required": ["type", "path"],
"properties": {
"type": {
"type": "string",
"enum": ["json", "yaml", "csv"]
},
"path": {
"type": "string",
"description": "File path relative to project root"
},
"key_path": {
"type": "string",
"description": "JSON path to enum values (e.g., $.institution_types)"
},
"value_field": {
"type": "string",
"description": "Field name for enum value"
},
"label_field": {
"type": "string",
"description": "Field name for human-readable label"
}
}
}
}
}
Pydantic Models
"""Pydantic models for SPARQL template validation."""
from enum import Enum
from typing import Any, Optional
from pydantic import BaseModel, Field, field_validator
class SlotType(str, Enum):
    """Data types a template slot value may take.

    The ``str`` mix-in makes each member compare equal to its plain string
    value, so values loaded from YAML/JSON can be compared directly.
    """

    ENUM = "enum"        # value drawn from a closed set
    STRING = "string"
    INTEGER = "integer"
    FLOAT = "float"
    URI = "uri"
    DATE = "date"
class SlotTransform(str, Enum):
    """Names of the transformations that may be applied to a slot value.

    Members are plain strings (``str`` mix-in), matching the values written
    in a slot's ``transform`` key.
    """

    UPPERCASE = "uppercase"
    LOWERCASE = "lowercase"
    SLUGIFY = "slugify"
    URI_ENCODE = "uri_encode"
    ESCAPE_SPARQL = "escape_sparql"
class QueryIntent(str, Enum):
    """Intent labels a template can declare in its ``intent`` field.

    Members are plain strings (``str`` mix-in).
    """

    GEOGRAPHIC = "geographic"
    STATISTICAL = "statistical"
    RELATIONAL = "relational"
    TEMPORAL = "temporal"
    ENTITY_LOOKUP = "entity_lookup"
    COMPARATIVE = "comparative"
    EXPLORATION = "exploration"
class EntityType(str, Enum):
    """Kinds of entity a template's query can target.

    Members are plain strings (``str`` mix-in).
    """

    INSTITUTION = "institution"
    PERSON = "person"
    BOTH = "both"
class SlotDefinition(BaseModel):
    """Declarative description of one template slot (a variable in the query).

    Only ``type`` is mandatory; every other attribute falls back to the
    default given in its ``Field``.
    """

    type: SlotType = Field(description="Data type of the slot value")
    description: Optional[str] = Field(
        None, description="Human-readable description of the slot"
    )
    required: bool = Field(
        True, description="Whether this slot must be filled for query execution"
    )
    default: Optional[Any] = Field(
        None, description="Default value if slot is not extracted from question"
    )
    source: Optional[str] = Field(
        None,
        description="External source reference (e.g., 'sparql_validation_rules.json#institution_types')",
    )
    enum_values: Optional[list[str]] = Field(
        None, description="Inline enum values if not using external source"
    )
    synonyms: Optional[dict[str, str]] = Field(
        None, description="Map of user-facing terms to canonical values"
    )
    validation_pattern: Optional[str] = Field(
        None, description="Regex pattern for value validation"
    )
    transform: Optional[SlotTransform] = Field(
        None, description="Transformation to apply to extracted value"
    )
class TemplateExample(BaseModel):
    """One worked example for a template: a question, the slot values
    expected from it, and the query those values should produce.

    All three fields are required.
    """

    question: str = Field(description="Example user question")
    slots: dict[str, Any] = Field(description="Expected extracted slot values")
    expected_sparql: str = Field(description="Expected instantiated SPARQL query")
class SPARQLTemplate(BaseModel):
    """One pre-defined SPARQL query template.

    Required fields: ``id``, ``question_patterns`` (at least one entry),
    ``sparql_template`` and ``slots``. Everything else is optional and falls
    back to the default given in its ``Field``.
    """

    id: str = Field(
        pattern=r"^[a-z][a-z0-9_]*$",
        description="Unique template identifier (snake_case)",
    )
    description: Optional[str] = Field(
        None, description="Human-readable template description"
    )
    intent: Optional[QueryIntent] = Field(
        None, description="Query intent this template handles"
    )
    entity_type: EntityType = Field(
        EntityType.INSTITUTION, description="Entity type this template queries"
    )
    question_patterns: list[str] = Field(
        min_length=1,
        description="Natural language patterns that match this template",
    )
    sparql_template: str = Field(
        description="Jinja2 template with {{slot_name}} placeholders"
    )
    slots: dict[str, SlotDefinition] = Field(
        description="Slot definitions for template variables"
    )
    examples: Optional[list[TemplateExample]] = Field(
        None, description="Example question -> query pairs for testing"
    )
    fallback_template: Optional[str] = Field(
        None, description="Alternative template ID if this one fails"
    )
    priority: int = Field(0, description="Matching priority (higher = preferred)")
    tags: Optional[list[str]] = Field(None, description="Tags for categorization")

    @field_validator("sparql_template")
    @classmethod
    def validate_sparql_has_placeholders(cls, v: str, info) -> str:
        """Pass-through hook for ``sparql_template``.

        The value is accepted unchanged in every case. Templates without any
        ``{{`` or ``{%`` marker (static queries) are deliberately allowed, and
        no slot/placeholder cross-check is performed here; that is left to
        instantiation time.
        """
        return v
class SlotSource(BaseModel):
    """Pointer to an external file that supplies enum values for slots.

    ``type`` and ``path`` are required; ``type`` must be exactly one of
    ``json``, ``yaml`` or ``csv``.
    """

    type: str = Field(pattern=r"^(json|yaml|csv)$", description="Source file format")
    path: str = Field(description="File path relative to project root")
    key_path: Optional[str] = Field(None, description="JSON path to enum values")
    value_field: Optional[str] = Field(None, description="Field name for enum value")
    label_field: Optional[str] = Field(
        None, description="Field name for human-readable label"
    )
class TemplateCollection(BaseModel):
    """Top-level document: a schema version, the templates keyed by ID, and
    optional external slot sources."""

    version: str = Field(pattern=r"^\d+\.\d+\.\d+$", description="Schema version")
    templates: dict[str, SPARQLTemplate] = Field(
        description="Map of template ID to template definition"
    )
    slot_sources: Optional[dict[str, SlotSource]] = Field(
        None, description="External sources for slot enum values"
    )

    @field_validator("templates")
    @classmethod
    def validate_template_ids_match_keys(cls, v: dict) -> dict:
        """Reject the mapping if any entry's key differs from its template's ``id``.

        Raises:
            ValueError: for the first mismatching entry, in mapping order.
        """
        first_mismatch = next(
            ((map_key, tpl.id) for map_key, tpl in v.items() if tpl.id != map_key),
            None,
        )
        if first_mismatch is not None:
            map_key, declared_id = first_mismatch
            raise ValueError(
                f"Template key '{map_key}' doesn't match template id '{declared_id}'"
            )
        return v
YAML Template Format
Complete Example
# data/sparql_templates.yaml
# Schema version; TemplateCollection validates it against ^\d+\.\d+\.\d+$.
version: "1.0.0"
# Named external files holding enum values. Each entry is a SlotSource
# (type + path required, key_path optional).
# NOTE(review): the slot `source:` refs in the templates below use the bare file name
# ("sparql_validation_rules.json#...") rather than these entry names or this path —
# confirm how the two are reconciled.
slot_sources:
institution_types:
type: json
path: data/validation/sparql_validation_rules.json
key_path: "$.institution_types"
provinces:
type: json
path: data/validation/sparql_validation_rules.json
key_path: "$.subregions.NL"
templates:
# =============================================================================
# GEOGRAPHIC QUERIES
# =============================================================================
# Institutions of one type within one Dutch province.
region_institution_search:
id: region_institution_search
description: "Find institutions of a specific type in a Dutch province"
intent: geographic
entity_type: institution
# NOTE(review): the pattern variables ({institution_type_nl}, {institution_type_en}, {province})
# are named differently from the slots below; how they map onto slots is not shown here — confirm.
question_patterns:
- "Welke {institution_type_nl} zijn er in {province}?"
- "{institution_type_nl} in {province}"
- "Geef me alle {institution_type_nl} in {province}"
- "Which {institution_type_en} are in {province}?"
# The province is matched as a substring of the institution IRI (STR(?institution)),
# not through a dedicated property; results are capped at 100.
sparql_template: |
PREFIX hc: <https://nde.nl/ontology/hc/>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX schema: <http://schema.org/>
SELECT ?institution ?name ?city WHERE {
?institution a crm:E39_Actor ;
hc:institutionType "{{institution_type_code}}" ;
skos:prefLabel ?name .
OPTIONAL { ?institution schema:addressLocality ?city }
FILTER(CONTAINS(STR(?institution), "{{province_code}}"))
}
ORDER BY ?name
LIMIT 100
# Both slots are required enums; synonyms map Dutch/English user terms to the canonical code.
slots:
institution_type_code:
type: enum
description: "Single-letter GLAMORCUBESFIXPHDNT code"
required: true
source: sparql_validation_rules.json#institution_types
synonyms:
archieven: "A"
archief: "A"
archives: "A"
musea: "M"
museum: "M"
museums: "M"
bibliotheken: "L"
bibliotheek: "L"
libraries: "L"
library: "L"
galerijen: "G"
galerie: "G"
galleries: "G"
# NOTE(review): slot_sources.provinces points at key_path "$.subregions.NL", but the
# source ref below stops at "#subregions" — confirm which level holds the province codes.
province_code:
type: enum
description: "Dutch province ISO code (e.g., NL-DR)"
required: true
source: sparql_validation_rules.json#subregions
synonyms:
drenthe: "NL-DR"
drente: "NL-DR"
friesland: "NL-FR"
fryslan: "NL-FR"
groningen: "NL-GR"
gelderland: "NL-GE"
limburg: "NL-LI"
noord-brabant: "NL-NB"
brabant: "NL-NB"
noord-holland: "NL-NH"
overijssel: "NL-OV"
utrecht: "NL-UT"
zeeland: "NL-ZE"
zuid-holland: "NL-ZH"
flevoland: "NL-FL"
# expected_sparql is the template above with both placeholders substituted.
examples:
- question: "Welke archieven zijn er in Drenthe?"
slots:
institution_type_code: "A"
province_code: "NL-DR"
expected_sparql: |
PREFIX hc: <https://nde.nl/ontology/hc/>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX schema: <http://schema.org/>
SELECT ?institution ?name ?city WHERE {
?institution a crm:E39_Actor ;
hc:institutionType "A" ;
skos:prefLabel ?name .
OPTIONAL { ?institution schema:addressLocality ?city }
FILTER(CONTAINS(STR(?institution), "NL-DR"))
}
ORDER BY ?name
LIMIT 100
priority: 10
tags:
- geographic
- institutions
- dutch
# Institutions of one type within one city. No `examples` are defined for this template.
city_institution_search:
id: city_institution_search
description: "Find institutions of a specific type in a city"
intent: geographic
entity_type: institution
question_patterns:
- "Welke {institution_type_nl} zijn er in {city}?"
- "{institution_type_nl} in {city}"
# schema:addressLocality is mandatory here (not OPTIONAL), so institutions without a
# locality never match. The query lowercases ?city and the slot applies
# `transform: lowercase`, making the substring match case-insensitive.
sparql_template: |
PREFIX hc: <https://nde.nl/ontology/hc/>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX schema: <http://schema.org/>
SELECT ?institution ?name WHERE {
?institution a crm:E39_Actor ;
hc:institutionType "{{institution_type_code}}" ;
skos:prefLabel ?name ;
schema:addressLocality ?city .
FILTER(CONTAINS(LCASE(?city), "{{city_lower}}"))
}
ORDER BY ?name
LIMIT 50
slots:
institution_type_code:
type: enum
required: true
source: sparql_validation_rules.json#institution_types
synonyms:
archieven: "A"
musea: "M"
bibliotheken: "L"
# Free-text slot: no enum source or synonyms, only lowercasing.
city_lower:
type: string
required: true
transform: lowercase
description: "City name (case-insensitive)"
priority: 8
tags:
- geographic
- city
# =============================================================================
# STATISTICAL QUERIES
# =============================================================================
# Count of institutions of one type.
# NOTE(review): the description says "country or region", but the query hard-codes
# schema:addressCountry to wikidata Q55 and exposes no country/region slot — confirm intent.
count_institutions_by_type:
id: count_institutions_by_type
description: "Count institutions by type in a country or region"
intent: statistical
entity_type: institution
question_patterns:
- "Hoeveel {institution_type_nl} zijn er in Nederland?"
- "Hoeveel {institution_type_nl} heeft Nederland?"
- "How many {institution_type_en} are there in the Netherlands?"
- "Count of {institution_type_en}"
# COUNT(DISTINCT ...) returns a single row bound to ?count.
sparql_template: |
PREFIX hc: <https://nde.nl/ontology/hc/>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX schema: <http://schema.org/>
SELECT (COUNT(DISTINCT ?institution) as ?count) WHERE {
?institution a crm:E39_Actor ;
hc:institutionType "{{institution_type_code}}" ;
schema:addressCountry <http://www.wikidata.org/entity/Q55> .
}
# The institution type is the only slot.
slots:
institution_type_code:
type: enum
required: true
source: sparql_validation_rules.json#institution_types
synonyms:
archieven: "A"
musea: "M"
bibliotheken: "L"
examples:
- question: "Hoeveel musea zijn er in Nederland?"
slots:
institution_type_code: "M"
expected_sparql: |
PREFIX hc: <https://nde.nl/ontology/hc/>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX schema: <http://schema.org/>
SELECT (COUNT(DISTINCT ?institution) as ?count) WHERE {
?institution a crm:E39_Actor ;
hc:institutionType "M" ;
schema:addressCountry <http://www.wikidata.org/entity/Q55> .
}
priority: 9
tags:
- statistical
- count
# Static query: institution counts grouped by type, largest group first.
distribution_by_type:
id: distribution_by_type
description: "Distribution of institutions by type"
intent: statistical
entity_type: institution
# The patterns contain no variables; they are fixed phrases.
question_patterns:
- "Verdeling van instellingen per type"
- "Hoeveel instellingen per type?"
- "Distribution of institutions by type"
sparql_template: |
PREFIX hc: <https://nde.nl/ontology/hc/>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
SELECT ?type (COUNT(?institution) AS ?count) WHERE {
?institution a crm:E39_Actor ;
hc:institutionType ?type .
}
GROUP BY ?type
ORDER BY DESC(?count)
# `slots` is a required key, so it is present but empty: the template has no placeholders.
slots: {}
priority: 7
tags:
- statistical
- distribution
# =============================================================================
# ENTITY LOOKUP QUERIES
# =============================================================================
# Look up institutions whose preferred label contains the given name.
find_institution_by_name:
id: find_institution_by_name
description: "Find a specific institution by name"
intent: entity_lookup
entity_type: institution
# NOTE(review): the bare "{institution_name}" pattern has no fixed words, so it presumably
# matches any input — confirm that priority 6 keeps more specific templates ahead of it.
question_patterns:
- "Informatie over {institution_name}"
- "Wat weet je over {institution_name}?"
- "Tell me about {institution_name}"
- "{institution_name}"
# Only the label is mandatory; type, city, website and description are OPTIONAL, so those
# result columns may be unbound. The name filter lowercases ?name and the slot applies
# `transform: lowercase`, giving a case-insensitive substring match.
sparql_template: |
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX schema: <http://schema.org/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX hc: <https://nde.nl/ontology/hc/>
SELECT ?institution ?name ?type ?city ?website ?description WHERE {
?institution a crm:E39_Actor ;
skos:prefLabel ?name .
OPTIONAL { ?institution hc:institutionType ?type }
OPTIONAL { ?institution schema:addressLocality ?city }
OPTIONAL { ?institution foaf:homepage ?website }
OPTIONAL { ?institution schema:description ?description }
FILTER(CONTAINS(LCASE(STR(?name)), "{{name_lower}}"))
}
LIMIT 10
# NOTE(review): name_lower is free text placed inside a quoted SPARQL literal and only
# lowercased — confirm that quotes/backslashes are escaped before instantiation.
slots:
name_lower:
type: string
required: true
transform: lowercase
description: "Institution name to search for (case-insensitive)"
priority: 6
tags:
- entity_lookup
- name_search
# =============================================================================
# PERSON QUERIES
# =============================================================================
# People whose job title mentions a given institution.
staff_at_institution:
id: staff_at_institution
description: "Find staff/employees at a specific institution"
intent: entity_lookup
entity_type: person
question_patterns:
- "Wie werkt er bij {institution_name}?"
- "Medewerkers van {institution_name}"
- "Staff at {institution_name}"
- "Who works at {institution_name}?"
# The institution is matched as a lowercase substring of schema:jobTitle (there is no
# explicit person-to-institution relation in this query). Rows whose name contains
# "linkedin member" are excluded.
sparql_template: |
PREFIX schema: <http://schema.org/>
SELECT DISTINCT ?name ?jobTitle WHERE {
?person a schema:Person ;
schema:name ?name ;
schema:jobTitle ?jobTitle .
FILTER(CONTAINS(LCASE(?jobTitle), "{{institution_slug}}"))
FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
}
ORDER BY ?name
LIMIT 50
# Synonyms map the phrase a user types to the text searched for in job titles
# (e.g. "koninklijke bibliotheek" is searched as "kb").
slots:
institution_slug:
type: string
required: true
transform: lowercase
description: "Institution name as it appears in job titles"
synonyms:
rijksmuseum: "rijksmuseum"
nationaal archief: "nationaal archief"
het utrechts archief: "utrechts archief"
koninklijke bibliotheek: "kb"
kb: "kb"
examples:
- question: "Wie werkt er bij het Nationaal Archief?"
slots:
institution_slug: "nationaal archief"
expected_sparql: |
PREFIX schema: <http://schema.org/>
SELECT DISTINCT ?name ?jobTitle WHERE {
?person a schema:Person ;
schema:name ?name ;
schema:jobTitle ?jobTitle .
FILTER(CONTAINS(LCASE(?jobTitle), "nationaal archief"))
FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
}
ORDER BY ?name
LIMIT 50
priority: 8
tags:
- person
- staff
# People holding a given role at a given institution, both matched as
# lowercase substrings of schema:jobTitle.
find_role_at_institution:
  id: find_role_at_institution
  description: "Find people with a specific role at an institution"
  intent: entity_lookup
  entity_type: person
  question_patterns:
    - "Wie is de {role} bij {institution_name}?"
    - "{role} van {institution_name}"
    - "Who is the {role} at {institution_name}?"
  # role_term_alt is optional. The alternative CONTAINS is only emitted when the slot
  # has a non-empty value: an empty value would render CONTAINS(..., ""), which is
  # true for every string and would turn the whole role filter into a no-op.
  sparql_template: |
    PREFIX schema: <http://schema.org/>
    SELECT DISTINCT ?name ?jobTitle WHERE {
      ?person a schema:Person ;
              schema:name ?name ;
              schema:jobTitle ?jobTitle .
      FILTER(CONTAINS(LCASE(?jobTitle), "{{institution_slug}}"))
      FILTER(
        CONTAINS(LCASE(?jobTitle), "{{role_term}}"){% if role_term_alt %} ||
        CONTAINS(LCASE(?jobTitle), "{{role_term_alt}}"){% endif %}
      )
      FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
    }
    ORDER BY ?name
    LIMIT 20
  slots:
    institution_slug:
      type: string
      required: true
      transform: lowercase
    # Synonyms normalise Dutch and English role names to the Dutch term.
    role_term:
      type: string
      required: true
      transform: lowercase
      synonyms:
        directeur: "directeur"
        director: "directeur"
        archivaris: "archivaris"
        archivist: "archivaris"
        conservator: "conservator"
        curator: "conservator"
        bibliothecaris: "bibliothecaris"
        librarian: "bibliothecaris"
    # Optional second spelling of the role; empty means "no alternative".
    role_term_alt:
      type: string
      required: false
      default: ""
      transform: lowercase
  priority: 9
  tags:
    - person
    - role
Slot Resolution
Resolving External Sources
"""Resolve slot values from external sources."""
import json
from pathlib import Path
from typing import Any
class SlotResolver:
    """Load slot value tables from JSON/YAML files and cache them per reference."""

    def __init__(self, project_root: Path):
        """Args:
            project_root: Directory that file names in source references are
                resolved against.
        """
        self.project_root = project_root
        # Keyed by the full reference string ("file#key.path"), so the same
        # file is re-read for each distinct key path but never for a repeat.
        self._cache: dict[str, dict[str, Any]] = {}

    def resolve_source(self, source_ref: str) -> dict[str, Any]:
        """Resolve a source reference like 'file.json#key.path'.

        Args:
            source_ref: File name relative to ``project_root``, optionally
                followed by ``#`` and a dot-separated key path into the
                loaded document.

        Returns:
            Whatever the file holds at the key path (the whole document when
            no key path is given). ``get_valid_values`` accepts a dict or a
            list here. The same cached object is returned on repeat calls, so
            callers should not mutate it.

        Raises:
            ValueError: If the file name does not end in .json, .yaml or .yml.
            KeyError: If a key-path segment is missing from the document.
            OSError: If the file cannot be opened.
        """
        if source_ref in self._cache:
            return self._cache[source_ref]

        # "file#a.b" -> ("file", "a.b"); without "#" the key path is empty.
        file_path, _, key_path = source_ref.partition("#")
        full_path = self.project_root / file_path

        # Decode explicitly as UTF-8: the platform default encoding would
        # corrupt (or fail on) non-ASCII labels on systems where it is not UTF-8.
        if file_path.endswith(".json"):
            with open(full_path, encoding="utf-8") as f:
                data = json.load(f)
        elif file_path.endswith((".yaml", ".yml")):
            import yaml

            with open(full_path, encoding="utf-8") as f:
                data = yaml.safe_load(f)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        # Walk down the document one key-path segment at a time.
        if key_path:
            for key in key_path.split("."):
                data = data[key]

        self._cache[source_ref] = data
        return data

    def get_valid_values(self, slot: "SlotDefinition") -> set[str]:
        """Get all valid values for a slot.

        Combines the slot's inline ``enum_values`` with the values from its
        external ``source``: the keys when the source resolves to a dict, the
        items when it resolves to a list. Any other source shape contributes
        nothing.
        """
        values = set()
        if slot.enum_values:
            values.update(slot.enum_values)
        if slot.source:
            source_data = self.resolve_source(slot.source)
            if isinstance(source_data, dict):
                values.update(source_data.keys())
            elif isinstance(source_data, list):
                values.update(source_data)
        return values
Synonym Resolution
"""Resolve user-facing terms to canonical slot values."""
from rapidfuzz import fuzz, process
def resolve_synonym(
    user_input: str,
    slot: SlotDefinition,
    resolver: SlotResolver,
    fuzzy_threshold: float = 70.0,
) -> str | None:
    """Resolve user input to a canonical slot value.

    Exact matches always win over fuzzy ones, so input that already is a
    canonical value (e.g. a one-letter code) can never be pulled towards a
    similar-looking synonym:

    1. Exact match in synonyms
    2. Exact (case-insensitive) match in valid values
    3. Fuzzy match in synonyms
    4. Fuzzy (case-insensitive) match in valid values

    Args:
        user_input: User-provided value
        slot: Slot definition with synonyms and source
        resolver: SlotResolver for external sources
        fuzzy_threshold: Minimum fuzzy match score (0-100)

    Returns:
        Canonical value or None if no match
    """
    user_lower = user_input.lower().strip()
    if not user_lower:
        # Nothing to match; also avoids scoring an empty query.
        return None

    synonyms = slot.synonyms or {}

    # 1. Exact match in synonyms
    if user_lower in synonyms:
        return synonyms[user_lower]

    # Index valid values by their lowercase form so both the exact and the
    # fuzzy step compare like with like. Sorting makes the outcome
    # independent of set iteration order when scores tie or when two values
    # differ only by case (the first in sort order is kept).
    canonical_by_lower: dict[str, str] = {}
    for value in sorted(resolver.get_valid_values(slot)):
        canonical_by_lower.setdefault(value.lower(), value)

    # 2. Exact match in valid values
    if user_lower in canonical_by_lower:
        return canonical_by_lower[user_lower]

    # 3. Fuzzy match in synonyms
    if synonyms:
        result = process.extractOne(
            user_lower,
            synonyms.keys(),
            scorer=fuzz.WRatio,
            score_cutoff=fuzzy_threshold,
        )
        if result:
            return synonyms[result[0]]

    # 4. Fuzzy match in valid values
    if canonical_by_lower:
        result = process.extractOne(
            user_lower,
            canonical_by_lower.keys(),
            scorer=fuzz.WRatio,
            score_cutoff=fuzzy_threshold,
        )
        if result:
            return canonical_by_lower[result[0]]

    return None
Template Instantiation
"""Instantiate SPARQL templates with slot values."""
from jinja2 import Environment, BaseLoader
class TemplateInstantiator:
"""Instantiate SPARQL templates with extracted slot values."""
def __init__(self):
self.env = Environment(loader=BaseLoader())
# Use {{ }} for Jinja2 (default)
# This matches the template format in YAML
def instantiate(
self,
template: SPARQLTemplate,
slots: dict[str, Any],
) -> str:
"""Fill template with slot values.
Args:
template: Template definition
slots: Extracted slot values
Returns:
Instantiated SPARQL query string
Raises:
ValueError: If required slot is missing
"""
# Validate required slots
for slot_name, slot_def in template.slots.items():
if slot_def.required and slot_name not in slots:
if slot_def.default is not None:
slots[slot_name] = slot_def.default
else:
raise ValueError(f"Required slot '{slot_name}' not provided")
# Apply transformations
transformed_slots = {}
for slot_name, value in slots.items():
if slot_name in template.slots:
slot_def = template.slots[slot_name]
transformed_slots[slot_name] = self._transform(value, slot_def)
else:
transformed_slots[slot_name] = value
# Render template
jinja_template = self.env.from_string(template.sparql_template)
return jinja_template.render(**transformed_slots)
def _transform(self, value: Any, slot_def: SlotDefinition) -> Any:
"""Apply transformation to slot value."""
if slot_def.transform is None:
return value
if not isinstance(value, str):
return value
match slot_def.transform:
case SlotTransform.UPPERCASE:
return value.upper()
case SlotTransform.LOWERCASE:
return value.lower()
case SlotTransform.SLUGIFY:
import re
slug = value.lower()
slug = re.sub(r"[^a-z0-9\s-]", "", slug)
slug = re.sub(r"[\s_]+", "-", slug)
return slug.strip("-")
case SlotTransform.URI_ENCODE:
from urllib.parse import quote
return quote(value, safe="")
case SlotTransform.ESCAPE_SPARQL:
# Escape special characters for SPARQL strings
return value.replace("\\", "\\\\").replace('"', '\\"')
case _:
return value
Validation
"""Validate instantiated SPARQL queries."""
import re
# Matches an IRI reference (<...>) or a SPARQL string literal (short or
# triple-quoted, either quote style). IRIs are matched only so that quote
# characters inside them are not mistaken for the start of a string.
_IRI_OR_STRING_LITERAL = re.compile(
    r"(?P<iri><[^<>\"{}|^`\\\s]*>)"
    r"|\"\"\"(?:\\.|[^\\])*?\"\"\""
    r"|'''(?:\\.|[^\\])*?'''"
    r"|\"(?:\\.|[^\"\\\r\n])*\""
    r"|'(?:\\.|[^'\\\r\n])*'",
    re.DOTALL,
)


def _blank_string_literals(sparql: str) -> str:
    """Replace every string literal's content with an empty literal; keep IRIs."""
    return _IRI_OR_STRING_LITERAL.sub(lambda m: m.group("iri") or '""', sparql)


def validate_instantiated_sparql(sparql: str) -> tuple[bool, list[str]]:
    """Run lightweight sanity checks on an instantiated SPARQL query.

    These are heuristics, not a parser. Structural checks (orphaned dot,
    query form, WHERE clause, brace balance) ignore the content of string
    literals, so user-supplied text such as ``"{"`` or ``"select"`` inside a
    FILTER cannot trigger a false error. The unresolved-placeholder check
    runs on the full text because placeholders normally sit inside literals.

    Args:
        sparql: Instantiated SPARQL query

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    errors = []

    # Check for unresolved placeholders (on the raw text, literals included)
    unresolved = re.findall(r"\{\{[^}]+\}\}", sparql)
    if unresolved:
        errors.append(f"Unresolved placeholders: {unresolved}")

    structure = _blank_string_literals(sparql)

    # Orphaned dots (common LLM error): a line holding nothing but "."
    if re.search(r"^\s*\.\s*$", structure, re.MULTILINE):
        errors.append("Orphaned dot on empty line")

    # Missing SELECT/ASK/CONSTRUCT
    if not re.search(r"\b(SELECT|ASK|CONSTRUCT|DESCRIBE)\b", structure, re.IGNORECASE):
        errors.append("Missing query form (SELECT/ASK/CONSTRUCT/DESCRIBE)")

    # Missing WHERE clause for SELECT
    if re.search(r"\bSELECT\b", structure, re.IGNORECASE):
        if not re.search(r"\bWHERE\s*\{", structure, re.IGNORECASE):
            errors.append("SELECT query missing WHERE clause")

    # Unbalanced braces (a count comparison only; nesting order is not checked)
    open_braces = structure.count("{")
    close_braces = structure.count("}")
    if open_braces != close_braces:
        errors.append(f"Unbalanced braces: {open_braces} open, {close_braces} close")

    return len(errors) == 0, errors
Usage Example
"""Example usage of template-based SPARQL generation."""
from pathlib import Path
import yaml
def main():
    """Load the template collection, fill one template and print the result.

    Reads ``data/sparql_templates.yaml`` relative to the current working
    directory, instantiates ``region_institution_search`` with fixed slot
    values, validates the query and prints either the query or the errors.
    """
    # Load templates. The file holds Dutch text, so decode explicitly as
    # UTF-8 instead of relying on the platform default encoding.
    with open("data/sparql_templates.yaml", encoding="utf-8") as f:
        raw = yaml.safe_load(f)
    collection = TemplateCollection.model_validate(raw)

    # Initialize components. The resolver is not used in this minimal
    # example; resolve_synonym() takes it when slot values come from free text.
    resolver = SlotResolver(Path("."))
    instantiator = TemplateInstantiator()

    # Find matching template
    template = collection.templates["region_institution_search"]

    # Extract slots from user question
    # (In production, this would use the DSPy classifier)
    slots = {
        "institution_type_code": "A",
        "province_code": "NL-DR",
    }

    # Instantiate template
    sparql = instantiator.instantiate(template, slots)

    # Validate
    is_valid, errors = validate_instantiated_sparql(sparql)
    if is_valid:
        print("Generated SPARQL:")
        print(sparql)
    else:
        print(f"Validation errors: {errors}")


if __name__ == "__main__":
    main()
Summary
The template schema provides:
- Structured definitions - Pydantic models ensure type safety
- Flexible slots - Support for enum, string, integer, float, URI, and date values, with optional transformations
- Synonym resolution - Map user terms to canonical values
- External sources - Load enum values from existing validation files
- Built-in examples - Templates can carry example question/query pairs that double as test cases
- Validation - Pre-execution checks for unresolved placeholders and common syntax errors