SPARQL Template Schema Definition

Overview

This document defines the schema for SPARQL query templates used in the template-based query generation system. Templates are defined in YAML format and validated against this schema using Pydantic models.

JSON Schema

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://bronhouder.nl/schemas/sparql-template/v1",
  "title": "SPARQL Template Collection",
  "description": "Collection of pre-defined SPARQL query templates for heritage institutions",
  "type": "object",
  "required": ["version", "templates"],
  "properties": {
    "version": {
      "type": "string",
      "description": "Schema version for migration support",
      "pattern": "^\\d+\\.\\d+\\.\\d+$",
      "examples": ["1.0.0"]
    },
    "templates": {
      "type": "object",
      "description": "Map of template ID to template definition",
      "additionalProperties": {
        "$ref": "#/$defs/Template"
      }
    },
    "slot_sources": {
      "type": "object",
      "description": "External sources for slot enum values",
      "additionalProperties": {
        "$ref": "#/$defs/SlotSource"
      }
    }
  },
  "$defs": {
    "Template": {
      "type": "object",
      "required": ["id", "question_patterns", "sparql_template", "slots"],
      "properties": {
        "id": {
          "type": "string",
          "description": "Unique template identifier",
          "pattern": "^[a-z][a-z0-9_]*$"
        },
        "description": {
          "type": "string",
          "description": "Human-readable template description"
        },
        "intent": {
          "type": "string",
          "description": "Query intent this template handles",
          "enum": ["geographic", "statistical", "relational", "temporal", "entity_lookup", "comparative", "exploration"]
        },
        "entity_type": {
          "type": "string",
          "description": "Entity type this template queries",
          "enum": ["institution", "person", "both"]
        },
        "question_patterns": {
          "type": "array",
          "description": "Natural language patterns that match this template",
          "items": {
            "type": "string"
          },
          "minItems": 1
        },
        "sparql_template": {
          "type": "string",
          "description": "Jinja2 template for SPARQL query with {{slot_name}} placeholders"
        },
        "slots": {
          "type": "object",
          "description": "Slot definitions for template variables",
          "additionalProperties": {
            "$ref": "#/$defs/Slot"
          }
        },
        "examples": {
          "type": "array",
          "description": "Example question -> instantiated query pairs for testing",
          "items": {
            "$ref": "#/$defs/Example"
          }
        },
        "fallback_template": {
          "type": "string",
          "description": "Alternative template ID to use if this one fails"
        },
        "priority": {
          "type": "integer",
          "description": "Template priority for matching (higher = preferred)",
          "default": 0
        },
        "tags": {
          "type": "array",
          "description": "Tags for template categorization",
          "items": {
            "type": "string"
          }
        }
      }
    },
    "Slot": {
      "type": "object",
      "required": ["type"],
      "properties": {
        "type": {
          "type": "string",
          "description": "Slot value type",
          "enum": ["enum", "string", "integer", "float", "uri", "date"]
        },
        "description": {
          "type": "string",
          "description": "Human-readable slot description"
        },
        "required": {
          "type": "boolean",
          "description": "Whether this slot must be filled",
          "default": true
        },
        "default": {
          "description": "Default value if slot not extracted"
        },
        "source": {
          "type": "string",
          "description": "Reference to external enum source (e.g., sparql_validation_rules.json#institution_types)"
        },
        "enum_values": {
          "type": "array",
          "description": "Inline enum values (if not using external source)",
          "items": {
            "type": "string"
          }
        },
        "synonyms": {
          "type": "object",
          "description": "Map of user-facing terms to canonical values",
          "additionalProperties": {
            "type": "string"
          }
        },
        "validation_pattern": {
          "type": "string",
          "description": "Regex pattern for validation"
        },
        "transform": {
          "type": "string",
          "description": "Transformation to apply (uppercase, lowercase, slugify, etc.)",
          "enum": ["uppercase", "lowercase", "slugify", "uri_encode", "escape_sparql"]
        }
      }
    },
    "Example": {
      "type": "object",
      "required": ["question", "slots", "expected_sparql"],
      "properties": {
        "question": {
          "type": "string",
          "description": "Example user question"
        },
        "slots": {
          "type": "object",
          "description": "Expected extracted slot values"
        },
        "expected_sparql": {
          "type": "string",
          "description": "Expected instantiated SPARQL query"
        }
      }
    },
    "SlotSource": {
      "type": "object",
      "required": ["type", "path"],
      "properties": {
        "type": {
          "type": "string",
          "enum": ["json", "yaml", "csv"]
        },
        "path": {
          "type": "string",
          "description": "File path relative to project root"
        },
        "key_path": {
          "type": "string",
          "description": "JSON path to enum values (e.g., $.institution_types)"
        },
        "value_field": {
          "type": "string",
          "description": "Field name for enum value"
        },
        "label_field": {
          "type": "string",
          "description": "Field name for human-readable label"
        }
      }
    }
  }
}

Pydantic Models

"""Pydantic models for SPARQL template validation."""

from enum import Enum
from typing import Any, Optional
from pydantic import BaseModel, Field, field_validator


class SlotType(str, Enum):
    """Value types a template slot may declare.

    The member values are the same strings listed in the ``type`` enum of the
    ``Slot`` definition in the JSON Schema. Because ``str`` is mixed in, each
    member is itself a string and compares equal to its raw value.
    """
    # Value is drawn from a set of allowed values (inline and/or external source).
    ENUM = "enum"
    STRING = "string"
    INTEGER = "integer"
    FLOAT = "float"
    URI = "uri"
    DATE = "date"


class SlotTransform(str, Enum):
    """Normalizations that can be applied to an extracted slot value.

    The member values are the same strings listed in the ``transform`` enum of
    the ``Slot`` definition in the JSON Schema. The transformations themselves
    are implemented in ``TemplateInstantiator._transform``, which leaves
    non-string values untouched.
    """
    UPPERCASE = "uppercase"
    LOWERCASE = "lowercase"
    SLUGIFY = "slugify"
    URI_ENCODE = "uri_encode"
    # Escapes characters that would terminate a quoted SPARQL string literal.
    ESCAPE_SPARQL = "escape_sparql"


class QueryIntent(str, Enum):
    """Query intent classifications a template can be tagged with.

    The member values are the same strings listed in the ``intent`` enum of
    the ``Template`` definition in the JSON Schema.
    """
    GEOGRAPHIC = "geographic"
    STATISTICAL = "statistical"
    RELATIONAL = "relational"
    TEMPORAL = "temporal"
    ENTITY_LOOKUP = "entity_lookup"
    COMPARATIVE = "comparative"
    EXPLORATION = "exploration"


class EntityType(str, Enum):
    """Kinds of entity a template's query returns.

    The member values are the same strings listed in the ``entity_type`` enum
    of the ``Template`` definition in the JSON Schema. ``SPARQLTemplate`` uses
    ``INSTITUTION`` when a template does not specify an entity type.
    """
    INSTITUTION = "institution"
    PERSON = "person"
    BOTH = "both"


class SlotDefinition(BaseModel):
    """Schema for one template variable ("slot").

    A slot records the value type, whether the slot is mandatory, an optional
    fallback value, where its permitted values come from, and how an extracted
    value is normalized before substitution into the query.
    """

    # The only field without a default: every slot must declare its type.
    type: SlotType = Field(..., description="Data type of the slot value")
    description: Optional[str] = Field(
        None, description="Human-readable description of the slot"
    )
    required: bool = Field(
        True, description="Whether this slot must be filled for query execution"
    )
    default: Optional[Any] = Field(
        None, description="Default value if slot is not extracted from question"
    )
    # Permitted values may come from an external file, an inline list, or both.
    source: Optional[str] = Field(
        None,
        description="External source reference (e.g., 'sparql_validation_rules.json#institution_types')",
    )
    enum_values: Optional[list[str]] = Field(
        None, description="Inline enum values if not using external source"
    )
    synonyms: Optional[dict[str, str]] = Field(
        None, description="Map of user-facing terms to canonical values"
    )
    validation_pattern: Optional[str] = Field(
        None, description="Regex pattern for value validation"
    )
    transform: Optional[SlotTransform] = Field(
        None, description="Transformation to apply to extracted value"
    )


class TemplateExample(BaseModel):
    """One worked example for a template: a question, the slot values expected
    to be extracted from it, and the query the template should render to.

    All three fields are mandatory.
    """

    question: str = Field(..., description="Example user question")
    slots: dict[str, Any] = Field(..., description="Expected extracted slot values")
    expected_sparql: str = Field(
        ..., description="Expected instantiated SPARQL query"
    )


class SPARQLTemplate(BaseModel):
    """A pre-defined SPARQL query template.

    A template pairs natural-language question patterns with a Jinja2 query
    skeleton and the definitions of the slots that fill it. ``id``,
    ``question_patterns``, ``sparql_template`` and ``slots`` are mandatory;
    everything else is optional metadata.
    """

    id: str = Field(
        pattern=r"^[a-z][a-z0-9_]*$",
        description="Unique template identifier (snake_case)"
    )
    description: str | None = Field(
        default=None,
        description="Human-readable template description"
    )
    intent: QueryIntent | None = Field(
        default=None,
        description="Query intent this template handles"
    )
    entity_type: EntityType = Field(
        default=EntityType.INSTITUTION,
        description="Entity type this template queries"
    )
    question_patterns: list[str] = Field(
        min_length=1,
        description="Natural language patterns that match this template"
    )
    sparql_template: str = Field(
        description="Jinja2 template with {{slot_name}} placeholders"
    )
    # May be an empty mapping for static queries that take no variables.
    slots: dict[str, SlotDefinition] = Field(
        description="Slot definitions for template variables"
    )
    examples: list[TemplateExample] | None = Field(
        default=None,
        description="Example question -> query pairs for testing"
    )
    fallback_template: str | None = Field(
        default=None,
        description="Alternative template ID if this one fails"
    )
    priority: int = Field(
        default=0,
        description="Matching priority (higher = preferred)"
    )
    tags: list[str] | None = Field(
        default=None,
        description="Tags for categorization"
    )

    @field_validator("sparql_template")
    @classmethod
    def validate_sparql_has_placeholders(cls, v: str, info) -> str:
        """Reject query templates that are not syntactically valid Jinja2.

        Templates without any ``{{ ... }}`` or ``{% ... %}`` markers are
        accepted: static queries are allowed. Placeholders are not
        cross-checked against ``slots`` here, because extra variables may be
        supplied at instantiation time.

        Args:
            v: The raw template text.
            info: Pydantic validation context (unused).

        Returns:
            The template text, unchanged.

        Raises:
            ValueError: If Jinja2 cannot parse the template, so that a broken
                template is reported when the collection is loaded rather
                than when a query is first rendered.
        """
        # Imported here so the models can be imported without pulling in the
        # template engine until a template is actually validated.
        from jinja2 import Environment, TemplateSyntaxError

        try:
            # Default delimiters, matching the environment used for rendering.
            Environment().parse(v)
        except TemplateSyntaxError as exc:
            raise ValueError(
                f"sparql_template is not valid Jinja2 (line {exc.lineno}): {exc.message}"
            ) from exc
        return v


class SlotSource(BaseModel):
    """Pointer to a file that supplies the allowed values of an enum slot.

    ``type`` and ``path`` are mandatory; the remaining fields optionally
    describe where inside the file the values live.
    """

    # Restricted by pattern to the three supported formats.
    type: str = Field(
        ..., pattern=r"^(json|yaml|csv)$", description="Source file format"
    )
    path: str = Field(..., description="File path relative to project root")
    key_path: Optional[str] = Field(None, description="JSON path to enum values")
    value_field: Optional[str] = Field(None, description="Field name for enum value")
    label_field: Optional[str] = Field(
        None, description="Field name for human-readable label"
    )


class TemplateCollection(BaseModel):
    """Top-level document: a versioned set of SPARQL templates plus the
    optional external sources their enum slots refer to.
    """

    # Three dot-separated numeric components, e.g. "1.0.0".
    version: str = Field(
        ..., pattern=r"^\d+\.\d+\.\d+$", description="Schema version"
    )
    templates: dict[str, SPARQLTemplate] = Field(
        ..., description="Map of template ID to template definition"
    )
    slot_sources: Optional[dict[str, SlotSource]] = Field(
        None, description="External sources for slot enum values"
    )

    @field_validator("templates")
    @classmethod
    def validate_template_ids_match_keys(cls, v: dict) -> dict:
        """Check that every template is stored under its own ``id``.

        Raises:
            ValueError: For the first entry whose mapping key differs from
                the ``id`` of the template stored under it.
        """
        first_mismatch = next(
            ((name, entry.id) for name, entry in v.items() if entry.id != name),
            None,
        )
        if first_mismatch is not None:
            name, declared_id = first_mismatch
            raise ValueError(
                f"Template key '{name}' doesn't match template id '{declared_id}'"
            )
        return v

YAML Template Format

Complete Example

# data/sparql_templates.yaml
version: "1.0.0"

slot_sources:
  institution_types:
    type: json
    path: data/validation/sparql_validation_rules.json
    key_path: "$.institution_types"
  provinces:
    type: json
    path: data/validation/sparql_validation_rules.json
    key_path: "$.subregions.NL"

templates:
  # =============================================================================
  # GEOGRAPHIC QUERIES
  # =============================================================================
  
  region_institution_search:
    id: region_institution_search
    description: "Find institutions of a specific type in a Dutch province"
    intent: geographic
    entity_type: institution
    question_patterns:
      - "Welke {institution_type_nl} zijn er in {province}?"
      - "{institution_type_nl} in {province}"
      - "Geef me alle {institution_type_nl} in {province}"
      - "Which {institution_type_en} are in {province}?"
    sparql_template: |
      PREFIX hc: <https://nde.nl/ontology/hc/>
      PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
      PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
      PREFIX schema: <http://schema.org/>
      
      SELECT ?institution ?name ?city WHERE {
        ?institution a crm:E39_Actor ;
                     hc:institutionType "{{institution_type_code}}" ;
                     skos:prefLabel ?name .
        OPTIONAL { ?institution schema:addressLocality ?city }
        FILTER(CONTAINS(STR(?institution), "{{province_code}}"))
      }
      ORDER BY ?name
      LIMIT 100      
    slots:
      institution_type_code:
        type: enum
        description: "Single-letter GLAMORCUBESFIXPHDNT code"
        required: true
        source: sparql_validation_rules.json#institution_types
        synonyms:
          archieven: "A"
          archief: "A"
          archives: "A"
          musea: "M"
          museum: "M"
          museums: "M"
          bibliotheken: "L"
          bibliotheek: "L"
          libraries: "L"
          library: "L"
          galerijen: "G"
          galerie: "G"
          galleries: "G"
      province_code:
        type: enum
        description: "Dutch province ISO code (e.g., NL-DR)"
        required: true
        # Point at the NL entry rather than the whole "subregions" map, in line
        # with slot_sources.provinces (key_path "$.subregions.NL"). The source
        # reference is split on "." after the "#", so a bare "#subregions"
        # would presumably yield country-level keys instead of province codes.
        source: sparql_validation_rules.json#subregions.NL
        synonyms:
          drenthe: "NL-DR"
          drente: "NL-DR"
          friesland: "NL-FR"
          fryslan: "NL-FR"
          groningen: "NL-GR"
          gelderland: "NL-GE"
          limburg: "NL-LI"
          noord-brabant: "NL-NB"
          brabant: "NL-NB"
          noord-holland: "NL-NH"
          overijssel: "NL-OV"
          utrecht: "NL-UT"
          zeeland: "NL-ZE"
          zuid-holland: "NL-ZH"
          flevoland: "NL-FL"
    examples:
      - question: "Welke archieven zijn er in Drenthe?"
        slots:
          institution_type_code: "A"
          province_code: "NL-DR"
        expected_sparql: |
          PREFIX hc: <https://nde.nl/ontology/hc/>
          PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
          PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
          PREFIX schema: <http://schema.org/>
          
          SELECT ?institution ?name ?city WHERE {
            ?institution a crm:E39_Actor ;
                         hc:institutionType "A" ;
                         skos:prefLabel ?name .
            OPTIONAL { ?institution schema:addressLocality ?city }
            FILTER(CONTAINS(STR(?institution), "NL-DR"))
          }
          ORDER BY ?name
          LIMIT 100          
    priority: 10
    tags:
      - geographic
      - institutions
      - dutch

  city_institution_search:
    id: city_institution_search
    description: "Find institutions of a specific type in a city"
    intent: geographic
    entity_type: institution
    question_patterns:
      - "Welke {institution_type_nl} zijn er in {city}?"
      - "{institution_type_nl} in {city}"
    sparql_template: |
      PREFIX hc: <https://nde.nl/ontology/hc/>
      PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
      PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
      PREFIX schema: <http://schema.org/>
      
      SELECT ?institution ?name WHERE {
        ?institution a crm:E39_Actor ;
                     hc:institutionType "{{institution_type_code}}" ;
                     skos:prefLabel ?name ;
                     schema:addressLocality ?city .
        FILTER(CONTAINS(LCASE(?city), "{{city_lower}}"))
      }
      ORDER BY ?name
      LIMIT 50      
    slots:
      institution_type_code:
        type: enum
        required: true
        source: sparql_validation_rules.json#institution_types
        synonyms:
          archieven: "A"
          musea: "M"
          bibliotheken: "L"
      city_lower:
        type: string
        required: true
        transform: lowercase
        description: "City name (case-insensitive)"
    priority: 8
    tags:
      - geographic
      - city

  # =============================================================================
  # STATISTICAL QUERIES
  # =============================================================================

  count_institutions_by_type:
    id: count_institutions_by_type
    description: "Count institutions by type in a country or region"
    intent: statistical
    entity_type: institution
    question_patterns:
      - "Hoeveel {institution_type_nl} zijn er in Nederland?"
      - "Hoeveel {institution_type_nl} heeft Nederland?"
      - "How many {institution_type_en} are there in the Netherlands?"
      - "Count of {institution_type_en}"
    sparql_template: |
      PREFIX hc: <https://nde.nl/ontology/hc/>
      PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
      PREFIX schema: <http://schema.org/>
      
      SELECT (COUNT(DISTINCT ?institution) as ?count) WHERE {
        ?institution a crm:E39_Actor ;
                     hc:institutionType "{{institution_type_code}}" ;
                     schema:addressCountry <http://www.wikidata.org/entity/Q55> .
      }      
    slots:
      institution_type_code:
        type: enum
        required: true
        source: sparql_validation_rules.json#institution_types
        synonyms:
          archieven: "A"
          musea: "M"
          bibliotheken: "L"
    examples:
      - question: "Hoeveel musea zijn er in Nederland?"
        slots:
          institution_type_code: "M"
        expected_sparql: |
          PREFIX hc: <https://nde.nl/ontology/hc/>
          PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
          PREFIX schema: <http://schema.org/>
          
          SELECT (COUNT(DISTINCT ?institution) as ?count) WHERE {
            ?institution a crm:E39_Actor ;
                         hc:institutionType "M" ;
                         schema:addressCountry <http://www.wikidata.org/entity/Q55> .
          }          
    priority: 9
    tags:
      - statistical
      - count

  distribution_by_type:
    id: distribution_by_type
    description: "Distribution of institutions by type"
    intent: statistical
    entity_type: institution
    question_patterns:
      - "Verdeling van instellingen per type"
      - "Hoeveel instellingen per type?"
      - "Distribution of institutions by type"
    sparql_template: |
      PREFIX hc: <https://nde.nl/ontology/hc/>
      PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
      
      SELECT ?type (COUNT(?institution) AS ?count) WHERE {
        ?institution a crm:E39_Actor ;
                     hc:institutionType ?type .
      }
      GROUP BY ?type
      ORDER BY DESC(?count)      
    slots: {}
    priority: 7
    tags:
      - statistical
      - distribution

  # =============================================================================
  # ENTITY LOOKUP QUERIES
  # =============================================================================

  find_institution_by_name:
    id: find_institution_by_name
    description: "Find a specific institution by name"
    intent: entity_lookup
    entity_type: institution
    question_patterns:
      - "Informatie over {institution_name}"
      - "Wat weet je over {institution_name}?"
      - "Tell me about {institution_name}"
      - "{institution_name}"
    sparql_template: |
      PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
      PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
      PREFIX schema: <http://schema.org/>
      PREFIX foaf: <http://xmlns.com/foaf/0.1/>
      PREFIX hc: <https://nde.nl/ontology/hc/>
      
      SELECT ?institution ?name ?type ?city ?website ?description WHERE {
        ?institution a crm:E39_Actor ;
                     skos:prefLabel ?name .
        OPTIONAL { ?institution hc:institutionType ?type }
        OPTIONAL { ?institution schema:addressLocality ?city }
        OPTIONAL { ?institution foaf:homepage ?website }
        OPTIONAL { ?institution schema:description ?description }
        FILTER(CONTAINS(LCASE(STR(?name)), "{{name_lower}}"))
      }
      LIMIT 10      
    slots:
      name_lower:
        type: string
        required: true
        transform: lowercase
        description: "Institution name to search for (case-insensitive)"
    priority: 6
    tags:
      - entity_lookup
      - name_search

  # =============================================================================
  # PERSON QUERIES
  # =============================================================================

  staff_at_institution:
    id: staff_at_institution
    description: "Find staff/employees at a specific institution"
    intent: entity_lookup
    entity_type: person
    question_patterns:
      - "Wie werkt er bij {institution_name}?"
      - "Medewerkers van {institution_name}"
      - "Staff at {institution_name}"
      - "Who works at {institution_name}?"
    sparql_template: |
      PREFIX schema: <http://schema.org/>
      
      SELECT DISTINCT ?name ?jobTitle WHERE {
        ?person a schema:Person ;
                schema:name ?name ;
                schema:jobTitle ?jobTitle .
        FILTER(CONTAINS(LCASE(?jobTitle), "{{institution_slug}}"))
        FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
      }
      ORDER BY ?name
      LIMIT 50      
    slots:
      institution_slug:
        type: string
        required: true
        transform: lowercase
        description: "Institution name as it appears in job titles"
        synonyms:
          rijksmuseum: "rijksmuseum"
          nationaal archief: "nationaal archief"
          het utrechts archief: "utrechts archief"
          koninklijke bibliotheek: "kb"
          kb: "kb"
    examples:
      - question: "Wie werkt er bij het Nationaal Archief?"
        slots:
          institution_slug: "nationaal archief"
        expected_sparql: |
          PREFIX schema: <http://schema.org/>
          
          SELECT DISTINCT ?name ?jobTitle WHERE {
            ?person a schema:Person ;
                    schema:name ?name ;
                    schema:jobTitle ?jobTitle .
            FILTER(CONTAINS(LCASE(?jobTitle), "nationaal archief"))
            FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
          }
          ORDER BY ?name
          LIMIT 50          
    priority: 8
    tags:
      - person
      - staff

  find_role_at_institution:
    id: find_role_at_institution
    description: "Find people with a specific role at an institution"
    intent: entity_lookup
    entity_type: person
    question_patterns:
      - "Wie is de {role} bij {institution_name}?"
      - "{role} van {institution_name}"
      - "Who is the {role} at {institution_name}?"
    # role_term_alt is optional and defaults to "". CONTAINS(x, "") is always
    # true in SPARQL, so the alternative test must only be emitted when an
    # alternative term is actually present; otherwise the role filter would
    # match every job title.
    sparql_template: |
      PREFIX schema: <http://schema.org/>

      SELECT DISTINCT ?name ?jobTitle WHERE {
        ?person a schema:Person ;
                schema:name ?name ;
                schema:jobTitle ?jobTitle .
        FILTER(CONTAINS(LCASE(?jobTitle), "{{institution_slug}}"))
        FILTER(
          CONTAINS(LCASE(?jobTitle), "{{role_term}}"){% if role_term_alt %} ||
          CONTAINS(LCASE(?jobTitle), "{{role_term_alt}}"){% endif %}
        )
        FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
      }
      ORDER BY ?name
      LIMIT 20
    slots:
      institution_slug:
        type: string
        required: true
        transform: lowercase
      role_term:
        type: string
        required: true
        transform: lowercase
        synonyms:
          directeur: "directeur"
          director: "directeur"
          archivaris: "archivaris"
          archivist: "archivaris"
          conservator: "conservator"
          curator: "conservator"
          bibliothecaris: "bibliothecaris"
          librarian: "bibliothecaris"
      # Optional second spelling/translation of the role; omitted from the
      # query when empty (see the conditional in sparql_template).
      role_term_alt:
        type: string
        required: false
        default: ""
        transform: lowercase
    priority: 9
    tags:
      - person
      - role

Slot Resolution

Resolving External Sources

"""Resolve slot values from external sources."""

import json
from pathlib import Path
from typing import Any


class SlotResolver:
    """Load the allowed values of enum slots from external JSON/YAML files.

    Every resolved reference is cached for the lifetime of the resolver, so a
    given ``file#key.path`` reference is read from disk at most once.
    """

    def __init__(self, project_root: Path):
        """
        Args:
            project_root: Directory that file names in source references are
                resolved against.
        """
        self.project_root = project_root
        self._cache: dict[str, Any] = {}

    def resolve_source(self, source_ref: str) -> Any:
        """Resolve a source reference like 'file.json#key.path'.

        Args:
            source_ref: Reference in format 'file.json#key.path'. The part
                after ``#`` is optional; when present it is a dot-separated
                chain of mapping keys to descend into.

        Returns:
            The data found at the referenced location. This is whatever the
            file contains there - typically a dict keyed by canonical value
            or a list of values.

        Raises:
            ValueError: If the file extension is not .json, .yaml or .yml.
            KeyError: If a segment of the key path does not exist.
            OSError: If the file cannot be opened.
        """
        if source_ref in self._cache:
            return self._cache[source_ref]

        # "file#a.b" -> ("file", "a.b"); without "#" the key path is empty.
        file_path, _, key_path = source_ref.partition("#")

        full_path = self.project_root / file_path
        # Read as UTF-8 explicitly: the platform default encoding would
        # otherwise decide how non-ASCII labels are decoded.
        if file_path.endswith(".json"):
            with open(full_path, encoding="utf-8") as f:
                data = json.load(f)
        elif file_path.endswith((".yaml", ".yml")):
            import yaml
            with open(full_path, encoding="utf-8") as f:
                data = yaml.safe_load(f)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        if key_path:
            for key in key_path.split("."):
                try:
                    data = data[key]
                except KeyError as exc:
                    # Same exception type as a plain lookup, but say which
                    # reference was being resolved.
                    raise KeyError(
                        f"Key '{key}' of path '{key_path}' not found in {file_path}"
                    ) from exc

        self._cache[source_ref] = data
        return data

    def get_valid_values(self, slot: "SlotDefinition") -> set[str]:
        """Get all valid values for a slot.

        Combines inline ``enum_values`` with the values of the external
        ``source``: the keys when the source resolves to a dict, the items
        when it resolves to a list. Source values are converted to ``str`` so
        the result really is a set of strings even when a YAML file contains
        numeric keys or items.

        Raises:
            TypeError: If a list source contains nested dicts or lists, which
                cannot be used as values directly.
        """
        values: set[str] = set()

        if slot.enum_values:
            values.update(slot.enum_values)

        if slot.source:
            source_data = self.resolve_source(slot.source)
            if isinstance(source_data, dict):
                values.update(str(key) for key in source_data)
            elif isinstance(source_data, list):
                for entry in source_data:
                    if isinstance(entry, (dict, list)):
                        raise TypeError(
                            f"Source '{slot.source}' contains non-scalar entries; "
                            "expected a list of plain values"
                        )
                    values.add(str(entry))

        return values

Synonym Resolution

"""Resolve user-facing terms to canonical slot values."""

from rapidfuzz import fuzz, process


def resolve_synonym(
    user_input: str,
    slot: SlotDefinition,
    resolver: SlotResolver,
    fuzzy_threshold: float = 70.0,
) -> str | None:
    """Resolve user input to a canonical slot value.

    Matching is case-insensitive throughout and proceeds in this order,
    returning at the first hit:

    1. Exact match in synonyms
    2. Fuzzy match in synonyms
    3. Exact match in valid values
    4. Fuzzy match in valid values

    Args:
        user_input: User-provided value
        slot: Slot definition with synonyms and source
        resolver: SlotResolver for external sources
        fuzzy_threshold: Minimum fuzzy match score (0-100)

    Returns:
        Canonical value (in its original spelling) or None if no match
    """
    # Valid values shorter than this are only matched exactly. WRatio includes
    # partial matching, which can score a one- or two-letter code highly
    # against any input that merely contains those letters.
    min_fuzzy_candidate_len = 3

    user_lower = user_input.lower().strip()
    if not user_lower:
        # Nothing to match against; avoid scoring an empty query.
        return None

    if slot.synonyms:
        # Normalize keys the same way as the input so that a synonym written
        # with capitals or stray whitespace can still match.
        synonyms: dict[str, str] = {}
        for term, canonical in slot.synonyms.items():
            synonyms.setdefault(term.lower().strip(), canonical)

        # 1. Exact match in synonyms
        if user_lower in synonyms:
            return synonyms[user_lower]

        # 2. Fuzzy match in synonyms
        result = process.extractOne(
            user_lower,
            list(synonyms),
            scorer=fuzz.WRatio,
            score_cutoff=fuzzy_threshold,
        )
        if result:
            return synonyms[result[0]]

    # Map lowercased value -> original spelling. Sorting first makes the
    # outcome independent of set iteration order when values differ only by
    # case or tie on score.
    by_lower: dict[str, str] = {}
    for value in sorted(resolver.get_valid_values(slot)):
        by_lower.setdefault(value.lower(), value)

    # 3. Exact match in valid values
    if user_lower in by_lower:
        return by_lower[user_lower]

    # 4. Fuzzy match in valid values, compared in lowercase because the input
    #    was lowercased above; upper-case codes such as "NL-DR" would
    #    otherwise never score against it.
    fuzzy_candidates = [
        candidate for candidate in by_lower
        if len(candidate) >= min_fuzzy_candidate_len
    ]
    result = process.extractOne(
        user_lower,
        fuzzy_candidates,
        scorer=fuzz.WRatio,
        score_cutoff=fuzzy_threshold,
    )
    if result:
        return by_lower[result[0]]

    return None

Template Instantiation

"""Instantiate SPARQL templates with slot values."""

from jinja2 import Environment, BaseLoader


class TemplateInstantiator:
    """Instantiate SPARQL templates with extracted slot values."""
    
    def __init__(self):
        self.env = Environment(loader=BaseLoader())
        # Use {{ }} for Jinja2 (default)
        # This matches the template format in YAML
    
    def instantiate(
        self,
        template: SPARQLTemplate,
        slots: dict[str, Any],
    ) -> str:
        """Fill template with slot values.
        
        Args:
            template: Template definition
            slots: Extracted slot values
            
        Returns:
            Instantiated SPARQL query string
            
        Raises:
            ValueError: If required slot is missing
        """
        # Validate required slots
        for slot_name, slot_def in template.slots.items():
            if slot_def.required and slot_name not in slots:
                if slot_def.default is not None:
                    slots[slot_name] = slot_def.default
                else:
                    raise ValueError(f"Required slot '{slot_name}' not provided")
        
        # Apply transformations
        transformed_slots = {}
        for slot_name, value in slots.items():
            if slot_name in template.slots:
                slot_def = template.slots[slot_name]
                transformed_slots[slot_name] = self._transform(value, slot_def)
            else:
                transformed_slots[slot_name] = value
        
        # Render template
        jinja_template = self.env.from_string(template.sparql_template)
        return jinja_template.render(**transformed_slots)
    
    def _transform(self, value: Any, slot_def: SlotDefinition) -> Any:
        """Apply transformation to slot value."""
        if slot_def.transform is None:
            return value
        
        if not isinstance(value, str):
            return value
        
        match slot_def.transform:
            case SlotTransform.UPPERCASE:
                return value.upper()
            case SlotTransform.LOWERCASE:
                return value.lower()
            case SlotTransform.SLUGIFY:
                import re
                slug = value.lower()
                slug = re.sub(r"[^a-z0-9\s-]", "", slug)
                slug = re.sub(r"[\s_]+", "-", slug)
                return slug.strip("-")
            case SlotTransform.URI_ENCODE:
                from urllib.parse import quote
                return quote(value, safe="")
            case SlotTransform.ESCAPE_SPARQL:
                # Escape special characters for SPARQL strings
                return value.replace("\\", "\\\\").replace('"', '\\"')
            case _:
                return value

Validation

"""Validate instantiated SPARQL queries."""

import re


# Lexical tokens that must not take part in structural checks: quoted string
# literals (their content is data), IRIs (kept as-is; recognized only so that
# a "#" inside <...> is not mistaken for a comment), and "#" comments.
_SPARQL_LEXICAL_TOKEN = re.compile(
    r'(?P<string>'
    r'"""(?:[^"\\]|\\.|"(?!""))*"""'
    r"|'''(?:[^'\\]|\\.|'(?!''))*'''"
    r'|"(?:[^"\\\n\r]|\\.)*"'
    r"|'(?:[^'\\\n\r]|\\.)*'"
    r')'
    r'|(?P<iri><[^<>"{}|^`\\\s]*>)'
    r'|(?P<comment>#[^\n\r]*)',
    re.DOTALL,
)


def _strip_literals_and_comments(sparql: str) -> str:
    """Return the query with string literals emptied and comments removed."""

    def _replace(match: re.Match) -> str:
        if match.lastgroup == "string":
            return '""'
        if match.lastgroup == "iri":
            return match.group(0)
        return ""

    return _SPARQL_LEXICAL_TOKEN.sub(_replace, sparql)


def validate_instantiated_sparql(sparql: str) -> tuple[bool, list[str]]:
    """Run cheap sanity checks on an instantiated SPARQL query.

    This is a heuristic lint, not a SPARQL parser: it catches common
    template/rendering mistakes before the query is sent anywhere. Structural
    checks ignore the content of string literals and comments, so a slot
    value such as ``"a{b"`` does not count as a brace and a keyword inside a
    literal does not count as a query form.

    Args:
        sparql: Instantiated SPARQL query

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    errors = []

    # Placeholders are looked for in the raw text on purpose: templates put
    # them inside quoted literals.
    unresolved = re.findall(r"\{\{[^}]+\}\}", sparql)
    if unresolved:
        errors.append(f"Unresolved placeholders: {unresolved}")

    structure = _strip_literals_and_comments(sparql)

    # Orphaned dots (common LLM error)
    if re.search(r"^\s*\.\s*$", structure, re.MULTILINE):
        errors.append("Orphaned dot on empty line")

    # Missing SELECT/ASK/CONSTRUCT
    if not re.search(r"\b(SELECT|ASK|CONSTRUCT|DESCRIBE)\b", structure, re.IGNORECASE):
        errors.append("Missing query form (SELECT/ASK/CONSTRUCT/DESCRIBE)")

    # Missing WHERE clause for SELECT
    if re.search(r"\bSELECT\b", structure, re.IGNORECASE):
        if not re.search(r"\bWHERE\s*\{", structure, re.IGNORECASE):
            errors.append("SELECT query missing WHERE clause")

    # Unbalanced braces: compare counts, and also make sure no closing brace
    # appears before the opening brace it would belong to.
    open_braces = structure.count("{")
    close_braces = structure.count("}")
    if open_braces != close_braces:
        errors.append(f"Unbalanced braces: {open_braces} open, {close_braces} close")
    else:
        depth = 0
        for char in structure:
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth < 0:
                    errors.append("Closing brace without a matching opening brace")
                    break

    return len(errors) == 0, errors

Usage Example

"""Example usage of template-based SPARQL generation."""

from pathlib import Path
import yaml


def main():
    """Walk through the pipeline once: load, resolve, instantiate, validate.

    Loads the template collection from ``data/sparql_templates.yaml`` under
    the current directory, resolves two raw terms to canonical slot values,
    renders the ``region_institution_search`` template and prints either the
    query or the validation errors.
    """
    # One root for both the template file and the slot sources.
    project_root = Path(".")

    # Load templates (UTF-8 explicitly; the file contains Dutch text and the
    # platform default encoding is not guaranteed to be UTF-8).
    with open(project_root / "data" / "sparql_templates.yaml", encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    collection = TemplateCollection.model_validate(raw)

    # Initialize components
    resolver = SlotResolver(project_root)
    instantiator = TemplateInstantiator()

    # Find matching template
    template = collection.templates["region_institution_search"]

    # Terms as they might be extracted from the user question
    # (In production, this would use the DSPy classifier)
    extracted_terms = {
        "institution_type_code": "archieven",
        "province_code": "Drenthe",
    }

    # Map the raw terms to canonical slot values ("A", "NL-DR") through the
    # synonyms declared on each slot.
    slots = {}
    for slot_name, term in extracted_terms.items():
        canonical = resolve_synonym(term, template.slots[slot_name], resolver)
        if canonical is None:
            print(f"Could not resolve slot '{slot_name}' from '{term}'")
            return
        slots[slot_name] = canonical

    # Instantiate template
    sparql = instantiator.instantiate(template, slots)

    # Validate
    is_valid, errors = validate_instantiated_sparql(sparql)

    if is_valid:
        print("Generated SPARQL:")
        print(sparql)
    else:
        print(f"Validation errors: {errors}")


if __name__ == "__main__":
    main()

Summary

The template schema provides:

Structured definitions - Pydantic models ensure type safety
Flexible slots - Support for enum, string, integer, float, URI and date values, with optional transformations
Synonym resolution - Map user terms to canonical values
External sources - Load enum values from existing validation files
Built-in examples - Templates can carry example question/query pairs that double as test cases
Validation - Pre-execution syntax checking

34 KiB Raw Blame History