1137 lines
34 KiB
Markdown
1137 lines
34 KiB
Markdown
# SPARQL Template Schema Definition
|
|
|
|
## Overview
|
|
|
|
This document defines the schema for SPARQL query templates used in the template-based query generation system. Templates are defined in YAML format and validated against this schema using Pydantic models.
|
|
|
|
## JSON Schema
|
|
|
|
```json
|
|
{
|
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
"$id": "https://bronhouder.nl/schemas/sparql-template/v1",
|
|
"title": "SPARQL Template Collection",
|
|
"description": "Collection of pre-defined SPARQL query templates for heritage institutions",
|
|
"type": "object",
|
|
"required": ["version", "templates"],
|
|
"properties": {
|
|
"version": {
|
|
"type": "string",
|
|
"description": "Schema version for migration support",
|
|
"pattern": "^\\d+\\.\\d+\\.\\d+$",
|
|
"examples": ["1.0.0"]
|
|
},
|
|
"templates": {
|
|
"type": "object",
|
|
"description": "Map of template ID to template definition",
|
|
"additionalProperties": {
|
|
"$ref": "#/$defs/Template"
|
|
}
|
|
},
|
|
"slot_sources": {
|
|
"type": "object",
|
|
"description": "External sources for slot enum values",
|
|
"additionalProperties": {
|
|
"$ref": "#/$defs/SlotSource"
|
|
}
|
|
}
|
|
},
|
|
"$defs": {
|
|
"Template": {
|
|
"type": "object",
|
|
"required": ["id", "question_patterns", "sparql_template", "slots"],
|
|
"properties": {
|
|
"id": {
|
|
"type": "string",
|
|
"description": "Unique template identifier",
|
|
"pattern": "^[a-z][a-z0-9_]*$"
|
|
},
|
|
"description": {
|
|
"type": "string",
|
|
"description": "Human-readable template description"
|
|
},
|
|
"intent": {
|
|
"type": "string",
|
|
"description": "Query intent this template handles",
|
|
"enum": ["geographic", "statistical", "relational", "temporal", "entity_lookup", "comparative", "exploration"]
|
|
},
|
|
"entity_type": {
|
|
"type": "string",
|
|
"description": "Entity type this template queries",
|
|
"enum": ["institution", "person", "both"]
|
|
},
|
|
"question_patterns": {
|
|
"type": "array",
|
|
"description": "Natural language patterns that match this template",
|
|
"items": {
|
|
"type": "string"
|
|
},
|
|
"minItems": 1
|
|
},
|
|
"sparql_template": {
|
|
"type": "string",
|
|
"description": "Jinja2 template for SPARQL query with {{slot_name}} placeholders"
|
|
},
|
|
"slots": {
|
|
"type": "object",
|
|
"description": "Slot definitions for template variables",
|
|
"additionalProperties": {
|
|
"$ref": "#/$defs/Slot"
|
|
}
|
|
},
|
|
"examples": {
|
|
"type": "array",
|
|
"description": "Example question -> instantiated query pairs for testing",
|
|
"items": {
|
|
"$ref": "#/$defs/Example"
|
|
}
|
|
},
|
|
"fallback_template": {
|
|
"type": "string",
|
|
"description": "Alternative template ID to use if this one fails"
|
|
},
|
|
"priority": {
|
|
"type": "integer",
|
|
"description": "Template priority for matching (higher = preferred)",
|
|
"default": 0
|
|
},
|
|
"tags": {
|
|
"type": "array",
|
|
"description": "Tags for template categorization",
|
|
"items": {
|
|
"type": "string"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"Slot": {
|
|
"type": "object",
|
|
"required": ["type"],
|
|
"properties": {
|
|
"type": {
|
|
"type": "string",
|
|
"description": "Slot value type",
|
|
"enum": ["enum", "string", "integer", "float", "uri", "date"]
|
|
},
|
|
"description": {
|
|
"type": "string",
|
|
"description": "Human-readable slot description"
|
|
},
|
|
"required": {
|
|
"type": "boolean",
|
|
"description": "Whether this slot must be filled",
|
|
"default": true
|
|
},
|
|
"default": {
|
|
"description": "Default value if slot not extracted"
|
|
},
|
|
"source": {
|
|
"type": "string",
|
|
"description": "Reference to external enum source (e.g., sparql_validation_rules.json#institution_types)"
|
|
},
|
|
"enum_values": {
|
|
"type": "array",
|
|
"description": "Inline enum values (if not using external source)",
|
|
"items": {
|
|
"type": "string"
|
|
}
|
|
},
|
|
"synonyms": {
|
|
"type": "object",
|
|
"description": "Map of user-facing terms to canonical values",
|
|
"additionalProperties": {
|
|
"type": "string"
|
|
}
|
|
},
|
|
"validation_pattern": {
|
|
"type": "string",
|
|
"description": "Regex pattern for validation"
|
|
},
|
|
"transform": {
|
|
"type": "string",
|
|
"description": "Transformation to apply (uppercase, lowercase, slugify, etc.)",
|
|
"enum": ["uppercase", "lowercase", "slugify", "uri_encode", "escape_sparql"]
|
|
}
|
|
}
|
|
},
|
|
"Example": {
|
|
"type": "object",
|
|
"required": ["question", "slots", "expected_sparql"],
|
|
"properties": {
|
|
"question": {
|
|
"type": "string",
|
|
"description": "Example user question"
|
|
},
|
|
"slots": {
|
|
"type": "object",
|
|
"description": "Expected extracted slot values"
|
|
},
|
|
"expected_sparql": {
|
|
"type": "string",
|
|
"description": "Expected instantiated SPARQL query"
|
|
}
|
|
}
|
|
},
|
|
"SlotSource": {
|
|
"type": "object",
|
|
"required": ["type", "path"],
|
|
"properties": {
|
|
"type": {
|
|
"type": "string",
|
|
"enum": ["json", "yaml", "csv"]
|
|
},
|
|
"path": {
|
|
"type": "string",
|
|
"description": "File path relative to project root"
|
|
},
|
|
"key_path": {
|
|
"type": "string",
|
|
"description": "JSON path to enum values (e.g., $.institution_types)"
|
|
},
|
|
"value_field": {
|
|
"type": "string",
|
|
"description": "Field name for enum value"
|
|
},
|
|
"label_field": {
|
|
"type": "string",
|
|
"description": "Field name for human-readable label"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
```
|
|
|
|
## Pydantic Models
|
|
|
|
```python
|
|
"""Pydantic models for SPARQL template validation."""
|
|
|
|
from enum import Enum
|
|
from typing import Any, Optional
|
|
from pydantic import BaseModel, Field, field_validator
|
|
|
|
|
|
class SlotType(str, Enum):
    """Data types a template slot value may have.

    The members also subclass ``str``, so each one compares equal to the
    plain string that appears in the template files (e.g. ``"enum"``).
    """

    # Closed set of allowed values.
    ENUM = "enum"
    # Scalar types.
    STRING = "string"
    INTEGER = "integer"
    FLOAT = "float"
    # Identifier and calendar types.
    URI = "uri"
    DATE = "date"
|
|
|
|
|
|
class SlotTransform(str, Enum):
    """Named transformations that may be applied to an extracted slot value.

    Members subclass ``str`` and therefore compare equal to the raw strings
    used in the template files.
    """

    # Case normalisation.
    UPPERCASE = "uppercase"
    LOWERCASE = "lowercase"
    # Encodings for embedding a value in identifiers, URIs or SPARQL strings.
    SLUGIFY = "slugify"
    URI_ENCODE = "uri_encode"
    ESCAPE_SPARQL = "escape_sparql"
|
|
|
|
|
|
class QueryIntent(str, Enum):
    """Intent categories a template can be registered for.

    Members subclass ``str`` and compare equal to the raw strings used in
    the template files.
    """

    GEOGRAPHIC = "geographic"
    STATISTICAL = "statistical"
    RELATIONAL = "relational"
    TEMPORAL = "temporal"
    ENTITY_LOOKUP = "entity_lookup"
    COMPARATIVE = "comparative"
    EXPLORATION = "exploration"
|
|
|
|
|
|
class EntityType(str, Enum):
    """Kinds of entity a template's query can target.

    Members subclass ``str`` and compare equal to the raw strings used in
    the template files.
    """

    INSTITUTION = "institution"
    PERSON = "person"
    # Template applies to institutions and persons alike.
    BOTH = "both"
|
|
|
|
|
|
class SlotDefinition(BaseModel):
    """Schema for one slot, i.e. a named variable of a SPARQL template.

    Only ``type`` is mandatory; every other attribute has a default.
    """

    # --- what kind of value the slot holds --------------------------------
    type: SlotType = Field(description="Data type of the slot value")
    description: str | None = Field(
        description="Human-readable description of the slot",
        default=None,
    )

    # --- presence / fallback ----------------------------------------------
    required: bool = Field(
        description="Whether this slot must be filled for query execution",
        default=True,
    )
    default: Any | None = Field(
        description="Default value if slot is not extracted from question",
        default=None,
    )

    # --- allowed values and how user wording maps onto them ---------------
    source: str | None = Field(
        description="External source reference (e.g., 'sparql_validation_rules.json#institution_types')",
        default=None,
    )
    enum_values: list[str] | None = Field(
        description="Inline enum values if not using external source",
        default=None,
    )
    synonyms: dict[str, str] | None = Field(
        description="Map of user-facing terms to canonical values",
        default=None,
    )

    # --- checking and post-processing of the extracted value --------------
    validation_pattern: str | None = Field(
        description="Regex pattern for value validation",
        default=None,
    )
    transform: SlotTransform | None = Field(
        description="Transformation to apply to extracted value",
        default=None,
    )
|
|
|
|
|
|
class TemplateExample(BaseModel):
    """A worked example attached to a template for testing.

    Bundles a sample question with the slot values expected to be extracted
    from it and the query the template should produce. All three fields are
    mandatory.
    """

    # Sample question text.
    question: str = Field(description="Example user question")
    # Slot name -> expected value.
    slots: dict[str, Any] = Field(description="Expected extracted slot values")
    # Query text expected after filling the template.
    expected_sparql: str = Field(description="Expected instantiated SPARQL query")
|
|
|
|
|
|
class SPARQLTemplate(BaseModel):
    """A pre-defined SPARQL query template.

    Mandatory fields are ``id``, ``question_patterns`` (at least one entry),
    ``sparql_template`` and ``slots``; everything else has a default.
    """

    # Identifier: must start with a lowercase letter, then [a-z0-9_].
    id: str = Field(
        pattern=r"^[a-z][a-z0-9_]*$",
        description="Unique template identifier (snake_case)"
    )
    description: str | None = Field(
        default=None,
        description="Human-readable template description"
    )
    intent: QueryIntent | None = Field(
        default=None,
        description="Query intent this template handles"
    )
    entity_type: EntityType = Field(
        default=EntityType.INSTITUTION,
        description="Entity type this template queries"
    )
    # min_length=1 rejects an empty pattern list.
    question_patterns: list[str] = Field(
        min_length=1,
        description="Natural language patterns that match this template"
    )
    sparql_template: str = Field(
        description="Jinja2 template with {{slot_name}} placeholders"
    )
    # Slot name -> definition; may be an empty dict for static queries.
    slots: dict[str, SlotDefinition] = Field(
        description="Slot definitions for template variables"
    )
    examples: list[TemplateExample] | None = Field(
        default=None,
        description="Example question -> query pairs for testing"
    )
    fallback_template: str | None = Field(
        default=None,
        description="Alternative template ID if this one fails"
    )
    priority: int = Field(
        default=0,
        description="Matching priority (higher = preferred)"
    )
    tags: list[str] | None = Field(
        default=None,
        description="Tags for categorization"
    )

    @field_validator("sparql_template")
    @classmethod
    def validate_sparql_has_placeholders(cls, v: str, info) -> str:
        """Accept the template text unchanged.

        This hook intentionally performs no check: templates without any
        ``{{ ... }}`` or ``{% ... %}`` markup are allowed (static queries),
        and slot-related validation is left to instantiation time. It does
        NOT verify that every declared slot has a placeholder.

        Args:
            v: The raw ``sparql_template`` string.
            info: Validation context supplied by Pydantic (unused).

        Returns:
            ``v`` as given.
        """
        return v
|
|
|
|
|
|
class SlotSource(BaseModel):
    """Description of an external file that supplies slot enum values.

    ``type`` and ``path`` are mandatory; the remaining fields default to
    ``None``.
    """

    # Restricted by the pattern to exactly "json", "yaml" or "csv".
    type: str = Field(
        description="Source file format",
        pattern=r"^(json|yaml|csv)$",
    )
    path: str = Field(description="File path relative to project root")

    # Optional hints for locating values inside the file.
    key_path: str | None = Field(
        description="JSON path to enum values",
        default=None,
    )
    value_field: str | None = Field(
        description="Field name for enum value",
        default=None,
    )
    label_field: str | None = Field(
        description="Field name for human-readable label",
        default=None,
    )
|
|
|
|
|
|
class TemplateCollection(BaseModel):
    """Top-level document: a versioned set of SPARQL templates."""

    # Three dot-separated numeric components, e.g. "1.0.0".
    version: str = Field(
        description="Schema version",
        pattern=r"^\d+\.\d+\.\d+$",
    )
    templates: dict[str, SPARQLTemplate] = Field(
        description="Map of template ID to template definition",
    )
    slot_sources: dict[str, SlotSource] | None = Field(
        description="External sources for slot enum values",
        default=None,
    )

    @field_validator("templates")
    @classmethod
    def validate_template_ids_match_keys(cls, v: dict) -> dict:
        """Reject the mapping if any key differs from its template's ``id``.

        Raises:
            ValueError: For the first entry (in mapping order) whose key and
                ``id`` disagree.
        """
        # Find the first disagreeing pair, if any, without an explicit loop.
        mismatch = next(
            ((key, entry.id) for key, entry in v.items() if entry.id != key),
            None,
        )
        if mismatch is not None:
            key, template_id = mismatch
            raise ValueError(
                f"Template key '{key}' doesn't match template id '{template_id}'"
            )
        return v
|
|
```
|
|
|
|
## YAML Template Format
|
|
|
|
### Complete Example
|
|
|
|
```yaml
|
|
# data/sparql_templates.yaml
|
|
version: "1.0.0"
|
|
|
|
slot_sources:
|
|
institution_types:
|
|
type: json
|
|
path: data/validation/sparql_validation_rules.json
|
|
key_path: "$.institution_types"
|
|
provinces:
|
|
type: json
|
|
path: data/validation/sparql_validation_rules.json
|
|
key_path: "$.subregions.NL"
|
|
|
|
templates:
|
|
# =============================================================================
|
|
# GEOGRAPHIC QUERIES
|
|
# =============================================================================
|
|
|
|
region_institution_search:
|
|
id: region_institution_search
|
|
description: "Find institutions of a specific type in a Dutch province"
|
|
intent: geographic
|
|
entity_type: institution
|
|
question_patterns:
|
|
- "Welke {institution_type_nl} zijn er in {province}?"
|
|
- "{institution_type_nl} in {province}"
|
|
- "Geef me alle {institution_type_nl} in {province}"
|
|
- "Which {institution_type_en} are in {province}?"
|
|
sparql_template: |
|
|
PREFIX hc: <https://nde.nl/ontology/hc/>
|
|
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
|
|
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
|
|
PREFIX schema: <http://schema.org/>
|
|
|
|
SELECT ?institution ?name ?city WHERE {
|
|
?institution a crm:E39_Actor ;
|
|
hc:institutionType "{{institution_type_code}}" ;
|
|
skos:prefLabel ?name .
|
|
OPTIONAL { ?institution schema:addressLocality ?city }
|
|
FILTER(CONTAINS(STR(?institution), "{{province_code}}"))
|
|
}
|
|
ORDER BY ?name
|
|
LIMIT 100
|
|
slots:
|
|
institution_type_code:
|
|
type: enum
|
|
description: "Single-letter GLAMORCUBESFIXPHDNT code"
|
|
required: true
|
|
source: sparql_validation_rules.json#institution_types
|
|
synonyms:
|
|
archieven: "A"
|
|
archief: "A"
|
|
archives: "A"
|
|
musea: "M"
|
|
museum: "M"
|
|
museums: "M"
|
|
bibliotheken: "L"
|
|
bibliotheek: "L"
|
|
libraries: "L"
|
|
library: "L"
|
|
galerijen: "G"
|
|
galerie: "G"
|
|
galleries: "G"
|
|
province_code:
|
|
type: enum
|
|
description: "Dutch province ISO code (e.g., NL-DR)"
|
|
required: true
|
|
source: sparql_validation_rules.json#subregions
|
|
synonyms:
|
|
drenthe: "NL-DR"
|
|
drente: "NL-DR"
|
|
friesland: "NL-FR"
|
|
fryslan: "NL-FR"
|
|
groningen: "NL-GR"
|
|
gelderland: "NL-GE"
|
|
limburg: "NL-LI"
|
|
noord-brabant: "NL-NB"
|
|
brabant: "NL-NB"
|
|
noord-holland: "NL-NH"
|
|
overijssel: "NL-OV"
|
|
utrecht: "NL-UT"
|
|
zeeland: "NL-ZE"
|
|
zuid-holland: "NL-ZH"
|
|
flevoland: "NL-FL"
|
|
examples:
|
|
- question: "Welke archieven zijn er in Drenthe?"
|
|
slots:
|
|
institution_type_code: "A"
|
|
province_code: "NL-DR"
|
|
expected_sparql: |
|
|
PREFIX hc: <https://nde.nl/ontology/hc/>
|
|
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
|
|
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
|
|
PREFIX schema: <http://schema.org/>
|
|
|
|
SELECT ?institution ?name ?city WHERE {
|
|
?institution a crm:E39_Actor ;
|
|
hc:institutionType "A" ;
|
|
skos:prefLabel ?name .
|
|
OPTIONAL { ?institution schema:addressLocality ?city }
|
|
FILTER(CONTAINS(STR(?institution), "NL-DR"))
|
|
}
|
|
ORDER BY ?name
|
|
LIMIT 100
|
|
priority: 10
|
|
tags:
|
|
- geographic
|
|
- institutions
|
|
- dutch
|
|
|
|
city_institution_search:
|
|
id: city_institution_search
|
|
description: "Find institutions of a specific type in a city"
|
|
intent: geographic
|
|
entity_type: institution
|
|
question_patterns:
|
|
- "Welke {institution_type_nl} zijn er in {city}?"
|
|
- "{institution_type_nl} in {city}"
|
|
sparql_template: |
|
|
PREFIX hc: <https://nde.nl/ontology/hc/>
|
|
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
|
|
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
|
|
PREFIX schema: <http://schema.org/>
|
|
|
|
SELECT ?institution ?name WHERE {
|
|
?institution a crm:E39_Actor ;
|
|
hc:institutionType "{{institution_type_code}}" ;
|
|
skos:prefLabel ?name ;
|
|
schema:addressLocality ?city .
|
|
FILTER(CONTAINS(LCASE(?city), "{{city_lower}}"))
|
|
}
|
|
ORDER BY ?name
|
|
LIMIT 50
|
|
slots:
|
|
institution_type_code:
|
|
type: enum
|
|
required: true
|
|
source: sparql_validation_rules.json#institution_types
|
|
synonyms:
|
|
archieven: "A"
|
|
musea: "M"
|
|
bibliotheken: "L"
|
|
city_lower:
|
|
type: string
|
|
required: true
|
|
transform: lowercase
|
|
description: "City name (case-insensitive)"
|
|
priority: 8
|
|
tags:
|
|
- geographic
|
|
- city
|
|
|
|
# =============================================================================
|
|
# STATISTICAL QUERIES
|
|
# =============================================================================
|
|
|
|
count_institutions_by_type:
|
|
id: count_institutions_by_type
|
|
description: "Count institutions by type in a country or region"
|
|
intent: statistical
|
|
entity_type: institution
|
|
question_patterns:
|
|
- "Hoeveel {institution_type_nl} zijn er in Nederland?"
|
|
- "Hoeveel {institution_type_nl} heeft Nederland?"
|
|
- "How many {institution_type_en} are there in the Netherlands?"
|
|
- "Count of {institution_type_en}"
|
|
sparql_template: |
|
|
PREFIX hc: <https://nde.nl/ontology/hc/>
|
|
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
|
|
PREFIX schema: <http://schema.org/>
|
|
|
|
SELECT (COUNT(DISTINCT ?institution) as ?count) WHERE {
|
|
?institution a crm:E39_Actor ;
|
|
hc:institutionType "{{institution_type_code}}" ;
|
|
schema:addressCountry <http://www.wikidata.org/entity/Q55> .
|
|
}
|
|
slots:
|
|
institution_type_code:
|
|
type: enum
|
|
required: true
|
|
source: sparql_validation_rules.json#institution_types
|
|
synonyms:
|
|
archieven: "A"
|
|
musea: "M"
|
|
bibliotheken: "L"
|
|
examples:
|
|
- question: "Hoeveel musea zijn er in Nederland?"
|
|
slots:
|
|
institution_type_code: "M"
|
|
expected_sparql: |
|
|
PREFIX hc: <https://nde.nl/ontology/hc/>
|
|
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
|
|
PREFIX schema: <http://schema.org/>
|
|
|
|
SELECT (COUNT(DISTINCT ?institution) as ?count) WHERE {
|
|
?institution a crm:E39_Actor ;
|
|
hc:institutionType "M" ;
|
|
schema:addressCountry <http://www.wikidata.org/entity/Q55> .
|
|
}
|
|
priority: 9
|
|
tags:
|
|
- statistical
|
|
- count
|
|
|
|
distribution_by_type:
|
|
id: distribution_by_type
|
|
description: "Distribution of institutions by type"
|
|
intent: statistical
|
|
entity_type: institution
|
|
question_patterns:
|
|
- "Verdeling van instellingen per type"
|
|
- "Hoeveel instellingen per type?"
|
|
- "Distribution of institutions by type"
|
|
sparql_template: |
|
|
PREFIX hc: <https://nde.nl/ontology/hc/>
|
|
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
|
|
|
|
SELECT ?type (COUNT(?institution) AS ?count) WHERE {
|
|
?institution a crm:E39_Actor ;
|
|
hc:institutionType ?type .
|
|
}
|
|
GROUP BY ?type
|
|
ORDER BY DESC(?count)
|
|
slots: {}
|
|
priority: 7
|
|
tags:
|
|
- statistical
|
|
- distribution
|
|
|
|
# =============================================================================
|
|
# ENTITY LOOKUP QUERIES
|
|
# =============================================================================
|
|
|
|
find_institution_by_name:
|
|
id: find_institution_by_name
|
|
description: "Find a specific institution by name"
|
|
intent: entity_lookup
|
|
entity_type: institution
|
|
question_patterns:
|
|
- "Informatie over {institution_name}"
|
|
- "Wat weet je over {institution_name}?"
|
|
- "Tell me about {institution_name}"
|
|
- "{institution_name}"
|
|
sparql_template: |
|
|
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
|
|
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
|
|
PREFIX schema: <http://schema.org/>
|
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
|
PREFIX hc: <https://nde.nl/ontology/hc/>
|
|
|
|
SELECT ?institution ?name ?type ?city ?website ?description WHERE {
|
|
?institution a crm:E39_Actor ;
|
|
skos:prefLabel ?name .
|
|
OPTIONAL { ?institution hc:institutionType ?type }
|
|
OPTIONAL { ?institution schema:addressLocality ?city }
|
|
OPTIONAL { ?institution foaf:homepage ?website }
|
|
OPTIONAL { ?institution schema:description ?description }
|
|
FILTER(CONTAINS(LCASE(STR(?name)), "{{name_lower}}"))
|
|
}
|
|
LIMIT 10
|
|
slots:
|
|
name_lower:
|
|
type: string
|
|
required: true
|
|
transform: lowercase
|
|
description: "Institution name to search for (case-insensitive)"
|
|
priority: 6
|
|
tags:
|
|
- entity_lookup
|
|
- name_search
|
|
|
|
# =============================================================================
|
|
# PERSON QUERIES
|
|
# =============================================================================
|
|
|
|
staff_at_institution:
|
|
id: staff_at_institution
|
|
description: "Find staff/employees at a specific institution"
|
|
intent: entity_lookup
|
|
entity_type: person
|
|
question_patterns:
|
|
- "Wie werkt er bij {institution_name}?"
|
|
- "Medewerkers van {institution_name}"
|
|
- "Staff at {institution_name}"
|
|
- "Who works at {institution_name}?"
|
|
sparql_template: |
|
|
PREFIX schema: <http://schema.org/>
|
|
|
|
SELECT DISTINCT ?name ?jobTitle WHERE {
|
|
?person a schema:Person ;
|
|
schema:name ?name ;
|
|
schema:jobTitle ?jobTitle .
|
|
FILTER(CONTAINS(LCASE(?jobTitle), "{{institution_slug}}"))
|
|
FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
|
|
}
|
|
ORDER BY ?name
|
|
LIMIT 50
|
|
slots:
|
|
institution_slug:
|
|
type: string
|
|
required: true
|
|
transform: lowercase
|
|
description: "Institution name as it appears in job titles"
|
|
synonyms:
|
|
rijksmuseum: "rijksmuseum"
|
|
nationaal archief: "nationaal archief"
|
|
het utrechts archief: "utrechts archief"
|
|
koninklijke bibliotheek: "kb"
|
|
kb: "kb"
|
|
examples:
|
|
- question: "Wie werkt er bij het Nationaal Archief?"
|
|
slots:
|
|
institution_slug: "nationaal archief"
|
|
expected_sparql: |
|
|
PREFIX schema: <http://schema.org/>
|
|
|
|
SELECT DISTINCT ?name ?jobTitle WHERE {
|
|
?person a schema:Person ;
|
|
schema:name ?name ;
|
|
schema:jobTitle ?jobTitle .
|
|
FILTER(CONTAINS(LCASE(?jobTitle), "nationaal archief"))
|
|
FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
|
|
}
|
|
ORDER BY ?name
|
|
LIMIT 50
|
|
priority: 8
|
|
tags:
|
|
- person
|
|
- staff
|
|
|
|
find_role_at_institution:
|
|
id: find_role_at_institution
|
|
description: "Find people with a specific role at an institution"
|
|
intent: entity_lookup
|
|
entity_type: person
|
|
question_patterns:
|
|
- "Wie is de {role} bij {institution_name}?"
|
|
- "{role} van {institution_name}"
|
|
- "Who is the {role} at {institution_name}?"
|
|
sparql_template: |
|
|
PREFIX schema: <http://schema.org/>
|
|
|
|
SELECT DISTINCT ?name ?jobTitle WHERE {
|
|
?person a schema:Person ;
|
|
schema:name ?name ;
|
|
schema:jobTitle ?jobTitle .
|
|
FILTER(CONTAINS(LCASE(?jobTitle), "{{institution_slug}}"))
|
|
FILTER(
|
|
CONTAINS(LCASE(?jobTitle), "{{role_term}}") ||
|
|
CONTAINS(LCASE(?jobTitle), "{{role_term_alt}}")
|
|
)
|
|
FILTER(!CONTAINS(LCASE(?name), "linkedin member"))
|
|
}
|
|
ORDER BY ?name
|
|
LIMIT 20
|
|
slots:
|
|
institution_slug:
|
|
type: string
|
|
required: true
|
|
transform: lowercase
|
|
role_term:
|
|
type: string
|
|
required: true
|
|
transform: lowercase
|
|
synonyms:
|
|
directeur: "directeur"
|
|
director: "directeur"
|
|
archivaris: "archivaris"
|
|
archivist: "archivaris"
|
|
conservator: "conservator"
|
|
curator: "conservator"
|
|
bibliothecaris: "bibliothecaris"
|
|
librarian: "bibliothecaris"
|
|
role_term_alt:
|
|
type: string
|
|
required: false
|
|
default: ""
|
|
transform: lowercase
|
|
priority: 9
|
|
tags:
|
|
- person
|
|
- role
|
|
```
|
|
|
|
## Slot Resolution
|
|
|
|
### Resolving External Sources
|
|
|
|
```python
|
|
"""Resolve slot values from external sources."""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
class SlotResolver:
    """Load slot value sets from external JSON/YAML files and cache them.

    Source references have the form ``'file.json#key.path'``: a file path
    (relative to ``project_root``) optionally followed by ``#`` and a
    dot-separated chain of keys to descend into.
    """

    def __init__(self, project_root: Path):
        """Remember the base directory and start with an empty cache.

        Args:
            project_root: Directory against which file paths are resolved.
        """
        self.project_root = project_root
        # source_ref -> resolved data; filled lazily by resolve_source().
        self._cache: dict[str, Any] = {}

    def resolve_source(self, source_ref: str) -> dict[str, Any]:
        """Resolve a source reference like 'file.json#key.path'.

        Args:
            source_ref: Reference in format 'file.json#key.path'. The
                '#key.path' part is optional.

        Returns:
            The data found at the referenced location. This is typically a
            dict, but whatever the file holds at that location is returned
            as-is (``get_valid_values`` also handles a list).

        Raises:
            ValueError: If the file extension is not .json, .yaml or .yml.
            KeyError: If a key of the key path is missing; the message names
                the key and the full source reference.
        """
        if source_ref in self._cache:
            return self._cache[source_ref]

        # Everything after the first '#' is the key path (may be empty).
        file_path, _, key_path = source_ref.partition("#")

        full_path = self.project_root / file_path
        # Compare the extension case-insensitively so 'RULES.JSON' works too.
        lowered_name = file_path.lower()
        if lowered_name.endswith(".json"):
            # Explicit encoding: do not depend on the platform default.
            with open(full_path, encoding="utf-8") as f:
                data = json.load(f)
        elif lowered_name.endswith((".yaml", ".yml")):
            import yaml

            with open(full_path, encoding="utf-8") as f:
                data = yaml.safe_load(f)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        if key_path:
            for key in key_path.split("."):
                try:
                    data = data[key]
                except KeyError as exc:
                    # Same exception type as before, but say what failed.
                    raise KeyError(
                        f"Key '{key}' not found while resolving '{source_ref}'"
                    ) from exc

        self._cache[source_ref] = data
        return data

    def get_valid_values(self, slot: "SlotDefinition") -> set[str]:
        """Get all valid values for a slot.

        Combines the slot's inline ``enum_values`` with the values from its
        external ``source``: the keys when the source data is a dict, the
        items when it is a list. Other source data types contribute nothing.

        Args:
            slot: Slot definition providing ``enum_values`` and ``source``.

        Returns:
            Set of allowed values (empty if the slot declares neither).
        """
        values = set()

        if slot.enum_values:
            values.update(slot.enum_values)

        if slot.source:
            source_data = self.resolve_source(slot.source)
            if isinstance(source_data, dict):
                values.update(source_data.keys())
            elif isinstance(source_data, list):
                values.update(source_data)

        return values
|
|
```
|
|
|
|
### Synonym Resolution
|
|
|
|
```python
|
|
"""Resolve user-facing terms to canonical slot values."""
|
|
|
|
from rapidfuzz import fuzz, process
|
|
|
|
|
|
def resolve_synonym(
    user_input: str,
    slot: SlotDefinition,
    resolver: SlotResolver,
    fuzzy_threshold: float = 70.0,
) -> str | None:
    """Resolve user input to a canonical slot value.

    The input is lower-cased and stripped, then matched in this order:

    1. Exact match in synonyms
    2. Fuzzy match in synonyms
    3. Exact (case-insensitive) match in valid values
    4. Fuzzy match in valid values

    Args:
        user_input: User-provided value
        slot: Slot definition with synonyms and source
        resolver: SlotResolver for external sources
        fuzzy_threshold: Minimum fuzzy match score (0-100)

    Returns:
        Canonical value or None if no match (always None for blank input)
    """
    needle = user_input.lower().strip()
    if not needle:
        # Nothing to match; do not fuzzy-score an empty string.
        return None

    if slot.synonyms:
        # Normalise the keys the same way as the input, so a synonym written
        # with capitals or stray spaces in the template file is still found.
        synonyms = {
            term.lower().strip(): canonical
            for term, canonical in slot.synonyms.items()
        }

        # 1. Exact match in synonyms
        if needle in synonyms:
            return synonyms[needle]

        # 2. Fuzzy match in synonyms
        hit = process.extractOne(
            needle,
            list(synonyms),
            scorer=fuzz.WRatio,
            score_cutoff=fuzzy_threshold,
        )
        if hit:
            return synonyms[hit[0]]

    # Sort the value set so that ties are always resolved the same way,
    # independent of set iteration order.
    candidates = sorted(resolver.get_valid_values(slot))

    # 3. Exact match in valid values, returning the value in its own casing
    for candidate in candidates:
        if candidate.lower() == needle:
            return candidate

    # 4. Fuzzy match in valid values
    # NOTE(review): candidates keep their original casing while the input is
    # lower-cased; whether the scorer ignores case depends on the installed
    # rapidfuzz version's default preprocessing - confirm before relying on
    # fuzzy hits for upper-case codes.
    hit = process.extractOne(
        needle,
        candidates,
        scorer=fuzz.WRatio,
        score_cutoff=fuzzy_threshold,
    )
    return hit[0] if hit else None
|
|
```
|
|
|
|
## Template Instantiation
|
|
|
|
```python
|
|
"""Instantiate SPARQL templates with slot values."""
|
|
|
|
from jinja2 import Environment, BaseLoader
|
|
|
|
|
|
class TemplateInstantiator:
|
|
"""Instantiate SPARQL templates with extracted slot values."""
|
|
|
|
def __init__(self):
|
|
self.env = Environment(loader=BaseLoader())
|
|
# Use {{ }} for Jinja2 (default)
|
|
# This matches the template format in YAML
|
|
|
|
def instantiate(
|
|
self,
|
|
template: SPARQLTemplate,
|
|
slots: dict[str, Any],
|
|
) -> str:
|
|
"""Fill template with slot values.
|
|
|
|
Args:
|
|
template: Template definition
|
|
slots: Extracted slot values
|
|
|
|
Returns:
|
|
Instantiated SPARQL query string
|
|
|
|
Raises:
|
|
ValueError: If required slot is missing
|
|
"""
|
|
# Validate required slots
|
|
for slot_name, slot_def in template.slots.items():
|
|
if slot_def.required and slot_name not in slots:
|
|
if slot_def.default is not None:
|
|
slots[slot_name] = slot_def.default
|
|
else:
|
|
raise ValueError(f"Required slot '{slot_name}' not provided")
|
|
|
|
# Apply transformations
|
|
transformed_slots = {}
|
|
for slot_name, value in slots.items():
|
|
if slot_name in template.slots:
|
|
slot_def = template.slots[slot_name]
|
|
transformed_slots[slot_name] = self._transform(value, slot_def)
|
|
else:
|
|
transformed_slots[slot_name] = value
|
|
|
|
# Render template
|
|
jinja_template = self.env.from_string(template.sparql_template)
|
|
return jinja_template.render(**transformed_slots)
|
|
|
|
def _transform(self, value: Any, slot_def: SlotDefinition) -> Any:
|
|
"""Apply transformation to slot value."""
|
|
if slot_def.transform is None:
|
|
return value
|
|
|
|
if not isinstance(value, str):
|
|
return value
|
|
|
|
match slot_def.transform:
|
|
case SlotTransform.UPPERCASE:
|
|
return value.upper()
|
|
case SlotTransform.LOWERCASE:
|
|
return value.lower()
|
|
case SlotTransform.SLUGIFY:
|
|
import re
|
|
slug = value.lower()
|
|
slug = re.sub(r"[^a-z0-9\s-]", "", slug)
|
|
slug = re.sub(r"[\s_]+", "-", slug)
|
|
return slug.strip("-")
|
|
case SlotTransform.URI_ENCODE:
|
|
from urllib.parse import quote
|
|
return quote(value, safe="")
|
|
case SlotTransform.ESCAPE_SPARQL:
|
|
# Escape special characters for SPARQL strings
|
|
return value.replace("\\", "\\\\").replace('"', '\\"')
|
|
case _:
|
|
return value
|
|
```
|
|
|
|
## Validation
|
|
|
|
```python
|
|
"""Validate instantiated SPARQL queries."""
|
|
|
|
import re
|
|
|
|
|
|
def validate_instantiated_sparql(sparql: str) -> tuple[bool, list[str]]:
    """Run lightweight sanity checks on an instantiated SPARQL query.

    This is a heuristic pre-execution check, not a SPARQL parser. Leftover
    ``{{...}}`` placeholders are searched in the full text; all structural
    checks (orphaned dot, query form, WHERE clause, braces) ignore the
    content of quoted string literals so that data such as a name containing
    ``{`` is not mistaken for syntax.

    Args:
        sparql: Instantiated SPARQL query

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    errors: list[str] = []

    # Check for unresolved placeholders (also inside string literals, which
    # is where the templates put them).
    unresolved = re.findall(r"\{\{[^}]+\}\}", sparql)
    if unresolved:
        errors.append(f"Unresolved placeholders: {unresolved}")

    # Replace every string literal by an empty one. Triple-quoted forms are
    # tried first; single-line forms may not span a line break.
    literal = re.compile(
        r'"""[\s\S]*?"""'
        r"|'''[\s\S]*?'''"
        r'|"(?:[^"\\\n]|\\.)*"'
        r"|'(?:[^'\\\n]|\\.)*'"
    )
    structure = literal.sub('""', sparql)

    # Orphaned dots (common LLM error)
    if re.search(r"^\s*\.\s*$", structure, re.MULTILINE):
        errors.append("Orphaned dot on empty line")

    # Missing SELECT/ASK/CONSTRUCT
    if not re.search(r"\b(SELECT|ASK|CONSTRUCT|DESCRIBE)\b", structure, re.IGNORECASE):
        errors.append("Missing query form (SELECT/ASK/CONSTRUCT/DESCRIBE)")

    # Missing WHERE clause for SELECT
    if re.search(r"\bSELECT\b", structure, re.IGNORECASE):
        if not re.search(r"\bWHERE\s*\{", structure, re.IGNORECASE):
            errors.append("SELECT query missing WHERE clause")

    # Unbalanced braces: compare totals, and also make sure no '}' shows up
    # before the '{' it would have to close (equal totals alone miss that).
    open_braces = structure.count("{")
    close_braces = structure.count("}")
    if open_braces != close_braces:
        errors.append(f"Unbalanced braces: {open_braces} open, {close_braces} close")
    else:
        depth = 0
        for char in structure:
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth < 0:
                    errors.append("Unbalanced braces: '}' before matching '{'")
                    break

    return len(errors) == 0, errors
|
|
```
|
|
|
|
## Usage Example
|
|
|
|
```python
|
|
"""Example usage of template-based SPARQL generation."""
|
|
|
|
from pathlib import Path
|
|
import yaml
|
|
|
|
|
|
def main():
    """Demonstrate the pipeline: load, instantiate, validate, print.

    Loads the template collection from ``data/sparql_templates.yaml``, fills
    the ``region_institution_search`` template with hard-coded slot values,
    validates the resulting query, and prints either the query or the
    validation errors.
    """
    # Load templates. The encoding is explicit so reading the file does not
    # depend on the platform's default encoding.
    with open("data/sparql_templates.yaml", encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    collection = TemplateCollection.model_validate(raw)

    # Initialize components. The resolver is constructed to show the wiring;
    # it is not used below because the slot values are hard-coded.
    resolver = SlotResolver(Path("."))
    instantiator = TemplateInstantiator()

    # Find matching template
    template = collection.templates["region_institution_search"]

    # Extract slots from user question
    # (In production, this would use the DSPy classifier)
    slots = {
        "institution_type_code": "A",
        "province_code": "NL-DR",
    }

    # Instantiate template
    sparql = instantiator.instantiate(template, slots)

    # Validate
    is_valid, errors = validate_instantiated_sparql(sparql)

    if is_valid:
        print("Generated SPARQL:")
        print(sparql)
    else:
        print(f"Validation errors: {errors}")


if __name__ == "__main__":
    main()
|
|
```
|
|
|
|
## Summary
|
|
|
|
The template schema provides:
|
|
|
|
1. **Structured definitions** - Pydantic models ensure type safety
|
|
2. **Flexible slots** - Support for enum, string, integer, float, URI, and date slots, with optional value transformations
|
|
3. **Synonym resolution** - Map user terms to canonical values
|
|
4. **External sources** - Load enum values from existing validation files
|
|
5. **Built-in examples** - Each template includes test cases
|
|
6. **Validation** - Pre-execution syntax checking
|