glam/src/glam_extractor/api/sparql_lsp.py
2026-01-02 02:11:04 +01:00

1462 lines
54 KiB
Python

"""
SPARQL Language Server Protocol (SPARQL-LSP)
A JSON-RPC based protocol for providing language intelligence for SPARQL queries
against the Heritage Custodian ontology. Designed like the Language Server Protocol
to be reusable across different AI agents, IDEs, and tools.
Architecture:

    ┌─────────────────┐     JSON-RPC       ┌─────────────────────┐
    │   AI Agent      │◄──────────────────►│   SPARQL-LSP        │
    │   (Client)      │                    │   Server            │
    └─────────────────┘                    └─────────────────────┘
    ┌─────────────────┐     JSON-RPC       ┌────────▼────────────┐
    │   IDE/Editor    │◄──────────────────►│  Knowledge Sources  │
    │   (Client)      │                    │  - SHACL Shapes     │
    └─────────────────┘                    │  - LinkML Schema    │
                                           │  - TypeDB Rules     │
    ┌─────────────────┐     JSON-RPC       │  - SPARQL Endpoint  │
    │   Web UI        │◄──────────────────►└─────────────────────┘
    │   (Client)      │
    └─────────────────┘

LSP Methods Implemented:
- initialize: Server capabilities handshake
- textDocument/publishDiagnostics: SHACL-based validation errors
- textDocument/completion: Prefix, class, property completion
- textDocument/hover: Documentation on hover
- textDocument/signatureHelp: Function signatures
- sparql/execute: Execute query and return results
- sparql/explain: Explain what a query does
- sparql/suggest: Suggest novel connections from vector DB
Author: Heritage Custodian Ontology Project
Date: 2025-12-27
Protocol Version: 1.0.0
"""
import json
import logging
import re
import uuid
from dataclasses import dataclass, field, asdict
from enum import Enum, IntEnum
from typing import Any, Dict, List, Optional, Union, Callable, Sequence
from abc import ABC, abstractmethod
logger = logging.getLogger(__name__)
# =============================================================================
# JSON-RPC Protocol Types (LSP Standard)
# =============================================================================
class ErrorCode(IntEnum):
    """Integer error codes placed in the ``error.code`` field of responses."""

    # --- JSON-RPC errors ---
    ParseError = -32700
    InvalidRequest = -32600
    MethodNotFound = -32601
    InvalidParams = -32602
    InternalError = -32603

    # --- LSP errors ---
    ServerNotInitialized = -32002
    UnknownErrorCode = -32001
    RequestCancelled = -32800
    ContentModified = -32801
@dataclass
class Position:
    """A zero-indexed (line, character) coordinate inside a text document."""

    line: int  # line number, counted from 0
    character: int  # offset within the line, counted from 0
@dataclass
class Range:
    """A span of a text document delimited by two positions."""

    start: Position  # where the span begins
    end: Position  # where the span ends
@dataclass
class Location:
    """A range within the document identified by ``uri``."""

    uri: str  # document identifier
    range: Range  # span inside that document
@dataclass
class TextDocumentIdentifier:
    """Reference to a text document by its URI."""

    uri: str  # document identifier
@dataclass
class TextDocumentItem:
    """A text document together with its content."""

    uri: str  # document identifier
    languageId: str  # language tag of the document
    version: int  # version number of the document
    text: str  # full document content
class DiagnosticSeverity(IntEnum):
    """Severity attached to a diagnostic; 1 is the most severe level."""

    Error = 1
    Warning = 2
    Information = 3
    Hint = 4
@dataclass
class Diagnostic:
    """Represents a diagnostic (error, warning, etc.) attached to a range."""

    range: Range  # span the diagnostic applies to
    message: str  # human-readable description
    severity: DiagnosticSeverity = DiagnosticSeverity.Error
    code: Optional[str] = None  # machine-readable identifier of the rule
    source: str = "sparql-lsp"  # producer of the diagnostic
    relatedInformation: Optional[List[Dict]] = None  # extra context entries

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize the diagnostic to a plain dict.

        Returns:
            Dict with ``range``, ``message``, ``severity``, ``code`` and
            ``source``; ``relatedInformation`` is added only when it is set,
            so the output for diagnostics without it is unchanged.
        """
        result: Dict[str, Any] = {
            "range": {
                "start": {"line": self.range.start.line, "character": self.range.start.character},
                "end": {"line": self.range.end.line, "character": self.range.end.character},
            },
            "message": self.message,
            "severity": self.severity,
            "code": self.code,
            "source": self.source,
        }
        # Previously this field was accepted by the constructor but silently
        # dropped on serialization.
        if self.relatedInformation:
            result["relatedInformation"] = self.relatedInformation
        return result
class CompletionItemKind(IntEnum):
    """Numeric category tags for completion items (values 1 through 25)."""

    Text = 1
    Method = 2
    Function = 3
    Constructor = 4
    Field = 5
    Variable = 6
    Class = 7
    Interface = 8
    Module = 9
    Property = 10
    Unit = 11
    Value = 12
    Enum = 13
    Keyword = 14
    Snippet = 15
    Color = 16
    File = 17
    Reference = 18
    Folder = 19
    EnumMember = 20
    Constant = 21
    Struct = 22
    Event = 23
    Operator = 24
    TypeParameter = 25
@dataclass
class CompletionItem:
    """One entry of a completion response."""

    label: str  # text shown for the entry
    kind: CompletionItemKind  # category of the entry
    detail: Optional[str] = None  # short extra description
    documentation: Optional[str] = None  # emitted as markdown content
    insertText: Optional[str] = None  # text to insert when the entry is chosen
    insertTextFormat: int = 1  # 1 = PlainText, 2 = Snippet

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize the item, leaving out optional fields that are empty.

        ``insertTextFormat`` is only emitted together with ``insertText``.
        """
        payload: Dict[str, Any] = {"label": self.label, "kind": self.kind}
        if self.detail:
            payload["detail"] = self.detail
        if self.documentation:
            payload["documentation"] = {"kind": "markdown", "value": self.documentation}
        if self.insertText:
            payload.update(
                insertText=self.insertText,
                insertTextFormat=self.insertTextFormat,
            )
        return payload
@dataclass
class Hover:
    """Hover payload: markdown text plus an optional range it applies to."""

    contents: str  # Markdown content
    range: Optional[Range] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict; ``range`` is included only when one is set."""

        def as_point(pos: Any) -> Dict[str, Any]:
            return {"line": pos.line, "character": pos.character}

        payload: Dict[str, Any] = {
            "contents": {"kind": "markdown", "value": self.contents},
        }
        if self.range:
            span = self.range
            payload["range"] = {"start": as_point(span.start), "end": as_point(span.end)}
        return payload
@dataclass
class SignatureInformation:
    """Describes one callable signature shown by signature help."""

    label: str  # the signature text
    documentation: Optional[str] = None  # emitted as markdown content
    parameters: Optional[List[Dict[str, Any]]] = None  # per-parameter entries

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict, skipping documentation/parameters when empty."""
        payload: Dict[str, Any] = {"label": self.label}
        if self.documentation:
            payload["documentation"] = {"kind": "markdown", "value": self.documentation}
        if self.parameters:
            payload["parameters"] = self.parameters
        return payload
@dataclass
class SignatureHelp:
    """Signature help result: candidate signatures plus the active indices."""

    signatures: List[SignatureInformation]
    activeSignature: int = 0  # index into ``signatures``
    activeParameter: int = 0  # index of the highlighted parameter

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict, delegating to each signature's ``to_dict``."""
        serialized = [signature.to_dict() for signature in self.signatures]
        return {
            "signatures": serialized,
            "activeSignature": self.activeSignature,
            "activeParameter": self.activeParameter,
        }
# =============================================================================
# SPARQL-LSP Specific Types
# =============================================================================
@dataclass
class SPARQLExecuteResult:
    """Outcome of running a SPARQL query."""

    success: bool  # whether execution succeeded
    results: Optional[Dict] = None  # result payload, when available
    error: Optional[str] = None  # error description, when available
    executionTimeMs: Optional[float] = None  # elapsed time in milliseconds

    def to_dict(self) -> Dict[str, Any]:
        """Serialize all four fields to a dict; unset fields appear as None."""
        return dict(
            success=self.success,
            results=self.results,
            error=self.error,
            executionTimeMs=self.executionTimeMs,
        )
@dataclass
class SPARQLExplanation:
    """Human-readable explanation of a SPARQL query."""

    summary: str  # overall description
    steps: List[str]  # individual explanation steps
    estimatedComplexity: str  # "simple", "moderate", "complex"
    suggestedOptimizations: Optional[List[str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize all four fields to a dict; unset fields appear as None."""
        return dict(
            summary=self.summary,
            steps=self.steps,
            estimatedComplexity=self.estimatedComplexity,
            suggestedOptimizations=self.suggestedOptimizations,
        )
@dataclass
class SPARQLSuggestion:
    """A suggested novel connection."""

    type: str  # "relationship", "entity", "pattern"
    description: str  # text describing the suggestion
    sparqlFragment: str  # SPARQL snippet for the suggestion
    confidence: float  # confidence score
    source: str  # "qdrant", "typedb", "inference"

    def to_dict(self) -> Dict[str, Any]:
        """Serialize every field to a dict via ``dataclasses.asdict``."""
        as_mapping = asdict(self)
        return as_mapping
# =============================================================================
# Knowledge Base (SHACL, LinkML, TypeDB)
# =============================================================================
class OntologyKnowledgeBase:
    """
    Static lookup tables backing the SPARQL-LSP features.

    The tables are described as being derived from:
    - SHACL shapes (validation rules)
    - LinkML schema (class/property definitions)
    - TypeDB rules (inference patterns)

    Everything is stored as class-level attributes.
    """

    # Namespace prefixes: prefix -> {uri, description, example}
    PREFIXES: Dict[str, Dict[str, str]] = {
        "hc": {
            "uri": "https://nde.nl/ontology/hc/class/",
            "description": "Heritage Custodian classes",
            "example": "hc:Custodian",
        },
        "hcp": {
            "uri": "https://nde.nl/ontology/hc/",
            "description": "Heritage Custodian properties",
            "example": "hcp:institutionType",
        },
        "schema": {
            "uri": "http://schema.org/",
            "description": "Schema.org vocabulary",
            "example": "schema:name",
        },
        "skos": {
            "uri": "http://www.w3.org/2004/02/skos/core#",
            "description": "SKOS vocabulary for labels",
            "example": "skos:prefLabel",
        },
        "rdfs": {
            "uri": "http://www.w3.org/2000/01/rdf-schema#",
            "description": "RDF Schema",
            "example": "rdfs:label",
        },
        "wd": {
            "uri": "http://www.wikidata.org/entity/",
            "description": "Wikidata entities",
            "example": "wd:Q55 (Netherlands)",
        },
        "wdt": {
            "uri": "http://www.wikidata.org/prop/direct/",
            "description": "Wikidata direct properties",
            "example": "wdt:P17 (country)",
        },
        "foaf": {
            "uri": "http://xmlns.com/foaf/0.1/",
            "description": "FOAF vocabulary",
            "example": "foaf:name",
        },
        "dct": {
            "uri": "http://purl.org/dc/terms/",
            "description": "Dublin Core Terms",
            "example": "dct:description",
        },
    }

    # Classes (from SHACL NodeShapes): prefixed name -> metadata
    CLASSES: Dict[str, Dict[str, Any]] = {
        "hc:Custodian": {
            "description": "Heritage custodian institution (museum, archive, library, etc.)",
            "properties": [
                "hcp:institutionType",
                "hcp:ghcid",
                "hcp:isil",
                "skos:prefLabel",
                "schema:addressCountry",
            ],
            "example": "?s a hc:Custodian .",
        },
    }

    # Properties (from SHACL PropertyShapes): prefixed name -> metadata.
    # "pattern" and "values" are present only on some entries.
    PROPERTIES: Dict[str, Dict[str, Any]] = {
        "hcp:institutionType": {
            "description": "Single-letter institution type code",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "pattern": "^[MLAGSBREDONFHICUT]$",
            "values": {
                "M": "Museum",
                "L": "Library",
                "A": "Archive",
                "G": "Gallery",
                "S": "Collecting Society",
                "B": "Botanical/Zoo",
                "R": "Research Center",
                "E": "Education Provider",
                "O": "Official Institution",
                "D": "Digital Platform",
                "N": "NGO",
                "H": "Holy Site",
                "F": "Feature",
                "I": "Intangible Heritage",
                "C": "Corporation",
                "U": "Unknown",
                "T": "Trade Association",
            },
            "example": '?s hcp:institutionType "M" .',
        },
        "hcp:ghcid": {
            "description": "Global Heritage Custodian ID",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "pattern": "^[A-Z]{2}-[A-Z]{2,3}-[A-Z]{2,4}-[A-Z]-[A-Z0-9]+$",
            "example": '?s hcp:ghcid "NL-NH-AMS-M-RIJKS" .',
        },
        "hcp:isil": {
            "description": "ISIL code (International Standard Identifier for Libraries)",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "pattern": "^[A-Z]{2}-[A-Za-z0-9]+$",
            "example": '?s hcp:isil "NL-AmRMA" .',
        },
        "hcp:wikidataId": {
            "description": "Wikidata Q-number (without wd: prefix)",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "pattern": "^Q[0-9]+$",
            "example": '?s hcp:wikidataId "Q190804" .',
        },
        "skos:prefLabel": {
            "description": "Preferred label/name of the institution",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "example": "?s skos:prefLabel ?name .",
        },
        "schema:name": {
            "description": "Name of the institution",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "example": "?s schema:name ?name .",
        },
        "schema:addressCountry": {
            "description": "Country as Wikidata entity URI",
            "domain": "hc:Custodian",
            "range": "IRI",
            "example": "?s schema:addressCountry wd:Q55 . # Netherlands",
        },
        "schema:url": {
            "description": "Website URL",
            "domain": "hc:Custodian",
            "range": "xsd:anyURI",
            "example": "?s schema:url ?website .",
        },
    }

    # SPARQL keywords and built-in function names offered by completion
    KEYWORDS: List[str] = [
        "SELECT", "CONSTRUCT", "ASK", "DESCRIBE",
        "WHERE", "FILTER", "OPTIONAL", "UNION", "MINUS",
        "GROUP BY", "ORDER BY", "HAVING", "LIMIT", "OFFSET",
        "DISTINCT", "REDUCED", "AS", "BIND", "VALUES",
        "COUNT", "SUM", "AVG", "MIN", "MAX", "SAMPLE", "GROUP_CONCAT",
        "STR", "LANG", "LANGMATCHES", "DATATYPE", "BOUND", "IRI", "URI",
        "BNODE", "RAND", "ABS", "CEIL", "FLOOR", "ROUND",
        "CONCAT", "STRLEN", "UCASE", "LCASE", "ENCODE_FOR_URI",
        "CONTAINS", "STRSTARTS", "STRENDS", "STRBEFORE", "STRAFTER",
        "YEAR", "MONTH", "DAY", "HOURS", "MINUTES", "SECONDS",
        "TIMEZONE", "TZ", "NOW", "UUID", "STRUUID",
        "MD5", "SHA1", "SHA256", "SHA384", "SHA512",
        "COALESCE", "IF", "STRLANG", "STRDT",
        "SAMETERM", "ISIRI", "ISURI", "ISBLANK", "ISLITERAL", "ISNUMERIC",
        "REGEX", "REPLACE", "EXISTS", "NOT EXISTS",
        "PREFIX", "BASE", "FROM", "FROM NAMED", "GRAPH",
        "SERVICE", "SILENT", "IN", "NOT IN",
        "a",  # shorthand for rdf:type
    ]

    # Province codes for filtering: two-letter code -> province name
    DUTCH_PROVINCES: Dict[str, str] = {
        "NH": "Noord-Holland",
        "ZH": "Zuid-Holland",
        "NB": "Noord-Brabant",
        "GE": "Gelderland",
        "UT": "Utrecht",
        "OV": "Overijssel",
        "LI": "Limburg",
        "FR": "Friesland",
        "GR": "Groningen",
        "DR": "Drenthe",
        "FL": "Flevoland",
        "ZE": "Zeeland",
    }

    # Country Wikidata IDs: Q-number -> country name
    COUNTRIES: Dict[str, str] = {
        "Q55": "Netherlands",
        "Q17": "Japan",
        "Q213": "Czech Republic",
        "Q31": "Belgium",
        "Q40": "Austria",
        "Q183": "Germany",
        "Q145": "United Kingdom",
        "Q142": "France",
        "Q30": "United States",
    }

    # TypeDB functions (inference rules): name -> {parameters, returns, description}
    TYPEDB_FUNCTIONS: Dict[str, Dict[str, str]] = {
        "get-reconstructions-by-observation-name": {
            "parameters": "$name: string",
            "returns": "{ custodian-reconstruction }",
            "description": "Get all reconstructions derived from observations with given name",
        },
        "get-high-confidence-observations": {
            "parameters": "",
            "returns": "{ custodian-observation }",
            "description": "Get observations that have multiple sources (high confidence)",
        },
        "get-entity-names": {
            "parameters": "$recon: custodian-reconstruction",
            "returns": "{ string }",
            "description": "Get all observed names for a given reconstruction",
        },
        "get-all-descendants": {
            "parameters": "$parent: custodian-reconstruction",
            "returns": "{ custodian-reconstruction }",
            "description": "Get all child organizations recursively",
        },
        "get-name-successors": {
            "parameters": "$name: custodian-name",
            "returns": "{ custodian-name }",
            "description": "Get all successor names in temporal order",
        },
    }
# =============================================================================
# SPARQL-LSP Server
# =============================================================================
class SPARQLLanguageServer:
"""
SPARQL Language Server implementing LSP-like protocol.
Provides:
- Diagnostics (SHACL-based validation)
- Code completion (prefixes, classes, properties)
- Hover information (documentation)
- Signature help (SPARQL functions)
- Query execution
- Query explanation
- Novel connection suggestions
"""
def __init__(
    self,
    sparql_endpoint: str = "https://bronhouder.nl/sparql",
    qdrant_host: Optional[str] = None,
    typedb_host: Optional[str] = None,
):
    """
    Set up server state and the method dispatch table.

    Args:
        sparql_endpoint: URL that sparql/execute posts queries to.
        qdrant_host: Stored on the instance; not used in this constructor.
        typedb_host: Stored on the instance; not used in this constructor.
    """
    # Backend locations
    self.sparql_endpoint = sparql_endpoint
    self.qdrant_host = qdrant_host
    self.typedb_host = typedb_host

    # Runtime state: nothing is served until "initialize" has been handled
    self.kb = OntologyKnowledgeBase()
    self.initialized = False
    self.documents: Dict[str, TextDocumentItem] = {}

    # Dispatch table: JSON-RPC method name -> bound handler
    self._methods: Dict[str, Callable] = {
        # lifecycle
        "initialize": self._handle_initialize,
        "initialized": self._handle_initialized,
        "shutdown": self._handle_shutdown,
        # document synchronisation
        "textDocument/didOpen": self._handle_did_open,
        "textDocument/didChange": self._handle_did_change,
        "textDocument/didClose": self._handle_did_close,
        # language features
        "textDocument/completion": self._handle_completion,
        "textDocument/hover": self._handle_hover,
        "textDocument/signatureHelp": self._handle_signature_help,
        # SPARQL-specific extensions
        "sparql/validate": self._handle_validate,
        "sparql/execute": self._handle_execute,
        "sparql/explain": self._handle_explain,
        "sparql/suggest": self._handle_suggest,
    }
# =========================================================================
# JSON-RPC Message Handling
# =========================================================================
def handle_message(self, message: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Handle incoming JSON-RPC message.
Args:
message: JSON-RPC request/notification
Returns:
JSON-RPC response (None for notifications)
"""
try:
# Validate JSON-RPC format
if "jsonrpc" not in message or message["jsonrpc"] != "2.0":
return self._error_response(None, ErrorCode.InvalidRequest, "Invalid JSON-RPC version")
method = message.get("method")
params = message.get("params", {})
msg_id = message.get("id") # None for notifications
if not method:
return self._error_response(msg_id, ErrorCode.InvalidRequest, "Missing method")
# Find handler
handler = self._methods.get(method)
if not handler:
return self._error_response(msg_id, ErrorCode.MethodNotFound, f"Unknown method: {method}")
# Check initialization
if not self.initialized and method not in ("initialize", "initialized", "shutdown"):
return self._error_response(msg_id, ErrorCode.ServerNotInitialized, "Server not initialized")
# Call handler
result = handler(params)
# Return response (None for notifications)
if msg_id is not None:
return self._success_response(msg_id, result)
return None
except Exception as e:
logger.exception(f"Error handling message: {e}")
return self._error_response(
message.get("id"),
ErrorCode.InternalError,
str(e)
)
def _success_response(self, msg_id: Any, result: Any) -> Dict[str, Any]:
"""Create JSON-RPC success response."""
return {
"jsonrpc": "2.0",
"id": msg_id,
"result": result,
}
def _error_response(self, msg_id: Any, code: ErrorCode, message: str) -> Dict[str, Any]:
"""Create JSON-RPC error response."""
return {
"jsonrpc": "2.0",
"id": msg_id,
"error": {
"code": code,
"message": message,
},
}
def _notification(self, method: str, params: Dict[str, Any]) -> Dict[str, Any]:
"""Create JSON-RPC notification (no id)."""
return {
"jsonrpc": "2.0",
"method": method,
"params": params,
}
# =========================================================================
# LSP Lifecycle Methods
# =========================================================================
def _handle_initialize(self, params: Dict[str, Any]) -> Dict[str, Any]:
"""Handle initialize request."""
self.initialized = True
return {
"capabilities": {
"textDocumentSync": {
"openClose": True,
"change": 1, # Full sync
},
"completionProvider": {
"triggerCharacters": [":", "<", "?", "$", '"'],
"resolveProvider": False,
},
"hoverProvider": True,
"signatureHelpProvider": {
"triggerCharacters": ["(", ","],
},
"diagnosticProvider": {
"interFileDependencies": False,
"workspaceDiagnostics": False,
},
# Custom SPARQL capabilities
"sparqlExecuteProvider": True,
"sparqlExplainProvider": True,
"sparqlSuggestProvider": True,
},
"serverInfo": {
"name": "sparql-lsp",
"version": "1.0.0",
},
}
def _handle_initialized(self, params: Dict[str, Any]) -> None:
    """Handle the initialized notification by logging it; no state changes."""
    logger.info("SPARQL-LSP server initialized")
def _handle_shutdown(self, params: Dict[str, Any]) -> None:
"""Handle shutdown request."""
self.initialized = False
return None
# =========================================================================
# Document Sync Methods
# =========================================================================
def _handle_did_open(self, params: Dict[str, Any]) -> None:
    """
    Handle the textDocument/didOpen notification.

    Stores the opened document under its URI. A missing "uri" raises
    KeyError; the other fields fall back to defaults.
    """
    opened = params.get("textDocument", {})
    item = TextDocumentItem(
        uri=opened["uri"],
        languageId=opened.get("languageId", "sparql"),
        version=opened.get("version", 0),
        text=opened.get("text", ""),
    )
    self.documents[opened["uri"]] = item
def _handle_did_change(self, params: Dict[str, Any]) -> None:
"""Handle textDocument/didChange notification."""
uri = params.get("textDocument", {}).get("uri")
changes = params.get("contentChanges", [])
if uri in self.documents and changes:
# Full sync - take the whole text
self.documents[uri].text = changes[0].get("text", "")
self.documents[uri].version = params.get("textDocument", {}).get("version", 0)
return None
def _handle_did_close(self, params: Dict[str, Any]) -> None:
"""Handle textDocument/didClose notification."""
uri = params.get("textDocument", {}).get("uri")
if uri in self.documents:
del self.documents[uri]
return None
# =========================================================================
# Diagnostics (SHACL-based Validation)
# =========================================================================
def _handle_validate(self, params: Dict[str, Any]) -> Dict[str, Any]:
"""
Handle sparql/validate request.
Returns diagnostics for a SPARQL query, plus auto-corrected version if applicable.
"""
uri = params.get("textDocument", {}).get("uri")
doc = self.documents.get(uri) if uri else None
text = params.get("text") or (doc.text if doc else "")
# Import auto_correct here to avoid circular imports
try:
from .sparql_linter import auto_correct_sparql
corrected_text, was_corrected = auto_correct_sparql(text)
except ImportError:
try:
from sparql_linter import auto_correct_sparql
corrected_text, was_corrected = auto_correct_sparql(text)
except ImportError:
corrected_text, was_corrected = text, False
# Validate the ORIGINAL query to show what's wrong
diagnostics = self._validate_sparql(text)
result: Dict[str, Any] = {
"uri": uri,
"diagnostics": [d.to_dict() for d in diagnostics],
}
# Include corrected query if auto-correction was applied
if was_corrected:
result["corrected_query"] = corrected_text
result["auto_corrected"] = True
return result
def _validate_sparql(self, text: str) -> List[Diagnostic]:
"""Validate SPARQL query and return diagnostics."""
diagnostics = []
lines = text.split("\n")
# Check for deprecated prefixes/classes
# NOTE: Only flag patterns that are ACTUALLY wrong in our triplestore.
# DO NOT flag crm:E39_Actor - it works correctly (dual typing with hcc:Custodian)
# DO NOT suggest hcp: prefix - our ontology uses hc: for BOTH classes and properties
deprecated_patterns = [
(r"w3id\.org/heritage/custodian", "WRONG_PREFIX_URI", "Use https://nde.nl/ontology/hc/ prefix"),
(r'institutionType\s+"Museum"', "WRONG_TYPE_VALUE", 'Use "M" instead of "Museum"'),
(r'institutionType\s+"Library"', "WRONG_TYPE_VALUE", 'Use "L" instead of "Library"'),
(r'institutionType\s+"Archive"', "WRONG_TYPE_VALUE", 'Use "A" instead of "Archive"'),
(r'addressCountry\s+"NL"', "WRONG_COUNTRY_FORMAT", "Use wd:Q55 for Netherlands"),
# NOTE: The following rules were REMOVED because they broke queries:
# - crm:E39_Actor works correctly in our triplestore (dual typing)
# - hc: prefix is used for BOTH classes and properties, hcp: is undefined
# - Suggesting hc:Custodian is wrong - the correct class is hcc:Custodian
]
for line_num, line in enumerate(lines):
for pattern, code, message in deprecated_patterns:
match = re.search(pattern, line, re.IGNORECASE)
if match:
diagnostics.append(Diagnostic(
range=Range(
start=Position(line_num, match.start()),
end=Position(line_num, match.end()),
),
message=message,
severity=DiagnosticSeverity.Error,
code=code,
))
# Check for syntax issues
open_braces = text.count("{")
close_braces = text.count("}")
if open_braces != close_braces:
diagnostics.append(Diagnostic(
range=Range(start=Position(0, 0), end=Position(0, 1)),
message=f"Unbalanced braces: {open_braces} opening, {close_braces} closing",
severity=DiagnosticSeverity.Error,
code="SYNTAX_ERROR",
))
# Check for SELECT without WHERE
if re.search(r"\bSELECT\b", text, re.IGNORECASE) and not re.search(r"\bWHERE\b", text, re.IGNORECASE):
diagnostics.append(Diagnostic(
range=Range(start=Position(0, 0), end=Position(0, 6)),
message="SELECT query missing WHERE clause",
severity=DiagnosticSeverity.Error,
code="MISSING_WHERE",
))
# Warning for province filtering without URI pattern
for code, name in self.kb.DUTCH_PROVINCES.items():
if name.lower() in text.lower() and f"NL-{code}" not in text:
line_num = next((i for i, l in enumerate(lines) if name.lower() in l.lower()), 0)
diagnostics.append(Diagnostic(
range=Range(start=Position(line_num, 0), end=Position(line_num, len(lines[line_num]))),
message=f"Province '{name}' - consider URI filtering",
severity=DiagnosticSeverity.Warning,
code="SUGGEST_URI_FILTER",
))
return diagnostics
# =========================================================================
# Completion
# =========================================================================
def _handle_completion(self, params: Dict[str, Any]) -> Dict[str, Any]:
"""Handle textDocument/completion request."""
uri = params.get("textDocument", {}).get("uri")
position = params.get("position", {})
doc = self.documents.get(uri) if uri else None
text = doc.text if doc else ""
line = position.get("line", 0)
character = position.get("character", 0)
# Get the current line and context
lines = text.split("\n")
current_line = lines[line] if line < len(lines) else ""
prefix_text = current_line[:character]
items = []
# Prefix completion (after PREFIX keyword)
if re.search(r"PREFIX\s+\w*$", prefix_text, re.IGNORECASE):
for prefix, info in self.kb.PREFIXES.items():
items.append(CompletionItem(
label=prefix,
kind=CompletionItemKind.Module,
detail=info["description"],
documentation=f"**URI:** `{info['uri']}`\n\n**Example:** `{info['example']}`",
insertText=f"{prefix}: <{info['uri']}>",
))
# Class completion (after "a " or "rdf:type")
elif re.search(r"(\ba\s+|\brdf:type\s+)\w*$", prefix_text, re.IGNORECASE):
for cls, info in self.kb.CLASSES.items():
items.append(CompletionItem(
label=cls,
kind=CompletionItemKind.Class,
detail=info["description"],
documentation=f"**Properties:** {', '.join(info['properties'])}\n\n**Example:**\n```sparql\n{info['example']}\n```",
insertText=cls,
))
# Property completion (after prefix like "hcp:")
elif re.search(r"(hcp|schema|skos|rdfs|foaf|dct):\w*$", prefix_text):
prefix_match = re.search(r"(hcp|schema|skos|rdfs|foaf|dct):(\w*)$", prefix_text)
if prefix_match:
prefix = prefix_match.group(1)
for prop, info in self.kb.PROPERTIES.items():
if prop.startswith(f"{prefix}:"):
prop_name = prop.split(":")[1]
items.append(CompletionItem(
label=prop_name,
kind=CompletionItemKind.Property,
detail=info["description"],
documentation=f"**Range:** `{info['range']}`\n\n**Example:**\n```sparql\n{info['example']}\n```",
insertText=prop_name,
))
# Institution type value completion (after institutionType)
elif re.search(r'institutionType\s+"?\w*$', prefix_text):
for code, name in self.kb.PROPERTIES["hcp:institutionType"]["values"].items():
items.append(CompletionItem(
label=f'"{code}"',
kind=CompletionItemKind.Value,
detail=name,
documentation=f"Institution type code for **{name}**",
insertText=f'"{code}"',
))
# Country completion (after addressCountry)
elif re.search(r"addressCountry\s+\w*$", prefix_text):
for qid, name in self.kb.COUNTRIES.items():
items.append(CompletionItem(
label=f"wd:{qid}",
kind=CompletionItemKind.Value,
detail=name,
documentation=f"Wikidata entity for **{name}**",
insertText=f"wd:{qid}",
))
# Keyword completion
elif re.search(r"\b[A-Z]*$", prefix_text):
keyword_prefix = re.search(r"\b([A-Z]*)$", prefix_text)
if keyword_prefix:
prefix_str = keyword_prefix.group(1).upper()
for kw in self.kb.KEYWORDS:
if kw.upper().startswith(prefix_str):
items.append(CompletionItem(
label=kw,
kind=CompletionItemKind.Keyword,
detail="SPARQL keyword",
insertText=kw,
))
return {
"isIncomplete": False,
"items": [item.to_dict() for item in items[:50]], # Limit to 50 items
}
# =========================================================================
# Hover
# =========================================================================
def _handle_hover(self, params: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Handle textDocument/hover request."""
uri = params.get("textDocument", {}).get("uri")
position = params.get("position", {})
doc = self.documents.get(uri) if uri else None
text = doc.text if doc else ""
line = position.get("line", 0)
character = position.get("character", 0)
lines = text.split("\n")
current_line = lines[line] if line < len(lines) else ""
# Find word at position
word_match = None
for match in re.finditer(r"[\w:]+", current_line):
if match.start() <= character <= match.end():
word_match = match
break
if not word_match:
return None
word = word_match.group()
# Check if it's a class
if word in self.kb.CLASSES:
info = self.kb.CLASSES[word]
return Hover(
contents=f"### {word}\n\n{info['description']}\n\n**Properties:** {', '.join(info['properties'])}\n\n```sparql\n{info['example']}\n```",
range=Range(
start=Position(line, word_match.start()),
end=Position(line, word_match.end()),
),
).to_dict()
# Check if it's a property
if word in self.kb.PROPERTIES:
info = self.kb.PROPERTIES[word]
content = f"### {word}\n\n{info['description']}\n\n**Range:** `{info['range']}`"
if "values" in info:
content += "\n\n**Valid values:**\n"
for code, name in info["values"].items():
content += f"- `\"{code}\"` = {name}\n"
content += f"\n\n```sparql\n{info['example']}\n```"
return Hover(
contents=content,
range=Range(
start=Position(line, word_match.start()),
end=Position(line, word_match.end()),
),
).to_dict()
# Check if it's a prefix
prefix = word.rstrip(":")
if prefix in self.kb.PREFIXES:
info = self.kb.PREFIXES[prefix]
return Hover(
contents=f"### PREFIX {prefix}:\n\n{info['description']}\n\n**URI:** `{info['uri']}`\n\n**Example:** `{info['example']}`",
range=Range(
start=Position(line, word_match.start()),
end=Position(line, word_match.end()),
),
).to_dict()
# Check for Wikidata entity
if word.startswith("wd:Q") or word.startswith("Q"):
qid = word.replace("wd:", "")
if qid in self.kb.COUNTRIES:
return Hover(
contents=f"### {word}\n\n**Country:** {self.kb.COUNTRIES[qid]}\n\nWikidata entity for country filtering.",
range=Range(
start=Position(line, word_match.start()),
end=Position(line, word_match.end()),
),
).to_dict()
return None
# =========================================================================
# Signature Help
# =========================================================================
def _handle_signature_help(self, params: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Handle textDocument/signatureHelp request."""
uri = params.get("textDocument", {}).get("uri")
position = params.get("position", {})
doc = self.documents.get(uri) if uri else None
text = doc.text if doc else ""
line = position.get("line", 0)
character = position.get("character", 0)
lines = text.split("\n")
current_line = lines[line] if line < len(lines) else ""
prefix_text = current_line[:character]
# SPARQL aggregate functions
functions: Dict[str, Dict[str, Any]] = {
"COUNT": {
"signature": "COUNT(expression) or COUNT(DISTINCT expression)",
"documentation": "Returns the count of bindings. Use DISTINCT to count unique values.",
"parameters": [{"label": "expression", "documentation": "Variable or expression to count"}],
},
"SUM": {
"signature": "SUM(expression)",
"documentation": "Returns the sum of numeric values.",
"parameters": [{"label": "expression", "documentation": "Numeric variable or expression"}],
},
"AVG": {
"signature": "AVG(expression)",
"documentation": "Returns the average of numeric values.",
"parameters": [{"label": "expression", "documentation": "Numeric variable or expression"}],
},
"MIN": {
"signature": "MIN(expression)",
"documentation": "Returns the minimum value.",
"parameters": [{"label": "expression", "documentation": "Variable or expression"}],
},
"MAX": {
"signature": "MAX(expression)",
"documentation": "Returns the maximum value.",
"parameters": [{"label": "expression", "documentation": "Variable or expression"}],
},
"FILTER": {
"signature": "FILTER(condition)",
"documentation": "Filters results based on a boolean condition.",
"parameters": [{"label": "condition", "documentation": "Boolean expression (e.g., CONTAINS(?name, 'Museum'))"}],
},
"CONTAINS": {
"signature": "CONTAINS(string, substring)",
"documentation": "Returns true if string contains substring. Use with STR() for URIs.",
"parameters": [
{"label": "string", "documentation": "String to search in"},
{"label": "substring", "documentation": "String to search for"},
],
},
"STR": {
"signature": "STR(term)",
"documentation": "Converts a term (URI, literal) to its string representation.",
"parameters": [{"label": "term", "documentation": "URI or literal to convert"}],
},
"BIND": {
"signature": "BIND(expression AS ?variable)",
"documentation": "Binds the result of an expression to a variable.",
"parameters": [
{"label": "expression", "documentation": "Expression to evaluate"},
{"label": "variable", "documentation": "Variable to bind to"},
],
},
}
# Find which function we're in
for func_name, func_info in functions.items():
pattern = rf"\b{func_name}\s*\($"
if re.search(pattern, prefix_text, re.IGNORECASE):
return SignatureHelp(
signatures=[SignatureInformation(
label=func_info["signature"],
documentation=func_info["documentation"],
parameters=func_info["parameters"],
)],
).to_dict()
return None
# =========================================================================
# SPARQL Execution
# =========================================================================
def _handle_execute(self, params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Handle the sparql/execute request.

    Posts ``params["query"]`` to the configured SPARQL endpoint and wraps
    the outcome in a serialized SPARQLExecuteResult.

    Returns:
        Dict with ``success``, ``results``, ``error`` and
        ``executionTimeMs``. Any exception (network failure, undecodable
        JSON body, ...) is reported as ``success=False`` with the exception
        text and no timing; it is never raised to the caller.
    """
    import time
    import httpx

    query = params.get("query", "")
    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted (or go negative) when the system clock is adjusted
        # while the request is in flight, as time.time() could be.
        start_time = time.perf_counter()
        response = httpx.post(
            self.sparql_endpoint,
            content=query,
            headers={
                "Content-Type": "application/sparql-query",
                "Accept": "application/json",
            },
            timeout=30.0,
        )
        execution_time = (time.perf_counter() - start_time) * 1000

        if response.status_code == 200:
            return SPARQLExecuteResult(
                success=True,
                results=response.json(),
                executionTimeMs=execution_time,
            ).to_dict()
        return SPARQLExecuteResult(
            success=False,
            error=f"HTTP {response.status_code}: {response.text}",
            executionTimeMs=execution_time,
        ).to_dict()
    except Exception as e:
        return SPARQLExecuteResult(
            success=False,
            error=str(e),
        ).to_dict()
# =========================================================================
# SPARQL Explanation
# =========================================================================
def _handle_explain(self, params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Handle sparql/explain request.

    Builds a heuristic explanation by pattern-matching the query text; the
    query is neither parsed nor executed, so keywords inside comments or
    string literals are matched as well.

    Args:
        params: Request parameters; the query text is read from ``"query"``.

    Returns:
        The ``SPARQLExplanation.to_dict()`` payload: a one-line summary, the
        ordered list of detected steps (``["Basic query pattern"]`` when
        nothing was recognised), an estimated complexity of ``"simple"``,
        ``"moderate"`` or ``"complex"``, and optimization hints (``None``
        when there are none).
    """
    query = params.get("query", "")
    steps: List[str] = []
    complexity = "simple"
    optimizations: List[str] = []

    def mentions(keyword_pattern: str) -> bool:
        # SPARQL keywords are case-insensitive; the word boundaries keep
        # names such as ?selected or ?counter from being taken for keywords.
        return re.search(rf"\b{keyword_pattern}\b", query, re.IGNORECASE) is not None

    # Analyze query structure
    is_select = mentions("SELECT")
    if is_select:
        steps.append("SELECT query - retrieves specific variables")
    elif mentions("CONSTRUCT"):
        steps.append("CONSTRUCT query - builds RDF graph")
        complexity = "moderate"
    elif mentions("ASK"):
        steps.append("ASK query - returns boolean result")
    elif mentions("DESCRIBE"):
        steps.append("DESCRIBE query - returns an RDF description of resources")

    # Check for patterns
    if "a hc:Custodian" in query or "rdf:type hc:Custodian" in query:
        steps.append("Filters to heritage custodian institutions")

    if "hcp:institutionType" in query:
        # SPARQL string literals may use either quote style; \1 requires the
        # closing quote to match the opening one.
        type_match = re.search(r"""institutionType\s+(["'])([A-Z])\1""", query)
        if type_match:
            type_code = type_match.group(2)
            type_name = self.kb.PROPERTIES["hcp:institutionType"]["values"].get(type_code, "Unknown")
            steps.append(f"Filters by institution type: {type_name} ({type_code})")

    if "schema:addressCountry" in query:
        country_match = re.search(r"addressCountry\s+wd:(Q\d+)", query)
        if country_match:
            qid = country_match.group(1)
            country_name = self.kb.COUNTRIES.get(qid, "Unknown")
            steps.append(f"Filters by country: {country_name}")

    # The province code is captured from the FILTER expression itself, so an
    # unrelated "NL-xx" elsewhere in the query cannot be reported by mistake.
    # Only the keywords are matched case-insensitively; the code stays
    # upper-case.
    province_match = re.search(r"(?i:FILTER.*?CONTAINS.*?STR).*?NL-([A-Z]{2})", query)
    if province_match:
        code = province_match.group(1)
        province_name = self.kb.DUTCH_PROVINCES.get(code, "Unknown")
        steps.append(f"Filters by Dutch province: {province_name} ({code})")

    # Any run of whitespace (including a line break) may separate the words.
    if mentions(r"GROUP\s+BY"):
        steps.append("Groups results for aggregation")
        complexity = "moderate"
    if mentions("COUNT"):
        steps.append("Counts matching results")
    if mentions("OPTIONAL"):
        steps.append("Includes optional patterns (may return nulls)")
        complexity = "moderate"
    if mentions("UNION"):
        steps.append("Combines multiple patterns with UNION")
        complexity = "complex"
    if mentions("SERVICE"):
        steps.append("Federated query to external endpoint")
        complexity = "complex"
        optimizations.append("Federated queries can be slow - consider caching results")

    # Generate summary (uses the same SELECT detection as the steps above so
    # the two can never disagree)
    summary = f"This SPARQL query {'retrieves' if is_select else 'processes'} data from the Heritage Custodian knowledge graph."
    return SPARQLExplanation(
        summary=summary,
        steps=steps if steps else ["Basic query pattern"],
        estimatedComplexity=complexity,
        suggestedOptimizations=optimizations if optimizations else None,
    ).to_dict()
# =========================================================================
# Novel Connection Suggestions (Vector DB Integration)
# =========================================================================
def _handle_suggest(self, params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Handle sparql/suggest request.

    Proposes SPARQL fragments that could extend the query. Two rule-based
    suggestions are keyed on properties mentioned in the query text; one
    further fixed placeholder suggestion is added for each of Qdrant and
    TypeDB when the corresponding host is set on the server. No vector or
    TypeDB lookup is performed here.

    Args:
        params: Request parameters; only ``"query"`` is consulted.

    Returns:
        ``{"suggestions": [...]}`` where each entry is a serialised
        ``SPARQLSuggestion``, in the order the rules below are evaluated.
    """
    query_text = params.get("query", "")
    proposals: List[SPARQLSuggestion] = []

    # Query is looking for names - suggest related entities
    asks_for_names = any(
        label_property in query_text
        for label_property in ("skos:prefLabel", "schema:name")
    )
    if asks_for_names:
        hierarchy_hint = SPARQLSuggestion(
            type="relationship",
            description="Consider adding organizational hierarchy",
            sparqlFragment="OPTIONAL { ?s schema:containedInPlace ?parent . ?parent skos:prefLabel ?parentName . }",
            confidence=0.7,
            source="inference",
        )
        proposals.append(hierarchy_hint)

    # Query filters by type - suggest cross-type relationships
    if "hcp:institutionType" in query_text:
        colocated_hint = SPARQLSuggestion(
            type="pattern",
            description="Find related institutions in same location",
            sparqlFragment="""
OPTIONAL {
?related a hc:Custodian ;
schema:containedInPlace ?location .
?s schema:containedInPlace ?location .
FILTER(?related != ?s)
}""",
            confidence=0.6,
            source="inference",
        )
        proposals.append(colocated_hint)

    # Qdrant configured: placeholder until the actual retriever is wired in
    if self.qdrant_host:
        vector_hint = SPARQLSuggestion(
            type="entity",
            description="Semantically similar institutions found in vector index",
            sparqlFragment="# Use vector similarity to find: [entities from Qdrant]",
            confidence=0.8,
            source="qdrant",
        )
        proposals.append(vector_hint)

    # TypeDB configured: point at its inference patterns
    if self.typedb_host:
        inference_hint = SPARQLSuggestion(
            type="relationship",
            description="TypeDB can infer: observation → reconstruction chains",
            sparqlFragment="# Consider TypeQL: get-reconstructions-by-observation-name($name)",
            confidence=0.9,
            source="typedb",
        )
        proposals.append(inference_hint)

    serialised = [proposal.to_dict() for proposal in proposals]
    return {"suggestions": serialised}
# =============================================================================
# Convenience Functions for AI Agent Integration
# =============================================================================
def create_lsp_request(method: str, params: Dict[str, Any], request_id: Optional[int] = None) -> Dict[str, Any]:
    """
    Build a JSON-RPC 2.0 message for the SPARQL-LSP server.

    Args:
        method: LSP method name (e.g., "sparql/validate")
        params: Method parameters (stored as-is, not copied)
        request_id: Request ID; when None the "id" member is left out,
            which makes the message a notification
    Returns:
        JSON-RPC message dictionary
    """
    message: Dict[str, Any] = {"jsonrpc": "2.0", "method": method, "params": params}
    if request_id is None:
        # No "id" member: the message is a notification.
        return message
    return {**message, "id": request_id}
def validate_sparql_query(query: str, server: Optional[SPARQLLanguageServer] = None) -> Dict[str, Any]:
    """
    Validate a SPARQL query through the "sparql/validate" method.

    Args:
        query: SPARQL query string
        server: LSP server instance to use; when omitted a fresh one is
            created and flagged as initialized
    Returns:
        The "result" member of the server response, or an empty dict when
        the server returns no response or the response has no "result"
    """
    if server is None:
        # A throwaway server skips the initialize handshake.
        server = SPARQLLanguageServer()
        server.initialized = True

    reply = server.handle_message(
        create_lsp_request("sparql/validate", {"text": query}, request_id=1)
    )
    if reply is None:
        return {}
    diagnostics: Dict[str, Any] = reply.get("result", {})
    return diagnostics
def get_sparql_completions(query: str, line: int, character: int, server: Optional[SPARQLLanguageServer] = None) -> List[Dict[str, Any]]:
    """
    Fetch completion items for a position inside a SPARQL query.

    The query is first opened on the server as the in-memory document
    "inmemory://query.sparql"; completions are then requested for it.

    Args:
        query: SPARQL query string
        line: Line number (0-indexed)
        character: Character position (0-indexed)
        server: LSP server instance to use; when omitted a fresh one is
            created and flagged as initialized
    Returns:
        The "items" list of the completion result, or an empty list when
        the server returns no response or no items
    """
    if server is None:
        # A throwaway server skips the initialize handshake.
        server = SPARQLLanguageServer()
        server.initialized = True

    document_uri = "inmemory://query.sparql"

    # Open document (sent without an id, i.e. as a notification)
    open_notification = create_lsp_request(
        "textDocument/didOpen",
        {
            "textDocument": {
                "uri": document_uri,
                "languageId": "sparql",
                "version": 1,
                "text": query,
            }
        },
    )
    server.handle_message(open_notification)

    # Get completions
    completion_request = create_lsp_request(
        "textDocument/completion",
        {
            "textDocument": {"uri": document_uri},
            "position": {"line": line, "character": character},
        },
        request_id=1,
    )
    reply = server.handle_message(completion_request)
    if reply is None:
        return []
    completion_items: List[Dict[str, Any]] = reply.get("result", {}).get("items", [])
    return completion_items
# =============================================================================
# Example Usage
# =============================================================================
if __name__ == "__main__":
    # Demo driver: walks one server instance through initialize, validation,
    # completion and explanation, printing every JSON-RPC response.

    def _show(label: str, response: Any) -> None:
        """Print a response as indented JSON after its label."""
        print(label, json.dumps(response, indent=2))

    # Create server
    demo_server = SPARQLLanguageServer()

    # Initialize
    initialize_request = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "initialize",
        "params": {},
    }
    _show("Initialize:", demo_server.handle_message(initialize_request))

    # Validate a query with issues
    flawed_query = """
PREFIX hc: <https://w3id.org/heritage/custodian/>
SELECT ?museum ?name WHERE {
?museum a crm:E39_Actor ;
hc:institutionType "Museum" ;
schema:addressCountry "NL" ;
skos:prefLabel ?name .
}
"""
    validate_request = {
        "jsonrpc": "2.0",
        "id": 2,
        "method": "sparql/validate",
        "params": {"text": flawed_query},
    }
    _show("\nValidation:", demo_server.handle_message(validate_request))

    # Get completions: open a document first, then ask at the end of its
    # text (character 25, right after "hcp:")
    scratch_uri = "test://query.sparql"
    demo_server.handle_message({
        "jsonrpc": "2.0",
        "method": "textDocument/didOpen",
        "params": {
            "textDocument": {
                "uri": scratch_uri,
                "languageId": "sparql",
                "version": 1,
                "text": "SELECT ?s WHERE { ?s hcp:",
            }
        }
    })
    completion_request = {
        "jsonrpc": "2.0",
        "id": 3,
        "method": "textDocument/completion",
        "params": {
            "textDocument": {"uri": scratch_uri},
            "position": {"line": 0, "character": 25},
        },
    }
    _show("\nCompletions:", demo_server.handle_message(completion_request))

    # Explain a query
    sample_query = """
PREFIX hc: <https://nde.nl/ontology/hc/class/>
PREFIX hcp: <https://nde.nl/ontology/hc/>
SELECT (COUNT(?s) as ?count) WHERE {
?s a hc:Custodian ;
hcp:institutionType "M" ;
schema:addressCountry wd:Q55 .
FILTER(CONTAINS(STR(?s), "NL-NH"))
}
"""
    explain_request = {
        "jsonrpc": "2.0",
        "id": 4,
        "method": "sparql/explain",
        "params": {"query": sample_query},
    }
    _show("\nExplanation:", demo_server.handle_message(explain_request))