# glam/src/glam_extractor/api/sparql_lsp.py

"""
SPARQL Language Server Protocol (SPARQL-LSP)

A JSON-RPC based protocol for providing language intelligence for SPARQL queries
against the Heritage Custodian ontology. Designed like the Language Server Protocol
to be reusable across different AI agents, IDEs, and tools.

Architecture:
    ┌─────────────────┐     JSON-RPC      ┌─────────────────────┐
    │   AI Agent      │◄──────────────────►│   SPARQL-LSP        │
    │   (Client)      │                    │   Server            │
    └─────────────────┘                    └─────────────────────┘
                                                    │
    ┌─────────────────┐     JSON-RPC      ┌────────▼────────────┐
    │   IDE/Editor    │◄──────────────────►│   Knowledge Sources │
    │   (Client)      │                    │   - SHACL Shapes    │
    └─────────────────┘                    │   - LinkML Schema   │
                                           │   - TypeDB Rules    │
    ┌─────────────────┐     JSON-RPC      │   - SPARQL Endpoint │
    │   Web UI        │◄──────────────────►└─────────────────────┘
    │   (Client)      │
    └─────────────────┘

LSP Methods Implemented:
    - initialize: Server capabilities handshake
    - textDocument/publishDiagnostics: SHACL-based validation errors
    - sparql/validate: Diagnostics for a query, plus an auto-corrected query when available
    - textDocument/completion: Prefix, class, property completion
    - textDocument/hover: Documentation on hover
    - textDocument/signatureHelp: Function signatures
    - sparql/execute: Execute query and return results
    - sparql/explain: Explain what a query does
    - sparql/suggest: Suggest novel connections from vector DB

Author: Heritage Custodian Ontology Project
Date: 2025-12-27
Protocol Version: 1.0.0
"""

import json
import logging
import re
import uuid
from dataclasses import dataclass, field, asdict
from enum import Enum, IntEnum
from typing import Any, Dict, List, Optional, Union, Callable, Sequence
from abc import ABC, abstractmethod

logger = logging.getLogger(__name__)


# =============================================================================
# JSON-RPC Protocol Types (LSP Standard)
# =============================================================================

class ErrorCode(IntEnum):
    """Standard JSON-RPC and LSP error codes.

    Members are ``IntEnum`` values; the server places them in the
    ``error.code`` field of the error responses it builds.
    """
    # JSON-RPC errors
    ParseError = -32700
    InvalidRequest = -32600
    MethodNotFound = -32601
    InvalidParams = -32602
    InternalError = -32603

    # LSP errors
    ServerNotInitialized = -32002
    UnknownErrorCode = -32001
    RequestCancelled = -32800
    ContentModified = -32801


@dataclass
class Position:
    """Position in a text document (0-indexed)."""
    # Zero-based line number within the document.
    line: int
    # Zero-based offset within the line; the completion handler uses it
    # directly as a Python string index into the line.
    character: int


@dataclass
class Range:
    """Range in a text document, delimited by two positions."""
    # Position where the range begins.
    start: Position
    # Position where the range ends.
    end: Position


@dataclass
class Location:
    """Location in a document: a document URI plus a range inside it."""
    # URI of the document the location refers to.
    uri: str
    # Span within that document.
    range: Range


@dataclass
class TextDocumentIdentifier:
    """Identifies a text document by its URI."""
    # URI of the document.
    uri: str


@dataclass
class TextDocumentItem:
    """Text document with content.

    Instances are stored by the server per URI; ``text`` and ``version`` are
    overwritten in place when a change notification arrives.
    """
    # URI under which the document is tracked.
    uri: str
    # Language identifier; the server defaults it to "sparql" on open.
    languageId: str
    # Document version as reported by the client (0 when not supplied).
    version: int
    # Full text of the document.
    text: str


class DiagnosticSeverity(IntEnum):
    """Diagnostic severity levels.

    Lower numbers are more severe; the integer value is what ends up in a
    serialized diagnostic's ``severity`` field.
    """
    Error = 1
    Warning = 2
    Information = 3
    Hint = 4


@dataclass
class Diagnostic:
    """Represents a diagnostic (error, warning, etc.) attached to a text range.

    Attributes:
        range: Span of the document the diagnostic applies to.
        message: Human-readable description of the problem.
        severity: Seriousness of the problem; defaults to ``Error``.
        code: Optional machine-readable identifier (e.g. ``"SYNTAX_ERROR"``).
        source: Name of the tool that produced the diagnostic.
        relatedInformation: Optional list of already-serialized entries with
            additional context for the diagnostic.
    """
    range: Range
    message: str
    severity: DiagnosticSeverity = DiagnosticSeverity.Error
    code: Optional[str] = None
    source: str = "sparql-lsp"
    relatedInformation: Optional[List[Dict]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the diagnostic into a JSON-compatible dictionary.

        Returns:
            A dict with ``range``, ``message``, ``severity``, ``code`` and
            ``source`` keys, plus ``relatedInformation`` when it was provided.
        """
        result: Dict[str, Any] = {
            "range": {
                "start": {"line": self.range.start.line, "character": self.range.start.character},
                "end": {"line": self.range.end.line, "character": self.range.end.character},
            },
            "message": self.message,
            "severity": self.severity,
            # Kept even when None so the output shape stays what existing
            # consumers already receive.
            "code": self.code,
            "source": self.source,
        }
        # Previously this field was accepted by the constructor but silently
        # dropped on serialization; emit it whenever the caller supplied it.
        if self.relatedInformation is not None:
            result["relatedInformation"] = self.relatedInformation
        return result


class CompletionItemKind(IntEnum):
    """Completion item kinds.

    Integer tags attached to completion items so a client can pick an icon
    or category. The completion handler in this file uses ``Module``,
    ``Class``, ``Property``, ``Value`` and ``Keyword``.
    """
    Text = 1
    Method = 2
    Function = 3
    Constructor = 4
    Field = 5
    Variable = 6
    Class = 7
    Interface = 8
    Module = 9
    Property = 10
    Unit = 11
    Value = 12
    Enum = 13
    Keyword = 14
    Snippet = 15
    Color = 16
    File = 17
    Reference = 18
    Folder = 19
    EnumMember = 20
    Constant = 21
    Struct = 22
    Event = 23
    Operator = 24
    TypeParameter = 25


@dataclass
class CompletionItem:
    """One entry of a completion list.

    Attributes:
        label: Text shown to the user.
        kind: Category tag for the entry.
        detail: Optional short description.
        documentation: Optional markdown documentation.
        insertText: Optional text to insert instead of the label.
        insertTextFormat: Format of ``insertText`` (see inline note).
    """
    label: str
    kind: CompletionItemKind
    detail: Optional[str] = None
    documentation: Optional[str] = None
    insertText: Optional[str] = None
    insertTextFormat: int = 1  # 1 = PlainText, 2 = Snippet

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the item, leaving out optional fields that are empty."""
        payload: Dict[str, Any] = {"label": self.label, "kind": self.kind}

        if self.detail:
            payload["detail"] = self.detail

        if self.documentation:
            # Documentation is always wrapped as markdown content.
            payload["documentation"] = {"kind": "markdown", "value": self.documentation}

        if self.insertText:
            # The format is only meaningful alongside an insert text.
            payload.update(
                insertText=self.insertText,
                insertTextFormat=self.insertTextFormat,
            )

        return payload


@dataclass
class Hover:
    """Hover information: markdown text and, optionally, the range it covers."""
    contents: str  # Markdown content
    range: Optional[Range] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the hover; ``range`` is included only when set."""
        payload: Dict[str, Any] = {
            "contents": {"kind": "markdown", "value": self.contents},
        }
        span = self.range
        if span:
            # Both endpoints share the same line/character layout.
            payload["range"] = {
                edge: {"line": point.line, "character": point.character}
                for edge, point in (("start", span.start), ("end", span.end))
            }
        return payload


@dataclass
class SignatureInformation:
    """Signature of a single callable: label, optional docs and parameters."""
    label: str
    documentation: Optional[str] = None
    parameters: Optional[List[Dict[str, Any]]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the signature, skipping documentation/parameters when empty."""
        payload: Dict[str, Any] = {"label": self.label}

        docs = self.documentation
        if docs:
            payload["documentation"] = {"kind": "markdown", "value": docs}

        params = self.parameters
        if params:
            payload["parameters"] = params

        return payload


@dataclass
class SignatureHelp:
    """Signature help result: candidate signatures plus the active indices."""
    signatures: List[SignatureInformation]
    activeSignature: int = 0
    activeParameter: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the result, delegating each signature to its own ``to_dict``."""
        rendered = []
        for signature in self.signatures:
            rendered.append(signature.to_dict())

        payload: Dict[str, Any] = {"signatures": rendered}
        payload["activeSignature"] = self.activeSignature
        payload["activeParameter"] = self.activeParameter
        return payload


# =============================================================================
# SPARQL-LSP Specific Types
# =============================================================================

@dataclass
class SPARQLExecuteResult:
    """Outcome of running a SPARQL query.

    Attributes:
        success: Whether execution succeeded.
        results: Result payload, if any.
        error: Error text, if any.
        executionTimeMs: Execution time in milliseconds, if measured.
    """
    success: bool
    results: Optional[Dict] = None
    error: Optional[str] = None
    executionTimeMs: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return every field (including ``None`` values) as a plain dict."""
        exported = ("success", "results", "error", "executionTimeMs")
        return {name: getattr(self, name) for name in exported}


@dataclass
class SPARQLExplanation:
    """Human-oriented explanation of a SPARQL query.

    Attributes:
        summary: Short description of the query.
        steps: Ordered explanation steps.
        estimatedComplexity: Complexity label (see inline note).
        suggestedOptimizations: Optional list of optimization hints.
    """
    summary: str
    steps: List[str]
    estimatedComplexity: str  # "simple", "moderate", "complex"
    suggestedOptimizations: Optional[List[str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return every field (including ``None`` values) as a plain dict."""
        exported = ("summary", "steps", "estimatedComplexity", "suggestedOptimizations")
        return {name: getattr(self, name) for name in exported}


@dataclass
class SPARQLSuggestion:
    """A suggested novel connection, expressed as a SPARQL fragment.

    Attributes:
        type: Kind of suggestion (see inline note).
        description: Human-readable description.
        sparqlFragment: SPARQL text implementing the suggestion.
        confidence: Confidence score for the suggestion.
        source: Origin of the suggestion (see inline note).
    """
    type: str  # "relationship", "entity", "pattern"
    description: str
    sparqlFragment: str
    confidence: float
    source: str  # "qdrant", "typedb", "inference"

    def to_dict(self) -> Dict[str, Any]:
        """Return all dataclass fields as a dictionary."""
        serialized: Dict[str, Any] = asdict(self)
        return serialized


# =============================================================================
# Knowledge Base (SHACL, LinkML, TypeDB)
# =============================================================================

class OntologyKnowledgeBase:
    """
    Knowledge base for SPARQL-LSP, derived from:
    - SHACL shapes (validation rules)
    - LinkML schema (class/property definitions)
    - TypeDB rules (inference patterns)

    All data lives in class-level lookup tables; the class defines no
    methods. The language server reads these tables to build completion
    items and validation warnings.

    NOTE(review): comments in ``SPARQLLanguageServer._validate_sparql`` state
    that the ``hcp:`` prefix is undefined in the triplestore and that the
    correct class is ``hcc:Custodian``, while the tables below still use
    ``hcp:`` properties and ``hc:Custodian`` - confirm which is authoritative.
    """

    # Prefixes
    # Maps a prefix label to its namespace URI, a short description and a
    # usage example. Offered as completions after the PREFIX keyword.
    PREFIXES: Dict[str, Dict[str, str]] = {
        "hc": {
            "uri": "https://nde.nl/ontology/hc/class/",
            "description": "Heritage Custodian classes",
            "example": "hc:Custodian",
        },
        "hcp": {
            "uri": "https://nde.nl/ontology/hc/",
            "description": "Heritage Custodian properties",
            "example": "hcp:institutionType",
        },
        "schema": {
            "uri": "http://schema.org/",
            "description": "Schema.org vocabulary",
            "example": "schema:name",
        },
        "skos": {
            "uri": "http://www.w3.org/2004/02/skos/core#",
            "description": "SKOS vocabulary for labels",
            "example": "skos:prefLabel",
        },
        "rdfs": {
            "uri": "http://www.w3.org/2000/01/rdf-schema#",
            "description": "RDF Schema",
            "example": "rdfs:label",
        },
        "wd": {
            "uri": "http://www.wikidata.org/entity/",
            "description": "Wikidata entities",
            "example": "wd:Q55 (Netherlands)",
        },
        "wdt": {
            "uri": "http://www.wikidata.org/prop/direct/",
            "description": "Wikidata direct properties",
            "example": "wdt:P17 (country)",
        },
        "foaf": {
            "uri": "http://xmlns.com/foaf/0.1/",
            "description": "FOAF vocabulary",
            "example": "foaf:name",
        },
        "dct": {
            "uri": "http://purl.org/dc/terms/",
            "description": "Dublin Core Terms",
            "example": "dct:description",
        },
    }

    # Classes (from SHACL NodeShapes)
    # Maps a prefixed class name to its description, the properties listed
    # for it and an example triple. Offered as completions after "a" or
    # "rdf:type".
    CLASSES: Dict[str, Dict[str, Any]] = {
        "hc:Custodian": {
            "description": "Heritage custodian institution (museum, archive, library, etc.)",
            "properties": ["hcp:institutionType", "hcp:ghcid", "hcp:isil", "skos:prefLabel", "schema:addressCountry"],
            "example": "?s a hc:Custodian .",
        },
    }

    # Properties (from SHACL PropertyShapes)
    # Maps a prefixed property name to description/domain/range/example and,
    # where present, a regex "pattern" and an enumerated "values" table.
    # The "values" table of hcp:institutionType feeds the institution-type
    # value completion.
    PROPERTIES: Dict[str, Dict[str, Any]] = {
        "hcp:institutionType": {
            "description": "Single-letter institution type code",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "pattern": "^[MLAGSBREDONFHICUT]$",
            "values": {
                "M": "Museum",
                "L": "Library",
                "A": "Archive",
                "G": "Gallery",
                "S": "Collecting Society",
                "B": "Botanical/Zoo",
                "R": "Research Center",
                "E": "Education Provider",
                "O": "Official Institution",
                "D": "Digital Platform",
                "N": "NGO",
                "H": "Holy Site",
                "F": "Feature",
                "I": "Intangible Heritage",
                "C": "Corporation",
                "U": "Unknown",
                "T": "Trade Association",
            },
            "example": '?s hcp:institutionType "M" .',
        },
        "hcp:ghcid": {
            "description": "Global Heritage Custodian ID",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "pattern": "^[A-Z]{2}-[A-Z]{2,3}-[A-Z]{2,4}-[A-Z]-[A-Z0-9]+$",
            "example": '?s hcp:ghcid "NL-NH-AMS-M-RIJKS" .',
        },
        "hcp:isil": {
            "description": "ISIL code (International Standard Identifier for Libraries)",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "pattern": "^[A-Z]{2}-[A-Za-z0-9]+$",
            "example": '?s hcp:isil "NL-AmRMA" .',
        },
        "hcp:wikidataId": {
            "description": "Wikidata Q-number (without wd: prefix)",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "pattern": "^Q[0-9]+$",
            "example": '?s hcp:wikidataId "Q190804" .',
        },
        "skos:prefLabel": {
            "description": "Preferred label/name of the institution",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "example": "?s skos:prefLabel ?name .",
        },
        "schema:name": {
            "description": "Name of the institution",
            "domain": "hc:Custodian",
            "range": "xsd:string",
            "example": "?s schema:name ?name .",
        },
        "schema:addressCountry": {
            "description": "Country as Wikidata entity URI",
            "domain": "hc:Custodian",
            "range": "IRI",
            "example": "?s schema:addressCountry wd:Q55 .  # Netherlands",
        },
        "schema:url": {
            "description": "Website URL",
            "domain": "hc:Custodian",
            "range": "xsd:anyURI",
            "example": "?s schema:url ?website .",
        },
    }

    # SPARQL Keywords
    # Flat list used for keyword completion. Besides query forms and clause
    # keywords it also contains built-in function names and multi-word
    # entries such as "GROUP BY".
    KEYWORDS: List[str] = [
        "SELECT", "CONSTRUCT", "ASK", "DESCRIBE",
        "WHERE", "FILTER", "OPTIONAL", "UNION", "MINUS",
        "GROUP BY", "ORDER BY", "HAVING", "LIMIT", "OFFSET",
        "DISTINCT", "REDUCED", "AS", "BIND", "VALUES",
        "COUNT", "SUM", "AVG", "MIN", "MAX", "SAMPLE", "GROUP_CONCAT",
        "STR", "LANG", "LANGMATCHES", "DATATYPE", "BOUND", "IRI", "URI",
        "BNODE", "RAND", "ABS", "CEIL", "FLOOR", "ROUND",
        "CONCAT", "STRLEN", "UCASE", "LCASE", "ENCODE_FOR_URI",
        "CONTAINS", "STRSTARTS", "STRENDS", "STRBEFORE", "STRAFTER",
        "YEAR", "MONTH", "DAY", "HOURS", "MINUTES", "SECONDS",
        "TIMEZONE", "TZ", "NOW", "UUID", "STRUUID",
        "MD5", "SHA1", "SHA256", "SHA384", "SHA512",
        "COALESCE", "IF", "STRLANG", "STRDT",
        "SAMETERM", "ISIRI", "ISURI", "ISBLANK", "ISLITERAL", "ISNUMERIC",
        "REGEX", "REPLACE", "EXISTS", "NOT EXISTS",
        "PREFIX", "BASE", "FROM", "FROM NAMED", "GRAPH",
        "SERVICE", "SILENT", "IN", "NOT IN",
        "a",  # shorthand for rdf:type
    ]

    # Province codes for filtering
    # Two-letter code -> province name. Validation warns when a province
    # name appears in a query without the matching "NL-<code>" substring.
    DUTCH_PROVINCES: Dict[str, str] = {
        "NH": "Noord-Holland",
        "ZH": "Zuid-Holland",
        "NB": "Noord-Brabant",
        "GE": "Gelderland",
        "UT": "Utrecht",
        "OV": "Overijssel",
        "LI": "Limburg",
        "FR": "Friesland",
        "GR": "Groningen",
        "DR": "Drenthe",
        "FL": "Flevoland",
        "ZE": "Zeeland",
    }

    # Country Wikidata IDs
    # Wikidata Q-id -> country name. Offered as "wd:<Q-id>" completions after
    # addressCountry.
    COUNTRIES: Dict[str, str] = {
        "Q55": "Netherlands",
        "Q17": "Japan",
        "Q213": "Czech Republic",
        "Q31": "Belgium",
        "Q40": "Austria",
        "Q183": "Germany",
        "Q145": "United Kingdom",
        "Q142": "France",
        "Q30": "United States",
    }

    # TypeDB Functions (inference rules)
    # Function name -> parameter list, return type and description, all as
    # display strings.
    TYPEDB_FUNCTIONS: Dict[str, Dict[str, str]] = {
        "get-reconstructions-by-observation-name": {
            "parameters": "$name: string",
            "returns": "{ custodian-reconstruction }",
            "description": "Get all reconstructions derived from observations with given name",
        },
        "get-high-confidence-observations": {
            "parameters": "",
            "returns": "{ custodian-observation }",
            "description": "Get observations that have multiple sources (high confidence)",
        },
        "get-entity-names": {
            "parameters": "$recon: custodian-reconstruction",
            "returns": "{ string }",
            "description": "Get all observed names for a given reconstruction",
        },
        "get-all-descendants": {
            "parameters": "$parent: custodian-reconstruction",
            "returns": "{ custodian-reconstruction }",
            "description": "Get all child organizations recursively",
        },
        "get-name-successors": {
            "parameters": "$name: custodian-name",
            "returns": "{ custodian-name }",
            "description": "Get all successor names in temporal order",
        },
    }


# =============================================================================
# SPARQL-LSP Server
# =============================================================================

class SPARQLLanguageServer:
    """
    SPARQL Language Server implementing LSP-like protocol.

    Provides:
    - Diagnostics (SHACL-based validation)
    - Code completion (prefixes, classes, properties)
    - Hover information (documentation)
    - Signature help (SPARQL functions)
    - Query execution
    - Query explanation
    - Novel connection suggestions
    """

    def __init__(
        self,
        sparql_endpoint: str = "https://bronhouder.nl/sparql",
        qdrant_host: Optional[str] = None,
        typedb_host: Optional[str] = None,
    ):
        self.sparql_endpoint = sparql_endpoint
        self.qdrant_host = qdrant_host
        self.typedb_host = typedb_host
        self.kb = OntologyKnowledgeBase()
        self.initialized = False
        self.documents: Dict[str, TextDocumentItem] = {}

        # Method handlers
        self._methods: Dict[str, Callable] = {
            "initialize": self._handle_initialize,
            "initialized": self._handle_initialized,
            "shutdown": self._handle_shutdown,
            "textDocument/didOpen": self._handle_did_open,
            "textDocument/didChange": self._handle_did_change,
            "textDocument/didClose": self._handle_did_close,
            "textDocument/completion": self._handle_completion,
            "textDocument/hover": self._handle_hover,
            "textDocument/signatureHelp": self._handle_signature_help,
            "sparql/validate": self._handle_validate,
            "sparql/execute": self._handle_execute,
            "sparql/explain": self._handle_explain,
            "sparql/suggest": self._handle_suggest,
        }

    # =========================================================================
    # JSON-RPC Message Handling
    # =========================================================================

    def handle_message(self, message: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Handle incoming JSON-RPC message.

        Args:
            message: JSON-RPC request/notification

        Returns:
            JSON-RPC response (None for notifications)
        """
        try:
            # Validate JSON-RPC format
            if "jsonrpc" not in message or message["jsonrpc"] != "2.0":
                return self._error_response(None, ErrorCode.InvalidRequest, "Invalid JSON-RPC version")

            method = message.get("method")
            params = message.get("params", {})
            msg_id = message.get("id")  # None for notifications

            if not method:
                return self._error_response(msg_id, ErrorCode.InvalidRequest, "Missing method")

            # Find handler
            handler = self._methods.get(method)
            if not handler:
                return self._error_response(msg_id, ErrorCode.MethodNotFound, f"Unknown method: {method}")

            # Check initialization
            if not self.initialized and method not in ("initialize", "initialized", "shutdown"):
                return self._error_response(msg_id, ErrorCode.ServerNotInitialized, "Server not initialized")

            # Call handler
            result = handler(params)

            # Return response (None for notifications)
            if msg_id is not None:
                return self._success_response(msg_id, result)
            return None

        except Exception as e:
            logger.exception(f"Error handling message: {e}")
            return self._error_response(
                message.get("id"),
                ErrorCode.InternalError,
                str(e)
            )

    def _success_response(self, msg_id: Any, result: Any) -> Dict[str, Any]:
        """Create JSON-RPC success response."""
        return {
            "jsonrpc": "2.0",
            "id": msg_id,
            "result": result,
        }

    def _error_response(self, msg_id: Any, code: ErrorCode, message: str) -> Dict[str, Any]:
        """Create JSON-RPC error response."""
        return {
            "jsonrpc": "2.0",
            "id": msg_id,
            "error": {
                "code": code,
                "message": message,
            },
        }

    def _notification(self, method: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """Create JSON-RPC notification (no id)."""
        return {
            "jsonrpc": "2.0",
            "method": method,
            "params": params,
        }

    # =========================================================================
    # LSP Lifecycle Methods
    # =========================================================================

    def _handle_initialize(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Handle initialize request."""
        self.initialized = True

        return {
            "capabilities": {
                "textDocumentSync": {
                    "openClose": True,
                    "change": 1,  # Full sync
                },
                "completionProvider": {
                    "triggerCharacters": [":", "<", "?", "$", '"'],
                    "resolveProvider": False,
                },
                "hoverProvider": True,
                "signatureHelpProvider": {
                    "triggerCharacters": ["(", ","],
                },
                "diagnosticProvider": {
                    "interFileDependencies": False,
                    "workspaceDiagnostics": False,
                },
                # Custom SPARQL capabilities
                "sparqlExecuteProvider": True,
                "sparqlExplainProvider": True,
                "sparqlSuggestProvider": True,
            },
            "serverInfo": {
                "name": "sparql-lsp",
                "version": "1.0.0",
            },
        }

    def _handle_initialized(self, params: Dict[str, Any]) -> None:
        """Handle initialized notification."""
        logger.info("SPARQL-LSP server initialized")
        return None

    def _handle_shutdown(self, params: Dict[str, Any]) -> None:
        """Handle shutdown request."""
        self.initialized = False
        return None

    # =========================================================================
    # Document Sync Methods
    # =========================================================================

    def _handle_did_open(self, params: Dict[str, Any]) -> None:
        """Handle textDocument/didOpen notification."""
        doc = params.get("textDocument", {})
        self.documents[doc["uri"]] = TextDocumentItem(
            uri=doc["uri"],
            languageId=doc.get("languageId", "sparql"),
            version=doc.get("version", 0),
            text=doc.get("text", ""),
        )
        return None

    def _handle_did_change(self, params: Dict[str, Any]) -> None:
        """Handle textDocument/didChange notification."""
        uri = params.get("textDocument", {}).get("uri")
        changes = params.get("contentChanges", [])

        if uri in self.documents and changes:
            # Full sync - take the whole text
            self.documents[uri].text = changes[0].get("text", "")
            self.documents[uri].version = params.get("textDocument", {}).get("version", 0)

        return None

    def _handle_did_close(self, params: Dict[str, Any]) -> None:
        """Handle textDocument/didClose notification."""
        uri = params.get("textDocument", {}).get("uri")
        if uri in self.documents:
            del self.documents[uri]
        return None

    # =========================================================================
    # Diagnostics (SHACL-based Validation)
    # =========================================================================

    def _handle_validate(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Handle sparql/validate request.

        Returns diagnostics for a SPARQL query, plus auto-corrected version if applicable.
        """
        uri = params.get("textDocument", {}).get("uri")
        doc = self.documents.get(uri) if uri else None
        text = params.get("text") or (doc.text if doc else "")

        # Import auto_correct here to avoid circular imports
        try:
            from .sparql_linter import auto_correct_sparql
            corrected_text, was_corrected = auto_correct_sparql(text)
        except ImportError:
            try:
                from sparql_linter import auto_correct_sparql
                corrected_text, was_corrected = auto_correct_sparql(text)
            except ImportError:
                corrected_text, was_corrected = text, False

        # Validate the ORIGINAL query to show what's wrong
        diagnostics = self._validate_sparql(text)

        result: Dict[str, Any] = {
            "uri": uri,
            "diagnostics": [d.to_dict() for d in diagnostics],
        }

        # Include corrected query if auto-correction was applied
        if was_corrected:
            result["corrected_query"] = corrected_text
            result["auto_corrected"] = True

        return result

    def _validate_sparql(self, text: str) -> List[Diagnostic]:
        """Validate SPARQL query and return diagnostics."""
        diagnostics = []
        lines = text.split("\n")

        # Check for deprecated prefixes/classes
        # NOTE: Only flag patterns that are ACTUALLY wrong in our triplestore.
        # DO NOT flag crm:E39_Actor - it works correctly (dual typing with hcc:Custodian)
        # DO NOT suggest hcp: prefix - our ontology uses hc: for BOTH classes and properties
        deprecated_patterns = [
            (r"w3id\.org/heritage/custodian", "WRONG_PREFIX_URI", "Use https://nde.nl/ontology/hc/ prefix"),
            (r'institutionType\s+"Museum"', "WRONG_TYPE_VALUE", 'Use "M" instead of "Museum"'),
            (r'institutionType\s+"Library"', "WRONG_TYPE_VALUE", 'Use "L" instead of "Library"'),
            (r'institutionType\s+"Archive"', "WRONG_TYPE_VALUE", 'Use "A" instead of "Archive"'),
            (r'addressCountry\s+"NL"', "WRONG_COUNTRY_FORMAT", "Use wd:Q55 for Netherlands"),
            # NOTE: The following rules were REMOVED because they broke queries:
            # - crm:E39_Actor works correctly in our triplestore (dual typing)
            # - hc: prefix is used for BOTH classes and properties, hcp: is undefined
            # - Suggesting hc:Custodian is wrong - the correct class is hcc:Custodian
        ]

        for line_num, line in enumerate(lines):
            for pattern, code, message in deprecated_patterns:
                match = re.search(pattern, line, re.IGNORECASE)
                if match:
                    diagnostics.append(Diagnostic(
                        range=Range(
                            start=Position(line_num, match.start()),
                            end=Position(line_num, match.end()),
                        ),
                        message=message,
                        severity=DiagnosticSeverity.Error,
                        code=code,
                    ))

        # Check for syntax issues
        open_braces = text.count("{")
        close_braces = text.count("}")
        if open_braces != close_braces:
            diagnostics.append(Diagnostic(
                range=Range(start=Position(0, 0), end=Position(0, 1)),
                message=f"Unbalanced braces: {open_braces} opening, {close_braces} closing",
                severity=DiagnosticSeverity.Error,
                code="SYNTAX_ERROR",
            ))

        # Check for SELECT without WHERE
        if re.search(r"\bSELECT\b", text, re.IGNORECASE) and not re.search(r"\bWHERE\b", text, re.IGNORECASE):
            diagnostics.append(Diagnostic(
                range=Range(start=Position(0, 0), end=Position(0, 6)),
                message="SELECT query missing WHERE clause",
                severity=DiagnosticSeverity.Error,
                code="MISSING_WHERE",
            ))

        # Warning for province filtering without URI pattern
        for code, name in self.kb.DUTCH_PROVINCES.items():
            if name.lower() in text.lower() and f"NL-{code}" not in text:
                line_num = next((i for i, l in enumerate(lines) if name.lower() in l.lower()), 0)
                diagnostics.append(Diagnostic(
                    range=Range(start=Position(line_num, 0), end=Position(line_num, len(lines[line_num]))),
                    message=f"Province '{name}' - consider URI filtering",
                    severity=DiagnosticSeverity.Warning,
                    code="SUGGEST_URI_FILTER",
                ))

        return diagnostics

    # =========================================================================
    # Completion
    # =========================================================================

    def _handle_completion(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Handle textDocument/completion request."""
        uri = params.get("textDocument", {}).get("uri")
        position = params.get("position", {})

        doc = self.documents.get(uri) if uri else None
        text = doc.text if doc else ""
        line = position.get("line", 0)
        character = position.get("character", 0)

        # Get the current line and context
        lines = text.split("\n")
        current_line = lines[line] if line < len(lines) else ""
        prefix_text = current_line[:character]

        items = []

        # Prefix completion (after PREFIX keyword)
        if re.search(r"PREFIX\s+\w*$", prefix_text, re.IGNORECASE):
            for prefix, info in self.kb.PREFIXES.items():
                items.append(CompletionItem(
                    label=prefix,
                    kind=CompletionItemKind.Module,
                    detail=info["description"],
                    documentation=f"**URI:** `{info['uri']}`\n\n**Example:** `{info['example']}`",
                    insertText=f"{prefix}: <{info['uri']}>",
                ))

        # Class completion (after "a " or "rdf:type")
        elif re.search(r"(\ba\s+|\brdf:type\s+)\w*$", prefix_text, re.IGNORECASE):
            for cls, info in self.kb.CLASSES.items():
                items.append(CompletionItem(
                    label=cls,
                    kind=CompletionItemKind.Class,
                    detail=info["description"],
                    documentation=f"**Properties:** {', '.join(info['properties'])}\n\n**Example:**\n```sparql\n{info['example']}\n```",
                    insertText=cls,
                ))

        # Property completion (after prefix like "hcp:")
        elif re.search(r"(hcp|schema|skos|rdfs|foaf|dct):\w*$", prefix_text):
            prefix_match = re.search(r"(hcp|schema|skos|rdfs|foaf|dct):(\w*)$", prefix_text)
            if prefix_match:
                prefix = prefix_match.group(1)
                for prop, info in self.kb.PROPERTIES.items():
                    if prop.startswith(f"{prefix}:"):
                        prop_name = prop.split(":")[1]
                        items.append(CompletionItem(
                            label=prop_name,
                            kind=CompletionItemKind.Property,
                            detail=info["description"],
                            documentation=f"**Range:** `{info['range']}`\n\n**Example:**\n```sparql\n{info['example']}\n```",
                            insertText=prop_name,
                        ))

        # Institution type value completion (after institutionType)
        elif re.search(r'institutionType\s+"?\w*$', prefix_text):
            for code, name in self.kb.PROPERTIES["hcp:institutionType"]["values"].items():
                items.append(CompletionItem(
                    label=f'"{code}"',
                    kind=CompletionItemKind.Value,
                    detail=name,
                    documentation=f"Institution type code for **{name}**",
                    insertText=f'"{code}"',
                ))

        # Country completion (after addressCountry)
        elif re.search(r"addressCountry\s+\w*$", prefix_text):
            for qid, name in self.kb.COUNTRIES.items():
                items.append(CompletionItem(
                    label=f"wd:{qid}",
                    kind=CompletionItemKind.Value,
                    detail=name,
                    documentation=f"Wikidata entity for **{name}**",
                    insertText=f"wd:{qid}",
                ))

        # Keyword completion
        elif re.search(r"\b[A-Z]*$", prefix_text):
            keyword_prefix = re.search(r"\b([A-Z]*)$", prefix_text)
            if keyword_prefix:
                prefix_str = keyword_prefix.group(1).upper()
                for kw in self.kb.KEYWORDS:
                    if kw.upper().startswith(prefix_str):
                        items.append(CompletionItem(
                            label=kw,
                            kind=CompletionItemKind.Keyword,
                            detail="SPARQL keyword",
                            insertText=kw,
                        ))

        return {
            "isIncomplete": False,
            "items": [item.to_dict() for item in items[:50]],  # Limit to 50 items
        }

    # =========================================================================
    # Hover
    # =========================================================================

    def _handle_hover(self, params: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Serve textDocument/hover: describe the token under the cursor.

        The token is looked up, in this order, as a known class, a known
        property, a declared prefix and finally a Wikidata country entity;
        the first hit produces the hover.

        Args:
            params: Request params. ``textDocument.uri`` selects the open
                document and ``position`` (``line`` / ``character``, both
                defaulting to 0) selects the cursor location.

        Returns:
            A serialized ``Hover`` whose range spans the token, or ``None``
            when no token touches the cursor or the token is unknown.
        """
        doc_uri = params.get("textDocument", {}).get("uri")
        cursor = params.get("position", {})

        document = self.documents.get(doc_uri) if doc_uri else None
        source = document.text if document else ""
        line = cursor.get("line", 0)
        character = cursor.get("character", 0)

        rows = source.split("\n")
        current_line = rows[line] if line < len(rows) else ""

        # A token is a run of word characters and colons, which keeps prefixed
        # names such as "wd:Q55" in one piece. The cursor may rest on either
        # edge of the token.
        token = next(
            (
                candidate
                for candidate in re.finditer(r"[\w:]+", current_line)
                if candidate.start() <= character <= candidate.end()
            ),
            None,
        )
        if token is None:
            return None

        word = token.group()

        def render(markdown: str) -> Dict[str, Any]:
            # Every hover produced here highlights exactly the matched token.
            span = Range(
                start=Position(line, token.start()),
                end=Position(line, token.end()),
            )
            return Hover(contents=markdown, range=span).to_dict()

        # 1) Ontology class
        if word in self.kb.CLASSES:
            info = self.kb.CLASSES[word]
            return render(
                f"### {word}\n\n{info['description']}\n\n**Properties:** {', '.join(info['properties'])}\n\n```sparql\n{info['example']}\n```"
            )

        # 2) Ontology property, with its enumerated values when it has any
        if word in self.kb.PROPERTIES:
            info = self.kb.PROPERTIES[word]
            sections = [f"### {word}\n\n{info['description']}\n\n**Range:** `{info['range']}`"]
            if "values" in info:
                sections.append("\n\n**Valid values:**\n")
                sections.extend(
                    f"- `\"{code}\"` = {name}\n" for code, name in info["values"].items()
                )
            sections.append(f"\n\n```sparql\n{info['example']}\n```")
            return render("".join(sections))

        # 3) Namespace prefix (a trailing colon on the token is ignored)
        prefix = word.rstrip(":")
        if prefix in self.kb.PREFIXES:
            info = self.kb.PREFIXES[prefix]
            return render(
                f"### PREFIX {prefix}:\n\n{info['description']}\n\n**URI:** `{info['uri']}`\n\n**Example:** `{info['example']}`"
            )

        # 4) Wikidata country, written either as "wd:Q…" or as a bare "Q…"
        if word.startswith(("wd:Q", "Q")):
            qid = word.replace("wd:", "")
            if qid in self.kb.COUNTRIES:
                return render(
                    f"### {word}\n\n**Country:** {self.kb.COUNTRIES[qid]}\n\nWikidata entity for country filtering."
                )

        return None

    # =========================================================================
    # Signature Help
    # =========================================================================

    def _handle_signature_help(self, params: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Serve textDocument/signatureHelp for a small set of SPARQL functions.

        Help is offered only when the text of the current line up to the
        cursor ends with ``NAME(`` (optional whitespace before the
        parenthesis, name matched case-insensitively).

        Args:
            params: Request params. ``textDocument.uri`` selects the open
                document and ``position`` (``line`` / ``character``, both
                defaulting to 0) selects the cursor location.

        Returns:
            A serialized ``SignatureHelp`` with a single signature, or
            ``None`` when the cursor does not directly follow a known
            function's opening parenthesis.
        """
        doc_uri = params.get("textDocument", {}).get("uri")
        cursor = params.get("position", {})

        document = self.documents.get(doc_uri) if doc_uri else None
        source = document.text if document else ""
        line = cursor.get("line", 0)
        character = cursor.get("character", 0)

        rows = source.split("\n")
        current_line = rows[line] if line < len(rows) else ""
        prefix_text = current_line[:character]

        def entry(signature: str, documentation: str, *parameters: Any) -> Dict[str, Any]:
            # Each parameter arrives as a (label, documentation) pair.
            return {
                "signature": signature,
                "documentation": documentation,
                "parameters": [
                    {"label": label, "documentation": text} for label, text in parameters
                ],
            }

        # Aggregates first, then the filter/string/binding helpers.
        catalog: Dict[str, Dict[str, Any]] = {
            "COUNT": entry(
                "COUNT(expression) or COUNT(DISTINCT expression)",
                "Returns the count of bindings. Use DISTINCT to count unique values.",
                ("expression", "Variable or expression to count"),
            ),
            "SUM": entry(
                "SUM(expression)",
                "Returns the sum of numeric values.",
                ("expression", "Numeric variable or expression"),
            ),
            "AVG": entry(
                "AVG(expression)",
                "Returns the average of numeric values.",
                ("expression", "Numeric variable or expression"),
            ),
            "MIN": entry(
                "MIN(expression)",
                "Returns the minimum value.",
                ("expression", "Variable or expression"),
            ),
            "MAX": entry(
                "MAX(expression)",
                "Returns the maximum value.",
                ("expression", "Variable or expression"),
            ),
            "FILTER": entry(
                "FILTER(condition)",
                "Filters results based on a boolean condition.",
                ("condition", "Boolean expression (e.g., CONTAINS(?name, 'Museum'))"),
            ),
            "CONTAINS": entry(
                "CONTAINS(string, substring)",
                "Returns true if string contains substring. Use with STR() for URIs.",
                ("string", "String to search in"),
                ("substring", "String to search for"),
            ),
            "STR": entry(
                "STR(term)",
                "Converts a term (URI, literal) to its string representation.",
                ("term", "URI or literal to convert"),
            ),
            "BIND": entry(
                "BIND(expression AS ?variable)",
                "Binds the result of an expression to a variable.",
                ("expression", "Expression to evaluate"),
                ("variable", "Variable to bind to"),
            ),
        }

        # First table entry whose "NAME(" pattern closes the text before the cursor.
        active = next(
            (
                info
                for name, info in catalog.items()
                if re.search(rf"\b{name}\s*\($", prefix_text, re.IGNORECASE)
            ),
            None,
        )
        if active is None:
            return None

        return SignatureHelp(
            signatures=[SignatureInformation(
                label=active["signature"],
                documentation=active["documentation"],
                parameters=active["parameters"],
            )],
        ).to_dict()

    # =========================================================================
    # SPARQL Execution
    # =========================================================================

    def _handle_execute(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Serve sparql/execute: POST the query to the configured endpoint.

        The query text is sent as an ``application/sparql-query`` body to
        ``self.sparql_endpoint`` with a 30 second timeout, asking for JSON.

        Args:
            params: Request params; ``query`` holds the SPARQL text
                (defaults to an empty string).

        Returns:
            A serialized ``SPARQLExecuteResult``. On HTTP 200 it carries the
            decoded JSON body; on any other status it carries
            ``"HTTP <code>: <body>"`` as the error; on a transport or decoding
            failure it carries the exception text. The elapsed time in
            milliseconds is reported in all three cases.
        """
        import time
        import httpx

        query = params.get("query", "")

        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by wall-clock adjustments the way time.time() differences can.
        started = time.perf_counter()

        def elapsed_ms() -> float:
            return (time.perf_counter() - started) * 1000

        try:
            response = httpx.post(
                self.sparql_endpoint,
                content=query,
                headers={
                    "Content-Type": "application/sparql-query",
                    "Accept": "application/json",
                },
                timeout=30.0,
            )

            execution_time = elapsed_ms()

            if response.status_code == 200:
                return SPARQLExecuteResult(
                    success=True,
                    results=response.json(),
                    executionTimeMs=execution_time,
                ).to_dict()

            return SPARQLExecuteResult(
                success=False,
                error=f"HTTP {response.status_code}: {response.text}",
                executionTimeMs=execution_time,
            ).to_dict()

        except Exception as e:
            # Request-handler boundary: any failure (connection, timeout,
            # invalid JSON body) is reported to the client as a structured
            # error instead of propagating. Timing is included so a timeout
            # can be told apart from an immediate connection failure.
            return SPARQLExecuteResult(
                success=False,
                error=str(e),
                executionTimeMs=elapsed_ms(),
            ).to_dict()

    # =========================================================================
    # SPARQL Explanation
    # =========================================================================

    def _handle_explain(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Serve sparql/explain: describe a query using text heuristics.

        The query is never parsed; each step is produced by a substring or
        regex probe against the raw text, so keywords inside comments or
        string literals are also picked up.

        Args:
            params: Request params; ``query`` holds the SPARQL text
                (defaults to an empty string).

        Returns:
            A serialized ``SPARQLExplanation`` with a one-line summary, the
            list of detected steps (``["Basic query pattern"]`` when nothing
            was detected), an estimated complexity of ``"simple"``,
            ``"moderate"`` or ``"complex"``, and optional optimization hints.
        """
        query = params.get("query", "")

        def has(pattern: str) -> bool:
            # Case-insensitive probe against the whole query text.
            return re.search(pattern, query, re.IGNORECASE) is not None

        steps: List[str] = []
        complexity = "simple"
        optimizations: List[str] = []

        # Query form. Only the first matching form is reported; DESCRIBE is
        # probed last so queries recognised before keep their explanation.
        is_select = has(r"\bSELECT\b")
        if is_select:
            steps.append("SELECT query - retrieves specific variables")
        elif has(r"\bCONSTRUCT\b"):
            steps.append("CONSTRUCT query - builds RDF graph")
            complexity = "moderate"
        elif has(r"\bASK\b"):
            steps.append("ASK query - returns boolean result")
        elif has(r"\bDESCRIBE\b"):
            steps.append("DESCRIBE query - returns RDF describing resources")

        # Ontology-specific triple patterns
        if "a hc:Custodian" in query or "rdf:type hc:Custodian" in query:
            steps.append("Filters to heritage custodian institutions")

        if "hcp:institutionType" in query:
            # Type codes are single upper-case letters in a quoted literal.
            type_match = re.search(r'institutionType\s+"([A-Z])"', query)
            if type_match:
                type_code = type_match.group(1)
                type_name = self.kb.PROPERTIES["hcp:institutionType"]["values"].get(type_code, "Unknown")
                steps.append(f"Filters by institution type: {type_name} ({type_code})")

        if "schema:addressCountry" in query:
            country_match = re.search(r"addressCountry\s+wd:(Q\d+)", query)
            if country_match:
                qid = country_match.group(1)
                country_name = self.kb.COUNTRIES.get(qid, "Unknown")
                steps.append(f"Filters by country: {country_name}")

        if re.search(r"FILTER.*CONTAINS.*STR.*NL-[A-Z]{2}", query):
            province_match = re.search(r"NL-([A-Z]{2})", query)
            if province_match:
                code = province_match.group(1)
                province_name = self.kb.DUTCH_PROVINCES.get(code, "Unknown")
                steps.append(f"Filters by Dutch province: {province_name} ({code})")

        # Solution modifiers and graph-pattern operators. The "moderate"
        # checks come before the "complex" ones so the rating only escalates.
        # \s+ lets GROUP and BY be separated by any whitespace, including a
        # line break, as the SPARQL grammar permits.
        if has(r"\bGROUP\s+BY\b"):
            steps.append("Groups results for aggregation")
            complexity = "moderate"

        if has(r"\bCOUNT\b"):
            steps.append("Counts matching results")

        if has(r"\bOPTIONAL\b"):
            steps.append("Includes optional patterns (may return nulls)")
            complexity = "moderate"

        if has(r"\bUNION\b"):
            steps.append("Combines multiple patterns with UNION")
            complexity = "complex"

        if has(r"\bSERVICE\b"):
            steps.append("Federated query to external endpoint")
            complexity = "complex"
            optimizations.append("Federated queries can be slow - consider caching results")

        # The summary reuses the word-boundary SELECT probe so it agrees with
        # the steps (a variable such as ?selected no longer counts as SELECT).
        summary = f"This SPARQL query {'retrieves' if is_select else 'processes'} data from the Heritage Custodian knowledge graph."

        return SPARQLExplanation(
            summary=summary,
            steps=steps if steps else ["Basic query pattern"],
            estimatedComplexity=complexity,
            suggestedOptimizations=optimizations if optimizations else None,
        ).to_dict()

    # =========================================================================
    # Novel Connection Suggestions (Vector DB Integration)
    # =========================================================================

    def _handle_suggest(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Handle sparql/suggest request.

        Uses vector database to suggest novel connections that could
        enhance the SPARQL query.
        """
        query = params.get("query", "")
        context = params.get("context", "")

        suggestions: List[SPARQLSuggestion] = []

        # Extract entities from query
        entities: List[str] = []
        if "skos:prefLabel" in query or "schema:name" in query:
            # Query is looking for names - suggest related entities
            suggestions.append(SPARQLSuggestion(
                type="relationship",
                description="Consider adding organizational hierarchy",
                sparqlFragment="OPTIONAL { ?s schema:containedInPlace ?parent . ?parent skos:prefLabel ?parentName . }",
                confidence=0.7,
                source="inference",
            ))

        if "hcp:institutionType" in query:
            # Query filters by type - suggest cross-type relationships
            suggestions.append(SPARQLSuggestion(
                type="pattern",
                description="Find related institutions in same location",
                sparqlFragment="""
OPTIONAL {
    ?related a hc:Custodian ;
             schema:containedInPlace ?location .
    ?s schema:containedInPlace ?location .
    FILTER(?related != ?s)
}""",
                confidence=0.6,
                source="inference",
            ))

        # If we have Qdrant configured, query for semantic suggestions
        if self.qdrant_host:
            # This would integrate with the actual Qdrant retriever
            suggestions.append(SPARQLSuggestion(
                type="entity",
                description="Semantically similar institutions found in vector index",
                sparqlFragment="# Use vector similarity to find: [entities from Qdrant]",
                confidence=0.8,
                source="qdrant",
            ))

        # If we have TypeDB configured, suggest inference patterns
        if self.typedb_host:
            suggestions.append(SPARQLSuggestion(
                type="relationship",
                description="TypeDB can infer: observation → reconstruction chains",
                sparqlFragment="# Consider TypeQL: get-reconstructions-by-observation-name($name)",
                confidence=0.9,
                source="typedb",
            ))

        return {
            "suggestions": [s.to_dict() for s in suggestions],
        }


# =============================================================================
# Convenience Functions for AI Agent Integration
# =============================================================================

def create_lsp_request(method: str, params: Dict[str, Any], request_id: Optional[int] = None) -> Dict[str, Any]:
    """
    Build a JSON-RPC 2.0 message addressed to the SPARQL-LSP server.

    Args:
        method: LSP method name (e.g., "sparql/validate")
        params: Method parameters, embedded as-is under "params"
        request_id: Request ID; leave as None to build a notification,
            which carries no "id" member

    Returns:
        Dictionary with "jsonrpc", "method", "params" and, for requests, "id"
    """
    envelope: Dict[str, Any] = {"jsonrpc": "2.0", "method": method, "params": params}
    # Compare against None explicitly so that an id of 0 is still emitted.
    return envelope if request_id is None else {**envelope, "id": request_id}


def validate_sparql_query(query: str, server: Optional[SPARQLLanguageServer] = None) -> Dict[str, Any]:
    """
    Validate a SPARQL query through a single "sparql/validate" request.

    Args:
        query: SPARQL query string
        server: LSP server to use; when omitted a fresh one is created and
            flagged as initialized without an "initialize" handshake

    Returns:
        The "result" member of the server's response, or an empty dict when
        the server returns no response or the response has no "result"
    """
    if server is None:
        server = SPARQLLanguageServer()
        server.initialized = True

    reply = server.handle_message(
        create_lsp_request("sparql/validate", {"text": query}, request_id=1)
    )
    return {} if reply is None else reply.get("result", {})


def get_sparql_completions(query: str, line: int, character: int, server: Optional[SPARQLLanguageServer] = None) -> List[Dict[str, Any]]:
    """
    Fetch completion items for a position inside a SPARQL query.

    The query is first opened on the server as the in-memory document
    "inmemory://query.sparql" (via a didOpen notification), then a
    "textDocument/completion" request is issued for the given position.

    Args:
        query: SPARQL query string
        line: Line number (0-indexed)
        character: Character position (0-indexed)
        server: LSP server to use; when omitted a fresh one is created and
            flagged as initialized without an "initialize" handshake

    Returns:
        The "items" list of the completion result, or an empty list when the
        server returns no response or the result carries no items
    """
    if server is None:
        server = SPARQLLanguageServer()
        server.initialized = True

    doc_uri = "inmemory://query.sparql"

    # Step 1: register the query text as an open document (notification, no id).
    opened_document = {
        "uri": doc_uri,
        "languageId": "sparql",
        "version": 1,
        "text": query,
    }
    server.handle_message(
        create_lsp_request("textDocument/didOpen", {"textDocument": opened_document})
    )

    # Step 2: ask for completions at the requested cursor position.
    cursor = {"line": line, "character": character}
    reply = server.handle_message(
        create_lsp_request(
            "textDocument/completion",
            {"textDocument": {"uri": doc_uri}, "position": cursor},
            request_id=1,
        )
    )
    if reply is None:
        return []
    return reply.get("result", {}).get("items", [])


# =============================================================================
# Example Usage
# =============================================================================

if __name__ == "__main__":
    # Demo walkthrough: initialize, validate a flawed query, request
    # completions for a partial query, then explain a well-formed query.
    server = SPARQLLanguageServer()

    def _rpc(method, params, msg_id=None):
        """Send one JSON-RPC message to the demo server; no id means notification."""
        message = {"jsonrpc": "2.0"}
        if msg_id is not None:
            message["id"] = msg_id
        message.update(method=method, params=params)
        return server.handle_message(message)

    def _show(label, payload):
        """Pretty-print a server response under the given label."""
        print(label, json.dumps(payload, indent=2))

    # 1. Handshake
    _show("Initialize:", _rpc("initialize", {}, msg_id=1))

    # 2. Validation of a query written against the wrong vocabulary
    flawed_query = """
    PREFIX hc: <https://w3id.org/heritage/custodian/>
    SELECT ?museum ?name WHERE {
        ?museum a crm:E39_Actor ;
                hc:institutionType "Museum" ;
                schema:addressCountry "NL" ;
                skos:prefLabel ?name .
    }
    """
    _show("\nValidation:", _rpc("sparql/validate", {"text": flawed_query}, msg_id=2))

    # 3. Completions right after the "hcp:" prefix (column 25 is end of line)
    _rpc("textDocument/didOpen", {
        "textDocument": {
            "uri": "test://query.sparql",
            "languageId": "sparql",
            "version": 1,
            "text": "SELECT ?s WHERE { ?s hcp:",
        }
    })
    _show("\nCompletions:", _rpc("textDocument/completion", {
        "textDocument": {"uri": "test://query.sparql"},
        "position": {"line": 0, "character": 25},
    }, msg_id=3))

    # 4. Explanation of a counting query
    sample_query = """
    PREFIX hc: <https://nde.nl/ontology/hc/class/>
    PREFIX hcp: <https://nde.nl/ontology/hc/>
    SELECT (COUNT(?s) as ?count) WHERE {
        ?s a hc:Custodian ;
           hcp:institutionType "M" ;
           schema:addressCountry wd:Q55 .
        FILTER(CONTAINS(STR(?s), "NL-NH"))
    }
    """
    _show("\nExplanation:", _rpc("sparql/explain", {"query": sample_query}, msg_id=4))