# glam/scripts/generate_sparql_validation_rules.py
#!/usr/bin/env python3
"""
Generate SPARQL validation rules from LinkML schema.
This script reads the LinkML schema and generates a JSON file containing:
1. Valid enum values (e.g., institution types: M, L, A, G...)
2. Slot patterns (e.g., ISO 3166-2 format for subregions)
3. Valid prefixes and their URIs
4. Property mappings (slot_uri mappings)
The generated JSON is used by sparql_linter.py for SPARQL auto-correction.
SINGLE SOURCE OF TRUTH: LinkML schema → Generated rules → SPARQL validation
Usage:
python scripts/generate_sparql_validation_rules.py
Output:
data/validation/sparql_validation_rules.json
"""
import json
import re
import sys
from pathlib import Path
from typing import Any
import yaml
# Project root: this script lives in scripts/, so the repo root is two levels up.
PROJECT_ROOT = Path(__file__).parent.parent
# Dated LinkML schema snapshot — the single source of truth for valid values.
LINKML_DIR = PROJECT_ROOT / "schemas" / "20251121" / "linkml"
# Generated rules file consumed by sparql_linter.py.
OUTPUT_FILE = PROJECT_ROOT / "data" / "validation" / "sparql_validation_rules.json"
def load_yaml_file(path: Path) -> dict:
    """Parse *path* as YAML; an empty document yields an empty dict."""
    parsed = yaml.safe_load(path.read_text())
    return parsed if parsed else {}
def extract_enum_values(enum_file: Path) -> dict[str, Any]:
    """Collect permissible values and their metadata for every enum in *enum_file*.

    Returns a dict keyed by enum name, each entry holding the list of value
    names plus a per-value mapping of "meaning" (e.g. a Wikidata ID) and
    "description".
    """
    schema = load_yaml_file(enum_file)
    extracted: dict[str, Any] = {}
    for enum_name, definition in schema.get("enums", {}).items():
        names: list[str] = []
        per_value: dict[str, dict] = {}
        for pv_name, pv_def in definition.get("permissible_values", {}).items():
            names.append(pv_name)
            # A permissible value may be a bare key (no body) — treat that as
            # having no meaning/description.
            if isinstance(pv_def, dict):
                meaning = pv_def.get("meaning")
                description = pv_def.get("description", "")
            else:
                meaning = None
                description = ""
            per_value[pv_name] = {
                "meaning": meaning,
                "description": description,
            }
        extracted[enum_name] = {
            "values": names,
            "mappings": per_value,
        }
    return extracted
def extract_slot_patterns(slot_file: Path) -> dict[str, Any]:
    """Pull slot_uri/range/pattern/description for each slot in *slot_file*.

    Slots whose definition carries none of those fields (or is not a mapping)
    are skipped.
    """
    schema = load_yaml_file(slot_file)
    extracted: dict[str, Any] = {}
    for slot_name, definition in schema.get("slots", {}).items():
        if not isinstance(definition, dict):
            continue
        info = {
            "slot_uri": definition.get("slot_uri"),
            "range": definition.get("range"),
            "pattern": definition.get("pattern"),
            "description": definition.get("description", ""),
        }
        # Keep the slot only when at least one field is populated.
        if any(info.values()):
            extracted[slot_name] = info
    return extracted
def generate_institution_type_mappings() -> dict:
    """Map institution-type words to their single-letter codes.

    Based on CustodianPrimaryTypeEnum from LinkML.  The four most common
    types also get Capitalized and lowercase spellings, since LLMs emit
    them in mixed case.
    """
    mappings: dict[str, str] = {}
    # Common types: accept UPPER, Capitalized, and lower spellings.
    for word, code in (("MUSEUM", "M"), ("LIBRARY", "L"),
                       ("ARCHIVE", "A"), ("GALLERY", "G")):
        mappings[word] = code
        mappings[word.capitalize()] = code
        mappings[word.lower()] = code
    # Remaining enum members: canonical UPPER_SNAKE form only.
    mappings.update({
        "OFFICIAL_INSTITUTION": "O",
        "RESEARCH_CENTER": "R",
        "COMMERCIAL": "C",
        "UNSPECIFIED": "U",
        "BIO_CUSTODIAN": "B",
        "EDUCATION_PROVIDER": "E",
        "HERITAGE_SOCIETY": "S",
        "FEATURE_CUSTODIAN": "F",
        "INTANGIBLE_HERITAGE_GROUP": "I",
        "MIXED": "X",
        "PERSONAL_COLLECTION": "P",
        "HOLY_SACRED_SITE": "H",
        "DIGITAL_PLATFORM": "D",
        "NON_PROFIT": "N",
        "TASTE_SCENT_HERITAGE": "T",
    })
    return mappings
def generate_subregion_mappings() -> dict:
    """
    Map lowercase province/region names to ISO 3166-2 subdivision codes.

    These are the spellings (Dutch, English, hyphenated, spaced,
    concatenated) that LLMs most commonly emit.  Format follows
    ISO 3166-2: {country}-{subdivision}.

    Returns:
        dict mapping lowercase name variant -> ISO 3166-2 code.
    """
    return {
        # Dutch provinces (NL-XX)
        "noord-holland": "NL-NH",
        "noord holland": "NL-NH",
        "noordholland": "NL-NH",
        # English variants for Noord-Holland, matching the English variants
        # already provided for Zuid-Holland and Noord-Brabant.
        "north-holland": "NL-NH",
        "north holland": "NL-NH",
        "south-holland": "NL-ZH",
        "south holland": "NL-ZH",
        "zuid-holland": "NL-ZH",
        "zuid holland": "NL-ZH",
        "zuidholland": "NL-ZH",
        "noord-brabant": "NL-NB",
        "noord brabant": "NL-NB",
        "north brabant": "NL-NB",
        "gelderland": "NL-GE",
        "utrecht": "NL-UT",
        "overijssel": "NL-OV",
        "limburg": "NL-LI",
        "friesland": "NL-FR",
        "frisia": "NL-FR",
        "groningen": "NL-GR",
        "drenthe": "NL-DR",
        "flevoland": "NL-FL",
        "zeeland": "NL-ZE",
        # German states (DE-XX) - common ones
        "bavaria": "DE-BY",
        "bayern": "DE-BY",
        "berlin": "DE-BE",
        "baden-württemberg": "DE-BW",
        "north rhine-westphalia": "DE-NW",
        "nordrhein-westfalen": "DE-NW",
        "saxony": "DE-SN",
        "sachsen": "DE-SN",
        "hesse": "DE-HE",
        "hessen": "DE-HE",
        # Belgian regions (BE-XX)
        "flanders": "BE-VLG",
        "vlaanderen": "BE-VLG",
        "wallonia": "BE-WAL",
        "wallonie": "BE-WAL",
        "brussels": "BE-BRU",
        "bruxelles": "BE-BRU",
    }
def generate_country_mappings() -> dict:
    """Map ISO country codes to prefixed Wikidata entity URIs.

    schema:addressCountry should use Wikidata entity URIs (e.g. wd:Q55),
    not bare ISO codes, so the linter rewrites codes via this table.
    """
    wikidata_ids = (
        ("NL", "Q55"),   # Netherlands
        ("DE", "Q183"),  # Germany
        ("BE", "Q31"),   # Belgium
        ("FR", "Q142"),  # France
        ("GB", "Q145"),  # United Kingdom
        ("UK", "Q145"),  # United Kingdom (alias)
        ("US", "Q30"),   # United States
        ("JP", "Q17"),   # Japan
        ("CZ", "Q213"),  # Czech Republic
        ("AT", "Q40"),   # Austria
        ("CH", "Q39"),   # Switzerland
        ("IT", "Q38"),   # Italy
        ("ES", "Q29"),   # Spain
        ("PL", "Q36"),   # Poland
        ("PT", "Q45"),   # Portugal
        ("BR", "Q155"),  # Brazil
        ("MX", "Q96"),   # Mexico
        ("CA", "Q16"),   # Canada
        ("AU", "Q408"),  # Australia
        ("IN", "Q668"),  # India
        ("CN", "Q148"),  # China
    )
    return {code: f"wd:{qid}" for code, qid in wikidata_ids}
def generate_valid_prefixes() -> dict:
    """Return the valid SPARQL prefix -> namespace URI table.

    Taken from the LinkML schema prefixes.
    """
    pairs = (
        ("hc", "https://nde.nl/ontology/hc/class/"),
        ("hcp", "https://nde.nl/ontology/hc/"),
        ("schema", "http://schema.org/"),
        ("skos", "http://www.w3.org/2004/02/skos/core#"),
        ("rdfs", "http://www.w3.org/2000/01/rdf-schema#"),
        ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
        ("wd", "http://www.wikidata.org/entity/"),
        ("wdt", "http://www.wikidata.org/prop/direct/"),
        ("foaf", "http://xmlns.com/foaf/0.1/"),
        ("dct", "http://purl.org/dc/terms/"),
        ("dcterms", "http://purl.org/dc/terms/"),
        ("owl", "http://www.w3.org/2002/07/owl#"),
        ("xsd", "http://www.w3.org/2001/XMLSchema#"),
        ("org", "http://www.w3.org/ns/org#"),
        ("prov", "http://www.w3.org/ns/prov#"),
    )
    return dict(pairs)
def generate_invalid_patterns() -> list[dict]:
    """List regex patterns for known-invalid SPARQL constructs.

    These are common LLM mistakes that the linter auto-corrects; each entry
    carries the regex, a human-readable description, and a correction type.
    """
    rules: list[dict] = []
    # Both the hcp: and hc: spellings of the non-existent custodian_type
    # property get the same correction.
    for ns in ("hcp", "hc"):
        rules.append({
            "pattern": rf"{ns}:custodian_type\s+hc:(\w+)",
            "description": (
                f"{ns}:custodian_type does not exist; "
                "use hcp:institutionType with single-letter codes"
            ),
            "correction_type": "institution_type_class_to_code",
        })
    rules.append({
        "pattern": r"\bcrm:P\d+[a-zA-Z_]*\b",
        "description": "CIDOC-CRM properties (crm:P*) are not used in this ontology",
        "correction_type": "remove_triple_pattern",
    })
    rules.append({
        "pattern": r"PREFIX\s+hc:\s*<https?://w3id\.org/heritage/custodian/?>",
        "description": "Wrong prefix URI for hc:",
        "correction_type": "fix_prefix_uri",
        "replacement": "PREFIX hc: <https://nde.nl/ontology/hc/class/>",
    })
    return rules
def generate_property_mappings() -> dict:
    """Map property names to their slot_uri metadata.

    Correct properties carry their slot_uri plus constraints; known-bad
    property names (common LLM mistakes) carry an error and the property
    to use instead.
    """
    # Single-letter codes of CustodianPrimaryTypeEnum, in schema order.
    type_codes = list("MLAGORCUBESFIXPHDNT")
    correct = {
        "hcp:institutionType": {
            "slot_uri": "org:classification",
            "range": "CustodianPrimaryTypeEnum",
            "valid_values": type_codes,
        },
        "hcp:ghcid": {
            "slot_uri": "https://nde.nl/ontology/hc/ghcid",
            "pattern": r"^[A-Z]{2}-[A-Z]{2,3}-[A-Z]{2,4}-[A-Z]-[A-Z0-9]+$",
        },
        "hcp:isil": {
            "slot_uri": "https://nde.nl/ontology/hc/isil",
            "pattern": r"^[A-Z]{2}-[A-Za-z0-9]+$",
        },
        "schema:name": {
            "slot_uri": "schema:name",
            "datatype": "string",
        },
        "schema:addressCountry": {
            "slot_uri": "schema:addressCountry",
            "nodeKind": "IRI",
            "description": "Country as Wikidata entity URI (e.g., wd:Q55)",
        },
    }
    # Non-existent properties that LLMs frequently invent.
    broken = {
        name: {
            "error": "Property does not exist",
            "correct_property": "hcp:institutionType",
        }
        for name in ("hcp:custodian_type", "hc:custodian_type")
    }
    return correct | broken
def main():
    """Generate validation rules from the LinkML schema and write them as JSON.

    Reads enum and slot definitions from LINKML_DIR, combines them with the
    hand-maintained mapping tables in this module, and writes the result to
    OUTPUT_FILE for consumption by sparql_linter.py.
    """
    print("Generating SPARQL validation rules from LinkML schema...")
    # Collect every enum definition shipped in the enums module.
    enums = {}
    enum_dir = LINKML_DIR / "modules" / "enums"
    if enum_dir.exists():
        for enum_file in enum_dir.glob("*.yaml"):
            print(f" Loading enum: {enum_file.name}")
            enums.update(extract_enum_values(enum_file))
    # Only a curated subset of slots matters for SPARQL validation.
    slots = {}
    slot_dir = LINKML_DIR / "modules" / "slots"
    if slot_dir.exists():
        important_slots = ["custodian_type", "country", "subregion", "iso_3166_2_code"]
        for slot_name in important_slots:
            slot_file = slot_dir / f"{slot_name}.yaml"
            if slot_file.exists():
                print(f" Loading slot: {slot_file.name}")
                slots.update(extract_slot_patterns(slot_file))
    # Assemble the complete validation-rules document.
    rules = {
        "_metadata": {
            "generated_from": "LinkML schema at schemas/20251121/linkml/",
            "purpose": "SPARQL query validation and auto-correction",
            "single_source_of_truth": "LinkML schema defines all valid values",
        },
        "enums": enums,
        "slots": slots,
        "prefixes": generate_valid_prefixes(),
        "institution_type_mappings": generate_institution_type_mappings(),
        "subregion_mappings": generate_subregion_mappings(),
        "country_mappings": generate_country_mappings(),
        "property_mappings": generate_property_mappings(),
        "invalid_patterns": generate_invalid_patterns(),
    }
    # Ensure output directory exists
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 with ensure_ascii=False so non-ASCII keys (e.g.
    # "baden-württemberg") stay human-readable and the result does not
    # depend on the platform's locale encoding.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(rules, f, indent=2, ensure_ascii=False)
    print(f"\nGenerated validation rules: {OUTPUT_FILE}")
    print(f" - {len(enums)} enums")
    print(f" - {len(slots)} slots")
    print(f" - {len(rules['institution_type_mappings'])} institution type mappings")
    print(f" - {len(rules['subregion_mappings'])} subregion mappings")
    print(f" - {len(rules['country_mappings'])} country mappings")
# Allow direct execution: python scripts/generate_sparql_validation_rules.py
if __name__ == "__main__":
    main()