#!/usr/bin/env python3
"""Generate SPARQL validation rules from LinkML schema.

This script reads the LinkML schema and generates a JSON file containing:

1. Valid enum values (e.g., institution types: M, L, A, G...)
2. Slot patterns (e.g., ISO 3166-2 format for subregions)
3. Valid prefixes and their URIs
4. Property mappings (slot_uri mappings)

The generated JSON is used by sparql_linter.py for SPARQL auto-correction.

SINGLE SOURCE OF TRUTH: LinkML schema → Generated rules → SPARQL validation

Usage:
    python scripts/generate_sparql_validation_rules.py

Output:
    data/validation/sparql_validation_rules.json
"""

import json
import re
import sys
from pathlib import Path
from typing import Any

# Project root (this script is expected to live in <root>/scripts/)
PROJECT_ROOT = Path(__file__).parent.parent
LINKML_DIR = PROJECT_ROOT / "schemas" / "20251121" / "linkml"
OUTPUT_FILE = PROJECT_ROOT / "data" / "validation" / "sparql_validation_rules.json"


def load_yaml_file(path: Path) -> dict:
    """Load a YAML file.

    Returns an empty dict for an empty document (``yaml.safe_load`` yields
    ``None`` in that case).
    """
    # Imported lazily so the module — and its pure mapping helpers — can be
    # imported without PyYAML installed.
    import yaml

    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f) or {}


def extract_enum_values(enum_file: Path) -> dict[str, Any]:
    """Extract enum values and their mappings from a LinkML enum file.

    Returns a dict keyed by enum name, each entry carrying the list of
    permissible value names plus per-value ``meaning``/``description``.
    """
    data = load_yaml_file(enum_file)
    result = {}

    for enum_name, enum_def in data.get("enums", {}).items():
        values = []
        value_mappings = {}  # Full name -> code mappings
        for value_name, value_def in enum_def.get("permissible_values", {}).items():
            values.append(value_name)
            # Extract meaning (Wikidata ID if present). Permissible values may
            # be bare keys (value_def is None), hence the isinstance guards.
            meaning = value_def.get("meaning") if isinstance(value_def, dict) else None
            description = value_def.get("description", "") if isinstance(value_def, dict) else ""
            value_mappings[value_name] = {
                "meaning": meaning,
                "description": description,
            }
        result[enum_name] = {
            "values": values,
            "mappings": value_mappings,
        }

    return result


def extract_slot_patterns(slot_file: Path) -> dict[str, Any]:
    """Extract slot patterns and constraints from a LinkML slot file.

    Only slots that carry at least one non-empty field (slot_uri, range,
    pattern, description) are included in the result.
    """
    data = load_yaml_file(slot_file)
    result = {}

    for slot_name, slot_def in data.get("slots", {}).items():
        if not isinstance(slot_def, dict):
            continue
        slot_info = {
            "slot_uri": slot_def.get("slot_uri"),
            "range": slot_def.get("range"),
            "pattern": slot_def.get("pattern"),
            "description": slot_def.get("description", ""),
        }
        # Only include if there's useful information
        if any(v for v in slot_info.values() if v):
            result[slot_name] = slot_info

    return result


def generate_institution_type_mappings() -> dict[str, str]:
    """Generate mappings from full words to single-letter codes for institution types.

    Based on CustodianPrimaryTypeEnum from LinkML.
    """
    return {
        # Full word -> single letter code
        "MUSEUM": "M",
        "Museum": "M",
        "museum": "M",
        "LIBRARY": "L",
        "Library": "L",
        "library": "L",
        "ARCHIVE": "A",
        "Archive": "A",
        "archive": "A",
        "GALLERY": "G",
        "Gallery": "G",
        "gallery": "G",
        "OFFICIAL_INSTITUTION": "O",
        "RESEARCH_CENTER": "R",
        "COMMERCIAL": "C",
        "UNSPECIFIED": "U",
        "BIO_CUSTODIAN": "B",
        "EDUCATION_PROVIDER": "E",
        "HERITAGE_SOCIETY": "S",
        "FEATURE_CUSTODIAN": "F",
        "INTANGIBLE_HERITAGE_GROUP": "I",
        "MIXED": "X",
        "PERSONAL_COLLECTION": "P",
        "HOLY_SACRED_SITE": "H",
        "DIGITAL_PLATFORM": "D",
        "NON_PROFIT": "N",
        "TASTE_SCENT_HERITAGE": "T",
    }


def generate_subregion_mappings() -> dict[str, str]:
    """Generate mappings from province names to ISO 3166-2 codes.

    These are the most common Dutch provinces that LLMs might use.
    Format follows ISO 3166-2: {country}-{subdivision}
    """
    return {
        # Dutch provinces (NL-XX)
        "noord-holland": "NL-NH",
        "noord holland": "NL-NH",
        "noordholland": "NL-NH",
        "south-holland": "NL-ZH",
        "south holland": "NL-ZH",
        "zuid-holland": "NL-ZH",
        "zuid holland": "NL-ZH",
        "zuidholland": "NL-ZH",
        "noord-brabant": "NL-NB",
        "noord brabant": "NL-NB",
        "north brabant": "NL-NB",
        "gelderland": "NL-GE",
        "utrecht": "NL-UT",
        "overijssel": "NL-OV",
        "limburg": "NL-LI",
        "friesland": "NL-FR",
        "frisia": "NL-FR",
        "groningen": "NL-GR",
        "drenthe": "NL-DR",
        "flevoland": "NL-FL",
        "zeeland": "NL-ZE",
        # German states (DE-XX) - common ones
        "bavaria": "DE-BY",
        "bayern": "DE-BY",
        "berlin": "DE-BE",
        "baden-württemberg": "DE-BW",
        "north rhine-westphalia": "DE-NW",
        "nordrhein-westfalen": "DE-NW",
        "saxony": "DE-SN",
        "sachsen": "DE-SN",
        "hesse": "DE-HE",
        "hessen": "DE-HE",
        # Belgian regions (BE-XX)
        "flanders": "BE-VLG",
        "vlaanderen": "BE-VLG",
        "wallonia": "BE-WAL",
        "wallonie": "BE-WAL",
        "brussels": "BE-BRU",
        "bruxelles": "BE-BRU",
    }


def generate_country_mappings() -> dict[str, str]:
    """Generate mappings from ISO country codes to Wikidata URIs.

    schema:addressCountry should use Wikidata entity URIs, not ISO codes.
    """
    return {
        "NL": "wd:Q55",  # Netherlands
        "DE": "wd:Q183",  # Germany
        "BE": "wd:Q31",  # Belgium
        "FR": "wd:Q142",  # France
        "GB": "wd:Q145",  # United Kingdom
        "UK": "wd:Q145",  # United Kingdom (alias)
        "US": "wd:Q30",  # United States
        "JP": "wd:Q17",  # Japan
        "CZ": "wd:Q213",  # Czech Republic
        "AT": "wd:Q40",  # Austria
        "CH": "wd:Q39",  # Switzerland
        "IT": "wd:Q38",  # Italy
        "ES": "wd:Q29",  # Spain
        "PL": "wd:Q36",  # Poland
        "PT": "wd:Q45",  # Portugal
        "BR": "wd:Q155",  # Brazil
        "MX": "wd:Q96",  # Mexico
        "CA": "wd:Q16",  # Canada
        "AU": "wd:Q408",  # Australia
        "IN": "wd:Q668",  # India
        "CN": "wd:Q148",  # China
    }


def generate_valid_prefixes() -> dict[str, str]:
    """Generate valid SPARQL prefixes and their URIs.

    From the LinkML schema prefixes.
    """
    return {
        "hc": "https://nde.nl/ontology/hc/class/",
        "hcp": "https://nde.nl/ontology/hc/",
        "schema": "http://schema.org/",
        "skos": "http://www.w3.org/2004/02/skos/core#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "wd": "http://www.wikidata.org/entity/",
        "wdt": "http://www.wikidata.org/prop/direct/",
        "foaf": "http://xmlns.com/foaf/0.1/",
        "dct": "http://purl.org/dc/terms/",
        "dcterms": "http://purl.org/dc/terms/",
        "owl": "http://www.w3.org/2002/07/owl#",
        "xsd": "http://www.w3.org/2001/XMLSchema#",
        "org": "http://www.w3.org/ns/org#",
        "prov": "http://www.w3.org/ns/prov#",
    }


def generate_invalid_patterns() -> list[dict]:
    """Generate patterns that are known to be invalid in SPARQL queries.

    These are common mistakes LLMs make that should be auto-corrected.
    """
    return [
        {
            "pattern": r"hcp:custodian_type\s+hc:(\w+)",
            "description": "hcp:custodian_type does not exist; use hcp:institutionType with single-letter codes",
            "correction_type": "institution_type_class_to_code",
        },
        {
            "pattern": r"hc:custodian_type\s+hc:(\w+)",
            "description": "hc:custodian_type does not exist; use hcp:institutionType with single-letter codes",
            "correction_type": "institution_type_class_to_code",
        },
        {
            "pattern": r"\bcrm:P\d+[a-zA-Z_]*\b",
            "description": "CIDOC-CRM properties (crm:P*) are not used in this ontology",
            "correction_type": "remove_triple_pattern",
        },
        {
            # BUG FIX: the previous pattern (r"PREFIX\s+hc:\s*") paired with the
            # replacement "PREFIX hc: " matched no URI at all, making the
            # substitution a whitespace no-op that could never correct a wrong
            # prefix URI. Match the bracketed URI and rewrite it to the
            # canonical hc: URI (kept in sync with generate_valid_prefixes).
            "pattern": r"PREFIX\s+hc:\s*<[^>]*>",
            "description": "Wrong prefix URI for hc:",
            "correction_type": "fix_prefix_uri",
            "replacement": "PREFIX hc: <https://nde.nl/ontology/hc/class/>",
        },
    ]


def generate_property_mappings() -> dict:
    """Generate property name to slot_uri mappings.

    These help identify when an LLM uses a non-existent property.
    """
    return {
        # Correct properties
        "hcp:institutionType": {
            "slot_uri": "org:classification",
            "range": "CustodianPrimaryTypeEnum",
            "valid_values": ["M", "L", "A", "G", "O", "R", "C", "U", "B", "E", "S", "F", "I", "X", "P", "H", "D", "N", "T"],
        },
        "hcp:ghcid": {
            "slot_uri": "https://nde.nl/ontology/hc/ghcid",
            "pattern": r"^[A-Z]{2}-[A-Z]{2,3}-[A-Z]{2,4}-[A-Z]-[A-Z0-9]+$",
        },
        "hcp:isil": {
            "slot_uri": "https://nde.nl/ontology/hc/isil",
            "pattern": r"^[A-Z]{2}-[A-Za-z0-9]+$",
        },
        "schema:name": {
            "slot_uri": "schema:name",
            "datatype": "string",
        },
        "schema:addressCountry": {
            "slot_uri": "schema:addressCountry",
            "nodeKind": "IRI",
            "description": "Country as Wikidata entity URI (e.g., wd:Q55)",
        },
        # Non-existent properties (common LLM mistakes)
        "hcp:custodian_type": {
            "error": "Property does not exist",
            "correct_property": "hcp:institutionType",
        },
        "hc:custodian_type": {
            "error": "Property does not exist",
            "correct_property": "hcp:institutionType",
        },
    }


def main():
    """Generate validation rules from LinkML schema."""
    print("Generating SPARQL validation rules from LinkML schema...")

    # Load enum files
    enums = {}
    enum_dir = LINKML_DIR / "modules" / "enums"
    if enum_dir.exists():
        for enum_file in enum_dir.glob("*.yaml"):
            print(f"  Loading enum: {enum_file.name}")
            enums.update(extract_enum_values(enum_file))

    # Load slot files for patterns
    slots = {}
    slot_dir = LINKML_DIR / "modules" / "slots"
    if slot_dir.exists():
        important_slots = ["custodian_type", "country", "subregion", "iso_3166_2_code"]
        for slot_name in important_slots:
            slot_file = slot_dir / f"{slot_name}.yaml"
            if slot_file.exists():
                print(f"  Loading slot: {slot_file.name}")
                slots.update(extract_slot_patterns(slot_file))

    # Generate the complete validation rules
    rules = {
        "_metadata": {
            "generated_from": "LinkML schema at schemas/20251121/linkml/",
            "purpose": "SPARQL query validation and auto-correction",
            "single_source_of_truth": "LinkML schema defines all valid values",
        },
        "enums": enums,
        "slots": slots,
        "prefixes": generate_valid_prefixes(),
        "institution_type_mappings": generate_institution_type_mappings(),
        "subregion_mappings": generate_subregion_mappings(),
        "country_mappings": generate_country_mappings(),
        "property_mappings": generate_property_mappings(),
        "invalid_patterns": generate_invalid_patterns(),
    }

    # Ensure output directory exists
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    # Write the rules. Explicit UTF-8 (some mapping keys contain non-ASCII,
    # e.g. "baden-württemberg") so output does not depend on the locale;
    # ensure_ascii=False keeps those keys human-readable in the JSON.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(rules, f, indent=2, ensure_ascii=False)

    print(f"\nGenerated validation rules: {OUTPUT_FILE}")
    print(f"  - {len(enums)} enums")
    print(f"  - {len(slots)} slots")
    print(f"  - {len(rules['institution_type_mappings'])} institution type mappings")
    print(f"  - {len(rules['subregion_mappings'])} subregion mappings")
    print(f"  - {len(rules['country_mappings'])} country mappings")


if __name__ == "__main__":
    main()