#!/usr/bin/env python3
"""
Generate SPARQL validation rules from LinkML schema.

This script reads the LinkML schema and generates a JSON file containing:

1. Valid enum values (e.g., institution types: M, L, A, G...)
2. Slot patterns (e.g., ISO 3166-2 format for subregions)
3. Valid prefixes and their URIs
4. Property mappings (slot_uri mappings)

The generated JSON is used by sparql_linter.py for SPARQL auto-correction.

SINGLE SOURCE OF TRUTH: LinkML schema → Generated rules → SPARQL validation

Usage:
    python scripts/generate_sparql_validation_rules.py

Output:
    data/validation/sparql_validation_rules.json
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import yaml
|
|
|
|
# Project root
|
|
PROJECT_ROOT = Path(__file__).parent.parent
|
|
LINKML_DIR = PROJECT_ROOT / "schemas" / "20251121" / "linkml"
|
|
OUTPUT_FILE = PROJECT_ROOT / "data" / "validation" / "sparql_validation_rules.json"
|
|
|
|
|
|
def load_yaml_file(path: Path) -> dict:
|
|
"""Load a YAML file."""
|
|
with open(path) as f:
|
|
return yaml.safe_load(f) or {}
|
|
|
|
|
|
def extract_enum_values(enum_file: Path) -> dict[str, Any]:
|
|
"""Extract enum values and their mappings from a LinkML enum file."""
|
|
data = load_yaml_file(enum_file)
|
|
result = {}
|
|
|
|
for enum_name, enum_def in data.get("enums", {}).items():
|
|
values = []
|
|
value_mappings = {} # Full name -> code mappings
|
|
|
|
for value_name, value_def in enum_def.get("permissible_values", {}).items():
|
|
values.append(value_name)
|
|
|
|
# Extract meaning (Wikidata ID if present)
|
|
meaning = value_def.get("meaning") if isinstance(value_def, dict) else None
|
|
description = value_def.get("description", "") if isinstance(value_def, dict) else ""
|
|
|
|
value_mappings[value_name] = {
|
|
"meaning": meaning,
|
|
"description": description,
|
|
}
|
|
|
|
result[enum_name] = {
|
|
"values": values,
|
|
"mappings": value_mappings,
|
|
}
|
|
|
|
return result
|
|
|
|
|
|
def extract_slot_patterns(slot_file: Path) -> dict[str, Any]:
|
|
"""Extract slot patterns and constraints from a LinkML slot file."""
|
|
data = load_yaml_file(slot_file)
|
|
result = {}
|
|
|
|
for slot_name, slot_def in data.get("slots", {}).items():
|
|
if not isinstance(slot_def, dict):
|
|
continue
|
|
|
|
slot_info = {
|
|
"slot_uri": slot_def.get("slot_uri"),
|
|
"range": slot_def.get("range"),
|
|
"pattern": slot_def.get("pattern"),
|
|
"description": slot_def.get("description", ""),
|
|
}
|
|
|
|
# Only include if there's useful information
|
|
if any(v for v in slot_info.values() if v):
|
|
result[slot_name] = slot_info
|
|
|
|
return result
|
|
|
|
|
|
def generate_institution_type_mappings() -> dict:
|
|
"""
|
|
Generate mappings from full words to single-letter codes for institution types.
|
|
|
|
Based on CustodianPrimaryTypeEnum from LinkML.
|
|
"""
|
|
return {
|
|
# Full word -> single letter code
|
|
"MUSEUM": "M",
|
|
"Museum": "M",
|
|
"museum": "M",
|
|
"LIBRARY": "L",
|
|
"Library": "L",
|
|
"library": "L",
|
|
"ARCHIVE": "A",
|
|
"Archive": "A",
|
|
"archive": "A",
|
|
"GALLERY": "G",
|
|
"Gallery": "G",
|
|
"gallery": "G",
|
|
"OFFICIAL_INSTITUTION": "O",
|
|
"RESEARCH_CENTER": "R",
|
|
"COMMERCIAL": "C",
|
|
"UNSPECIFIED": "U",
|
|
"BIO_CUSTODIAN": "B",
|
|
"EDUCATION_PROVIDER": "E",
|
|
"HERITAGE_SOCIETY": "S",
|
|
"FEATURE_CUSTODIAN": "F",
|
|
"INTANGIBLE_HERITAGE_GROUP": "I",
|
|
"MIXED": "X",
|
|
"PERSONAL_COLLECTION": "P",
|
|
"HOLY_SACRED_SITE": "H",
|
|
"DIGITAL_PLATFORM": "D",
|
|
"NON_PROFIT": "N",
|
|
"TASTE_SCENT_HERITAGE": "T",
|
|
}
|
|
|
|
|
|
def generate_subregion_mappings() -> dict:
|
|
"""
|
|
Generate mappings from province names to ISO 3166-2 codes.
|
|
|
|
These are the most common Dutch provinces that LLMs might use.
|
|
Format follows ISO 3166-2: {country}-{subdivision}
|
|
"""
|
|
return {
|
|
# Dutch provinces (NL-XX)
|
|
"noord-holland": "NL-NH",
|
|
"noord holland": "NL-NH",
|
|
"noordholland": "NL-NH",
|
|
"south-holland": "NL-ZH",
|
|
"south holland": "NL-ZH",
|
|
"zuid-holland": "NL-ZH",
|
|
"zuid holland": "NL-ZH",
|
|
"zuidholland": "NL-ZH",
|
|
"noord-brabant": "NL-NB",
|
|
"noord brabant": "NL-NB",
|
|
"north brabant": "NL-NB",
|
|
"gelderland": "NL-GE",
|
|
"utrecht": "NL-UT",
|
|
"overijssel": "NL-OV",
|
|
"limburg": "NL-LI",
|
|
"friesland": "NL-FR",
|
|
"frisia": "NL-FR",
|
|
"groningen": "NL-GR",
|
|
"drenthe": "NL-DR",
|
|
"flevoland": "NL-FL",
|
|
"zeeland": "NL-ZE",
|
|
|
|
# German states (DE-XX) - common ones
|
|
"bavaria": "DE-BY",
|
|
"bayern": "DE-BY",
|
|
"berlin": "DE-BE",
|
|
"baden-württemberg": "DE-BW",
|
|
"north rhine-westphalia": "DE-NW",
|
|
"nordrhein-westfalen": "DE-NW",
|
|
"saxony": "DE-SN",
|
|
"sachsen": "DE-SN",
|
|
"hesse": "DE-HE",
|
|
"hessen": "DE-HE",
|
|
|
|
# Belgian regions (BE-XX)
|
|
"flanders": "BE-VLG",
|
|
"vlaanderen": "BE-VLG",
|
|
"wallonia": "BE-WAL",
|
|
"wallonie": "BE-WAL",
|
|
"brussels": "BE-BRU",
|
|
"bruxelles": "BE-BRU",
|
|
}
|
|
|
|
|
|
def generate_country_mappings() -> dict:
|
|
"""
|
|
Generate mappings from ISO country codes to Wikidata URIs.
|
|
|
|
schema:addressCountry should use Wikidata entity URIs, not ISO codes.
|
|
"""
|
|
return {
|
|
"NL": "wd:Q55", # Netherlands
|
|
"DE": "wd:Q183", # Germany
|
|
"BE": "wd:Q31", # Belgium
|
|
"FR": "wd:Q142", # France
|
|
"GB": "wd:Q145", # United Kingdom
|
|
"UK": "wd:Q145", # United Kingdom (alias)
|
|
"US": "wd:Q30", # United States
|
|
"JP": "wd:Q17", # Japan
|
|
"CZ": "wd:Q213", # Czech Republic
|
|
"AT": "wd:Q40", # Austria
|
|
"CH": "wd:Q39", # Switzerland
|
|
"IT": "wd:Q38", # Italy
|
|
"ES": "wd:Q29", # Spain
|
|
"PL": "wd:Q36", # Poland
|
|
"PT": "wd:Q45", # Portugal
|
|
"BR": "wd:Q155", # Brazil
|
|
"MX": "wd:Q96", # Mexico
|
|
"CA": "wd:Q16", # Canada
|
|
"AU": "wd:Q408", # Australia
|
|
"IN": "wd:Q668", # India
|
|
"CN": "wd:Q148", # China
|
|
}
|
|
|
|
|
|
def generate_valid_prefixes() -> dict:
|
|
"""
|
|
Generate valid SPARQL prefixes and their URIs.
|
|
|
|
From the LinkML schema prefixes.
|
|
"""
|
|
return {
|
|
"hc": "https://nde.nl/ontology/hc/class/",
|
|
"hcp": "https://nde.nl/ontology/hc/",
|
|
"schema": "http://schema.org/",
|
|
"skos": "http://www.w3.org/2004/02/skos/core#",
|
|
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
|
|
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
|
"wd": "http://www.wikidata.org/entity/",
|
|
"wdt": "http://www.wikidata.org/prop/direct/",
|
|
"foaf": "http://xmlns.com/foaf/0.1/",
|
|
"dct": "http://purl.org/dc/terms/",
|
|
"dcterms": "http://purl.org/dc/terms/",
|
|
"owl": "http://www.w3.org/2002/07/owl#",
|
|
"xsd": "http://www.w3.org/2001/XMLSchema#",
|
|
"org": "http://www.w3.org/ns/org#",
|
|
"prov": "http://www.w3.org/ns/prov#",
|
|
}
|
|
|
|
|
|
def generate_invalid_patterns() -> list[dict]:
|
|
"""
|
|
Generate patterns that are known to be invalid in SPARQL queries.
|
|
|
|
These are common mistakes LLMs make that should be auto-corrected.
|
|
"""
|
|
return [
|
|
{
|
|
"pattern": r"hcp:custodian_type\s+hc:(\w+)",
|
|
"description": "hcp:custodian_type does not exist; use hcp:institutionType with single-letter codes",
|
|
"correction_type": "institution_type_class_to_code",
|
|
},
|
|
{
|
|
"pattern": r"hc:custodian_type\s+hc:(\w+)",
|
|
"description": "hc:custodian_type does not exist; use hcp:institutionType with single-letter codes",
|
|
"correction_type": "institution_type_class_to_code",
|
|
},
|
|
{
|
|
"pattern": r"\bcrm:P\d+[a-zA-Z_]*\b",
|
|
"description": "CIDOC-CRM properties (crm:P*) are not used in this ontology",
|
|
"correction_type": "remove_triple_pattern",
|
|
},
|
|
{
|
|
"pattern": r"PREFIX\s+hc:\s*<https?://w3id\.org/heritage/custodian/?>",
|
|
"description": "Wrong prefix URI for hc:",
|
|
"correction_type": "fix_prefix_uri",
|
|
"replacement": "PREFIX hc: <https://nde.nl/ontology/hc/class/>",
|
|
},
|
|
]
|
|
|
|
|
|
def generate_property_mappings() -> dict:
|
|
"""
|
|
Generate property name to slot_uri mappings.
|
|
|
|
These help identify when an LLM uses a non-existent property.
|
|
"""
|
|
return {
|
|
# Correct properties
|
|
"hcp:institutionType": {
|
|
"slot_uri": "org:classification",
|
|
"range": "CustodianPrimaryTypeEnum",
|
|
"valid_values": ["M", "L", "A", "G", "O", "R", "C", "U", "B", "E", "S", "F", "I", "X", "P", "H", "D", "N", "T"],
|
|
},
|
|
"hcp:ghcid": {
|
|
"slot_uri": "https://nde.nl/ontology/hc/ghcid",
|
|
"pattern": r"^[A-Z]{2}-[A-Z]{2,3}-[A-Z]{2,4}-[A-Z]-[A-Z0-9]+$",
|
|
},
|
|
"hcp:isil": {
|
|
"slot_uri": "https://nde.nl/ontology/hc/isil",
|
|
"pattern": r"^[A-Z]{2}-[A-Za-z0-9]+$",
|
|
},
|
|
"schema:name": {
|
|
"slot_uri": "schema:name",
|
|
"datatype": "string",
|
|
},
|
|
"schema:addressCountry": {
|
|
"slot_uri": "schema:addressCountry",
|
|
"nodeKind": "IRI",
|
|
"description": "Country as Wikidata entity URI (e.g., wd:Q55)",
|
|
},
|
|
|
|
# Non-existent properties (common LLM mistakes)
|
|
"hcp:custodian_type": {
|
|
"error": "Property does not exist",
|
|
"correct_property": "hcp:institutionType",
|
|
},
|
|
"hc:custodian_type": {
|
|
"error": "Property does not exist",
|
|
"correct_property": "hcp:institutionType",
|
|
},
|
|
}
|
|
|
|
|
|
def main():
|
|
"""Generate validation rules from LinkML schema."""
|
|
print("Generating SPARQL validation rules from LinkML schema...")
|
|
|
|
# Load enum files
|
|
enums = {}
|
|
enum_dir = LINKML_DIR / "modules" / "enums"
|
|
if enum_dir.exists():
|
|
for enum_file in enum_dir.glob("*.yaml"):
|
|
print(f" Loading enum: {enum_file.name}")
|
|
enums.update(extract_enum_values(enum_file))
|
|
|
|
# Load slot files for patterns
|
|
slots = {}
|
|
slot_dir = LINKML_DIR / "modules" / "slots"
|
|
if slot_dir.exists():
|
|
important_slots = ["custodian_type", "country", "subregion", "iso_3166_2_code"]
|
|
for slot_name in important_slots:
|
|
slot_file = slot_dir / f"{slot_name}.yaml"
|
|
if slot_file.exists():
|
|
print(f" Loading slot: {slot_file.name}")
|
|
slots.update(extract_slot_patterns(slot_file))
|
|
|
|
# Generate the complete validation rules
|
|
rules = {
|
|
"_metadata": {
|
|
"generated_from": "LinkML schema at schemas/20251121/linkml/",
|
|
"purpose": "SPARQL query validation and auto-correction",
|
|
"single_source_of_truth": "LinkML schema defines all valid values",
|
|
},
|
|
"enums": enums,
|
|
"slots": slots,
|
|
"prefixes": generate_valid_prefixes(),
|
|
"institution_type_mappings": generate_institution_type_mappings(),
|
|
"subregion_mappings": generate_subregion_mappings(),
|
|
"country_mappings": generate_country_mappings(),
|
|
"property_mappings": generate_property_mappings(),
|
|
"invalid_patterns": generate_invalid_patterns(),
|
|
}
|
|
|
|
# Ensure output directory exists
|
|
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Write the rules
|
|
with open(OUTPUT_FILE, "w") as f:
|
|
json.dump(rules, f, indent=2)
|
|
|
|
print(f"\nGenerated validation rules: {OUTPUT_FILE}")
|
|
print(f" - {len(enums)} enums")
|
|
print(f" - {len(slots)} slots")
|
|
print(f" - {len(rules['institution_type_mappings'])} institution type mappings")
|
|
print(f" - {len(rules['subregion_mappings'])} subregion mappings")
|
|
print(f" - {len(rules['country_mappings'])} country mappings")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|