# glam/scripts/generate_sparql_validation_rules.py
#!/usr/bin/env python3
"""
Generate SPARQL validation rules from LinkML schema.
This script reads the LinkML schema and generates a JSON file containing:
1. Valid enum values (e.g., institution types: M, L, A, G...)
2. Slot patterns (e.g., ISO 3166-2 format for subregions)
3. Valid prefixes and their URIs
4. Property mappings (slot_uri mappings)
The generated JSON is used by sparql_linter.py for SPARQL auto-correction.
SINGLE SOURCE OF TRUTH: LinkML schema → Generated rules → SPARQL validation
Usage:
python scripts/generate_sparql_validation_rules.py
Output:
data/validation/sparql_validation_rules.json
"""
import json
import re
import sys
from pathlib import Path
from typing import Any
import yaml
# Project root: this script lives in scripts/, so the repo root is two levels up.
PROJECT_ROOT = Path(__file__).parent.parent
# Dated LinkML schema snapshot — the single source of truth for valid values.
LINKML_DIR = PROJECT_ROOT / "schemas" / "20251121" / "linkml"
# Generated rules file consumed by sparql_linter.py.
OUTPUT_FILE = PROJECT_ROOT / "data" / "validation" / "sparql_validation_rules.json"
def load_yaml_file(path: Path) -> dict:
    """Parse *path* as YAML; an empty document yields an empty dict."""
    parsed = yaml.safe_load(path.read_text())
    return parsed if parsed else {}
def extract_enum_values(enum_file: Path) -> dict[str, Any]:
    """Collect permissible values and their metadata for every enum in *enum_file*.

    Returns a dict keyed by enum name, each entry holding the list of value
    names plus a per-value mapping of "meaning" (e.g. a Wikidata ID) and
    "description".
    """
    schema = load_yaml_file(enum_file)
    extracted: dict[str, Any] = {}
    for enum_name, definition in schema.get("enums", {}).items():
        names: list[str] = []
        per_value: dict[str, dict] = {}
        for pv_name, pv_def in definition.get("permissible_values", {}).items():
            names.append(pv_name)
            # A permissible value may be a bare key (no body) — treat that as
            # having no meaning/description.
            if isinstance(pv_def, dict):
                meaning = pv_def.get("meaning")
                description = pv_def.get("description", "")
            else:
                meaning = None
                description = ""
            per_value[pv_name] = {
                "meaning": meaning,
                "description": description,
            }
        extracted[enum_name] = {
            "values": names,
            "mappings": per_value,
        }
    return extracted
def extract_slot_patterns(slot_file: Path) -> dict[str, Any]:
    """Pull slot_uri/range/pattern/description for each slot in *slot_file*.

    Slots whose definition carries none of those fields (or is not a mapping)
    are skipped.
    """
    schema = load_yaml_file(slot_file)
    extracted: dict[str, Any] = {}
    for slot_name, definition in schema.get("slots", {}).items():
        if not isinstance(definition, dict):
            continue
        info = {
            "slot_uri": definition.get("slot_uri"),
            "range": definition.get("range"),
            "pattern": definition.get("pattern"),
            "description": definition.get("description", ""),
        }
        # Keep the slot only when at least one field is populated.
        if any(info.values()):
            extracted[slot_name] = info
    return extracted
def generate_institution_type_mappings() -> dict:
    """Map institution-type words to their single-letter codes.

    Based on CustodianPrimaryTypeEnum from LinkML.  The four most common
    types also get Capitalized and lowercase spellings, since LLMs emit
    them in mixed case.
    """
    mappings: dict[str, str] = {}
    # Common types: accept UPPER, Capitalized, and lower spellings.
    for word, code in (("MUSEUM", "M"), ("LIBRARY", "L"),
                       ("ARCHIVE", "A"), ("GALLERY", "G")):
        mappings[word] = code
        mappings[word.capitalize()] = code
        mappings[word.lower()] = code
    # Remaining enum members: canonical UPPER_SNAKE form only.
    mappings.update({
        "OFFICIAL_INSTITUTION": "O",
        "RESEARCH_CENTER": "R",
        "COMMERCIAL": "C",
        "UNSPECIFIED": "U",
        "BIO_CUSTODIAN": "B",
        "EDUCATION_PROVIDER": "E",
        "HERITAGE_SOCIETY": "S",
        "FEATURE_CUSTODIAN": "F",
        "INTANGIBLE_HERITAGE_GROUP": "I",
        "MIXED": "X",
        "PERSONAL_COLLECTION": "P",
        "HOLY_SACRED_SITE": "H",
        "DIGITAL_PLATFORM": "D",
        "NON_PROFIT": "N",
        "TASTE_SCENT_HERITAGE": "T",
    })
    return mappings
def generate_subregion_mappings() -> dict:
    """
    Map lowercase province/region names to ISO 3166-2 subdivision codes.

    These are the spellings (Dutch, English, hyphenated, spaced,
    concatenated) that LLMs most commonly emit.  Format follows
    ISO 3166-2: {country}-{subdivision}.

    Returns:
        dict mapping lowercase name variant -> ISO 3166-2 code.
    """
    return {
        # Dutch provinces (NL-XX)
        "noord-holland": "NL-NH",
        "noord holland": "NL-NH",
        "noordholland": "NL-NH",
        # English variants for Noord-Holland, matching the English variants
        # already provided for Zuid-Holland and Noord-Brabant.
        "north-holland": "NL-NH",
        "north holland": "NL-NH",
        "south-holland": "NL-ZH",
        "south holland": "NL-ZH",
        "zuid-holland": "NL-ZH",
        "zuid holland": "NL-ZH",
        "zuidholland": "NL-ZH",
        "noord-brabant": "NL-NB",
        "noord brabant": "NL-NB",
        "north brabant": "NL-NB",
        "gelderland": "NL-GE",
        "utrecht": "NL-UT",
        "overijssel": "NL-OV",
        "limburg": "NL-LI",
        "friesland": "NL-FR",
        "frisia": "NL-FR",
        "groningen": "NL-GR",
        "drenthe": "NL-DR",
        "flevoland": "NL-FL",
        "zeeland": "NL-ZE",
        # German states (DE-XX) - common ones
        "bavaria": "DE-BY",
        "bayern": "DE-BY",
        "berlin": "DE-BE",
        "baden-württemberg": "DE-BW",
        "north rhine-westphalia": "DE-NW",
        "nordrhein-westfalen": "DE-NW",
        "saxony": "DE-SN",
        "sachsen": "DE-SN",
        "hesse": "DE-HE",
        "hessen": "DE-HE",
        # Belgian regions (BE-XX)
        "flanders": "BE-VLG",
        "vlaanderen": "BE-VLG",
        "wallonia": "BE-WAL",
        "wallonie": "BE-WAL",
        "brussels": "BE-BRU",
        "bruxelles": "BE-BRU",
    }
def generate_country_mappings() -> dict:
    """Map ISO country codes to prefixed Wikidata entity URIs.

    schema:addressCountry should use Wikidata entity URIs (e.g. wd:Q55),
    not bare ISO codes, so the linter rewrites codes via this table.
    """
    wikidata_ids = (
        ("NL", "Q55"),   # Netherlands
        ("DE", "Q183"),  # Germany
        ("BE", "Q31"),   # Belgium
        ("FR", "Q142"),  # France
        ("GB", "Q145"),  # United Kingdom
        ("UK", "Q145"),  # United Kingdom (alias)
        ("US", "Q30"),   # United States
        ("JP", "Q17"),   # Japan
        ("CZ", "Q213"),  # Czech Republic
        ("AT", "Q40"),   # Austria
        ("CH", "Q39"),   # Switzerland
        ("IT", "Q38"),   # Italy
        ("ES", "Q29"),   # Spain
        ("PL", "Q36"),   # Poland
        ("PT", "Q45"),   # Portugal
        ("BR", "Q155"),  # Brazil
        ("MX", "Q96"),   # Mexico
        ("CA", "Q16"),   # Canada
        ("AU", "Q408"),  # Australia
        ("IN", "Q668"),  # India
        ("CN", "Q148"),  # China
    )
    return {code: f"wd:{qid}" for code, qid in wikidata_ids}
def generate_valid_prefixes() -> dict:
    """Return the valid SPARQL prefix -> namespace URI table.

    Taken from the LinkML schema prefixes.
    """
    pairs = (
        ("hc", "https://nde.nl/ontology/hc/class/"),
        ("hcp", "https://nde.nl/ontology/hc/"),
        ("schema", "http://schema.org/"),
        ("skos", "http://www.w3.org/2004/02/skos/core#"),
        ("rdfs", "http://www.w3.org/2000/01/rdf-schema#"),
        ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
        ("wd", "http://www.wikidata.org/entity/"),
        ("wdt", "http://www.wikidata.org/prop/direct/"),
        ("foaf", "http://xmlns.com/foaf/0.1/"),
        ("dct", "http://purl.org/dc/terms/"),
        ("dcterms", "http://purl.org/dc/terms/"),
        ("owl", "http://www.w3.org/2002/07/owl#"),
        ("xsd", "http://www.w3.org/2001/XMLSchema#"),
        ("org", "http://www.w3.org/ns/org#"),
        ("prov", "http://www.w3.org/ns/prov#"),
    )
    return dict(pairs)
def generate_invalid_patterns() -> list[dict]:
    """List regex patterns for known-invalid SPARQL constructs.

    These are common LLM mistakes that the linter auto-corrects; each entry
    carries the regex, a human-readable description, and a correction type.
    """
    rules: list[dict] = []
    # Both the hcp: and hc: spellings of the non-existent custodian_type
    # property get the same correction.
    for ns in ("hcp", "hc"):
        rules.append({
            "pattern": rf"{ns}:custodian_type\s+hc:(\w+)",
            "description": (
                f"{ns}:custodian_type does not exist; "
                "use hcp:institutionType with single-letter codes"
            ),
            "correction_type": "institution_type_class_to_code",
        })
    rules.append({
        "pattern": r"\bcrm:P\d+[a-zA-Z_]*\b",
        "description": "CIDOC-CRM properties (crm:P*) are not used in this ontology",
        "correction_type": "remove_triple_pattern",
    })
    rules.append({
        "pattern": r"PREFIX\s+hc:\s*<https?://w3id\.org/heritage/custodian/?>",
        "description": "Wrong prefix URI for hc:",
        "correction_type": "fix_prefix_uri",
        "replacement": "PREFIX hc: <https://nde.nl/ontology/hc/class/>",
    })
    return rules
def generate_property_mappings() -> dict:
    """Map property names to their slot_uri metadata.

    Correct properties carry their slot_uri plus constraints; known-bad
    property names (common LLM mistakes) carry an error and the property
    to use instead.
    """
    # Single-letter codes of CustodianPrimaryTypeEnum, in schema order.
    type_codes = list("MLAGORCUBESFIXPHDNT")
    correct = {
        "hcp:institutionType": {
            "slot_uri": "org:classification",
            "range": "CustodianPrimaryTypeEnum",
            "valid_values": type_codes,
        },
        "hcp:ghcid": {
            "slot_uri": "https://nde.nl/ontology/hc/ghcid",
            "pattern": r"^[A-Z]{2}-[A-Z]{2,3}-[A-Z]{2,4}-[A-Z]-[A-Z0-9]+$",
        },
        "hcp:isil": {
            "slot_uri": "https://nde.nl/ontology/hc/isil",
            "pattern": r"^[A-Z]{2}-[A-Za-z0-9]+$",
        },
        "schema:name": {
            "slot_uri": "schema:name",
            "datatype": "string",
        },
        "schema:addressCountry": {
            "slot_uri": "schema:addressCountry",
            "nodeKind": "IRI",
            "description": "Country as Wikidata entity URI (e.g., wd:Q55)",
        },
    }
    # Non-existent properties that LLMs frequently invent.
    broken = {
        name: {
            "error": "Property does not exist",
            "correct_property": "hcp:institutionType",
        }
        for name in ("hcp:custodian_type", "hc:custodian_type")
    }
    return correct | broken
def main():
    """Generate validation rules from the LinkML schema and write them as JSON.

    Reads enum and slot definitions from LINKML_DIR, combines them with the
    hand-maintained mapping tables in this module, and writes the result to
    OUTPUT_FILE for consumption by sparql_linter.py.
    """
    print("Generating SPARQL validation rules from LinkML schema...")
    # Collect every enum definition shipped in the enums module.
    enums = {}
    enum_dir = LINKML_DIR / "modules" / "enums"
    if enum_dir.exists():
        for enum_file in enum_dir.glob("*.yaml"):
            print(f" Loading enum: {enum_file.name}")
            enums.update(extract_enum_values(enum_file))
    # Only a curated subset of slots matters for SPARQL validation.
    slots = {}
    slot_dir = LINKML_DIR / "modules" / "slots"
    if slot_dir.exists():
        important_slots = ["custodian_type", "country", "subregion", "iso_3166_2_code"]
        for slot_name in important_slots:
            slot_file = slot_dir / f"{slot_name}.yaml"
            if slot_file.exists():
                print(f" Loading slot: {slot_file.name}")
                slots.update(extract_slot_patterns(slot_file))
    # Assemble the complete validation-rules document.
    rules = {
        "_metadata": {
            "generated_from": "LinkML schema at schemas/20251121/linkml/",
            "purpose": "SPARQL query validation and auto-correction",
            "single_source_of_truth": "LinkML schema defines all valid values",
        },
        "enums": enums,
        "slots": slots,
        "prefixes": generate_valid_prefixes(),
        "institution_type_mappings": generate_institution_type_mappings(),
        "subregion_mappings": generate_subregion_mappings(),
        "country_mappings": generate_country_mappings(),
        "property_mappings": generate_property_mappings(),
        "invalid_patterns": generate_invalid_patterns(),
    }
    # Ensure output directory exists
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 with ensure_ascii=False so non-ASCII keys (e.g.
    # "baden-württemberg") stay human-readable and the result does not
    # depend on the platform's locale encoding.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(rules, f, indent=2, ensure_ascii=False)
    print(f"\nGenerated validation rules: {OUTPUT_FILE}")
    print(f" - {len(enums)} enums")
    print(f" - {len(slots)} slots")
    print(f" - {len(rules['institution_type_mappings'])} institution type mappings")
    print(f" - {len(rules['subregion_mappings'])} subregion mappings")
    print(f" - {len(rules['country_mappings'])} country mappings")
# Allow direct execution: python scripts/generate_sparql_validation_rules.py
if __name__ == "__main__":
    main()