glam/scripts/validate_hc_storage_examples.py
2026-01-04 13:12:32 +01:00

222 lines
6.5 KiB
Python

#!/usr/bin/env python3
"""
Validate HC Storage ontology example files against SHACL shapes.
This script validates all the institution-specific example TTL files
against the HC Storage SHACL shapes to ensure data quality.
"""
import sys
from pathlib import Path
try:
from pyshacl import validate
from rdflib import Graph, Namespace
from rdflib.namespace import RDF, RDFS, XSD
except ImportError:
print("ERROR: Required packages not installed.")
print("Run: pip install pyshacl rdflib")
sys.exit(1)
# Paths -- resolved relative to this script's own location (two directory
# levels above glam/scripts/), so the script works from any CWD.
BASE_DIR = Path(__file__).parent.parent
ONTOLOGY_DIR = BASE_DIR / "frontend" / "public" / "ontology"
EXAMPLES_DIR = ONTOLOGY_DIR / "examples"
# Files to validate: the SHACL shapes, the ontology (for class-hierarchy
# inference), and each institution-specific example data file.
SHAPES_FILE = ONTOLOGY_DIR / "hc-storage-shapes.ttl"
ONTOLOGY_FILE = ONTOLOGY_DIR / "hc-storage.ttl"
EXAMPLE_FILES = [
    EXAMPLES_DIR / "hc-storage-examples.ttl",  # Original archive examples
    EXAMPLES_DIR / "hc-storage-museum-examples.ttl",  # Art museum
    EXAMPLES_DIR / "hc-storage-library-examples.ttl",  # Rare books library
    EXAMPLES_DIR / "hc-storage-naturalhistory-examples.ttl",  # Natural history
    EXAMPLES_DIR / "hc-storage-archaeology-examples.ttl",  # Archaeological depot
    EXAMPLES_DIR / "hc-storage-all-examples.ttl",  # Combined all domains
]
def load_graph(filepath: Path, format: str = "turtle") -> Graph:
    """Parse the RDF file at *filepath* into a fresh Graph and return it.

    Args:
        filepath: Path to the RDF file on disk.
        format: Serialization format understood by rdflib (default Turtle).
    """
    graph = Graph()
    graph.parse(filepath, format=format)
    return graph
def count_instances(graph: Graph, ontology_ns: str = "https://nde.nl/ontology/hc/") -> dict:
    """Count typed instances of the key HC Storage classes in *graph*.

    Scans the graph's ``rdf:type`` triples once and tallies how many
    subjects are typed as each class of interest.

    Args:
        graph: The RDF data graph to inspect.
        ontology_ns: Namespace URI prefix of the HC Storage ontology.

    Returns:
        dict: Mapping of class local-name -> instance count (0 if none).
    """
    hc = Namespace(ontology_ns)
    # Key classes to count.
    class_names = [
        "StorageFacility",
        "StorageUnit",
        "StorageLocation",
        "StorageAssignment",
        "EnvironmentalRequirement",
        "EnvironmentalObservation",
        "StorageAssessment",
        "HeldItem",
    ]
    # Map full class URI -> local name for O(1) lookup during the scan.
    wanted = {str(hc[name]): name for name in class_names}
    counts = {name: 0 for name in class_names}
    # One pass over all rdf:type triples (the original re-scanned every
    # triple once per class). Exact URI comparison replaces the previous
    # `startswith` test, which would have wrongly counted any class whose
    # URI merely begins with another's (accidental prefix matching).
    for _s, _p, o in graph.triples((None, RDF.type, None)):
        name = wanted.get(str(o))
        if name is not None:
            counts[name] += 1
    return counts
def validate_example_file(
    example_file: Path,
    shapes_graph: Graph,
    ontology_graph: Graph,
    verbose: bool = True
) -> tuple[bool, str, dict]:
    """
    Validate a single example file against SHACL shapes.

    Parses *example_file* as Turtle, merges it with the ontology graph so
    RDFS subclass reasoning can apply, gathers basic statistics, then runs
    pySHACL validation. Note: *verbose* is currently unused; it is kept
    for interface compatibility.

    Returns:
        tuple: (conforms: bool, results_text: str, stats: dict)
    """
    # Guard clause: a missing file is reported as a non-conforming result.
    if not example_file.exists():
        return False, f"File not found: {example_file}", {}
    # Load example data; a parse failure is likewise reported, not raised.
    data_graph = Graph()
    try:
        data_graph.parse(example_file, format="turtle")
    except Exception as e:
        return False, f"Parse error: {e}", {}
    # Merge data with the ontology so class-hierarchy inference works.
    combined_graph = data_graph + ontology_graph
    # Statistics are computed on the data graph alone (not the merge).
    stats = {
        "triples": len(data_graph),
        "instances": count_instances(data_graph),
    }
    # Run SHACL validation over the merged graph.
    try:
        conforms, results_graph, results_text = validate(
            combined_graph,
            shacl_graph=shapes_graph,
            inference='rdfs',  # Enable RDFS inference for subclass reasoning
            abort_on_first=False,  # Report all violations
        )
    except Exception as e:
        return False, f"Validation error: {e}", stats
    return conforms, results_text, stats
def main():
    """Run SHACL validation on all example files and print a report.

    Loads the shapes and ontology once, validates every file listed in
    EXAMPLE_FILES, prints per-file results plus a summary table, and
    exits with status 1 if any file is invalid (or a required input file
    is missing), 0 otherwise. Missing example files are skipped and do
    not affect the exit code.
    """
    print("=" * 70)
    print("HC Storage Ontology - SHACL Validation")
    print("=" * 70)
    print()
    # Fail fast if the fixed inputs are absent.
    if not SHAPES_FILE.exists():
        print(f"ERROR: Shapes file not found: {SHAPES_FILE}")
        sys.exit(1)
    if not ONTOLOGY_FILE.exists():
        print(f"ERROR: Ontology file not found: {ONTOLOGY_FILE}")
        sys.exit(1)
    # Load shapes and ontology once; both are reused for every example.
    print(f"Loading SHACL shapes: {SHAPES_FILE.name}")
    shapes_graph = load_graph(SHAPES_FILE)
    print(f"{len(shapes_graph)} triples")
    print(f"Loading ontology: {ONTOLOGY_FILE.name}")
    ontology_graph = load_graph(ONTOLOGY_FILE)
    print(f"{len(ontology_graph)} triples")
    print()
    # Validate each example file; collect (name, conforms, text, stats).
    # conforms is None for skipped (missing) files.
    results = []
    for example_file in EXAMPLE_FILES:
        print("-" * 70)
        print(f"Validating: {example_file.name}")
        print("-" * 70)
        if not example_file.exists():
            print(" ⚠ SKIPPED: File not found")
            results.append((example_file.name, None, "File not found", {}))
            continue
        conforms, results_text, stats = validate_example_file(
            example_file, shapes_graph, ontology_graph
        )
        # Print statistics.
        print(f" Triples: {stats.get('triples', 0)}")
        if stats.get('instances'):
            print(" Instances:")
            for cls_name, count in stats['instances'].items():
                if count > 0:
                    print(f" - {cls_name}: {count}")
        # Print result.
        if conforms:
            print(" ✓ VALID - No SHACL violations")
        else:
            print(" ✗ INVALID - SHACL violations found:")
            # Show at most the first 50 non-blank report lines.
            report_lines = results_text.split('\n')
            for line in report_lines[:50]:
                if line.strip():
                    print(f" {line}")
            # Bug fix: compare against the number of split lines, not the
            # newline count -- a 51-line report without a trailing newline
            # has only 50 newlines and the notice was wrongly suppressed.
            if len(report_lines) > 50:
                print(f" ... ({results_text.count('Constraint Violation')} total violations)")
        results.append((example_file.name, conforms, results_text, stats))
        print()
    # Summary table.
    print("=" * 70)
    print("VALIDATION SUMMARY")
    print("=" * 70)
    total_valid = 0
    total_invalid = 0
    total_skipped = 0
    for filename, conforms, _, stats in results:
        if conforms is None:
            status = "⚠ SKIPPED"
            total_skipped += 1
        elif conforms:
            status = "✓ VALID"
            total_valid += 1
        else:
            status = "✗ INVALID"
            total_invalid += 1
        triples = stats.get('triples', '-')
        print(f" {status:12} {filename:50} ({triples} triples)")
    print()
    print(f"Total: {total_valid} valid, {total_invalid} invalid, {total_skipped} skipped")
    # Non-zero exit signals failure to CI when any file is invalid.
    if total_invalid > 0:
        sys.exit(1)
    else:
        sys.exit(0)
# Script entry point: run validation only when executed directly, not on import.
if __name__ == "__main__":
    main()