#!/usr/bin/env python3
"""
Validate HC Storage ontology example files against SHACL shapes.

This script validates all the institution-specific example TTL files
against the HC Storage SHACL shapes to ensure data quality.

The process exits with status 1 when at least one example file fails
validation (or cannot be parsed/validated) and with status 0 otherwise.
Files that do not exist are reported as skipped and do not affect the
exit status.
"""

import sys
from pathlib import Path

try:
    from pyshacl import validate
    from rdflib import Graph, Namespace
    from rdflib.namespace import RDF, RDFS, XSD
except ImportError:
    print("ERROR: Required packages not installed.")
    print("Run: pip install pyshacl rdflib")
    sys.exit(1)

# Paths
BASE_DIR = Path(__file__).parent.parent
ONTOLOGY_DIR = BASE_DIR / "frontend" / "public" / "ontology"
EXAMPLES_DIR = ONTOLOGY_DIR / "examples"

# Files to validate
SHAPES_FILE = ONTOLOGY_DIR / "hc-storage-shapes.ttl"
ONTOLOGY_FILE = ONTOLOGY_DIR / "hc-storage.ttl"

EXAMPLE_FILES = [
    EXAMPLES_DIR / "hc-storage-examples.ttl",  # Original archive examples
    EXAMPLES_DIR / "hc-storage-museum-examples.ttl",  # Art museum
    EXAMPLES_DIR / "hc-storage-library-examples.ttl",  # Rare books library
    EXAMPLES_DIR / "hc-storage-naturalhistory-examples.ttl",  # Natural history
    EXAMPLES_DIR / "hc-storage-archaeology-examples.ttl",  # Archaeological depot
    EXAMPLES_DIR / "hc-storage-all-examples.ttl",  # Combined all domains
]


def load_graph(filepath: Path, format: str = "turtle") -> Graph:
    """Load an RDF graph from a file.

    Args:
        filepath: Path of the RDF document to parse.
        format: Serialization name handed to ``Graph.parse``. The parameter
            name shadows the ``format`` builtin; it is kept because it is
            part of the function's keyword interface.

    Returns:
        A new ``Graph`` holding the parsed triples.

    Raises:
        Whatever ``Graph.parse`` raises for a missing or malformed file.
    """
    g = Graph()
    g.parse(filepath, format=format)
    return g


def count_instances(
    graph: Graph, ontology_ns: str = "https://nde.nl/ontology/hc/"
) -> dict:
    """Count instances of each key storage class in the graph.

    A ``rdf:type`` triple counts toward a class when the string form of its
    object *starts with* that class's IRI. This is a name-prefix heuristic,
    not RDFS subclass reasoning: it picks up classes whose IRI extends the
    tracked class's IRI, and nothing else.

    Args:
        graph: Graph whose ``rdf:type`` triples are inspected.
        ontology_ns: Namespace IRI the tracked class names are resolved in.

    Returns:
        Mapping of class local name to the number of matching ``rdf:type``
        triples. Every tracked class is present, with 0 when unmatched.
    """
    hc = Namespace(ontology_ns)

    # Key classes to count
    class_names = [
        "StorageFacility",
        "StorageUnit",
        "StorageLocation",
        "StorageAssignment",
        "EnvironmentalRequirement",
        "EnvironmentalObservation",
        "StorageAssessment",
        "HeldItem",
    ]

    # Resolve each class IRI once instead of once per triple.
    iri_prefixes = {name: str(hc[name]) for name in class_names}
    class_counts = dict.fromkeys(class_names, 0)

    # Single pass over the type triples; each triple may count toward more
    # than one class because matching is by prefix.
    for _, _, type_obj in graph.triples((None, RDF.type, None)):
        type_iri = str(type_obj)
        for name, prefix in iri_prefixes.items():
            if type_iri.startswith(prefix):
                class_counts[name] += 1

    return class_counts


def validate_example_file(
    example_file: Path,
    shapes_graph: Graph,
    ontology_graph: Graph,
    verbose: bool = True
) -> tuple[bool, str, dict]:
    """
    Validate a single example file against SHACL shapes.

    Args:
        example_file: Turtle file holding the example data.
        shapes_graph: Already-loaded SHACL shapes graph.
        ontology_graph: Already-loaded ontology graph; merged with the data
            so class-hierarchy triples are available during validation.
        verbose: Currently unused; kept for interface compatibility.

    Returns:
        tuple: (conforms: bool, results_text: str, stats: dict)

        ``stats`` carries ``"triples"`` (size of the data graph alone) and
        ``"instances"`` (see ``count_instances``). It is empty when the file
        is missing or cannot be parsed. On a missing file, parse failure or
        validation failure ``conforms`` is False and ``results_text`` holds
        the error message.
    """
    if not example_file.exists():
        return False, f"File not found: {example_file}", {}

    # Load example data. The broad except is deliberate: any parse failure
    # is turned into a reported result rather than aborting the whole run.
    try:
        data_graph = load_graph(example_file)
    except Exception as e:
        return False, f"Parse error: {e}", {}

    # Add ontology for class hierarchy inference
    combined_graph = data_graph + ontology_graph

    # Statistics describe the example data only, not the merged ontology.
    stats = {
        "triples": len(data_graph),
        "instances": count_instances(data_graph),
    }

    # Run SHACL validation
    try:
        conforms, _results_graph, results_text = validate(
            combined_graph,
            shacl_graph=shapes_graph,
            inference='rdfs',  # Enable RDFS inference for subclass reasoning
            abort_on_first=False,  # Report all violations
        )
    except Exception as e:
        return False, f"Validation error: {e}", stats

    return conforms, results_text, stats


def main() -> None:
    """Run validation on all example files.

    Prints a per-file report followed by a summary table, then terminates
    the process via ``sys.exit``: status 1 if the shapes or ontology file is
    missing or any example file is invalid, status 0 otherwise.
    """
    print("=" * 70)
    print("HC Storage Ontology - SHACL Validation")
    print("=" * 70)
    print()

    # Check files exist
    if not SHAPES_FILE.exists():
        print(f"ERROR: Shapes file not found: {SHAPES_FILE}")
        sys.exit(1)

    if not ONTOLOGY_FILE.exists():
        print(f"ERROR: Ontology file not found: {ONTOLOGY_FILE}")
        sys.exit(1)

    # Load shapes and ontology once
    print(f"Loading SHACL shapes: {SHAPES_FILE.name}")
    shapes_graph = load_graph(SHAPES_FILE)
    print(f" → {len(shapes_graph)} triples")

    print(f"Loading ontology: {ONTOLOGY_FILE.name}")
    ontology_graph = load_graph(ONTOLOGY_FILE)
    print(f" → {len(ontology_graph)} triples")
    print()

    # Validate each example file. Each entry is
    # (filename, conforms-or-None-when-skipped, results_text, stats).
    results = []

    for example_file in EXAMPLE_FILES:
        print("-" * 70)
        print(f"Validating: {example_file.name}")
        print("-" * 70)

        if not example_file.exists():
            print(" ⚠ SKIPPED: File not found")
            results.append((example_file.name, None, "File not found", {}))
            continue

        conforms, results_text, stats = validate_example_file(
            example_file, shapes_graph, ontology_graph
        )

        # Print statistics
        print(f" Triples: {stats.get('triples', 0)}")
        if stats.get('instances'):
            print(" Instances:")
            for cls_name, count in stats['instances'].items():
                if count > 0:
                    print(f" - {cls_name}: {count}")

        # Print result
        if conforms:
            print(" ✓ VALID - No SHACL violations")
        else:
            print(" ✗ INVALID - SHACL violations found:")
            # Print abbreviated results: only the first 50 lines of the report
            for line in results_text.split('\n')[:50]:
                if line.strip():
                    print(f" {line}")
            if results_text.count('\n') > 50:
                print(f" ... ({results_text.count('Constraint Violation')} total violations)")

        results.append((example_file.name, conforms, results_text, stats))
        print()

    # Summary
    print("=" * 70)
    print("VALIDATION SUMMARY")
    print("=" * 70)

    total_valid = 0
    total_invalid = 0
    total_skipped = 0

    for filename, conforms, _, stats in results:
        if conforms is None:
            status = "⚠ SKIPPED"
            total_skipped += 1
        elif conforms:
            status = "✓ VALID"
            total_valid += 1
        else:
            status = "✗ INVALID"
            total_invalid += 1

        triples = stats.get('triples', '-')
        print(f" {status:12} {filename:50} ({triples} triples)")

    print()
    print(f"Total: {total_valid} valid, {total_invalid} invalid, {total_skipped} skipped")

    # Exit code: non-zero only when at least one file failed; skipped files
    # do not fail the run.
    sys.exit(1 if total_invalid > 0 else 0)


if __name__ == "__main__":
    main()