- Created SHACL shapes for validating temporal consistency and bidirectional relationships in custodial collections and staff observations. - Implemented a Python script to validate RDF data against the defined SHACL shapes using the pyshacl library. - Added command-line interface for validation with options for specifying data formats and output reports. - Included detailed error handling and reporting for validation results.
297 lines
8.9 KiB
Python
Executable file
297 lines
8.9 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
SHACL Validation Script for Heritage Custodian Ontology
|
|
|
|
Uses pyshacl library to validate RDF data against SHACL shapes.
|
|
|
|
Usage:
|
|
python scripts/validate_with_shacl.py <data.ttl>
|
|
python scripts/validate_with_shacl.py <data.ttl> --shapes <shapes.ttl>
|
|
python scripts/validate_with_shacl.py <data.ttl> --format jsonld
|
|
python scripts/validate_with_shacl.py <data.ttl> --output report.ttl
|
|
|
|
Author: Heritage Custodian Ontology Project
|
|
Date: 2025-11-22
|
|
Schema Version: v0.7.0 (Phase 7: SHACL Validation)
|
|
"""
|
|
|
|
import sys
|
|
import argparse
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
try:
|
|
from pyshacl import validate
|
|
from rdflib import Graph
|
|
except ImportError:
|
|
print("ERROR: Required libraries not installed.")
|
|
print("Install with: pip install pyshacl rdflib")
|
|
sys.exit(1)
|
|
|
|
|
|
# ============================================================================
|
|
# Constants
|
|
# ============================================================================
|
|
|
|
# Shapes file used when the caller does not supply one explicitly.
DEFAULT_SHAPES_FILE = "schemas/20251121/shacl/custodian_validation_shapes.ttl"

# Values accepted by the CLI --format option, including short aliases.
SUPPORTED_FORMATS = [
    "turtle",
    "ttl",
    "xml",
    "n3",
    "nt",
    "jsonld",
    "json-ld",
]
|
|
|
|
|
|
# ============================================================================
|
|
# Validation Functions
|
|
# ============================================================================
|
|
|
|
def validate_rdf_data(
    data_file: Path,
    shapes_file: Optional[Path] = None,
    data_format: str = "turtle",
    output_file: Optional[Path] = None,
    verbose: bool = False
) -> bool:
    """
    Validate an RDF data file against a SHACL shapes file.

    Progress, results and errors are printed to stdout; nothing is raised
    to the caller.

    Args:
        data_file: Path to the RDF data file to validate.
        shapes_file: Path to the SHACL shapes file (Turtle). When None,
            DEFAULT_SHAPES_FILE is used.
        data_format: RDF format of the data file. The short aliases "ttl"
            and "jsonld" are accepted and mapped to "turtle" and "json-ld"
            before the file is parsed.
        output_file: Optional path; when given, the validation report graph
            is written there in Turtle format.
        verbose: Print loading progress and summary statistics.

    Returns:
        True only if the data conforms to the shapes. False is returned
        both for constraint violations and for any error (missing file,
        parse failure, or any other exception during validation).
    """
    # Apply the same alias mapping the CLI performs, so that programmatic
    # callers can pass the short names advertised in the docstring.
    format_aliases = {"ttl": "turtle", "jsonld": "json-ld"}
    data_format = format_aliases.get(data_format, data_format)

    # Use default shapes file if not specified
    if shapes_file is None:
        shapes_file = Path(DEFAULT_SHAPES_FILE)

    # Check files exist before doing any work
    if not data_file.exists():
        print(f"ERROR: Data file not found: {data_file}")
        return False

    if not shapes_file.exists():
        print(f"ERROR: SHACL shapes file not found: {shapes_file}")
        return False

    print(f"\n{'=' * 80}")
    print("SHACL VALIDATION")
    print(f"{'=' * 80}")
    print(f"Data file: {data_file}")
    print(f"Shapes file: {shapes_file}")
    print(f"Data format: {data_format}")
    print(f"{'=' * 80}\n")

    try:
        # Load data graph
        if verbose:
            print("Loading data graph...")
        data_graph = Graph()
        data_graph.parse(str(data_file), format=data_format)

        if verbose:
            print(f" Loaded {len(data_graph)} triples")

        # Load shapes graph (always parsed as Turtle)
        if verbose:
            print("Loading SHACL shapes...")
        shapes_graph = Graph()
        shapes_graph.parse(str(shapes_file), format="turtle")

        if verbose:
            print(f" Loaded {len(shapes_graph)} shape triples")
            print("\nExecuting SHACL validation...")

        # Run SHACL validation
        conforms, results_graph, results_text = validate(
            data_graph,
            shacl_graph=shapes_graph,
            inference='rdfs',       # Use RDFS inference
            abort_on_first=False,   # Check all violations
            meta_shacl=False,       # Don't validate shapes themselves
            advanced=True,          # Enable SHACL-AF features
            js=False                # Disable SHACL-JS (not needed)
        )

        # Print results
        print(f"\n{'=' * 80}")
        print("VALIDATION RESULTS")
        print(f"{'=' * 80}")

        if conforms:
            print("✅ VALIDATION PASSED")
            print("No constraint violations found.")
        else:
            print("❌ VALIDATION FAILED")
            print("\nConstraint Violations:")
            print("-" * 80)
            print(results_text)

        print(f"{'=' * 80}\n")

        # Write validation report if requested
        if output_file:
            print(f"Writing validation report to: {output_file}")
            results_graph.serialize(destination=str(output_file), format="turtle")
            print("Report written successfully.\n")

        # Print statistics
        if verbose:
            print("\nValidation Statistics:")
            print(f" Triples validated: {len(data_graph)}")
            print(f" Shapes applied: {count_shapes(shapes_graph)}")
            print(f" Violations found: {count_violations(results_graph)}")

        return conforms

    except Exception as e:
        # Boundary handler: report the failure with a traceback and signal
        # "not valid" to the caller instead of propagating.
        print(f"\nERROR during validation: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
|
|
def count_shapes(shapes_graph: Graph) -> int:
    """
    Count the shapes in a graph that declare a class target.

    A shape is identified here as any subject with at least one
    sh:targetClass triple. Subjects are de-duplicated, so a shape that
    targets several classes is counted once. Shapes that have no
    sh:targetClass triple are not counted.
    """
    from rdflib import SH

    # subjects() yields one entry per matching triple; the set collapses
    # repeated subjects.
    return len(set(shapes_graph.subjects(predicate=SH.targetClass, object=None)))
|
|
|
|
|
|
def count_violations(results_graph: Graph) -> int:
    """
    Count validation results in a SHACL report graph.

    One result is counted for every triple whose predicate is
    sh:resultSeverity, whatever the severity value is.
    """
    from rdflib import SH

    severity_subjects = results_graph.subjects(
        predicate=SH.resultSeverity,
        object=None,
    )
    return sum(1 for _ in severity_subjects)
|
|
|
|
|
|
# ============================================================================
|
|
# CLI Interface
|
|
# ============================================================================
|
|
|
|
def main():
    """
    Main entry point for CLI.

    Parses command-line arguments, runs the validation and terminates the
    process via sys.exit with:
        0 - the data conforms to the shapes
        1 - validate_rdf_data returned False (violations, or an error it
            handled itself such as a parse failure)
        2 - an input file is missing, the run was interrupted, or an
            unexpected exception escaped
    """
    parser = argparse.ArgumentParser(
        description="Validate RDF data against Heritage Custodian SHACL shapes",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Validate Turtle file with default shapes
  python scripts/validate_with_shacl.py data.ttl

  # Validate JSON-LD file with custom shapes
  python scripts/validate_with_shacl.py data.jsonld --shapes custom_shapes.ttl --format jsonld

  # Validate and save report
  python scripts/validate_with_shacl.py data.ttl --output validation_report.ttl

  # Verbose output
  python scripts/validate_with_shacl.py data.ttl --verbose

Exit Codes:
  0 = Validation passed (no violations)
  1 = Validation failed (violations found, or the data could not be loaded/validated)
  2 = Input file not found, interrupted by user, or unexpected fatal error
"""
    )

    parser.add_argument(
        "data_file",
        type=Path,
        help="RDF data file to validate (Turtle, JSON-LD, N-Triples, etc.)"
    )

    parser.add_argument(
        "-s", "--shapes",
        type=Path,
        default=None,
        help=f"SHACL shapes file (default: {DEFAULT_SHAPES_FILE})"
    )

    parser.add_argument(
        "-f", "--format",
        type=str,
        default="turtle",
        choices=SUPPORTED_FORMATS,
        help="RDF format of data file (default: turtle)"
    )

    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=None,
        help="Write validation report to file (Turtle format)"
    )

    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Print detailed validation information"
    )

    args = parser.parse_args()

    # Normalize format aliases to the names handed to the parser
    format_aliases = {"ttl": "turtle", "jsonld": "json-ld"}
    args.format = format_aliases.get(args.format, args.format)

    # validate_rdf_data reports a missing file by returning False, which is
    # indistinguishable from "violations found". Check the inputs here so
    # that a missing file yields the documented exit code 2.
    shapes_path = args.shapes if args.shapes is not None else Path(DEFAULT_SHAPES_FILE)
    required_inputs = (
        ("Data file", args.data_file),
        ("SHACL shapes file", shapes_path),
    )
    for label, path in required_inputs:
        if not path.exists():
            print(f"ERROR: {label} not found: {path}", file=sys.stderr)
            sys.exit(2)

    # Run validation
    try:
        conforms = validate_rdf_data(
            data_file=args.data_file,
            shapes_file=shapes_path,
            data_format=args.format,
            output_file=args.output,
            verbose=args.verbose
        )
    except KeyboardInterrupt:
        print("\n\nValidation interrupted by user.", file=sys.stderr)
        sys.exit(2)
    except Exception as e:
        print(f"\n\nFATAL ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    # Exit with appropriate code
    sys.exit(0 if conforms else 1)
|
|
|
|
|
|
# ============================================================================
|
|
# Library Interface
|
|
# ============================================================================
|
|
|
|
def validate_file(data_file: str, shapes_file: Optional[str] = None) -> bool:
    """
    Validate a file from Python code using string paths.

    Converts the arguments to Path objects and delegates to
    validate_rdf_data with verbose output switched off.

    Args:
        data_file: Path to the RDF data file.
        shapes_file: Optional path to a SHACL shapes file; when empty or
            None, validate_rdf_data receives None for its shapes_file.

    Returns:
        The boolean returned by validate_rdf_data.

    Example:
        from scripts.validate_with_shacl import validate_file

        if validate_file("data.ttl"):
            print("Valid!")
        else:
            print("Invalid!")
    """
    shapes_path = None
    if shapes_file:
        shapes_path = Path(shapes_file)

    data_path = Path(data_file)
    return validate_rdf_data(
        data_file=data_path,
        shapes_file=shapes_path,
        verbose=False,
    )
|
|
|
|
|
|
# ============================================================================
|
|
# Entry Point
|
|
# ============================================================================
|
|
|
|
# Run the CLI only when this file is executed as a script; importing the
# module does not trigger validation.
if __name__ == "__main__":
    main()
|