glam/scripts/validate_with_shacl.py
kempersc 6eb18700f0 Add SHACL validation shapes and validation script for Heritage Custodian Ontology
- Created SHACL shapes for validating temporal consistency and bidirectional relationships in custodial collections and staff observations.
- Implemented a Python script to validate RDF data against the defined SHACL shapes using the pyshacl library.
- Added command-line interface for validation with options for specifying data formats and output reports.
- Included detailed error handling and reporting for validation results.
2025-11-22 23:22:10 +01:00

297 lines
8.9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
SHACL Validation Script for Heritage Custodian Ontology
Uses pyshacl library to validate RDF data against SHACL shapes.
Usage:
python scripts/validate_with_shacl.py <data.ttl>
python scripts/validate_with_shacl.py <data.ttl> --shapes <shapes.ttl>
python scripts/validate_with_shacl.py <data.ttl> --format jsonld
python scripts/validate_with_shacl.py <data.ttl> --output report.ttl
Author: Heritage Custodian Ontology Project
Date: 2025-11-22
Schema Version: v0.7.0 (Phase 7: SHACL Validation)
"""
import sys
import argparse
from pathlib import Path
from typing import Optional
try:
from pyshacl import validate
from rdflib import Graph
except ImportError:
print("ERROR: Required libraries not installed.")
print("Install with: pip install pyshacl rdflib")
sys.exit(1)
# ============================================================================
# Constants
# ============================================================================
# Shapes file used when the caller does not supply one. The path is relative,
# so it is looked up from the current working directory.
DEFAULT_SHAPES_FILE = "schemas/20251121/shacl/custodian_validation_shapes.ttl"

# Values accepted by the CLI ``--format`` option, including the short aliases
# ("ttl", "jsonld") that main() maps onto "turtle" and "json-ld".
SUPPORTED_FORMATS = [
    "turtle",
    "ttl",
    "xml",
    "n3",
    "nt",
    "jsonld",
    "json-ld",
]
# ============================================================================
# Validation Functions
# ============================================================================
def validate_rdf_data(
    data_file: Path,
    shapes_file: Optional[Path] = None,
    data_format: str = "turtle",
    output_file: Optional[Path] = None,
    verbose: bool = False
) -> bool:
    """
    Validate RDF data against SHACL shapes.

    Prints a banner, the outcome and (on failure) the textual report to
    stdout. Any exception raised while loading, validating or writing the
    report is caught, printed together with a traceback, and reported as a
    ``False`` result rather than propagated.

    Args:
        data_file: Path to RDF data file to validate
        shapes_file: Path to SHACL shapes file, always parsed as Turtle
            (default: schemas/20251121/shacl/custodian_validation_shapes.ttl)
        data_format: RDF format of the data file (turtle, xml, n3, nt,
            json-ld). The aliases "ttl" and "jsonld" are accepted and mapped
            to "turtle" and "json-ld" before parsing.
        output_file: Optional path to write the validation report (Turtle)
        verbose: Print loading progress and summary statistics

    Returns:
        True if the data conforms to the shapes. False if violations were
        found, if either input file does not exist, or if an error occurred.
    """
    # Normalize the short aliases here as well as in main(), so callers that
    # use this function directly can pass the same format names as the CLI.
    format_aliases = {"ttl": "turtle", "jsonld": "json-ld"}
    data_format = format_aliases.get(data_format, data_format)

    # Use default shapes file if not specified
    if shapes_file is None:
        shapes_file = Path(DEFAULT_SHAPES_FILE)

    # Check files exist
    if not data_file.exists():
        print(f"ERROR: Data file not found: {data_file}")
        return False
    if not shapes_file.exists():
        print(f"ERROR: SHACL shapes file not found: {shapes_file}")
        return False

    print(f"\n{'=' * 80}")
    print("SHACL VALIDATION")
    print(f"{'=' * 80}")
    print(f"Data file: {data_file}")
    print(f"Shapes file: {shapes_file}")
    print(f"Data format: {data_format}")
    print(f"{'=' * 80}\n")

    try:
        # Load data graph
        if verbose:
            print("Loading data graph...")
        data_graph = Graph()
        data_graph.parse(str(data_file), format=data_format)
        if verbose:
            print(f" Loaded {len(data_graph)} triples")

        # Load shapes graph
        if verbose:
            print("Loading SHACL shapes...")
        shapes_graph = Graph()
        shapes_graph.parse(str(shapes_file), format="turtle")
        if verbose:
            print(f" Loaded {len(shapes_graph)} shape triples")
            print("\nExecuting SHACL validation...")

        # Run SHACL validation
        conforms, results_graph, results_text = validate(
            data_graph,
            shacl_graph=shapes_graph,
            inference='rdfs',      # Use RDFS inference
            abort_on_first=False,  # Check all violations
            meta_shacl=False,      # Don't validate shapes themselves
            advanced=True,         # Enable SHACL-AF features
            js=False               # Disable SHACL-JS (not needed)
        )

        # Print results
        print(f"\n{'=' * 80}")
        print("VALIDATION RESULTS")
        print(f"{'=' * 80}")
        if conforms:
            print("✅ VALIDATION PASSED")
            print("No constraint violations found.")
        else:
            print("❌ VALIDATION FAILED")
            print("\nConstraint Violations:")
            print("-" * 80)
            print(results_text)
        print(f"{'=' * 80}\n")

        # Write validation report if requested
        if output_file:
            print(f"Writing validation report to: {output_file}")
            results_graph.serialize(destination=str(output_file), format="turtle")
            print("Report written successfully.\n")

        # Print statistics
        if verbose:
            print("\nValidation Statistics:")
            print(f" Triples validated: {len(data_graph)}")
            print(f" Shapes applied: {count_shapes(shapes_graph)}")
            print(f" Violations found: {count_violations(results_graph)}")

        return conforms
    except Exception as e:
        # Deliberately broad: every load/validation failure is reported to the
        # user and turned into a False result instead of propagating.
        print(f"\nERROR during validation: {e}")
        import traceback
        traceback.print_exc()
        return False
def count_shapes(shapes_graph: Graph) -> int:
    """
    Count the distinct shapes in a graph that declare ``sh:targetClass``.

    Args:
        shapes_graph: Graph holding the SHACL shapes.

    Returns:
        Number of distinct subjects with at least one ``sh:targetClass``
        triple. Shapes without ``sh:targetClass`` are not counted.
    """
    from rdflib import SH

    # subjects() yields one entry per matching triple, so a shape with several
    # sh:targetClass values would be counted repeatedly; a set keeps each once.
    return len(set(shapes_graph.subjects(predicate=SH.targetClass, object=None)))
def count_violations(results_graph: Graph) -> int:
    """Return how many ``sh:resultSeverity`` entries the results graph holds."""
    from rdflib import SH

    # One subject is yielded per sh:resultSeverity triple; tally them lazily.
    severity_subjects = results_graph.subjects(
        predicate=SH.resultSeverity,
        object=None,
    )
    return sum(1 for _ in severity_subjects)
# ============================================================================
# CLI Interface
# ============================================================================
def main():
    """
    Main entry point for CLI.

    Parses the command line, runs the validation and always terminates the
    process through ``sys.exit``:

    * 0 - the data conforms to the shapes
    * 1 - validate_rdf_data() returned False (violations, or an error it
      caught and reported itself, such as a parse error)
    * 2 - an input file is missing, the run was interrupted, or an
      unexpected exception escaped
    """
    parser = argparse.ArgumentParser(
        description="Validate RDF data against Heritage Custodian SHACL shapes",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Validate Turtle file with default shapes
python scripts/validate_with_shacl.py data.ttl
# Validate JSON-LD file with custom shapes
python scripts/validate_with_shacl.py data.jsonld --shapes custom_shapes.ttl --format jsonld
# Validate and save report
python scripts/validate_with_shacl.py data.ttl --output validation_report.ttl
# Verbose output
python scripts/validate_with_shacl.py data.ttl --verbose
Exit Codes:
0 = Validation passed (no violations)
1 = Validation failed (violations found, or an error while loading/validating the data)
2 = Input file not found, interrupted, or unexpected fatal error
"""
    )
    parser.add_argument(
        "data_file",
        type=Path,
        help="RDF data file to validate (Turtle, JSON-LD, N-Triples, etc.)"
    )
    parser.add_argument(
        "-s", "--shapes",
        type=Path,
        default=None,
        help=f"SHACL shapes file (default: {DEFAULT_SHAPES_FILE})"
    )
    parser.add_argument(
        "-f", "--format",
        type=str,
        default="turtle",
        choices=SUPPORTED_FORMATS,
        help="RDF format of data file (default: turtle)"
    )
    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=None,
        help="Write validation report to file (Turtle format)"
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Print detailed validation information"
    )
    args = parser.parse_args()

    # Normalize format aliases
    canonical_formats = {"ttl": "turtle", "jsonld": "json-ld"}
    args.format = canonical_formats.get(args.format, args.format)

    # validate_rdf_data() reports a missing file by returning False, which is
    # indistinguishable from "violations found". Check here so that a missing
    # input produces the error exit code (2) instead of 1.
    shapes_path = args.shapes if args.shapes is not None else Path(DEFAULT_SHAPES_FILE)
    if not args.data_file.exists():
        print(f"ERROR: Data file not found: {args.data_file}")
        sys.exit(2)
    if not shapes_path.exists():
        print(f"ERROR: SHACL shapes file not found: {shapes_path}")
        sys.exit(2)

    # Run validation
    try:
        conforms = validate_rdf_data(
            data_file=args.data_file,
            shapes_file=args.shapes,
            data_format=args.format,
            output_file=args.output,
            verbose=args.verbose
        )
        # Exit with appropriate code. SystemExit is not an Exception subclass,
        # so the handlers below do not intercept it.
        sys.exit(0 if conforms else 1)
    except KeyboardInterrupt:
        print("\n\nValidation interrupted by user.")
        sys.exit(2)
    except Exception as e:
        print(f"\n\nFATAL ERROR: {e}")
        sys.exit(2)
# ============================================================================
# Library Interface
# ============================================================================
def validate_file(
    data_file: str,
    shapes_file: Optional[str] = None,
    data_format: str = "turtle",
) -> bool:
    """
    Library interface for programmatic validation.

    Converts the string paths to ``Path`` objects and delegates to
    validate_rdf_data() with verbose output disabled.

    Args:
        data_file: Path to RDF data file
        shapes_file: Optional path to SHACL shapes file
        data_format: RDF format of the data file, passed through unchanged to
            validate_rdf_data() (default: "turtle")

    Returns:
        True if validation passes, False otherwise

    Example:
        from scripts.validate_with_shacl import validate_file

        if validate_file("data.ttl"):
            print("Valid!")
        else:
            print("Invalid!")
    """
    return validate_rdf_data(
        data_file=Path(data_file),
        shapes_file=Path(shapes_file) if shapes_file else None,
        data_format=data_format,
        verbose=False
    )
# ============================================================================
# Entry Point
# ============================================================================
# Run the CLI only when this file is executed as a script; importing it as a
# module (e.g. to call validate_file) must not trigger argument parsing.
if __name__ == "__main__":
    main()