412 lines
13 KiB
Python
412 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
SHACL Generation Script for Heritage Custodian Ontology

Works around a LinkML bug where gen-shacl fails on modular schemas due to
schema_map keying mismatch (import paths vs. schema names).

Workaround approach:
    1. Load schema via SchemaView (which correctly resolves imports)
    2. Merge all classes/slots/enums from imports into main schema
    3. Clear from_schema references that confuse gen-shacl
    4. Set imports to only ['linkml:types']
    5. Write to temp file and run ShaclGenerator

Usage:
    python scripts/generate_shacl.py
    python scripts/generate_shacl.py --schema <path_to_schema.yaml>
    python scripts/generate_shacl.py --output <output_path.ttl>
    python scripts/generate_shacl.py --format turtle

Bug Background:
    gen-shacl (linkml.generators.shaclgen) fails with KeyError on modular schemas.
    The issue is in linkml/utils/schemaloader.py where schema_map is keyed by
    import paths (e.g., "modules/classes/ContributingAgency") but lookups use
    schema names (e.g., "contributing_agency"). This mismatch causes failures
    when from_schema references are resolved.

    Other generators (gen-owl, gen-yaml) work because they don't perform this
    specific schema_map lookup during serialization.

Author: Heritage Custodian Ontology Project
Date: 2025-12-01
Schema Version: v0.7.1 (SHACL Generation Workaround)
|
|
"""
|
|
|
|
import sys
|
|
import argparse
|
|
import tempfile
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
try:
|
|
from linkml_runtime.utils.schemaview import SchemaView
|
|
from linkml_runtime.dumpers import yaml_dumper
|
|
from linkml.generators.shaclgen import ShaclGenerator
|
|
except ImportError as e:
|
|
print(f"ERROR: Required libraries not installed: {e}")
|
|
print("Install with: pip install linkml linkml-runtime")
|
|
sys.exit(1)
|
|
|
|
|
|
# ============================================================================
|
|
# Constants
|
|
# ============================================================================
|
|
|
|
# Schema processed when no --schema argument is supplied.
DEFAULT_SCHEMA = "schemas/20251121/linkml/01_custodian_name_modular.yaml"
# Directory that receives timestamped SHACL files.
DEFAULT_OUTPUT_DIR = "schemas/20251121/shacl"
# Values accepted by the --format command-line option.
SUPPORTED_FORMATS = [
    "turtle",
    "ttl",
    "nt",
    "n3",
    "xml",
]

# Names of LinkML built-in types. They are provided by the linkml:types
# import, so they must not be merged into the flattened schema.
BUILTIN_TYPES = {
    "string",
    "integer",
    "boolean",
    "float",
    "double",
    "decimal",
    "time",
    "date",
    "datetime",
    "date_or_datetime",
    "uriorcurie",
    "curie",
    "uri",
    "ncname",
    "objectidentifier",
    "nodeidentifier",
    "jsonpointer",
    "jsonpath",
    "sparqlpath",
}
|
|
|
|
|
|
# ============================================================================
|
|
# SHACL Generation Functions
|
|
# ============================================================================
|
|
|
|
def _merge_imported_definitions(sv, schema) -> "dict[str, int]":
    """
    Copy definitions that SchemaView resolved through imports into ``schema``.

    Definitions whose name is already present in ``schema`` are left as they
    are. Types named in BUILTIN_TYPES are skipped because they come from
    linkml:types.

    Args:
        sv: SchemaView that was built from the schema file
        schema: Schema object receiving the definitions (mutated in place)

    Returns:
        Number of definitions added, keyed by "classes", "slots", "enums"
        and "types"
    """
    nothing_excluded = frozenset()
    plan = (
        ("classes", sv.all_classes, schema.classes, nothing_excluded),
        ("slots", sv.all_slots, schema.slots, nothing_excluded),
        ("enums", sv.all_enums, schema.enums, nothing_excluded),
        ("types", sv.all_types, schema.types, BUILTIN_TYPES),
    )

    merged = {}
    for kind, resolve, target, excluded in plan:
        added = 0
        # resolve() is called here, not while building the plan, so each
        # lookup still happens after the previous kind has been merged.
        for name, definition in resolve().items():
            if name not in target and name not in excluded:
                target[name] = definition
                added += 1
        merged[kind] = added
    return merged


def _clear_from_schema(schema) -> int:
    """
    Reset ``from_schema`` on every class, slot, enum and type of ``schema``.

    Args:
        schema: Schema object to clean (mutated in place)

    Returns:
        Number of elements whose ``from_schema`` was set and has been cleared
    """
    cleared = 0
    for collection in (schema.classes, schema.slots, schema.enums, schema.types):
        for element in collection.values():
            if element.from_schema:
                element.from_schema = None
                cleared += 1
    return cleared


def generate_shacl(
    schema_path: Path,
    output_path: "Path | None" = None,
    output_format: str = "turtle",
    verbose: bool = False
) -> str:
    """
    Generate SHACL shapes from LinkML schema with workaround for modular schemas.

    The schema is loaded through SchemaView, flattened (imported definitions
    merged in, from_schema cleared, imports reduced to linkml:types), dumped
    to a temporary YAML file and handed to ShaclGenerator. The temporary file
    is removed whether or not generation succeeds.

    Args:
        schema_path: Path to LinkML schema file
        output_path: Optional path to write SHACL output (parent directories
            are created as needed; the file is written as UTF-8)
        output_format: RDF format (turtle, ttl, nt, n3, xml). "turtle" and
            "ttl" use ShaclGenerator.serialize() directly; any other value is
            passed as the format name when serializing the generator's graph.
        verbose: Print detailed progress

    Returns:
        SHACL shapes as string in specified format

    Raises:
        Exception: Whatever schema loading or SHACL generation raises is
            re-raised after an "ERROR: ..." line has been printed.
    """

    print(f"\n{'=' * 80}")
    print("SHACL GENERATION (with modular schema workaround)")
    print(f"{'=' * 80}")
    print(f"Schema: {schema_path}")
    print(f"Output format: {output_format}")
    if output_path:
        print(f"Output file: {output_path}")
    print(f"{'=' * 80}\n")

    # Step 1: Load schema via SchemaView
    if verbose:
        print("Step 1: Loading schema via SchemaView...")

    try:
        sv = SchemaView(str(schema_path))
        schema = sv.schema
    except Exception as e:
        print(f"ERROR: Failed to load schema: {e}")
        raise

    if verbose:
        print(f" Loaded schema: {schema.name}")
        print(f" Classes (via imports): {len(sv.all_classes())}")
        print(f" Slots (via imports): {len(sv.all_slots())}")
        print(f" Enums (via imports): {len(sv.all_enums())}")

    # Step 2: Merge all classes/slots/enums from imports into main schema
    # This is needed because schema.classes is empty for modular schemas
    if verbose:
        print("\nStep 2: Merging imported definitions into main schema...")

    merged = _merge_imported_definitions(sv, schema)

    if verbose:
        print(f" Merged {merged['classes']} classes")
        print(f" Merged {merged['slots']} slots")
        print(f" Merged {merged['enums']} enums")
        print(f" Merged {merged['types']} types (excluding {len(BUILTIN_TYPES)} builtins)")

    # Step 3: Clear from_schema references that cause KeyError in gen-shacl
    if verbose:
        print("\nStep 3: Clearing from_schema references...")

    cleared_count = _clear_from_schema(schema)

    if verbose:
        print(f" Cleared {cleared_count} from_schema references")

    # Step 4: Set imports to only linkml:types
    if verbose:
        print("\nStep 4: Simplifying imports...")

    original_imports = schema.imports.copy() if schema.imports else []
    schema.imports = ['linkml:types']

    if verbose:
        print(f" Original imports: {len(original_imports)}")
        print(f" New imports: {schema.imports}")

    # Step 5: Write to temp file
    if verbose:
        print("\nStep 5: Writing cleaned schema to temp file...")

    # Only the unique name is needed; the handle is closed before anything
    # writes to the path.
    with tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.yaml',
        delete=False,
        prefix='linkml_shacl_'
    ) as tmp_file:
        tmp_path = tmp_file.name

    # From here on the temp file exists on disk (delete=False), so everything
    # that can fail is covered by a single cleanup in ``finally``.
    try:
        # yaml_dumper.dump expects a file path, not a file object
        yaml_dumper.dump(schema, tmp_path)

        if verbose:
            print(f" Temp file: {tmp_path}")

        # Step 6: Run ShaclGenerator on cleaned schema
        if verbose:
            print("\nStep 6: Running ShaclGenerator...")

        try:
            generator = ShaclGenerator(tmp_path)
            if output_format in ("turtle", "ttl"):
                shacl_output = generator.serialize()
            else:
                # Honour the requested RDF syntax by serializing the
                # generator's graph with that format name.
                shacl_output = generator.as_graph().serialize(format=output_format)
                if isinstance(shacl_output, bytes):
                    shacl_output = shacl_output.decode("utf-8")
        except Exception as e:
            print(f"ERROR: SHACL generation failed: {e}")
            raise
    finally:
        # Cleanup temp file
        Path(tmp_path).unlink(missing_ok=True)
        if verbose:
            print(" Cleaned up temp file")

    if verbose:
        lines = shacl_output.count('\n')
        print(f" Generated {lines} lines of SHACL")

    # Step 7: Write output if path specified
    if output_path:
        if verbose:
            print(f"\nStep 7: Writing output to {output_path}...")

        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding keeps the output independent of the platform locale.
        output_path.write_text(shacl_output, encoding="utf-8")

        if verbose:
            print(" Written successfully")

    print(f"\n{'=' * 80}")
    print("✅ SHACL GENERATION COMPLETE")
    print(f"{'=' * 80}\n")

    return shacl_output
|
|
|
|
|
|
def generate_timestamped_shacl(
    schema_path: "Path | None" = None,
    output_dir: "Path | None" = None,
    verbose: bool = False
) -> Path:
    """
    Generate Turtle SHACL into a file whose name carries the current time.

    Filename format: custodian_shacl_{YYYYMMDD}_{HHMMSS}.ttl
    (the timestamped naming follows the AGENTS.md rules).

    Args:
        schema_path: Path to LinkML schema (default: DEFAULT_SCHEMA)
        output_dir: Output directory (default: DEFAULT_OUTPUT_DIR)
        verbose: Print detailed progress

    Returns:
        Path to generated SHACL file
    """
    source = Path(DEFAULT_SCHEMA) if schema_path is None else schema_path
    target_dir = Path(DEFAULT_OUTPUT_DIR) if output_dir is None else output_dir

    # AGENTS.md naming rule: {base_name}_{YYYYMMDD}_{HHMMSS}.{extension}
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = target_dir / f"custodian_shacl_{stamp}.ttl"

    generate_shacl(
        schema_path=source,
        output_path=destination,
        output_format="turtle",
        verbose=verbose,
    )
    return destination
|
|
|
|
|
|
# ============================================================================
|
|
# CLI Interface
|
|
# ============================================================================
|
|
|
|
def main():
    """
    Main entry point for CLI.

    Parses the command-line options and runs one of three modes:
    --stdout prints the SHACL to stdout, --output writes it to the given
    file, and otherwise a timestamped file is generated. With --stdout the
    progress output of generate_shacl is redirected to stderr so that stdout
    carries only the SHACL document.

    Always terminates through sys.exit: 0 on success, 1 on any error,
    2 when interrupted with Ctrl-C.
    """
    import contextlib
    import traceback

    parser = argparse.ArgumentParser(
        description="Generate SHACL shapes from Heritage Custodian LinkML schema",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Generate SHACL with default settings (timestamped output)
python scripts/generate_shacl.py

# Generate SHACL to specific file
python scripts/generate_shacl.py --output shapes.ttl

# Use custom schema file
python scripts/generate_shacl.py --schema my_schema.yaml

# Verbose output
python scripts/generate_shacl.py --verbose

Output:
By default, generates timestamped files in schemas/20251121/shacl/
Format: custodian_shacl_{YYYYMMDD}_{HHMMSS}.ttl

Note:
This script works around a LinkML bug where gen-shacl fails on modular
schemas due to schema_map keying mismatch. It pre-processes the schema
via SchemaView to resolve all imports before running ShaclGenerator.
"""
    )

    parser.add_argument(
        "-s", "--schema",
        type=Path,
        default=Path(DEFAULT_SCHEMA),
        help=f"LinkML schema file (default: {DEFAULT_SCHEMA})"
    )

    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=None,
        help="Output SHACL file (default: timestamped file in shacl/ dir)"
    )

    parser.add_argument(
        "-f", "--format",
        type=str,
        default="turtle",
        choices=SUPPORTED_FORMATS,
        help="RDF output format (default: turtle)"
    )

    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Print detailed progress information"
    )

    parser.add_argument(
        "--stdout",
        action="store_true",
        help="Write SHACL to stdout instead of file"
    )

    args = parser.parse_args()

    try:
        if args.stdout:
            # Write to stdout only. generate_shacl prints a banner and
            # progress lines; send those to stderr so the stream a caller
            # pipes or redirects contains nothing but the SHACL.
            with contextlib.redirect_stdout(sys.stderr):
                shacl = generate_shacl(
                    schema_path=args.schema,
                    output_path=None,
                    output_format=args.format,
                    verbose=args.verbose
                )
            print(shacl)
        elif args.output:
            # Write to specified file
            generate_shacl(
                schema_path=args.schema,
                output_path=args.output,
                output_format=args.format,
                verbose=args.verbose
            )
            print(f"SHACL shapes written to: {args.output}")
        else:
            # Write to timestamped file
            output_path = generate_timestamped_shacl(
                schema_path=args.schema,
                verbose=args.verbose
            )
            print(f"SHACL shapes written to: {output_path}")

        # SystemExit is not an Exception subclass, so the handlers below
        # do not intercept this.
        sys.exit(0)

    except KeyboardInterrupt:
        print("\n\nGeneration interrupted by user.", file=sys.stderr)
        sys.exit(2)

    except Exception as e:
        # Top-level boundary: report on stderr with the traceback and exit
        # with a failure status.
        print(f"\n\nFATAL ERROR: {e}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)
|
|
|
|
|
|
# ============================================================================
|
|
# Entry Point
|
|
# ============================================================================
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|