glam/scripts/generate_shacl.py
2025-12-01 16:06:34 +01:00

412 lines
13 KiB
Python

#!/usr/bin/env python3
"""
SHACL Generation Script for Heritage Custodian Ontology
Works around a LinkML bug where gen-shacl fails on modular schemas due to
schema_map keying mismatch (import paths vs. schema names).
Workaround approach:
1. Load schema via SchemaView (which correctly resolves imports)
2. Merge all classes/slots/enums from imports into main schema
3. Clear from_schema references that confuse gen-shacl
4. Set imports to only ['linkml:types']
5. Write to temp file and run ShaclGenerator
Usage:
python scripts/generate_shacl.py
python scripts/generate_shacl.py --schema <path_to_schema.yaml>
python scripts/generate_shacl.py --output <output_path.ttl>
python scripts/generate_shacl.py --format turtle
Bug Background:
gen-shacl (linkml.generators.shaclgen) fails with KeyError on modular schemas.
The issue is in linkml/utils/schemaloader.py where schema_map is keyed by
import paths (e.g., "modules/classes/ContributingAgency") but lookups use
schema names (e.g., "contributing_agency"). This mismatch causes failures
when from_schema references are resolved.
Other generators (gen-owl, gen-yaml) work because they don't perform this
specific schema_map lookup during serialization.
Author: Heritage Custodian Ontology Project
Date: 2025-12-01
Schema Version: v0.7.1 (SHACL Generation Workaround)
"""
import sys
import argparse
import tempfile
from pathlib import Path
from datetime import datetime
try:
from linkml_runtime.utils.schemaview import SchemaView
from linkml_runtime.dumpers import yaml_dumper
from linkml.generators.shaclgen import ShaclGenerator
except ImportError as e:
print(f"ERROR: Required libraries not installed: {e}")
print("Install with: pip install linkml linkml-runtime")
sys.exit(1)
# ============================================================================
# Constants
# ============================================================================
# Schema used when the caller / CLI does not name one.
DEFAULT_SCHEMA = "schemas/20251121/linkml/01_custodian_name_modular.yaml"

# Directory that receives timestamped SHACL files when no output path is given.
DEFAULT_OUTPUT_DIR = "schemas/20251121/shacl"

# Values offered as choices for the --format CLI option.
SUPPORTED_FORMATS = ["turtle", "ttl", "nt", "n3", "xml"]

# Type names supplied by the linkml:types import. They are skipped when
# imported definitions are merged into the main schema so they do not clash
# with the copies that linkml:types brings in.
BUILTIN_TYPES = {
    # scalars
    'string',
    'integer',
    'boolean',
    'float',
    'double',
    'decimal',
    # temporal
    'time',
    'date',
    'datetime',
    'date_or_datetime',
    # identifiers and references
    'uriorcurie',
    'curie',
    'uri',
    'ncname',
    'objectidentifier',
    'nodeidentifier',
    # path expressions
    'jsonpointer',
    'jsonpath',
    'sparqlpath',
}
# ============================================================================
# SHACL Generation Functions
# ============================================================================
def _merge_missing(target, source, skip=frozenset()):
    """Copy entries of *source* into *target* when the name is not yet present.

    Names already in *target*, or listed in *skip*, are left untouched.
    Returns the number of entries copied.
    """
    copied = 0
    for name, definition in source.items():
        if name in target or name in skip:
            continue
        target[name] = definition
        copied += 1
    return copied


def _clear_from_schema(definitions):
    """Set every truthy ``from_schema`` in *definitions* to None; return the count."""
    cleared = 0
    for definition in definitions:
        if definition.from_schema:
            definition.from_schema = None
            cleared += 1
    return cleared


def generate_shacl(
    schema_path: Path,
    output_path: "Path | None" = None,
    output_format: str = "turtle",
    verbose: bool = False
) -> str:
    """
    Generate SHACL shapes from LinkML schema with workaround for modular schemas.

    The schema is loaded through SchemaView, all imported classes, slots,
    enums and non-builtin types are copied into the main schema object,
    ``from_schema`` references are cleared, imports are reduced to
    ``linkml:types``, and the result is dumped to a temporary YAML file that
    is handed to ShaclGenerator. The temporary file is always removed.

    Args:
        schema_path: Path to LinkML schema file
        output_path: Optional path to write SHACL output (parent directories
            are created; the file is written as UTF-8)
        output_format: Requested RDF format. It is shown in the banner but is
            NOT passed to ShaclGenerator, which is run with its default
            serialization (presumably Turtle -- confirm for the installed
            LinkML version). A warning is printed to stderr when a value
            other than "turtle"/"ttl" is requested.
        verbose: Print detailed progress

    Returns:
        The string returned by ``ShaclGenerator.serialize()``

    Raises:
        Exception: any error raised while loading the schema, dumping the
            flattened schema, or running ShaclGenerator is re-raised after an
            ERROR line has been printed.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print("SHACL GENERATION (with modular schema workaround)")
    print(rule)
    print(f"Schema: {schema_path}")
    print(f"Output format: {output_format}")
    if output_path:
        print(f"Output file: {output_path}")
    print(f"{rule}\n")

    if output_format not in ("turtle", "ttl"):
        # The generator below is constructed without a format argument, so the
        # request cannot be honoured; say so rather than silently ignoring it.
        print(
            f"WARNING: output format '{output_format}' is not applied; "
            "ShaclGenerator is run with its default serialization",
            file=sys.stderr,
        )

    # Step 1: Load schema via SchemaView
    if verbose:
        print("Step 1: Loading schema via SchemaView...")
    try:
        sv = SchemaView(str(schema_path))
        schema = sv.schema
    except Exception as e:
        print(f"ERROR: Failed to load schema: {e}")
        raise
    if verbose:
        print(f" Loaded schema: {schema.name}")
        print(f" Classes (via imports): {len(sv.all_classes())}")
        print(f" Slots (via imports): {len(sv.all_slots())}")
        print(f" Enums (via imports): {len(sv.all_enums())}")

    # Step 2: Merge all classes/slots/enums from imports into main schema
    # This is needed because schema.classes is empty for modular schemas
    if verbose:
        print("\nStep 2: Merging imported definitions into main schema...")
    merged_classes = _merge_missing(schema.classes, sv.all_classes())
    merged_slots = _merge_missing(schema.slots, sv.all_slots())
    merged_enums = _merge_missing(schema.enums, sv.all_enums())
    # Builtin types are skipped to avoid conflict with linkml:types
    merged_types = _merge_missing(schema.types, sv.all_types(), skip=BUILTIN_TYPES)
    if verbose:
        print(f" Merged {merged_classes} classes")
        print(f" Merged {merged_slots} slots")
        print(f" Merged {merged_enums} enums")
        print(f" Merged {merged_types} types (excluding {len(BUILTIN_TYPES)} builtins)")

    # Step 3: Clear from_schema references that cause KeyError in gen-shacl
    if verbose:
        print("\nStep 3: Clearing from_schema references...")
    cleared_count = sum(
        _clear_from_schema(collection.values())
        for collection in (schema.classes, schema.slots, schema.enums, schema.types)
    )
    if verbose:
        print(f" Cleared {cleared_count} from_schema references")

    # Step 4: Set imports to only linkml:types
    if verbose:
        print("\nStep 4: Simplifying imports...")
    original_imports = schema.imports.copy() if schema.imports else []
    schema.imports = ['linkml:types']
    if verbose:
        print(f" Original imports: {len(original_imports)}")
        print(f" New imports: {schema.imports}")

    # Step 5: Write to temp file
    if verbose:
        print("\nStep 5: Writing cleaned schema to temp file...")
    # Only the name is needed: the handle is closed right away and the file is
    # kept (delete=False) so that it can be reopened by path below.
    with tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.yaml',
        delete=False,
        prefix='linkml_shacl_'
    ) as tmp_file:
        tmp_path = tmp_file.name

    try:
        # yaml_dumper.dump expects a file path, not a file object.
        # It runs inside the try so the temp file is removed even if it fails.
        yaml_dumper.dump(schema, tmp_path)
        if verbose:
            print(f" Temp file: {tmp_path}")

        # Step 6: Run ShaclGenerator on cleaned schema
        if verbose:
            print("\nStep 6: Running ShaclGenerator...")
        generator = ShaclGenerator(tmp_path)
        shacl_output = generator.serialize()
    except Exception as e:
        print(f"ERROR: SHACL generation failed: {e}")
        raise
    finally:
        # Single cleanup point for both the success and the failure path
        Path(tmp_path).unlink(missing_ok=True)
        if verbose:
            print(" Cleaned up temp file")

    if verbose:
        lines = shacl_output.count('\n')
        print(f" Generated {lines} lines of SHACL")

    # Step 7: Write output if path specified
    if output_path is not None:
        if verbose:
            print(f"\nStep 7: Writing output to {output_path}...")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8 so the result does not depend on the platform locale
        output_path.write_text(shacl_output, encoding="utf-8")
        if verbose:
            print(" Written successfully")

    print(f"\n{rule}")
    print("✅ SHACL GENERATION COMPLETE")
    print(f"{rule}\n")
    return shacl_output
def generate_timestamped_shacl(
    schema_path: "Path | None" = None,
    output_dir: "Path | None" = None,
    verbose: bool = False
) -> Path:
    """
    Run generate_shacl and store the result under a timestamped file name.

    The file is named custodian_shacl_{YYYYMMDD}_{HHMMSS}.ttl (the
    {base_name}_{YYYYMMDD}_{HHMMSS}.{extension} convention from AGENTS.md),
    using the current local time.

    Args:
        schema_path: LinkML schema to process; DEFAULT_SCHEMA when None
        output_dir: Directory for the generated file; DEFAULT_OUTPUT_DIR when None
        verbose: Forwarded to generate_shacl to print detailed progress

    Returns:
        Path of the SHACL file that generate_shacl was asked to write
    """
    source = Path(DEFAULT_SCHEMA) if schema_path is None else schema_path
    target_dir = Path(DEFAULT_OUTPUT_DIR) if output_dir is None else output_dir

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = target_dir / f"custodian_shacl_{stamp}.ttl"

    # The format is fixed to turtle here; the .ttl extension above matches it.
    generate_shacl(
        schema_path=source,
        output_path=destination,
        output_format="turtle",
        verbose=verbose
    )
    return destination
# ============================================================================
# CLI Interface
# ============================================================================
def main():
    """
    Main entry point for CLI.

    Parses the command line and dispatches to one of three modes:
    --stdout (print the SHACL), --output (write to the given file), or the
    default (write a timestamped file via generate_timestamped_shacl).
    When both --stdout and --output are given, --stdout takes precedence.

    Always terminates the process through sys.exit: 0 on success,
    2 on KeyboardInterrupt, 1 on any other exception (after printing the
    error and a traceback to stderr).
    """
    # Local import: only this function needs it (for the --stdout branch).
    import contextlib

    parser = argparse.ArgumentParser(
        description="Generate SHACL shapes from Heritage Custodian LinkML schema",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Generate SHACL with default settings (timestamped output)
python scripts/generate_shacl.py
# Generate SHACL to specific file
python scripts/generate_shacl.py --output shapes.ttl
# Use custom schema file
python scripts/generate_shacl.py --schema my_schema.yaml
# Verbose output
python scripts/generate_shacl.py --verbose
Output:
By default, generates timestamped files in schemas/20251121/shacl/
Format: custodian_shacl_{YYYYMMDD}_{HHMMSS}.ttl
Note:
This script works around a LinkML bug where gen-shacl fails on modular
schemas due to schema_map keying mismatch. It pre-processes the schema
via SchemaView to resolve all imports before running ShaclGenerator.
"""
    )
    parser.add_argument(
        "-s", "--schema",
        type=Path,
        default=Path(DEFAULT_SCHEMA),
        help=f"LinkML schema file (default: {DEFAULT_SCHEMA})"
    )
    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=None,
        help="Output SHACL file (default: timestamped file in shacl/ dir)"
    )
    parser.add_argument(
        "-f", "--format",
        type=str,
        default="turtle",
        choices=SUPPORTED_FORMATS,
        help="RDF output format (default: turtle)"
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Print detailed progress information"
    )
    parser.add_argument(
        "--stdout",
        action="store_true",
        help="Write SHACL to stdout instead of file"
    )
    args = parser.parse_args()

    try:
        if args.stdout:
            # generate_shacl reports its banner and progress with print();
            # send those to stderr so that stdout carries only the SHACL
            # document and can be piped or redirected safely.
            with contextlib.redirect_stdout(sys.stderr):
                shacl = generate_shacl(
                    schema_path=args.schema,
                    output_path=None,
                    output_format=args.format,
                    verbose=args.verbose
                )
            print(shacl)
        elif args.output is not None:
            # Write to specified file
            generate_shacl(
                schema_path=args.schema,
                output_path=args.output,
                output_format=args.format,
                verbose=args.verbose
            )
            print(f"SHACL shapes written to: {args.output}")
        else:
            # Write to timestamped file. generate_timestamped_shacl has no
            # format parameter, so --format is not forwarded in this mode.
            output_path = generate_timestamped_shacl(
                schema_path=args.schema,
                verbose=args.verbose
            )
            print(f"SHACL shapes written to: {output_path}")
        sys.exit(0)
    except KeyboardInterrupt:
        print("\n\nGeneration interrupted by user.", file=sys.stderr)
        sys.exit(2)
    except Exception as e:
        # Top-level boundary: report on stderr (next to the traceback) and
        # turn the failure into a non-zero exit status.
        print(f"\n\nFATAL ERROR: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)
# ============================================================================
# Entry Point
# ============================================================================
# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()