#!/usr/bin/env python3
"""
SHACL Generation Script for Heritage Custodian Ontology

Works around a LinkML bug where gen-shacl fails on modular schemas due to
schema_map keying mismatch (import paths vs. schema names).

Workaround approach:
1. Load schema via SchemaView (which correctly resolves imports)
2. Merge all classes/slots/enums from imports into main schema
3. Clear from_schema references that confuse gen-shacl
4. Set imports to only ['linkml:types']
5. Write to temp file and run ShaclGenerator

Usage:
    python scripts/generate_shacl.py
    python scripts/generate_shacl.py --schema SCHEMA.yaml
    python scripts/generate_shacl.py --output OUTPUT.ttl
    python scripts/generate_shacl.py --format turtle
    python scripts/generate_shacl.py --stdout > shapes.ttl

Bug Background:
    gen-shacl (linkml.generators.shaclgen) fails with KeyError on modular
    schemas. The issue is in linkml/utils/schemaloader.py where schema_map is
    keyed by import paths (e.g., "modules/classes/ContributingAgency") but
    lookups use schema names (e.g., "contributing_agency"). This mismatch
    causes failures when from_schema references are resolved.

    Other generators (gen-owl, gen-yaml) work because they don't perform this
    specific schema_map lookup during serialization.

Author: Heritage Custodian Ontology Project
Date: 2025-12-01
Schema Version: v0.7.1 (SHACL Generation Workaround)
"""

import argparse
import contextlib
import sys
import tempfile
import traceback
from datetime import datetime
from pathlib import Path

try:
    from linkml_runtime.utils.schemaview import SchemaView
    from linkml_runtime.dumpers import yaml_dumper
    from linkml.generators.shaclgen import ShaclGenerator
except ImportError as e:
    # Diagnostics belong on stderr so they never mix with generated output.
    print(f"ERROR: Required libraries not installed: {e}", file=sys.stderr)
    print("Install with: pip install linkml linkml-runtime", file=sys.stderr)
    sys.exit(1)


# ============================================================================
# Constants
# ============================================================================

DEFAULT_SCHEMA = "schemas/20251121/linkml/01_custodian_name_modular.yaml"
DEFAULT_OUTPUT_DIR = "schemas/20251121/shacl"
SUPPORTED_FORMATS = ["turtle", "ttl", "nt", "n3", "xml"]

# LinkML built-in types that should not be merged (they come from linkml:types)
BUILTIN_TYPES = {
    'string', 'integer', 'boolean', 'float', 'double', 'decimal',
    'time', 'date', 'datetime', 'date_or_datetime',
    'uriorcurie', 'curie', 'uri', 'ncname',
    'objectidentifier', 'nodeidentifier',
    'jsonpointer', 'jsonpath', 'sparqlpath'
}

# Format names for which no "format not applied" warning is emitted.
_TURTLE_FORMATS = frozenset({"turtle", "ttl"})

# Horizontal rule used by the progress banners.
_RULE = "=" * 80


# ============================================================================
# SHACL Generation Functions
# ============================================================================

def _merge_missing(target, source, excluded=frozenset()) -> int:
    """Copy entries of *source* that *target* lacks; return how many were added.

    Names listed in *excluded* are never copied.
    """
    added = 0
    for name, definition in source.items():
        if name in target or name in excluded:
            continue
        target[name] = definition
        added += 1
    return added


def _clear_from_schema(schema) -> int:
    """Reset every truthy ``from_schema`` on classes/slots/enums/types.

    Returns the number of references that were cleared.
    """
    cleared = 0
    for collection in (schema.classes, schema.slots, schema.enums, schema.types):
        for element in collection.values():
            if element.from_schema:
                element.from_schema = None
                cleared += 1
    return cleared


def _serialize_via_temp_file(schema, verbose: bool) -> str:
    """Dump *schema* to a temporary YAML file and run ShaclGenerator on it.

    The temporary file is removed on every exit path, including a failure
    while dumping the schema.
    """
    if verbose:
        print("\nStep 5: Writing cleaned schema to temp file...")

    # Only the name is needed: yaml_dumper.dump expects a file path, not a
    # file object. The handle is closed before the dumper reopens the path.
    with tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.yaml',
        delete=False,
        prefix='linkml_shacl_'
    ) as tmp_file:
        tmp_path = tmp_file.name

    try:
        yaml_dumper.dump(schema, tmp_path)
        if verbose:
            print(f" Temp file: {tmp_path}")
            print("\nStep 6: Running ShaclGenerator...")
        try:
            return ShaclGenerator(tmp_path).serialize()
        except Exception as e:
            print(f"ERROR: SHACL generation failed: {e}", file=sys.stderr)
            raise
    finally:
        # Single cleanup point for success, dump failure and generator failure.
        Path(tmp_path).unlink(missing_ok=True)
        if verbose:
            print(" Cleaned up temp file")


def generate_shacl(
    schema_path: Path,
    output_path: "Path | None" = None,
    output_format: str = "turtle",
    verbose: bool = False
) -> str:
    """
    Generate SHACL shapes from LinkML schema with workaround for modular schemas.

    Progress banners are printed to stdout; error and warning lines go to
    stderr.

    Args:
        schema_path: Path to LinkML schema file
        output_path: Optional path to write SHACL output (parent directories
            are created; the file is written as UTF-8)
        output_format: Requested RDF format. NOTE(review): this value is only
            echoed in the banner; it is not forwarded to ShaclGenerator, so
            the result is whatever ``serialize()`` emits by default
            (presumably Turtle, since the timestamped path stores it as .ttl).
            A warning is printed when a non-Turtle format is requested.
        verbose: Print detailed progress

    Returns:
        SHACL shapes as a string, as returned by ``ShaclGenerator.serialize()``

    Raises:
        Exception: Any error raised while loading the schema or running the
            generator is re-raised after an ERROR line has been printed.
    """
    print(f"\n{_RULE}")
    print("SHACL GENERATION (with modular schema workaround)")
    print(_RULE)
    print(f"Schema: {schema_path}")
    print(f"Output format: {output_format}")
    if output_path:
        print(f"Output file: {output_path}")
    print(f"{_RULE}\n")

    if output_format not in _TURTLE_FORMATS:
        # Make the previously silent mismatch visible to the caller.
        print(
            f"WARNING: output format '{output_format}' is not forwarded to "
            "ShaclGenerator; output uses the generator's default serialization",
            file=sys.stderr,
        )

    # Step 1: Load schema via SchemaView
    if verbose:
        print("Step 1: Loading schema via SchemaView...")

    try:
        sv = SchemaView(str(schema_path))
        schema = sv.schema
    except Exception as e:
        print(f"ERROR: Failed to load schema: {e}", file=sys.stderr)
        raise

    if verbose:
        print(f" Loaded schema: {schema.name}")
        print(f" Classes (via imports): {len(sv.all_classes())}")
        print(f" Slots (via imports): {len(sv.all_slots())}")
        print(f" Enums (via imports): {len(sv.all_enums())}")

    # Step 2: Merge all classes/slots/enums from imports into main schema
    # This is needed because schema.classes is empty for modular schemas
    if verbose:
        print("\nStep 2: Merging imported definitions into main schema...")

    merged_classes = _merge_missing(schema.classes, sv.all_classes())
    merged_slots = _merge_missing(schema.slots, sv.all_slots())
    merged_enums = _merge_missing(schema.enums, sv.all_enums())
    # Merge only non-builtin types (avoid conflict with linkml:types)
    merged_types = _merge_missing(schema.types, sv.all_types(), BUILTIN_TYPES)

    if verbose:
        print(f" Merged {merged_classes} classes")
        print(f" Merged {merged_slots} slots")
        print(f" Merged {merged_enums} enums")
        print(f" Merged {merged_types} types (excluding {len(BUILTIN_TYPES)} builtins)")

    # Step 3: Clear from_schema references that cause KeyError in gen-shacl
    if verbose:
        print("\nStep 3: Clearing from_schema references...")

    cleared_count = _clear_from_schema(schema)

    if verbose:
        print(f" Cleared {cleared_count} from_schema references")

    # Step 4: Set imports to only linkml:types
    if verbose:
        print("\nStep 4: Simplifying imports...")

    original_import_count = len(schema.imports) if schema.imports else 0
    schema.imports = ['linkml:types']

    if verbose:
        print(f" Original imports: {original_import_count}")
        print(f" New imports: {schema.imports}")

    # Steps 5-6: Write cleaned schema to a temp file and run ShaclGenerator
    shacl_output = _serialize_via_temp_file(schema, verbose)

    if verbose:
        lines = shacl_output.count('\n')
        print(f" Generated {lines} lines of SHACL")

    # Step 7: Write output if path specified
    if output_path:
        if verbose:
            print(f"\nStep 7: Writing output to {output_path}...")

        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding: do not depend on the platform's locale default.
        output_path.write_text(shacl_output, encoding="utf-8")

        if verbose:
            print(" Written successfully")

    print(f"\n{_RULE}")
    print("✅ SHACL GENERATION COMPLETE")
    print(f"{_RULE}\n")

    return shacl_output


def generate_timestamped_shacl(
    schema_path: "Path | None" = None,
    output_dir: "Path | None" = None,
    verbose: bool = False
) -> Path:
    """
    Generate SHACL with timestamped filename per AGENTS.md rules.

    Filename format: custodian_shacl_{YYYYMMDD}_{HHMMSS}.ttl

    Args:
        schema_path: Path to LinkML schema (default: DEFAULT_SCHEMA)
        output_dir: Output directory (default: DEFAULT_OUTPUT_DIR)
        verbose: Print detailed progress

    Returns:
        Path to generated SHACL file
    """
    if schema_path is None:
        schema_path = Path(DEFAULT_SCHEMA)

    if output_dir is None:
        output_dir = Path(DEFAULT_OUTPUT_DIR)

    # Generate timestamp per AGENTS.md: {base_name}_{YYYYMMDD}_{HHMMSS}.{extension}
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = output_dir / f"custodian_shacl_{timestamp}.ttl"

    generate_shacl(
        schema_path=schema_path,
        output_path=output_path,
        output_format="turtle",
        verbose=verbose
    )

    return output_path


# ============================================================================
# CLI Interface
# ============================================================================

def main():
    """Main entry point for CLI.

    Exit codes: 0 on success, 1 on any error, 2 when interrupted by the user.
    """
    parser = argparse.ArgumentParser(
        description="Generate SHACL shapes from Heritage Custodian LinkML schema",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate SHACL with default settings (timestamped output)
  python scripts/generate_shacl.py

  # Generate SHACL to specific file
  python scripts/generate_shacl.py --output shapes.ttl

  # Use custom schema file
  python scripts/generate_shacl.py --schema my_schema.yaml

  # Verbose output
  python scripts/generate_shacl.py --verbose

Output:
  By default, generates timestamped files in schemas/20251121/shacl/
  Format: custodian_shacl_{YYYYMMDD}_{HHMMSS}.ttl

Note:
  This script works around a LinkML bug where gen-shacl fails on modular
  schemas due to schema_map keying mismatch. It pre-processes the schema
  via SchemaView to resolve all imports before running ShaclGenerator.
"""
    )

    parser.add_argument(
        "-s", "--schema",
        type=Path,
        default=Path(DEFAULT_SCHEMA),
        help=f"LinkML schema file (default: {DEFAULT_SCHEMA})"
    )

    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=None,
        help="Output SHACL file (default: timestamped file in shacl/ dir)"
    )

    parser.add_argument(
        "-f", "--format",
        type=str,
        default="turtle",
        choices=SUPPORTED_FORMATS,
        help="RDF output format (default: turtle)"
    )

    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Print detailed progress information"
    )

    parser.add_argument(
        "--stdout",
        action="store_true",
        help="Write SHACL to stdout instead of file"
    )

    args = parser.parse_args()

    try:
        if args.stdout:
            # generate_shacl prints its progress banners with print(); send
            # them to stderr here so stdout carries only the SHACL document
            # and can be piped or redirected safely.
            with contextlib.redirect_stdout(sys.stderr):
                shacl = generate_shacl(
                    schema_path=args.schema,
                    output_path=None,
                    output_format=args.format,
                    verbose=args.verbose
                )
            print(shacl)

        elif args.output:
            # Write to specified file
            generate_shacl(
                schema_path=args.schema,
                output_path=args.output,
                output_format=args.format,
                verbose=args.verbose
            )
            print(f"SHACL shapes written to: {args.output}")

        else:
            # Write to timestamped file
            output_path = generate_timestamped_shacl(
                schema_path=args.schema,
                verbose=args.verbose
            )
            print(f"SHACL shapes written to: {output_path}")

    except KeyboardInterrupt:
        print("\n\nGeneration interrupted by user.", file=sys.stderr)
        sys.exit(2)
    except Exception as e:
        print(f"\n\nFATAL ERROR: {e}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)

    sys.exit(0)


# ============================================================================
# Entry Point
# ============================================================================

if __name__ == "__main__":
    main()