#!/usr/bin/env python3
"""
Centralize inline slot definitions from class files to modules/slots/

This script:
1. Extracts all inline slot definitions from modules/classes/*.yaml
2. Checks which slots already exist in modules/slots/
3. Generates missing slot files
4. Updates class files to import centralized slots instead of inline definitions

Note: class files are rewritten through a YAML load/dump round trip, so
comments and hand formatting in updated files are not preserved.

Usage:
    python scripts/centralize_inline_slots.py --dry-run  # Preview changes
    python scripts/centralize_inline_slots.py            # Apply changes
"""

import os
import re
import sys
import yaml
import argparse
from pathlib import Path
from collections import defaultdict
from datetime import datetime

# Paths
SCHEMA_ROOT = Path(__file__).parent.parent / "schemas" / "20251121" / "linkml"
CLASSES_DIR = SCHEMA_ROOT / "modules" / "classes"
SLOTS_DIR = SCHEMA_ROOT / "modules" / "slots"

# Prefixes declared by every generated slot file.
_BASE_PREFIXES = {
    'linkml': 'https://w3id.org/linkml/',
    'hc': 'https://nde.nl/ontology/hc/',
}

# Namespace URIs for the CURIE prefixes that may appear in a slot_uri.
_KNOWN_PREFIXES = {
    'schema': 'http://schema.org/',
    'dcterms': 'http://purl.org/dc/terms/',
    'prov': 'http://www.w3.org/ns/prov#',
    'crm': 'http://www.cidoc-crm.org/cidoc-crm/',
    'rico': 'https://www.ica.org/standards/RiC/ontology#',
    'org': 'http://www.w3.org/ns/org#',
    'foaf': 'http://xmlns.com/foaf/0.1/',
    'skos': 'http://www.w3.org/2004/02/skos/core#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'owl': 'http://www.w3.org/2002/07/owl#',
    'sosa': 'http://www.w3.org/ns/sosa/',
    'premis': 'http://www.loc.gov/premis/rdf/v3/',
    'odrl': 'http://www.w3.org/ns/odrl/2/',
    'frapo': 'http://purl.org/cerif/frapo/',
    'dcat': 'http://www.w3.org/ns/dcat#',
    'bf': 'http://id.loc.gov/ontologies/bibframe/',
}

# Matches the lowercase prefix part of a CURIE such as "schema:name".
_CURIE_PREFIX_RE = re.compile(r'^([a-z]+):')

# Errors that mean "this file could not be read as YAML"; anything else is a bug.
_YAML_LOAD_ERRORS = (OSError, UnicodeDecodeError, yaml.YAMLError)


def load_yaml(path: Path) -> dict:
    """Load a YAML file.

    Returns the parsed document, or ``{}`` when the document is empty.
    Raises OSError / UnicodeDecodeError / yaml.YAMLError on unreadable input.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}


def save_yaml(path: Path, data: dict, dry_run: bool = False):
    """Save YAML file with proper formatting.

    With ``dry_run`` set, only reports what would be written.
    """
    if dry_run:
        print(f" [DRY-RUN] Would write: {path}")
        return

    with open(path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)


def _slots_mapping(data) -> dict:
    """Return the top-level ``slots`` mapping of a parsed schema, or ``{}``.

    A ``slots:`` key with no value parses to ``None``; that (and any other
    non-mapping shape) is treated as "no slots" instead of crashing.
    """
    if not isinstance(data, dict):
        return {}
    slots = data.get('slots')
    return slots if isinstance(slots, dict) else {}


def get_existing_centralized_slots() -> set:
    """Get set of slot names already defined in modules/slots/.

    Files that cannot be read or parsed are reported and skipped.
    """
    existing = set()
    for slot_file in SLOTS_DIR.glob("*.yaml"):
        try:
            data = load_yaml(slot_file)
        except _YAML_LOAD_ERRORS as e:
            print(f"Warning: Could not parse {slot_file}: {e}")
            continue
        existing.update(_slots_mapping(data))
    return existing


def extract_inline_slots(class_file: Path) -> dict:
    """Extract inline slot definitions from a class file.

    Returns dict mapping slot_name -> slot_definition. Only entries whose
    value is a mapping count as definitions; an unparseable file yields ``{}``.
    """
    try:
        data = load_yaml(class_file)
    except _YAML_LOAD_ERRORS as e:
        print(f"Warning: Could not parse {class_file}: {e}")
        return {}

    # A slot definition has properties (range, slot_uri, description, ...),
    # i.e. it is a mapping; anything else is not a definition.
    return {
        slot_name: slot_def
        for slot_name, slot_def in _slots_mapping(data).items()
        if isinstance(slot_def, dict)
    }


def generate_slot_file_content(slot_name: str, slot_def: dict, source_class: str) -> dict:
    """Generate content for a centralized slot file.

    Args:
        slot_name: Name of the slot; also used to build the schema id/name/title.
        slot_def: The slot definition, embedded unchanged under ``slots``.
        source_class: Name of the class file the definition came from
            (recorded in the ``comments`` entry).

    Returns:
        A dict ready to be dumped as the slot's schema file.
    """
    prefixes = dict(_BASE_PREFIXES)

    # Declare the namespace used by slot_uri when it is one we know.
    slot_uri = slot_def.get('slot_uri', '')
    if isinstance(slot_uri, str) and slot_uri:
        prefix_match = _CURIE_PREFIX_RE.match(slot_uri)
        if prefix_match:
            prefix = prefix_match.group(1)
            if prefix in _KNOWN_PREFIXES:
                prefixes[prefix] = _KNOWN_PREFIXES[prefix]

    content = {
        'id': f'https://nde.nl/ontology/hc/slot/{slot_name}',
        'name': f'{slot_name}_slot',
        'title': f'{slot_name.replace("_", " ").title()} Slot',
        'prefixes': prefixes,
        'imports': ['linkml:types'],
        'default_prefix': 'hc',
        'slots': {
            slot_name: slot_def
        }
    }

    # Add comment about origin
    content['comments'] = [f'Centralized from {source_class} - {datetime.now().isoformat()}']

    return content


def update_class_file(class_file: Path, slots_to_remove: list, dry_run: bool = False) -> bool:
    """Update class file to remove inline slots and add imports.

    Args:
        class_file: Class schema file to rewrite.
        slots_to_remove: Names of inline slot definitions to drop.
        dry_run: When true, report the change without writing.

    Returns:
        True if the file was (or, in dry-run mode, would be) changed,
        False if nothing needed to be done or the file could not be parsed.

    NOTE: the added import path is ``../slots/<slot_name>``, i.e. this assumes
    each centralized slot lives in a file named after the slot.
    """
    try:
        data = load_yaml(class_file)
    except _YAML_LOAD_ERRORS as e:
        print(f"Warning: Could not parse {class_file}: {e}")
        return False

    original_slots = _slots_mapping(data)
    if not original_slots:
        return False

    # Only mapping-valued entries are definitions; leave everything else alone.
    wanted = set(slots_to_remove)
    removed = [
        slot_name
        for slot_name, slot_def in original_slots.items()
        if slot_name in wanted and isinstance(slot_def, dict)
    ]
    if not removed:
        return False

    # Update slots section
    removed_names = set(removed)
    remaining_slots = {
        slot_name: slot_def
        for slot_name, slot_def in original_slots.items()
        if slot_name not in removed_names
    }
    if remaining_slots:
        data['slots'] = remaining_slots
    else:
        del data['slots']

    # Add imports for centralized slots. An empty "imports:" key parses to None.
    imports = data.get('imports') or []
    added = 0
    for slot_name in removed:
        import_path = f'../slots/{slot_name}'
        if import_path not in imports:
            imports.append(import_path)
            added += 1
    data['imports'] = imports

    # Save updated file
    if dry_run:
        print(f" [DRY-RUN] Would update {class_file.name}: remove {len(removed)} inline slots, add {added} imports")
    else:
        save_yaml(class_file, data)
        print(f" Updated {class_file.name}: removed {len(removed)} inline slots")
    return True


def main():
    """Command-line entry point: centralize inline slots (optionally as a dry run)."""
    parser = argparse.ArgumentParser(description='Centralize inline slot definitions')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without writing')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show detailed output')
    args = parser.parse_args()

    print("=" * 60)
    print("Centralizing Inline Slot Definitions")
    print("=" * 60)

    # Get existing centralized slots
    print("\n1. Checking existing centralized slots...")
    existing_slots = get_existing_centralized_slots()
    print(f" Found {len(existing_slots)} slots already in modules/slots/")

    # Extract inline slots from all class files
    print("\n2. Extracting inline slot definitions from class files...")
    all_inline_slots = {}  # slot_name -> (definition, source_file)
    class_files_with_inline = {}  # file -> list of slot names
    conflicts = defaultdict(list)  # slot_name -> files whose definition differs

    for class_file in sorted(CLASSES_DIR.glob("*.yaml")):
        inline_slots = extract_inline_slots(class_file)
        if not inline_slots:
            continue
        class_files_with_inline[class_file] = list(inline_slots)
        for slot_name, slot_def in inline_slots.items():
            if slot_name not in all_inline_slots:
                all_inline_slots[slot_name] = (slot_def, class_file.name)
            elif all_inline_slots[slot_name][0] != slot_def:
                # First definition wins; record the others so the loss is visible.
                conflicts[slot_name].append(class_file.name)

    print(f" Found {len(all_inline_slots)} unique inline slot definitions")
    print(f" Across {len(class_files_with_inline)} class files")

    if conflicts:
        print(f" Warning: {len(conflicts)} slots have differing inline definitions (first one is kept):")
        for slot_name in sorted(conflicts):
            kept_in = all_inline_slots[slot_name][1]
            print(f" - {slot_name}: kept {kept_in}, differs in {', '.join(conflicts[slot_name])}")

    # Find slots that need to be created
    print("\n3. Identifying slots to create...")
    slots_to_create = {}
    slots_already_exist = set()

    for slot_name, (slot_def, source) in all_inline_slots.items():
        if slot_name in existing_slots:
            slots_already_exist.add(slot_name)
        else:
            slots_to_create[slot_name] = (slot_def, source)

    print(f" {len(slots_already_exist)} slots already exist (will remove inline, keep import)")
    print(f" {len(slots_to_create)} slots need to be created")

    if args.verbose:
        print("\n Slots to create:")
        for name in sorted(slots_to_create.keys())[:20]:
            print(f" - {name}")
        if len(slots_to_create) > 20:
            print(f" ... and {len(slots_to_create) - 20} more")

    # Create missing slot files
    print("\n4. Creating centralized slot files...")
    created_count = 0
    unresolved = set()  # slots with no centralized definition to import

    for slot_name in sorted(slots_to_create):
        slot_def, source = slots_to_create[slot_name]
        slot_file = SLOTS_DIR / f"{slot_name}.yaml"

        if slot_file.exists():
            # The file is there but did not yield this slot (otherwise it would
            # be in existing_slots). Removing the inline definition would lose it.
            unresolved.add(slot_name)
            print(f" Warning: {slot_file.name} exists but does not define {slot_name}; keeping inline definition")
            continue

        content = generate_slot_file_content(slot_name, slot_def, source)

        if args.dry_run:
            print(f" [DRY-RUN] Would create: {slot_file.name}")
        else:
            save_yaml(slot_file, content)
            if args.verbose:
                print(f" Created: {slot_file.name}")

        created_count += 1

    print(f" {'Would create' if args.dry_run else 'Created'} {created_count} slot files")

    # Update class files
    print("\n5. Updating class files to remove inline definitions...")
    updated_count = 0

    for class_file, slot_names in class_files_with_inline.items():
        removable = [name for name in slot_names if name not in unresolved]
        # Count only files that actually change.
        if update_class_file(class_file, removable, args.dry_run):
            updated_count += 1

    print(f" {'Would update' if args.dry_run else 'Updated'} {updated_count} class files")

    # Summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f" Slots already centralized: {len(slots_already_exist)}")
    print(f" New slot files {'to create' if args.dry_run else 'created'}: {created_count}")
    print(f" Class files {'to update' if args.dry_run else 'updated'}: {updated_count}")

    if args.dry_run:
        print("\n Run without --dry-run to apply changes.")
    else:
        print("\n Done! Remember to:")
        print(" 1. Run linkml-validate to verify schema integrity")
        print(" 2. Update manifest.json")
        print(" 3. Commit changes")


if __name__ == '__main__':
    main()