#!/usr/bin/env python3
"""
Cleanup redundant slot_usage entries from LinkML class files.

This script removes slot_usage entries that merely re-declare the same
range and inlined values already defined in the generic slot definition.

Per Rule 49 (slot-usage-minimization-rule.md):
- REMOVE entries that only contain range/inlined matching the generic slot
- KEEP entries with other modifications (required, pattern, examples, etc.)
- TOLERATE description-only changes (semantic definiteness)

Target slots:
- template_specificity: range=TemplateSpecificityScores, inlined=true
- specificity_annotation: range=SpecificityAnnotation, inlined=true

Usage:
    python scripts/cleanup_redundant_slot_usage.py --dry-run  # Preview changes
    python scripts/cleanup_redundant_slot_usage.py  # Apply changes
    python scripts/cleanup_redundant_slot_usage.py --classes-dir PATH  # Other directory
"""

import argparse
import re
from pathlib import Path
from typing import NamedTuple

import yaml


class SlotUsageStats(NamedTuple):
    """Statistics for slot_usage cleanup."""

    files_scanned: int
    files_modified: int
    entries_removed: int
    entries_kept: int
    empty_slot_usage_removed: int


# Generic slot definitions - if slot_usage only has these values, it's redundant
REDUNDANT_SLOTS = {
    'template_specificity': {
        'range': 'TemplateSpecificityScores',
        'inlined': True,
    },
    'specificity_annotation': {
        'range': 'SpecificityAnnotation',
        'inlined': True,
    },
}

# Directory scanned when --classes-dir is not given on the command line.
DEFAULT_CLASSES_DIR = Path('/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/classes')


def is_redundant_slot_usage(slot_name: str, slot_config: dict) -> bool:
    """
    Check if a slot_usage entry is redundant.

    Args:
        slot_name: Name of the slot inside a class's ``slot_usage`` mapping.
        slot_config: The mapping declared for that slot.

    Returns:
        True if ``slot_name`` is one of REDUNDANT_SLOTS and the entry contains
        nothing but ``range``/``inlined`` values that match (or omit) the
        generic definition. False if the slot is not a target, if either value
        differs, if any other key is present, or if ``slot_config`` is not a
        mapping.
    """
    generic = REDUNDANT_SLOTS.get(slot_name)
    if generic is None or not isinstance(slot_config, dict):
        return False

    # Any key besides range/inlined (description, required, pattern, ...)
    # makes the entry meaningful, so it must be kept.
    if set(slot_config) - {'range', 'inlined'}:
        return False

    # A key that is absent (None) counts as matching the generic definition.
    for key in ('range', 'inlined'):
        actual = slot_config.get(key)
        if actual is not None and actual != generic[key]:
            return False

    return True


def process_yaml_file(file_path: Path, dry_run: bool = True) -> tuple[bool, int, int]:
    """
    Process a single YAML file to remove redundant slot_usage entries.

    Args:
        file_path: Path of the YAML file to inspect.
        dry_run: When True (the default) the file is only analysed and never
            written.

    Returns:
        tuple of (file_modified, entries_removed, entries_kept).
        ``entries_kept`` counts every mapping-valued slot_usage entry that was
        not judged redundant, including slots that are not cleanup targets.
        Files that fail to parse, or that have no ``classes`` mapping, yield
        ``(False, 0, 0)``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f" Warning: Could not parse {file_path}: {e}")
        return False, 0, 0

    # Guard against empty documents, non-mapping roots, and a `classes:` key
    # whose value is null or otherwise not a mapping.
    if not isinstance(data, dict):
        return False, 0, 0
    classes = data.get('classes')
    if not isinstance(classes, dict):
        return False, 0, 0

    modified = False
    entries_removed = 0
    entries_kept = 0

    for class_def in classes.values():
        if not isinstance(class_def, dict):
            continue

        slot_usage = class_def.get('slot_usage')
        if not slot_usage or not isinstance(slot_usage, dict):
            continue

        # Collect names first; deleting while iterating the dict would raise.
        slots_to_remove = []
        for slot_name, slot_config in slot_usage.items():
            if not isinstance(slot_config, dict):
                continue
            if is_redundant_slot_usage(slot_name, slot_config):
                slots_to_remove.append(slot_name)
            else:
                entries_kept += 1

        if not slots_to_remove:
            continue

        for slot_name in slots_to_remove:
            del slot_usage[slot_name]
        entries_removed += len(slots_to_remove)
        modified = True

        # Drop the slot_usage key entirely once the cleanup has emptied it.
        if not slot_usage:
            del class_def['slot_usage']

    if modified and not dry_run:
        # NOTE: the file is re-serialised from the parsed data, so YAML
        # comments and the original formatting/quoting are not preserved.
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(
                data,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=120,
            )

    return modified, entries_removed, entries_kept


def main(argv=None):
    """
    Command-line entry point.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the process command line.

    Returns:
        Process exit status: 0 on success, 1 if the classes directory does
        not exist.
    """
    parser = argparse.ArgumentParser(
        description='Remove redundant slot_usage entries from LinkML class files.'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview changes without modifying files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show details for each file'
    )
    parser.add_argument(
        '--classes-dir',
        type=Path,
        default=DEFAULT_CLASSES_DIR,
        help='Directory containing the LinkML class YAML files (default: %(default)s)'
    )
    args = parser.parse_args(argv)

    classes_dir = args.classes_dir

    if not classes_dir.exists():
        print(f"Error: Classes directory not found: {classes_dir}")
        return 1

    yaml_files = sorted(classes_dir.glob('*.yaml'))

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Scanning {len(yaml_files)} class files...")
    print(f"Target slots: {', '.join(REDUNDANT_SLOTS.keys())}")
    print()

    stats = {
        'files_scanned': 0,
        'files_modified': 0,
        'entries_removed': 0,
        'entries_kept': 0,
    }

    for yaml_file in yaml_files:
        stats['files_scanned'] += 1
        modified, removed, kept = process_yaml_file(yaml_file, dry_run=args.dry_run)

        if modified:
            stats['files_modified'] += 1
            if args.verbose or args.dry_run:
                print(f" {'Would modify' if args.dry_run else 'Modified'}: {yaml_file.name} (removed {removed} entries)")

        stats['entries_removed'] += removed
        stats['entries_kept'] += kept

    print()
    print("=" * 60)
    print("Summary:")
    print(f" Files scanned: {stats['files_scanned']}")
    print(f" Files modified: {stats['files_modified']}")
    print(f" Entries removed: {stats['entries_removed']}")
    print(f" Entries kept: {stats['entries_kept']}")
    print("=" * 60)

    if args.dry_run and stats['files_modified'] > 0:
        print()
        print("Run without --dry-run to apply changes.")

    return 0


if __name__ == '__main__':
    # SystemExit works even when the site-provided `exit` helper is unavailable.
    raise SystemExit(main())