glam/scripts/cleanup_redundant_slot_usage.py
2026-01-12 14:33:56 +01:00

212 lines
6.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Cleanup redundant slot_usage entries from LinkML class files.
This script removes slot_usage entries that merely re-declare the same
range and inlined values already defined in the generic slot definition.
Per Rule 49 (slot-usage-minimization-rule.md):
- REMOVE entries that only contain range/inlined matching the generic slot
- KEEP entries with other modifications (required, pattern, examples, etc.)
- TOLERATE description-only changes (semantic definiteness)
Target slots:
- template_specificity: range=TemplateSpecificityScores, inlined=true
- specificity_annotation: range=SpecificityAnnotation, inlined=true
Usage:
python scripts/cleanup_redundant_slot_usage.py --dry-run # Preview changes
python scripts/cleanup_redundant_slot_usage.py # Apply changes
"""
import argparse
import re
from pathlib import Path
from typing import NamedTuple
import yaml
class SlotUsageStats(NamedTuple):
    """Immutable record of the counters produced by a slot_usage cleanup run."""

    # NOTE(review): the visible code never instantiates this type; main()
    # tracks the first four counters in a plain dict instead.

    # Number of class files looked at.
    files_scanned: int
    # Number of files in which at least one change was made.
    files_modified: int
    # Number of slot_usage entries deleted as redundant.
    entries_removed: int
    # Number of slot_usage entries left in place.
    entries_kept: int
    # Presumably the number of slot_usage mappings dropped because they
    # became empty -- TODO confirm; nothing in the visible code sets it.
    empty_slot_usage_removed: int
# Range/inlined settings of the generic slot definitions, keyed by slot name.
# A slot_usage entry for one of these slots that sets nothing beyond these
# same values is considered redundant.
REDUNDANT_SLOTS = {
    'template_specificity': {'range': 'TemplateSpecificityScores', 'inlined': True},
    'specificity_annotation': {'range': 'SpecificityAnnotation', 'inlined': True},
}
def is_redundant_slot_usage(slot_name: str, slot_config: dict) -> bool:
    """
    Tell whether a slot_usage entry adds nothing to the generic slot.

    Args:
        slot_name: Name of the slot the slot_usage entry refers to.
        slot_config: The entry's own mapping of property name to value.

    Returns:
        True only when ``slot_name`` is one of the slots in REDUNDANT_SLOTS
        and ``slot_config`` holds no keys besides ``range``/``inlined``, each
        of which is either absent (or None) or equal to the generic value.
        False in every other case.
    """
    expected = REDUNDANT_SLOTS.get(slot_name)
    if expected is None:
        # Not one of the targeted slots: never touch it.
        return False

    # Any extra key (description, required, pattern, ...) makes the entry
    # a meaningful override that has to stay.
    if slot_config.keys() - {'range', 'inlined'}:
        return False

    # What remains may only restate the generic values (or omit them).
    for key in ('range', 'inlined'):
        declared = slot_config.get(key)
        if declared is None or declared == expected[key]:
            continue
        return False
    return True
def process_yaml_file(file_path: Path, dry_run: bool = True) -> tuple[bool, int, int]:
    """
    Remove redundant slot_usage entries from a single YAML class file.

    The file is parsed with ``yaml.safe_load``. For every class that has a
    ``slot_usage`` mapping, each entry is checked with
    ``is_redundant_slot_usage`` and redundant entries are deleted; a
    ``slot_usage`` mapping left empty by those deletions is removed too.

    Args:
        file_path: Path of the YAML file to process.
        dry_run: When true (the default) the changes are only computed; the
            file on disk is left untouched.

    Returns:
        tuple of (file_modified, entries_removed, entries_kept). A file that
        cannot be parsed, or whose top level / ``classes`` value is not a
        mapping, yields ``(False, 0, 0)``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f" Warning: Could not parse {file_path}: {e}")
        return False, 0, 0

    # An empty document loads as None and a bare "classes:" key loads with a
    # None value; a membership test alone would let those through and crash
    # on .items(), so require real mappings at both levels.
    if not isinstance(data, dict):
        return False, 0, 0
    classes = data.get('classes')
    if not isinstance(classes, dict):
        return False, 0, 0

    modified = False
    entries_removed = 0
    entries_kept = 0

    for class_def in classes.values():
        if not isinstance(class_def, dict):
            continue
        slot_usage = class_def.get('slot_usage')
        if not slot_usage or not isinstance(slot_usage, dict):
            continue

        # Collect first, delete afterwards: the mapping must not change size
        # while it is being iterated.
        slots_to_remove = []
        for slot_name, slot_config in slot_usage.items():
            if not isinstance(slot_config, dict):
                # Non-mapping entries are neither counted nor touched.
                continue
            if is_redundant_slot_usage(slot_name, slot_config):
                slots_to_remove.append(slot_name)
            else:
                entries_kept += 1

        if not slots_to_remove:
            continue

        for slot_name in slots_to_remove:
            del slot_usage[slot_name]
        entries_removed += len(slots_to_remove)
        modified = True

        # Drop the slot_usage key altogether once nothing is left in it.
        if not slot_usage:
            del class_def['slot_usage']

    if modified and not dry_run:
        # NOTE: yaml.dump re-serializes the whole document, so comments and
        # the original formatting of the file are not preserved.
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=120)

    return modified, entries_removed, entries_kept
def main():
    """
    Command-line entry point: scan the class files and report or apply cleanup.

    Options:
        --dry-run       Preview changes without modifying files.
        --verbose / -v  Show details for each modified file.
        --classes-dir   Directory holding the class YAML files; defaults to
                        the location that used to be hard-coded.

    Returns:
        1 if the classes directory does not exist, otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description='Remove redundant slot_usage entries from LinkML class files.'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview changes without modifying files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show details for each file'
    )
    # The default keeps the historical location so existing invocations behave
    # the same; the option lets the script run from any other checkout.
    parser.add_argument(
        '--classes-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/classes'),
        help='Directory containing the class YAML files (default: %(default)s)'
    )
    args = parser.parse_args()

    classes_dir = args.classes_dir
    if not classes_dir.exists():
        print(f"Error: Classes directory not found: {classes_dir}")
        return 1

    # Sorted once so files are processed in a stable, name-based order.
    yaml_files = sorted(classes_dir.glob('*.yaml'))
    print(f"{'[DRY RUN] ' if args.dry_run else ''}Scanning {len(yaml_files)} class files...")
    print(f"Target slots: {', '.join(REDUNDANT_SLOTS.keys())}")
    print()

    stats = {
        'files_scanned': 0,
        'files_modified': 0,
        'entries_removed': 0,
        'entries_kept': 0,
    }

    for yaml_file in yaml_files:
        stats['files_scanned'] += 1
        modified, removed, kept = process_yaml_file(yaml_file, dry_run=args.dry_run)
        if modified:
            stats['files_modified'] += 1
            # A dry run always lists the affected files, since that listing
            # is the preview itself.
            if args.verbose or args.dry_run:
                print(f" {'Would modify' if args.dry_run else 'Modified'}: {yaml_file.name} (removed {removed} entries)")
        stats['entries_removed'] += removed
        stats['entries_kept'] += kept

    print()
    print("=" * 60)
    print("Summary:")
    print(f" Files scanned: {stats['files_scanned']}")
    print(f" Files modified: {stats['files_modified']}")
    print(f" Entries removed: {stats['entries_removed']}")
    print(f" Entries kept: {stats['entries_kept']}")
    print("=" * 60)

    if args.dry_run and stats['files_modified'] > 0:
        print()
        print("Run without --dry-run to apply changes.")

    return 0
if __name__ == '__main__':
    # exit() is a helper injected by the site module for interactive sessions
    # and is not guaranteed to exist (e.g. under "python -S"); raising
    # SystemExit directly hands main()'s return value to the interpreter as
    # the process exit status without relying on it.
    raise SystemExit(main())