212 lines
6.5 KiB
Python
Executable file
212 lines
6.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Cleanup redundant slot_usage entries from LinkML class files.
|
|
|
|
This script removes slot_usage entries that merely re-declare the same
|
|
range and inlined values already defined in the generic slot definition.
|
|
|
|
Per Rule 49 (slot-usage-minimization-rule.md):
|
|
- REMOVE entries that only contain range/inlined matching the generic slot
|
|
- KEEP entries with other modifications (required, pattern, examples, etc.)
|
|
- TOLERATE description-only changes (semantic definiteness)
|
|
|
|
Target slots:
|
|
- template_specificity: range=TemplateSpecificityScores, inlined=true
|
|
- specificity_annotation: range=SpecificityAnnotation, inlined=true
|
|
|
|
Usage:
|
|
python scripts/cleanup_redundant_slot_usage.py --dry-run # Preview changes
|
|
python scripts/cleanup_redundant_slot_usage.py # Apply changes
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
from pathlib import Path
|
|
from typing import NamedTuple
|
|
|
|
import yaml
|
|
|
|
|
|
class SlotUsageStats(NamedTuple):
    """Immutable record of the counters gathered during a cleanup run."""

    # Number of class files that were looked at.
    files_scanned: int
    # Number of files in which at least one change was made.
    files_modified: int
    # Number of slot_usage entries that were dropped.
    entries_removed: int
    # Number of slot_usage entries that were left in place.
    entries_kept: int
    # Number of slot_usage mappings removed because they ended up empty.
    empty_slot_usage_removed: int
|
|
|
|
|
|
# Generic definition of each target slot. A slot_usage entry that only repeats
# these values adds nothing and is treated as redundant.
REDUNDANT_SLOTS = {
    slot: {'range': range_class, 'inlined': True}
    for slot, range_class in (
        ('template_specificity', 'TemplateSpecificityScores'),
        ('specificity_annotation', 'SpecificityAnnotation'),
    )
}
|
|
|
|
|
|
def is_redundant_slot_usage(slot_name: str, slot_config: dict) -> bool:
    """
    Decide whether a slot_usage entry adds nothing over the generic slot.

    An entry is redundant only when all of the following hold:

    - ``slot_name`` is one of the slots listed in ``REDUNDANT_SLOTS``;
    - the entry has no keys other than ``range`` and ``inlined``;
    - each of ``range`` / ``inlined`` is either missing (or None) or equal
      to the generic value for that slot.

    Any other key (description, required, pattern, ...) or any differing
    value makes the entry meaningful, and False is returned.
    """
    generic = REDUNDANT_SLOTS.get(slot_name)
    if generic is None:
        # Not one of the slots this script is allowed to touch.
        return False

    checked_keys = ('range', 'inlined')

    # Anything beyond range/inlined is a genuine refinement of the slot.
    if any(key not in checked_keys for key in slot_config):
        return False

    # A key that is absent (or None) simply inherits the generic value;
    # a key that is present must repeat it exactly.
    for key in checked_keys:
        value = slot_config.get(key)
        if value is not None and value != generic[key]:
            return False

    return True
|
|
|
|
|
|
def process_yaml_file(file_path: Path, dry_run: bool = True) -> tuple[bool, int, int]:
    """
    Remove redundant slot_usage entries from a single LinkML class file.

    The file is parsed with ``yaml.safe_load``. For every class under the
    top-level ``classes`` mapping, slot_usage entries that
    ``is_redundant_slot_usage`` reports as redundant are dropped, and a
    ``slot_usage`` mapping left empty by those removals is deleted too.

    Args:
        file_path: YAML file to process.
        dry_run: When True (the default) nothing is written to disk; only
            the counts are reported.

    Returns:
        tuple of (file_modified, entries_removed, entries_kept).
        ``entries_kept`` counts every mapping-valued slot_usage entry that
        was not removed, whatever its slot name. A file that cannot be
        parsed, or that has no ``classes`` mapping, yields (False, 0, 0).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f" Warning: Could not parse {file_path}: {e}")
        return False, 0, 0

    # safe_load may return None, a scalar or a list, and a bare ``classes:``
    # key loads as None. None of those have anything to clean up, and
    # treating them as mappings would raise AttributeError.
    if not isinstance(data, dict):
        return False, 0, 0
    classes = data.get('classes')
    if not isinstance(classes, dict):
        return False, 0, 0

    modified = False
    entries_removed = 0
    entries_kept = 0

    for class_def in classes.values():
        if not isinstance(class_def, dict):
            continue

        slot_usage = class_def.get('slot_usage')
        if not slot_usage or not isinstance(slot_usage, dict):
            continue

        # Collect first, delete afterwards: the mapping must not change size
        # while it is being iterated.
        slots_to_remove = []
        for slot_name, slot_config in slot_usage.items():
            if not isinstance(slot_config, dict):
                continue

            if is_redundant_slot_usage(slot_name, slot_config):
                slots_to_remove.append(slot_name)
                entries_removed += 1
            else:
                entries_kept += 1

        if not slots_to_remove:
            continue

        for slot_name in slots_to_remove:
            del slot_usage[slot_name]
        modified = True

        # Do not leave an empty ``slot_usage: {}`` behind.
        if not slot_usage:
            del class_def['slot_usage']

    if modified and not dry_run:
        # NOTE: the file is re-serialised by PyYAML, so comments and the
        # original layout/quoting of the file are NOT preserved.
        # Render to a string first so that a serialisation error cannot
        # leave a truncated file on disk.
        rendered = yaml.dump(
            data,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120,
        )
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(rendered)

    return modified, entries_removed, entries_kept
|
|
|
|
|
|
def main() -> int:
    """
    Command-line entry point.

    Parses the command line, runs ``process_yaml_file`` on every ``*.yaml``
    file directly inside the classes directory (in sorted order) and prints
    a summary of the counts.

    Options:
        --dry-run       Preview changes without modifying files.
        --verbose, -v   Show details for each modified file.
        --classes-dir   Directory holding the class YAML files; defaults to
                        the location this script was originally written for.

    Returns:
        0 on success, 1 when the classes directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Remove redundant slot_usage entries from LinkML class files.'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview changes without modifying files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show details for each file'
    )
    # The directory used to be hard-coded; it is kept as the default so that
    # existing invocations behave exactly as before.
    parser.add_argument(
        '--classes-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/schemas/20251121/linkml/modules/classes'),
        help='Directory containing the LinkML class YAML files (default: %(default)s)'
    )
    args = parser.parse_args()

    classes_dir = args.classes_dir

    # is_dir() rather than exists(): a regular file at this path would
    # otherwise be "scanned" as zero files and reported as a success.
    if not classes_dir.is_dir():
        print(f"Error: Classes directory not found: {classes_dir}")
        return 1

    yaml_files = sorted(classes_dir.glob('*.yaml'))

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Scanning {len(yaml_files)} class files...")
    print(f"Target slots: {', '.join(REDUNDANT_SLOTS.keys())}")
    print()

    stats = {
        'files_scanned': 0,
        'files_modified': 0,
        'entries_removed': 0,
        'entries_kept': 0,
    }

    for yaml_file in yaml_files:
        stats['files_scanned'] += 1

        modified, removed, kept = process_yaml_file(yaml_file, dry_run=args.dry_run)

        if modified:
            stats['files_modified'] += 1
            # A dry run always lists the affected files; a real run only
            # does so when --verbose is given.
            if args.verbose or args.dry_run:
                print(f" {'Would modify' if args.dry_run else 'Modified'}: {yaml_file.name} (removed {removed} entries)")

        stats['entries_removed'] += removed
        stats['entries_kept'] += kept

    print()
    print("=" * 60)
    print("Summary:")
    print(f" Files scanned: {stats['files_scanned']}")
    print(f" Files modified: {stats['files_modified']}")
    print(f" Entries removed: {stats['entries_removed']}")
    print(f" Entries kept: {stats['entries_kept']}")
    print("=" * 60)

    if args.dry_run and stats['files_modified'] > 0:
        print()
        print("Run without --dry-run to apply changes.")

    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # The builtin ``exit`` is injected by the ``site`` module for interactive
    # use and is absent under ``python -S``; SystemExit is always available
    # and propagates main()'s return value as the process exit status.
    raise SystemExit(main())
|