feat(scripts): improve types-vocab extraction to derive all vocabulary from schema

- Remove hardcoded type mappings, derive dynamically from LinkML
- Extract keywords from annotations, structured_aliases, and comments
- Add rename_plural_slot.py utility for schema slot renaming
This commit is contained in:
kempersc 2026-01-10 15:37:52 +01:00
parent ec18e1810d
commit ad74d8379e
3 changed files with 14211 additions and 7079 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""
Rename a plural slot to singular form following Rule 43.
This script:
1. Renames the slot file (identifiers.yaml → identifier.yaml)
2. Updates slot name inside the file
3. Updates all class files that reference the slot
4. Preserves natural language descriptions containing the plural word
Usage:
python scripts/rename_plural_slot.py identifiers identifier --dry-run
python scripts/rename_plural_slot.py identifiers identifier
"""
import argparse
import os
import re
import sys
from pathlib import Path
# Directory containing one "<slot_name>.yaml" file per slot. The path is
# relative: main() changes the working directory to the repository root
# before these constants are used.
SLOTS_DIR = Path("schemas/20251121/linkml/modules/slots")
# Directory whose "*.yaml" class files are scanned for references to the slot.
CLASSES_DIR = Path("schemas/20251121/linkml/modules/classes")
def _rewrite_slot_definition(content: str, old_name: str, new_name: str) -> str:
    """Return *content* with the slot's ``id``, ``name`` and YAML key renamed."""
    # Escape the old name so regex metacharacters in it are matched literally.
    old = re.escape(old_name)

    # Callable replacements are used throughout so that new_name is inserted
    # verbatim; a template string would interpret backslashes inside it.

    # id field: "id: <base>/slot/<old>" -> "id: <base>/slot/<new>"
    content = re.sub(
        rf"^(id: .*/slot/){old}$",
        lambda m: m.group(1) + new_name,
        content,
        flags=re.MULTILINE,
    )
    # name field, with or without the "_slot" suffix
    # (e.g., name: identifiers_slot → name: identifier_slot)
    content = re.sub(
        rf"^(name: ){old}(_slot)?$",
        lambda m: m.group(1) + new_name + (m.group(2) or ""),
        content,
        flags=re.MULTILINE,
    )
    # Slot key directly below "slots:". Only a YAML key (followed by ":") is
    # matched; any indentation width is accepted.
    content = re.sub(
        rf"^(slots:\s*\n[ \t]+){old}:",
        lambda m: m.group(1) + new_name + ":",
        content,
        flags=re.MULTILINE,
    )
    # Slot key that is not right after "slots:" - an indented key alone on its
    # line. Prose such as "External identifiers assigned" never matches.
    content = re.sub(
        rf"^([ \t]+){old}:$",
        lambda m: m.group(1) + new_name + ":",
        content,
        flags=re.MULTILINE,
    )
    return content


def rename_slot_file(
    old_name: str,
    new_name: str,
    dry_run: bool = False,
    *,
    slots_dir: "Path | None" = None,
) -> bool:
    """Rename the slot YAML file and update its contents.

    Args:
        old_name: Current slot name; ``<old_name>.yaml`` must exist.
        new_name: New slot name; ``<new_name>.yaml`` must not exist yet.
        dry_run: When true, print the planned rename and a preview of the
            first 30 lines of the rewritten file without touching the disk.
        slots_dir: Directory holding the slot files. Defaults to the
            module-level ``SLOTS_DIR``.

    Returns:
        ``True`` on success (or a successful dry run); ``False`` when the
        source file is missing or the target file already exists.
    """
    base_dir = SLOTS_DIR if slots_dir is None else Path(slots_dir)
    old_path = base_dir / f"{old_name}.yaml"
    new_path = base_dir / f"{new_name}.yaml"

    if not old_path.exists():
        print(f"ERROR: Slot file not found: {old_path}")
        return False
    if new_path.exists():
        print(f"ERROR: Target slot file already exists: {new_path}")
        return False

    # YAML is UTF-8; do not depend on the platform's default encoding.
    content = _rewrite_slot_definition(
        old_path.read_text(encoding="utf-8"), old_name, new_name
    )

    if dry_run:
        print(f"[DRY-RUN] Would rename: {old_path} → {new_path}")
        print("[DRY-RUN] Updated content preview:")
        # Show first 30 lines
        for i, line in enumerate(content.split('\n')[:30], 1):
            print(f" {i:3d}| {line}")
        return True

    # Write the new file first and only then remove the old one, so a failed
    # write never leaves the slot without a file.
    new_path.write_text(content, encoding="utf-8")
    old_path.unlink()
    print(f"✓ Renamed slot file: {old_path.name} → {new_path.name}")
    return True
def update_class_files(
    old_name: str,
    new_name: str,
    dry_run: bool = False,
    *,
    classes_dir: "Path | None" = None,
) -> int:
    """Update all class files that reference the slot.

    Three kinds of reference are rewritten in every ``*.yaml`` file of the
    classes directory:

    * import entries:      ``- ../slots/<old>``
    * slot list entries:   ``    - <old>``
    * indented YAML keys:  ``    <old>:`` (alone or followed by a value)

    Free text that merely contains the word (for example a description such
    as "External identifiers assigned") is left untouched.

    Args:
        old_name: Slot name to replace.
        new_name: Replacement slot name.
        dry_run: When true, print a line-by-line diff instead of writing.
        classes_dir: Directory holding the class files. Defaults to the
            module-level ``CLASSES_DIR``.

    Returns:
        The number of class files that were (or, in a dry run, would be)
        changed.
    """
    base_dir = CLASSES_DIR if classes_dir is None else Path(classes_dir)

    # Escape the old name so regex metacharacters in it are matched literally,
    # and compile the patterns once instead of once per file.
    old = re.escape(old_name)
    # Each rule is (pattern, text re-appended after the new name); group 1 of
    # every pattern is the prefix that must be preserved.
    rules = (
        # Import statement: - ../slots/identifiers → - ../slots/identifier
        (re.compile(rf"^([ \t]*- \.\./slots/){old}$", re.MULTILINE), ""),
        # Slot reference in an indented slots list: "    - identifiers"
        (re.compile(rf"^([ \t]+- ){old}$", re.MULTILINE), ""),
        # Indented YAML key, either alone on the line or followed by a value.
        (re.compile(rf"^([ \t]+){old}:(?= |$)", re.MULTILINE), ":"),
    )

    updated_count = 0
    for class_file in sorted(base_dir.glob("*.yaml")):
        # YAML is UTF-8; do not depend on the platform's default encoding.
        original_content = class_file.read_text(encoding="utf-8")

        content = original_content
        for pattern, suffix in rules:
            # A callable replacement inserts new_name verbatim (no backslash
            # or group-reference interpretation).
            content = pattern.sub(
                lambda m, s=suffix: m.group(1) + new_name + s, content
            )

        if content == original_content:
            continue

        updated_count += 1
        if dry_run:
            print(f"[DRY-RUN] Would update: {class_file.name}")
            # Show diff-like output; substitutions never add or remove
            # lines, so the two versions can be compared pairwise.
            old_lines = original_content.split('\n')
            new_lines = content.split('\n')
            for i, (old_line, new_line) in enumerate(zip(old_lines, new_lines), 1):
                if old_line != new_line:
                    print(f" Line {i}:")
                    print(f" - {old_line}")
                    print(f" + {new_line}")
        else:
            class_file.write_text(content, encoding="utf-8")
            print(f"✓ Updated class: {class_file.name}")
    return updated_count
def main() -> None:
    """Command-line entry point: rename a slot and update referencing classes.

    Parses ``old_name``, ``new_name`` and ``--dry-run``, changes the working
    directory to the repository root (the parent of this script's directory)
    so the relative schema paths resolve, then renames the slot file and
    updates the class files. Exits with status 1 when the slot file cannot
    be renamed.
    """
    parser = argparse.ArgumentParser(
        description="Rename a plural slot to singular form (Rule 43)"
    )
    parser.add_argument("old_name", help="Current slot name (e.g., 'identifiers')")
    parser.add_argument("new_name", help="New slot name (e.g., 'identifier')")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be changed without making changes")
    args = parser.parse_args()

    # Change to repo root. Resolve first: __file__ may be a relative path, in
    # which case ".parent.parent" would depend on the caller's working
    # directory instead of the script's real location.
    repo_root = Path(__file__).resolve().parent.parent
    os.chdir(repo_root)

    print("=" * 60)
    print(f"Renaming slot: {args.old_name} → {args.new_name}")
    print(f"Mode: {'DRY-RUN' if args.dry_run else 'LIVE'}")
    print("=" * 60)

    # Step 1: Rename slot file; abort before touching any class file if the
    # slot file is missing or the target already exists.
    print("\n[Step 1] Renaming slot file...")
    if not rename_slot_file(args.old_name, args.new_name, args.dry_run):
        sys.exit(1)

    # Step 2: Update class files
    print("\n[Step 2] Updating class files...")
    count = update_class_files(args.old_name, args.new_name, args.dry_run)
    print(f"\n{'Would update' if args.dry_run else 'Updated'} {count} class files")

    if args.dry_run:
        print("\n[DRY-RUN] No changes made. Run without --dry-run to apply changes.")
    else:
        print(f"\n✓ Successfully renamed slot: {args.old_name} → {args.new_name}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()