glam/scripts/extract_inline_slots.py

222 lines
7.1 KiB
Python

#!/usr/bin/env python3
"""
Extract inline slots from LinkML class files to individual slot files.
Per Rule 38 (AGENTS.md), all LinkML slots MUST be centralized in
schemas/20251121/linkml/modules/slots/, never inline in class files.
Usage:
    python scripts/extract_inline_slots.py [--dry-run] [--file PATH]

Options:
    --dry-run    Show what would be done without making changes
    --file PATH  Process only a single file
"""
import argparse
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import yaml
# Locations of the LinkML schema modules (relative paths).
SCHEMA_ROOT = Path("schemas/20251121/linkml")
CLASSES_DIR = SCHEMA_ROOT.joinpath("modules", "classes")
SLOTS_DIR = SCHEMA_ROOT.joinpath("modules", "slots")

# Prefix name -> namespace URI table consulted when generating slot files.
STANDARD_PREFIXES = {
    "linkml": "https://w3id.org/linkml/",
    "hc": "https://nde.nl/ontology/hc/",
    "schema": "http://schema.org/",
    "dcterms": "http://purl.org/dc/terms/",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "rico": "https://www.ica.org/standards/RiC/ontology#",
    "prov": "http://www.w3.org/ns/prov#",
    "crm": "http://www.cidoc-crm.org/cidoc-crm/",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "bf": "http://id.loc.gov/ontologies/bibframe/",
}
def extract_slot_prefix(slot_uri: str) -> Optional[str]:
    """Extract the prefix from a slot_uri like 'schema:description'.

    Args:
        slot_uri: A CURIE such as ``schema:description`` or a full
            ``http://`` / ``https://`` URI.

    Returns:
        The text before the first ``:``, or ``None`` when *slot_uri* is not
        a string, contains no ``:``, has an empty prefix, or is a full
        ``http://`` / ``https://`` URI.
    """
    if not isinstance(slot_uri, str):
        # Values loaded from YAML may be None or another scalar type.
        return None
    if slot_uri.startswith(("http://", "https://")):
        # A full URI carries no CURIE prefix.
        return None
    prefix, separator, _ = slot_uri.partition(":")
    if not separator:
        return None
    # Normalise an empty prefix (e.g. ":name") to None.
    return prefix or None
def get_required_prefixes(slot_def: Dict[str, Any]) -> Dict[str, str]:
    """Determine which prefixes are needed for this slot.

    The ``linkml`` and ``hc`` prefixes are always included. Any other prefix
    is added only if it is used by the slot's ``slot_uri`` or one of its
    ``*_mappings`` entries AND is listed in ``STANDARD_PREFIXES``.

    Args:
        slot_def: The slot definition mapping. A non-mapping value (for
            example ``None`` from an empty YAML slot body) yields only the
            two default prefixes.

    Returns:
        Mapping of prefix name to namespace URI.
    """
    prefixes = {"linkml": STANDARD_PREFIXES["linkml"], "hc": STANDARD_PREFIXES["hc"]}
    if not isinstance(slot_def, dict):
        return prefixes

    # Gather every CURIE-like value, slot_uri first, then the mappings in order.
    candidates = [slot_def.get("slot_uri")]
    for mapping_type in ("exact_mappings", "close_mappings", "related_mappings", "narrow_mappings", "broad_mappings"):
        mappings = slot_def.get(mapping_type)
        if isinstance(mappings, str):
            # A single scalar mapping; do not iterate it character by character.
            candidates.append(mappings)
        elif isinstance(mappings, (list, tuple)):
            candidates.extend(mappings)

    for candidate in candidates:
        if not isinstance(candidate, str):
            # Skip None / non-string entries instead of crashing on them.
            continue
        prefix = extract_slot_prefix(candidate)
        if prefix and prefix in STANDARD_PREFIXES:
            prefixes[prefix] = STANDARD_PREFIXES[prefix]
    return prefixes
def create_slot_file_content(slot_name: str, slot_def: Dict[str, Any]) -> str:
    """Render the YAML text of a standalone slot file for one slot.

    The document holds an id, name and title derived from *slot_name*, the
    prefixes returned by ``get_required_prefixes``, a ``linkml:types``
    import, ``hc`` as default prefix, and the slot definition itself under
    ``slots``.
    """
    needed_prefixes = get_required_prefixes(slot_def)
    readable_title = slot_name.replace('_', ' ').title()

    # Keys are inserted in the order they should appear in the output;
    # sort_keys=False below keeps that order.
    document = {}
    document["id"] = f"https://nde.nl/ontology/hc/slot/{slot_name}"
    document["name"] = f"{slot_name}_slot"
    document["title"] = f"{readable_title} Slot"
    document["prefixes"] = needed_prefixes
    document["imports"] = ["linkml:types"]
    document["default_prefix"] = "hc"
    document["slots"] = {slot_name: slot_def}

    return yaml.dump(
        document,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
        width=120,
    )
def parse_class_file(file_path: Path) -> Tuple[Dict[str, Any], Dict[str, Dict[str, Any]]]:
    """
    Parse a class file and extract inline slots.

    Args:
        file_path: Path of the YAML class file to read (decoded as UTF-8).

    Returns:
        Tuple of (full_yaml_dict, {slot_name: slot_definition}). The slot
        mapping is empty when the document is not a mapping or has no
        (or an empty) top-level ``slots`` key.

    Raises:
        ValueError: If the top-level ``slots`` value is present but is not
            a mapping.
        OSError, yaml.YAMLError: Propagated from reading / parsing the file.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding="utf-8") as f:
        content = yaml.safe_load(f)

    slots = content.get("slots") if isinstance(content, dict) else None
    if slots is None:
        # Missing key, or an empty "slots:" entry in the YAML.
        return content, {}
    if not isinstance(slots, dict):
        raise ValueError(
            f"top-level 'slots' must be a mapping, got {type(slots).__name__}"
        )
    return content, slots
def slot_file_exists(slot_name: str) -> bool:
    """Return True when ``<slot_name>.yaml`` is already present in SLOTS_DIR."""
    return SLOTS_DIR.joinpath(f"{slot_name}.yaml").exists()
def process_class_file(file_path: Path, dry_run: bool = False) -> Dict[str, Any]:
    """
    Process a single class file, extracting inline slots.

    Each inline slot that has no slot file yet is written to
    ``SLOTS_DIR/<slot_name>.yaml`` (or only reported when *dry_run* is set).
    Slots whose file already exists are skipped.

    Args:
        file_path: The class file to process.
        dry_run: When True, print what would be created without writing.

    Returns:
        Dict with statistics: {created: [], skipped: [], errors: []}.
        In dry-run mode ``created`` lists the slots that would be created.
    """
    stats = {"created": [], "skipped": [], "errors": []}
    try:
        _, slots = parse_class_file(file_path)
    except Exception as e:
        stats["errors"].append(f"Failed to parse {file_path}: {e}")
        return stats
    if not slots:
        return stats
    if not isinstance(slots, dict):
        # Guard so a malformed file is reported instead of raising on .items().
        stats["errors"].append(
            f"Failed to parse {file_path}: 'slots' is not a mapping "
            f"(got {type(slots).__name__})"
        )
        return stats

    for slot_name, slot_def in slots.items():
        if slot_file_exists(slot_name):
            stats["skipped"].append(slot_name)
            continue
        try:
            slot_content = create_slot_file_content(slot_name, slot_def)
            slot_file_path = SLOTS_DIR / f"{slot_name}.yaml"
            if dry_run:
                print(f" Would create: {slot_file_path}")
            else:
                # The YAML is dumped with allow_unicode=True, so write it as
                # UTF-8 regardless of the platform's default encoding.
                with open(slot_file_path, 'w', encoding="utf-8") as f:
                    f.write(slot_content)
                print(f" Created: {slot_file_path}")
            stats["created"].append(slot_name)
        except Exception as e:
            stats["errors"].append(f"Failed to create slot {slot_name}: {e}")
    return stats
def find_class_files_with_inline_slots() -> List[Path]:
    """Find all class files that have inline slots.

    Scans ``CLASSES_DIR`` for ``*.yaml`` files whose top-level document is a
    mapping with a non-empty ``slots`` entry.

    Returns:
        Sorted list of matching file paths. Files that cannot be read or
        parsed are skipped with a warning on stderr.
    """
    files_with_slots = []
    for yaml_file in CLASSES_DIR.glob("*.yaml"):
        try:
            with open(yaml_file, 'r', encoding="utf-8") as f:
                content = yaml.safe_load(f)
        except Exception as e:
            # Best effort: one broken file must not abort the scan, but say
            # so. Unlike a bare except, Ctrl-C and SystemExit still propagate.
            print(f"Warning: skipping {yaml_file}: {e}", file=sys.stderr)
            continue
        if isinstance(content, dict) and content.get("slots"):
            files_with_slots.append(yaml_file)
    return sorted(files_with_slots)
def main() -> int:
    """Command-line entry point.

    Parses ``--dry-run`` / ``--file``, changes the working directory to the
    project root (the parent of this script's directory), processes the
    selected class files and prints a summary.

    Returns:
        0 when no errors were recorded, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Extract inline slots from LinkML class files")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done without making changes")
    parser.add_argument("--file", type=str, help="Process only a single file")
    args = parser.parse_args()

    # Resolve --file against the invocation directory BEFORE chdir, so a
    # relative path keeps pointing at the file the user meant. If nothing
    # exists there, keep the path as given so it is interpreted relative to
    # the project root, as before.
    single_file = None
    if args.file:
        given = Path(args.file)
        resolved = given.resolve()
        single_file = resolved if resolved.exists() else given

    # Change to project root; resolve() guards against a relative __file__.
    project_root = Path(__file__).resolve().parent.parent
    os.chdir(project_root)

    # Ensure slots directory exists -- but a dry run must not touch the disk.
    if not args.dry_run:
        SLOTS_DIR.mkdir(parents=True, exist_ok=True)

    total_stats = {"created": [], "skipped": [], "errors": []}
    if single_file is not None:
        files_to_process = [single_file]
    else:
        files_to_process = find_class_files_with_inline_slots()

    print(f"Processing {len(files_to_process)} class file(s)...")
    if args.dry_run:
        print("(DRY RUN - no changes will be made)\n")

    for class_file in files_to_process:
        print(f"\nProcessing: {class_file.name}")
        stats = process_class_file(class_file, dry_run=args.dry_run)
        for key in ("created", "skipped", "errors"):
            total_stats[key].extend(stats[key])

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Slot files created: {len(total_stats['created'])}")
    print(f"Slots skipped (already exist): {len(total_stats['skipped'])}")
    print(f"Errors: {len(total_stats['errors'])}")
    if total_stats["errors"]:
        print("\nErrors encountered:")
        for error in total_stats["errors"]:
            print(f" - {error}")
    return 0 if not total_stats["errors"] else 1
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())