222 lines
7.1 KiB
Python
222 lines
7.1 KiB
Python
#!/usr/bin/env python3
"""
Extract inline slots from LinkML class files to individual slot files.

Per Rule 38 (AGENTS.md), all LinkML slots MUST be centralized in
schemas/20251121/linkml/modules/slots/, never inline in class files.

Usage:
    python scripts/extract_inline_slots.py [--dry-run] [--file PATH]

Options:
    --dry-run     Show what would be done without making changes
    --file PATH   Process only a single file
"""
|
|
|
|
import argparse
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import yaml
|
|
|
|
# Schema paths (relative to the project root; main() chdirs there first).
SCHEMA_ROOT = Path("schemas/20251121/linkml")
CLASSES_DIR = SCHEMA_ROOT / "modules" / "classes"  # class files that may hold inline slots
SLOTS_DIR = SCHEMA_ROOT / "modules" / "slots"      # destination for extracted slot files

# Standard prefixes for slot files.
# Namespace URIs for the CURIE prefixes this project recognizes; only
# prefixes listed here are carried into a generated slot file's header.
STANDARD_PREFIXES = {
    "linkml": "https://w3id.org/linkml/",
    "hc": "https://nde.nl/ontology/hc/",
    "schema": "http://schema.org/",
    "dcterms": "http://purl.org/dc/terms/",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "rico": "https://www.ica.org/standards/RiC/ontology#",
    "prov": "http://www.w3.org/ns/prov#",
    "crm": "http://www.cidoc-crm.org/cidoc-crm/",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "bf": "http://id.loc.gov/ontologies/bibframe/",
}
|
|
|
|
|
|
def extract_slot_prefix(slot_uri: str) -> Optional[str]:
    """Return the CURIE prefix of *slot_uri*, e.g. 'schema' for 'schema:description'.

    Args:
        slot_uri: A CURIE ('prefix:local') or a full URL.

    Returns:
        The prefix string, or None when the value is a full URL or contains
        no ':' separator.  (The original annotation said ``str`` but the
        function has always returned None on the fallthrough path.)
    """
    # Full URLs also contain ':' (after the scheme), so exclude them explicitly.
    if ":" in slot_uri and not slot_uri.startswith("http"):
        # Split only on the first colon so local names containing ':' survive.
        return slot_uri.split(":", 1)[0]
    return None
|
|
|
|
|
|
def get_required_prefixes(slot_def: Dict[str, Any]) -> Dict[str, str]:
    """Collect the prefix map a standalone slot file needs for *slot_def*.

    Every slot file starts with the LinkML and project ('hc') namespaces;
    additional known prefixes are added for the slot_uri and any mappings.
    """
    required = {
        "linkml": STANDARD_PREFIXES["linkml"],
        "hc": STANDARD_PREFIXES["hc"],
    }

    # Gather every CURIE that may reference an external vocabulary:
    # the slot_uri itself, then each mapping list in a fixed order.
    candidates: List[str] = []
    if "slot_uri" in slot_def:
        candidates.append(slot_def["slot_uri"])
    for mapping_key in (
        "exact_mappings",
        "close_mappings",
        "related_mappings",
        "narrow_mappings",
        "broad_mappings",
    ):
        candidates.extend(slot_def.get(mapping_key, []))

    # Only prefixes we know a namespace URI for are carried over.
    for curie in candidates:
        prefix = extract_slot_prefix(curie)
        if prefix and prefix in STANDARD_PREFIXES:
            required[prefix] = STANDARD_PREFIXES[prefix]

    return required
|
|
|
|
|
|
def create_slot_file_content(slot_name: str, slot_def: Dict[str, Any]) -> str:
    """Render a standalone LinkML slot file for *slot_name* as a YAML string."""
    # Assemble the full document in the key order the schema files use.
    document = {
        "id": f"https://nde.nl/ontology/hc/slot/{slot_name}",
        "name": f"{slot_name}_slot",
        "title": f"{slot_name.replace('_', ' ').title()} Slot",
        "prefixes": get_required_prefixes(slot_def),
        "imports": ["linkml:types"],
        "default_prefix": "hc",
        # Each generated file carries exactly one slot definition.
        "slots": {slot_name: slot_def},
    }

    # sort_keys=False preserves the insertion order defined above.
    return yaml.dump(
        document,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
        width=120,
    )
|
|
|
|
|
|
def parse_class_file(file_path: Path) -> Tuple[Dict[str, Any], Dict[str, Dict[str, Any]]]:
    """
    Parse a class file and extract inline slots.

    Args:
        file_path: Path to a LinkML class YAML file.

    Returns:
        Tuple of (full_yaml_dict, {slot_name: slot_definition}).  The slot
        mapping is always a dict, never None.
    """
    # Schema files are expected to be UTF-8; don't depend on the platform default.
    with open(file_path, 'r', encoding="utf-8") as f:
        content = yaml.safe_load(f)

    slots: Dict[str, Dict[str, Any]] = {}
    if content and "slots" in content:
        # A bare `slots:` key with no entries parses to None; normalize to {}
        # so the declared return type holds for callers iterating the result.
        slots = content.get("slots") or {}

    return content, slots
|
|
|
|
|
|
def slot_file_exists(slot_name: str) -> bool:
    """Return True when an extracted file for *slot_name* already exists."""
    return (SLOTS_DIR / f"{slot_name}.yaml").exists()
|
|
|
|
|
|
def process_class_file(file_path: Path, dry_run: bool = False) -> Dict[str, Any]:
    """
    Process a single class file, extracting inline slots.

    Args:
        file_path: Class file to scan for inline slot definitions.
        dry_run: When True, report intended actions without writing files.

    Returns:
        Dict with statistics: {created: [], skipped: [], errors: []}
    """
    stats: Dict[str, List[str]] = {"created": [], "skipped": [], "errors": []}

    try:
        _, inline_slots = parse_class_file(file_path)
    except Exception as exc:
        stats["errors"].append(f"Failed to parse {file_path}: {exc}")
        return stats

    if not inline_slots:
        return stats

    for name, definition in inline_slots.items():
        # Never overwrite a slot file that was already extracted.
        if slot_file_exists(name):
            stats["skipped"].append(name)
            continue

        try:
            rendered = create_slot_file_content(name, definition)
            target = SLOTS_DIR / f"{name}.yaml"

            if dry_run:
                print(f" Would create: {target}")
            else:
                with open(target, 'w') as f:
                    f.write(rendered)
                print(f" Created: {target}")
            stats["created"].append(name)
        except Exception as exc:
            stats["errors"].append(f"Failed to create slot {name}: {exc}")

    return stats
|
|
|
|
|
|
def find_class_files_with_inline_slots() -> List[Path]:
    """Find all class files that have inline slots.

    Files that cannot be read or parsed are skipped rather than aborting
    the scan.

    Returns:
        Sorted list of class-file paths whose YAML contains a non-empty
        'slots' mapping.
    """
    files_with_slots = []

    for yaml_file in CLASSES_DIR.glob("*.yaml"):
        try:
            with open(yaml_file, 'r', encoding="utf-8") as f:
                content = yaml.safe_load(f)
        except (OSError, yaml.YAMLError):
            # Was a bare `except:`, which also swallowed KeyboardInterrupt /
            # SystemExit; only I/O and parse failures should be tolerated here.
            continue
        # A truthy "slots" value means the file still has inline definitions.
        if content and content.get("slots"):
            files_with_slots.append(yaml_file)

    return sorted(files_with_slots)
|
|
|
|
|
|
def main():
    """CLI entry point: extract inline slots and print a summary report."""
    parser = argparse.ArgumentParser(description="Extract inline slots from LinkML class files")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done without making changes")
    parser.add_argument("--file", type=str, help="Process only a single file")
    args = parser.parse_args()

    # Run relative to the project root so the schema paths resolve.
    os.chdir(Path(__file__).parent.parent)

    # The destination directory must exist before any slot file is written.
    SLOTS_DIR.mkdir(parents=True, exist_ok=True)

    if args.file:
        targets = [Path(args.file)]
    else:
        targets = find_class_files_with_inline_slots()

    print(f"Processing {len(targets)} class file(s)...")
    if args.dry_run:
        print("(DRY RUN - no changes will be made)\n")

    # Accumulate per-file statistics into one overall tally.
    totals: Dict[str, List[str]] = {"created": [], "skipped": [], "errors": []}
    for class_file in targets:
        print(f"\nProcessing: {class_file.name}")
        result = process_class_file(class_file, dry_run=args.dry_run)
        for key in totals:
            totals[key].extend(result[key])

    banner = "=" * 60
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f"Slot files created: {len(totals['created'])}")
    print(f"Slots skipped (already exist): {len(totals['skipped'])}")
    print(f"Errors: {len(totals['errors'])}")

    if totals["errors"]:
        print("\nErrors encountered:")
        for error in totals["errors"]:
            print(f" - {error}")

    # Non-zero exit status signals that at least one slot failed to extract.
    return 0 if not totals["errors"] else 1
|
|
|
|
|
|
# Allow use both as a script and as an importable module.
if __name__ == "__main__":
    sys.exit(main())
|