# glam/scripts/scan_ghost_references.py

import os
import re

# Paths
# Absolute, machine-specific root of the LinkML schema modules that get scanned.
SCHEMA_ROOT = "/Users/kempersc/apps/glam/schemas/20251121/linkml/modules"
# Sub-directories of SCHEMA_ROOT holding the class and slot module files.
CLASSES_DIR = os.path.join(SCHEMA_ROOT, "classes")
SLOTS_DIR = os.path.join(SCHEMA_ROOT, "slots")
# Text files listing archived file names, one per line. These are relative
# names, so they are resolved against the current working directory when opened.
ARCHIVED_SLOTS_FILE = "archived_slots.txt"
ARCHIVED_CLASSES_FILE = "archived_classes.txt"

def parse_archived_name(filename):
    """Derive the entity name from an archived file name.

    A name shaped like ``<entity>_archived_<digits>.yaml`` yields ``<entity>``.
    Any other name ending in ``.yaml`` just loses that extension, and
    everything else is returned unchanged.
    """
    # The match is anchored at the start only; the greedy group means the
    # last "_archived_<digits>.yaml" occurrence is the one that gets removed.
    dated = re.match(r"(.+)_archived_\d+\.yaml", filename)
    if dated is not None:
        return dated.group(1)

    # No dated archive suffix: strip a plain YAML extension if there is one.
    extension = ".yaml"
    return filename[:-len(extension)] if filename.endswith(extension) else filename

def load_archived_entities(filepath):
    """Read a list of archived file names and return the entity names as a set.

    Each non-blank line of ``filepath`` is stripped of surrounding whitespace
    and converted with ``parse_archived_name``. If the file does not exist, a
    warning is printed and an empty set is returned.
    """
    try:
        with open(filepath, 'r') as listing:
            entries = (raw.strip() for raw in listing)
            # Blank lines carry no file name and are skipped.
            return {parse_archived_name(entry) for entry in entries if entry}
    except FileNotFoundError:
        print(f"Warning: {filepath} not found.")
        return set()

def _has_whole_reference(text, prefix, name):
    """Return True if ``prefix + name`` occurs in ``text`` as a complete name.

    An occurrence counts only when the character right after ``name`` is not
    a letter, digit or underscore (or ``name`` ends the text), so ``has_name``
    does not match inside ``has_name_suffix``.
    """
    needle = prefix + name
    start = text.find(needle)
    while start != -1:
        end = start + len(needle)
        if end == len(text) or not (text[end].isalnum() or text[end] == "_"):
            return True
        start = text.find(needle, start + 1)
    return False


def _scan_line(location, stripped, slot_names, class_names):
    """Return the issue messages for a single whitespace-stripped line.

    ``location`` is the ``path:line`` prefix put in front of every message.
    """
    found = []

    for slot in slot_names:
        # Every slot check below needs the name somewhere in the line.
        if slot not in stripped:
            continue

        # Import: "../slots/<slot>" or "./<slot>", matched as a whole name.
        if any(_has_whole_reference(stripped, prefix, slot)
               for prefix in ("../slots/", "./")):
            found.append(f"{location}: Import of archived slot '{slot}'")

        # Usage (simple heuristic): the line is exactly the list item "- <slot>".
        if stripped == f"- {slot}":
            found.append(f"{location}: Usage of archived slot '{slot}'")

        # Mapping key "<slot>:", as written under slot_usage.
        if stripped.startswith(f"{slot}:"):
            found.append(f"{location}: Slot usage key for archived slot '{slot}'")

    for cls in class_names:
        if cls not in stripped:
            continue
        # Import path ("/<Class>") or range reference ("range: <Class>").
        if any(_has_whole_reference(stripped, prefix, cls)
               for prefix in ("/", "range: ")):
            found.append(f"{location}: Reference to archived class '{cls}'")

    return found


def scan_files(directory, archived_slots, archived_classes):
    """Scan the ``.yaml`` files under ``directory`` for archived-entity references.

    Args:
        directory: Root directory to walk recursively.
        archived_slots: Iterable of archived slot names.
        archived_classes: Iterable of archived class names.

    Returns:
        A list of ``"<path>:<line>: <message>"`` strings, ordered by directory
        walk, file name and line number.

    Sub-directories whose name contains ``"archive"`` are pruned from the
    walk. Only names below ``directory`` are tested, so an unrelated
    "archive" in the path leading to ``directory`` no longer disables the
    scan. Files that cannot be read or decoded as UTF-8 are reported on
    stdout and skipped. Empty names are ignored because they would match
    every line.
    """
    print(f"Scanning {directory}...")
    issues = []

    # Sorted once so the report order does not depend on set iteration order.
    slot_names = sorted(name for name in archived_slots if name)
    class_names = sorted(name for name in archived_classes if name)

    for root, dirs, files in os.walk(directory):
        # Editing dirs in place stops os.walk from descending into archives.
        dirs[:] = sorted(d for d in dirs if "archive" not in d)

        for file in sorted(files):
            if not file.endswith(".yaml"):
                continue

            filepath = os.path.join(root, file)
            try:
                with open(filepath, 'r', encoding="utf-8") as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error reading {filepath}: {e}")
                continue

            # Split on "\n" only so reported line numbers match the file.
            for lineno, line in enumerate(content.split('\n'), start=1):
                issues.extend(
                    _scan_line(f"{filepath}:{lineno}", line.strip(),
                               slot_names, class_names)
                )

    return issues

def main():
    """Load the archived-name lists, scan both schema directories and print a report."""
    archived_slots = load_archived_entities(ARCHIVED_SLOTS_FILE)
    archived_classes = load_archived_entities(ARCHIVED_CLASSES_FILE)

    # These names double as ordinary metadata keys in the YAML files, so
    # treating them as archived slots would flag unrelated lines.
    for metadata_key in ("description", "title", "name"):
        archived_slots.discard(metadata_key)

    print(f"Loaded {len(archived_slots)} archived slots.")
    print(f"Loaded {len(archived_classes)} archived classes.")

    # Classes first, then slots; the slots directory is included as well even
    # though cross-slot dependencies are less likely to go stale.
    findings = []
    for target_dir in (CLASSES_DIR, SLOTS_DIR):
        findings.extend(scan_files(target_dir, archived_slots, archived_classes))

    if not findings:
        print("\nNo obvious ghost references found.")
        return

    print("\nFound potential ghost references:")
    for finding in findings:
        print(finding)


if __name__ == "__main__":
    main()