# glam/check_duplicates.py

import yaml
import os

# Structural keys worth checking; every other key is ignored to avoid noise.
_TARGET_KEYS = frozenset({
    "related_mappings", "close_mappings", "exact_mappings", "broad_mappings",
    "narrow_mappings", "slots", "slot_usage", "attributes", "annotations",
    "description", "class_uri", "id", "name", "title", "imports", "prefixes",
    "default_prefix", "default_range", "classes", "types", "enums", "subsets",
})


def _is_block_scalar_indicator(token):
    """Return True if *token* looks like a YAML block scalar header (``|``, ``>-``, ``|2`` ...)."""
    return token[0] in "|>" and all(ch in "+-123456789" for ch in token[1:])


def _find_duplicate_keys(lines):
    """Return ``(key, line_index, previous_line_index)`` for each repeated target key.

    This is an indentation heuristic, not a YAML parser. Two occurrences of
    a key count as duplicates when they start at the same column and no
    less-indented line (ignoring blanks and comments) separates them.
    Indices are 0-based positions in *lines*.
    """
    duplicates = []
    seen_at_column = {}   # {key column: {key: index of its latest occurrence}}
    scalar_parent = None  # parent column of the block scalar currently being skipped

    for index, raw in enumerate(lines):
        stripped = raw.strip()
        if not stripped or stripped.startswith('#'):
            continue

        indent = len(raw) - len(raw.lstrip())

        # Text inside a "|" / ">" block is content, not structure: skip it so
        # that prose such as "name: ..." is never mistaken for a mapping key.
        if scalar_parent is not None:
            if indent > scalar_parent:
                continue
            scalar_parent = None

        # A document marker starts a fresh top-level mapping.
        if indent == 0 and (stripped in ('---', '...') or stripped.startswith('--- ')):
            seen_at_column.clear()
            continue

        # Peel "- " markers so that "- key: value" yields the key and the
        # column at which the sibling keys of that list item will appear.
        body = stripped
        column = indent
        dash_column = None
        while body == '-' or body.startswith('- '):
            dash_column = column
            remainder = body[1:].lstrip()
            column += len(body) - len(remainder)
            body = remainder

        # Any line closes every mapping that is indented deeper than itself.
        # For a "- " line this also resets the scope of the list item's keys.
        for closed in [c for c in seen_at_column if c > indent]:
            del seen_at_column[closed]

        # Remember a block scalar opened by this line so its content is skipped.
        tokens = body.split(' #', 1)[0].split()
        if tokens and _is_block_scalar_indicator(tokens[-1]):
            if len(tokens) == 1:
                # "- |": the scalar belongs to the sequence entry itself.
                if dash_column is not None:
                    scalar_parent = dash_column
            elif tokens[-2].endswith(':'):
                scalar_parent = column

        if ':' not in body:
            continue
        key = body.split(':')[0].strip()
        if key not in _TARGET_KEYS:
            continue

        scope = seen_at_column.setdefault(column, {})
        if key in scope:
            duplicates.append((key, index, scope[key]))
        scope[key] = index

    return duplicates


def check_dir(directory):
    """Print a report line for every duplicated structural key under *directory*.

    Walks *directory* recursively, scans each ``.yaml`` file line by line and
    prints one ``DUPLICATE KEY`` line per repeated key, with 1-based line
    numbers. Returns None; the result is only printed.
    """
    print(f"Checking directory: {directory}")

    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".yaml"):
                continue
            path = os.path.join(root, file_name)

            # Decode explicitly instead of relying on the locale. The keys of
            # interest are ASCII, so undecodable bytes are replaced rather than
            # aborting the scan, and a leading BOM is dropped so it cannot hide
            # the first key.
            with open(path, 'r', encoding='utf-8-sig', errors='replace') as f:
                lines = f.readlines()

            for key, line_index, previous_index in _find_duplicate_keys(lines):
                print(f"DUPLICATE KEY '{key}' in {path} at line {line_index+1} (previous at {previous_index+1})")

# Scan the two schema module directories, in the same order as before.
for _schema_dir in (
    "schemas/20251121/linkml/modules/classes",
    "schemas/20251121/linkml/modules/slots",
):
    check_dir(_schema_dir)