# glam/remove_duplicates.py

import os

# Mapping keys whose repeated occurrences inside one YAML mapping are removed.
_MAPPING_KEYS = frozenset(
    ("broad_mappings", "close_mappings", "related_mappings", "exact_mappings")
)


def _is_sequence_entry(stripped):
    """Return True if *stripped* is a YAML block-sequence entry (``-`` or ``- x``)."""
    return stripped == "-" or stripped.startswith("- ")


def _drop_duplicate_keys(lines, path):
    """Filter *lines*, dropping repeated mapping keys and their child lines.

    This is a line-based heuristic, not a YAML parser: a key is "the text
    before the first colon" on a line that starts with neither ``-`` nor ``#``,
    and two keys are duplicates when they appear at the same indentation
    without a shallower non-comment line in between.

    Args:
        lines: The file content as a list of lines (terminators included).
        path: File name, used only in the progress message.

    Returns:
        A ``(kept_lines, removed_count)`` tuple.
    """
    kept = []
    removed = 0
    # indent width -> keys already seen in the mapping currently open there.
    seen_at_indent = {}
    # Indent of the duplicate key being dropped, or None when not skipping.
    skip_indent = None
    # True when the dropped key had no inline value, so a block sequence
    # written at the key's own indent ("key:\n- a") is that key's value.
    skip_same_indent_entries = False

    for lineno, line in enumerate(lines, start=1):
        stripped = line.strip()
        if not stripped:
            # Blank lines are always preserved, even inside a dropped block.
            kept.append(line)
            continue

        indent = len(line) - len(line.lstrip())
        is_comment = stripped.startswith("#")

        if skip_indent is not None:
            if indent > skip_indent:
                # Child line (or nested comment) of the dropped key.
                continue
            if (
                skip_same_indent_entries
                and indent == skip_indent
                and _is_sequence_entry(stripped)
            ):
                # Non-indented sequence entry that is the dropped key's value.
                continue
            if is_comment:
                # A shallow comment says nothing about structure: keep it but
                # do not let it terminate the dropped block.
                kept.append(line)
                continue
            skip_indent = None

        if is_comment:
            # Comments must not influence indentation tracking; otherwise a
            # column-0 comment would reset the seen-key sets and hide a
            # duplicate that follows it.
            kept.append(line)
            continue

        # Any mapping deeper than the current line has been closed.
        for level in [lvl for lvl in seen_at_indent if lvl > indent]:
            del seen_at_indent[level]
        seen = seen_at_indent.setdefault(indent, set())

        if ":" in stripped and not stripped.startswith("-"):
            head, _, rest = stripped.partition(":")
            key = head.strip()
            if key in _MAPPING_KEYS:
                if key in seen:
                    print(f"Removing duplicate key '{key}' in {path} at line {lineno}")
                    removed += 1
                    skip_indent = indent
                    value = rest.strip()
                    skip_same_indent_entries = not value or value.startswith("#")
                    continue
                seen.add(key)

        kept.append(line)

    return kept, removed


def remove_duplicates_in_file(path):
    """Remove repeated ``*_mappings`` keys from the YAML file at *path* in place.

    For each of ``broad_mappings``, ``close_mappings``, ``related_mappings``
    and ``exact_mappings``, the first occurrence within a mapping is kept and
    every later occurrence at the same indentation is deleted together with
    the lines that belong to it. A message is printed for each removal.

    The file is read and written as UTF-8 with line endings left untouched,
    and it is only rewritten when at least one key was actually removed.

    Args:
        path: Path of the YAML file to clean up.

    Returns:
        None.
    """
    # newline="" disables newline translation so CRLF files stay CRLF.
    with open(path, "r", encoding="utf-8", newline="") as f:
        lines = f.readlines()

    kept, removed = _drop_duplicate_keys(lines, path)
    if not removed:
        # Nothing changed: leave the file (and its mtime) alone.
        return

    with open(path, "w", encoding="utf-8", newline="") as f:
        f.writelines(kept)

def process_directory(directory):
    """Run ``remove_duplicates_in_file`` on every ``.yaml`` file under *directory*.

    The tree is traversed recursively with ``os.walk``; files whose names do
    not end in ``.yaml`` are ignored.
    """
    for dirpath, _subdirs, filenames in os.walk(directory):
        yaml_names = (name for name in filenames if name.endswith(".yaml"))
        for name in yaml_names:
            remove_duplicates_in_file(os.path.join(dirpath, name))

# Script entry: clean every .yaml file under the directory below.
# NOTE: this call is at module top level, so it also runs whenever the module
# is imported, and the relative path is resolved against the current working
# directory.
process_directory("schemas/20251121/linkml/modules/classes")