glam/scripts/deduplicate_classes.py

183 lines
6.1 KiB
Python

#!/usr/bin/env python3
"""
Deduplicate class definitions in LinkML schema.
When a class is defined in both:
1. A "container" file (e.g., StaffRoles.yaml, FindingAidTypes.yaml)
2. An individual file (e.g., Curator.yaml, Archivist.yaml)
This script:
1. Updates imports in all files to point to the container
2. Deletes the duplicate individual files
Rule: Container files are the authoritative source.
"""
import os
import re
import yaml
from collections import defaultdict
from datetime import datetime
# Directory holding the per-class LinkML YAML modules to deduplicate.
CLASSES_DIR = 'schemas/20251121/linkml/modules/classes'
# Files that contain multiple class definitions (containers).
# Matched by substring against each filename (see is_container_file), so e.g.
# 'Types.yaml' also matches 'FindingAidTypes.yaml'.
CONTAINER_PATTERNS = [
'Types.yaml', 'Roles.yaml', 'FindingAid.yaml', 'LinkedInProfile.yaml',
'VideoAudioAnnotation.yaml', 'DataLicensePolicy.yaml', 'EncompassingBody.yaml',
'DataServiceEndpointTypes.yaml', 'OAIPMHEndpoint.yaml', 'SearchAPI.yaml',
'OAIPMHSet.yaml', 'SearchQueryParameter.yaml'
]
def is_container_file(filename):
    """Return True if *filename* matches any known container-file pattern (substring match)."""
    for pattern in CONTAINER_PATTERNS:
        if pattern in filename:
            return True
    return False
def build_class_to_container_mapping(classes_dir):
    """Build a mapping of class names to their container files.

    Scans every container YAML file in *classes_dir* and records, for each
    class declared under its ``classes:`` key, the container's base name
    (filename without the ``.yaml`` suffix).

    Args:
        classes_dir: Directory containing the per-class YAML modules.

    Returns:
        dict[str, str]: class name -> container base name.
    """
    class_to_container = {}
    for filename in sorted(os.listdir(classes_dir)):
        if not filename.endswith('.yaml'):
            continue
        if not is_container_file(filename):
            continue
        filepath = os.path.join(classes_dir, filename)
        # Loop-invariant: compute once per file, not once per class.
        container_name = filename.replace('.yaml', '')
        with open(filepath) as f:
            try:
                data = yaml.safe_load(f)
            except yaml.YAMLError as e:
                # Name the failing file (the old message printed a literal
                # "(unknown)" placeholder), and only catch YAML parse errors.
                print(f"Error parsing {filename}: {e}")
                continue
        if data and isinstance(data.get('classes'), dict):
            for class_name in data['classes']:
                class_to_container[class_name] = container_name
    return class_to_container
def find_duplicate_files(classes_dir, class_to_container):
    """Find individual class files that duplicate a container's class definition.

    An individual file ``Foo.yaml`` is a duplicate when the class ``Foo`` is
    already defined by one of the container files.

    Returns:
        list[dict]: one entry per duplicate with keys 'file', 'class', 'container'.
    """
    dupes = []
    for entry in sorted(os.listdir(classes_dir)):
        if not entry.endswith('.yaml') or is_container_file(entry):
            continue
        cls = entry.replace('.yaml', '')
        container = class_to_container.get(cls)
        if container is not None:
            dupes.append({'file': entry, 'class': cls, 'container': container})
    return dupes
def update_imports_in_file(filepath, import_mapping):
    """Rewrite import references in a YAML file per *import_mapping*.

    Each old import (as a bare, single-quoted, or double-quoted list item)
    is replaced with its new target.  Any duplicate entries that this
    creates inside the ``imports:`` section are then dropped.  The file is
    rewritten on disk only when its content actually changes.

    Returns:
        bool: True if the file was modified, False otherwise.
    """
    with open(filepath) as fh:
        text = fh.read()
    before = text

    changed = False
    for src, dst in import_mapping.items():
        # Cover the bare, single-quoted and double-quoted list-item forms.
        for quote in ('', "'", '"'):
            needle = f'- {quote}{src}{quote}\n'
            if needle in text:
                text = text.replace(needle, f'- {quote}{dst}{quote}\n')
                changed = True

    if changed:
        # Remove duplicate entries within the imports: section.
        kept = []
        seen = set()
        inside = False
        for line in text.split('\n'):
            bare = line.strip()
            if bare.startswith('imports:'):
                inside = True
                kept.append(line)
                continue
            if inside:
                if bare.startswith('- '):
                    value = bare[2:].strip().strip("'\"")
                    if value in seen:
                        continue  # skip duplicate import entry
                    seen.add(value)
                elif bare and not bare.startswith('#'):
                    # First non-list, non-comment line ends the imports section.
                    inside = False
            kept.append(line)
        text = '\n'.join(kept)

    if text != before:
        with open(filepath, 'w') as fh:
            fh.write(text)
        return True
    return False
def main():
    """Deduplicate class definitions: repoint imports to containers, then delete duplicates."""
    # Run relative to the repository root (parent of this script's directory).
    os.chdir(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    print("Building class-to-container mapping...")
    class_to_container = build_class_to_container_mapping(CLASSES_DIR)
    print(f"Found {len(class_to_container)} classes in container files")

    print("\nFinding duplicate individual files...")
    duplicates = find_duplicate_files(CLASSES_DIR, class_to_container)
    print(f"Found {len(duplicates)} duplicate files to remove")

    # Map each individual-file import to its container import.
    import_mapping = {
        f"./{dup['class']}": f"./{dup['container']}" for dup in duplicates
    }

    print("\nUpdating imports in all schema files...")
    files_updated = 0
    # Update imports in all YAML files in the schema directory.
    schema_root = 'schemas/20251121/linkml'
    # Hoisted: the original rebuilt this filename list inside the walk loop
    # for every visited file (O(files * duplicates)); build the set once.
    doomed = {dup['file'] for dup in duplicates}
    for root, _dirs, files in os.walk(schema_root):
        for filename in files:
            if not filename.endswith('.yaml'):
                continue
            # Skip files we're about to delete.
            if filename in doomed:
                continue
            filepath = os.path.join(root, filename)
            if update_imports_in_file(filepath, import_mapping):
                files_updated += 1
                print(f" Updated: {filepath}")
    print(f"\nUpdated imports in {files_updated} files")

    print("\nDeleting duplicate files...")
    deleted = 0
    for dup in duplicates:
        filepath = os.path.join(CLASSES_DIR, dup['file'])
        if os.path.exists(filepath):
            os.remove(filepath)
            deleted += 1
            print(f" Deleted: {dup['file']} (class {dup['class']} defined in {dup['container']})")
    print(f"\nDeleted {deleted} duplicate files")
    print("\nDone!")
if __name__ == '__main__':
main()