glam/scripts/cleanup_redundant_descriptions.py
kempersc 98c42bf272 Fix LinkML URI conflicts and generate RDF outputs
- Fix scope_note → finding_aid_scope_note in FindingAid.yaml
- Remove duplicate wikidata_entity slot from CustodianType.yaml (import instead)
- Remove duplicate rico_record_set_type from class_metadata_slots.yaml
- Fix range types for equals_string compatibility (uriorcurie → string)
- Move class names from close_mappings to see_also in 10 RecordSetTypes files
- Generate all RDF formats: OWL, N-Triples, RDF/XML, N3, JSON-LD context
- Sync schemas to frontend/public/schemas/

Files: 1,151 changed (includes prior CustodianType migration)
2026-01-07 12:32:59 +01:00

226 lines
7.5 KiB
Python

#!/usr/bin/env python3
"""
Cleanup Redundant Description Text from LinkML Class Files
This script removes description sections that are NOW represented as structured
slots/classes in the LinkML schema. It follows the principle:
"Only remove text that is already represented as structured KG classes and predicates"
SECTIONS REMOVED (fully structured):
- **Dual-Class Pattern**: → dual_class_link slot (DualClassLink class)
- **Ontological Alignment**: → class_uri, *_mappings LinkML elements
- **Multilingual Labels**: → structured_aliases LinkML element
SECTIONS KEPT (not yet fully structured):
- **Scope**: - Domain-specific content descriptions
- **Notable Examples**: - Real-world institution examples
- **Historical Significance**: - Contextual importance
- **Privacy Considerations**: - Only partially structured (privacy_note slot)
- **Preservation Challenges**: - Only partially structured (preservation_note slot)
- **Related Types**: - Partially in see_also, but descriptions not structured
- **Wikidata**: Q##### - Keep as human-readable reference
Usage:
python scripts/cleanup_redundant_descriptions.py --dry-run # Preview changes
python scripts/cleanup_redundant_descriptions.py # Apply changes
"""
import argparse
import re
from pathlib import Path
from typing import Tuple, List
# Headers of description sections whose content is already captured by
# structured LinkML elements (slots/classes/mappings) and can therefore be
# deleted from the free-text descriptions without losing information.
STRUCTURED_SECTIONS = [
    'Dual-Class Pattern',
    'Ontological Alignment',
    'Multilingual Labels',
]
def remove_section(text: str, section_header: str) -> Tuple[str, bool]:
    """
    Strip one markdown section (``**Header**: ...``) out of description text.

    A section starts at its ``**Header**:`` marker and runs up to (but not
    including) the next capitalized ``**Header**`` marker, or to the end of
    the text if it is the last section.

    Returns:
        Tuple of (modified_text, was_modified)
    """
    # DOTALL lets the lazy ``.*?`` span newlines; the lookahead stops the
    # match at the next **Header** marker (or end of input) without
    # consuming it, so neighbouring sections stay intact.
    section_re = re.compile(
        rf'\*\*{re.escape(section_header)}\*\*:.*?(?=\n[ ]*\*\*[A-Z]|\Z)',
        re.DOTALL,
    )
    stripped, n_removed = section_re.subn('', text)
    return stripped, n_removed > 0
def cleanup_description(description: str) -> Tuple[str, List[str]]:
    """
    Strip every fully-structured section from a description.

    Returns:
        Tuple of (cleaned_description, list_of_removed_sections)
    """
    removed_sections: List[str] = []
    for header in STRUCTURED_SECTIONS:
        description, hit = remove_section(description, header)
        if hit:
            removed_sections.append(header)
    # Removals can leave runs of blank lines behind; collapse 3+ newlines
    # to a single blank line and drop trailing whitespace.
    cleaned = re.sub(r'\n{3,}', '\n\n', description).rstrip()
    return cleaned, removed_sections
def process_yaml_file(filepath: Path, dry_run: bool = True) -> dict:
    """
    Process a single YAML file to clean up class-level descriptions.

    We only target class-level descriptions (exactly 4 spaces of indent),
    NOT slot_usage descriptions, which are indented deeper in the YAML tree.

    Args:
        filepath: Path to a LinkML class YAML file.
        dry_run: When True, compute statistics but never write the file.

    Returns:
        dict with statistics about changes made: ``file`` (basename),
        ``classes_modified`` (int), ``sections_removed`` (list of section
        headers), ``modified`` (bool).
    """
    content = filepath.read_text(encoding='utf-8')
    stats = {
        'file': filepath.name,
        'classes_modified': 0,
        'sections_removed': [],
        'modified': False,
    }
    lines = content.split('\n')
    new_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]
        # Class-level description header: exactly 4 spaces of indentation.
        # Slot_usage descriptions sit deeper, so they never match this.
        if re.match(r'^    description: \|', line):
            desc_lines = [line]
            i += 1
            # Collect the block-scalar body: description content is indented
            # with 6+ spaces; blank lines inside the block are kept too.
            while i < len(lines):
                next_line = lines[i]
                if next_line == '' or re.match(r'^      ', next_line):
                    desc_lines.append(next_line)
                    i += 1
                else:
                    break
            desc_block = '\n'.join(desc_lines)
            # Only rewrite blocks that actually contain a structured section;
            # everything else passes through untouched.
            has_sections = any(f'**{s}**:' in desc_block for s in STRUCTURED_SECTIONS)
            if has_sections:
                # Content lines only — skip the "description: |" header line.
                desc_content = '\n'.join(desc_lines[1:])
                cleaned_content, removed = cleanup_description(desc_content)
                if removed:
                    stats['classes_modified'] += 1
                    stats['sections_removed'].extend(removed)
                    stats['modified'] = True
                    # Rebuild the description block under its original header.
                    new_lines.append('    description: |')
                    for cleaned_line in cleaned_content.split('\n'):
                        new_lines.append(cleaned_line)
                else:
                    # Sections mentioned but nothing removed; keep original.
                    new_lines.extend(desc_lines)
            else:
                # No structured sections at all; keep original.
                new_lines.extend(desc_lines)
        else:
            new_lines.append(line)
            i += 1
    if stats['modified']:
        new_content = '\n'.join(new_lines)
        # Preserve the conventional trailing newline.
        if not new_content.endswith('\n'):
            new_content += '\n'
        if not dry_run:
            filepath.write_text(new_content, encoding='utf-8')
    return stats
def main():
    """CLI entry point: scan class YAML files and strip redundant sections."""
    parser = argparse.ArgumentParser(
        description='Clean up redundant description text from LinkML class files'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview changes without modifying files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed output for each file'
    )
    parser.add_argument(
        '--file',
        type=str,
        help='Process a single file instead of all files'
    )
    args = parser.parse_args()

    # Either one explicitly named file, or every class YAML module.
    classes_dir = Path('schemas/20251121/linkml/modules/classes')
    yaml_files = [Path(args.file)] if args.file else sorted(classes_dir.glob('*.yaml'))

    total_modified = 0
    total_sections = 0
    print(f"{'[DRY RUN] ' if args.dry_run else ''}Processing {len(yaml_files)} files...\n")
    for filepath in yaml_files:
        stats = process_yaml_file(filepath, dry_run=args.dry_run)
        if not stats['modified']:
            continue
        total_modified += 1
        total_sections += len(stats['sections_removed'])
        if args.verbose or args.dry_run:
            print(f"{'Would modify' if args.dry_run else 'Modified'}: {stats['file']}")
            print(f" Classes: {stats['classes_modified']}")
            print(f" Sections removed: {', '.join(sorted(set(stats['sections_removed'])))}")
            print()

    print(f"\n{'=' * 60}")
    print(f"Summary:")
    print(f" Files {'that would be' if args.dry_run else ''} modified: {total_modified}")
    print(f" Total sections removed: {total_sections}")
    if args.dry_run:
        print(f"\nRun without --dry-run to apply changes.")


if __name__ == '__main__':
    main()