glam/scripts/cleanup_class_descriptions_v2.py
kempersc dfa667c90f Fix LinkML schema for valid RDF generation with proper slot_uri
Summary:
- Create 46 missing slot definition files with proper slot_uri values
- Add slot imports to main schema (01_custodian_name_modular.yaml)
- Fix YAML examples sections in 116+ class and slot files
- Fix PersonObservation.yaml examples section (nested objects → string literals)

Technical changes:
- All slots now have explicit slot_uri mapping to base ontologies (RiC-O, Schema.org, SKOS)
- Eliminates malformed URIs like 'custodian/:slot_name' in generated RDF
- gen-owl now produces valid Turtle with 153,166 triples

New slot files (46):
- RiC-O slots: rico_note, rico_organizational_principle, rico_has_or_had_holder, etc.
- Scope slots: scope_includes, scope_excludes, archive_scope
- Organization slots: organization_type, governance_authority, area_served
- Platform slots: platform_type_category, portal_type_category
- Social media slots: social_media_platform_category, post_type_*
- Type hierarchy slots: broader_type, narrower_types, custodian_type_broader
- Wikidata slots: wikidata_equivalent, wikidata_mapping

Generated output:
- schemas/20251121/rdf/01_custodian_name_modular_20260107_134534_clean.owl.ttl (6.9MB)
- Validated with rdflib: 153,166 triples, no malformed URIs
2026-01-07 13:48:03 +01:00

168 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""
Cleanup Class Descriptions v2 - Text-Based Approach
This script removes redundant sections from class descriptions using TEXT-BASED
regex replacement, NOT YAML parsing. This preserves the exact formatting of files.
Sections removed:
- **Dual-Class Pattern**: - Redundant (class hierarchy captures this)
- **Ontological Alignment**: - Redundant (mappings capture this)
- **ONTOLOGY ALIGNMENT**: - Same as above, different case
- **Multilingual Labels**: - Redundant (structured_aliases captures this)
- **RDF Serialization**: - Implementation detail
- **SKOS**: / **SKOS Alignment**: - Redundant (mappings capture this)
- **Dublin Core**: - Redundant (mappings capture this)
- **Primary GLAMORCUBESFIXPHDNT Category**: - Redundant (annotations capture this)
- **Example Structure**: - Implementation detail
Usage:
python scripts/cleanup_class_descriptions_v2.py [--dry-run] [--verbose] [--file PATH]
"""
import argparse
import re
from pathlib import Path
# Patterns to remove from description content.
# These are applied as TEXT-BASED replacements (no YAML parsing) so the exact
# formatting of untouched parts of each file is preserved.  Each entry is
# (regex_pattern, section_name); patterns are compiled with
# re.DOTALL | re.IGNORECASE inside process_file().  They are designed to match
# section content without consuming the final newline before following YAML
# keys.  NOTE: the single leading space inside each pattern mirrors the
# indentation of description blocks in the class YAML files — do not reformat.
REMOVE_PATTERNS = [
    # Dual-class pattern - matches until next section or end of indented block
    (r'\n \*\*Dual-Class Pattern\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'dual_class_pattern'),
    # Ontology alignment sections (various cases)
    (r'\n \*\*ONTOLOGY ALIGNMENT\*\*:[^\n]*\n(?: [^\n*][^\n]*\n| \n)*(?: [0-9]+\. \*\*[^\n]+\n(?: [^\n]+\n)*)*', 'ontology_alignment_upper'),
    (r'\n \*\*Ontological Alignment\*\*:[^\n]*\n(?: - \*\*[^\n]+\n)*', 'ontological_alignment'),
    (r'\n \*\*Ontology Alignment\*\*:[^\n]*\n(?: - \*\*[^\n]+\n)*', 'ontology_alignment_mixed'),
    # Multilingual labels - bullet list
    (r'\n \*\*Multilingual Labels\*\*:\n(?: - [a-z]{2,3}: [^\n]+\n)+', 'multilingual_labels'),
    # SKOS alignment sections
    (r'\n \*\*SKOS\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'skos_alignment'),
    (r'\n \*\*SKOS Alignment\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'skos_alignment_full'),
    # Dublin Core section
    (r'\n \*\*Dublin Core\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'dublin_core'),
    # RDF examples with code blocks
    (r'\n \*\*RDF Serialization(?: Example)?\*\*:\s*\n ```[^\n]*\n(?: [^\n]*\n)*? ```\n', 'rdf_serialization'),
    # Example JSON/YAML structure with code blocks
    (r'\n \*\*Example(?: JSON| YAML)? Structure\*\*:\s*\n ```[^\n]*\n(?: [^\n]*\n)*? ```\n', 'example_structure'),
    # GLAMORCUBES category
    (r'\n \*\*Primary GLAMORCUBESFIXPHDNT Category\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'glamorcubes_category'),
]


def process_file(file_path: Path, dry_run: bool = False, verbose: bool = False) -> dict:
    """Process a single class YAML file using text-based replacement.

    Args:
        file_path: Path of the YAML file to clean up.
        dry_run: When True, detect changes but never write the file back.
        verbose: When True, print each removed section name and, on failure,
            the full traceback.

    Returns:
        A dict with keys:
            'file' (str): the processed path,
            'modified' (bool): whether the content changed,
            'removed_sections' (list[str]): names of matched patterns,
            'errors' (list[str]): stringified exceptions, if any.
    """
    result: dict = {
        'file': str(file_path),
        'modified': False,
        'removed_sections': [],
        'errors': [],
    }
    try:
        content = file_path.read_text(encoding='utf-8')
        original_content = content
        # Apply each removal pattern in declaration order (order can matter
        # when patterns could overlap).
        for pattern, section_name in REMOVE_PATTERNS:
            regex = re.compile(pattern, re.DOTALL | re.IGNORECASE)
            if regex.search(content):
                content = regex.sub('', content)
                result['removed_sections'].append(section_name)
                if verbose:
                    print(f" Removed: {section_name}")
        # Collapse runs of 4+ consecutive newlines (3+ blank lines) down to
        # at most 2 blank lines.
        content = re.sub(r'\n{4,}', '\n\n\n', content)
        # Only rewrite the file when something actually changed.
        if content != original_content:
            result['modified'] = True
            if not dry_run:
                file_path.write_text(content, encoding='utf-8')
    except Exception as e:  # record the error; the caller decides how to report it
        result['errors'].append(str(e))
        if verbose:
            import traceback  # lazy import: only needed on this error path
            traceback.print_exc()
    return result
def main():
    """CLI entry point: strip redundant sections from class YAML descriptions.

    Processes either a single file (--file) or every ``*.yaml`` under the
    default classes directory, then prints a per-section removal summary.
    """
    parser = argparse.ArgumentParser(description='Cleanup class descriptions (text-based)')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without modifying files')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output')
    parser.add_argument('--file', type=str, help='Process a single file')
    args = parser.parse_args()

    # Default target directory (relative to the repo root); --file overrides it.
    classes_dir = Path('schemas/20251121/linkml/modules/classes')
    if args.file:
        files = [Path(args.file)]
    else:
        files = sorted(classes_dir.glob('*.yaml'))

    print(f"Processing {len(files)} class files...")
    if args.dry_run:
        print("DRY RUN - no files will be modified\n")

    # Aggregate statistics across all processed files.
    stats = {
        'files_processed': 0,
        'files_modified': 0,
        'sections_removed': {},
        'errors': []
    }
    for file_path in files:
        if args.verbose:
            print(f"\nProcessing: {file_path.name}")
        result = process_file(file_path, dry_run=args.dry_run, verbose=args.verbose)
        stats['files_processed'] += 1
        if result['modified']:
            stats['files_modified'] += 1
            # In verbose mode the per-file header was already printed above.
            if not args.verbose:
                print(f" Modified: {file_path.name}")
        for section in result['removed_sections']:
            stats['sections_removed'][section] = stats['sections_removed'].get(section, 0) + 1
        if result['errors']:
            stats['errors'].extend(result['errors'])
            print(f" ERROR in {file_path.name}: {result['errors']}")

    # Summary
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    print(f"Files processed: {stats['files_processed']}")
    print(f"Files modified: {stats['files_modified']}")
    # Fixed: was an f-string with no placeholders.
    print("\nSections removed by type:")
    # Most frequently removed section types first.
    for section, count in sorted(stats['sections_removed'].items(), key=lambda x: -x[1]):
        print(f" {section}: {count}")
    if stats['errors']:
        print(f"\nErrors: {len(stats['errors'])}")
        for error in stats['errors'][:10]:  # cap output at the first 10 errors
            print(f" - {error}")
    if args.dry_run:
        print("\nDRY RUN complete. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()