glam/scripts/cleanup_class_descriptions.py
kempersc dfa667c90f Fix LinkML schema for valid RDF generation with proper slot_uri
Summary:
- Create 46 missing slot definition files with proper slot_uri values
- Add slot imports to main schema (01_custodian_name_modular.yaml)
- Fix YAML examples sections in 116+ class and slot files
- Fix PersonObservation.yaml examples section (nested objects → string literals)

Technical changes:
- All slots now have explicit slot_uri mapping to base ontologies (RiC-O, Schema.org, SKOS)
- Eliminates malformed URIs like 'custodian/:slot_name' in generated RDF
- gen-owl now produces valid Turtle with 153,166 triples

New slot files (46):
- RiC-O slots: rico_note, rico_organizational_principle, rico_has_or_had_holder, etc.
- Scope slots: scope_includes, scope_excludes, archive_scope
- Organization slots: organization_type, governance_authority, area_served
- Platform slots: platform_type_category, portal_type_category
- Social media slots: social_media_platform_category, post_type_*
- Type hierarchy slots: broader_type, narrower_types, custodian_type_broader
- Wikidata slots: wikidata_equivalent, wikidata_mapping

Generated output:
- schemas/20251121/rdf/01_custodian_name_modular_20260107_134534_clean.owl.ttl (6.9MB)
- Validated with rdflib: 153,166 triples, no malformed URIs
2026-01-07 13:48:03 +01:00

250 lines
8.7 KiB
Python

#!/usr/bin/env python3
"""
Cleanup Class Descriptions - Simplified Version
This script ONLY removes redundant sections from class descriptions.
It does NOT try to extract and store complex data structures.
Sections removed:
- **Dual-Class Pattern**: - Redundant (class hierarchy captures this)
- **Ontological Alignment**: - Redundant (mappings capture this)
- **Multilingual Labels**: - Redundant (structured_aliases captures this)
- **RDF Serialization**: - Implementation detail
- **SKOS**: - Redundant (mappings capture this)
- **Dublin Core**: - Redundant (mappings capture this)
- **Primary GLAMORCUBESFIXPHDNT Category**: - Redundant (annotations capture this)
- **Example Structure**: - Implementation detail
Sections KEPT (contain unique information):
- **Wikidata**: Q-number reference (important)
- **Scope**: Detailed scope description
- **Notable Examples**: Real-world institution examples
- **Related Types**: Linked types with Wikidata IDs
- **Historical Significance**: Historical context
- **Dutch Context**: Dutch-specific information
- etc.
Usage:
python scripts/cleanup_class_descriptions.py [--dry-run] [--verbose] [--file PATH]
"""
import argparse
import re
import sys
from pathlib import Path
# Select a YAML backend. ruamel.yaml is preferred because it round-trips
# comments, quoting style, and key order, so rewritten schema files keep
# their original formatting; plain PyYAML is the fallback when ruamel is
# not installed. USE_RUAMEL tells the rest of the script which API to use.
try:
    from ruamel.yaml import YAML

    yaml = YAML()
    yaml.preserve_quotes = True  # keep the original quoting style on round-trip
    yaml.width = 120             # wrap long scalars at 120 columns
    yaml.indent(mapping=2, sequence=2, offset=2)
    USE_RUAMEL = True
except ImportError:
    import yaml

    USE_RUAMEL = False
# Sections to REMOVE entirely (already structured elsewhere or redundant).
# Each entry is (regex_source, short_label); every pattern is applied with
# re.DOTALL (so `.` spans lines) and re.IGNORECASE. The lookaheads stop a
# match at the next `**Section**` heading or at the end of the text.
REMOVE_PATTERNS = [
    # Dual-class pattern - redundant with class hierarchy
    (r'\n\s*\*\*Dual-Class Pattern\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'dual_class_pattern'),
    # Ontology alignment sections - redundant with mappings
    (r'\n\s*\*\*Ontological Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'ontological_alignment'),
    (r'\n\s*\*\*ONTOLOGY ALIGNMENT\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'ontology_alignment_upper'),
    (r'\n\s*\*\*Ontology Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'ontology_alignment_mixed'),
    # Multilingual labels - redundant with structured_aliases
    (r'\n\s*\*\*Multilingual Labels\*\*:\s*\n(?:\s*- [a-z]{2}: .*\n)+', 'multilingual_labels'),
    # SKOS alignment - redundant with mappings
    (r'\n\s*\*\*SKOS\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'skos_alignment'),
    (r'\n\s*\*\*SKOS Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'skos_alignment_full'),
    # Dublin Core - redundant with mappings
    (r'\n\s*\*\*Dublin Core\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'dublin_core'),
    # RDF examples - implementation details
    (r'\n\s*\*\*RDF Serialization(?: Example)?\*\*:\s*\n```.*?```', 'rdf_serialization'),
    # Example JSON/YAML structure - implementation details
    (r'\n\s*\*\*Example(?: JSON| YAML)? Structure\*\*:\s*\n```.*?```', 'example_structure'),
    # GLAMORCUBES category - redundant with annotations
    (r'\n\s*\*\*Primary GLAMORCUBESFIXPHDNT Category\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)', 'glamorcubes_category'),
]


def clean_description(description: str, verbose: bool = False) -> tuple[str, list[str]]:
    """
    Remove redundant sections from a class description.

    Args:
        description: The raw description text (may be empty or None-ish).
        verbose: When True, print each removed section label.

    Returns:
        tuple: (cleaned_description, list_of_removed_sections) where the
        list contains the short labels from REMOVE_PATTERNS that matched.
    """
    if not description:
        return description, []
    cleaned = description
    removed_sections = []
    for pattern, section_name in REMOVE_PATTERNS:
        regex = re.compile(pattern, re.DOTALL | re.IGNORECASE)
        # subn() tests for a match and removes it in a single pass,
        # instead of the search()-then-sub() double scan.
        cleaned, hits = regex.subn('', cleaned)
        if hits:
            removed_sections.append(section_name)
            if verbose:
                print(f" Removed: {section_name}")
    # Collapse the blank runs the removals leave behind.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = cleaned.strip()
    return cleaned, removed_sections
def process_file(file_path: Path, dry_run: bool = False, verbose: bool = False) -> dict:
    """Clean the class descriptions inside a single YAML schema file.

    Returns a report dict: the file path, whether it was (or would be)
    modified, the class names touched, the section labels stripped, and
    any errors encountered. With dry_run=True nothing is written back.
    """
    report = {
        'file': str(file_path),
        'modified': False,
        'classes_processed': [],
        'removed_sections': [],
        'errors': []
    }
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        # Parse with whichever backend the import probe selected.
        if USE_RUAMEL:
            from io import StringIO
            document = yaml.load(StringIO(raw_text))
        else:
            import yaml as pyyaml
            document = pyyaml.safe_load(raw_text)
        if not document:
            return report

        changed = False
        if 'classes' in document and isinstance(document['classes'], dict):
            for class_name, class_body in document['classes'].items():
                if not isinstance(class_body, dict) or 'description' not in class_body:
                    continue
                description = class_body['description']
                if not isinstance(description, str):
                    continue
                new_description, stripped = clean_description(str(description), verbose)
                if stripped:
                    class_body['description'] = new_description
                    report['classes_processed'].append(class_name)
                    report['removed_sections'].extend(stripped)
                    changed = True
                    if verbose:
                        print(f" Class: {class_name}")

        report['modified'] = changed
        # Write back only when something changed and we are not previewing.
        if changed and not dry_run:
            with open(file_path, 'w', encoding='utf-8') as handle:
                if USE_RUAMEL:
                    yaml.dump(document, handle)
                else:
                    import yaml as pyyaml
                    pyyaml.dump(document, handle,
                                default_flow_style=False,
                                allow_unicode=True,
                                sort_keys=False,
                                width=120)
    except Exception as exc:
        report['errors'].append(str(exc))
        import traceback
        if verbose:
            traceback.print_exc()
    return report
def main():
    """CLI entry point: clean every class file (or one --file) and report."""
    parser = argparse.ArgumentParser(description='Cleanup class descriptions by removing redundant sections')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without modifying files')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output')
    parser.add_argument('--file', type=str, help='Process a single file')
    args = parser.parse_args()

    classes_dir = Path('schemas/20251121/linkml/modules/classes')
    # A single --file overrides the default directory scan.
    targets = [Path(args.file)] if args.file else sorted(classes_dir.glob('*.yaml'))

    print(f"Processing {len(targets)} class files...")
    if args.dry_run:
        print("DRY RUN - no files will be modified\n")

    stats = {
        'files_processed': 0,
        'files_modified': 0,
        'classes_processed': 0,
        'sections_removed': {},
        'errors': []
    }

    for target in targets:
        if args.verbose:
            print(f"\nProcessing: {target.name}")
        outcome = process_file(target, dry_run=args.dry_run, verbose=args.verbose)
        stats['files_processed'] += 1
        if outcome['modified']:
            stats['files_modified'] += 1
            if not args.verbose:
                print(f" Modified: {target.name} ({len(outcome['classes_processed'])} classes)")
        stats['classes_processed'] += len(outcome['classes_processed'])
        # Tally removals per section label across all files.
        for label in outcome['removed_sections']:
            stats['sections_removed'][label] = stats['sections_removed'].get(label, 0) + 1
        if outcome['errors']:
            stats['errors'].extend(outcome['errors'])
            print(f" ERROR in {target.name}: {outcome['errors']}")

    # Summary
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    print(f"Files processed: {stats['files_processed']}")
    print(f"Files modified: {stats['files_modified']}")
    print(f"Classes processed: {stats['classes_processed']}")
    print("\nSections removed by type:")
    for label, count in sorted(stats['sections_removed'].items(), key=lambda item: item[1], reverse=True):
        print(f" {label}: {count}")
    if stats['errors']:
        print(f"\nErrors: {len(stats['errors'])}")
        for error in stats['errors'][:10]:
            print(f" - {error}")
    if args.dry_run:
        print("\nDRY RUN complete. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()