#!/usr/bin/env python3
"""
Cleanup Class Descriptions v2 - Text-Based Approach

This script removes redundant sections from class descriptions using
TEXT-BASED regex replacement, NOT YAML parsing. This preserves the exact
formatting of files (including their CRLF/LF line-ending style).

Sections removed:
- **Dual-Class Pattern**: - Redundant (class hierarchy captures this)
- **Ontological Alignment**: - Redundant (mappings capture this)
- **ONTOLOGY ALIGNMENT**: - Same as above, different case
- **Multilingual Labels**: - Redundant (structured_aliases captures this)
- **RDF Serialization**: - Implementation detail
- **SKOS**: / **SKOS Alignment**: - Redundant (mappings capture this)
- **Dublin Core**: - Redundant (mappings capture this)
- **Primary GLAMORCUBESFIXPHDNT Category**: - Redundant (annotations capture this)
- **Example Structure**: - Implementation detail

Usage:
    python scripts/cleanup_class_descriptions_v2.py [--dry-run] [--verbose] [--file PATH]
"""

import argparse
import re
import traceback
from collections import Counter
from pathlib import Path

# Patterns to remove from description content, as (regex, section_name) pairs.
# Every pattern starts by consuming the newline that precedes the section
# header and then consumes the header line plus the body lines that follow.
#
# NOTE(review): the indentation inside these patterns is a single space, as
# found in this copy of the script. Confirm it matches the indentation of the
# description blocks in the target YAML files before running on real data.
REMOVE_PATTERNS = [
    # Dual-class pattern - matches until next section or end of indented block
    (r'\n \*\*Dual-Class Pattern\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'dual_class_pattern'),

    # Ontology alignment sections (various cases)
    (r'\n \*\*ONTOLOGY ALIGNMENT\*\*:[^\n]*\n(?: [^\n*][^\n]*\n| \n)*(?: [0-9]+\. \*\*[^\n]+\n(?: [^\n]+\n)*)*', 'ontology_alignment_upper'),
    (r'\n \*\*Ontological Alignment\*\*:[^\n]*\n(?: - \*\*[^\n]+\n)*', 'ontological_alignment'),
    (r'\n \*\*Ontology Alignment\*\*:[^\n]*\n(?: - \*\*[^\n]+\n)*', 'ontology_alignment_mixed'),

    # Multilingual labels - bullet list
    (r'\n \*\*Multilingual Labels\*\*:\n(?: - [a-z]{2,3}: [^\n]+\n)+', 'multilingual_labels'),

    # SKOS alignment sections
    (r'\n \*\*SKOS\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'skos_alignment'),
    (r'\n \*\*SKOS Alignment\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'skos_alignment_full'),

    # Dublin Core section
    (r'\n \*\*Dublin Core\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'dublin_core'),

    # RDF examples with code blocks
    (r'\n \*\*RDF Serialization(?: Example)?\*\*:\s*\n ```[^\n]*\n(?: [^\n]*\n)*? ```\n', 'rdf_serialization'),

    # Example JSON/YAML structure with code blocks
    (r'\n \*\*Example(?: JSON| YAML)? Structure\*\*:\s*\n ```[^\n]*\n(?: [^\n]*\n)*? ```\n', 'example_structure'),

    # GLAMORCUBES category
    (r'\n \*\*Primary GLAMORCUBESFIXPHDNT Category\*\*:[^\n]*\n(?: [^\n*][^\n]*\n)*', 'glamorcubes_category'),
]

# Flags applied to every entry of REMOVE_PATTERNS. IGNORECASE means the
# differently-cased header variants above also match each other's headers.
_PATTERN_FLAGS = re.DOTALL | re.IGNORECASE


def process_file(file_path: Path, dry_run: bool = False, verbose: bool = False) -> dict:
    """Process a single class YAML file using text-based replacement.

    Each pattern in REMOVE_PATTERNS is applied in order to the file's text,
    runs of four or more newlines are collapsed to three, and the file is
    rewritten only when the text actually changed.

    Args:
        file_path: Path of the file to clean up.
        dry_run: When true, detect changes but never write to disk.
        verbose: When true, print each removed section name and print the
            traceback of any error.

    Returns:
        A dict with the keys ``file`` (path as a string), ``modified``
        (whether the cleaned text differs from the original),
        ``removed_sections`` (names of the patterns that matched, in
        application order) and ``errors`` (messages of caught exceptions).
        Exceptions are recorded in ``errors`` rather than raised.
    """
    result = {
        'file': str(file_path),
        'modified': False,
        'removed_sections': [],
        'errors': []
    }

    try:
        # newline='' turns off newline translation so the file's own line
        # endings can be detected here and restored when writing back.
        with open(file_path, 'r', encoding='utf-8', newline='') as f:
            raw_text = f.read()

        # The patterns are written against '\n', so match on an
        # LF-normalized copy and remember whether CRLF must be restored.
        uses_crlf = '\r\n' in raw_text
        original_content = raw_text.replace('\r\n', '\n') if uses_crlf else raw_text
        content = original_content

        # Apply each removal pattern; subn reports the match count, so one
        # scan both removes the section and tells us whether it was present.
        for pattern, section_name in REMOVE_PATTERNS:
            content, hits = re.compile(pattern, _PATTERN_FLAGS).subn('', content)
            if hits:
                result['removed_sections'].append(section_name)
                if verbose:
                    print(f"  Removed: {section_name}")

        # Clean up multiple consecutive blank lines (more than 2)
        content = re.sub(r'\n{4,}', '\n\n\n', content)

        # Only touch the file when the cleaned text differs.
        if content != original_content:
            result['modified'] = True
            if not dry_run:
                # A file that contained any CRLF is written back entirely
                # with CRLF; all other files keep their bare LF endings.
                output = content.replace('\n', '\r\n') if uses_crlf else content
                with open(file_path, 'w', encoding='utf-8', newline='') as f:
                    f.write(output)

    except Exception as e:
        # Per-file boundary: record the failure so a batch run can continue
        # with the remaining files and report all errors in the summary.
        result['errors'].append(str(e))
        if verbose:
            traceback.print_exc()

    return result


def main():
    """Parse command-line options, clean the selected files, print a summary.

    Without ``--file``, every ``*.yaml`` file directly inside
    ``schemas/20251121/linkml/modules/classes`` (resolved against the current
    working directory) is processed in sorted order.
    """
    parser = argparse.ArgumentParser(description='Cleanup class descriptions (text-based)')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without modifying files')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output')
    parser.add_argument('--file', type=str, help='Process a single file')
    args = parser.parse_args()

    classes_dir = Path('schemas/20251121/linkml/modules/classes')

    if args.file:
        files = [Path(args.file)]
    else:
        files = sorted(classes_dir.glob('*.yaml'))

    print(f"Processing {len(files)} class files...")
    if args.dry_run:
        print("DRY RUN - no files will be modified\n")

    files_processed = 0
    files_modified = 0
    # Number of files in which each section type was removed.
    sections_removed = Counter()
    errors = []

    for file_path in files:
        if args.verbose:
            print(f"\nProcessing: {file_path.name}")

        result = process_file(file_path, dry_run=args.dry_run, verbose=args.verbose)
        files_processed += 1

        if result['modified']:
            files_modified += 1
            # In verbose mode the file name was already printed above.
            if not args.verbose:
                print(f"  Modified: {file_path.name}")

        sections_removed.update(result['removed_sections'])

        if result['errors']:
            errors.extend(result['errors'])
            print(f"  ERROR in {file_path.name}: {result['errors']}")

    # Summary
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    print(f"Files processed: {files_processed}")
    print(f"Files modified: {files_modified}")
    print("\nSections removed by type:")
    # most_common() orders by descending count and keeps first-seen order
    # among ties.
    for section, count in sections_removed.most_common():
        print(f"  {section}: {count}")

    if errors:
        print(f"\nErrors: {len(errors)}")
        # Cap the listing so a systematic failure does not flood the output.
        for error in errors[:10]:
            print(f"  - {error}")

    if args.dry_run:
        print("\nDRY RUN complete. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()