#!/usr/bin/env python3
"""Cleanup Class Descriptions - Simplified Version

This script ONLY removes redundant sections from class descriptions.
It does NOT try to extract and store complex data structures.

Sections removed:
- **Dual-Class Pattern**: - Redundant (class hierarchy captures this)
- **Ontological Alignment**: - Redundant (mappings capture this)
- **Multilingual Labels**: - Redundant (structured_aliases captures this)
- **RDF Serialization**: - Implementation detail
- **SKOS**: - Redundant (mappings capture this)
- **Dublin Core**: - Redundant (mappings capture this)
- **Primary GLAMORCUBESFIXPHDNT Category**: - Redundant (annotations capture this)
- **Example Structure**: - Implementation detail

Sections KEPT (contain unique information):
- **Wikidata**: Q-number reference (important)
- **Scope**: Detailed scope description
- **Notable Examples**: Real-world institution examples
- **Related Types**: Linked types with Wikidata IDs
- **Historical Significance**: Historical context
- **Dutch Context**: Dutch-specific information
- etc.

Usage:
    python scripts/cleanup_class_descriptions.py [--dry-run] [--verbose] [--file PATH]
"""

import argparse
import re
from pathlib import Path

# Prefer ruamel.yaml: it round-trips comments, quoting, and key order.
# Fall back to PyYAML; if neither is installed, defer the failure to
# process_file() so clean_description() stays usable on plain strings.
try:
    from ruamel.yaml import YAML

    yaml = YAML()
    yaml.preserve_quotes = True
    yaml.width = 120
    yaml.indent(mapping=2, sequence=2, offset=2)
    USE_RUAMEL = True
except ImportError:
    USE_RUAMEL = False
    try:
        import yaml
    except ImportError:  # no YAML library at all; checked in process_file()
        yaml = None

# Sections to REMOVE entirely (already structured elsewhere or redundant).
# Each entry is (regex_source, section_name); patterns are compiled once
# below with DOTALL | IGNORECASE. Note: because matching is case-insensitive,
# some case-variant entries (e.g. 'ontology_alignment_mixed') are kept only
# so reported section names stay stable.
REMOVE_PATTERNS = [
    # Dual-class pattern - redundant with class hierarchy
    (r'\n\s*\*\*Dual-Class Pattern\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)',
     'dual_class_pattern'),
    # Ontology alignment sections - redundant with mappings
    (r'\n\s*\*\*Ontological Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)',
     'ontological_alignment'),
    (r'\n\s*\*\*ONTOLOGY ALIGNMENT\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)',
     'ontology_alignment_upper'),
    (r'\n\s*\*\*Ontology Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)',
     'ontology_alignment_mixed'),
    # Multilingual labels - redundant with structured_aliases
    (r'\n\s*\*\*Multilingual Labels\*\*:\s*\n(?:\s*- [a-z]{2}: .*\n)+',
     'multilingual_labels'),
    # SKOS alignment - redundant with mappings
    (r'\n\s*\*\*SKOS\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)',
     'skos_alignment'),
    (r'\n\s*\*\*SKOS Alignment\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)',
     'skos_alignment_full'),
    # Dublin Core - redundant with mappings
    (r'\n\s*\*\*Dublin Core\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)',
     'dublin_core'),
    # RDF examples - implementation details
    (r'\n\s*\*\*RDF Serialization(?: Example)?\*\*:\s*\n```.*?```',
     'rdf_serialization'),
    # Example JSON/YAML structure - implementation details
    (r'\n\s*\*\*Example(?: JSON| YAML)? Structure\*\*:\s*\n```.*?```',
     'example_structure'),
    # GLAMORCUBES category - redundant with annotations
    (r'\n\s*\*\*Primary GLAMORCUBESFIXPHDNT Category\*\*:\s*\n.*?(?=\n\s*\*\*[A-Z]|\n\s*\Z)',
     'glamorcubes_category'),
]

# Compile once at import time instead of on every clean_description() call.
_COMPILED_PATTERNS = [
    (re.compile(pattern, re.DOTALL | re.IGNORECASE), section_name)
    for pattern, section_name in REMOVE_PATTERNS
]


def clean_description(description: str, verbose: bool = False) -> tuple[str, list[str]]:
    """Remove redundant sections from a class description.

    Args:
        description: Raw description text (may be empty or None-ish).
        verbose: When True, print the name of each removed section.

    Returns:
        tuple: (cleaned_description, list_of_removed_section_names)
    """
    if not description:
        return description, []

    cleaned = description
    removed_sections = []
    for regex, section_name in _COMPILED_PATTERNS:
        if regex.search(cleaned):
            cleaned = regex.sub('', cleaned)
            removed_sections.append(section_name)
            if verbose:
                print(f"    Removed: {section_name}")

    # Collapse the runs of blank lines the removals leave behind.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = cleaned.strip()
    return cleaned, removed_sections


def process_file(file_path: Path, dry_run: bool = False, verbose: bool = False) -> dict:
    """Process a single class YAML file.

    Loads the file, cleans every class description, and (unless dry_run)
    writes it back with the same YAML library that parsed it.

    Args:
        file_path: Path to the YAML file.
        dry_run: When True, never write changes back to disk.
        verbose: When True, print per-class progress and tracebacks.

    Returns:
        dict with keys: 'file', 'modified', 'classes_processed',
        'removed_sections', 'errors'. Errors are recorded, not raised,
        so a batch run can continue past a bad file.
    """
    result = {
        'file': str(file_path),
        'modified': False,
        'classes_processed': [],
        'removed_sections': [],
        'errors': [],
    }
    try:
        if yaml is None:
            raise RuntimeError(
                'No YAML library available: install ruamel.yaml or PyYAML')

        content = file_path.read_text(encoding='utf-8')
        if USE_RUAMEL:
            from io import StringIO
            data = yaml.load(StringIO(content))
        else:
            # module-level `yaml` is PyYAML in this branch
            data = yaml.safe_load(content)
        if not data:
            return result

        modified = False
        if 'classes' in data and isinstance(data['classes'], dict):
            for class_name, class_data in data['classes'].items():
                if not isinstance(class_data, dict):
                    continue
                desc = class_data.get('description')
                if not isinstance(desc, str):
                    continue
                cleaned_desc, removed = clean_description(desc, verbose)
                if removed:
                    class_data['description'] = cleaned_desc
                    result['classes_processed'].append(class_name)
                    result['removed_sections'].extend(removed)
                    modified = True
                    if verbose:
                        print(f"  Class: {class_name}")

        result['modified'] = modified
        if modified and not dry_run:
            with open(file_path, 'w', encoding='utf-8') as f:
                if USE_RUAMEL:
                    yaml.dump(data, f)
                else:
                    # NOTE: PyYAML does not preserve comments or quoting.
                    yaml.dump(data, f, default_flow_style=False,
                              allow_unicode=True, sort_keys=False, width=120)
    except Exception as e:
        result['errors'].append(str(e))
        if verbose:
            import traceback
            traceback.print_exc()
    return result


def main() -> None:
    """CLI entry point: clean class YAML files and print a summary."""
    parser = argparse.ArgumentParser(
        description='Cleanup class descriptions by removing redundant sections')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without modifying files')
    parser.add_argument('--verbose', action='store_true',
                        help='Show detailed output')
    parser.add_argument('--file', type=str, help='Process a single file')
    args = parser.parse_args()

    classes_dir = Path('schemas/20251121/linkml/modules/classes')
    if args.file:
        files = [Path(args.file)]
    else:
        files = sorted(classes_dir.glob('*.yaml'))

    print(f"Processing {len(files)} class files...")
    if args.dry_run:
        print("DRY RUN - no files will be modified\n")

    stats = {
        'files_processed': 0,
        'files_modified': 0,
        'classes_processed': 0,
        'sections_removed': {},
        'errors': [],
    }
    for file_path in files:
        if args.verbose:
            print(f"\nProcessing: {file_path.name}")
        result = process_file(file_path, dry_run=args.dry_run,
                              verbose=args.verbose)
        stats['files_processed'] += 1
        if result['modified']:
            stats['files_modified'] += 1
            if not args.verbose:
                print(f"  Modified: {file_path.name} "
                      f"({len(result['classes_processed'])} classes)")
        stats['classes_processed'] += len(result['classes_processed'])
        for section in result['removed_sections']:
            stats['sections_removed'][section] = \
                stats['sections_removed'].get(section, 0) + 1
        if result['errors']:
            stats['errors'].extend(result['errors'])
            print(f"  ERROR in {file_path.name}: {result['errors']}")

    # Summary
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    print(f"Files processed: {stats['files_processed']}")
    print(f"Files modified: {stats['files_modified']}")
    print(f"Classes processed: {stats['classes_processed']}")
    print("\nSections removed by type:")
    for section, count in sorted(stats['sections_removed'].items(),
                                 key=lambda x: -x[1]):
        print(f"  {section}: {count}")
    if stats['errors']:
        print(f"\nErrors: {len(stats['errors'])}")
        for error in stats['errors'][:10]:
            print(f"  - {error}")
    if args.dry_run:
        print("\nDRY RUN complete. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()