#!/usr/bin/env python3
"""
Cleanup Redundant Description Text from LinkML Class Files

This script removes description sections that are NOW represented as
structured slots/classes in the LinkML schema. It follows the principle:
"Only remove text that is already represented as structured KG classes
and predicates"

SECTIONS REMOVED (fully structured):
- **Dual-Class Pattern**: → dual_class_link slot (DualClassLink class)
- **Ontological Alignment**: → class_uri, *_mappings LinkML elements
- **Multilingual Labels**: → structured_aliases LinkML element

SECTIONS KEPT (not yet fully structured):
- **Scope**: - Domain-specific content descriptions
- **Notable Examples**: - Real-world institution examples
- **Historical Significance**: - Contextual importance
- **Privacy Considerations**: - Only partially structured (privacy_note slot)
- **Preservation Challenges**: - Only partially structured (preservation_note slot)
- **Related Types**: - Partially in see_also, but descriptions not structured
- **Wikidata**: Q##### - Keep as human-readable reference

Usage:
    python scripts/cleanup_redundant_descriptions.py --dry-run  # Preview changes
    python scripts/cleanup_redundant_descriptions.py            # Apply changes
"""

import argparse
import re
from pathlib import Path
from typing import List, Tuple

# Sections that ARE fully structured and can be removed.
STRUCTURED_SECTIONS = [
    'Dual-Class Pattern',
    'Ontological Alignment',
    'Multilingual Labels',
]


def remove_section(text: str, section_header: str) -> Tuple[str, bool]:
    """
    Remove a markdown section from description text.

    A section starts with **Header**: and ends at the next **Header**:
    (a line starting, after optional spaces, with ** and a capital letter)
    or at the end of the text.

    Args:
        text: The description text to strip the section from.
        section_header: The header name, without the ** markers or colon.

    Returns:
        Tuple of (modified_text, was_modified).
    """
    # re.DOTALL lets .*? run across newlines; the lookahead stops the match
    # just before the next section header (or end of text) without consuming it.
    pattern = rf'\*\*{re.escape(section_header)}\*\*:.*?(?=\n[ ]*\*\*[A-Z]|\Z)'
    new_text, count = re.subn(pattern, '', text, flags=re.DOTALL)
    return new_text, count > 0


def cleanup_description(description: str) -> Tuple[str, List[str]]:
    """
    Remove all structured sections from a description.

    Args:
        description: The raw description text (content of the YAML block).

    Returns:
        Tuple of (cleaned_description, list_of_removed_sections).
    """
    removed = []
    for section in STRUCTURED_SECTIONS:
        description, was_removed = remove_section(description, section)
        if was_removed:
            removed.append(section)

    # Collapse the gaps the removals leave behind, and drop trailing whitespace.
    description = re.sub(r'\n{3,}', '\n\n', description)
    description = description.rstrip()

    return description, removed


def process_yaml_file(filepath: Path, dry_run: bool = True) -> dict:
    """
    Process a single YAML file to clean up class-level descriptions.

    We only target class-level descriptions (exactly 4 spaces of indent),
    NOT slot_usage descriptions, which sit at a deeper indentation.

    Args:
        filepath: Path to a LinkML class YAML file.
        dry_run: When True, compute statistics but never write the file.

    Returns:
        dict with statistics about changes made
        (keys: file, classes_modified, sections_removed, modified).
    """
    content = filepath.read_text(encoding='utf-8')

    stats = {
        'file': filepath.name,
        'classes_modified': 0,
        'sections_removed': [],
        'modified': False,
    }

    lines = content.split('\n')
    new_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]

        # Look for a class-level description block (exactly 4 spaces indent).
        # This avoids matching slot_usage descriptions, which are indented deeper.
        if re.match(r'^    description: \|', line):
            desc_lines = [line]
            i += 1
            # Collect the literal block's body: content lines are indented
            # with 6+ spaces; blank lines inside the block belong to it too.
            while i < len(lines):
                next_line = lines[i]
                if next_line == '' or re.match(r'^      ', next_line):
                    desc_lines.append(next_line)
                    i += 1
                else:
                    break

            desc_block = '\n'.join(desc_lines)

            # Only rewrite blocks that actually contain a structured section.
            has_sections = any(f'**{s}**:' in desc_block for s in STRUCTURED_SECTIONS)

            if has_sections:
                # Content starts after the "description: |" header line.
                desc_content = '\n'.join(desc_lines[1:])
                cleaned_content, removed = cleanup_description(desc_content)

                if removed:
                    stats['classes_modified'] += 1
                    stats['sections_removed'].extend(removed)
                    stats['modified'] = True

                    # Rebuild the description block with the cleaned content.
                    # NOTE: trailing blank lines collected above are dropped by
                    # the rstrip in cleanup_description; the YAML stays valid
                    # because the literal block ends when indentation decreases.
                    new_lines.append('    description: |')
                    new_lines.extend(cleaned_content.split('\n'))
                else:
                    # Sections mentioned but nothing matched: keep the original.
                    new_lines.extend(desc_lines)
            else:
                # No structured sections at all: keep the original.
                new_lines.extend(desc_lines)
        else:
            new_lines.append(line)
            i += 1

    if stats['modified']:
        new_content = '\n'.join(new_lines)
        # Ensure the file still ends with a newline.
        if not new_content.endswith('\n'):
            new_content += '\n'
        if not dry_run:
            filepath.write_text(new_content, encoding='utf-8')

    return stats


def main():
    """CLI entry point: clean every class YAML file (or a single --file)."""
    parser = argparse.ArgumentParser(
        description='Clean up redundant description text from LinkML class files'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview changes without modifying files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed output for each file'
    )
    parser.add_argument(
        '--file',
        type=str,
        help='Process a single file instead of all files'
    )
    args = parser.parse_args()

    # Find all class YAML files (or honor an explicit --file override).
    classes_dir = Path('schemas/20251121/linkml/modules/classes')
    if args.file:
        yaml_files = [Path(args.file)]
    else:
        yaml_files = sorted(classes_dir.glob('*.yaml'))

    total_modified = 0
    total_sections = 0

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Processing {len(yaml_files)} files...\n")

    for filepath in yaml_files:
        stats = process_yaml_file(filepath, dry_run=args.dry_run)

        if stats['modified']:
            total_modified += 1
            total_sections += len(stats['sections_removed'])

            if args.verbose or args.dry_run:
                print(f"{'Would modify' if args.dry_run else 'Modified'}: {stats['file']}")
                print(f"  Classes: {stats['classes_modified']}")
                print(f"  Sections removed: {', '.join(sorted(set(stats['sections_removed'])))}")
                print()

    print(f"\n{'=' * 60}")
    print("Summary:")
    # BUGFIX: keep the space inside the conditional so the non-dry-run form
    # prints "Files modified:" instead of "Files  modified:" (double space).
    print(f"  Files {'that would be ' if args.dry_run else ''}modified: {total_modified}")
    print(f"  Total sections removed: {total_sections}")

    if args.dry_run:
        print("\nRun without --dry-run to apply changes.")


if __name__ == '__main__':
    main()