#!/usr/bin/env python3
"""
Structuralize Slot Descriptions

Migrates unstructured content from slot description fields to proper LinkML elements:
- **Example**: → examples: list
- **INVERSE PROPERTY**: → comments: list item + inverse annotation
- **W3C ORG Alignment**:/etc → already in exact_mappings, remove from description
- **Navigation**: → comments: list item
- **Rationale**: → comments: list item
- YAML code blocks → examples: list

Usage:
    python scripts/structuralize_slot_descriptions.py [--dry-run] [--verbose] [--file PATH]
"""

import argparse
import re
import sys
from pathlib import Path

from ruamel.yaml import YAML

# Round-trip loader so quoting/ordering in the slot YAML files survives rewriting.
yaml = YAML()
yaml.preserve_quotes = True
yaml.width = 120
yaml.indent(mapping=2, sequence=2, offset=2)

# Section patterns to extract from descriptions.
# Keys are referenced by name in REMOVE_PATTERNS / COMMENT_PATTERNS /
# EXAMPLE_PATTERNS below and in process_slot_description().
SECTION_PATTERNS = {
    # Patterns that should become examples
    'yaml_code_block': re.compile(
        r'\*\*Example\*\*:\s*\n```(?:yaml|turtle)?\n(.*?)```',
        re.DOTALL | re.IGNORECASE),

    # Patterns that should become comments
    'inverse_property': re.compile(
        r'\*\*INVERSE PROPERTY\*\*:\s*\n((?:- .*\n?)+)', re.IGNORECASE),
    'navigation': re.compile(
        r'\*\*Navigation\*\*:\s*\n((?:- .*\n?)+)', re.IGNORECASE),
    'rationale': re.compile(
        r'\*\*Rationale\*\*:\s*\n(.*?)(?=\n\*\*|\n\n[A-Z]|\Z)',
        re.DOTALL | re.IGNORECASE),
    'see_also': re.compile(
        r'\*\*See Also\*\*:\s*\n((?:- .*\n?)+)', re.IGNORECASE),
    'see': re.compile(
        r'\*\*See\*\*:\s*\n((?:- .*\n?)+)', re.IGNORECASE),

    # Patterns that should be REMOVED (already in structured elements)
    'ontology_alignment_w3c': re.compile(
        r'\*\*W3C ORG(?: Alignment)?\*\*:\s*\n.*?(?=\n\*\*|\n\n[A-Z]|\Z)',
        re.DOTALL | re.IGNORECASE),
    'ontology_alignment_cidoc': re.compile(
        r'\*\*CIDOC-CRM(?: Alignment)?\*\*:\s*\n.*?(?=\n\*\*|\n\n[A-Z]|\Z)',
        re.DOTALL | re.IGNORECASE),
    'ontology_alignment_prov': re.compile(
        r'\*\*PROV-O(?: Alignment)?\*\*:\s*\n.*?(?=\n\*\*|\n\n[A-Z]|\Z)',
        re.DOTALL | re.IGNORECASE),
    # NOTE(review): the optional " Alignment" group previously contained a stray
    # line break, so this pattern could never match a single-line header; it is
    # now consistent with its siblings above.
    'ontology_alignment_schema': re.compile(
        r'\*\*Schema\.org(?: Alignment)?\*\*:\s*\n.*?(?=\n\*\*|\n\n[A-Z]|\Z)',
        re.DOTALL | re.IGNORECASE),
    'ontology_alignment_rico': re.compile(
        r'\*\*RiC-O(?: Alignment)?\*\*:\s*\n.*?(?=\n\*\*|\n\n[A-Z]|\Z)',
        re.DOTALL | re.IGNORECASE),
    'ontology_alignment_generic': re.compile(
        r'\*\*Ontology Alignment\*\*:\s*\n.*?(?=\n\*\*|\n\n[A-Z]|\Z)',
        re.DOTALL | re.IGNORECASE),
    'three_layer_alignment': re.compile(
        r'\*\*Three-Layer Ontology Alignment\*\*:.*?(?=\n\*\*[A-Z]|\Z)',
        re.DOTALL | re.IGNORECASE),
    'why_primary': re.compile(
        r'\*\*Why .*? is Primary\*\*:.*?(?=\n\*\*|\Z)',
        re.DOTALL | re.IGNORECASE),
    'why_secondary': re.compile(
        r'\*\*Why .*? is Secondary\*\*:.*?(?=\n\*\*|\Z)',
        re.DOTALL | re.IGNORECASE),
    'why_tertiary': re.compile(
        r'\*\*Why .*? is Tertiary\*\*:.*?(?=\n\*\*|\Z)',
        re.DOTALL | re.IGNORECASE),
    'property_details': re.compile(
        r'Property: `[a-z]+:[A-Za-z0-9_]+`\s*\n- Domain:.*?(?=\n\*\*|\n\n[A-Z]|\Z)',
        re.DOTALL),
    'rdf_serialization': re.compile(
        r'\*\*RDF Serialization(?: Example)?\*\*:\s*\n```.*?```',
        re.DOTALL | re.IGNORECASE),
    'glamorcubesfixphdnt': re.compile(
        r'\*\*GLAMORCUBESFIXPHDNT Taxonomy.*?\*\*:\s*\n(?:- \*\*[A-Z]\*\* - .*\n)+',
        re.IGNORECASE),
    'relationship_to': re.compile(
        r'\*\*Relationship to .*?\*\*:\s*\n.*?(?=\n\*\*|\Z)',
        re.DOTALL | re.IGNORECASE),
    'data_population': re.compile(
        r'\*\*Data Population Strategy\*\*:\s*\n.*?(?=\n\*\*|\Z)',
        re.DOTALL | re.IGNORECASE),
    'special_case': re.compile(
        r'\*\*Special Case.*?\*\*:\s*\n.*?(?=\n\*\*|\Z)',
        re.DOTALL | re.IGNORECASE),
    'example_dutch': re.compile(
        r'\*\*Example - Dutch.*?\*\*:\s*\n```.*?```',
        re.DOTALL | re.IGNORECASE),
    'ghcid_code': re.compile(
        r'\*\*GHCID Code Derivation\*\*:\s*\n.*?(?=\n\*\*|\Z)',
        re.DOTALL | re.IGNORECASE),
    'migration_note': re.compile(
        r'\*\*Migration Note\*\*.*?:\s*\n.*?(?=\n\*\*|\Z)',
        re.DOTALL | re.IGNORECASE),
}

# Patterns to remove entirely (redundant with structured elements)
REMOVE_PATTERNS = [
    'ontology_alignment_w3c',
    'ontology_alignment_cidoc',
    'ontology_alignment_prov',
    'ontology_alignment_schema',
    'ontology_alignment_rico',
    'ontology_alignment_generic',
    'three_layer_alignment',
    'why_primary',
    'why_secondary',
    'why_tertiary',
    'property_details',
    'rdf_serialization',
    'glamorcubesfixphdnt',
    'relationship_to',
    'data_population',
    'special_case',
    'example_dutch',
    'ghcid_code',
    'migration_note',
]

# Patterns to extract to comments
COMMENT_PATTERNS = [
    'inverse_property',
    'navigation',
    'rationale',
    'see_also',
    'see',
]

# Patterns to extract to examples
EXAMPLE_PATTERNS = [
    'yaml_code_block',
]


def extract_inverse_info(match_text: str) -> tuple[str | None, str]:
    """Extract inverse slot name and comment from an inverse-property section.

    Args:
        match_text: The captured body of an **INVERSE PROPERTY** section
            (markdown list items, one per line).

    Returns:
        tuple: (inverse_slot_name or None, comment text). The comment joins
        the recognized "**Inverse**:" and "Pattern:" lines with " | "; if no
        recognized lines are found, the stripped raw section is returned.
    """
    inverse_name = None
    comment_lines = []

    for line in match_text.strip().split('\n'):
        # removeprefix (not lstrip) so content that legitimately starts with
        # '-' is not corrupted — lstrip('- ') strips a character set.
        line = line.strip().removeprefix('- ')
        if line.startswith('**Inverse**:'):
            # Extract inverse slot name: `slot_name` (predicate)
            inverse_match = re.search(r'`([a-z_]+)`', line)
            if inverse_match:
                inverse_name = inverse_match.group(1)
            comment_lines.append(f"Inverse: {line.split(':', 1)[1].strip()}")
        elif line.startswith('Pattern:'):
            comment_lines.append(line)

    return inverse_name, ' | '.join(comment_lines) if comment_lines else match_text.strip()


def extract_yaml_example(match_text: str) -> dict:
    """Convert a captured YAML code block into a LinkML example dict.

    Args:
        match_text: The raw code-block body (without the ``` fences).

    Returns:
        dict with 'value' (truncated to 200 chars + '...') and a generic
        'description'.
    """
    # Clean up the YAML
    cleaned = match_text.strip()
    # Try to extract a meaningful description from context
    return {
        'value': (cleaned[:200] + '...') if len(cleaned) > 200 else cleaned,
        'description': 'Usage example'
    }


def process_slot_description(description: str, slot_data: dict, verbose: bool = False) -> tuple[str, dict, list[str]]:
    """
    Process a slot description, extracting structured content.

    Args:
        description: The raw description text to clean.
        slot_data: The slot's current mapping (read-only here; used to decide
            whether extracted YAML examples are needed).
        verbose: Print per-section extraction details.

    Returns:
        tuple: (cleaned_description, updates_dict, removed_sections_list)
    """
    if not description:
        return description, {}, []

    cleaned = description
    updates = {}
    removed_sections = []

    # First, remove patterns that are redundant with existing structured elements
    for pattern_name in REMOVE_PATTERNS:
        pattern = SECTION_PATTERNS.get(pattern_name)
        if pattern:
            matches = pattern.findall(cleaned)
            if matches:
                removed_sections.append(pattern_name)
                cleaned = pattern.sub('', cleaned)
                if verbose:
                    print(f" Removed: {pattern_name} ({len(matches)} match(es))")

    # Extract inverse property info to comments
    inverse_match = SECTION_PATTERNS['inverse_property'].search(cleaned)
    if inverse_match:
        inverse_name, comment = extract_inverse_info(inverse_match.group(1))
        updates.setdefault('comments', []).append(comment)
        # Add inverse annotation if found
        if inverse_name:
            updates.setdefault('annotations', {})['inverse_slot'] = inverse_name
        cleaned = SECTION_PATTERNS['inverse_property'].sub('', cleaned)
        removed_sections.append('inverse_property')
        if verbose:
            print(f" Extracted inverse property: {inverse_name}")

    # Extract navigation to comments
    nav_match = SECTION_PATTERNS['navigation'].search(cleaned)
    if nav_match:
        nav_text = nav_match.group(1).strip()
        nav_lines = [line.strip().removeprefix('- ') for line in nav_text.split('\n') if line.strip()]
        updates.setdefault('comments', []).append(f"Navigation: {' | '.join(nav_lines)}")
        cleaned = SECTION_PATTERNS['navigation'].sub('', cleaned)
        removed_sections.append('navigation')
        if verbose:
            print(" Extracted navigation")

    # Extract rationale to comments (truncated to 200 chars to keep comments short)
    rationale_match = SECTION_PATTERNS['rationale'].search(cleaned)
    if rationale_match:
        rationale_text = rationale_match.group(1).strip()
        if rationale_text:
            updates.setdefault('comments', []).append(f"Rationale: {rationale_text[:200]}")
        cleaned = SECTION_PATTERNS['rationale'].sub('', cleaned)
        removed_sections.append('rationale')
        if verbose:
            print(" Extracted rationale")

    # Extract see also to comments ('see_also' first so the more specific
    # header wins before the generic 'see' pattern runs)
    for see_pattern in ['see_also', 'see']:
        see_match = SECTION_PATTERNS[see_pattern].search(cleaned)
        if see_match:
            see_text = see_match.group(1).strip()
            see_lines = [line.strip().removeprefix('- ') for line in see_text.split('\n') if line.strip()]
            updates.setdefault('comments', []).append(f"See: {' | '.join(see_lines)}")
            cleaned = SECTION_PATTERNS[see_pattern].sub('', cleaned)
            removed_sections.append(see_pattern)
            if verbose:
                print(f" Extracted {see_pattern}")

    # Extract YAML examples - but only if there are no existing examples
    if not slot_data.get('examples'):
        yaml_match = SECTION_PATTERNS['yaml_code_block'].search(cleaned)
        if yaml_match:
            example = extract_yaml_example(yaml_match.group(1))
            updates['examples'] = [example]
            cleaned = SECTION_PATTERNS['yaml_code_block'].sub('', cleaned)
            removed_sections.append('yaml_code_block')
            if verbose:
                print(" Extracted YAML example")
    else:
        # Remove YAML blocks anyway since we have examples
        cleaned = SECTION_PATTERNS['yaml_code_block'].sub('', cleaned)

    # Clean up extra whitespace left behind by the removed sections
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = cleaned.strip()

    return cleaned, updates, removed_sections


def process_file(file_path: Path, dry_run: bool = False, verbose: bool = False) -> dict:
    """Process a single slot YAML file.

    Cleans the file-level description and every slot description, merging any
    extracted comments/examples/annotations back into each slot. Writes the
    file in place unless dry_run is set.

    Args:
        file_path: Path to the slot YAML file.
        dry_run: If True, compute changes but do not write the file.
        verbose: Forwarded to process_slot_description for detailed output.

    Returns:
        dict with keys 'file', 'modified', 'file_description_cleaned',
        'slots_processed', 'removed_sections', 'errors'. Exceptions are
        captured into 'errors' rather than raised.
    """
    result = {
        'file': str(file_path),
        'modified': False,
        'file_description_cleaned': False,
        'slots_processed': [],
        'removed_sections': [],
        'errors': []
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        data = yaml.load(content)
        if not data:
            return result

        modified = False

        # Process file-level description (if exists)
        if 'description' in data and isinstance(data['description'], str):
            orig_len = len(data['description'])
            cleaned, updates, removed = process_slot_description(
                data['description'], {}, verbose
            )
            if removed:
                # For file-level description, just clean it (don't add updates)
                if len(cleaned) < orig_len * 0.5:  # If more than 50% was removed
                    # Keep only first paragraph
                    first_para = cleaned.split('\n\n')[0] if '\n\n' in cleaned else cleaned
                    data['description'] = first_para.strip()
                else:
                    data['description'] = cleaned
                result['file_description_cleaned'] = True
                result['removed_sections'].extend(removed)
                modified = True
                if verbose:
                    print(" Cleaned file-level description")

        # Process slots
        if 'slots' in data and isinstance(data['slots'], dict):
            for slot_name, slot_data in data['slots'].items():
                if not isinstance(slot_data, dict):
                    continue
                if 'description' in slot_data and isinstance(slot_data['description'], str):
                    cleaned, updates, removed = process_slot_description(
                        slot_data['description'], slot_data, verbose
                    )
                    if removed or updates:
                        slot_data['description'] = cleaned
                        # Merge updates
                        for key, value in updates.items():
                            if key == 'comments':
                                existing = slot_data.get('comments', [])
                                if not isinstance(existing, list):
                                    existing = [existing] if existing else []
                                # Add new comments, avoiding duplicates
                                for comment in value:
                                    if comment not in existing:
                                        existing.append(comment)
                                slot_data['comments'] = existing
                            elif key == 'examples':
                                # Mirror the extraction condition: extraction
                                # runs when 'examples' is missing OR falsy, so
                                # an existing empty list must not block the
                                # merge (previously it silently dropped the
                                # extracted example).
                                if not slot_data.get('examples'):
                                    slot_data['examples'] = value
                            elif key == 'annotations':
                                existing = slot_data.get('annotations', {})
                                existing.update(value)
                                slot_data['annotations'] = existing
                            else:
                                slot_data[key] = value
                        result['slots_processed'].append(slot_name)
                        result['removed_sections'].extend(removed)
                        modified = True

        result['modified'] = modified

        if modified and not dry_run:
            with open(file_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f)

    except Exception as e:
        result['errors'].append(str(e))

    return result


def main():
    """CLI entry point: process all slot files (or one via --file) and print a summary."""
    parser = argparse.ArgumentParser(description='Structuralize slot descriptions')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without modifying files')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output')
    parser.add_argument('--file', type=str, help='Process a single file')
    args = parser.parse_args()

    slots_dir = Path('schemas/20251121/linkml/modules/slots')

    if args.file:
        files = [Path(args.file)]
    else:
        files = sorted(slots_dir.glob('*.yaml'))

    print(f"Processing {len(files)} slot files...")
    if args.dry_run:
        print("DRY RUN - no files will be modified\n")

    stats = {
        'files_processed': 0,
        'files_modified': 0,
        'slots_processed': 0,
        'sections_removed': {},
        'errors': []
    }

    for file_path in files:
        if args.verbose:
            print(f"\nProcessing: {file_path.name}")

        result = process_file(file_path, dry_run=args.dry_run, verbose=args.verbose)
        stats['files_processed'] += 1

        if result['modified']:
            stats['files_modified'] += 1
            if not args.verbose:
                print(f" Modified: {file_path.name} ({len(result['slots_processed'])} slots)")

        stats['slots_processed'] += len(result['slots_processed'])
        for section in result['removed_sections']:
            stats['sections_removed'][section] = stats['sections_removed'].get(section, 0) + 1

        if result['errors']:
            stats['errors'].extend(result['errors'])
            print(f" ERROR in {file_path.name}: {result['errors']}")

    # Summary
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    print(f"Files processed: {stats['files_processed']}")
    print(f"Files modified: {stats['files_modified']}")
    print(f"Slots processed: {stats['slots_processed']}")
    print("\nSections removed by type:")
    for section, count in sorted(stats['sections_removed'].items(), key=lambda x: -x[1]):
        print(f" {section}: {count}")

    if stats['errors']:
        print(f"\nErrors: {len(stats['errors'])}")
        for error in stats['errors'][:10]:
            print(f" - {error}")

    if args.dry_run:
        print("\nDRY RUN complete. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()