# glam/scripts/fix_yaml_examples_v2.py

#!/usr/bin/env python3
"""
Fix YAML examples formatting issues in LinkML slot files.

The main issue is examples sections with bad formatting like:
    examples:
      - value: "multi\nline\nstring..."
      description: Some description

When it should be:
    examples:
      - value: |
          multi
          line
          string
        description: Some description
"""

import re
import sys
from pathlib import Path


# Patterns used while scanning an ``examples:`` section.
_EXAMPLES_KEY = re.compile(r'^(\s*)examples:\s*$')
_QUOTED_VALUE_ITEM = re.compile(r'^(\s*)- value:\s*"(.*)$')
_VALUE_ITEM = re.compile(r'^(\s*)- value:')
_DESCRIPTION_KEY = re.compile(r'^(\s*)(description:\s*.*)$')
_ESCAPE_SEQUENCE = re.compile(r'\\(.)')

# Escapes that are translated when a double-quoted scalar becomes a literal
# block scalar; any other escape sequence is left untouched.
_SIMPLE_ESCAPES = {'n': '\n', 't': '\t', '"': '"', '\\': '\\'}


def _indent_width(line: str) -> int:
    """Return the number of leading whitespace characters in *line*."""
    return len(line) - len(line.lstrip())


def _closes_quoted_scalar(text: str) -> bool:
    """Return True if *text* ends with an unescaped double quote."""
    stripped = text.rstrip()
    if not stripped.endswith('"'):
        return False
    body = stripped[:-1]
    # An odd run of backslashes escapes the quote; an even run is made of
    # escaped backslashes, so the quote really terminates the scalar.
    backslashes = len(body) - len(body.rstrip('\\'))
    return backslashes % 2 == 0


def _unescape_double_quoted(text: str) -> str:
    """Translate the escapes in ``_SIMPLE_ESCAPES`` in a single pass."""
    return _ESCAPE_SEQUENCE.sub(
        lambda match: _SIMPLE_ESCAPES.get(match.group(1), match.group(0)),
        text,
    )


def _read_quoted_value(lines: list, index: int, first_fragment: str) -> tuple:
    """Collect a double-quoted scalar that starts on ``lines[index]``.

    Args:
        lines: All lines of the document.
        index: Index of the line holding the opening quote.
        first_fragment: Text following the opening quote on that line.

    Returns:
        ``(value, next_index)``: the still-escaped value without its closing
        quote, and the index of the first line after the scalar. If the
        input ends before a closing quote is found, everything read so far
        is returned.
    """
    fragments = [first_fragment.rstrip()]
    while not _closes_quoted_scalar(fragments[-1]):
        index += 1
        if index >= len(lines):
            break
        fragments.append(lines[index].strip())

    if _closes_quoted_scalar(fragments[-1]):
        fragments[-1] = fragments[-1][:-1]

    # Continuation lines are stripped and joined with one space; each blank
    # line in between becomes a newline instead.
    value = fragments[0]
    blank_lines = 0
    for fragment in fragments[1:]:
        if not fragment:
            blank_lines += 1
            continue
        value += ('\n' * blank_lines) if blank_lines else ' '
        value += fragment
        blank_lines = 0
    return value, index + 1


def _fix_examples_section(lines: list, index: int, key_indent: int,
                          result: list) -> int:
    """Copy one ``examples:`` section into *result*, repairing its items.

    Args:
        lines: All lines of the document.
        index: Index of the first line after the ``examples:`` key.
        key_indent: Indentation width of the ``examples:`` key.
        result: Output list that receives the (possibly rewritten) lines.

    Returns:
        Index of the first line that is no longer part of the section.
    """
    # Column of the dash of the most recent "- value:" item, if any.
    item_indent = None

    while index < len(lines):
        line = lines[index]
        stripped = line.strip()

        if stripped and not stripped.startswith('#'):
            width = _indent_width(line)
            is_item = stripped == '-' or stripped.startswith('- ')
            # The section ends at the first line indented less than the key,
            # or at a sibling key on the key's own column. List items may
            # legally sit on the same column as the key.
            if width < key_indent or (width == key_indent and not is_item):
                break

        quoted = _QUOTED_VALUE_ITEM.match(line)
        if quoted:
            indent = quoted.group(1)
            item_indent = len(indent)
            raw_value, index = _read_quoted_value(lines, index, quoted.group(2))

            result.append(f'{indent}- value: |')
            for val_line in _unescape_double_quoted(raw_value).split('\n'):
                # Blank lines are emitted empty to avoid trailing whitespace.
                result.append(f'{indent}    {val_line}' if val_line else '')
            continue

        plain = _VALUE_ITEM.match(line)
        if plain:
            item_indent = len(plain.group(1))

        description = _DESCRIPTION_KEY.match(line)
        if (
            description
            and item_indent is not None
            and len(description.group(1)) == item_indent
        ):
            # Only a description sitting on the dash column is misplaced;
            # one that is already indented under the item is left alone.
            result.append(f'{description.group(1)}  {description.group(2)}')
            index += 1
            continue

        result.append(line)
        index += 1

    return index


def fix_yaml_content(content: str) -> str:
    """Fix badly formatted ``examples:`` sections in YAML text.

    Within every ``examples:`` section:

    * ``- value: "..."`` items are rewritten as literal block scalars
      (``- value: |``) with ``\\n``, ``\\t``, ``\\"`` and ``\\\\`` unescaped.
    * a ``description:`` key placed on the same column as the item's dash is
      indented by two more spaces so it becomes part of the item.

    Lines outside ``examples:`` sections are copied unchanged. A section
    ends at the first non-blank, non-comment line that is indented less than
    the ``examples:`` key, or that sits on the key's column without being a
    list item.

    Args:
        content: Full text of a YAML document.

    Returns:
        The text with the fixes applied; line breaks are ``\\n``.
    """
    lines = content.split('\n')
    result = []
    i = 0

    while i < len(lines):
        line = lines[i]
        result.append(line)
        i += 1

        section = _EXAMPLES_KEY.match(line)
        if section:
            i = _fix_examples_section(lines, i, len(section.group(1)), result)

    return '\n'.join(result)


def fix_file(filepath: Path) -> bool:
    """Apply ``fix_yaml_content`` to one file, rewriting it in place.

    The file is read and written as UTF-8 rather than the locale's default
    encoding, so non-ASCII content is handled the same on every platform.

    Args:
        filepath: Path of the YAML file to fix.

    Returns:
        True if the file was rewritten. False if the content did not change
        or the file could not be read (an error message is printed then).
    """
    try:
        original = filepath.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading {filepath}: {e}")
        return False

    fixed = fix_yaml_content(original)
    if fixed == original:
        return False

    filepath.write_text(fixed, encoding='utf-8')
    return True


def main():
    """Fix every unparsable ``*.yaml`` file in the slots directory.

    Files that already parse are skipped. Each remaining file is passed to
    ``fix_file`` and parsed again; the outcome is printed per file, followed
    by a summary. Exits with status 1 if the slots directory does not exist.
    """
    # Imported once, up front: a missing PyYAML then fails with a plain
    # ImportError instead of an unbound-name error inside the loop.
    import yaml

    slots_dir = Path('schemas/20251121/linkml/modules/slots')

    if not slots_dir.exists():
        print(f"Directory not found: {slots_dir}")
        sys.exit(1)

    def parses(path: Path) -> bool:
        """Return True if *path* decodes as UTF-8 and loads as YAML."""
        try:
            yaml.safe_load(path.read_text(encoding='utf-8'))
        except (yaml.YAMLError, UnicodeDecodeError):
            return False
        return True

    modified = 0
    errors = 0

    for yaml_file in sorted(slots_dir.glob('*.yaml')):
        if parses(yaml_file):
            continue  # File is valid, skip

        if not fix_file(yaml_file):
            # The fixer changed nothing, so the file is still broken; count
            # it so the summary does not under-report.
            print(f"Could not fix: {yaml_file.name}")
            errors += 1
        elif parses(yaml_file):
            print(f"Fixed: {yaml_file.name}")
            modified += 1
        else:
            print(f"Still broken after fix: {yaml_file.name}")
            errors += 1

    print(f"\nModified {modified} files, {errors} still have errors")


# Run the fixer only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()