# glam/scripts/normalize_linkml_structured_aliases.py

#!/usr/bin/env python3
"""Normalize LinkML structured_aliases to the standard list-of-objects form.

LinkML expects structured_aliases to be a list of Alias-like objects, e.g.:

  structured_aliases:
    - literal_form: foo
      in_language: nl

In this repo many modules use a language-keyed map:

  structured_aliases:
    nl:
      - literal_form: foo
        language: nl

This script flattens language-keyed maps into a single list and normalizes
`language` -> `in_language`.

Usage:
  python scripts/normalize_linkml_structured_aliases.py \
    --root schemas/20251121/linkml/modules
"""

from __future__ import annotations

import argparse
from pathlib import Path
from typing import Any, List, Tuple

from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap, CommentedSeq


def _normalize_alias_item(item: CommentedMap, default_lang: str | None) -> bool:
    """Normalize the language key of a single alias mapping in place.

    * A legacy ``language`` key is renamed to ``in_language`` when the item
      has no ``in_language`` yet.
    * A legacy ``language`` key that merely duplicates an existing
      ``in_language`` value is removed.  If the two values disagree, both
      keys are left untouched, because there is no safe way to pick one.
    * If the item still has no ``in_language`` and *default_lang* is truthy,
      ``in_language`` is set to *default_lang*.

    Args:
        item: The alias mapping to mutate.
        default_lang: Language to fill in when the item carries none
            (``None`` or an empty string disables the fallback).

    Returns:
        True if *item* was modified, False otherwise.
    """
    changed = False

    if "language" in item:
        legacy = item["language"]
        if "in_language" not in item:
            # Plain rename: the legacy value becomes the canonical key.
            item["in_language"] = legacy
            del item["language"]
            changed = True
        elif item["in_language"] == legacy:
            # Redundant duplicate of the canonical key; drop the legacy one.
            del item["language"]
            changed = True

    if "in_language" not in item and default_lang:
        item["in_language"] = default_lang
        changed = True

    return changed


def normalize_structured_aliases(obj: Any) -> bool:
    """Rewrite every ``structured_aliases`` value under *obj* to list form.

    The loaded YAML tree is walked recursively and mutated in place:

    * A ``structured_aliases`` value that is a mapping is replaced by a flat
      ``CommentedSeq`` of alias mappings (see ``_flatten_alias_map``).
    * A ``structured_aliases`` value that is already a list keeps its shape;
      only the language key of each mapping item is normalized.
    * Any other key's value is descended into; the contents of a
      ``structured_aliases`` value itself are not searched further.

    Args:
        obj: A node of the loaded YAML document (mapping, sequence or scalar).

    Returns:
        True if anything under *obj* was modified, False otherwise.
    """
    changed = False

    if isinstance(obj, CommentedMap):
        # Snapshot the keys: values may be reassigned while iterating.
        for key in list(obj.keys()):
            val = obj[key]

            if key != "structured_aliases":
                if normalize_structured_aliases(val):
                    changed = True
                continue

            if isinstance(val, CommentedMap):
                obj[key] = _flatten_alias_map(val)
                changed = True
            elif isinstance(val, list):
                # Already a list: normalize each mapping item in place so the
                # edit is actually retained by the list (no detached copies).
                for entry in val:
                    if isinstance(entry, dict) and _normalize_alias_item(entry, None):
                        changed = True

    elif isinstance(obj, (CommentedSeq, list)):
        for item in obj:
            if normalize_structured_aliases(item):
                changed = True

    return changed


def _flatten_alias_map(alias_map: CommentedMap) -> CommentedSeq:
    """Turn a mapping-valued ``structured_aliases`` into a flat alias list.

    Two mapping shapes are recognised:

    * A single alias object (it has a ``literal_form`` key): it is normalized
      and wrapped in a one-element list instead of being mistaken for a
      language map.
    * A language-keyed map: each key is used (via ``str``) as the default
      language for the entries below it.  An entry may be a list or a single
      value; mapping entries are normalized, bare strings become
      ``{literal_form: <string>, in_language: <key>}``, and anything else
      (e.g. an empty ``nl:`` value) is skipped.
    """
    flat = CommentedSeq()

    if "literal_form" in alias_map:
        _normalize_alias_item(alias_map, None)
        flat.append(alias_map)
        return flat

    for lang, entries in alias_map.items():
        default_lang = str(lang)
        candidates = entries if isinstance(entries, list) else [entries]
        for entry in candidates:
            if isinstance(entry, CommentedMap):
                alias = entry
            elif isinstance(entry, dict):
                alias = CommentedMap(entry)
            elif isinstance(entry, str):
                # Bare string alias: keep it rather than silently losing it.
                alias = CommentedMap()
                alias["literal_form"] = entry
            else:
                continue
            _normalize_alias_item(alias, default_lang)
            flat.append(alias)

    return flat


def process_file(yaml_rt: YAML, path: Path) -> Tuple[bool, str]:
    """Normalize ``structured_aliases`` in one YAML file, rewriting it if needed.

    Args:
        yaml_rt: The YAML instance used for both loading and dumping.
        path: File to process; read and written as UTF-8.

    Returns:
        ``(changed, error)``.  *error* is an empty string on success; when
        the file cannot be loaded or is empty, *changed* is False and
        *error* describes the problem.  The file is only rewritten when
        *changed* is True.
    """
    import io  # local import: only needed for the in-memory dump buffer

    try:
        data = yaml_rt.load(path.read_text(encoding="utf-8"))
    except Exception as e:
        return False, f"YAML parse error: {e}"

    if data is None:
        return False, "Empty YAML"

    if not normalize_structured_aliases(data):
        return False, ""

    # Serialize fully in memory first: opening the target for writing
    # truncates it, so a dump failure must happen before the file is touched.
    buffer = io.StringIO()
    yaml_rt.dump(data, buffer)
    with path.open("w", encoding="utf-8") as f:
        f.write(buffer.getvalue())
    return True, ""


def main() -> int:
    """Command-line entry point.

    Parses ``--root``, runs ``process_file`` on every ``*.yaml`` file found
    recursively below it, and prints a summary of scanned, changed and
    failed files (at most 30 failures are listed individually).

    Returns:
        2 if the root path does not exist, 1 if any file reported an error,
        0 otherwise.
    """
    cli = argparse.ArgumentParser(description="Normalize LinkML structured_aliases")
    cli.add_argument(
        "--root",
        default="schemas/20251121/linkml/modules",
        help="Root directory containing LinkML modules",
    )
    root = Path(cli.parse_args().root)

    if not root.exists():
        print(f"Root not found: {root}")
        return 2

    # Round-trip mode, keeping quotes and using a very large line width.
    yaml_rt = YAML(typ="rt")
    yaml_rt.preserve_quotes = True
    yaml_rt.width = 4096

    files = sorted(root.rglob("*.yaml"))
    changed_files: List[Path] = []
    errors: List[Tuple[Path, str]] = []

    for path in files:
        was_changed, message = process_file(yaml_rt, path)
        if message:
            errors.append((path, message))
        elif was_changed:
            changed_files.append(path)

    print(f"Scanned:  {len(files)}")
    print(f"Changed:  {len(changed_files)}")
    print(f"Errors:   {len(errors)}")

    if not errors:
        return 0

    # Cap the detailed listing so a mass failure stays readable.
    for path, message in errors[:30]:
        print(f"- {path}: {message}")
    overflow = len(errors) - 30
    if overflow > 0:
        print(f"... and {overflow} more")
    return 1


# Run the CLI only when executed as a script; the integer returned by main()
# is passed to SystemExit and so becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())