#!/usr/bin/env python3
"""Normalize LinkML alt_descriptions to simple lang->string form.

The LinkML runtime expects:

  alt_descriptions:
    en: "..."
    nl: "..."

However, many modules in this repo use a structured form:

  alt_descriptions:
    nl: {text: "...", language: nl}

This script converts the structured form to the expected scalar form while
preserving YAML formatting as much as possible (ruamel.yaml round-trip).

It also handles malformed inline maps where extra keys were introduced due to
unquoted fragments (heuristic: append keys with null values to the text).

Usage:
  python scripts/normalize_linkml_alt_descriptions.py \
    --root schemas/20251121/linkml/modules
"""

from __future__ import annotations

import argparse
import io
from pathlib import Path
from typing import Any, List, Tuple

from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap, CommentedSeq

# Keys of a structured entry that are consumed by the conversion itself and
# must therefore never be treated as stray text fragments.
_STRUCTURED_KEYS = ("text", "language")


def _flatten_entry(entry: dict) -> str:
    """Collapse one structured ``{text: ..., language: ...}`` entry to a string.

    A missing or null ``text`` becomes the empty string; any other value is
    converted with ``str``.

    Heuristic for malformed inline maps: when an unquoted fragment made YAML
    create extra keys, those keys are appended to the text, separated by
    single spaces.  A key whose value is a non-blank string contributes
    ``"<key> <value>"``; every other extra key contributes only the key.
    """
    text = entry.get("text")
    text = "" if text is None else str(text)

    extras: List[str] = []
    for key, value in entry.items():
        if key in _STRUCTURED_KEYS:
            continue
        if isinstance(value, str) and value.strip():
            extras.append(f"{key} {value}")
        else:
            # Null, blank or non-string value: only the key carries text.
            extras.append(str(key))

    if extras:
        text = (text.rstrip() + " " + " ".join(extras)).strip()
    return text


def _normalize_lang_map(lang_map: dict) -> bool:
    """Rewrite structured entries of one ``alt_descriptions`` map in-place.

    Only entries that are mappings containing a ``text`` key are replaced;
    everything else (already-scalar values, maps without ``text``) is left
    untouched.  Returns True if at least one entry was replaced.
    """
    changed = False
    # Iterate over a snapshot of the keys because values are reassigned.
    for lang in list(lang_map.keys()):
        entry = lang_map[lang]
        if isinstance(entry, (CommentedMap, dict)) and "text" in entry:
            lang_map[lang] = _flatten_entry(entry)
            changed = True
    return changed


def normalize_alt_descriptions(obj: Any) -> bool:
    """Normalize alt_descriptions blocks in-place.

    Walks mappings and sequences recursively.  Every mapping stored under the
    key ``alt_descriptions`` has its structured entries converted to plain
    strings; all other values are descended into.  Both ruamel round-trip
    containers and plain ``dict``/``list`` objects are handled.

    Returns True if any change was made.
    """
    changed = False
    if isinstance(obj, (CommentedMap, dict)):
        for key in list(obj.keys()):
            val = obj[key]
            if key == "alt_descriptions" and isinstance(val, (CommentedMap, dict)):
                changed = _normalize_lang_map(val) or changed
            else:
                changed = normalize_alt_descriptions(val) or changed
    elif isinstance(obj, (CommentedSeq, list)):
        for item in obj:
            changed = normalize_alt_descriptions(item) or changed
    return changed


def process_file(yaml_rt: YAML, path: Path) -> Tuple[bool, str]:
    """Normalize a single YAML file, rewriting it only when something changed.

    Args:
        yaml_rt: The YAML instance used for both loading and dumping.
        path: File to process; it is read and written as UTF-8.

    Returns:
        A ``(changed, error)`` pair.  ``error`` is the empty string on
        success; when it is non-empty the file was not rewritten by this call.
    """
    try:
        source = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        return False, f"Read error: {e}"

    try:
        data = yaml_rt.load(source)
    except Exception as e:  # per-file boundary: report and keep the batch going
        return False, f"YAML parse error: {e}"

    if data is None:
        return False, "Empty YAML"

    if not normalize_alt_descriptions(data):
        return False, ""

    # Serialize into memory first: opening the target with "w" truncates it,
    # so a failure during dump would otherwise leave a damaged file behind.
    buffer = io.StringIO()
    try:
        yaml_rt.dump(data, buffer)
    except Exception as e:  # per-file boundary: report and keep the batch going
        return False, f"YAML dump error: {e}"

    try:
        path.write_text(buffer.getvalue(), encoding="utf-8")
    except OSError as e:
        return False, f"Write error: {e}"
    return True, ""


def main() -> int:
    """Command-line entry point.

    Scans every ``*.yaml`` file below ``--root`` and prints a summary.

    Returns:
        0 on success, 1 if any file produced an error, 2 if the root
        directory does not exist.
    """
    parser = argparse.ArgumentParser(description="Normalize LinkML alt_descriptions blocks")
    parser.add_argument(
        "--root",
        default="schemas/20251121/linkml/modules",
        help="Root directory containing LinkML modules",
    )
    args = parser.parse_args()

    root = Path(args.root)
    if not root.exists():
        print(f"Root not found: {root}")
        return 2

    yaml_rt = YAML(typ="rt")
    yaml_rt.preserve_quotes = True
    # Very large width so the emitter does not re-wrap long scalar lines.
    yaml_rt.width = 4096

    files = sorted(root.rglob("*.yaml"))
    changed_files: List[Path] = []
    errors: List[Tuple[Path, str]] = []

    for p in files:
        changed, err = process_file(yaml_rt, p)
        if err:
            errors.append((p, err))
            continue
        if changed:
            changed_files.append(p)

    print(f"Scanned: {len(files)}")
    print(f"Changed: {len(changed_files)}")
    print(f"Errors: {len(errors)}")

    if errors:
        # Cap the listing so a systematic failure does not flood the output.
        for p, e in errors[:30]:
            print(f"- {p}: {e}")
        if len(errors) > 30:
            print(f"... and {len(errors) - 30} more")
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())