#!/usr/bin/env python3
"""Normalize LinkML structured_aliases to the standard list-of-objects form.

LinkML expects structured_aliases to be a list of Alias-like objects, e.g.:

  structured_aliases:
    - literal_form: foo
      in_language: nl

In this repo many modules use a language-keyed map:

  structured_aliases:
    nl:
      - literal_form: foo
        language: nl

This script flattens language-keyed maps into a single list and normalizes
`language` -> `in_language`.

Usage:
  python scripts/normalize_linkml_structured_aliases.py \
    --root schemas/20251121/linkml/modules
"""

from __future__ import annotations

import argparse
from pathlib import Path
from typing import Any, List, MutableMapping, Tuple

from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap, CommentedSeq


def _normalize_alias_item(
    item: MutableMapping[str, Any], default_lang: str | None
) -> bool:
    """Normalize the language key of a single alias mapping, in place.

    Args:
        item: The alias mapping to modify.
        default_lang: Language code to assign when the alias ends up without
            an ``in_language`` key; ``None`` or an empty string disables this.

    Returns:
        True if ``item`` was modified, False otherwise.
    """
    changed = False
    # Rename `language` -> `in_language`. When both keys are already present,
    # the existing `in_language` wins and `language` is left untouched.
    if "language" in item and "in_language" not in item:
        item["in_language"] = item["language"]
        del item["language"]
        changed = True
    if "in_language" not in item and default_lang:
        item["in_language"] = default_lang
        changed = True
    return changed


def _flatten_language_map(lang_map: CommentedMap) -> CommentedSeq:
    """Flatten a language-keyed alias map into a single sequence of aliases.

    Each value of ``lang_map`` may be a list of alias mappings or a single
    ``CommentedMap`` alias. Every alias carried over is normalized with the
    map key (converted to ``str``) as its default language.

    NOTE: values and list entries of any other shape (for example bare
    scalars) are not carried into the flattened sequence.
    """
    flat = CommentedSeq()
    for lang, entries in lang_map.items():
        lang_code = str(lang)
        if isinstance(entries, list):
            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                if not isinstance(entry, CommentedMap):
                    # Wrap plain dicts so the flattened list holds only
                    # CommentedMap items.
                    entry = CommentedMap(entry)
                _normalize_alias_item(entry, lang_code)
                flat.append(entry)
        elif isinstance(entries, CommentedMap):
            _normalize_alias_item(entries, lang_code)
            flat.append(entries)
    return flat


def normalize_structured_aliases(obj: Any) -> bool:
    """Recursively normalize every ``structured_aliases`` value under ``obj``.

    Walks nested mappings and sequences. For each ``structured_aliases`` key:

    * a language-keyed map is replaced by a flat list of alias mappings;
    * an existing list has each alias mapping normalized in place
      (``language`` -> ``in_language``).

    The value of a ``structured_aliases`` key is not searched for further
    nested ``structured_aliases`` keys.

    Args:
        obj: A loaded YAML node (mapping, sequence, or scalar).

    Returns:
        True if anything under ``obj`` was modified, False otherwise.
    """
    changed = False
    if isinstance(obj, CommentedMap):
        # Snapshot the keys: values are reassigned while iterating.
        for key in list(obj):
            val = obj[key]
            if key != "structured_aliases":
                if normalize_structured_aliases(val):
                    changed = True
            elif isinstance(val, CommentedMap):
                # Case 1: language-keyed map -> flat list. Always counts as a
                # change because the container type itself is replaced.
                obj[key] = _flatten_language_map(val)
                changed = True
            elif isinstance(val, list):
                # Case 2: already a list; just normalize keys. Items are
                # mutated in place so that plain-dict items are updated too
                # (normalizing a throwaway copy would lose the edit while
                # still reporting a change).
                for alias in val:
                    if isinstance(alias, dict) and _normalize_alias_item(alias, None):
                        changed = True
    elif isinstance(obj, (CommentedSeq, list)):
        for item in obj:
            if normalize_structured_aliases(item):
                changed = True
    return changed


def process_file(yaml_rt: YAML, path: Path) -> Tuple[bool, str]:
    """Normalize one YAML file, rewriting it in place only when it changed.

    Args:
        yaml_rt: The YAML instance used for both loading and dumping.
        path: File to process; read and written as UTF-8.

    Returns:
        A ``(changed, error)`` tuple. ``error`` is an empty string on success;
        otherwise it describes why the file was skipped (a failure while
        reading/parsing, or an empty document) and ``changed`` is False.
    """
    try:
        data = yaml_rt.load(path.read_text(encoding="utf-8"))
    except Exception as e:
        # Any failure here is reported per file so one bad file does not
        # abort the whole run.
        return False, f"YAML parse error: {e}"

    if data is None:
        return False, "Empty YAML"

    changed = normalize_structured_aliases(data)
    if changed:
        with path.open("w", encoding="utf-8") as f:
            yaml_rt.dump(data, f)
    return changed, ""


def main() -> int:
    """Command-line entry point.

    Scans every ``*.yaml`` file under ``--root`` recursively, normalizes each
    one, and prints a summary.

    Returns:
        Process exit code: 2 if the root path does not exist, 1 if any file
        produced an error, 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="Normalize LinkML structured_aliases")
    parser.add_argument(
        "--root",
        default="schemas/20251121/linkml/modules",
        help="Root directory containing LinkML modules",
    )
    args = parser.parse_args()

    root = Path(args.root)
    if not root.exists():
        print(f"Root not found: {root}")
        return 2

    yaml_rt = YAML(typ="rt")
    yaml_rt.preserve_quotes = True
    yaml_rt.width = 4096

    files = sorted(root.rglob("*.yaml"))
    changed_files: List[Path] = []
    errors: List[Tuple[Path, str]] = []

    for p in files:
        changed, err = process_file(yaml_rt, p)
        if err:
            errors.append((p, err))
            continue
        if changed:
            changed_files.append(p)

    print(f"Scanned: {len(files)}")
    print(f"Changed: {len(changed_files)}")
    print(f"Errors: {len(errors)}")

    if errors:
        # Cap the listing so a badly broken tree does not flood the console.
        for p, e in errors[:30]:
            print(f"- {p}: {e}")
        if len(errors) > 30:
            print(f"... and {len(errors) - 30} more")
        return 1

    return 0


if __name__ == "__main__":
    raise SystemExit(main())