glam/scripts/normalize_linkml_structured_aliases.py
kempersc 66adec257e Add scripts for normalizing LinkML schemas and validating schema integrity
- Implement `normalize_linkml_alt_descriptions.py` to convert structured alt_descriptions to the expected scalar form.
- Implement `normalize_linkml_structured_aliases.py` to flatten language-keyed structured_aliases into a standard list-of-objects format.
- Implement `validate_linkml_schema_integrity.py` to validate the integrity of LinkML schema bundles, checking for import resolution, YAML parsing, and reference existence.
2026-02-16 10:16:51 +01:00

158 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""Normalize LinkML structured_aliases to the standard list-of-objects form.

LinkML expects structured_aliases to be a list of Alias-like objects, e.g.:

    structured_aliases:
      - literal_form: foo
        in_language: nl

In this repo many modules use a language-keyed map:

    structured_aliases:
      nl:
        - literal_form: foo
          language: nl

This script flattens language-keyed maps into a single list and normalizes
`language` -> `in_language`.

Usage:
    python scripts/normalize_linkml_structured_aliases.py \
        --root schemas/20251121/linkml/modules
"""
from __future__ import annotations

import argparse
import io
from pathlib import Path
from typing import Any, List, Tuple

from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap, CommentedSeq
def _normalize_alias_item(item: CommentedMap, default_lang: str | None) -> bool:
changed = False
if "language" in item and "in_language" not in item:
item["in_language"] = item["language"]
del item["language"]
changed = True
if "in_language" not in item and default_lang:
item["in_language"] = default_lang
changed = True
return changed
def _flatten_language_keyed_aliases(lang_map: CommentedMap) -> CommentedSeq | None:
    """Flatten a ``{language: aliases}`` map into a single list of aliases.

    Each value may be a list of alias mappings or a single alias mapping;
    empty (``None``) values and entries are skipped. Every alias is passed to
    ``_normalize_alias_item`` with its language key as the default language.

    Returns:
        The flattened sequence, or None when the map holds anything else
        (for example scalar values). In that case nothing has been modified,
        so the caller can leave the original map in place instead of
        discarding the entries that were not understood.
    """
    collected = []
    # First pass only inspects, so a rejected map is left exactly as it was.
    for lang, entries in lang_map.items():
        if entries is None:
            continue
        if isinstance(entries, dict):
            entries = [entries]
        elif not isinstance(entries, list):
            return None
        for entry in entries:
            if entry is None:
                continue
            if not isinstance(entry, dict):
                return None
            collected.append((str(lang), entry))

    flat = CommentedSeq()
    for lang, entry in collected:
        alias = entry if isinstance(entry, CommentedMap) else CommentedMap(entry)
        _normalize_alias_item(alias, lang)
        flat.append(alias)
    return flat


def normalize_structured_aliases(obj: Any) -> bool:
    """Recursively normalize every ``structured_aliases`` value inside ``obj``.

    ``obj`` is modified in place. Under a ``structured_aliases`` key:

    * a mapping is treated as a language-keyed map and replaced by one flat
      list of aliases, unless it contains values that are not alias mappings,
      in which case it is left untouched;
    * a list has each mapping entry normalized in place.

    All other mapping values and all sequence items are searched recursively.

    Args:
        obj: Loaded YAML data (mapping, sequence, or scalar).

    Returns:
        True if anything was modified, False otherwise.
    """
    changed = False
    if isinstance(obj, CommentedMap):
        # Iterate over a snapshot of the keys because values are reassigned.
        for key in list(obj.keys()):
            val = obj[key]
            if key != "structured_aliases":
                if normalize_structured_aliases(val):
                    changed = True
            elif isinstance(val, CommentedMap):
                flat = _flatten_language_keyed_aliases(val)
                if flat is not None:
                    obj[key] = flat
                    changed = True
            elif isinstance(val, list):
                for entry in val:
                    # Normalize the entry itself (not a copy) so that the
                    # change is actually part of the data that gets dumped.
                    if isinstance(entry, dict) and _normalize_alias_item(entry, None):
                        changed = True
    elif isinstance(obj, (CommentedSeq, list)):
        for item in obj:
            if normalize_structured_aliases(item):
                changed = True
    return changed
def process_file(yaml_rt: YAML, path: Path) -> Tuple[bool, str]:
    """Normalize the structured_aliases in one YAML file, rewriting it if needed.

    Args:
        yaml_rt: YAML instance used for both loading and dumping.
        path: File to process.

    Returns:
        A ``(changed, error)`` tuple. ``error`` is an empty string on success
        and a short description otherwise; ``changed`` is True only when the
        file was rewritten.
    """
    try:
        data = yaml_rt.load(path.read_text(encoding="utf-8"))
    except Exception as e:
        return False, f"YAML parse error: {e}"
    if data is None:
        return False, "Empty YAML"
    if not normalize_structured_aliases(data):
        return False, ""

    # Serialize into memory first: opening the file for writing truncates it,
    # so a dump that fails halfway must not happen on the open file.
    buffer = io.StringIO()
    try:
        yaml_rt.dump(data, buffer)
    except Exception as e:
        return False, f"YAML dump error: {e}"
    try:
        path.write_text(buffer.getvalue(), encoding="utf-8")
    except OSError as e:
        return False, f"Write error: {e}"
    return True, ""
def main() -> int:
    """Command-line entry point: process every ``*.yaml`` file under ``--root``.

    Returns:
        The exit status: 2 when the root path does not exist, 1 when at least
        one file reported an error, 0 otherwise.
    """
    cli = argparse.ArgumentParser(description="Normalize LinkML structured_aliases")
    cli.add_argument(
        "--root",
        default="schemas/20251121/linkml/modules",
        help="Root directory containing LinkML modules",
    )
    root = Path(cli.parse_args().root)
    if not root.exists():
        print(f"Root not found: {root}")
        return 2

    # A single YAML instance is configured once and shared by all files.
    yaml_rt = YAML(typ="rt")
    yaml_rt.preserve_quotes = True
    yaml_rt.width = 4096

    candidates = sorted(root.rglob("*.yaml"))
    modified: List[Path] = []
    failures: List[Tuple[Path, str]] = []
    for candidate in candidates:
        was_changed, message = process_file(yaml_rt, candidate)
        if message:
            failures.append((candidate, message))
        elif was_changed:
            modified.append(candidate)

    print(f"Scanned: {len(candidates)}")
    print(f"Changed: {len(modified)}")
    print(f"Errors: {len(failures)}")
    if not failures:
        return 0

    # Cap the detailed listing at 30 entries and summarize the remainder.
    shown = failures[:30]
    for failed_path, message in shown:
        print(f"- {failed_path}: {message}")
    hidden = len(failures) - len(shown)
    if hidden > 0:
        print(f"... and {hidden} more")
    return 1
if __name__ == "__main__":
    # Use the value returned by main() as the process exit status.
    status = main()
    raise SystemExit(status)