glam/scripts/normalize_linkml_alt_descriptions.py
kempersc 66adec257e Add scripts for normalizing LinkML schemas and validating schema integrity
- Implement `normalize_linkml_alt_descriptions.py` to convert structured alt_descriptions to the expected scalar form.
- Implement `normalize_linkml_structured_aliases.py` to flatten language-keyed structured_aliases into a standard list-of-objects format.
- Implement `validate_linkml_schema_integrity.py` to validate the integrity of LinkML schema bundles, checking for import resolution, YAML parsing, and reference existence.
2026-02-16 10:16:51 +01:00

155 lines
4.5 KiB
Python

#!/usr/bin/env python3
"""Normalize LinkML alt_descriptions to simple lang->string form.

The LinkML runtime expects:

    alt_descriptions:
      en: "..."
      nl: "..."

However, many modules in this repo use a structured form:

    alt_descriptions:
      nl: {text: "...", language: nl}

This script converts the structured form to the expected scalar form while
preserving YAML formatting as much as possible (ruamel.yaml round-trip).

It also handles malformed inline maps where extra keys were introduced due to
unquoted fragments (heuristic: append keys with null values to the text).

Usage:
    python scripts/normalize_linkml_alt_descriptions.py \
        --root schemas/20251121/linkml/modules
"""
from __future__ import annotations
import argparse
from pathlib import Path
from typing import Any, List, Tuple
from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap, CommentedSeq
def _flatten_alt_description_entry(entry: dict) -> str:
    """Collapse one structured alt_descriptions entry into its plain text.

    ``entry`` is a mapping that contains a ``text`` key (and usually a
    ``language`` key).  A missing/null text becomes the empty string.

    Heuristic for malformed inline maps: when an unquoted fragment made YAML
    create extra keys, those keys are appended to the text.  A key whose value
    is a non-blank string contributes ``"<key> <value>"``; any other value
    (null, blank string, non-string) contributes just the key.
    """
    text = entry.get("text")
    text = "" if text is None else str(text)

    extras = []
    for key, value in entry.items():
        if key in ("text", "language"):
            continue
        if isinstance(value, str) and value.strip():
            extras.append(f"{key} {value}")
        else:
            extras.append(str(key))

    if extras:
        # Only re-trim when something is appended, so well-formed entries keep
        # their text exactly as written.
        text = (text.rstrip() + " " + " ".join(extras)).strip()
    return text


def normalize_alt_descriptions(obj: Any) -> bool:
    """Normalize alt_descriptions blocks in-place.

    Walks ``obj`` recursively through mappings and sequences.  For every
    ``alt_descriptions`` key whose value is a mapping, each language entry of
    the form ``{text: ..., language: ...}`` is replaced by its text as a plain
    string.  Entries that are already scalars (or mappings without a ``text``
    key) are left untouched.

    The checks use ``dict`` / ``list`` so that plain Python containers are
    handled the same way as ruamel.yaml's round-trip ``CommentedMap`` /
    ``CommentedSeq`` (which subclass ``dict`` / ``list``).

    Returns True if any change was made.
    """
    changed = False
    if isinstance(obj, dict):
        # Snapshot the keys: values are reassigned while walking the mapping.
        for key in list(obj.keys()):
            val = obj[key]
            if key == "alt_descriptions" and isinstance(val, dict):
                for lang in list(val.keys()):
                    entry = val[lang]
                    if isinstance(entry, dict) and "text" in entry:
                        val[lang] = _flatten_alt_description_entry(entry)
                        changed = True
            elif normalize_alt_descriptions(val):
                changed = True
    elif isinstance(obj, list):
        for item in obj:
            if normalize_alt_descriptions(item):
                changed = True
    return changed
def process_file(yaml_rt: YAML, path: Path) -> Tuple[bool, str]:
    """Normalize the alt_descriptions blocks of a single YAML file.

    The file is loaded with ``yaml_rt``, passed to
    ``normalize_alt_descriptions`` and, only if that reports a change,
    written back to ``path``.

    Returns a ``(changed, error)`` tuple.  ``error`` is the empty string on
    success; otherwise it describes why the file was skipped (read failure,
    YAML parse failure, empty document, or write failure) and ``changed`` is
    False.
    """
    import io  # local import keeps this function self-contained

    try:
        source = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # Not a YAML problem: report I/O and decoding failures as such.
        return False, f"Read error: {e}"

    try:
        data = yaml_rt.load(source)
    except Exception as e:
        return False, f"YAML parse error: {e}"
    if data is None:
        return False, "Empty YAML"

    changed = normalize_alt_descriptions(data)
    if changed:
        # Serialize into memory first: opening the target with "w" truncates
        # it, so a failure inside dump() must not happen after that point or
        # the schema file would be left empty/partial.
        buffer = io.StringIO()
        try:
            yaml_rt.dump(data, buffer)
            with path.open("w", encoding="utf-8") as f:
                f.write(buffer.getvalue())
        except Exception as e:
            return False, f"Write error: {e}"
    return changed, ""
def main() -> int:
parser = argparse.ArgumentParser(description="Normalize LinkML alt_descriptions blocks")
parser.add_argument(
"--root",
default="schemas/20251121/linkml/modules",
help="Root directory containing LinkML modules",
)
args = parser.parse_args()
root = Path(args.root)
if not root.exists():
print(f"Root not found: {root}")
return 2
yaml_rt = YAML(typ="rt")
yaml_rt.preserve_quotes = True
yaml_rt.width = 4096
files = sorted(root.rglob("*.yaml"))
changed_files: List[Path] = []
errors: List[Tuple[Path, str]] = []
for p in files:
changed, err = process_file(yaml_rt, p)
if err:
errors.append((p, err))
continue
if changed:
changed_files.append(p)
print(f"Scanned: {len(files)}")
print(f"Changed: {len(changed_files)}")
print(f"Errors: {len(errors)}")
if errors:
for p, e in errors[:30]:
print(f"- {p}: {e}")
if len(errors) > 30:
print(f"... and {len(errors) - 30} more")
return 1
return 0
# Script entry point: run main() and hand its return value to the interpreter
# as the process exit status.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)