- Implement `normalize_linkml_alt_descriptions.py` to convert structured alt_descriptions to the expected scalar form. - Implement `normalize_linkml_structured_aliases.py` to flatten language-keyed structured_aliases into a standard list-of-objects format. - Implement `validate_linkml_schema_integrity.py` to validate the integrity of LinkML schema bundles, checking for import resolution, YAML parsing, and reference existence.
155 lines
4.5 KiB
Python
155 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Normalize LinkML alt_descriptions to simple lang->string form.
|
|
|
|
The LinkML runtime expects:

    alt_descriptions:
      en: "..."
      nl: "..."

However, many modules in this repo use a structured form:

    alt_descriptions:
      nl: {text: "...", language: nl}
|
|
|
|
This script converts the structured form to the expected scalar form while
|
|
preserving YAML formatting as much as possible (ruamel.yaml round-trip).
|
|
|
|
It also handles malformed inline maps where extra keys were introduced due to
unquoted fragments. Heuristic: every key other than ``text``/``language`` is
appended to the text, followed by its value when that value is a non-empty
string.
|
|
|
|
Usage:

    python scripts/normalize_linkml_alt_descriptions.py \
        --root schemas/20251121/linkml/modules
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
from typing import Any, List, Tuple
|
|
|
|
from ruamel.yaml import YAML
|
|
from ruamel.yaml.comments import CommentedMap, CommentedSeq
|
|
|
|
|
|
def _flatten_alt_description(entry: dict) -> str:
    """Collapse one structured alt_description entry into a plain string.

    The ``text`` value is the base (``None`` becomes ``""``). Every key other
    than ``text`` and ``language`` is treated as a fragment of a malformed
    inline map and appended to the text: ``"<key> <value>"`` when the value is
    a non-empty string, just ``"<key>"`` otherwise.
    """
    raw_text = entry.get("text")
    text = "" if raw_text is None else str(raw_text)

    extras: List[str] = []
    for extra_key, extra_val in entry.items():
        if extra_key in ("text", "language"):
            continue
        if isinstance(extra_val, str) and extra_val.strip():
            extras.append(f"{extra_key} {extra_val}")
        else:
            # Null (or otherwise unusable) value: only the key carries text.
            extras.append(str(extra_key))

    if extras:
        text = (text.rstrip() + " " + " ".join(extras)).strip()
    return text


def normalize_alt_descriptions(obj: Any) -> bool:
    """Normalize ``alt_descriptions`` blocks in-place.

    Walks nested mappings and sequences. Wherever a mapping has an
    ``alt_descriptions`` key whose value is itself a mapping, each language
    entry of the form ``{text: ..., language: ...}`` is replaced by its plain
    text. Entries that are already scalars, or mappings without a ``text``
    key, are left untouched.

    Containers are matched with ``dict`` and ``list`` so that plain Python
    structures are handled exactly like ruamel's ``CommentedMap`` and
    ``CommentedSeq`` (which subclass them).

    Args:
        obj: Parsed YAML node; anything that is not a mapping or sequence is
            ignored.

    Returns:
        True if any change was made.
    """
    changed = False

    if isinstance(obj, dict):
        # Snapshot the keys so the loop never iterates a live view of a
        # mapping whose descendants are being rewritten.
        for key in list(obj.keys()):
            val = obj[key]

            if key == "alt_descriptions" and isinstance(val, dict):
                for lang in list(val.keys()):
                    entry = val[lang]
                    if isinstance(entry, dict) and "text" in entry:
                        val[lang] = _flatten_alt_description(entry)
                        changed = True
            elif normalize_alt_descriptions(val):
                changed = True

    elif isinstance(obj, list):
        for item in obj:
            if normalize_alt_descriptions(item):
                changed = True

    return changed
|
|
|
|
|
|
def process_file(yaml_rt: YAML, path: Path) -> Tuple[bool, str]:
    """Normalize the alt_descriptions blocks of one YAML file in place.

    Args:
        yaml_rt: YAML instance used for both loading and dumping.
        path: File to read and, if anything changed, overwrite.

    Returns:
        ``(changed, error)``. ``error`` is an empty string on success. When it
        is non-empty the file has not been written and ``changed`` is False.
    """
    import io  # local import: only this function needs an in-memory stream

    # Keep reading and parsing apart so an unreadable file is not reported
    # as a YAML syntax problem.
    try:
        source = path.read_text(encoding="utf-8")
    except (OSError, UnicodeError) as e:
        return False, f"Read error: {e}"

    try:
        data = yaml_rt.load(source)
    except Exception as e:  # report any loader failure instead of aborting the run
        return False, f"YAML parse error: {e}"

    if data is None:
        return False, "Empty YAML"

    changed = normalize_alt_descriptions(data)
    if changed:
        # Serialize fully before touching the file: opening it for writing
        # first would truncate it, and a failing dump would then lose the
        # original content.
        buffer = io.StringIO()
        yaml_rt.dump(data, buffer)
        path.write_text(buffer.getvalue(), encoding="utf-8")

    return changed, ""
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point.

    Normalizes every ``*.yaml`` file found recursively under ``--root`` and
    prints a summary of scanned, changed, and failed files.

    Returns:
        0 when all files were processed without error, 1 when at least one
        file reported an error, 2 when the root path does not exist.
    """
    arg_parser = argparse.ArgumentParser(description="Normalize LinkML alt_descriptions blocks")
    arg_parser.add_argument(
        "--root",
        default="schemas/20251121/linkml/modules",
        help="Root directory containing LinkML modules",
    )
    options = arg_parser.parse_args()

    root = Path(options.root)
    if not root.exists():
        print(f"Root not found: {root}")
        return 2

    # One round-trip YAML instance is shared by every file.
    yaml_rt = YAML(typ="rt")
    yaml_rt.preserve_quotes = True
    yaml_rt.width = 4096

    files = sorted(root.rglob("*.yaml"))
    changed_files: List[Path] = []
    errors: List[Tuple[Path, str]] = []

    for yaml_path in files:
        was_changed, message = process_file(yaml_rt, yaml_path)
        if message:
            errors.append((yaml_path, message))
        elif was_changed:
            changed_files.append(yaml_path)

    print(f"Scanned: {len(files)}")
    print(f"Changed: {len(changed_files)}")
    print(f"Errors: {len(errors)}")

    if not errors:
        return 0

    # List only the first 30 failures, then summarize the rest.
    for failed_path, message in errors[:30]:
        print(f"- {failed_path}: {message}")
    remaining = len(errors) - 30
    if remaining > 0:
        print(f"... and {remaining} more")
    return 1
|
|
|
|
|
|
# Script entry point: the value returned by main() becomes the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|