- Implement `normalize_linkml_alt_descriptions.py` to convert structured alt_descriptions to the expected scalar form. - Implement `normalize_linkml_structured_aliases.py` to flatten language-keyed structured_aliases into a standard list-of-objects format. - Implement `validate_linkml_schema_integrity.py` to validate the integrity of LinkML schema bundles, checking for import resolution, YAML parsing, and reference existence.
158 lines
4.9 KiB
Python
158 lines
4.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Normalize LinkML structured_aliases to the standard list-of-objects form.

LinkML expects structured_aliases to be a list of Alias-like objects, e.g.:

    structured_aliases:
      - literal_form: foo
        in_language: nl

In this repo many modules use a language-keyed map:

    structured_aliases:
      nl:
        - literal_form: foo
          language: nl

This script flattens language-keyed maps into a single list and normalizes
`language` -> `in_language`.

Usage:
  python scripts/normalize_linkml_structured_aliases.py \
    --root schemas/20251121/linkml/modules
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
from typing import Any, List, Tuple
|
|
|
|
from ruamel.yaml import YAML
|
|
from ruamel.yaml.comments import CommentedMap, CommentedSeq
|
|
|
|
|
|
def _normalize_alias_item(item: CommentedMap, default_lang: str | None) -> bool:
|
|
changed = False
|
|
if "language" in item and "in_language" not in item:
|
|
item["in_language"] = item["language"]
|
|
del item["language"]
|
|
changed = True
|
|
if "in_language" not in item and default_lang:
|
|
item["in_language"] = default_lang
|
|
changed = True
|
|
return changed
|
|
|
|
|
|
def _flatten_language_map(lang_map: Any) -> CommentedSeq:
    """Flatten a ``{language: aliases}`` mapping into one sequence of alias mappings.

    Each value may be a list of aliases or a single alias. An alias may be a
    mapping (normalized in place) or a bare string, which is wrapped as
    ``{"literal_form": <string>}``. The map key is used as the default language
    for aliases that do not carry one. Empty values (``None``) and values of
    any other type are skipped.
    """
    flat = CommentedSeq()
    for lang, entries in lang_map.items():
        lang_code = str(lang)
        # Treat a lone alias exactly like a one-element list.
        group = entries if isinstance(entries, list) else [entries]
        for entry in group:
            if isinstance(entry, str):
                # Bare string under a language key: keep it as the alias text
                # instead of dropping it.
                wrapped = CommentedMap()
                wrapped["literal_form"] = entry
                entry = wrapped
            elif not isinstance(entry, dict):
                continue
            elif not isinstance(entry, CommentedMap):
                entry = CommentedMap(entry)
            _normalize_alias_item(entry, lang_code)
            flat.append(entry)
    return flat


def normalize_structured_aliases(obj: Any) -> bool:
    """Recursively normalize every ``structured_aliases`` value found under ``obj``.

    The structure is modified in place:

    * a language-keyed mapping is replaced by a flat sequence of alias
      mappings, each tagged with ``in_language``;
    * a single alias written directly as a mapping (it has a ``literal_form``
      key) is wrapped in a one-element sequence;
    * an existing list only has its items' ``language`` key renamed to
      ``in_language``.

    Values stored under ``structured_aliases`` are not searched for further
    nested ``structured_aliases`` keys.

    Args:
        obj: Parsed YAML data (mappings, sequences and scalars).

    Returns:
        ``True`` if anything was modified, ``False`` otherwise.
    """
    changed = False

    if isinstance(obj, (CommentedMap, dict)):
        # Snapshot the keys: values are reassigned while iterating.
        for key in list(obj.keys()):
            val = obj[key]

            if key != "structured_aliases":
                if normalize_structured_aliases(val):
                    changed = True
                continue

            if isinstance(val, (CommentedMap, dict)):
                if "literal_form" in val:
                    # A single alias object, not a language map: wrap it
                    # rather than misreading its fields as language codes.
                    _normalize_alias_item(val, None)
                    single = CommentedSeq()
                    single.append(val)
                    obj[key] = single
                else:
                    obj[key] = _flatten_language_map(val)
                changed = True
            elif isinstance(val, list):
                # Already a list: normalize each mapping in place so the
                # reported change is a change that actually persists.
                for alias in val:
                    if isinstance(alias, dict) and _normalize_alias_item(alias, None):
                        changed = True

    elif isinstance(obj, (CommentedSeq, list)):
        for item in obj:
            if normalize_structured_aliases(item):
                changed = True

    return changed
|
|
|
|
|
|
def process_file(yaml_rt: YAML, path: Path) -> Tuple[bool, str]:
    """Normalize ``structured_aliases`` in one YAML file, rewriting it only if needed.

    Args:
        yaml_rt: Configured ruamel ``YAML`` instance used for both load and dump.
        path: File to process.

    Returns:
        ``(changed, error)``. ``changed`` is ``True`` only when the file was
        rewritten. ``error`` is an empty string on success, otherwise a short
        message describing why the file was skipped (``changed`` is then
        ``False``).
    """
    import io  # local import: only this function needs an in-memory buffer

    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        return False, f"Read error: {e}"

    try:
        data = yaml_rt.load(text)
    except Exception as e:  # per-file boundary: report and let the batch continue
        return False, f"YAML parse error: {e}"

    if data is None:
        return False, "Empty YAML"

    if not normalize_structured_aliases(data):
        return False, ""

    # Serialize fully in memory first. Opening the target in "w" mode before
    # dumping would truncate it, so a failing dump would destroy the file.
    buffer = io.StringIO()
    try:
        yaml_rt.dump(data, buffer)
    except Exception as e:
        return False, f"YAML dump error: {e}"

    try:
        path.write_text(buffer.getvalue(), encoding="utf-8")
    except OSError as e:
        return False, f"Write error: {e}"

    return True, ""
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point.

    Scans ``--root`` recursively for ``*.yaml`` files, normalizes each one and
    prints a summary.

    Returns:
        ``0`` when every file was processed without error, ``1`` when at least
        one file reported an error, ``2`` when the root path does not exist.
    """
    cli = argparse.ArgumentParser(description="Normalize LinkML structured_aliases")
    cli.add_argument(
        "--root",
        default="schemas/20251121/linkml/modules",
        help="Root directory containing LinkML modules",
    )
    root = Path(cli.parse_args().root)

    if not root.exists():
        print(f"Root not found: {root}")
        return 2

    # Round-trip mode, configured to keep quoting and avoid re-wrapping lines.
    yaml_rt = YAML(typ="rt")
    yaml_rt.preserve_quotes = True
    yaml_rt.width = 4096

    candidates = sorted(root.rglob("*.yaml"))
    rewritten: List[Path] = []
    failures: List[Tuple[Path, str]] = []

    for candidate in candidates:
        was_changed, message = process_file(yaml_rt, candidate)
        if message:
            failures.append((candidate, message))
        elif was_changed:
            rewritten.append(candidate)

    print(f"Scanned: {len(candidates)}")
    print(f"Changed: {len(rewritten)}")
    print(f"Errors: {len(failures)}")

    if not failures:
        return 0

    # Cap the detailed listing so a badly broken tree does not flood the output.
    for failed_path, message in failures[:30]:
        print(f"- {failed_path}: {message}")
    hidden = len(failures) - 30
    if hidden > 0:
        print(f"... and {hidden} more")
    return 1
|
|
|
|
|
|
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|