#!/usr/bin/env python3
"""Verify external ontology mappings used in LinkML YAML files.

Default behavior targets changed/untracked YAML files under:
  schemas/20251121/linkml/

It validates mapping CURIEs under mapping keys:
  exact_mappings, close_mappings, broad_mappings, narrow_mappings, related_mappings

Supported prefixes:
  - la   (Linked Art)
  - rdac (RDA classes)
  - rdau (RDA unconstrained properties)
  - pav  (PAV 2.3)
  - ardo (ArDO)
  - pca  (POSC Caesar RDS)
"""

from __future__ import annotations

import argparse
import json
import re
import subprocess
import sys
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path

MAPPING_KEYS = {
    "exact_mappings",
    "close_mappings",
    "broad_mappings",
    "narrow_mappings",
    "related_mappings",
}

SUPPORTED_PREFIXES = {"la", "rdac", "rdau", "pav", "ardo", "pca"}

# A CURIE is ``prefix:local``. Both halves are captured as *named* groups
# because parse_mapping_curies() reads them via m.group("prefix") and
# m.group("local").
CURIE_RE = re.compile(r"^(?P<prefix>[a-z][a-z0-9_-]*):(?P<local>[A-Za-z0-9_./-]+)$")


def fetch_text(url: str, timeout: int = 60) -> str:
    """Fetch *url* and return the body decoded as UTF-8.

    Undecodable bytes are dropped (``errors="ignore"``). Network errors
    raised by ``urllib.request.urlopen`` propagate to the caller.
    """
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return resp.read().decode("utf-8", errors="ignore")


def fetch_bytes(url: str, timeout: int = 60) -> bytes:
    """Fetch *url* and return the raw response body.

    Network errors raised by ``urllib.request.urlopen`` propagate to the
    caller.
    """
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return resp.read()


def _opens_mapping_block(stripped: str) -> bool:
    """Return True when *stripped* is a ``<mapping key>:`` line with no inline value."""
    if ":" not in stripped or not stripped.endswith(":"):
        return False
    return stripped.split(":", 1)[0].strip() in MAPPING_KEYS


def parse_mapping_curies(file_path: Path) -> list[tuple[int, str, str]]:
    """Return (line_number, prefix, local) mapping CURIEs from mapping blocks.

    The file is scanned line by line rather than parsed as YAML. A mapping
    block starts at a line consisting of one of ``MAPPING_KEYS`` followed by
    ``:`` and contains the ``- item`` lines that follow it. Only items that
    match ``CURIE_RE`` (after removing an inline ``#`` comment and one layer
    of surrounding quotes) are returned; line numbers are 1-based.
    """
    out: list[tuple[int, str, str]] = []
    lines = file_path.read_text(encoding="utf-8", errors="ignore").splitlines()

    in_block = False
    block_indent = -1

    for idx, line in enumerate(lines, start=1):
        stripped = line.strip()
        indent = len(line) - len(line.lstrip(" "))

        if not in_block:
            if stripped and not stripped.startswith("#") and _opens_mapping_block(stripped):
                in_block = True
                block_indent = indent
            continue

        is_item = stripped.startswith("-")

        # Blank lines and comments never terminate a block.
        if stripped and not stripped.startswith("#"):
            # The block ends on any line indented less than its key, or on a
            # non-list line at the key's own indent. A list item at the key's
            # indent is still part of the block (YAML allows unindented
            # sequences), but one indented *less* belongs to an outer list.
            dedented = indent < block_indent
            sibling_key = indent == block_indent and not is_item
            if dedented or sibling_key:
                in_block = False
                block_indent = -1
                # Re-process this line as a potential new mapping key.
                if _opens_mapping_block(stripped):
                    in_block = True
                    block_indent = indent
                continue

        if is_item:
            item = stripped[1:].strip()
            # Remove inline comment.
            if " #" in item:
                item = item.split(" #", 1)[0].strip()
            # Accept quoted scalars such as - "la:foo" or - 'la:foo'.
            if len(item) >= 2 and item[0] == item[-1] and item[0] in "\"'":
                item = item[1:-1].strip()
            m = CURIE_RE.match(item)
            if m:
                out.append((idx, m.group("prefix"), m.group("local")))

    return out


def changed_yaml_files(repo_root: Path, scope: Path) -> list[Path]:
    """Collect changed and untracked YAML files inside scope.

    "Changed" is whatever ``git diff --name-only`` reports and "untracked"
    is ``git ls-files --others --exclude-standard``, both run with
    *repo_root* as the working directory. Only existing ``.yaml`` files
    located under *scope* are returned, as sorted resolved paths.
    """
    files: set[Path] = set()

    def run(cmd: list[str]) -> list[str]:
        # A failing git command is treated as "no files" rather than an error.
        try:
            out = subprocess.check_output(cmd, cwd=repo_root)
            return [x for x in out.decode().splitlines() if x]
        except subprocess.CalledProcessError:
            return []

    tracked = run(["git", "diff", "--name-only"])
    untracked = run(["git", "ls-files", "--others", "--exclude-standard"])

    scope_resolved = scope.resolve()
    for rel in tracked + untracked:
        if not rel.endswith(".yaml"):
            continue
        p = (repo_root / rel).resolve()
        try:
            p.relative_to(scope_resolved)
        except ValueError:
            # Outside the requested scope.
            continue
        if p.is_file():
            files.add(p)

    return sorted(files)


def load_linked_art_terms() -> tuple[set[str], set[str]]:
    """Download the Linked Art terms document and return (properties, classes).

    Each set holds the final path segment of the ``rdf:about`` URI of the
    ``rdf:Property`` / ``rdfs:Class`` elements that are direct children of
    the document root and live in the ``https://linked.art/ns/terms/``
    namespace. Network and XML parse errors propagate to the caller.
    """
    xml_data = fetch_bytes("https://linked.art/ns/terms/")
    root = ET.fromstring(xml_data)
    ns = {
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    }
    about = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about"
    base = "https://linked.art/ns/terms/"

    props: set[str] = set()
    classes: set[str] = set()

    for p in root.findall("rdf:Property", ns):
        uri = p.attrib.get(about, "")
        if uri.startswith(base):
            props.add(uri.rsplit("/", 1)[-1])

    for c in root.findall("rdfs:Class", ns):
        uri = c.attrib.get(about, "")
        if uri.startswith(base):
            classes.add(uri.rsplit("/", 1)[-1])

    return props, classes


def load_rda_ids(path: str, marker: str) -> set[str]:
    """Return the IDs that regex *marker* finds in an RDA Elements JSON-LD file.

    *path* selects the element set (the file fetched is
    ``Elements/{path}.jsonld`` on rdaregistry.info); *marker* is passed to
    ``re.findall`` over the raw text, so its capture group defines the ID.
    """
    txt = fetch_text(f"https://www.rdaregistry.info/jsonld/Elements/{path}.jsonld")
    return set(re.findall(marker, txt))


def _verify_by_dereference(
    prefix: str,
    base_url: str,
    items: list[tuple[Path, int, str]],
    failures: list[str],
) -> None:
    """Check each CURIE by fetching ``base_url + local`` and append problems to *failures*.

    A term passes when the fetched text contains its local name. Each distinct
    local name is fetched once, however many times it occurs in the files.
    """
    # local name -> problem description, or None when the term was verified.
    outcome: dict[str, str | None] = {}
    for file_path, line_no, local in items:
        if local not in outcome:
            url = f"{base_url}{local}"
            try:
                txt = fetch_text(url)
            except OSError as e:
                # URLError/HTTPError and socket timeouts are all OSError.
                outcome[local] = f"fetch error: {e}"
            else:
                outcome[local] = None if local in txt else f"not found at {url}"
        problem = outcome[local]
        if problem is not None:
            failures.append(f"{file_path}:{line_no} {prefix}:{local} {problem}")


def main() -> int:
    """Run the verifier from the command line.

    Returns 0 when there is nothing to check or every checked CURIE was
    found in its source ontology, and 1 when any load or lookup failed.
    """
    parser = argparse.ArgumentParser(description="Verify LinkML external mappings")
    parser.add_argument(
        "files",
        nargs="*",
        help="YAML files to verify (defaults to changed/untracked files under scope)",
    )
    parser.add_argument(
        "--scope",
        default="schemas/20251121/linkml",
        help="Default scope used when no files are provided",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Scan all YAML files under --scope (instead of changed/untracked files)",
    )
    args = parser.parse_args()

    # The repository root is taken to be the parent of this script's directory.
    repo_root = Path(__file__).resolve().parents[1]
    scope = (repo_root / args.scope).resolve()

    if args.files:
        files = [Path(f).resolve() for f in args.files]
    elif args.all:
        files = sorted(scope.rglob("*.yaml"))
    else:
        files = changed_yaml_files(repo_root, scope)

    if not files:
        print("No target YAML files found. Nothing to verify.")
        return 0

    # prefix -> [(file, line number, local name), ...]
    occurrences: dict[str, list[tuple[Path, int, str]]] = {}
    for file_path in files:
        if not file_path.exists() or file_path.suffix != ".yaml":
            continue
        for line_no, pfx, local in parse_mapping_curies(file_path):
            if pfx not in SUPPORTED_PREFIXES:
                continue
            occurrences.setdefault(pfx, []).append((file_path, line_no, local))

    if not occurrences:
        print("No supported external mapping CURIEs found in selected files.")
        return 0

    failures: list[str] = []

    la_props: set[str] = set()
    la_classes: set[str] = set()
    rdac_ids: set[str] = set()
    rdau_ids: set[str] = set()
    pav_text = ""

    # Only download the vocabularies that are actually referenced, so an
    # outage at an unrelated source cannot fail the run.
    if "la" in occurrences:
        try:
            la_props, la_classes = load_linked_art_terms()
        except Exception as e:  # pragma: no cover - network failures
            failures.append(f"[load] Linked Art: {e}")

    if "rdac" in occurrences:
        try:
            rdac_ids = load_rda_ids("c", r"Elements/c/(C\d+)")
        except Exception as e:  # pragma: no cover
            failures.append(f"[load] RDA c.jsonld: {e}")

    if "rdau" in occurrences:
        try:
            rdau_ids = load_rda_ids("u", r"Elements/u/(P\d+)")
        except Exception as e:  # pragma: no cover
            failures.append(f"[load] RDA u.jsonld: {e}")

    if "pav" in occurrences:
        try:
            pav_text = fetch_text("https://purl.org/pav/2.3")
        except Exception as e:  # pragma: no cover
            failures.append(f"[load] PAV 2.3: {e}")

    print("Verifying mapping CURIEs:")
    for prefix in sorted(occurrences):
        locals_unique = sorted({x[2] for x in occurrences[prefix]})
        print(f"- {prefix}: {', '.join(locals_unique)}")

    # prefix-specific verification
    for file_path, line_no, local in occurrences.get("la", []):
        if local not in la_props and local not in la_classes:
            failures.append(f"{file_path}:{line_no} la:{local} not found in linked.art/ns/terms")

    for file_path, line_no, local in occurrences.get("rdac", []):
        if local not in rdac_ids:
            failures.append(f"{file_path}:{line_no} rdac:{local} not found in RDA Elements/c.jsonld")

    for file_path, line_no, local in occurrences.get("rdau", []):
        if local not in rdau_ids:
            failures.append(f"{file_path}:{line_no} rdau:{local} not found in RDA Elements/u.jsonld")

    for file_path, line_no, local in occurrences.get("pav", []):
        # Substring test against the raw ontology text.
        if local not in pav_text:
            failures.append(f"{file_path}:{line_no} pav:{local} not found in PAV 2.3 ontology")

    _verify_by_dereference(
        "ardo",
        "https://w3id.org/ardo/2.0/",
        occurrences.get("ardo", []),
        failures,
    )
    _verify_by_dereference(
        "pca",
        "https://rds.posccaesar.org/ontology/plm/rdl/",
        occurrences.get("pca", []),
        failures,
    )

    if failures:
        print("\nFAIL")
        for f in failures:
            print(f"- {f}")
        return 1

    print("\nOK: all checked mapping CURIEs were verified against source ontologies.")
    return 0


if __name__ == "__main__":
    sys.exit(main())