glam/scripts/verify_external_mappings.py

282 lines
9.3 KiB
Python
Executable file

#!/usr/bin/env python3
"""Verify external ontology mappings used in LinkML YAML files.
Default behavior targets changed/untracked YAML files under:
schemas/20251121/linkml/
It validates mapping CURIEs under mapping keys:
exact_mappings, close_mappings, broad_mappings, narrow_mappings, related_mappings
Supported prefixes:
- la (Linked Art)
- rdac (RDA classes)
- rdau (RDA unconstrained properties)
- pav (PAV 2.3)
- ardo (ArDO)
- pca (POSC Caesar RDS)
"""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path
# LinkML slot names whose list values carry the mapping CURIEs this script verifies.
MAPPING_KEYS = {
    "exact_mappings",
    "close_mappings",
    "broad_mappings",
    "narrow_mappings",
    "related_mappings",
}
# CURIE prefixes the script knows how to resolve against a source ontology
# (see module docstring for what each prefix denotes).
SUPPORTED_PREFIXES = {"la", "rdac", "rdau", "pav", "ardo", "pca"}
# Matches a "prefix:local" CURIE: lowercase prefix, local part may contain
# dots, slashes and hyphens (e.g. versioned identifiers).
CURIE_RE = re.compile(r"^(?P<prefix>[a-z][a-z0-9_-]*):(?P<local>[A-Za-z0-9_./-]+)$")
def fetch_text(url: str, timeout: int = 60) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    Undecodable byte sequences are dropped (``errors="ignore"``) so a
    partially malformed ontology document never aborts verification.
    """
    response = urllib.request.urlopen(url, timeout=timeout)
    with response:
        payload = response.read()
    return payload.decode("utf-8", errors="ignore")
def fetch_bytes(url: str, timeout: int = 60) -> bytes:
    """Fetch *url* and return the raw, undecoded response body."""
    with urllib.request.urlopen(url, timeout=timeout) as response:
        body = response.read()
    return body
def parse_mapping_curies(file_path: Path) -> list[tuple[int, str, str]]:
    """Return (line_number, prefix, local) mapping CURIEs from mapping blocks.

    This is a deliberate line-oriented heuristic scan, not a real YAML parse:
    it opens a "block" when it sees a bare ``<mapping_key>:`` line, then
    collects ``- prefix:local`` list items until a dedented non-list line
    closes the block. Inline lists (``exact_mappings: [..]``) are not parsed.
    """
    out: list[tuple[int, str, str]] = []
    lines = file_path.read_text(encoding="utf-8", errors="ignore").splitlines()
    in_block = False        # currently inside a mapping-key list block?
    block_indent = -1       # indent column of the mapping key that opened the block
    for idx, line in enumerate(lines, start=1):
        stripped = line.strip()
        # Indent counts leading spaces only (tabs are not handled -- assumes
        # space-indented YAML, which LinkML files conventionally are).
        indent = len(line) - len(line.lstrip(" "))
        if not in_block:
            if not stripped or stripped.startswith("#"):
                continue
            if ":" in stripped:
                key = stripped.split(":", 1)[0].strip()
                # Only a bare "key:" line (nothing after the colon) opens a block.
                if key in MAPPING_KEYS and stripped.endswith(":"):
                    in_block = True
                    block_indent = indent
            continue
        # Exit mapping block on dedent to same or lower level and non-list content
        if stripped and not stripped.startswith("#"):
            if indent <= block_indent and not stripped.startswith("-"):
                in_block = False
                block_indent = -1
                # re-process this line as potential new key
                if ":" in stripped:
                    key = stripped.split(":", 1)[0].strip()
                    if key in MAPPING_KEYS and stripped.endswith(":"):
                        in_block = True
                        block_indent = indent
                continue
        if stripped.startswith("-"):
            item = stripped[1:].strip()
            # remove inline comment
            if " #" in item:
                item = item.split(" #", 1)[0].strip()
            m = CURIE_RE.match(item)
            if m:
                pfx = m.group("prefix")
                local = m.group("local")
                out.append((idx, pfx, local))
    return out
def changed_yaml_files(repo_root: Path, scope: Path) -> list[Path]:
    """Collect changed and untracked YAML files inside *scope*.

    Asks git for unstaged modifications (``git diff --name-only``) and
    untracked files, keeps only ``.yaml`` paths that resolve inside *scope*,
    and returns them sorted. Any git failure -- not a repository, or the
    ``git`` binary missing entirely -- is treated as "no files reported"
    rather than crashing the script.
    """
    files: set[Path] = set()
    # Resolve once instead of per candidate path in the loop below.
    scope_resolved = scope.resolve()

    def run(cmd: list[str]) -> list[str]:
        # Best-effort subprocess call: OSError covers a missing git binary
        # (FileNotFoundError), CalledProcessError a non-zero exit status.
        try:
            out = subprocess.check_output(cmd, cwd=repo_root)
        except (subprocess.CalledProcessError, OSError):
            return []
        return [x for x in out.decode().splitlines() if x]

    tracked = run(["git", "diff", "--name-only"])
    untracked = run(["git", "ls-files", "--others", "--exclude-standard"])
    for rel in tracked + untracked:
        if not rel.endswith(".yaml"):
            continue
        p = (repo_root / rel).resolve()
        try:
            p.relative_to(scope_resolved)
        except ValueError:
            # Outside the requested scope -- skip.
            continue
        if p.is_file():
            files.add(p)
    return sorted(files)
def load_linked_art_terms() -> tuple[set[str], set[str]]:
    """Download the Linked Art terms RDF and return (property names, class names).

    Only names whose rdf:about URI lives under the linked.art terms base are
    kept; each is reduced to its final path segment (the local name).
    """
    RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    LA_BASE = "https://linked.art/ns/terms/"
    nsmap = {
        "rdf": RDF_NS,
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    }
    about_attr = "{" + RDF_NS + "}about"
    root = ET.fromstring(fetch_bytes(LA_BASE))

    def collect(tag: str) -> set[str]:
        # Local names of nodes matching *tag* whose URI is under the LA base.
        names: set[str] = set()
        for node in root.findall(tag, nsmap):
            uri = node.attrib.get(about_attr, "")
            if uri.startswith(LA_BASE):
                names.add(uri.rsplit("/", 1)[-1])
        return names

    return collect("rdf:Property"), collect("rdfs:Class")
def load_rda_ids(path: str, marker: str) -> set[str]:
    """Download the RDA registry JSON-LD for *path* ("c" or "u") and return
    the set of element IDs captured by the *marker* regex."""
    url = f"https://www.rdaregistry.info/jsonld/Elements/{path}.jsonld"
    body = fetch_text(url)
    return set(re.findall(marker, body))
def main() -> int:
    """CLI entry point.

    Selects target YAML files, extracts supported mapping CURIEs, verifies
    each against its source ontology, and returns a process exit status:
    0 when everything verifies (or there is nothing to check), 1 on any
    load or verification failure.
    """
    parser = argparse.ArgumentParser(description="Verify LinkML external mappings")
    parser.add_argument(
        "files",
        nargs="*",
        help="YAML files to verify (defaults to changed/untracked files under scope)",
    )
    parser.add_argument(
        "--scope",
        default="schemas/20251121/linkml",
        help="Default scope used when no files are provided",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Scan all YAML files under --scope (instead of changed/untracked files)",
    )
    args = parser.parse_args()
    # parents[1] = two directories above this script; assumed to be the root
    # that --scope is relative to -- TODO confirm against the repo layout.
    repo_root = Path(__file__).resolve().parents[1]
    scope = (repo_root / args.scope).resolve()
    # File selection precedence: explicit args > --all scan > git-changed files.
    if args.files:
        files = [Path(f).resolve() for f in args.files]
    elif args.all:
        files = sorted(scope.rglob("*.yaml"))
    else:
        files = changed_yaml_files(repo_root, scope)
    if not files:
        print("No target YAML files found. Nothing to verify.")
        return 0
    # prefix -> list of (file, line_number, local_id) occurrences to verify
    occurrences: dict[str, list[tuple[Path, int, str]]] = {}
    for file_path in files:
        if not file_path.exists() or file_path.suffix != ".yaml":
            continue
        for line_no, pfx, local in parse_mapping_curies(file_path):
            if pfx not in SUPPORTED_PREFIXES:
                continue
            occurrences.setdefault(pfx, []).append((file_path, line_no, local))
    if not occurrences:
        print("No supported external mapping CURIEs found in selected files.")
        return 0
    failures: list[str] = []
    # Reference term sets; left empty when their ontology fails to load so
    # the per-CURIE checks below still run (and report misses).
    la_props: set[str] = set()
    la_classes: set[str] = set()
    rdac_ids: set[str] = set()
    rdau_ids: set[str] = set()
    pav_text = ""
    try:
        la_props, la_classes = load_linked_art_terms()
    except Exception as e:  # pragma: no cover - network failures
        failures.append(f"[load] Linked Art: {e}")
    try:
        rdac_ids = load_rda_ids("c", r"Elements/c/(C\d+)")
    except Exception as e:  # pragma: no cover
        failures.append(f"[load] RDA c.jsonld: {e}")
    try:
        rdau_ids = load_rda_ids("u", r"Elements/u/(P\d+)")
    except Exception as e:  # pragma: no cover
        failures.append(f"[load] RDA u.jsonld: {e}")
    try:
        pav_text = fetch_text("https://purl.org/pav/2.3")
    except Exception as e:  # pragma: no cover
        failures.append(f"[load] PAV 2.3: {e}")
    print("Verifying mapping CURIEs:")
    for prefix in sorted(occurrences):
        locals_unique = sorted({x[2] for x in occurrences[prefix]})
        print(f"- {prefix}: {', '.join(locals_unique)}")
    # prefix-specific verification
    # la / rdac / rdau: membership in the term sets loaded above.
    for file_path, line_no, local in occurrences.get("la", []):
        if local not in la_props and local not in la_classes:
            failures.append(f"{file_path}:{line_no} la:{local} not found in linked.art/ns/terms")
    for file_path, line_no, local in occurrences.get("rdac", []):
        if local not in rdac_ids:
            failures.append(f"{file_path}:{line_no} rdac:{local} not found in RDA Elements/c.jsonld")
    for file_path, line_no, local in occurrences.get("rdau", []):
        if local not in rdau_ids:
            failures.append(f"{file_path}:{line_no} rdau:{local} not found in RDA Elements/u.jsonld")
    # pav: plain substring check against the downloaded ontology text.
    for file_path, line_no, local in occurrences.get("pav", []):
        if local not in pav_text:
            failures.append(f"{file_path}:{line_no} pav:{local} not found in PAV 2.3 ontology")
    # ardo / pca: one HTTP fetch per occurrence, substring check of the body.
    for file_path, line_no, local in occurrences.get("ardo", []):
        url = f"https://w3id.org/ardo/2.0/{local}"
        try:
            txt = fetch_text(url)
            if local not in txt:
                failures.append(f"{file_path}:{line_no} ardo:{local} not found at {url}")
        except urllib.error.URLError as e:
            failures.append(f"{file_path}:{line_no} ardo:{local} fetch error: {e}")
    for file_path, line_no, local in occurrences.get("pca", []):
        url = f"https://rds.posccaesar.org/ontology/plm/rdl/{local}"
        try:
            txt = fetch_text(url)
            if local not in txt:
                failures.append(f"{file_path}:{line_no} pca:{local} not found at {url}")
        except urllib.error.URLError as e:
            failures.append(f"{file_path}:{line_no} pca:{local} fetch error: {e}")
    if failures:
        print("\nFAIL")
        for f in failures:
            print(f"- {f}")
        return 1
    print("\nOK: all checked mapping CURIEs were verified against source ontologies.")
    return 0
if __name__ == "__main__":
    # Propagate main()'s status to the shell (0 = OK, 1 = failures found).
    sys.exit(main())