#!/usr/bin/env python3
"""
Validate custodian YAML files against LinkML schema.

This script generates a JSON Schema from the LinkML schema and applies
a fix for the exactly_one_of + default_range conflict that causes
integer values to be rejected when they should be allowed.

Usage:
    python scripts/validate_custodian_schema.py [file_or_dir] [--sample N] [--schema PATH]

Examples:
    # Validate a single file
    python scripts/validate_custodian_schema.py data/custodian/NL-NH-AMS-M-RMA.yaml

    # Validate all files in directory
    python scripts/validate_custodian_schema.py data/custodian/

    # Validate random sample of N files
    python scripts/validate_custodian_schema.py data/custodian/ --sample 50

Exit status is 0 when every validated file is valid and 1 otherwise
(including when the schema, the path, or any YAML file cannot be found).
"""

import json
import yaml
import sys
import os
import random
import warnings
from pathlib import Path
from typing import List, Dict, Any, Optional
from dataclasses import dataclass

# Suppress deprecation warnings from jsonschema.
# NOTE: this must run before the imports below so the filter is already
# installed when those packages are loaded.
warnings.filterwarnings("ignore", category=DeprecationWarning)

from linkml.generators.jsonschemagen import JsonSchemaGenerator
from jsonschema import Draft7Validator

# Schema file path
SCHEMA_FILE = Path(__file__).parent.parent / "schemas/20251121/linkml/custodian_source.yaml"

# Status glyphs, written as escapes so the source file stays ASCII-safe and
# cannot be corrupted by a wrong editor/transport encoding.
_CHECK_MARK = "\u2713"
_CROSS_MARK = "\u2717"


@dataclass
class ValidationResult:
    """Result of validating a single file."""

    file_path: str  # path of the validated file, as a string
    valid: bool  # True when no errors were collected
    errors: List[str]  # human-readable error messages (empty when valid)


def fix_oneof_types(obj: Any) -> int:
    """
    Fix JSON Schema by removing conflicting 'type' when 'oneOf' or 'anyOf' is present.

    LinkML generates both 'oneOf'/'anyOf' and 'type' when using exactly_one_of
    or any_of with a default_range, which causes validation to fail for mixed types.

    The schema is walked recursively through nested dicts and lists and is
    modified in place.

    Returns the number of fixes applied.
    """
    count = 0
    if isinstance(obj, dict):
        # Fix both oneOf and anyOf conflicts with type
        if ('oneOf' in obj or 'anyOf' in obj) and 'type' in obj:
            del obj['type']
            count += 1
        # The deletion above happens before iteration starts, so the dict is
        # not mutated while it is being traversed.
        for value in obj.values():
            count += fix_oneof_types(value)
    elif isinstance(obj, list):
        for item in obj:
            count += fix_oneof_types(item)
    return count


def generate_json_schema(schema_file: Path) -> Dict[str, Any]:
    """
    Generate JSON Schema from LinkML and apply fixes.

    The LinkML schema at ``schema_file`` is serialized to JSON Schema, parsed
    into a dict, and passed through ``fix_oneof_types``. When at least one fix
    was applied, a one-line notice is printed.

    Returns the (fixed) JSON Schema as a dict.
    """
    gen = JsonSchemaGenerator(str(schema_file))
    json_schema = json.loads(gen.serialize())

    # Apply fixes for exactly_one_of + default_range conflict
    fixes = fix_oneof_types(json_schema)
    if fixes > 0:
        print(f"Applied {fixes} JSON Schema fixes for mixed-type fields")

    return json_schema


def validate_file(file_path: Path, validator: Draft7Validator) -> ValidationResult:
    """
    Validate a single YAML file.

    The file is parsed with ``yaml.safe_load`` and every error reported by
    ``validator.iter_errors`` is recorded as ``"<message> at /<path>"``.

    This function does not raise: YAML parse errors and any other exception
    are converted into an invalid ``ValidationResult`` carrying one message.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        errors = []
        for error in validator.iter_errors(data):
            path = '/'.join(str(p) for p in error.path)
            errors.append(f"{error.message} at /{path}")

        return ValidationResult(
            file_path=str(file_path),
            valid=not errors,
            errors=errors,
        )
    except yaml.YAMLError as e:
        return ValidationResult(
            file_path=str(file_path),
            valid=False,
            errors=[f"YAML parse error: {e}"],
        )
    except Exception as e:
        # Deliberately broad: this is the per-file boundary, and one bad file
        # must be reported as invalid rather than abort the whole batch.
        return ValidationResult(
            file_path=str(file_path),
            valid=False,
            errors=[f"Error reading file: {e}"],
        )


def validate_files(files: List[Path], schema_file: Path = SCHEMA_FILE) -> List[ValidationResult]:
    """
    Validate multiple YAML files against the schema.

    The JSON Schema is generated once and reused for every file. A progress
    line is printed for each invalid file and for every 100th file; invalid
    files additionally list their first three errors, each truncated to 100
    characters.

    Returns one ``ValidationResult`` per input file, in input order.
    """
    print(f"Generating JSON Schema from {schema_file}...")
    json_schema = generate_json_schema(schema_file)
    validator = Draft7Validator(json_schema)

    total = len(files)
    results = []
    for i, file_path in enumerate(files, 1):
        result = validate_file(file_path, validator)
        results.append(result)

        status = _CHECK_MARK if result.valid else _CROSS_MARK
        if not result.valid or i % 100 == 0:
            print(f"[{i}/{total}] {status} {file_path.name}")
            if not result.valid:
                for err in result.errors[:3]:
                    print(f" - {err[:100]}")
                if len(result.errors) > 3:
                    print(f" ... and {len(result.errors) - 3} more errors")

    return results


def main():
    """
    Command-line entry point.

    Parses the arguments, collects the files to validate (a single file, or
    the ``*.yaml`` files directly inside a directory, optionally reduced to a
    random sample), validates them, prints a summary, and exits with status
    0 when all files are valid or 1 otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Validate custodian YAML files against LinkML schema')
    parser.add_argument('path', nargs='?', default='data/custodian/',
                        help='File or directory to validate (default: data/custodian/)')
    parser.add_argument('--sample', type=int, default=None,
                        help='Validate random sample of N files')
    parser.add_argument('--schema', type=str, default=str(SCHEMA_FILE),
                        help='Path to LinkML schema file')
    args = parser.parse_args()

    # A negative sample size would make random.sample raise ValueError;
    # reject it up front with a proper usage error instead of a traceback.
    if args.sample is not None and args.sample < 0:
        parser.error('--sample must not be negative')

    path = Path(args.path)
    schema_file = Path(args.schema)

    if not schema_file.exists():
        print(f"Schema file not found: {schema_file}")
        sys.exit(1)

    # Collect files to validate
    if path.is_file():
        files = [path]
    elif path.is_dir():
        # Sorted so that progress output is reproducible; glob order depends
        # on the filesystem.
        files = sorted(path.glob('*.yaml'))
        # --sample 0 (or omitted) means "no sampling"; a sample at least as
        # large as the file list validates everything.
        if args.sample and args.sample < len(files):
            files = random.sample(files, args.sample)
    else:
        print(f"Path not found: {path}")
        sys.exit(1)

    # Without this guard the percentage computation in the summary divides
    # by zero when the directory holds no YAML files.
    if not files:
        print(f"No YAML files found in: {path}")
        sys.exit(1)

    print(f"Validating {len(files)} files...")
    results = validate_files(files, schema_file)

    # Summary
    total = len(results)
    valid_count = sum(1 for r in results if r.valid)
    invalid_count = total - valid_count

    print(f"\n{'='*60}")
    print("VALIDATION SUMMARY")
    print(f"{'='*60}")
    print(f"Total files: {total}")
    print(f"Valid: {valid_count} ({100*valid_count/total:.1f}%)")
    print(f"Invalid: {invalid_count} ({100*invalid_count/total:.1f}%)")

    if invalid_count > 0:
        print("\nInvalid files:")
        for r in results:
            if not r.valid:
                print(f" - {r.file_path}")
        sys.exit(1)
    else:
        print(f"\n{_CHECK_MARK} All files valid!")
        sys.exit(0)


if __name__ == '__main__':
    main()