glam/scripts/validate_all_instances.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

154 lines
4.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Validate all YAML instance files and report errors by type.
This script systematically validates all heritage institution YAML files
and groups errors by type for systematic fixing.
"""
import subprocess
import sys
from pathlib import Path
from collections import defaultdict
import re
def get_active_yaml_files():
    """Collect instance YAML files, skipping archived/backup/test copies.

    Any path whose string form contains 'archive', 'backups', or
    'test_outputs' anywhere is excluded. Returns a sorted list of Paths
    found under <cwd>/data/instances.
    """
    skip_tokens = ("archive", "backups", "test_outputs")
    instance_dir = Path.cwd() / "data" / "instances"
    active = [
        path
        for path in instance_dir.rglob("*.yaml")
        if not any(token in str(path) for token in skip_tokens)
    ]
    return sorted(active)
def validate_file(filepath):
    """Run linkml-validate on a single instance file.

    Returns a list of stripped '[ERROR]' lines parsed from the tool's
    stderr, or None when validation succeeds (or when a non-zero exit
    produced no parseable error lines).
    """
    command = [
        "linkml-validate",
        "-s", "schemas/heritage_custodian.yaml",
        "--target-class", "HeritageCustodian",
        str(filepath),
    ]
    outcome = subprocess.run(
        command,
        capture_output=True,
        text=True,
        cwd=Path.cwd(),
    )
    if outcome.returncode == 0:
        # Clean exit: nothing to report.
        return None
    # Only lines the validator tags with [ERROR] are kept.
    reported = []
    for raw in outcome.stderr.split('\n'):
        candidate = raw.strip()
        if candidate.startswith('[ERROR]'):
            reported.append(candidate)
    return reported or None
def categorize_errors(errors):
    """Bucket error message strings by recognizable fragments.

    Returns a defaultdict mapping category name -> list of the original
    error strings that matched it. Match order mirrors the validator's
    message precedence: pattern mismatches first, then invalid values,
    unexpected fields, missing required slots, and finally 'other'.
    """
    buckets = defaultdict(list)
    for message in errors:
        if "does not match" in message:
            # temporal_coverage patterns get their own bucket for triage.
            if "temporal_coverage" in message:
                key = "temporal_coverage_pattern"
            else:
                key = "pattern_mismatch"
        elif "is not a valid" in message:
            key = "invalid_value"
        elif "Additional properties are not allowed" in message:
            key = "invalid_field"
        elif "required" in message.lower():
            key = "missing_required"
        else:
            key = "other"
        buckets[key].append(message)
    return buckets
def main():
    """Validate every active instance file and print a grouped report.

    Returns 0 when all files validate, 1 when any file has errors
    (intended to be passed to sys.exit).
    """
    print("🔍 GLAM Data Validation Report")
    print("=" * 80)
    print()

    targets = get_active_yaml_files()
    total = len(targets)
    print(f"📄 Found {total} active YAML files to validate")
    print()

    failing = {}   # relative path (str) -> list of error lines
    passing = []   # relative paths (str) that validated cleanly
    for position, target in enumerate(targets, 1):
        rel = target.relative_to(Path.cwd())
        print(f"[{position}/{total}] Validating {rel}...", end=" ")
        problems = validate_file(target)
        if problems:
            print(f"{len(problems)} errors")
            failing[str(rel)] = problems
        else:
            print("")
            passing.append(str(rel))

    print()
    print("=" * 80)
    print(f"✅ Valid files: {len(passing)}")
    print(f"❌ Files with errors: {len(failing)}")
    print()

    if not failing:
        print("🎉 All files pass validation!")
        return 0

    # Flatten all errors, tagging each with its file, and group by category.
    print("📊 ERROR SUMMARY BY TYPE")
    print("=" * 80)
    grouped = defaultdict(list)
    for path, problems in failing.items():
        for bucket, messages in categorize_errors(problems).items():
            for message in messages:
                grouped[bucket].append((path, message))

    # One section per category: count, then at most five examples.
    for bucket, tagged in sorted(grouped.items()):
        print(f"\n{bucket.upper().replace('_', ' ')}: {len(tagged)} errors")
        print("-" * 80)
        for path, message in tagged[:5]:
            print(f" {path}")
            print(f" {message}")
        if len(tagged) > 5:
            print(f" ... and {len(tagged) - 5} more")

    # Final roll-up of every file that needs attention.
    print()
    print("=" * 80)
    print(f"FILES NEEDING FIXES ({len(failing)}):")
    print("-" * 80)
    for path in sorted(failing):
        print(f" {path} ({len(failing[path])} errors)")
    return 1
if __name__ == "__main__":
    # Exit status mirrors main(): 0 on success, 1 when errors were found.
    raise SystemExit(main())