glam/scripts/batch_fix_validation_errors.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

209 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Batch fix common validation errors across all YAML instance files.
Fixes:
1. extent → item_count (Collection field alias)
2. metadata_standards → implemented_standards (DigitalPlatform field alias)
3. Temporal coverage format (ISO 8601: YYYY-MM-DD/YYYY-MM-DD)
4. Invalid platform_type enum values (CATALOG → COLLECTION_MANAGEMENT, REPOSITORY → DIGITAL_REPOSITORY)
5. subject_areas → subjects (Collection field alias)
"""
import sys
from pathlib import Path
import re
def load_yaml_content(filepath):
    """Return the raw UTF-8 text of *filepath*, preserving formatting."""
    # Raw text, not a YAML parse: downstream fixers are regex-based and
    # must see the file's original layout.
    return Path(filepath).read_text(encoding='utf-8')
def save_yaml_content(filepath, content):
    """Write *content* to *filepath* as UTF-8 text."""
    Path(filepath).write_text(content, encoding='utf-8')
def fix_alias_fields(content):
    """Rename alias YAML keys to their canonical field names.

    Handles three known aliases:
      - extent             -> item_count            (Collection)
      - metadata_standards -> implemented_standards (DigitalPlatform)
      - subject_areas      -> subjects              (Collection)

    Returns the (possibly modified) content and a list of human-readable
    change descriptions — one entry per alias that occurred at least once.
    """
    changes = []
    # (pattern, replacement, description) per alias. The captured
    # newline-plus-indent group keeps the YAML indentation intact.
    renames = (
        (r'(\n\s+)(extent:)', r'\1item_count:',
         "extent → item_count"),
        (r'(\n\s+)(metadata_standards:)', r'\1implemented_standards:',
         "metadata_standards → implemented_standards"),
        (r'(\n\s+)(subject_areas:)', r'\1subjects:',
         "subject_areas → subjects"),
    )
    for pattern, replacement, label in renames:
        content, hits = re.subn(pattern, replacement, content)
        if hits:
            changes.append(label)
    return content, changes
def fix_temporal_coverage(content):
    """Normalize temporal_coverage values to full ISO 8601 date ranges.

    Bare years are expanded to a full-year interval and zero-padded:
      - 1800/1930 -> 1800-01-01/1930-12-31
      - 800/1500  -> 0800-01-01/1500-12-31
    Values that are already complete dates (YYYY-MM-DD) are kept as-is;
    any other partial format is skipped untouched.

    Returns the (possibly modified) content and a list of change
    descriptions.
    """
    changes = []

    def replace_temporal(match):
        # group(1) is the full "\n<indent>temporal_coverage: " prefix,
        # not just the indent — see the bug-fix note on the return below.
        prefix = match.group(1)
        start = match.group(2)
        end = match.group(3)
        # Start date: bare year -> January 1st, padded to 4 digits.
        if re.match(r'^\d{1,4}$', start):
            start_fixed = f"{start.zfill(4)}-01-01"
        elif re.match(r'^-?\d{1,5}-\d{2}-\d{2}$', start):
            start_fixed = start  # already a full date
        else:
            return match.group(0)  # unrecognized format: leave untouched
        # End date: bare year -> December 31st.
        if re.match(r'^\d{1,4}$', end):
            end_fixed = f"{end.zfill(4)}-12-31"
        elif re.match(r'^-?\d{1,5}-\d{2}-\d{2}$', end):
            end_fixed = end  # already a full date
        else:
            return match.group(0)
        # BUG FIX: the old log message ran old and new values together
        # ("1800/19301800-01-01/..."); add the missing arrow separator.
        changes.append(
            f"temporal_coverage: {start}/{end} → {start_fixed}/{end_fixed}"
        )
        # BUG FIX: the old code returned
        # f"{prefix}temporal_coverage: ..." even though the captured
        # prefix already ends with "temporal_coverage: ", producing a
        # duplicated key ("temporal_coverage: temporal_coverage: ...").
        return f"{prefix}{start_fixed}/{end_fixed}"

    # NOTE(review): \s also matches newlines, so the prefix can span
    # blank lines; harmless here because the key name must follow.
    pattern = r'(\n\s+temporal_coverage: )([^\n]+?)/([^\n]+)'
    content = re.sub(pattern, replace_temporal, content)
    return content, changes
def fix_platform_types(content):
    """Map deprecated platform_type enum values to their current names.

      - CATALOG    -> COLLECTION_MANAGEMENT
      - REPOSITORY -> DIGITAL_REPOSITORY

    Both bare and single-quoted values are handled; quotes are dropped in
    the replacement (the new values are written as plain YAML scalars, as
    the original code did). Returns the (possibly modified) content and a
    list of change descriptions.
    """
    changes = []
    renames = (
        ("CATALOG", "COLLECTION_MANAGEMENT"),
        ("REPOSITORY", "DIGITAL_REPOSITORY"),
    )
    for old, new in renames:
        # BUG FIX: the previous plain str.replace also rewrote longer
        # tokens (e.g. "CATALOGUE" -> "COLLECTION_MANAGEMENTUE"). The
        # (?!\w) lookahead rejects matches followed by a word character.
        pattern = rf"platform_type: '?{old}'?(?!\w)"
        content, hits = re.subn(pattern, f"platform_type: {new}", content)
        if hits:
            changes.append(f"platform_type: {old} → {new}")
    return content, changes
def process_file(filepath):
    """Run every fixer over one YAML file.

    Returns a (modified, changes) pair: whether the file was rewritten,
    and the accumulated change descriptions from all fixers. Errors are
    reported to stdout and treated as "no changes" so the batch run can
    continue with the remaining files.
    """
    try:
        original = load_yaml_content(filepath)
        fixed = original
        all_changes = []
        # Apply each fixer in sequence, threading the content through.
        for fixer in (fix_alias_fields, fix_temporal_coverage,
                      fix_platform_types):
            fixed, changes = fixer(fixed)
            all_changes.extend(changes)
        if fixed == original:
            return False, []
        save_yaml_content(filepath, fixed)
        return True, all_changes
    except Exception as e:
        # Best-effort batch tool: report the failure and move on.
        print(f" ❌ Error processing {filepath}: {e}")
        return False, []
def get_active_yaml_files():
    """Collect the YAML instance files eligible for fixing.

    Scans data/instances under the current working directory and drops
    any path whose string form mentions archives, backups, test outputs,
    or caches. Returns the remaining paths in sorted order.
    """
    instance_dir = Path.cwd() / "data" / "instances"
    # Substring match on the full path, so these exclude both directory
    # names and file names containing the token.
    excluded = ("archive", "backups", "test_outputs", "cache")
    return sorted(
        path
        for path in instance_dir.rglob("*.yaml")
        if not any(token in str(path) for token in excluded)
    )
def main():
    """Fix every active YAML file and print a run summary.

    Returns 0 so the caller can pass the result to sys.exit().
    """
    print("🔧 Batch Fix Validation Errors")
    print("=" * 80)
    print()
    files = get_active_yaml_files()
    total = len(files)
    print(f"📄 Found {total} active YAML files")
    print()
    modified_files = []
    unchanged_files = []
    for index, path in enumerate(files, start=1):
        rel = path.relative_to(Path.cwd())
        print(f"[{index}/{total}] Processing {rel}...", end=" ")
        was_modified, changes = process_file(path)
        if was_modified:
            print(f"✅ Fixed ({len(changes)} changes)")
            modified_files.append((str(rel), changes))
        else:
            print("⏭️ No changes needed")
            unchanged_files.append(str(rel))
    print()
    print("=" * 80)
    print(f"✅ Files modified: {len(modified_files)}")
    print(f"⏭️ Files unchanged: {len(unchanged_files)}")
    print()
    if modified_files:
        print("📝 CHANGES MADE:")
        print("-" * 80)
        for name, changes in modified_files:
            print(f"\n{name}:")
            # Cap the per-file listing at 10 changes to keep output short.
            for change in changes[:10]:
                print(f" - {change}")
            if len(changes) > 10:
                print(f" ... and {len(changes) - 10} more changes")
    print()
    print("✅ Batch fix complete!")
    return 0
# Script entry point: propagate main()'s status code to the shell.
if __name__ == "__main__":
    sys.exit(main())