- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
229 lines · 8.1 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix remaining validation errors in YAML files.
|
|
|
|
Common error patterns:
|
|
1. Invalid metadata standard enum values (Dublin Core → DUBLIN_CORE, DSpace, OAI-PMH, Z39.50, UNIMARC, ISAD(G))
|
|
2. 'notes' field in provenance (should be in HeritageCustodian.description)
|
|
3. 'description' field in collections/digital_platforms (not allowed)
|
|
4. 'enrichment_history' at root level (should be in provenance)
|
|
5. Temporal coverage with "present" or empty strings
|
|
6. Missing required fields in enrichment_history
|
|
"""
|
|
|
|
import yaml
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
|
|
# Metadata standards mapping: keys are values observed in the data; values are
# the canonical enum name to substitute, or None when the entry is not a
# metadata standard at all and must be dropped.
STANDARDS_MAPPING = {
    'Dublin Core': 'DUBLIN_CORE',
    'dublin core': 'DUBLIN_CORE',
    'DUBLIN-CORE': 'DUBLIN_CORE',
    'DSpace': None,   # DSpace is a platform, not a metadata standard - remove
    'OAI-PMH': None,  # OAI-PMH is a protocol, not a metadata standard - remove
    'Z39.50': None,   # Z39.50 is a protocol, not a metadata standard - remove
    'UNIMARC': 'MARC21',  # Map UNIMARC to MARC21 (closest equivalent)
    'ISAD(G)': 'EAD',     # ISAD(G) archival standard maps to EAD
}


def fix_metadata_standards(standards_list):
    """Map or remove invalid metadata standards.

    Args:
        standards_list: List of metadata-standard strings (may be None/empty).

    Returns:
        A deduplicated list with invalid entries mapped or removed, the
        original falsy value unchanged, or None when every entry was removed.
    """
    if not standards_list:
        return standards_list

    fixed = []
    for std in standards_list:
        if std in STANDARDS_MAPPING:
            mapped = STANDARDS_MAPPING[std]
            if mapped:  # A None mapping means "drop this entry entirely"
                fixed.append(mapped)
        else:
            fixed.append(std)  # Keep valid standards as-is

    # Deduplicate while preserving first-seen order.  list(set(...)) returned
    # entries in an arbitrary, nondeterministic order, which produced spurious
    # diffs (and needless rewrites) each time the files were processed.
    return list(dict.fromkeys(fixed)) if fixed else None
|
|
|
|
|
|
def fix_temporal_coverage(coverage_str):
    """Fix temporal coverage patterns like '1983-01-01/present' or empty strings.

    Args:
        coverage_str: Raw temporal-coverage string from the YAML record.

    Returns:
        The cleaned string, or None when the value is empty/whitespace-only
        (meaning the field should be removed).
    """
    if not coverage_str or coverage_str.strip() == '':
        return None  # Remove empty temporal coverage

    # Replace the open-ended "present" marker with the end of the current
    # year so the value validates as a closed date range.  A word-boundary
    # match also fixes a bare "present" value: the original detected
    # 'present' anywhere in the string but only rewrote the '/present' form.
    if 'present' in coverage_str.lower():
        current_year = datetime.now().year
        coverage_str = re.sub(
            r'\bpresent\b',
            f'{current_year}-12-31',
            coverage_str,
            flags=re.IGNORECASE,
        )

    return coverage_str
|
|
|
|
|
|
def fix_enrichment_history(entry):
    """Add required fields to an enrichment_history entry.

    Fills in the mandatory `enrichment_type` / `verified` keys when absent
    and collapses a list-valued `enrichment_source` to a single string.
    The entry dict is mutated in place and also returned.
    """
    # Mandatory fields: default to a manual, unverified enrichment.
    entry.setdefault('enrichment_type', 'MANUAL')
    entry.setdefault('verified', False)

    # enrichment_source must be a string; keep the first element of a
    # list-valued source, or None when the list is empty.
    source = entry.get('enrichment_source')
    if isinstance(source, list):
        entry['enrichment_source'] = source[0] if source else None

    return entry
|
|
|
|
|
|
def _fix_provenance(inst, changes):
    """Relocate misplaced provenance data and repair enrichment_history entries."""
    # Move 'notes' from provenance into the top-level description field.
    if 'provenance' in inst and 'notes' in inst['provenance']:
        notes = inst['provenance'].pop('notes')
        if 'description' not in inst or not inst['description']:
            inst['description'] = notes
        else:
            inst['description'] += f"\n\n{notes}"
        changes.append("Moved provenance.notes to description")

    # enrichment_history belongs inside provenance, not at the root.
    if 'enrichment_history' in inst:
        if 'provenance' not in inst:
            inst['provenance'] = {}
        inst['provenance']['enrichment_history'] = inst.pop('enrichment_history')
        changes.append("Moved enrichment_history to provenance")

    # Backfill required fields on each history entry.  Only report a change
    # when there actually are entries — the original reported (and therefore
    # rewrote the file) even for an empty list.
    history = inst.get('provenance', {}).get('enrichment_history')
    if history:
        for entry in history:
            fix_enrichment_history(entry)
        changes.append("Fixed enrichment_history entries")


def _fix_digital_platforms(inst, changes):
    """Strip disallowed fields and normalize metadata standards on platforms."""
    for platform in inst.get('digital_platforms', []):
        # 'description' and 'notes' are not allowed on digital_platforms.
        if 'description' in platform:
            platform.pop('description')
            changes.append("Removed description from digital_platform")
        if 'notes' in platform:
            platform.pop('notes')
            changes.append("Removed notes from digital_platform")

        # Map/remove invalid metadata-standard values.
        if 'implemented_standards' in platform:
            fixed_standards = fix_metadata_standards(platform['implemented_standards'])
            if fixed_standards != platform['implemented_standards']:
                platform['implemented_standards'] = fixed_standards
                changes.append("Fixed implemented_standards")


def _fix_collections(inst, changes):
    """Strip disallowed fields and repair temporal coverage on collections."""
    for collection in inst.get('collections', []):
        # 'description' is not allowed on collections.
        if 'description' in collection:
            collection.pop('description')
            changes.append("Removed description from collection")

        # Close open-ended / empty temporal coverage values.
        if 'temporal_coverage' in collection:
            fixed = fix_temporal_coverage(collection['temporal_coverage'])
            if fixed != collection['temporal_coverage']:
                collection['temporal_coverage'] = fixed
                changes.append("Fixed temporal_coverage")


def _fix_change_history(inst, changes):
    """Ensure every change_history event carries the required event_date."""
    for event in inst.get('change_history', []):
        if 'event_date' not in event:
            # Real date is unknown here; use an obvious placeholder.
            event['event_date'] = '1900-01-01'
            changes.append("Added missing event_date")


def fix_institution(inst):
    """Fix a single institution record in place.

    Args:
        inst: Institution dict loaded from YAML (mutated in place).

    Returns:
        List of human-readable descriptions of the changes applied.
    """
    changes = []
    _fix_provenance(inst, changes)
    _fix_digital_platforms(inst, changes)
    _fix_collections(inst, changes)
    _fix_change_history(inst, changes)
    return changes
|
|
|
|
|
|
def process_file(file_path):
    """Process a single YAML file.

    Loads the file, applies fix_institution to each record, and writes the
    file back only when something actually changed.

    Args:
        file_path: Path to the YAML file.

    Returns:
        Number of individual fixes applied (0 on error, skip, or empty file).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            return 0

        total_changes = 0

        # Files come in two valid shapes: a list of institution records,
        # or a single institution mapping.
        if isinstance(data, list):
            for inst in data:
                total_changes += len(fix_institution(inst))
        elif isinstance(data, dict):
            if 'institutions' in data:
                # Wrapped structure with metadata is not a HeritageCustodian.
                print(f"⚠️ Skipping {file_path.name} - wrapped structure not a valid HeritageCustodian")
                return 0
            total_changes += len(fix_institution(data))

        if total_changes > 0:
            # Persist the repaired data in place.
            with open(file_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            print(f"✅ Fixed {total_changes} issues in {file_path.relative_to(file_path.parent.parent.parent)}")

        return total_changes

    except Exception as e:
        # Best-effort batch tool: report and keep going with the next file.
        print(f"❌ Error processing {file_path.name}: {e}")
        return 0
|
|
|
|
|
|
def main():
    """Process all YAML files in data/instances."""
    instances_dir = Path(__file__).parent.parent / 'data' / 'instances'

    # Metadata/backup files that must not be treated as institution records.
    skip_files = {
        'DATASET_STATISTICS.yaml',
        'ENRICHMENT_CANDIDATES.yaml',
        'tunisian_institutions_enhanced.yaml',
        'tunisian_institutions_enhanced.backup.yaml',
    }

    total_fixes = 0
    fixed_count = 0

    # Walk the instance tree and repair every non-skipped YAML file.
    for path in instances_dir.rglob('*.yaml'):
        if path.name in skip_files:
            print(f"⏭️ Skipping metadata file: {path.name}")
            continue

        fixes = process_file(path)
        if fixes > 0:
            total_fixes += fixes
            fixed_count += 1

    separator = '=' * 70
    print(f"\n{separator}")
    print(f"✅ Fixed {total_fixes} issues across {fixed_count} files")
    print(f"{separator}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|