glam/scripts/fix_metadata_standards.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

365 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fix metadata_standards field values in Norwegian institution data files.
Issues to fix:
1. Technical formats (REST API, JSON, XML) → move to integration_method
2. Software names (Primus) → move to vendor or platform_name
3. Non-standard formats → convert to canonical enum values or remove
4. Add missing integration_method fields
Schema-compliant values for MetadataStandardEnum:
- DUBLIN_CORE, MARC21, EAD, BIBFRAME, LIDO, CIDOC_CRM, SCHEMA_ORG,
RIC_O, MODS, PREMIS, SPECTRUM, DACS
"""
import yaml
import sys
from pathlib import Path
from typing import Any, Dict, List, Set
# Valid metadata standard enum values (from schemas/enums.yaml)
VALID_STANDARDS: Set[str] = {
    'DUBLIN_CORE', 'MARC21', 'EAD', 'BIBFRAME', 'LIDO', 'CIDOC_CRM',
    'SCHEMA_ORG', 'RIC_O', 'MODS', 'PREMIS', 'SPECTRUM', 'DACS'
}
# Mapping from text to canonical enum values
# (covers case variants, abbreviations, and spelled-out names)
STANDARD_MAPPINGS: Dict[str, str] = {
    'Dublin Core': 'DUBLIN_CORE',
    'DUBLIN CORE': 'DUBLIN_CORE',
    'dublin core': 'DUBLIN_CORE',
    'DC': 'DUBLIN_CORE',
    'MARC21': 'MARC21',
    'MARC 21': 'MARC21',
    'marc21': 'MARC21',
    'EAD': 'EAD',
    'EAD (Encoded Archival Description)': 'EAD',
    'Encoded Archival Description': 'EAD',
    'BIBFRAME': 'BIBFRAME',
    'LIDO': 'LIDO',
    'CIDOC-CRM': 'CIDOC_CRM',
    'CIDOC CRM': 'CIDOC_CRM',
    'Schema.org': 'SCHEMA_ORG',
    'schema.org': 'SCHEMA_ORG',
    'RiC-O': 'RIC_O',
    'RIC-O': 'RIC_O',
    'Records in Contexts': 'RIC_O',
    'MODS': 'MODS',
    'PREMIS': 'PREMIS',
    'SPECTRUM': 'SPECTRUM',
    'DACS': 'DACS',
}
# Standards that need to be added to the schema enum
# These are valid but not yet in MetadataStandardEnum
# (normalize_standard records matches in UNKNOWN_STANDARDS and still
# returns None, so they are excluded from output until the schema is updated)
PENDING_SCHEMA_ADDITIONS: Dict[str, str] = {
    'Europeana Data Model (EDM)': 'EDM',
    'EDM': 'EDM',
    'IIIF': 'IIIF',
    'International Image Interoperability Framework': 'IIIF',
    'Noark 5': 'NOARK5',
    'NOARK 5': 'NOARK5',
    'Noark-5': 'NOARK5',
}
# Semantic web technologies (not metadata standards per se)
# Should be noted in platform description or removed
SEMANTIC_WEB_TECH: Set[str] = {
    'RDF', 'RDFS', 'OWL', 'SKOS', 'Linked Open Data', 'LOD',
    'Linked Data', 'SPARQL', 'Triple Store'
}
# Technical formats that should go in integration_method
# NOTE: 'SPARQL' appears here AND in SEMANTIC_WEB_TECH; because
# fix_digital_platform checks technical formats first, SPARQL entries
# are moved to integration_method rather than removed.
TECHNICAL_FORMATS: Set[str] = {
    'REST API', 'REST', 'JSON', 'XML', 'OAI-PMH', 'SPARQL', 'GraphQL',
    'SOAP', 'CSV', 'API', 'HTTP'
}
# Software/vendor names
SOFTWARE_NAMES: Set[str] = {
    'Primus', 'CollectiveAccess', 'Adlib', 'TMS', 'Axiell', 'MAIS',
    'Atlantis', 'DSpace', 'EPrints', 'Fedora', 'Islandora'
}
# Standards to investigate (not in enum but might be valid)
# Mutated during processing (normalize_standard / fix_digital_platform)
# and reported in main()'s summary.
UNKNOWN_STANDARDS: Set[str] = set()
def normalize_standard(value: str) -> str | None:
    """
    Map a raw metadata-standard string to its canonical enum value.

    Side effect: values matching PENDING_SCHEMA_ADDITIONS are recorded in
    UNKNOWN_STANDARDS (with a "pending schema update" note) so they appear
    in the final report, but None is returned until the schema gains them.

    Returns:
        Canonical enum value, or None if not a valid standard
    """
    cleaned = value.strip()

    # Already in canonical enum form -- return as-is.
    if cleaned in VALID_STANDARDS:
        return cleaned

    # Known alias / spelling variant of a canonical value.
    mapped = STANDARD_MAPPINGS.get(cleaned)
    if mapped is not None:
        return mapped

    # Recognized standard not yet in the schema enum: flag for the summary
    # report, but treat as invalid for now.
    pending = PENDING_SCHEMA_ADDITIONS.get(cleaned)
    if pending is not None:
        UNKNOWN_STANDARDS.add(f"{cleaned} [→ {pending}, pending schema update]")
        return None

    # Everything else -- including semantic-web technologies, which callers
    # detect and annotate themselves -- is not a metadata standard.
    return None
def is_technical_format(value: str) -> bool:
    """Check if value is a technical format (REST, JSON, etc.).

    Case-insensitive substring match against TECHNICAL_FORMATS, so e.g.
    'rest api v2' and 'JSON export' both count as technical formats.
    """
    haystack = value.upper()
    for fmt in TECHNICAL_FORMATS:
        if fmt in haystack:
            return True
    return False
def is_software_name(value: str) -> bool:
    """Check if value is a software/vendor name.

    Case-SENSITIVE substring match against SOFTWARE_NAMES ('Primus'
    matches 'KulturNav Primus' but not 'primus').
    """
    for software in SOFTWARE_NAMES:
        if software in value:
            return True
    return False
def fix_digital_platform(platform: Dict[str, Any], changes: List[str]) -> Dict[str, Any]:
    """
    Fix metadata_standards in a digital platform.

    Valid standards are normalized to canonical enum values (deduplicated,
    first occurrence wins); technical formats move to integration_method;
    software names move to vendor; semantic-web technologies are removed
    with a note; anything else is flagged for manual review.

    Args:
        platform: Digital platform mapping (not mutated; a shallow copy is returned).
        changes: Human-readable change log, appended to in place.

    Returns:
        Fixed platform dict, with changes logged
    """
    fixed = platform.copy()
    # Handle metadata_standards field
    if 'metadata_standards' in fixed:
        original_standards = fixed['metadata_standards']
        valid_standards = []
        integration_methods = []
        vendors = []
        for std in original_standards:
            if not isinstance(std, str):
                # BUGFIX: non-string entries used to vanish silently when the
                # list was rewritten; now they are at least logged.
                changes.append(f" - ⚠️ Dropped non-string entry {std!r} from metadata_standards")
                continue
            # Check if it's a valid standard
            canonical = normalize_standard(std)
            if canonical:
                # BUGFIX: dedupe -- e.g. 'Dublin Core' and 'DC' both map to
                # DUBLIN_CORE and previously produced a duplicate entry.
                if canonical not in valid_standards:
                    valid_standards.append(canonical)
                continue
            # Check if it's a technical format
            if is_technical_format(std):
                integration_methods.append(std)
                changes.append(f" - Moved '{std}' from metadata_standards to integration_method")
                continue
            # Check if it's software
            if is_software_name(std):
                vendors.append(std)
                changes.append(f" - Moved '{std}' from metadata_standards to vendor")
                continue
            # Check if it's semantic web technology (not a metadata standard)
            if std in SEMANTIC_WEB_TECH or any(tech in std for tech in SEMANTIC_WEB_TECH):
                changes.append(f" - Removed '{std}' (semantic web technology, not a metadata standard)")
                continue
            # Unknown - log it for the end-of-run summary
            UNKNOWN_STANDARDS.add(std)
            changes.append(f" - ⚠️ Unknown standard '{std}' - needs manual review")
        # Update platform with fixed values
        if valid_standards:
            fixed['metadata_standards'] = valid_standards
        else:
            # Remove empty metadata_standards
            del fixed['metadata_standards']
        # Add integration_method if we found technical formats
        if integration_methods:
            existing_method = fixed.get('integration_method', '')
            if existing_method:
                # Merge with existing (set drops duplicates; sorted for determinism)
                all_methods = set(integration_methods + [existing_method])
                fixed['integration_method'] = ', '.join(sorted(all_methods))
            else:
                # dict.fromkeys dedupes while preserving first-seen order
                fixed['integration_method'] = ', '.join(dict.fromkeys(integration_methods))
        # Add vendor if we found software names
        if vendors:
            existing_vendor = fixed.get('vendor', '')
            if existing_vendor:
                all_vendors = set(vendors + [existing_vendor])
                fixed['vendor'] = ', '.join(sorted(all_vendors))
            else:
                fixed['vendor'] = ', '.join(dict.fromkeys(vendors))
    return fixed
def fix_heritage_custodian(custodian: Dict[str, Any]) -> tuple[Dict[str, Any], List[str]]:
    """
    Fix metadata_standards across all digital platforms in a custodian.

    The custodian mapping itself is not mutated; a shallow copy is returned
    with a freshly built digital_platforms list when platforms are present.

    Returns:
        (fixed_custodian, list_of_changes)
    """
    fixed = custodian.copy()
    changes: List[str] = []
    platforms = fixed.get('digital_platforms')
    if platforms:
        repaired = []
        for index, platform in enumerate(platforms, start=1):
            platform_changes: List[str] = []
            repaired.append(fix_digital_platform(platform, platform_changes))
            if platform_changes:
                label = platform.get('platform_name', f'Platform {index}')
                changes.append(f"\n Platform: {label}")
                changes.extend(platform_changes)
        fixed['digital_platforms'] = repaired
    return fixed, changes
def fix_yaml_file(file_path: Path, dry_run: bool = False) -> tuple[int, List[str]]:
    """
    Fix metadata_standards in a YAML file.

    The file may contain either a single institution mapping or a list of
    them; the same shape is written back.

    Args:
        file_path: YAML file to process.
        dry_run: When True, compute and report changes without writing.

    Returns:
        (number_of_institutions_changed, list_of_all_changes)
    """
    # Read file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    if not data:
        return 0, []
    # Process each institution
    all_changes = []
    institutions_changed = 0
    institutions = data if isinstance(data, list) else [data]
    for i, inst in enumerate(institutions):
        # (removed unused `inst_changes` local)
        fixed_inst, changes = fix_heritage_custodian(inst)
        if changes:
            institutions_changed += 1
            inst_name = inst.get('name', f'Institution {i+1}')
            all_changes.append(f"\n📍 {inst_name}:")
            all_changes.extend(changes)
            # Update in place
            if isinstance(data, list):
                data[i] = fixed_inst
            else:
                data = fixed_inst
    # Write back if not dry run
    if not dry_run and institutions_changed > 0:
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(
                data,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=120
            )
    return institutions_changed, all_changes
def main():
    """Main entry point: parse CLI args, fix each YAML file, print a summary.

    Returns:
        Process exit code: 0 on success, 1 when no YAML files were found.
        (Per-file processing errors are printed but do not change the code.)
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Fix metadata_standards field values in Norwegian institution YAML files'
    )
    parser.add_argument(
        'files',
        nargs='+',
        type=Path,
        help='YAML files to fix (or directory containing YAML files)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be changed without modifying files'
    )
    args = parser.parse_args()
    # Collect all YAML files (directories are expanded non-recursively)
    yaml_files = []
    for path in args.files:
        if path.is_dir():
            yaml_files.extend(path.glob('*.yaml'))
            yaml_files.extend(path.glob('*.yml'))
        else:
            yaml_files.append(path)
    # BUGFIX: de-duplicate (order-preserving) so a file passed explicitly
    # AND picked up from its parent directory is not processed twice.
    yaml_files = list(dict.fromkeys(yaml_files))
    if not yaml_files:
        print("❌ No YAML files found")
        return 1
    print(f"{'🔍 DRY RUN - ' if args.dry_run else ''}Processing {len(yaml_files)} files...\n")
    total_institutions_changed = 0
    total_files_changed = 0
    for file_path in sorted(yaml_files):
        print(f"\n{'='*80}")
        print(f"📄 {file_path.name}")
        print('='*80)
        try:
            num_changed, changes = fix_yaml_file(file_path, dry_run=args.dry_run)
            if num_changed > 0:
                total_files_changed += 1
                total_institutions_changed += num_changed
                print(f"\n✏️ {num_changed} institution(s) modified:")
                for change in changes:
                    print(change)
            else:
                print("✅ No changes needed")
        except Exception as e:
            # Keep going: one malformed file should not abort the batch.
            print(f"❌ Error processing {file_path}: {e}")
            import traceback
            traceback.print_exc()
    # Summary
    print(f"\n{'='*80}")
    print("📊 SUMMARY")
    print('='*80)
    print(f"Files processed: {len(yaml_files)}")
    print(f"Files changed: {total_files_changed}")
    print(f"Institutions modified: {total_institutions_changed}")
    if UNKNOWN_STANDARDS:
        # Populated as a side effect of normalize_standard / fix_digital_platform.
        print("\n⚠️ Unknown standards found (need manual review or schema update):")
        for std in sorted(UNKNOWN_STANDARDS):
            print(f" - {std}")
    if args.dry_run:
        print("\n🔍 This was a dry run. Run without --dry-run to apply changes.")
    else:
        print("\n✅ Changes applied successfully!")
    return 0
# Standard script entry point: propagate main()'s return code as the exit status.
if __name__ == "__main__":
    sys.exit(main())