glam/scripts/normalize_field_names.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

149 lines
4.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Normalize field names from aliases to canonical names for LinkML validation.
This script converts intuitive field names (aliases) used in extracted data
to the canonical field names expected by the LinkML schema.
Field Mappings:
- Collection.description → collection_description
- Collection.extent → item_count
- Collection.subject_areas → subjects
- DigitalPlatform.description → platform_description
- DigitalPlatform.metadata_standards → implemented_standards
- Provenance.notes → provenance_notes
"""
import yaml
import sys
from pathlib import Path
from typing import Any, Dict, List
def normalize_collection(collection: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *collection* with alias keys renamed to canonical ones.

    Renames: description → collection_description, extent → item_count,
    subject_areas → subjects. The input dict is not modified.
    """
    result = dict(collection)
    # Alias → canonical name table for Collection fields.
    renames = (
        ('description', 'collection_description'),
        ('extent', 'item_count'),
        ('subject_areas', 'subjects'),
    )
    for alias, canonical in renames:
        if alias in result:
            result[canonical] = result.pop(alias)
    return result
def normalize_digital_platform(platform: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *platform* with alias keys renamed to canonical ones.

    Renames: description → platform_description,
    metadata_standards → implemented_standards. The input dict is not modified.
    """
    result = dict(platform)
    for alias, canonical in (
        ('description', 'platform_description'),
        ('metadata_standards', 'implemented_standards'),
    ):
        if alias in result:
            result[canonical] = result.pop(alias)
    return result
def normalize_provenance(provenance: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *provenance* with the ``notes`` alias renamed.

    Renames: notes → provenance_notes. The input dict is not modified.
    """
    result = dict(provenance)
    if 'notes' in result:
        result['provenance_notes'] = result.pop('notes')
    return result
def normalize_heritage_custodian(custodian: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *custodian* with nested objects normalized.

    Applies the per-class normalizers to each entry of ``collections`` and
    ``digital_platforms`` and to the ``provenance`` mapping, when present
    and non-empty. The input dict is not modified.
    """
    result = dict(custodian)

    collections = result.get('collections')
    if collections:
        result['collections'] = [normalize_collection(c) for c in collections]

    platforms = result.get('digital_platforms')
    if platforms:
        result['digital_platforms'] = [
            normalize_digital_platform(p) for p in platforms
        ]

    provenance = result.get('provenance')
    if provenance:
        result['provenance'] = normalize_provenance(provenance)

    return result
def normalize_yaml_file(input_path: Path, output_path: Path) -> None:
    """
    Read a YAML file, normalize field names, and write the result.

    Args:
        input_path: Path to input YAML file with alias field names
        output_path: Path to output YAML file with canonical field names

    Raises:
        ValueError: If the input file is empty or contains no YAML data.
    """
    # Read input YAML
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # yaml.safe_load returns None for an empty file; fail with a clear
    # message instead of crashing inside normalize_heritage_custodian.
    if data is None:
        raise ValueError(f"No YAML data found in {input_path}")
    # Input may hold one institution (mapping) or a list of them.
    if isinstance(data, list):
        normalized_data = [normalize_heritage_custodian(inst) for inst in data]
    else:
        normalized_data = normalize_heritage_custodian(data)
    # Write output YAML, keeping insertion order and Unicode intact.
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            normalized_data,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120
        )
    # Bug fix: the two paths were concatenated with no separator
    # ("...Normalized {input_path}{output_path}"), garbling the message.
    print(f"✅ Normalized {input_path} → {output_path}")
def main():
    """Parse command-line arguments and run the normalization."""
    args = sys.argv[1:]
    if not args:
        print("Usage: python normalize_field_names.py <input.yaml> [output.yaml]")
        print("\nNormalizes field names from aliases to canonical schema names.")
        print("If output path not specified, writes to <input>_normalized.yaml")
        sys.exit(1)

    input_path = Path(args[0])
    # Default output sits next to the input with a "_normalized" suffix.
    output_path = (
        Path(args[1])
        if len(args) > 1
        else input_path.parent / f"{input_path.stem}_normalized.yaml"
    )

    if not input_path.exists():
        print(f"❌ Error: Input file not found: {input_path}")
        sys.exit(1)

    normalize_yaml_file(input_path, output_path)
    print(f"\n✅ Validation-ready file created: {output_path}")
    print(f"\nValidate with:")
    print(f" linkml-validate -s schemas/heritage_custodian.yaml -C HeritageCustodian {output_path}")


if __name__ == "__main__":
    main()