# glam/scripts/convert_nde_csv_to_yaml.py

#!/usr/bin/env python3
"""
Convert NDE Dutch heritage organizations CSV to YAML format.

This script reads the CSV file and converts it to YAML, preserving all content
and only including fields with actual values in each entry.
"""

import csv
import yaml
from pathlib import Path
from typing import Dict, List, Any


def clean_field_name(name: str) -> str:
    """
    Turn a raw CSV header into a lowercase, underscore-separated YAML key.

    Args:
        name: Raw field name from CSV header

    Returns:
        Normalized key, or 'unnamed_field' when nothing usable remains
    """
    # Line breaks, spaces and slashes act as word separators; parentheses,
    # commas and double quotes are dropped outright.
    substitutions = str.maketrans({
        '\r': '_',
        '\n': '_',
        ' ': '_',
        '/': '_',
        '(': None,
        ')': None,
        ',': None,
        '"': None,
    })
    translated = name.strip().translate(substitutions)

    # Splitting on '_' and discarding the empty pieces collapses runs of
    # underscores and trims leading/trailing ones in a single step.
    pieces = [piece for piece in translated.split('_') if piece]
    cleaned = '_'.join(pieces).lower()

    # Headers that were blank (or only punctuation) get a placeholder key
    return cleaned or 'unnamed_field'


def parse_value(value: str) -> str | None:
    """
    Parse and clean a CSV cell value.

    Args:
        value: Raw cell value

    Returns:
        Cleaned value or None if empty
    """
    if not value:
        return None

    # Strip whitespace
    value = value.strip()

    # Return None for empty strings
    if not value:
        return None

    return value


def _unique_field_names(raw_fieldnames: List[str]) -> List[str]:
    """Clean each header and suffix repeats (_2, _3, ...) so every key is unique."""
    used = set()
    unique_names: List[str] = []
    for raw in raw_fieldnames:
        base = clean_field_name(raw)
        candidate = base
        suffix = 2
        while candidate in used:
            candidate = f"{base}_{suffix}"
            suffix += 1
        used.add(candidate)
        unique_names.append(candidate)
    return unique_names


def convert_csv_to_yaml(csv_path: Path, yaml_path: Path) -> None:
    """
    Convert CSV file to YAML format.

    The first CSV row is taken as the header. Each following row becomes one
    mapping containing only the columns that hold a non-blank value; rows
    without any value are skipped. Columns are matched to headers by
    position, so repeated headers (or distinct headers that clean to the same
    key, such as several blank ones) each keep their own data under a
    numerically suffixed key instead of overwriting one another. Cells beyond
    the last header column are ignored.

    Args:
        csv_path: Path to input CSV file
        yaml_path: Path to output YAML file

    Raises:
        ValueError: If the CSV has no header row
    """
    records: List[Dict[str, Any]] = []

    # utf-8-sig transparently drops a leading BOM; newline='' lets the csv
    # module handle line breaks embedded in quoted cells (multi-line headers).
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        # A plain reader is used instead of DictReader because DictReader
        # keys rows by header text and silently merges duplicate headers.
        reader = csv.reader(f)

        raw_fieldnames = next(reader, None)
        if not raw_fieldnames:
            raise ValueError("No field names found in CSV")

        field_names = _unique_field_names(raw_fieldnames)

        print(f"Processing {csv_path.name}...")
        print(f"Found {len(raw_fieldnames)} columns")

        for row in reader:
            # zip stops at the shorter side: short rows simply lack the
            # trailing fields, surplus cells have no header and are dropped.
            record: Dict[str, Any] = {
                field: value
                for field, cell in zip(field_names, row)
                if (value := parse_value(cell)) is not None
            }

            # Only add records that have at least one field
            if record:
                records.append(record)

        print(f"Processed {len(records)} records")

    # Write to YAML
    print(f"Writing to {yaml_path.name}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        # Block style for readability, unicode kept as-is, and keys left in
        # CSV column order rather than sorted alphabetically.
        yaml.dump(
            records,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120
        )

    print(f"Successfully converted {len(records)} records to YAML")
    print(f"Output: {yaml_path}")


def main():
    """Main entry point."""
    # Define paths
    csv_path = Path("/Users/kempersc/apps/glam/data/nde/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")
    yaml_path = csv_path.parent / f"{csv_path.stem}.yaml"

    # Check if input file exists
    if not csv_path.exists():
        print(f"Error: CSV file not found: {csv_path}")
        return 1

    try:
        convert_csv_to_yaml(csv_path, yaml_path)
        return 0
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    # The builtin exit() is added by the site module for interactive use and
    # is absent under `python -S`; raising SystemExit works everywhere and
    # propagates main()'s return value as the process exit status.
    raise SystemExit(main())