#!/usr/bin/env python3 """ Convert NDE Dutch heritage organizations CSV to YAML format. This script reads the CSV file and converts it to YAML, preserving all content and only including fields with actual values in each entry. """ import csv import yaml from pathlib import Path from typing import Dict, List, Any def clean_field_name(name: str) -> str: """ Clean and normalize field names for YAML. Args: name: Raw field name from CSV header Returns: Cleaned field name suitable for YAML keys """ # Remove leading/trailing whitespace name = name.strip() # Replace newlines and carriage returns with underscores name = name.replace('\r\n', '_') name = name.replace('\n', '_') name = name.replace('\r', '_') # Replace spaces with underscores name = name.replace(' ', '_') # Replace special characters name = name.replace('/', '_') name = name.replace('(', '') name = name.replace(')', '') name = name.replace(',', '') name = name.replace('"', '') # Remove multiple consecutive underscores while '__' in name: name = name.replace('__', '_') # Remove leading/trailing underscores name = name.strip('_') # Convert to lowercase name = name.lower() # Handle empty field names (use a placeholder) if not name: name = 'unnamed_field' return name def parse_value(value: str) -> str | None: """ Parse and clean a CSV cell value. Args: value: Raw cell value Returns: Cleaned value or None if empty """ if not value: return None # Strip whitespace value = value.strip() # Return None for empty strings if not value: return None return value def convert_csv_to_yaml(csv_path: Path, yaml_path: Path) -> None: """ Convert CSV file to YAML format. Args: csv_path: Path to input CSV file yaml_path: Path to output YAML file """ records: List[Dict[str, Any]] = [] # Read CSV with UTF-8-sig encoding to handle BOM with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f: # The CSV has a multi-line header. First row is the actual header. reader = csv.DictReader(f) # Get field names and clean them raw_fieldnames = reader.fieldnames if not raw_fieldnames: raise ValueError("No field names found in CSV") # Create mapping from raw to clean field names field_mapping = { raw: clean_field_name(raw) for raw in raw_fieldnames } print(f"Processing {csv_path.name}...") print(f"Found {len(raw_fieldnames)} columns") # Process each row for row_num, row in enumerate(reader, start=2): # Start at 2 (header is row 1) record: Dict[str, Any] = {} # Process each field for raw_field, clean_field in field_mapping.items(): value = parse_value(row.get(raw_field, '')) # Only add fields with actual content if value is not None: record[clean_field] = value # Only add records that have at least one field if record: records.append(record) print(f"Processed {len(records)} records") # Write to YAML print(f"Writing to {yaml_path.name}...") with open(yaml_path, 'w', encoding='utf-8') as f: # Use default_flow_style=False for block style (more readable) # Use allow_unicode=True to preserve special characters yaml.dump( records, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=120 ) print(f"Successfully converted {len(records)} records to YAML") print(f"Output: {yaml_path}") def main(): """Main entry point.""" # Define paths csv_path = Path("/Users/kempersc/apps/glam/data/nde/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv") yaml_path = csv_path.parent / f"{csv_path.stem}.yaml" # Check if input file exists if not csv_path.exists(): print(f"Error: CSV file not found: {csv_path}") return 1 try: convert_csv_to_yaml(csv_path, yaml_path) return 0 except Exception as e: print(f"Error: {e}") import traceback traceback.print_exc() return 1 if __name__ == "__main__": exit(main())