glam/scripts/convert_nde_csv_to_yaml.py
2025-11-19 23:25:22 +01:00

169 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""
Convert NDE Dutch heritage organizations CSV to YAML format.
This script reads the CSV file and converts it to YAML, preserving all content
and only including fields with actual values in each entry.
"""
import csv
import yaml
from pathlib import Path
from typing import Dict, List, Any
def clean_field_name(name: str) -> str:
    """
    Turn a raw CSV header cell into a normalized YAML key.

    Args:
        name: Raw field name from CSV header

    Returns:
        Lowercase, underscore-separated key; 'unnamed_field' when nothing
        is left after cleaning
    """
    # Single pass: line breaks, spaces and slashes become underscores,
    # while brackets, commas and double quotes are dropped.
    substitutions = str.maketrans({
        '\r': '_',
        '\n': '_',
        ' ': '_',
        '/': '_',
        '(': None,
        ')': None,
        ',': None,
        '"': None,
    })
    translated = name.strip().translate(substitutions)

    # Discarding the empty pieces both collapses runs of underscores and
    # trims underscores from either end.
    pieces = [piece for piece in translated.split('_') if piece]
    cleaned = '_'.join(pieces).lower()

    # Headers that clean down to nothing get a placeholder key.
    return cleaned or 'unnamed_field'
def parse_value(value: str) -> str | None:
"""
Parse and clean a CSV cell value.
Args:
value: Raw cell value
Returns:
Cleaned value or None if empty
"""
if not value:
return None
# Strip whitespace
value = value.strip()
# Return None for empty strings
if not value:
return None
return value
def _unique_field_names(names: List[str]) -> List[str]:
    """Return names in order, with repeats disambiguated by a numeric suffix (_2, _3, ...)."""
    taken = set()
    unique: List[str] = []
    for name in names:
        candidate = name
        counter = 2
        # Keep counting until the candidate clashes with nothing already used,
        # including a column that was literally named e.g. "foo_2".
        while candidate in taken:
            candidate = f"{name}_{counter}"
            counter += 1
        taken.add(candidate)
        unique.append(candidate)
    return unique


def convert_csv_to_yaml(csv_path: Path, yaml_path: Path) -> None:
    """
    Convert CSV file to YAML format.

    The first CSV row is the header. Each following row becomes one mapping
    holding only the columns that have a non-empty value; rows without any
    value are skipped. Columns are matched to header cells by position, so
    repeated or blank header cells no longer overwrite one another: when
    several headers clean to the same key, later ones get a numeric suffix.

    Args:
        csv_path: Path to input CSV file
        yaml_path: Path to output YAML file

    Raises:
        ValueError: If the CSV has no header row
    """
    records: List[Dict[str, Any]] = []

    # utf-8-sig transparently drops a leading BOM; newline='' lets the csv
    # module handle line breaks embedded in quoted (multi-line) cells.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        # A plain reader is used instead of DictReader because DictReader
        # keys rows by raw header text and silently drops all but the last
        # column when two headers are identical (e.g. several blank ones).
        reader = csv.reader(f)

        raw_fieldnames = next(reader, None)
        if not raw_fieldnames:
            raise ValueError("No field names found in CSV")

        cleaned_names = [clean_field_name(raw) for raw in raw_fieldnames]
        field_names = _unique_field_names(cleaned_names)

        print(f"Processing {csv_path.name}...")
        print(f"Found {len(raw_fieldnames)} columns")

        renamed = sum(
            1 for before, after in zip(cleaned_names, field_names) if before != after
        )
        if renamed:
            print(f"Renamed {renamed} duplicate column name(s) to keep their values")

        for row in reader:
            record: Dict[str, Any] = {}
            # zip stops at the shorter side: cells beyond the header are
            # ignored and missing trailing cells are simply absent.
            for field, cell in zip(field_names, row):
                value = parse_value(cell)
                # Only add fields with actual content
                if value is not None:
                    record[field] = value
            # Only add records that have at least one field
            if record:
                records.append(record)

    print(f"Processed {len(records)} records")

    print(f"Writing to {yaml_path.name}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        # Block style is easier to read, allow_unicode keeps non-ASCII text
        # as-is, and sort_keys=False preserves the CSV column order.
        yaml.dump(
            records,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120,
        )

    print(f"Successfully converted {len(records)} records to YAML")
    print(f"Output: {yaml_path}")
def main():
"""Main entry point."""
# Define paths
csv_path = Path("/Users/kempersc/apps/glam/data/nde/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")
yaml_path = csv_path.parent / f"{csv_path.stem}.yaml"
# Check if input file exists
if not csv_path.exists():
print(f"Error: CSV file not found: {csv_path}")
return 1
try:
convert_csv_to_yaml(csv_path, yaml_path)
return 0
except Exception as e:
print(f"Error: {e}")
import traceback
traceback.print_exc()
return 1
if __name__ == "__main__":
    # Raise SystemExit directly: the exit() helper is injected by the site
    # module for interactive use and is missing under `python -S`.
    raise SystemExit(main())