169 lines
4.6 KiB
Python
169 lines
4.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
Convert NDE Dutch heritage organizations CSV to YAML format.

This script reads the CSV file and converts it to YAML, preserving all content
and only including fields with actual values in each entry.
"""
|
|
|
|
import csv
|
|
import yaml
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any
|
|
|
|
|
|
def clean_field_name(name: str) -> str:
    """
    Turn a raw CSV header into a lowercase, underscore-separated YAML key.

    Args:
        name: Raw field name from CSV header

    Returns:
        Normalized key; 'unnamed_field' when nothing remains after cleaning
    """
    # Single pass over the characters: line breaks, spaces and slashes become
    # separators, while brackets, commas and double quotes are dropped.
    table = str.maketrans({
        '\r': '_',
        '\n': '_',
        ' ': '_',
        '/': '_',
        '(': None,
        ')': None,
        ',': None,
        '"': None,
    })
    translated = name.strip().translate(table)

    # Splitting on '_' and discarding the empty pieces collapses runs of
    # underscores and trims them from both ends in one step.
    pieces = [piece for piece in translated.split('_') if piece]
    cleaned = '_'.join(pieces).lower()

    # A header that cleans down to nothing still needs a usable key.
    return cleaned or 'unnamed_field'
|
|
|
|
|
|
def parse_value(value: str) -> str | None:
|
|
"""
|
|
Parse and clean a CSV cell value.
|
|
|
|
Args:
|
|
value: Raw cell value
|
|
|
|
Returns:
|
|
Cleaned value or None if empty
|
|
"""
|
|
if not value:
|
|
return None
|
|
|
|
# Strip whitespace
|
|
value = value.strip()
|
|
|
|
# Return None for empty strings
|
|
if not value:
|
|
return None
|
|
|
|
return value
|
|
|
|
|
|
def _unique_field_names(raw_fieldnames: List[str]) -> List[str]:
    """Clean every header and make the resulting YAML keys unique by position."""
    cleaned = [clean_field_name(raw) for raw in raw_fieldnames]

    # Reserve all base names first so that a generated suffix can never take a
    # key that another column cleans to naturally.
    taken = set(cleaned)
    occurrences: Dict[str, int] = {}
    unique: List[str] = []

    for base in cleaned:
        count = occurrences.get(base, 0) + 1
        if count == 1:
            occurrences[base] = count
            unique.append(base)
            continue

        # Second and later occurrences get a numeric suffix: name_2, name_3...
        candidate = f"{base}_{count}"
        while candidate in taken:
            count += 1
            candidate = f"{base}_{count}"
        occurrences[base] = count
        taken.add(candidate)
        unique.append(candidate)

    return unique


def convert_csv_to_yaml(csv_path: Path, yaml_path: Path) -> None:
    """
    Convert CSV file to YAML format.

    Each data row becomes one mapping whose keys are the cleaned column
    headers. Empty cells are omitted, and rows without any content are
    skipped. Columns are matched by position, so headers that are repeated or
    that clean to the same key are kept apart with a numeric suffix
    (``name``, ``name_2``, ...) instead of overwriting one another.

    Args:
        csv_path: Path to input CSV file
        yaml_path: Path to output YAML file

    Raises:
        ValueError: If the CSV file has no header row
    """
    records: List[Dict[str, Any]] = []

    # utf-8-sig drops a leading BOM; newline='' leaves line-break handling to
    # the csv module, which is required for quoted cells that span lines.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.reader(f)

        # The first CSV record is the header (its cells may contain newlines).
        raw_fieldnames = next(reader, None)
        if not raw_fieldnames:
            raise ValueError("No field names found in CSV")

        field_names = _unique_field_names(raw_fieldnames)

        print(f"Processing {csv_path.name}...")
        print(f"Found {len(raw_fieldnames)} columns")

        for row in reader:
            # Cells beyond the header have no key to live under; report them
            # rather than dropping them silently.
            surplus = [cell for cell in row[len(field_names):] if cell.strip()]
            if surplus:
                print(
                    f"Warning: line {reader.line_num} has {len(surplus)} "
                    f"non-empty value(s) beyond the header columns; ignored"
                )

            record: Dict[str, Any] = {}
            # zip() stops at the shorter side, so short rows simply lack the
            # trailing fields.
            for field, cell in zip(field_names, row):
                value = parse_value(cell)
                # Only add fields with actual content
                if value is not None:
                    record[field] = value

            # Only add records that have at least one field
            if record:
                records.append(record)

    print(f"Processed {len(records)} records")

    # Write to YAML
    print(f"Writing to {yaml_path.name}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        # Block style for readability, unicode kept as-is, and keys kept in
        # column order rather than sorted alphabetically.
        yaml.dump(
            records,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120,
        )

    print(f"Successfully converted {len(records)} records to YAML")
    print(f"Output: {yaml_path}")
|
|
|
|
|
|
def main():
|
|
"""Main entry point."""
|
|
# Define paths
|
|
csv_path = Path("/Users/kempersc/apps/glam/data/nde/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")
|
|
yaml_path = csv_path.parent / f"{csv_path.stem}.yaml"
|
|
|
|
# Check if input file exists
|
|
if not csv_path.exists():
|
|
print(f"Error: CSV file not found: {csv_path}")
|
|
return 1
|
|
|
|
try:
|
|
convert_csv_to_yaml(csv_path, yaml_path)
|
|
return 0
|
|
except Exception as e:
|
|
print(f"Error: {e}")
|
|
import traceback
|
|
traceback.print_exc()
|
|
return 1
|
|
|
|
|
|
if __name__ == "__main__":
    # exit() is a convenience helper injected by the site module and may be
    # absent (python -S, frozen builds); raising SystemExit always works and
    # passes main()'s return value on as the process exit status.
    raise SystemExit(main())
|