glam/scripts/convert_isil_csv_to_yaml.py
2025-11-19 23:25:22 +01:00

239 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""
Convert Dutch ISIL CSV to LinkML-compliant YAML.
Input: data/isil/nl/nan/ISIL-codes_2025-11-06.csv (relative to the parent of the scripts/ directory)
Output: data/isil/nl/nan/ISIL-codes_2025-11-06.yaml (same directory as the input)
The CSV has a malformed structure:
- All fields in one cell separated by ","
- Extra trailing semicolons
- Latin-1 encoding
- Header includes sequence number as first field
This script:
1. Parses the malformed CSV correctly
2. Maps to HeritageCustodian schema with Identifier class
3. Preserves all original fields
4. Validates field preservation
5. Generates LinkML-compliant YAML
"""
import re
import yaml
from pathlib import Path
from datetime import datetime, timezone
def _split_malformed_line(line):
    """Split one raw line of the malformed CSV into cleaned field values.

    Surrounding whitespace and trailing semicolons are dropped, the rest is
    split on the literal three-character separator ``","`` and any leftover
    quotes/whitespace are trimmed from each field.
    """
    # str.rstrip takes a *set* of characters, so a single ';' removes any
    # number of trailing semicolons.
    trimmed = line.strip().rstrip(';')
    return [part.strip('"').strip() for part in trimmed.split('","')]


def parse_malformed_csv(file_path):
    """Parse the malformed ISIL CSV.

    Args:
        file_path: Path of the Latin-1 encoded CSV file to read.

    Returns:
        A tuple ``(records, header_raw)``. ``records`` is a list of dicts
        with the keys ``row_number``, ``plaats``, ``instelling``,
        ``isil_code``, ``toegekend_op`` and ``opmerking``. ``header_raw`` is
        the list of cleaned header fields, including the leading sequence
        number column. Both are empty lists when the file has no lines.

    Notes:
        ``row_number`` is the 1-based position of the line after the header,
        so blank and skipped lines still consume a number. Rows with fewer
        than five fields are skipped without a message.
    """
    with open(file_path, 'r', encoding='latin-1') as f:
        lines = f.readlines()

    # An empty file has no header line to index into.
    if not lines:
        return [], []

    # Expected header: ['1', 'Plaats', 'Instelling', 'ISIL code', 'Toegekend op', 'Opmerking']
    header_raw = _split_malformed_line(lines[0])

    records = []
    for i, line in enumerate(lines[1:], start=1):
        if not line.strip():
            continue
        row = _split_malformed_line(line)
        if len(row) < 5:
            continue
        # Index 0 is the sequence number and is not carried over; the
        # remark column (index 5) is optional.
        records.append({
            'row_number': i,
            'plaats': row[1],
            'instelling': row[2],
            'isil_code': row[3],
            'toegekend_op': row[4],
            'opmerking': row[5] if len(row) > 5 else '',
        })
    return records, header_raw
def map_to_linkml(records):
    """Map ISIL records to LinkML HeritageCustodian schema.

    Args:
        records: Iterable of dicts with the keys ``row_number``, ``plaats``,
            ``instelling``, ``isil_code``, ``toegekend_op`` and ``opmerking``.

    Returns:
        A list with one custodian dict per input record. Each dict holds the
        original values under ``csv_*`` keys, the mapped ``name``,
        ``locations`` and ``identifiers`` entries, an optional
        ``description`` (only when a remark is present) and a ``provenance``
        dict. Key order is the insertion order used below.
    """
    # Take the timestamp once so every record produced by this call carries
    # the same extraction_date.
    extraction_date = datetime.now(timezone.utc).isoformat()

    heritage_custodians = []
    for record in records:
        isil_code = record['isil_code']

        # Create identifier structure
        identifier = {
            'identifier_scheme': 'ISIL',
            'identifier_value': isil_code,
            'identifier_url': f"https://isil.org/{isil_code}",
        }
        # Add assigned date if present
        if record['toegekend_op']:
            identifier['assigned_date'] = record['toegekend_op']

        # Create location
        location = {
            'city': record['plaats'],
            'country': 'NL',
        }

        # Create HeritageCustodian record
        custodian = {
            # Original CSV fields (preserved)
            'csv_row_number': record['row_number'],
            'csv_plaats': record['plaats'],
            'csv_instelling': record['instelling'],
            'csv_isil_code': isil_code,
            'csv_toegekend_op': record['toegekend_op'],
            'csv_opmerking': record['opmerking'],
            # LinkML mapped fields
            'name': record['instelling'],
            'locations': [location],
            'identifiers': [identifier],
        }

        # Add description if there's an opmerking (remark)
        if record['opmerking']:
            custodian['description'] = f"Opmerking: {record['opmerking']}"

        # A fresh provenance dict per record: sharing one dict object across
        # records would make yaml.dump emit anchors/aliases.
        custodian['provenance'] = {
            'data_source': 'ISIL_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'extraction_date': extraction_date,
            'extraction_method': 'CSV to YAML conversion (preserve all fields)',
            'source_url': 'https://www.nationaalarchief.nl/isil',
            'confidence_score': 1.0,
        }
        heritage_custodians.append(custodian)
    return heritage_custodians
def validate_field_preservation(original_records, linkml_records):
    """Validate that all CSV fields were preserved.

    Compares each parsed record with the mapped record at the same position
    and prints a report to stdout. Values are compared after ``str()``
    conversion.

    Args:
        original_records: Parsed CSV records (dicts keyed by the plain CSV
            field names).
        linkml_records: Mapped records (dicts expected to carry the same
            values under ``csv_*`` keys), in the same order.

    Returns:
        True when both inputs have the same number of records, no ``csv_*``
        field is missing and every value matches; False otherwise. Two empty
        inputs validate as True.
    """
    original_records = list(original_records)
    linkml_records = list(linkml_records)

    print("=" * 80)
    print("FIELD PRESERVATION VALIDATION")
    print("=" * 80)
    print()

    # Check field mapping
    csv_fields = ['row_number', 'plaats', 'instelling', 'isil_code', 'toegekend_op', 'opmerking']
    linkml_fields = ['csv_row_number', 'csv_plaats', 'csv_instelling', 'csv_isil_code',
                     'csv_toegekend_op', 'csv_opmerking']
    field_pairs = list(zip(csv_fields, linkml_fields))

    print("CSV Field Mapping:")
    for csv_field, linkml_field in field_pairs:
        print(f" {csv_field:20s}{linkml_field}")
    print()

    # A plain zip() would stop at the shorter input and hide dropped records,
    # so the counts are compared explicitly.
    counts_match = len(original_records) == len(linkml_records)
    if not counts_match:
        print(f"⚠️ WARNING: Record count mismatch: {len(original_records)} CSV records, "
              f"{len(linkml_records)} mapped records")
        print()

    # Verify all records
    missing_fields = []
    value_mismatches = []
    for i, orig in enumerate(original_records):
        # A record with no mapped counterpart counts as missing every field.
        mapped = linkml_records[i] if i < len(linkml_records) else {}
        for csv_field, linkml_field in field_pairs:
            if linkml_field not in mapped:
                missing_fields.append((i, linkml_field))
            elif str(orig[csv_field]) != str(mapped[linkml_field]):
                value_mismatches.append((i, csv_field, orig[csv_field], mapped[linkml_field]))

    if missing_fields:
        print(f"⚠️ WARNING: {len(missing_fields)} missing fields detected:")
        for row_idx, field in missing_fields[:10]:
            print(f" Row {row_idx}: Missing field '{field}'")
        print()
    else:
        print("✅ All CSV fields preserved in LinkML structure")
        print()

    if value_mismatches:
        print(f"⚠️ WARNING: {len(value_mismatches)} value mismatches detected:")
        for row_idx, field, orig_val, mapped_val in value_mismatches[:10]:
            print(f" Row {row_idx}, field '{field}': '{orig_val}' -> '{mapped_val}'")
        print()
    else:
        print("✅ All field values match exactly")
        print()

    # Summary statistics
    total_records = len(original_records)
    total_fields = len(csv_fields) * total_records
    preserved_fields = total_fields - len(missing_fields)
    # With no records there is nothing to lose; avoid dividing by zero.
    preservation_rate = preserved_fields / total_fields * 100 if total_fields else 100.0

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total records: {total_records}")
    print(f"Total fields: {total_fields}")
    print(f"Fields preserved: {preserved_fields}")
    print(f"Value mismatches: {len(value_mismatches)}")
    print(f"Preservation rate: {preservation_rate:.1f}%")
    print()

    return counts_match and not missing_fields and not value_mismatches
def main():
    """Convert ISIL CSV to LinkML YAML.

    Reads the fixed input CSV below the parent of this script's directory,
    maps the records, validates field preservation (asking on stdin whether
    to continue if validation fails), writes the YAML file next to the CSV
    and prints the first record as a sample. Returns None in all cases.
    """
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "data" / "isil" / "nl" / "nan"
    csv_path = data_dir / "ISIL-codes_2025-11-06.csv"
    yaml_path = data_dir / "ISIL-codes_2025-11-06.yaml"

    print("=" * 80)
    print("DUTCH ISIL CSV TO LINKML YAML CONVERTER")
    print("=" * 80)
    print()
    print(f"Input: {csv_path}")
    print(f"Output: {yaml_path}")
    print()

    # Parse CSV
    print("Parsing malformed CSV...")
    records, header = parse_malformed_csv(csv_path)
    print(f"✅ Parsed {len(records)} records")
    print(f" Header: {header}")
    print()

    # Nothing to validate, write or sample; stop before the later steps
    # index into an empty list.
    if not records:
        print("⚠️ WARNING: No records parsed; nothing to convert.")
        return

    # Map to LinkML
    print("Mapping to LinkML HeritageCustodian schema...")
    linkml_records = map_to_linkml(records)
    print(f"✅ Mapped {len(linkml_records)} records")
    print()

    # Validate field preservation
    validation_passed = validate_field_preservation(records, linkml_records)
    if not validation_passed:
        print("⚠️ WARNING: Field preservation validation failed!")
        try:
            response = input("Continue anyway? (y/N): ")
        except EOFError:
            # No interactive stdin (e.g. piped run): fall back to the
            # default answer "N".
            response = ''
        if response.lower() != 'y':
            print("Aborted.")
            return
        print()

    # Write YAML
    print(f"Writing LinkML YAML to {yaml_path.name}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(linkml_records, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)
    print("✅ YAML file created successfully!")
    print()

    # Show sample output
    print("=" * 80)
    print("SAMPLE OUTPUT (first record)")
    print("=" * 80)
    if linkml_records:
        print(yaml.dump([linkml_records[0]], allow_unicode=True, default_flow_style=False,
                        sort_keys=False, width=120))
    print("=" * 80)
    print(f"CONVERSION COMPLETE: {len(linkml_records)} records")
    print("=" * 80)
# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()