#!/usr/bin/env python3
"""
Convert Dutch ISIL CSV to LinkML-compliant YAML.

Input: /data/isil/nl/nan/ISIL-codes_2025-11-06.csv
Output: /data/isil/nl/nan/ISIL-codes_2025-11-06.yaml

The CSV has a malformed structure:
- All fields in one cell separated by ","
- Extra trailing semicolons
- Latin-1 encoding
- Header includes sequence number as first field

This script:
1. Parses the malformed CSV correctly
2. Maps to HeritageCustodian schema with Identifier class
3. Preserves all original fields
4. Validates field preservation
5. Generates LinkML-compliant YAML
"""

import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Tuple

import yaml

# Minimum number of fields (sequence number + 4 data columns) a row must have
# to be accepted; the sixth field ('Opmerking') is optional.
_MIN_FIELDS = 5


def _split_row(line: str) -> List[str]:
    """Split one raw CSV line into its unquoted, whitespace-trimmed fields."""
    # rstrip(';') removes *all* trailing semicolons (the argument is a
    # character set, so this is equivalent to the former rstrip(';;;;')).
    parts = line.strip().rstrip(';').split('","')
    return [part.strip('"').strip() for part in parts]


def parse_malformed_csv(file_path) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Parse the malformed ISIL CSV.

    Args:
        file_path: Path of the Latin-1 encoded CSV file to read.

    Returns:
        A ``(records, header_raw)`` tuple. ``records`` is a list of dicts with
        the keys ``row_number``, ``plaats``, ``instelling``, ``isil_code``,
        ``toegekend_op`` and ``opmerking``; ``header_raw`` is the list of
        header fields exactly as split from the first line (including the
        leading sequence-number field). Both are empty for an empty file.

    Rows with fewer than five fields cannot be mapped and are skipped; a
    single warning line reports how many were skipped.
    """
    with open(file_path, 'r', encoding='latin-1') as f:
        lines = f.readlines()

    if not lines:
        # Empty file: nothing to parse (previously raised IndexError).
        return [], []

    # Expected header: ['1', 'Plaats', 'Instelling', 'ISIL code', 'Toegekend op', 'Opmerking']
    header_raw = _split_row(lines[0])

    records = []
    skipped_rows = []
    # row_number is the 1-based position among the data lines (blank lines
    # still consume a number, matching the position in the file).
    for i, line in enumerate(lines[1:], start=1):
        if not line.strip().rstrip(';'):
            # Blank line, or a line consisting only of trailing semicolons.
            continue

        row = _split_row(line)
        if len(row) < _MIN_FIELDS:
            skipped_rows.append(i)
            continue

        # Index 0 is the sequence number and is intentionally not kept.
        records.append({
            'row_number': i,
            'plaats': row[1],
            'instelling': row[2],
            'isil_code': row[3],
            'toegekend_op': row[4],
            'opmerking': row[5] if len(row) > 5 else '',
        })

    if skipped_rows:
        print(f"⚠️ WARNING: skipped {len(skipped_rows)} row(s) with fewer than "
              f"{_MIN_FIELDS} fields (first rows: {skipped_rows[:10]})")

    return records, header_raw


def map_to_linkml(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Map ISIL records to LinkML HeritageCustodian schema.

    Args:
        records: Record dicts as produced by ``parse_malformed_csv``.

    Returns:
        One custodian dict per input record, in the same order. Each dict
        carries the original CSV values under ``csv_*`` keys plus the mapped
        ``name``, ``locations``, ``identifiers``, optional ``description``
        and ``provenance`` entries.
    """
    # Computed once so every record of a single run shares the same timestamp.
    extraction_date = datetime.now(timezone.utc).isoformat()

    heritage_custodians = []
    for record in records:
        # Create identifier structure
        identifier = {
            'identifier_scheme': 'ISIL',
            'identifier_value': record['isil_code'],
            # ISIL codes don't have a universal URL
            # NOTE(review): the source text was garbled right after this
            # comment (it looked like the tail of an identifier_url f-string
            # ending in record['isil_code']). The URL prefix could not be
            # recovered, so no identifier_url is emitted — confirm.
        }

        # Add assigned date if present
        if record['toegekend_op']:
            identifier['assigned_date'] = record['toegekend_op']

        # Create location
        location = {
            'city': record['plaats'],
            'country': 'NL',
        }

        # Create HeritageCustodian record
        custodian = {
            # Original CSV fields (preserved)
            'csv_row_number': record['row_number'],
            'csv_plaats': record['plaats'],
            'csv_instelling': record['instelling'],
            'csv_isil_code': record['isil_code'],
            'csv_toegekend_op': record['toegekend_op'],
            'csv_opmerking': record['opmerking'],
            # LinkML mapped fields
            'name': record['instelling'],
            'locations': [location],
            'identifiers': [identifier],
        }

        # Add description if there's an opmerking (remark)
        if record['opmerking']:
            custodian['description'] = f"Opmerking: {record['opmerking']}"

        # Add provenance
        custodian['provenance'] = {
            'data_source': 'ISIL_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'extraction_date': extraction_date,
            'extraction_method': 'CSV to YAML conversion (preserve all fields)',
            'source_url': 'https://www.nationaalarchief.nl/isil',
            'confidence_score': 1.0,
        }

        heritage_custodians.append(custodian)

    return heritage_custodians


def validate_field_preservation(original_records, linkml_records) -> bool:
    """Validate that all CSV fields were preserved.

    Compares each original record with the mapped record at the same
    position, checking that every ``csv_*`` key exists and that its value
    equals the original one (compared as strings). Prints a report.

    Args:
        original_records: Record dicts from ``parse_malformed_csv``.
        linkml_records: Custodian dicts from ``map_to_linkml``.

    Returns:
        True when the two lists have the same length and no field is missing
        or different; False otherwise.
    """
    print("=" * 80)
    print("FIELD PRESERVATION VALIDATION")
    print("=" * 80)
    print()

    # Check field mapping
    csv_fields = ['row_number', 'plaats', 'instelling', 'isil_code',
                  'toegekend_op', 'opmerking']
    linkml_fields = ['csv_row_number', 'csv_plaats', 'csv_instelling',
                     'csv_isil_code', 'csv_toegekend_op', 'csv_opmerking']
    field_pairs = list(zip(csv_fields, linkml_fields))

    print("CSV Field Mapping:")
    for csv_field, linkml_field in field_pairs:
        print(f" {csv_field:20s} → {linkml_field}")
    print()

    # zip() below stops at the shorter list, so a dropped or duplicated
    # record would otherwise go unnoticed.
    counts_match = len(original_records) == len(linkml_records)
    if not counts_match:
        print(f"⚠️ WARNING: record count mismatch: {len(original_records)} "
              f"original vs {len(linkml_records)} mapped")
        print()

    # Verify all records
    missing_fields = []
    value_mismatches = []
    for i, (orig, mapped) in enumerate(zip(original_records, linkml_records)):
        for csv_field, linkml_field in field_pairs:
            if linkml_field not in mapped:
                missing_fields.append((i, linkml_field))
            elif str(orig[csv_field]) != str(mapped[linkml_field]):
                value_mismatches.append(
                    (i, csv_field, orig[csv_field], mapped[linkml_field]))

    if missing_fields:
        print(f"⚠️ WARNING: {len(missing_fields)} missing fields detected:")
        for row_idx, field in missing_fields[:10]:
            print(f" Row {row_idx}: Missing field '{field}'")
        print()
    else:
        print("✅ All CSV fields preserved in LinkML structure")
        print()

    if value_mismatches:
        print(f"⚠️ WARNING: {len(value_mismatches)} value mismatches detected:")
        for row_idx, field, orig_val, mapped_val in value_mismatches[:10]:
            print(f" Row {row_idx}, field '{field}': '{orig_val}' → '{mapped_val}'")
        print()
    else:
        print("✅ All field values match exactly")
        print()

    # Summary statistics
    total_records = len(original_records)
    total_fields = len(csv_fields) * total_records
    preserved = total_fields - len(missing_fields)
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total records: {total_records}")
    print(f"Total fields: {total_fields}")
    print(f"Fields preserved: {preserved}")
    print(f"Value mismatches: {len(value_mismatches)}")
    if total_fields:
        print(f"Preservation rate: {(preserved / total_fields * 100):.1f}%")
    else:
        # No records: avoid dividing by zero.
        print("Preservation rate: n/a (no records)")
    print()

    return counts_match and not missing_fields and not value_mismatches


def main() -> None:
    """Convert ISIL CSV to LinkML YAML."""
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "data" / "isil" / "nl" / "nan"
    csv_path = data_dir / "ISIL-codes_2025-11-06.csv"
    yaml_path = data_dir / "ISIL-codes_2025-11-06.yaml"

    print("=" * 80)
    print("DUTCH ISIL CSV TO LINKML YAML CONVERTER")
    print("=" * 80)
    print()
    print(f"Input: {csv_path}")
    print(f"Output: {yaml_path}")
    print()

    # Parse CSV
    print("Parsing malformed CSV...")
    records, header = parse_malformed_csv(csv_path)
    print(f"✅ Parsed {len(records)} records")
    print(f" Header: {header}")
    print()

    # Map to LinkML
    print("Mapping to LinkML HeritageCustodian schema...")
    linkml_records = map_to_linkml(records)
    print(f"✅ Mapped {len(linkml_records)} records")
    print()

    # Validate field preservation
    validation_passed = validate_field_preservation(records, linkml_records)
    if not validation_passed:
        print("⚠️ WARNING: Field preservation validation failed!")
        try:
            response = input("Continue anyway? (y/N): ")
        except EOFError:
            # No interactive stdin (e.g. piped/cron run): take the default "N".
            response = ''
        if response.lower() != 'y':
            print("Aborted.")
            return
        print()

    # Write YAML
    print(f"Writing LinkML YAML to {yaml_path.name}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(linkml_records, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False, width=120)
    print("✅ YAML file created successfully!")
    print()

    # Show sample output (only when there is at least one record to show)
    if linkml_records:
        print("=" * 80)
        print("SAMPLE OUTPUT (first record)")
        print("=" * 80)
        print(yaml.dump([linkml_records[0]], allow_unicode=True,
                        default_flow_style=False, sort_keys=False, width=120))

    print("=" * 80)
    print(f"CONVERSION COMPLETE: {len(linkml_records)} records")
    print("=" * 80)


if __name__ == "__main__":
    main()