239 lines
8.1 KiB
Python
239 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Convert Dutch ISIL CSV to LinkML-compliant YAML.
|
|
|
|
Input: /data/isil/nl/nan/ISIL-codes_2025-11-06.csv
|
|
Output: /data/isil/nl/nan/ISIL-codes_2025-11-06.yaml
|
|
|
|
The CSV has a malformed structure:
|
|
- All fields in one cell separated by ","
|
|
- Extra trailing semicolons
|
|
- Latin-1 encoding
|
|
- Header includes sequence number as first field
|
|
|
|
This script:
|
|
1. Parses the malformed CSV correctly
|
|
2. Maps to HeritageCustodian schema with Identifier class
|
|
3. Preserves all original fields
|
|
4. Validates field preservation
|
|
5. Generates LinkML-compliant YAML
|
|
"""
|
|
|
|
import re
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
def _split_isil_line(line):
    """Split one raw line of the malformed ISIL CSV into cleaned field values.

    Surrounding whitespace and any run of trailing semicolons are removed, the
    remainder is split on the literal ``","`` separator, and every piece has
    leftover double quotes and whitespace trimmed.
    """
    # str.rstrip() treats its argument as a *set* of characters, so ';' strips
    # a trailing run of semicolons of any length.
    trimmed = line.strip().rstrip(';')
    return [part.strip('"').strip() for part in trimmed.split('","')]


def parse_malformed_csv(file_path):
    """Parse the malformed ISIL CSV.

    The file is read as Latin-1. The first line is treated as the header; each
    following non-blank line is split into fields with the same rules.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(records, header_raw)`` tuple. ``records`` is a list of dicts with
        the keys ``row_number`` (1-based position of the line after the
        header, blank lines included in the count), ``plaats``,
        ``instelling``, ``isil_code``, ``toegekend_op`` and ``opmerking``.
        ``header_raw`` is the list of cleaned header fields. An empty file
        yields ``([], [])``.
    """
    with open(file_path, 'r', encoding='latin-1') as f:
        lines = f.readlines()

    # An empty file has no header line to index; report "nothing parsed"
    # instead of failing with IndexError.
    if not lines:
        return [], []

    # Expected header: ['1', 'Plaats', 'Instelling', 'ISIL code', 'Toegekend op', 'Opmerking']
    # Field 0 is a sequence number; records below use indices 1-5 only.
    header_raw = _split_isil_line(lines[0])

    records = []
    for row_number, raw_line in enumerate(lines[1:], start=1):
        if not raw_line.strip():
            continue

        row = _split_isil_line(raw_line)

        # Lines with fewer than five fields are skipped: they cannot supply
        # the sequence number plus the four mandatory columns.
        if len(row) < 5:
            continue

        records.append({
            'row_number': row_number,
            'plaats': row[1],
            'instelling': row[2],
            'isil_code': row[3],
            'toegekend_op': row[4],
            # The remark column is the only one that may be absent.
            'opmerking': row[5] if len(row) > 5 else '',
        })

    return records, header_raw
|
|
|
|
def map_to_linkml(records):
    """Map ISIL records to LinkML HeritageCustodian schema.

    Args:
        records: Iterable of dicts with the keys ``row_number``, ``plaats``,
            ``instelling``, ``isil_code``, ``toegekend_op`` and ``opmerking``.

    Returns:
        A list with one dict per input record, in input order. Each dict holds
        the original values under ``csv_*`` keys, followed by ``name``,
        ``locations``, ``identifiers``, an optional ``description`` (only when
        ``opmerking`` is non-empty) and ``provenance``. Key insertion order
        is kept stable because the caller dumps with ``sort_keys=False``.
    """
    # Take the timestamp once so every record produced by this run carries
    # the same extraction_date instead of values drifting by microseconds.
    extraction_date = datetime.now(timezone.utc).isoformat()

    heritage_custodians = []

    for record in records:
        isil_code = record['isil_code']

        # Create identifier structure
        identifier = {
            'identifier_scheme': 'ISIL',
            'identifier_value': isil_code,
            'identifier_url': f"https://isil.org/{isil_code}",
        }

        # Add assigned date if present
        if record['toegekend_op']:
            identifier['assigned_date'] = record['toegekend_op']

        # Create location
        location = {
            'city': record['plaats'],
            'country': 'NL',
        }

        # Create HeritageCustodian record
        custodian = {
            # Original CSV fields (preserved)
            'csv_row_number': record['row_number'],
            'csv_plaats': record['plaats'],
            'csv_instelling': record['instelling'],
            'csv_isil_code': isil_code,
            'csv_toegekend_op': record['toegekend_op'],
            'csv_opmerking': record['opmerking'],

            # LinkML mapped fields
            'name': record['instelling'],
            'locations': [location],
            'identifiers': [identifier],
        }

        # Add description if there's an opmerking (remark)
        if record['opmerking']:
            custodian['description'] = f"Opmerking: {record['opmerking']}"

        # Add provenance. A fresh dict is built per record on purpose: sharing
        # one dict object across records would make yaml.dump emit
        # anchors/aliases instead of plain repeated mappings.
        custodian['provenance'] = {
            'data_source': 'ISIL_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'extraction_date': extraction_date,
            'extraction_method': 'CSV to YAML conversion (preserve all fields)',
            'source_url': 'https://www.nationaalarchief.nl/isil',
            'confidence_score': 1.0,
        }

        heritage_custodians.append(custodian)

    return heritage_custodians
|
|
|
|
def validate_field_preservation(original_records, linkml_records):
    """Validate that all CSV fields were preserved.

    Compares each parsed record with the mapped record at the same position
    and prints a report to stdout: the field mapping, up to ten missing
    fields, up to ten value mismatches, and a summary.

    Args:
        original_records: List of parsed record dicts keyed by ``row_number``,
            ``plaats``, ``instelling``, ``isil_code``, ``toegekend_op`` and
            ``opmerking``.
        linkml_records: List of mapped record dicts expected to carry the same
            values under the corresponding ``csv_*`` keys.

    Returns:
        True when both lists have the same length, no ``csv_*`` key is missing
        and every value matches after ``str()`` conversion; False otherwise.
    """
    print("=" * 80)
    print("FIELD PRESERVATION VALIDATION")
    print("=" * 80)
    print()

    # Check field mapping
    csv_fields = ['row_number', 'plaats', 'instelling', 'isil_code', 'toegekend_op', 'opmerking']
    linkml_fields = ['csv_row_number', 'csv_plaats', 'csv_instelling', 'csv_isil_code',
                     'csv_toegekend_op', 'csv_opmerking']
    field_pairs = list(zip(csv_fields, linkml_fields))

    print("CSV Field Mapping:")
    for csv_field, linkml_field in field_pairs:
        print(f" {csv_field:20s} → {linkml_field}")
    print()

    # zip() below stops at the shorter list, so records lost (or invented)
    # during mapping would otherwise go unnoticed. Check the counts explicitly.
    counts_match = len(original_records) == len(linkml_records)
    if not counts_match:
        print(f"⚠️ WARNING: record count mismatch: {len(original_records)} parsed, "
              f"{len(linkml_records)} mapped")
        print()

    # Verify all records
    missing_fields = []
    value_mismatches = []

    for i, (orig, mapped) in enumerate(zip(original_records, linkml_records)):
        for csv_field, linkml_field in field_pairs:
            if linkml_field not in mapped:
                missing_fields.append((i, linkml_field))
            elif str(orig[csv_field]) != str(mapped[linkml_field]):
                value_mismatches.append((i, csv_field, orig[csv_field], mapped[linkml_field]))

    if missing_fields:
        print(f"⚠️ WARNING: {len(missing_fields)} missing fields detected:")
        for row_idx, field in missing_fields[:10]:
            print(f" Row {row_idx}: Missing field '{field}'")
        print()
    else:
        print("✅ All CSV fields preserved in LinkML structure")
        print()

    if value_mismatches:
        print(f"⚠️ WARNING: {len(value_mismatches)} value mismatches detected:")
        for row_idx, field, orig_val, mapped_val in value_mismatches[:10]:
            print(f" Row {row_idx}, field '{field}': '{orig_val}' → '{mapped_val}'")
        print()
    else:
        print("✅ All field values match exactly")
        print()

    # Summary statistics
    total_records = len(original_records)
    total_fields = len(csv_fields) * total_records
    preserved_fields = total_fields - len(missing_fields)
    # With no records there is nothing to lose; report 100% rather than
    # dividing by zero.
    preservation_rate = (preserved_fields / total_fields * 100) if total_fields else 100.0

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total records: {total_records}")
    print(f"Total fields: {total_fields}")
    print(f"Fields preserved: {preserved_fields}")
    print(f"Value mismatches: {len(value_mismatches)}")
    print(f"Preservation rate: {preservation_rate:.1f}%")
    print()

    return counts_match and not missing_fields and not value_mismatches
|
|
|
|
def main():
    """Convert ISIL CSV to LinkML YAML.

    Resolves the input and output paths relative to the parent of this
    script's directory, parses the CSV, maps the records, validates field
    preservation (asking for confirmation on failure), writes the YAML file
    and prints the first record as a sample.
    """
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "data" / "isil" / "nl" / "nan"
    csv_path = data_dir / "ISIL-codes_2025-11-06.csv"
    yaml_path = data_dir / "ISIL-codes_2025-11-06.yaml"

    print("=" * 80)
    print("DUTCH ISIL CSV TO LINKML YAML CONVERTER")
    print("=" * 80)
    print()
    print(f"Input: {csv_path}")
    print(f"Output: {yaml_path}")
    print()

    # Parse CSV
    print("Parsing malformed CSV...")
    records, header = parse_malformed_csv(csv_path)
    print(f"✅ Parsed {len(records)} records")
    print(f" Header: {header}")
    print()

    # Map to LinkML
    print("Mapping to LinkML HeritageCustodian schema...")
    linkml_records = map_to_linkml(records)
    print(f"✅ Mapped {len(linkml_records)} records")
    print()

    # Validate field preservation
    validation_passed = validate_field_preservation(records, linkml_records)

    if not validation_passed:
        print("⚠️ WARNING: Field preservation validation failed!")
        try:
            response = input("Continue anyway? (y/N): ")
        except EOFError:
            # No interactive stdin (e.g. cron, CI): fall back to the
            # documented default answer "N" instead of crashing.
            response = ''
        if response.lower() != 'y':
            print("Aborted.")
            return
        print()

    # Write YAML
    print(f"Writing LinkML YAML to {yaml_path.name}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(linkml_records, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)

    print("✅ YAML file created successfully!")
    print()

    # Show sample output. Skipped when nothing was parsed: indexing the first
    # record of an empty list would raise IndexError after the file is written.
    if linkml_records:
        print("=" * 80)
        print("SAMPLE OUTPUT (first record)")
        print("=" * 80)
        print(yaml.dump([linkml_records[0]], allow_unicode=True, default_flow_style=False,
                        sort_keys=False, width=120))

    print("=" * 80)
    print(f"CONVERSION COMPLETE: {len(linkml_records)} records")
    print("=" * 80)
|
|
|
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|