288 lines
10 KiB
Python
288 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Convert Dutch Library ISIL CSV (KB Bnetwerk) to LinkML-compliant YAML.
|
|
|
|
Input: /data/isil/nl/kb/20250401 Bnetwerk overzicht ISIL-codes Bibliotheken Nederland.csv
|
|
Output: /data/isil/nl/kb/20250401_Bnetwerk_ISIL_Bibliotheken_Nederland.yaml
|
|
|
|
The CSV has a cleaner structure than the national archive CSV:
|
|
- UTF-8 encoding with BOM
|
|
- Semicolon delimiter
|
|
- 3 header rows (metadata), then column headers (row 4), then data
|
|
- 4 fields: ISIL-code, Naam bibliotheek, Vestigingsplaats, Opmerking
|
|
- Empty 5th column
|
|
|
|
This script:
|
|
1. Parses the CSV correctly (skip first 3 rows)
|
|
2. Maps to HeritageCustodian schema with Identifier class
|
|
3. Preserves all original fields
|
|
4. Classifies institution types based on remarks
|
|
5. Validates field preservation
|
|
6. Generates LinkML-compliant YAML
|
|
"""
|
|
|
|
import csv
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
def parse_library_csv(file_path):
    """Read the KB Bnetwerk library ISIL CSV and return (records, metadata).

    The file is UTF-8 with BOM, semicolon-delimited: three metadata rows,
    then a column-header row (ISIL-code, Naam bibliotheek, Vestigingsplaats,
    Opmerking), then the data rows.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as handle:
        rows = list(csv.reader(handle, delimiter=';'))

    # Sheet title and date information sit on the first two rows.
    title = rows[0][0] if len(rows) > 0 else ''
    date_info = rows[1][0] if len(rows) > 1 else ''
    # Column headers live on row 4 (index 3); data starts on row 5 (index 4).
    header = rows[3] if len(rows) > 3 else []

    records = []
    for row_number, row in enumerate(rows[4:], start=1):
        # Drop entirely empty rows.
        if not any(row):
            continue
        # Drop malformed rows lacking the minimum expected columns.
        if len(row) < 3:
            continue

        def cell(idx):
            # Stripped cell value, or '' when the column is absent.
            return row[idx].strip() if len(row) > idx else ''

        record = {
            'row_number': row_number,
            'isil_code': cell(0),
            'naam_bibliotheek': cell(1),
            'vestigingsplaats': cell(2),
            'opmerking': cell(3),
        }

        # A record without an ISIL code is not a usable identifier entry.
        if record['isil_code']:
            records.append(record)

    return records, {'title': title, 'date_info': date_info, 'header': header}
|
|
|
|
def classify_library_type(opmerking):
    """Derive a coarse library-type label from the CSV remarks field.

    Falls back to 'public_library' for empty remarks or any remark that
    matches no known phrase.
    """
    if not opmerking:
        return 'public_library'

    remark = opmerking.lower()
    # Checked in order: more specific phrases must win over generic ones.
    rules = (
        ('landelijke bibliotheekorganisatie', 'national_library_organization'),
        ('provinciale bibliotheekorganisatie', 'provincial_library_organization'),
        # POI = Public Online Information system
        ('poi', 'library_automation_system'),
        ('nationale bibliotheek', 'national_library'),
    )
    for needle, label in rules:
        if needle in remark:
            return label
    return 'public_library'
|
|
|
|
def map_to_linkml(records, metadata):
    """Map parsed library ISIL records onto the LinkML HeritageCustodian schema.

    Args:
        records: Dicts from parse_library_csv with keys row_number,
            isil_code, naam_bibliotheek, vestigingsplaats, opmerking.
        metadata: CSV header metadata; its 'date_info' is recorded as the
            provenance source date.

    Returns:
        List of HeritageCustodian dicts that preserve every original CSV
        field under csv_* keys alongside the mapped LinkML fields.
    """
    heritage_custodians = []

    # One timestamp for the whole run so every record carries an identical
    # extraction_date (previously recomputed inside the loop, which could
    # yield differing timestamps across records of the same conversion).
    extraction_date = datetime.now(timezone.utc).isoformat()

    for record in records:
        # Classify library type from the remarks field.
        library_type = classify_library_type(record['opmerking'])

        # ISIL codes have no universal resolver URL, so only the scheme
        # and value are recorded.
        identifier = {
            'identifier_scheme': 'ISIL',
            'identifier_value': record['isil_code'],
        }

        # Location: city from the CSV; country is always the Netherlands.
        location = {
            'city': record['vestigingsplaats'],
            'country': 'NL',
        }

        custodian = {
            # Original CSV fields (preserved verbatim under csv_* keys)
            'csv_row_number': record['row_number'],
            'csv_isil_code': record['isil_code'],
            'csv_naam_bibliotheek': record['naam_bibliotheek'],
            'csv_vestigingsplaats': record['vestigingsplaats'],
            'csv_opmerking': record['opmerking'],

            # LinkML mapped fields
            'name': record['naam_bibliotheek'],
            'institution_type': 'LIBRARY',
            'locations': [location] if record['vestigingsplaats'] else [],
            'identifiers': [identifier],

            # Library-specific metadata
            'library_type': library_type,
        }

        # Description combines the original remark (when present) with the
        # derived classification.
        description_parts = []
        if record['opmerking']:
            description_parts.append(f"Type: {record['opmerking']}")
        description_parts.append(f"Library classification: {library_type}")
        custodian['description'] = '. '.join(description_parts)

        # Provenance: authoritative KB registry source.
        custodian['provenance'] = {
            'data_source': 'ISIL_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'extraction_date': extraction_date,
            'extraction_method': 'CSV to YAML conversion (KB Bnetwerk library ISIL codes)',
            'source_url': 'https://www.kb.nl/organisatie/bibliotheken-in-nederland/isil-codes',
            'source_date': metadata['date_info'],
            'confidence_score': 1.0,
        }

        heritage_custodians.append(custodian)

    return heritage_custodians
|
|
|
|
def validate_field_preservation(original_records, linkml_records):
    """Check that every original CSV field survived the LinkML mapping.

    Compares each parsed record against its mapped counterpart (the csv_*
    keys) and prints a human-readable report with summary statistics.

    Args:
        original_records: Dicts produced by parse_library_csv.
        linkml_records: Dicts produced by map_to_linkml, same order.

    Returns:
        True when no fields are missing and every value matches exactly.
    """
    print("=" * 80)
    print("FIELD PRESERVATION VALIDATION")
    print("=" * 80)
    print()

    # Parallel lists: CSV field name -> its csv_* counterpart in LinkML.
    csv_fields = ['row_number', 'isil_code', 'naam_bibliotheek', 'vestigingsplaats', 'opmerking']
    linkml_fields = ['csv_row_number', 'csv_isil_code', 'csv_naam_bibliotheek',
                     'csv_vestigingsplaats', 'csv_opmerking']

    print("CSV Field Mapping:")
    for csv_field, linkml_field in zip(csv_fields, linkml_fields):
        print(f" {csv_field:20s} → {linkml_field}")
    print()

    # Collect every missing field and every value mismatch.
    missing_fields = []
    value_mismatches = []

    for i, (orig, mapped) in enumerate(zip(original_records, linkml_records)):
        for csv_field, linkml_field in zip(csv_fields, linkml_fields):
            if linkml_field not in mapped:
                missing_fields.append((i, linkml_field))
            elif str(orig[csv_field]) != str(mapped[linkml_field]):
                value_mismatches.append((i, csv_field, orig[csv_field], mapped[linkml_field]))

    if missing_fields:
        print(f"⚠️ WARNING: {len(missing_fields)} missing fields detected:")
        # Only the first 10 problems are listed to keep the report short.
        for row_idx, field in missing_fields[:10]:
            print(f" Row {row_idx}: Missing field '{field}'")
        print()
    else:
        print("✅ All CSV fields preserved in LinkML structure")
        print()

    if value_mismatches:
        print(f"⚠️ WARNING: {len(value_mismatches)} value mismatches detected:")
        for row_idx, field, orig_val, mapped_val in value_mismatches[:10]:
            print(f" Row {row_idx}, field '{field}': '{orig_val}' → '{mapped_val}'")
        print()
    else:
        print("✅ All field values match exactly")
        print()

    # Summary statistics
    total_records = len(original_records)
    total_fields = len(csv_fields) * total_records
    # Guard against empty input: previously this divided by zero when no
    # records were supplied. An empty set is trivially fully preserved.
    preservation_rate = (
        (total_fields - len(missing_fields)) / total_fields * 100 if total_fields else 100.0
    )

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total records: {total_records}")
    print(f"Total fields: {total_fields}")
    print(f"Fields preserved: {total_fields - len(missing_fields)}")
    print(f"Value mismatches: {len(value_mismatches)}")
    print(f"Preservation rate: {preservation_rate:.1f}%")
    print()

    return len(missing_fields) == 0 and len(value_mismatches) == 0
|
|
|
|
def main():
    """Convert the KB Bnetwerk library ISIL CSV to LinkML-compliant YAML.

    Pipeline: parse CSV -> map to HeritageCustodian records -> validate
    field preservation (with interactive confirmation on failure) -> write
    YAML and print a summary report.
    """
    # Paths are resolved relative to the repository root, one level above
    # this script's directory. Input/output filenames are fixed.
    base_dir = Path(__file__).parent.parent
    csv_path = base_dir / "data" / "isil" / "nl" / "kb" / "20250401 Bnetwerk overzicht ISIL-codes Bibliotheken Nederland.csv"
    yaml_path = base_dir / "data" / "isil" / "nl" / "kb" / "20250401_Bnetwerk_ISIL_Bibliotheken_Nederland.yaml"

    print("=" * 80)
    print("DUTCH LIBRARY (KB BNETWERK) ISIL CSV TO LINKML YAML CONVERTER")
    print("=" * 80)
    print()
    print(f"Input: {csv_path.name}")
    print(f"Output: {yaml_path.name}")
    print()

    # Step 1: parse the CSV into plain record dicts plus sheet metadata.
    print("Parsing library ISIL CSV...")
    records, metadata = parse_library_csv(csv_path)
    print(f"✅ Parsed {len(records)} library records")
    print(f" Metadata: {metadata['title']} - {metadata['date_info']}")
    print()

    # Step 2: map the records onto the HeritageCustodian schema.
    print("Mapping to LinkML HeritageCustodian schema...")
    linkml_records = map_to_linkml(records, metadata)
    print(f"✅ Mapped {len(linkml_records)} records")
    print()

    # Step 3: verify no CSV field was lost; on failure ask the operator
    # whether to continue anyway (default answer aborts).
    validation_passed = validate_field_preservation(records, linkml_records)

    if not validation_passed:
        print("⚠️ WARNING: Field preservation validation failed!")
        response = input("Continue anyway? (y/N): ")
        if response.lower() != 'y':
            print("Aborted.")
            return
    print()

    # Step 4: write the YAML output. sort_keys=False keeps the intended
    # field order; allow_unicode preserves Dutch characters.
    print(f"Writing LinkML YAML to {yaml_path.name}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(linkml_records, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)

    print("✅ YAML file created successfully!")
    print()

    # Show the first record as a sample of the generated structure.
    print("=" * 80)
    print("SAMPLE OUTPUT (first record)")
    print("=" * 80)
    print(yaml.dump([linkml_records[0]], allow_unicode=True, default_flow_style=False,
                    sort_keys=False, width=120))

    # Show how the classified library types are distributed.
    from collections import Counter
    type_counts = Counter(r['library_type'] for r in linkml_records)

    print("=" * 80)
    print("LIBRARY TYPE DISTRIBUTION")
    print("=" * 80)
    for lib_type, count in type_counts.most_common():
        print(f" {lib_type:40s} {count:3d} libraries")
    print()

    print("=" * 80)
    print(f"CONVERSION COMPLETE: {len(linkml_records)} library records")
    print("=" * 80)
|
|
|
|
# Run the converter only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|