glam/scripts/convert_library_isil_csv_to_yaml.py
2025-11-19 23:25:22 +01:00

288 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Convert Dutch Library ISIL CSV (KB Bnetwerk) to LinkML-compliant YAML.
Input: /data/isil/nl/kb/20250401 Bnetwerk overzicht ISIL-codes Bibliotheken Nederland.csv
Output: /data/isil/nl/kb/20250401_Bnetwerk_ISIL_Bibliotheken_Nederland.yaml
The CSV has a cleaner structure than the national archive CSV:
- UTF-8 encoding with BOM
- Semicolon delimiter
- 3 header rows (metadata), then column headers (row 4), then data
- 4 fields: ISIL-code, Naam bibliotheek, Vestigingsplaats, Opmerking
- Empty 5th column
This script:
1. Parses the CSV correctly (skip first 3 rows)
2. Maps to HeritageCustodian schema with Identifier class
3. Preserves all original fields
4. Classifies institution types based on remarks
5. Validates field preservation
6. Generates LinkML-compliant YAML
"""
import csv
import yaml
from pathlib import Path
from datetime import datetime, timezone
def parse_library_csv(file_path):
    """Parse the KB Bnetwerk library ISIL CSV.

    Layout: three metadata rows, the column-header row at index 3, and data
    from index 4 onward (semicolon-delimited, UTF-8 with BOM).

    Returns:
        tuple[list[dict], dict]: per-row records (only rows that carry an
        ISIL code) and file-level metadata (title, date_info, header row).
    """
    with open(file_path, 'r', encoding='utf-8-sig') as handle:
        rows = list(csv.reader(handle, delimiter=';'))

    # File-level metadata lives in the first rows; the header row is index 3.
    metadata = {
        'title': rows[0][0] if len(rows) > 0 else '',
        'date_info': rows[1][0] if len(rows) > 1 else '',
        'header': rows[3] if len(rows) > 3 else [],
    }

    records = []
    # start=1 so skipped rows still consume a row number, matching the CSV.
    for row_number, row in enumerate(rows[4:], start=1):
        # Ignore fully-empty rows and rows too short to carry useful data.
        if not any(row) or len(row) < 3:
            continue
        # Normalise to exactly four trimmed cells (opmerking may be absent).
        cells = [cell.strip() for cell in row[:4]]
        cells += [''] * (4 - len(cells))
        isil, library_name, city, remark = cells
        # Rows without an ISIL code are not usable records.
        if not isil:
            continue
        records.append({
            'row_number': row_number,
            'isil_code': isil,
            'naam_bibliotheek': library_name,
            'vestigingsplaats': city,
            'opmerking': remark,
        })

    return records, metadata
def classify_library_type(opmerking):
    """Classify a library from its Dutch 'opmerking' (remarks) text.

    Matching is case-insensitive and first-match-wins; anything without a
    recognised marker is treated as a regular public library.
    """
    text = (opmerking or '').lower()
    # Ordered (substring, classification) pairs; order preserves precedence.
    markers = (
        ('landelijke bibliotheekorganisatie', 'national_library_organization'),
        ('provinciale bibliotheekorganisatie', 'provincial_library_organization'),
        ('poi', 'library_automation_system'),  # POI = Public Online Information system
        ('nationale bibliotheek', 'national_library'),
    )
    for needle, classification in markers:
        if needle in text:
            return classification
    return 'public_library'
def map_to_linkml(records, metadata):
    """Map library ISIL records to LinkML HeritageCustodian dicts.

    Each output dict preserves every original CSV field (``csv_*`` keys),
    adds the LinkML-mapped fields (name, identifiers, locations), a derived
    ``library_type`` classification, a description, and provenance metadata.

    Args:
        records: dicts as produced by ``parse_library_csv``.
        metadata: file-level metadata (``date_info`` is used for provenance).

    Returns:
        list[dict]: one HeritageCustodian dict per input record.
    """
    # Fix: take the timestamp once, outside the loop, so every record from a
    # single conversion run shares an identical provenance extraction_date
    # (previously each record got a slightly different timestamp).
    extraction_date = datetime.now(timezone.utc).isoformat()
    heritage_custodians = []
    for record in records:
        # Derive the library classification from the remarks field.
        library_type = classify_library_type(record['opmerking'])
        # ISIL identifier structure.
        identifier = {
            'identifier_scheme': 'ISIL',
            'identifier_value': record['isil_code'],
            'identifier_url': f"https://isil.org/{record['isil_code']}",
        }
        # Physical location (city only; country is always NL for this file).
        location = {
            'city': record['vestigingsplaats'],
            'country': 'NL',
        }
        custodian = {
            # Original CSV fields (preserved verbatim)
            'csv_row_number': record['row_number'],
            'csv_isil_code': record['isil_code'],
            'csv_naam_bibliotheek': record['naam_bibliotheek'],
            'csv_vestigingsplaats': record['vestigingsplaats'],
            'csv_opmerking': record['opmerking'],
            # LinkML mapped fields
            'name': record['naam_bibliotheek'],
            'institution_type': 'LIBRARY',
            'locations': [location] if record['vestigingsplaats'] else [],
            'identifiers': [identifier],
            # Library-specific metadata
            'library_type': library_type,
        }
        # Description combines the original remark (when present) with the
        # derived classification.
        description_parts = []
        if record['opmerking']:
            description_parts.append(f"Type: {record['opmerking']}")
        description_parts.append(f"Library classification: {library_type}")
        custodian['description'] = '. '.join(description_parts)
        # Provenance: authoritative KB registry source.
        custodian['provenance'] = {
            'data_source': 'ISIL_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'extraction_date': extraction_date,
            'extraction_method': 'CSV to YAML conversion (KB Bnetwerk library ISIL codes)',
            'source_url': 'https://www.kb.nl/organisatie/bibliotheken-in-nederland/isil-codes',
            'source_date': metadata['date_info'],
            'confidence_score': 1.0,
        }
        heritage_custodians.append(custodian)
    return heritage_custodians
def validate_field_preservation(original_records, linkml_records):
    """Validate that every CSV field survived the LinkML mapping.

    Compares each original record's fields against the ``csv_*`` copies in
    the corresponding LinkML record, printing a report to stdout.

    Returns:
        bool: True when no fields are missing and no values differ.
    """
    print("=" * 80)
    print("FIELD PRESERVATION VALIDATION")
    print("=" * 80)
    print()
    # Parallel lists: CSV field name -> its preserved csv_* key.
    csv_fields = ['row_number', 'isil_code', 'naam_bibliotheek', 'vestigingsplaats', 'opmerking']
    linkml_fields = ['csv_row_number', 'csv_isil_code', 'csv_naam_bibliotheek',
                     'csv_vestigingsplaats', 'csv_opmerking']
    print("CSV Field Mapping:")
    for csv_field, linkml_field in zip(csv_fields, linkml_fields):
        # Fix: restore the lost "→" separator between the two column names.
        print(f"  {csv_field:20s} → {linkml_field}")
    print()
    # Verify every record pair field by field.
    missing_fields = []
    value_mismatches = []
    for i, (orig, mapped) in enumerate(zip(original_records, linkml_records)):
        for csv_field, linkml_field in zip(csv_fields, linkml_fields):
            if linkml_field not in mapped:
                missing_fields.append((i, linkml_field))
            elif str(orig[csv_field]) != str(mapped[linkml_field]):
                value_mismatches.append((i, csv_field, orig[csv_field], mapped[linkml_field]))
    if missing_fields:
        print(f"⚠️ WARNING: {len(missing_fields)} missing fields detected:")
        for row_idx, field in missing_fields[:10]:
            print(f"  Row {row_idx}: Missing field '{field}'")
        print()
    else:
        print("✅ All CSV fields preserved in LinkML structure")
        print()
    if value_mismatches:
        print(f"⚠️ WARNING: {len(value_mismatches)} value mismatches detected:")
        for row_idx, field, orig_val, mapped_val in value_mismatches[:10]:
            # Fix: restore the lost "→" separator between original and mapped values.
            print(f"  Row {row_idx}, field '{field}': '{orig_val}' → '{mapped_val}'")
        print()
    else:
        print("✅ All field values match exactly")
        print()
    # Summary statistics.
    total_records = len(original_records)
    total_fields = len(csv_fields) * total_records
    # Fix: guard against ZeroDivisionError when there are no records;
    # an empty input trivially preserves 100% of its (zero) fields.
    if total_fields:
        preservation_rate = (total_fields - len(missing_fields)) / total_fields * 100
    else:
        preservation_rate = 100.0
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total records: {total_records}")
    print(f"Total fields: {total_fields}")
    print(f"Fields preserved: {total_fields - len(missing_fields)}")
    print(f"Value mismatches: {len(value_mismatches)}")
    print(f"Preservation rate: {preservation_rate:.1f}%")
    print()
    return len(missing_fields) == 0 and len(value_mismatches) == 0
def main():
    """Convert the KB Bnetwerk library ISIL CSV to LinkML YAML.

    Pipeline: parse CSV -> map to HeritageCustodian dicts -> validate field
    preservation (with an interactive override on failure) -> write YAML ->
    print a sample record and the library-type distribution.
    """
    rule = "=" * 80

    def banner(text):
        # Three-line section header used throughout the report output.
        print(rule)
        print(text)
        print(rule)

    root = Path(__file__).parent.parent
    source_csv = root / "data" / "isil" / "nl" / "kb" / "20250401 Bnetwerk overzicht ISIL-codes Bibliotheken Nederland.csv"
    target_yaml = root / "data" / "isil" / "nl" / "kb" / "20250401_Bnetwerk_ISIL_Bibliotheken_Nederland.yaml"

    banner("DUTCH LIBRARY (KB BNETWERK) ISIL CSV TO LINKML YAML CONVERTER")
    print()
    print(f"Input: {source_csv.name}")
    print(f"Output: {target_yaml.name}")
    print()

    # Step 1: parse the CSV.
    print("Parsing library ISIL CSV...")
    records, metadata = parse_library_csv(source_csv)
    print(f"✅ Parsed {len(records)} library records")
    print(f" Metadata: {metadata['title']} - {metadata['date_info']}")
    print()

    # Step 2: map to the LinkML schema.
    print("Mapping to LinkML HeritageCustodian schema...")
    linkml_records = map_to_linkml(records, metadata)
    print(f"✅ Mapped {len(linkml_records)} records")
    print()

    # Step 3: validate; on failure, ask the operator whether to continue.
    if not validate_field_preservation(records, linkml_records):
        print("⚠️ WARNING: Field preservation validation failed!")
        if input("Continue anyway? (y/N): ").lower() != 'y':
            print("Aborted.")
            return
    print()

    # Step 4: write the YAML output.
    print(f"Writing LinkML YAML to {target_yaml.name}...")
    with open(target_yaml, 'w', encoding='utf-8') as out:
        yaml.dump(linkml_records, out, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)
    print("✅ YAML file created successfully!")
    print()

    # Step 5: show the first record as a sample.
    banner("SAMPLE OUTPUT (first record)")
    print(yaml.dump([linkml_records[0]], allow_unicode=True, default_flow_style=False,
                    sort_keys=False, width=120))

    # Step 6: summarise the derived library-type distribution.
    from collections import Counter
    type_counts = Counter(entry['library_type'] for entry in linkml_records)
    banner("LIBRARY TYPE DISTRIBUTION")
    for lib_type, count in type_counts.most_common():
        print(f" {lib_type:40s} {count:3d} libraries")
    print()
    banner(f"CONVERSION COMPLETE: {len(linkml_records)} library records")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()