#!/usr/bin/env python3
"""
Convert Dutch Library ISIL CSV (KB Bnetwerk) to LinkML-compliant YAML.

Input:  /data/isil/nl/kb/20250401 Bnetwerk overzicht ISIL-codes Bibliotheken Nederland.csv
Output: /data/isil/nl/kb/20250401_Bnetwerk_ISIL_Bibliotheken_Nederland.yaml

The CSV has a cleaner structure than the national archive CSV:
- UTF-8 encoding with BOM
- Semicolon delimiter
- 3 header rows (metadata), then column headers (row 4), then data
- 4 fields: ISIL-code, Naam bibliotheek, Vestigingsplaats, Opmerking
- Empty 5th column

This script:
1. Parses the CSV correctly (skip first 3 rows)
2. Maps to HeritageCustodian schema with Identifier class
3. Preserves all original fields
4. Classifies institution types based on remarks
5. Validates field preservation
6. Generates LinkML-compliant YAML
"""

import csv
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path


def parse_library_csv(file_path):
    """Parse the KB Bnetwerk library ISIL CSV.

    Args:
        file_path: Path to the semicolon-delimited, UTF-8(-BOM) CSV file.

    Returns:
        Tuple ``(records, metadata)``. ``records`` is a list of dicts with
        keys ``row_number``, ``isil_code``, ``naam_bibliotheek``,
        ``vestigingsplaats`` and ``opmerking``; only rows carrying an ISIL
        code are kept. ``metadata`` holds the file's title line, date line
        and the column-header row.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        reader = csv.reader(f, delimiter=';')
        all_rows = list(reader)

    # First two rows are free-text metadata (title + date).
    title = all_rows[0][0] if len(all_rows) > 0 else ''
    date_info = all_rows[1][0] if len(all_rows) > 1 else ''

    # Column headers sit at CSV row 4 (index 3):
    # ISIL-code, Naam bibliotheek, Vestigingsplaats, Opmerking
    header = all_rows[3] if len(all_rows) > 3 else []

    # Data starts at CSV row 5 (index 4). row_number counts data rows from 1
    # and still advances over rows that end up skipped below.
    data_rows = all_rows[4:]

    records = []
    for i, row in enumerate(data_rows, start=1):
        # Skip fully empty rows (all cells falsy).
        if not any(row):
            continue
        # Skip malformed rows with fewer than 3 fields.
        if len(row) < 3:
            continue

        record = {
            'row_number': i,
            'isil_code': row[0].strip() if len(row) > 0 else '',
            'naam_bibliotheek': row[1].strip() if len(row) > 1 else '',
            'vestigingsplaats': row[2].strip() if len(row) > 2 else '',
            'opmerking': row[3].strip() if len(row) > 3 else '',
        }

        # Only keep records that have at least an ISIL code.
        if record['isil_code']:
            records.append(record)

    metadata = {
        'title': title,
        'date_info': date_info,
        'header': header,
    }
    return records, metadata


def classify_library_type(opmerking):
    """Classify library type based on opmerking (remarks) field.

    Returns a classification string; an empty/absent remark defaults to
    ``'public_library'``.
    """
    if not opmerking:
        return 'public_library'

    opmerking_lower = opmerking.lower()
    if 'landelijke bibliotheekorganisatie' in opmerking_lower:
        return 'national_library_organization'
    elif 'provinciale bibliotheekorganisatie' in opmerking_lower:
        return 'provincial_library_organization'
    elif 'poi' in opmerking_lower:
        # NOTE(review): bare substring match — any remark containing "poi"
        # triggers this branch. The original comment glossed POI as
        # "Public Online Information system"; in the Dutch library sector
        # POI commonly means "Provinciale Ondersteuningsinstelling" —
        # confirm against the KB source data.
        return 'library_automation_system'
    elif 'nationale bibliotheek' in opmerking_lower:
        return 'national_library'
    else:
        return 'public_library'


def map_to_linkml(records, metadata):
    """Map library ISIL records to LinkML HeritageCustodian schema.

    Every original CSV field is preserved verbatim under a ``csv_*`` key
    alongside the mapped LinkML fields and per-record provenance.
    """
    heritage_custodians = []

    for record in records:
        # Classify library type from the free-text remark.
        library_type = classify_library_type(record['opmerking'])

        # Create identifier structure.
        # NOTE(review): the source had a corrupted/truncated 'identifier_url'
        # fragment here; ISIL codes don't have a universal URL resolver, so
        # only scheme + value are emitted.
        identifier = {
            'identifier_scheme': 'ISIL',
            'identifier_value': record['isil_code'],
        }

        # Create location (country is fixed: this registry is NL-only).
        location = {
            'city': record['vestigingsplaats'],
            'country': 'NL',
        }

        # Create HeritageCustodian record.
        custodian = {
            # Original CSV fields (preserved)
            'csv_row_number': record['row_number'],
            'csv_isil_code': record['isil_code'],
            'csv_naam_bibliotheek': record['naam_bibliotheek'],
            'csv_vestigingsplaats': record['vestigingsplaats'],
            'csv_opmerking': record['opmerking'],

            # LinkML mapped fields
            'name': record['naam_bibliotheek'],
            'institution_type': 'LIBRARY',
            'locations': [location] if record['vestigingsplaats'] else [],
            'identifiers': [identifier],

            # Library-specific metadata
            'library_type': library_type,
        }

        # Add description combining remarks (when present) and library type.
        description_parts = []
        if record['opmerking']:
            description_parts.append(f"Type: {record['opmerking']}")
        description_parts.append(f"Library classification: {library_type}")
        custodian['description'] = '. '.join(description_parts)

        # Add provenance: authoritative registry export, full confidence.
        custodian['provenance'] = {
            'data_source': 'ISIL_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'CSV to YAML conversion (KB Bnetwerk library ISIL codes)',
            'source_url': 'https://www.kb.nl/organisatie/bibliotheken-in-nederland/isil-codes',
            'source_date': metadata['date_info'],
            'confidence_score': 1.0,
        }

        heritage_custodians.append(custodian)

    return heritage_custodians


def validate_field_preservation(original_records, linkml_records):
    """Validate that all CSV fields were preserved.

    Prints a mapping report plus summary statistics and returns ``True``
    when no fields are missing and every value matches exactly.
    """
    print("=" * 80)
    print("FIELD PRESERVATION VALIDATION")
    print("=" * 80)
    print()

    # Check field mapping: csv_fields[i] must appear as linkml_fields[i].
    csv_fields = ['row_number', 'isil_code', 'naam_bibliotheek',
                  'vestigingsplaats', 'opmerking']
    linkml_fields = ['csv_row_number', 'csv_isil_code', 'csv_naam_bibliotheek',
                     'csv_vestigingsplaats', 'csv_opmerking']

    print("CSV Field Mapping:")
    for csv_field, linkml_field in zip(csv_fields, linkml_fields):
        print(f" {csv_field:20s} → {linkml_field}")
    print()

    # Verify all records; compare as strings so int row numbers line up.
    missing_fields = []
    value_mismatches = []
    for i, (orig, mapped) in enumerate(zip(original_records, linkml_records)):
        for csv_field, linkml_field in zip(csv_fields, linkml_fields):
            if linkml_field not in mapped:
                missing_fields.append((i, linkml_field))
            elif str(orig[csv_field]) != str(mapped[linkml_field]):
                value_mismatches.append((i, csv_field, orig[csv_field], mapped[linkml_field]))

    if missing_fields:
        print(f"⚠️ WARNING: {len(missing_fields)} missing fields detected:")
        for row_idx, field in missing_fields[:10]:
            print(f" Row {row_idx}: Missing field '{field}'")
        print()
    else:
        print("✅ All CSV fields preserved in LinkML structure")
        print()

    if value_mismatches:
        print(f"⚠️ WARNING: {len(value_mismatches)} value mismatches detected:")
        for row_idx, field, orig_val, mapped_val in value_mismatches[:10]:
            print(f" Row {row_idx}, field '{field}': '{orig_val}' → '{mapped_val}'")
        print()
    else:
        print("✅ All field values match exactly")
        print()

    # Summary statistics. Guard against division by zero on empty input
    # (an empty run trivially preserves 100% of its zero fields).
    total_records = len(original_records)
    total_fields = len(csv_fields) * total_records
    fields_preserved = total_fields - len(missing_fields)
    preservation_rate = (fields_preserved / total_fields * 100) if total_fields else 100.0
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total records: {total_records}")
    print(f"Total fields: {total_fields}")
    print(f"Fields preserved: {fields_preserved}")
    print(f"Value mismatches: {len(value_mismatches)}")
    print(f"Preservation rate: {preservation_rate:.1f}%")
    print()

    return len(missing_fields) == 0 and len(value_mismatches) == 0


def main():
    """Convert library ISIL CSV to LinkML YAML."""
    # PyYAML is only needed by this entry point; importing it lazily keeps
    # the parsing/mapping/validation functions importable (e.g. for unit
    # tests) in environments without PyYAML installed.
    import yaml

    base_dir = Path(__file__).parent.parent
    csv_path = base_dir / "data" / "isil" / "nl" / "kb" / "20250401 Bnetwerk overzicht ISIL-codes Bibliotheken Nederland.csv"
    yaml_path = base_dir / "data" / "isil" / "nl" / "kb" / "20250401_Bnetwerk_ISIL_Bibliotheken_Nederland.yaml"

    print("=" * 80)
    print("DUTCH LIBRARY (KB BNETWERK) ISIL CSV TO LINKML YAML CONVERTER")
    print("=" * 80)
    print()
    print(f"Input: {csv_path.name}")
    print(f"Output: {yaml_path.name}")
    print()

    # Parse CSV
    print("Parsing library ISIL CSV...")
    records, metadata = parse_library_csv(csv_path)
    print(f"✅ Parsed {len(records)} library records")
    print(f" Metadata: {metadata['title']} - {metadata['date_info']}")
    print()

    # Map to LinkML
    print("Mapping to LinkML HeritageCustodian schema...")
    linkml_records = map_to_linkml(records, metadata)
    print(f"✅ Mapped {len(linkml_records)} records")
    print()

    # Validate field preservation; offer an interactive override on failure.
    validation_passed = validate_field_preservation(records, linkml_records)
    if not validation_passed:
        print("⚠️ WARNING: Field preservation validation failed!")
        response = input("Continue anyway? (y/N): ")
        if response.lower() != 'y':
            print("Aborted.")
            return
    print()

    # Write YAML
    print(f"Writing LinkML YAML to {yaml_path.name}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(linkml_records, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False, width=120)
    print("✅ YAML file created successfully!")
    print()

    # Show sample output (guard against an empty record set — indexing [0]
    # on an empty list would raise IndexError).
    if linkml_records:
        print("=" * 80)
        print("SAMPLE OUTPUT (first record)")
        print("=" * 80)
        print(yaml.dump([linkml_records[0]], allow_unicode=True,
                        default_flow_style=False, sort_keys=False, width=120))

    # Show library type distribution
    type_counts = Counter(r['library_type'] for r in linkml_records)
    print("=" * 80)
    print("LIBRARY TYPE DISTRIBUTION")
    print("=" * 80)
    for lib_type, count in type_counts.most_common():
        print(f" {lib_type:40s} {count:3d} libraries")
    print()

    print("=" * 80)
    print(f"CONVERSION COMPLETE: {len(linkml_records)} library records")
    print("=" * 80)


if __name__ == "__main__":
    main()