glam/scripts/convert_library_isil_csv_to_yaml.py
2025-11-19 23:25:22 +01:00

288 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Convert Dutch Library ISIL CSV (KB Bnetwerk) to LinkML-compliant YAML.
Input: /data/isil/nl/kb/20250401 Bnetwerk overzicht ISIL-codes Bibliotheken Nederland.csv
Output: /data/isil/nl/kb/20250401_Bnetwerk_ISIL_Bibliotheken_Nederland.yaml
The CSV has a cleaner structure than the national archive CSV:
- UTF-8 encoding with BOM
- Semicolon delimiter
- 3 header rows (metadata), then column headers (row 4), then data
- 4 fields: ISIL-code, Naam bibliotheek, Vestigingsplaats, Opmerking
- Empty 5th column
This script:
1. Parses the CSV correctly (skip first 3 rows)
2. Maps to HeritageCustodian schema with Identifier class
3. Preserves all original fields
4. Classifies institution types based on remarks
5. Validates field preservation
6. Generates LinkML-compliant YAML
"""
import csv
import yaml
from pathlib import Path
from datetime import datetime, timezone
def parse_library_csv(file_path):
    """Parse the KB Bnetwerk library ISIL CSV.

    Layout: three metadata rows, the column-header row at index 3, and data
    from index 4 onward (semicolon-delimited, UTF-8 with BOM).

    Returns:
        tuple[list[dict], dict]: per-row records (only rows that carry an
        ISIL code) and file-level metadata (title, date_info, header row).
    """
    with open(file_path, 'r', encoding='utf-8-sig') as handle:
        rows = list(csv.reader(handle, delimiter=';'))

    # File-level metadata lives in the first rows; the header row is index 3.
    metadata = {
        'title': rows[0][0] if len(rows) > 0 else '',
        'date_info': rows[1][0] if len(rows) > 1 else '',
        'header': rows[3] if len(rows) > 3 else [],
    }

    records = []
    # start=1 so skipped rows still consume a row number, matching the CSV.
    for row_number, row in enumerate(rows[4:], start=1):
        # Ignore fully-empty rows and rows too short to carry useful data.
        if not any(row) or len(row) < 3:
            continue
        # Normalise to exactly four trimmed cells (opmerking may be absent).
        cells = [cell.strip() for cell in row[:4]]
        cells += [''] * (4 - len(cells))
        isil, library_name, city, remark = cells
        # Rows without an ISIL code are not usable records.
        if not isil:
            continue
        records.append({
            'row_number': row_number,
            'isil_code': isil,
            'naam_bibliotheek': library_name,
            'vestigingsplaats': city,
            'opmerking': remark,
        })

    return records, metadata
def classify_library_type(opmerking):
    """Classify a library from its Dutch 'opmerking' (remarks) text.

    Matching is case-insensitive and first-match-wins; anything without a
    recognised marker is treated as a regular public library.
    """
    text = (opmerking or '').lower()
    # Ordered (substring, classification) pairs; order preserves precedence.
    markers = (
        ('landelijke bibliotheekorganisatie', 'national_library_organization'),
        ('provinciale bibliotheekorganisatie', 'provincial_library_organization'),
        ('poi', 'library_automation_system'),  # POI = Public Online Information system
        ('nationale bibliotheek', 'national_library'),
    )
    for needle, classification in markers:
        if needle in text:
            return classification
    return 'public_library'
def map_to_linkml(records, metadata):
    """Map library ISIL records to LinkML HeritageCustodian dicts.

    Each output dict preserves every original CSV field (``csv_*`` keys),
    adds the LinkML-mapped fields (name, identifiers, locations), a derived
    ``library_type`` classification, a description, and provenance metadata.

    Args:
        records: dicts as produced by ``parse_library_csv``.
        metadata: file-level metadata (``date_info`` is used for provenance).

    Returns:
        list[dict]: one HeritageCustodian dict per input record.
    """
    # Fix: take the timestamp once, outside the loop, so every record from a
    # single conversion run shares an identical provenance extraction_date
    # (previously each record got a slightly different timestamp).
    extraction_date = datetime.now(timezone.utc).isoformat()
    heritage_custodians = []
    for record in records:
        # Derive the library classification from the remarks field.
        library_type = classify_library_type(record['opmerking'])
        # ISIL identifier structure.
        identifier = {
            'identifier_scheme': 'ISIL',
            'identifier_value': record['isil_code'],
            'identifier_url': f"https://isil.org/{record['isil_code']}",
        }
        # Physical location (city only; country is always NL for this file).
        location = {
            'city': record['vestigingsplaats'],
            'country': 'NL',
        }
        custodian = {
            # Original CSV fields (preserved verbatim)
            'csv_row_number': record['row_number'],
            'csv_isil_code': record['isil_code'],
            'csv_naam_bibliotheek': record['naam_bibliotheek'],
            'csv_vestigingsplaats': record['vestigingsplaats'],
            'csv_opmerking': record['opmerking'],
            # LinkML mapped fields
            'name': record['naam_bibliotheek'],
            'institution_type': 'LIBRARY',
            'locations': [location] if record['vestigingsplaats'] else [],
            'identifiers': [identifier],
            # Library-specific metadata
            'library_type': library_type,
        }
        # Description combines the original remark (when present) with the
        # derived classification.
        description_parts = []
        if record['opmerking']:
            description_parts.append(f"Type: {record['opmerking']}")
        description_parts.append(f"Library classification: {library_type}")
        custodian['description'] = '. '.join(description_parts)
        # Provenance: authoritative KB registry source.
        custodian['provenance'] = {
            'data_source': 'ISIL_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'extraction_date': extraction_date,
            'extraction_method': 'CSV to YAML conversion (KB Bnetwerk library ISIL codes)',
            'source_url': 'https://www.kb.nl/organisatie/bibliotheken-in-nederland/isil-codes',
            'source_date': metadata['date_info'],
            'confidence_score': 1.0,
        }
        heritage_custodians.append(custodian)
    return heritage_custodians
def validate_field_preservation(original_records, linkml_records):
    """Validate that every CSV field survived the LinkML mapping.

    Compares each original record's fields against the ``csv_*`` copies in
    the corresponding LinkML record, printing a report to stdout.

    Returns:
        bool: True when no fields are missing and no values differ.
    """
    print("=" * 80)
    print("FIELD PRESERVATION VALIDATION")
    print("=" * 80)
    print()
    # Parallel lists: CSV field name -> its preserved csv_* key.
    csv_fields = ['row_number', 'isil_code', 'naam_bibliotheek', 'vestigingsplaats', 'opmerking']
    linkml_fields = ['csv_row_number', 'csv_isil_code', 'csv_naam_bibliotheek',
                     'csv_vestigingsplaats', 'csv_opmerking']
    print("CSV Field Mapping:")
    for csv_field, linkml_field in zip(csv_fields, linkml_fields):
        # Fix: restore the lost "→" separator between the two column names.
        print(f"  {csv_field:20s} → {linkml_field}")
    print()
    # Verify every record pair field by field.
    missing_fields = []
    value_mismatches = []
    for i, (orig, mapped) in enumerate(zip(original_records, linkml_records)):
        for csv_field, linkml_field in zip(csv_fields, linkml_fields):
            if linkml_field not in mapped:
                missing_fields.append((i, linkml_field))
            elif str(orig[csv_field]) != str(mapped[linkml_field]):
                value_mismatches.append((i, csv_field, orig[csv_field], mapped[linkml_field]))
    if missing_fields:
        print(f"⚠️ WARNING: {len(missing_fields)} missing fields detected:")
        for row_idx, field in missing_fields[:10]:
            print(f"  Row {row_idx}: Missing field '{field}'")
        print()
    else:
        print("✅ All CSV fields preserved in LinkML structure")
        print()
    if value_mismatches:
        print(f"⚠️ WARNING: {len(value_mismatches)} value mismatches detected:")
        for row_idx, field, orig_val, mapped_val in value_mismatches[:10]:
            # Fix: restore the lost "→" separator between original and mapped values.
            print(f"  Row {row_idx}, field '{field}': '{orig_val}' → '{mapped_val}'")
        print()
    else:
        print("✅ All field values match exactly")
        print()
    # Summary statistics.
    total_records = len(original_records)
    total_fields = len(csv_fields) * total_records
    # Fix: guard against ZeroDivisionError when there are no records;
    # an empty input trivially preserves 100% of its (zero) fields.
    if total_fields:
        preservation_rate = (total_fields - len(missing_fields)) / total_fields * 100
    else:
        preservation_rate = 100.0
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total records: {total_records}")
    print(f"Total fields: {total_fields}")
    print(f"Fields preserved: {total_fields - len(missing_fields)}")
    print(f"Value mismatches: {len(value_mismatches)}")
    print(f"Preservation rate: {preservation_rate:.1f}%")
    print()
    return len(missing_fields) == 0 and len(value_mismatches) == 0
def main():
    """Convert the KB Bnetwerk library ISIL CSV to LinkML YAML.

    Pipeline: parse CSV -> map to HeritageCustodian dicts -> validate field
    preservation (with an interactive override on failure) -> write YAML ->
    print a sample record and the library-type distribution.
    """
    rule = "=" * 80

    def banner(text):
        # Three-line section header used throughout the report output.
        print(rule)
        print(text)
        print(rule)

    root = Path(__file__).parent.parent
    source_csv = root / "data" / "isil" / "nl" / "kb" / "20250401 Bnetwerk overzicht ISIL-codes Bibliotheken Nederland.csv"
    target_yaml = root / "data" / "isil" / "nl" / "kb" / "20250401_Bnetwerk_ISIL_Bibliotheken_Nederland.yaml"

    banner("DUTCH LIBRARY (KB BNETWERK) ISIL CSV TO LINKML YAML CONVERTER")
    print()
    print(f"Input: {source_csv.name}")
    print(f"Output: {target_yaml.name}")
    print()

    # Step 1: parse the CSV.
    print("Parsing library ISIL CSV...")
    records, metadata = parse_library_csv(source_csv)
    print(f"✅ Parsed {len(records)} library records")
    print(f" Metadata: {metadata['title']} - {metadata['date_info']}")
    print()

    # Step 2: map to the LinkML schema.
    print("Mapping to LinkML HeritageCustodian schema...")
    linkml_records = map_to_linkml(records, metadata)
    print(f"✅ Mapped {len(linkml_records)} records")
    print()

    # Step 3: validate; on failure, ask the operator whether to continue.
    if not validate_field_preservation(records, linkml_records):
        print("⚠️ WARNING: Field preservation validation failed!")
        if input("Continue anyway? (y/N): ").lower() != 'y':
            print("Aborted.")
            return
    print()

    # Step 4: write the YAML output.
    print(f"Writing LinkML YAML to {target_yaml.name}...")
    with open(target_yaml, 'w', encoding='utf-8') as out:
        yaml.dump(linkml_records, out, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)
    print("✅ YAML file created successfully!")
    print()

    # Step 5: show the first record as a sample.
    banner("SAMPLE OUTPUT (first record)")
    print(yaml.dump([linkml_records[0]], allow_unicode=True, default_flow_style=False,
                    sort_keys=False, width=120))

    # Step 6: summarise the derived library-type distribution.
    from collections import Counter
    type_counts = Counter(entry['library_type'] for entry in linkml_records)
    banner("LIBRARY TYPE DISTRIBUTION")
    for lib_type, count in type_counts.most_common():
        print(f" {lib_type:40s} {count:3d} libraries")
    print()
    banner(f"CONVERSION COMPLETE: {len(linkml_records)} library records")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()