glam/scripts/export_switzerland_csv.py

#!/usr/bin/env python3
"""
Export Swiss ISIL data to CSV format
Converts the JSON scraping output to a flat CSV structure
Author: GLAM Data Extraction Project
Date: November 2025
"""

import json
import csv
from pathlib import Path
from typing import List, Dict, Any

def flatten_institution(inst: Dict[str, Any]) -> Dict[str, str]:
    """Flatten one nested institution record into a single-level CSV row.

    Scalar fields are copied through, the ``address`` and ``contact``
    sub-objects are expanded into their own columns, list fields are joined
    with ``'; '``, and ``merged_into`` becomes ``merged_into_name`` and
    ``merged_into_url``.

    A missing key and an explicit JSON ``null`` are treated the same way:
    both produce an empty string, so a record such as ``{"address": null}``
    is flattened instead of failing on attribute access.

    Args:
        inst: Institution record as loaded from the JSON input.

    Returns:
        Mapping of column name to cell value. The same 21 keys are always
        present, in a fixed order. Non-null scalar values are passed through
        unchanged.
    """

    def scalar(source: Dict[str, Any], key: str) -> Any:
        # dict.get's default only covers *missing* keys; a stored None
        # (JSON null) must be mapped to '' explicitly.
        value = source.get(key)
        return '' if value is None else value

    def nested(key: str) -> Dict[str, Any]:
        # A sub-object that is absent, null or empty behaves like {}.
        return inst.get(key) or {}

    def joined(key: str) -> str:
        values = inst.get(key)
        if not values:
            return ''
        if isinstance(values, str):
            # A bare string must not be split into single characters.
            return values
        return '; '.join(str(item) for item in values if item is not None)

    address = nested('address')
    contact = nested('contact')
    merged = nested('merged_into')

    return {
        # Basic information
        'name': scalar(inst, 'name'),
        'alternative_name': scalar(inst, 'alternative_name'),
        'status': scalar(inst, 'status'),
        'description': scalar(inst, 'description'),
        # Location
        'region': scalar(inst, 'region'),
        'canton': scalar(inst, 'canton'),
        # Categories (multiple values joined with semicolon)
        'categories': joined('categories'),
        # ISIL code
        'isil_code': scalar(inst, 'isil_code'),
        # Institution type (from detail page)
        'institution_type': scalar(inst, 'institution_type'),
        # Address
        'street': scalar(address, 'street'),
        'postal_code': scalar(address, 'postal_code'),
        'city': scalar(address, 'city'),
        # Contact
        'phone': scalar(contact, 'phone'),
        'email': scalar(contact, 'email'),
        'website': scalar(contact, 'website'),
        # Opening hours
        'opening_hours': scalar(inst, 'opening_hours'),
        # Memberships and Dewey classifications (joined with semicolon)
        'memberships': joined('memberships'),
        'dewey_classifications': joined('dewey_classifications'),
        # URLs
        'detail_url': scalar(inst, 'detail_url'),
        # Merged institution info
        'merged_into_name': scalar(merged, 'name'),
        'merged_into_url': scalar(merged, 'url'),
    }

def export_to_csv(json_file: Path, csv_file: Path):
    """Export Swiss ISIL JSON data to CSV.

    Loads the institution records from ``json_file``, flattens each one with
    ``flatten_institution`` and writes them to ``csv_file`` as UTF-8 CSV with
    a header row. Progress messages and a short coverage summary are printed
    to stdout.

    Args:
        json_file: Path of the JSON file to read; its top-level value is
            iterated as a collection of institution records.
        csv_file: Path of the CSV file to create or overwrite.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``json_file`` does not contain valid JSON.
    """
    print(f"Loading data from {json_file}")

    with open(json_file, 'r', encoding='utf-8') as f:
        institutions = json.load(f)

    print(f"Loaded {len(institutions)} institutions")

    # Flatten all institutions
    flattened = [flatten_institution(inst) for inst in institutions]

    # Fixed column order of the CSV; flatten_institution emits exactly
    # these keys for every record.
    fieldnames = [
        'name',
        'alternative_name',
        'status',
        'description',
        'region',
        'canton',
        'categories',
        'isil_code',
        'institution_type',
        'street',
        'postal_code',
        'city',
        'phone',
        'email',
        'website',
        'opening_hours',
        'memberships',
        'dewey_classifications',
        'detail_url',
        'merged_into_name',
        'merged_into_url',
    ]

    # Write CSV (newline='' lets the csv module control line endings)
    print(f"Writing CSV to {csv_file}")
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(flattened)

    total = len(flattened)
    print(f"✓ Successfully exported {total} institutions to CSV")

    def coverage(field: str) -> str:
        """Format how many rows have a non-empty value in ``field``."""
        count = sum(1 for row in flattened if row[field])
        # An empty export has no meaningful ratio; report 0.0% instead of
        # dividing by zero.
        percent = count / total * 100 if total else 0.0
        return f"{count} ({percent:.1f}%)"

    # Print summary statistics
    print("\nExport Summary:")
    print(f"  Total records: {total}")
    print(f"  With ISIL codes: {coverage('isil_code')}")
    print(f"  With email: {coverage('email')}")
    print(f"  With website: {coverage('website')}")

def main(
    base_dir: Path = Path("/Users/kempersc/apps/glam/data/isil/switzerland"),
):
    """Run the Swiss ISIL JSON-to-CSV export for one data directory.

    Reads ``swiss_isil_complete_final.json`` from ``base_dir`` and writes
    ``swiss_isil_complete.csv`` next to it. If the JSON file does not exist,
    an error message is printed and nothing is written.

    Args:
        base_dir: Directory containing the input JSON and receiving the CSV.
            May be a ``Path`` or a string. Defaults to the original
            hard-coded project location, so calling ``main()`` with no
            arguments behaves as before.
    """
    base_dir = Path(base_dir)
    json_file = base_dir / "swiss_isil_complete_final.json"
    csv_file = base_dir / "swiss_isil_complete.csv"

    if not json_file.exists():
        print(f"Error: JSON file not found at {json_file}")
        return

    export_to_csv(json_file, csv_file)
    print("\n✓ CSV export complete!")
    print(f"  Output file: {csv_file}")

# Run the export only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()