glam/scripts/export_switzerland_csv.py
2025-11-19 23:25:22 +01:00

147 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""
Export Swiss ISIL data to CSV format.

Converts the JSON scraping output to a flat CSV structure.
Author: GLAM Data Extraction Project
Date: November 2025
"""
import json
import csv
from pathlib import Path
from typing import List, Dict, Any
def _text(value: Any) -> str:
    """Return *value* as a string, mapping ``None`` (JSON ``null``) to ''."""
    if value is None:
        return ''
    return value if isinstance(value, str) else str(value)


def _joined(values: Any) -> str:
    """Join a multi-valued field with '; '.

    Accepts a list/tuple of items, a single scalar, or an empty/missing
    value.  ``None`` items are skipped and non-string items are converted
    with ``str`` so that ``str.join`` cannot raise ``TypeError``.
    """
    if not values:
        return ''
    if not isinstance(values, (list, tuple)):
        # A bare string would otherwise be joined character by character.
        values = [values]
    return '; '.join(_text(item) for item in values if item is not None)


def flatten_institution(inst: Dict[str, Any]) -> Dict[str, str]:
    """Flatten nested institution data for CSV export.

    Args:
        inst: One institution record.  The nested ``address``, ``contact``
            and ``merged_into`` entries are read as dicts; ``categories``,
            ``memberships`` and ``dewey_classifications`` are read as
            sequences.  Any key may be absent or ``None``.

    Returns:
        A dict with one string value per CSV column.  Missing or ``None``
        values become '' and multi-valued fields are joined with '; '.

    Raises:
        AttributeError: If a nested entry is present, truthy and not a
            dict (e.g. ``address`` given as a plain string).
    """
    # ``or {}`` (rather than a ``.get`` default) also covers an explicit
    # JSON null, for which ``.get`` would return None instead of the default.
    address = inst.get('address') or {}
    contact = inst.get('contact') or {}
    merged = inst.get('merged_into') or {}

    return {
        # Basic information
        'name': _text(inst.get('name')),
        'alternative_name': _text(inst.get('alternative_name')),
        'status': _text(inst.get('status')),
        'description': _text(inst.get('description')),
        # Location
        'region': _text(inst.get('region')),
        'canton': _text(inst.get('canton')),
        # Categories (join multiple with semicolon)
        'categories': _joined(inst.get('categories')),
        # ISIL code
        'isil_code': _text(inst.get('isil_code')),
        # Institution type (from detail page)
        'institution_type': _text(inst.get('institution_type')),
        # Address
        'street': _text(address.get('street')),
        'postal_code': _text(address.get('postal_code')),
        'city': _text(address.get('city')),
        # Contact
        'phone': _text(contact.get('phone')),
        'email': _text(contact.get('email')),
        'website': _text(contact.get('website')),
        # Opening hours
        'opening_hours': _text(inst.get('opening_hours')),
        # Memberships / Dewey classifications (join multiple with semicolon)
        'memberships': _joined(inst.get('memberships')),
        'dewey_classifications': _joined(inst.get('dewey_classifications')),
        # URLs
        'detail_url': _text(inst.get('detail_url')),
        # Merged institution info (empty strings when not merged)
        'merged_into_name': _text(merged.get('name')),
        'merged_into_url': _text(merged.get('url')),
    }
def export_to_csv(json_file: Path, csv_file: Path):
    """Export Swiss ISIL JSON data to CSV.

    Loads ``json_file``, flattens every record with ``flatten_institution``,
    writes the rows to ``csv_file`` (UTF-8, with a header row) and prints a
    short summary to stdout.

    Args:
        json_file: Path of the JSON input; its top-level value is iterated
            record by record (presumably a list of institution objects).
        csv_file: Path of the CSV file to create or overwrite.
    """
    print(f"Loading data from {json_file}")
    with open(json_file, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    print(f"Loaded {len(institutions)} institutions")

    # Flatten all institutions
    flattened = [flatten_institution(inst) for inst in institutions]

    # Fixed column order of the CSV; flatten_institution emits exactly
    # these keys for every record.
    fieldnames = [
        'name',
        'alternative_name',
        'status',
        'description',
        'region',
        'canton',
        'categories',
        'isil_code',
        'institution_type',
        'street',
        'postal_code',
        'city',
        'phone',
        'email',
        'website',
        'opening_hours',
        'memberships',
        'dewey_classifications',
        'detail_url',
        'merged_into_name',
        'merged_into_url',
    ]

    # Write CSV (newline='' lets the csv module control line endings)
    print(f"Writing CSV to {csv_file}")
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(flattened)
    print(f"✓ Successfully exported {len(flattened)} institutions to CSV")

    # Print summary statistics
    total = len(flattened)

    def percent(count: int) -> float:
        # An empty export must not fail with ZeroDivisionError after the
        # file has already been written; report 0.0% instead.
        return count / total * 100 if total else 0.0

    with_isil = sum(1 for row in flattened if row['isil_code'])
    with_email = sum(1 for row in flattened if row['email'])
    with_website = sum(1 for row in flattened if row['website'])
    print("\nExport Summary:")
    print(f" Total records: {total}")
    print(f" With ISIL codes: {with_isil} ({percent(with_isil):.1f}%)")
    print(f" With email: {with_email} ({percent(with_email):.1f}%)")
    print(f" With website: {with_website} ({percent(with_website):.1f}%)")
def main(argv=None):
    """Main export function.

    Converts the Swiss ISIL JSON file to CSV.  Both paths can be overridden
    on the command line; without arguments the original default locations
    are used.

    Args:
        argv: Optional list of command-line arguments (without the program
            name).  ``None`` makes argparse read ``sys.argv[1:]``.

    Returns:
        None.  If the JSON input does not exist, an error message is
        printed and nothing is exported.
    """
    # Local import keeps this block self-contained without touching the
    # module-level import list.
    import argparse

    base_dir = Path("/Users/kempersc/apps/glam/data/isil/switzerland")

    parser = argparse.ArgumentParser(
        description="Export Swiss ISIL JSON data to CSV",
    )
    parser.add_argument(
        "--json-file",
        type=Path,
        default=base_dir / "swiss_isil_complete_final.json",
        help="JSON input file (default: %(default)s)",
    )
    parser.add_argument(
        "--csv-file",
        type=Path,
        default=base_dir / "swiss_isil_complete.csv",
        help="CSV output file (default: %(default)s)",
    )
    args = parser.parse_args(argv)
    json_file = args.json_file
    csv_file = args.csv_file

    if not json_file.exists():
        print(f"Error: JSON file not found at {json_file}")
        return

    export_to_csv(json_file, csv_file)
    print("\n✓ CSV export complete!")
    print(f" Output file: {csv_file}")
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()