147 lines
4.6 KiB
Python
147 lines
4.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Export Swiss ISIL data to CSV format.

Converts the JSON scraping output to a flat CSV structure.

Author: GLAM Data Extraction Project
Date: November 2025
|
|
"""
|
|
|
|
import json
|
|
import csv
|
|
from pathlib import Path
|
|
from typing import List, Dict, Any
|
|
|
|
def _text(value: Any) -> str:
    """Return *value* as a CSV-ready string, mapping ``None`` (JSON null) to ''."""
    return '' if value is None else str(value)


def _join_multi(values: Any) -> str:
    """Join a multi-valued field with '; ', tolerating null, scalars and non-string items."""
    if not values:
        return ''
    if isinstance(values, (list, tuple)):
        return '; '.join(_text(item) for item in values)
    # A bare scalar (e.g. a single string) is kept whole instead of being
    # split into characters by str.join.
    return _text(values)


def _section(inst: Dict[str, Any], key: str) -> Dict[str, Any]:
    """Return the nested dict under *key*, or {} if it is missing, null or not a dict."""
    value = inst.get(key)
    return value if isinstance(value, dict) else {}


def flatten_institution(inst: Dict[str, Any]) -> Dict[str, str]:
    """Flatten nested institution data for CSV export.

    Args:
        inst: One institution record as loaded from the scraper's JSON.
            Any key may be absent or null; the nested ``address``,
            ``contact`` and ``merged_into`` entries are read as dicts.

    Returns:
        A dict with the same 21 keys for every record, in CSV column
        order. Every value is a string: missing or null fields become '',
        and list fields (``categories``, ``memberships``,
        ``dewey_classifications``) are joined with '; '.
    """
    # dict.get(key, {}) only covers a *missing* key; a JSON null would come
    # back as None and break the nested lookups, so normalise here.
    address = _section(inst, 'address')
    contact = _section(inst, 'contact')
    merged = _section(inst, 'merged_into')

    return {
        # Basic information
        'name': _text(inst.get('name')),
        'alternative_name': _text(inst.get('alternative_name')),
        'status': _text(inst.get('status')),
        'description': _text(inst.get('description')),
        # Location
        'region': _text(inst.get('region')),
        'canton': _text(inst.get('canton')),
        # Categories (join multiple with semicolon)
        'categories': _join_multi(inst.get('categories')),
        # ISIL code
        'isil_code': _text(inst.get('isil_code')),
        # Institution type (from detail page)
        'institution_type': _text(inst.get('institution_type')),
        # Address
        'street': _text(address.get('street')),
        'postal_code': _text(address.get('postal_code')),
        'city': _text(address.get('city')),
        # Contact
        'phone': _text(contact.get('phone')),
        'email': _text(contact.get('email')),
        'website': _text(contact.get('website')),
        # Opening hours
        'opening_hours': _text(inst.get('opening_hours')),
        # Memberships / Dewey classifications (join multiple with semicolon)
        'memberships': _join_multi(inst.get('memberships')),
        'dewey_classifications': _join_multi(inst.get('dewey_classifications')),
        # URLs
        'detail_url': _text(inst.get('detail_url')),
        # Merged institution info (empty strings when not merged)
        'merged_into_name': _text(merged.get('name')),
        'merged_into_url': _text(merged.get('url')),
    }
|
|
|
|
def export_to_csv(json_file: Path, csv_file: Path) -> None:
    """Export Swiss ISIL JSON data to CSV.

    Loads the institution records from *json_file*, flattens each one with
    ``flatten_institution``, writes them to *csv_file* as UTF-8 with a
    header row, and prints a short summary to stdout.

    Args:
        json_file: Path of the JSON file holding a list of institution dicts.
        csv_file: Path of the CSV file to create (overwritten if present).

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If *json_file* does not contain valid JSON.
    """
    print(f"Loading data from {json_file}")

    with open(json_file, 'r', encoding='utf-8') as f:
        institutions = json.load(f)

    print(f"Loaded {len(institutions)} institutions")

    # Flatten all institutions
    flattened = [flatten_institution(inst) for inst in institutions]

    # Fixed column order of the CSV; these are the keys each flattened
    # record is expected to carry.
    fieldnames = [
        'name',
        'alternative_name',
        'status',
        'description',
        'region',
        'canton',
        'categories',
        'isil_code',
        'institution_type',
        'street',
        'postal_code',
        'city',
        'phone',
        'email',
        'website',
        'opening_hours',
        'memberships',
        'dewey_classifications',
        'detail_url',
        'merged_into_name',
        'merged_into_url',
    ]

    # Write CSV. newline='' lets the csv module control line endings itself.
    print(f"Writing CSV to {csv_file}")
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(flattened)

    total = len(flattened)
    print(f"✓ Successfully exported {total} institutions to CSV")

    def share(count: int) -> float:
        """Return *count* as a percentage of all records (0.0 for an empty export)."""
        # An empty input list is valid; guard so the summary cannot raise
        # ZeroDivisionError after the CSV has already been written.
        return count / total * 100 if total else 0.0

    # Print summary statistics
    with_isil = sum(1 for row in flattened if row['isil_code'])
    with_email = sum(1 for row in flattened if row['email'])
    with_website = sum(1 for row in flattened if row['website'])

    print("\nExport Summary:")
    print(f" Total records: {total}")
    print(f" With ISIL codes: {with_isil} ({share(with_isil):.1f}%)")
    print(f" With email: {with_email} ({share(with_email):.1f}%)")
    print(f" With website: {with_website} ({share(with_website):.1f}%)")
|
|
|
|
def main(base_dir: "Path | str | None" = None) -> None:
    """Run the Swiss ISIL JSON-to-CSV export.

    Args:
        base_dir: Directory that contains ``swiss_isil_complete_final.json``;
            ``swiss_isil_complete.csv`` is written into the same directory.
            When omitted, the original hard-coded project data directory is
            used, so a bare ``main()`` call behaves as before.

    Returns:
        None. If the JSON input is missing, an error is printed and nothing
        is written.
    """
    if base_dir is None:
        # Historical default; machine-specific, so callers on other machines
        # should pass their own directory.
        base_dir = "/Users/kempersc/apps/glam/data/isil/switzerland"
    base_dir = Path(base_dir)

    json_file = base_dir / "swiss_isil_complete_final.json"
    csv_file = base_dir / "swiss_isil_complete.csv"

    if not json_file.exists():
        print(f"Error: JSON file not found at {json_file}")
        return

    export_to_csv(json_file, csv_file)
    print("\n✓ CSV export complete!")
    print(f" Output file: {csv_file}")
|
|
|
|
# Script entry point: run the export only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|