#!/usr/bin/env python3
"""
Swiss ISIL Database Scraper
Scrapes complete heritage institution data from https://www.isil.nb.admin.ch/en/

Author: GLAM Data Extraction Project
Date: November 2025
"""

import json
import logging
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Log to both a file and the console. The log directory is created up front
# so the FileHandler does not crash on a machine where it does not yet exist.
_LOG_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
_LOG_DIR.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(_LOG_DIR / 'scraper.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class SwissISILScraper:
    """Scraper for the Swiss National Library ISIL directory.

    Workflow: scrape the paginated listing (``scrape_all_pages``), then visit
    each institution's detail page (``scrape_all_details``), and finally dump
    JSON results plus a human-readable report (``save_results``).
    """

    BASE_URL = "https://www.isil.nb.admin.ch"
    LIST_URL = f"{BASE_URL}/en/"
    OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
    # Known number of listing pages on the site, used as a fallback stop
    # condition when the "Next" pagination link cannot be detected.
    # NOTE(review): this was hard-coded inline as 96 — verify it still matches
    # the live site before long runs.
    KNOWN_PAGE_COUNT = 96

    def __init__(self):
        # Reuse one HTTP session (connection pooling + shared headers).
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Research Project)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })
        # Accumulated institution dicts (listing fields + detail fields).
        self.institutions = []
        # Run statistics, serialized alongside the scraped data.
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'pages_scraped': 0,
            'institutions_found': 0,
            'detail_pages_scraped': 0,
            'errors': []
        }

    def fetch_page(self, url: str, retry_count: int = 3) -> Optional[str]:
        """Fetch a page, retrying with exponential backoff.

        Returns the response body as text, or None after ``retry_count``
        failed attempts (the failure is recorded in ``self.stats['errors']``).
        """
        for attempt in range(retry_count):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return response.text
            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < retry_count - 1:
                    time.sleep(2 ** attempt)  # Exponential backoff
                else:
                    self.stats['errors'].append({'url': url, 'error': str(e)})
                    logger.error(f"Failed to fetch {url} after {retry_count} attempts")
        return None

    def parse_list_page(self, html: str) -> List[Dict]:
        """Parse one institution-listing page into a list of dicts.

        Each dict carries the fields visible on the listing card: name,
        optional alternative name, active/inactive status, description,
        region/canton, categories, merged-into link, and the detail URL.
        """
        soup = BeautifulSoup(html, 'html.parser')
        institutions = []

        # Find all institution cards
        institution_cards = soup.select('ul.search-results-list > li')
        logger.info(f"Found {len(institution_cards)} institution cards on page")

        for card in institution_cards:
            try:
                institution = {}

                # Get detail page URL. urljoin handles both relative and
                # absolute hrefs (plain concatenation would corrupt the latter).
                link = card.select_one('a.card__link')
                if link and link.get('href'):
                    institution['detail_url'] = urljoin(self.BASE_URL, link['href'])

                # Get institution name
                name_elem = card.select_one('h3')
                if name_elem:
                    name_text = name_elem.get_text(strip=True)
                    # A "/" separates the primary name from an alternative
                    # name. Split only once so names containing further
                    # slashes keep their full alternative part.
                    if '/' in name_text:
                        primary, alternative = name_text.split('/', 1)
                        institution['name'] = primary.strip()
                        institution['alternative_name'] = alternative.strip()
                    else:
                        institution['name'] = name_text

                    # Inactive institutions are rendered struck-through.
                    if name_elem.select_one('.line-through'):
                        institution['status'] = 'inactive'
                    else:
                        institution['status'] = 'active'

                # Get description
                desc_elem = card.select_one('.leading-snug')
                if desc_elem:
                    institution['description'] = desc_elem.get_text(strip=True)

                # Get location (region and canton)
                location_elem = card.select_one('span.inline-block.text-base.text-gray-500.font-bold.mr-4')
                if location_elem:
                    location_text = location_elem.get_text(strip=True)
                    if ',' in location_text:
                        parts = location_text.split(',')
                        institution['region'] = parts[0].strip()
                        institution['canton'] = parts[1].strip()
                    else:
                        institution['canton'] = location_text

                # Get institution categories/types
                categories = []
                category_elems = card.select('span.inline-flex.items-center.mr-4 span.inline-block.text-gray-500.text-base.font-bold')
                for cat in category_elems:
                    categories.append(cat.get_text(strip=True))
                if categories:
                    institution['categories'] = categories

                # Get merged institution info (shown for absorbed institutions)
                merged_elem = card.select_one('p.text-base a')
                if merged_elem:
                    institution['merged_into'] = {
                        'name': merged_elem.get_text(strip=True),
                        'url': urljoin(self.BASE_URL, merged_elem['href']) if merged_elem.get('href') else None
                    }

                if institution.get('name'):
                    institutions.append(institution)

            except Exception as e:
                logger.error(f"Error parsing institution card: {e}")
                self.stats['errors'].append({'context': 'parse_card', 'error': str(e)})

        return institutions

    def parse_detail_page(self, html: str) -> Dict:
        """Parse an institution detail page into a dict of extra fields.

        Extracts ISIL code, postal address, contact info, institution type,
        opening hours, memberships and Dewey classifications. All selectors
        are best-effort: missing sections simply produce no key.
        """
        soup = BeautifulSoup(html, 'html.parser')
        details = {}

        try:
            # Get ISIL code from page (Swiss codes are prefixed "CH-")
            isil_elem = soup.select_one('dd:-soup-contains("CH-")')
            if not isil_elem:
                # Fallback: scan all <dd> elements for a CH- prefix
                for dd in soup.select('dd'):
                    text = dd.get_text(strip=True)
                    if text.startswith('CH-'):
                        isil_elem = dd
                        break
            if isil_elem:
                details['isil_code'] = isil_elem.get_text(strip=True)

            # Get full address
            address_parts = {}

            # Street address
            street_elem = soup.select_one('dd:-soup-contains("Street")')
            if street_elem:
                address_parts['street'] = street_elem.get_text(strip=True)

            # Postal code and city
            postal_elem = soup.select_one('dd:-soup-contains("Postal code")')
            if postal_elem:
                address_parts['postal_code'] = postal_elem.get_text(strip=True)
            city_elem = soup.select_one('dd:-soup-contains("City")')
            if city_elem:
                address_parts['city'] = city_elem.get_text(strip=True)

            # Try the dedicated address section (English or German label);
            # its lines override the selector-based guesses above.
            address_section = soup.find('dt', string=re.compile(r'Address|Adresse'))
            if address_section:
                dd = address_section.find_next('dd')
                if dd:
                    address_text = dd.get_text(separator='|', strip=True)
                    lines = address_text.split('|')
                    if len(lines) >= 2:
                        address_parts['street'] = lines[0].strip()
                        if len(lines) >= 3:
                            # Swiss postal codes are four digits before the city.
                            postal_city = lines[1].strip()
                            match = re.match(r'(\d{4})\s+(.+)', postal_city)
                            if match:
                                address_parts['postal_code'] = match.group(1)
                                address_parts['city'] = match.group(2)

            if address_parts:
                details['address'] = address_parts

            # Get contact information
            contact = {}

            # Phone
            phone_elem = soup.find('dt', string=re.compile(r'Phone|Telefon'))
            if phone_elem:
                dd = phone_elem.find_next('dd')
                if dd:
                    contact['phone'] = dd.get_text(strip=True)

            # Email (first mailto: link on the page)
            email_elem = soup.find('a', href=re.compile(r'^mailto:'))
            if email_elem:
                contact['email'] = email_elem.get_text(strip=True)

            # Website
            website_elem = soup.find('dt', string=re.compile(r'Website|Homepage'))
            if website_elem:
                dd = website_elem.find_next('dd')
                if dd:
                    link = dd.find('a')
                    if link:
                        contact['website'] = link.get('href')

            if contact:
                details['contact'] = contact

            # Get institution type/category
            type_elem = soup.find('dt', string=re.compile(r'Institution type|Type'))
            if type_elem:
                dd = type_elem.find_next('dd')
                if dd:
                    details['institution_type'] = dd.get_text(strip=True)

            # Get opening hours
            hours_elem = soup.find('dt', string=re.compile(r'Opening hours|Öffnungszeiten'))
            if hours_elem:
                dd = hours_elem.find_next('dd')
                if dd:
                    details['opening_hours'] = dd.get_text(separator=' | ', strip=True)

            # Get membership information
            memberships = []
            member_section = soup.find('dt', string=re.compile(r'Member of|Mitglied'))
            if member_section:
                dd = member_section.find_next('dd')
                if dd:
                    member_tags = dd.select('span.badge, a')
                    for tag in member_tags:
                        memberships.append(tag.get_text(strip=True))
            if memberships:
                details['memberships'] = memberships

            # Get Dewey classification
            dewey = []
            dewey_section = soup.find('dt', string=re.compile(r'Dewey|Subject area'))
            if dewey_section:
                dd = dewey_section.find_next('dd')
                if dd:
                    dewey_tags = dd.select('span.badge, a')
                    for tag in dewey_tags:
                        dewey.append(tag.get_text(strip=True))
            if dewey:
                details['dewey_classifications'] = dewey

        except Exception as e:
            logger.error(f"Error parsing detail page: {e}")
            self.stats['errors'].append({'context': 'parse_detail', 'error': str(e)})

        return details

    def scrape_institution_details(self, institution: Dict) -> Dict:
        """Fetch and merge detail-page fields into one institution dict.

        Mutates and returns ``institution``. No-op if it has no detail URL.
        """
        if not institution.get('detail_url'):
            return institution

        logger.info(f"Fetching details for: {institution.get('name')}")
        html = self.fetch_page(institution['detail_url'])
        if html:
            details = self.parse_detail_page(html)
            institution.update(details)
            self.stats['detail_pages_scraped'] += 1

        time.sleep(0.5)  # Rate limiting
        return institution

    def scrape_all_pages(self, max_pages: Optional[int] = None):
        """Walk the paginated listing, accumulating into self.institutions.

        Stops on fetch failure, an empty page, ``max_pages`` (if given), or
        the KNOWN_PAGE_COUNT fallback when no "Next" link is detected.
        """
        page = 1
        while True:
            if max_pages and page > max_pages:
                break

            url = f"{self.LIST_URL}?page={page}"
            logger.info(f"Scraping page {page}: {url}")

            html = self.fetch_page(url)
            if not html:
                logger.error(f"Failed to fetch page {page}")
                break

            institutions = self.parse_list_page(html)
            if not institutions:
                logger.info(f"No institutions found on page {page}, stopping")
                break

            logger.info(f"Found {len(institutions)} institutions on page {page}")
            self.institutions.extend(institutions)
            self.stats['pages_scraped'] += 1
            self.stats['institutions_found'] += len(institutions)

            # Check if there's a next page; fall back to the known page count
            # because the "Next" link selector may not match the live markup.
            soup = BeautifulSoup(html, 'html.parser')
            next_link = soup.select_one('a[href*="page="]:-soup-contains("Next")')
            if not next_link and page >= self.KNOWN_PAGE_COUNT:
                logger.info("Reached last page")
                break

            page += 1
            time.sleep(1)  # Rate limiting between pages

        logger.info(f"Completed scraping {self.stats['pages_scraped']} pages")
        logger.info(f"Total institutions collected: {self.stats['institutions_found']}")

    def scrape_all_details(self):
        """Fetch detail pages for every collected institution, in place.

        Saves an intermediate snapshot every 50 institutions so a crash
        mid-run does not lose everything.
        """
        logger.info(f"Starting detailed scrape for {len(self.institutions)} institutions")

        for i, institution in enumerate(self.institutions, 1):
            logger.info(f"Processing institution {i}/{len(self.institutions)}")
            self.scrape_institution_details(institution)

            # Save intermediate results every 50 institutions
            if i % 50 == 0:
                self.save_results(suffix=f"_batch_{i}")

        logger.info(f"Completed detailed scrape. Scraped {self.stats['detail_pages_scraped']} detail pages")

    def save_results(self, suffix: str = ""):
        """Write institutions + run statistics as JSON, then the text report.

        ``suffix`` is appended to the institutions filename (used for
        intermediate batch snapshots).
        """
        # Ensure the output directory exists before writing into it.
        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save institutions data
        output_file = self.OUTPUT_DIR / f"swiss_isil_complete{suffix}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.institutions, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved {len(self.institutions)} institutions to {output_file}")

        # Save statistics
        self.stats['end_time'] = datetime.now().isoformat()
        self.stats['total_institutions'] = len(self.institutions)
        stats_file = self.OUTPUT_DIR / f"scraping_stats_{timestamp}.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved statistics to {stats_file}")

        # Generate summary report
        self.generate_report()

    def generate_report(self):
        """Build, print, and save a plain-text summary of the scraped data."""
        report = []
        report.append("=" * 80)
        report.append("SWISS ISIL DATABASE SCRAPING REPORT")
        report.append("=" * 80)
        report.append(f"Scraping started: {self.stats.get('start_time', 'N/A')}")
        report.append(f"Scraping ended: {self.stats.get('end_time', 'N/A')}")
        report.append(f"Pages scraped: {self.stats['pages_scraped']}")
        report.append(f"Institutions found: {self.stats['institutions_found']}")
        report.append(f"Detail pages scraped: {self.stats['detail_pages_scraped']}")
        report.append(f"Errors encountered: {len(self.stats['errors'])}")
        report.append("")

        # Count by status
        active = sum(1 for i in self.institutions if i.get('status') == 'active')
        inactive = sum(1 for i in self.institutions if i.get('status') == 'inactive')
        report.append(f"Active institutions: {active}")
        report.append(f"Inactive institutions: {inactive}")
        report.append("")

        # Count by canton
        canton_counts = {}
        for inst in self.institutions:
            canton = inst.get('canton', 'Unknown')
            canton_counts[canton] = canton_counts.get(canton, 0) + 1
        report.append("Institutions by Canton:")
        for canton, count in sorted(canton_counts.items(), key=lambda x: x[1], reverse=True):
            report.append(f"  {canton}: {count}")
        report.append("")

        # Count by institution type (top 20 only)
        type_counts = {}
        for inst in self.institutions:
            categories = inst.get('categories', [])
            for cat in categories:
                type_counts[cat] = type_counts.get(cat, 0) + 1
        report.append("Institutions by Type:")
        for cat, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True)[:20]:
            report.append(f"  {cat}: {count}")
        report.append("")

        # ISIL codes found
        with_isil = sum(1 for i in self.institutions if i.get('isil_code'))
        report.append(f"Institutions with ISIL codes: {with_isil}")
        report.append("")
        report.append("=" * 80)

        report_text = "\n".join(report)
        print("\n" + report_text)

        # Save report to file
        report_file = self.OUTPUT_DIR / f"scraping_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(report_text)
        logger.info(f"Saved report to {report_file}")


def main():
    """Main execution function: listings first, then detail pages."""
    logger.info("Starting Swiss ISIL database scraper")

    scraper = SwissISILScraper()

    # Step 1: Scrape all listing pages
    logger.info("Step 1: Scraping institution listings")
    scraper.scrape_all_pages()

    # Save intermediate results
    scraper.save_results(suffix="_listings_only")

    # Step 2: Scrape detail pages for each institution
    logger.info("Step 2: Scraping detailed information for each institution")
    scraper.scrape_all_details()

    # Save final results
    scraper.save_results()

    logger.info("Scraping complete!")


if __name__ == "__main__":
    main()