# glam/scripts/scrapers/scrape_switzerland_isil.py
# Snapshot: 2025-11-19 23:25:22 +01:00 — 453 lines, 18 KiB, Python
#!/usr/bin/env python3
"""
Swiss ISIL Database Scraper
Scrapes complete heritage institution data from https://www.isil.nb.admin.ch/en/
Author: GLAM Data Extraction Project
Date: November 2025
"""
import requests
from bs4 import BeautifulSoup
import json
import time
import re
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
import logging
# --- Logging setup -----------------------------------------------------------
# Log to both a file alongside the scraped data and the console.
_LOG_FILE = Path('/Users/kempersc/apps/glam/data/isil/switzerland/scraper.log')
# Fix: logging.FileHandler raises FileNotFoundError when the parent
# directory is missing (e.g. on a fresh checkout) — create it up front.
_LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(str(_LOG_FILE)),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
class SwissISILScraper:
    """Scraper for Swiss National Library ISIL directory"""

    # Directory root of the ISIL service; listing pages hang off /en/.
    BASE_URL = "https://www.isil.nb.admin.ch"
    LIST_URL = f"{BASE_URL}/en/"
    # Local destination for all JSON output and reports.
    OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")

    def __init__(self):
        """Prepare an HTTP session with polite headers and empty run state."""
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Research Project)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })
        self.session = session
        # Accumulated institution dicts across all listing pages.
        self.institutions = []
        # Run statistics, serialized to JSON by save_results().
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'pages_scraped': 0,
            'institutions_found': 0,
            'detail_pages_scraped': 0,
            'errors': [],
        }
def fetch_page(self, url: str, retry_count: int = 3) -> Optional[str]:
"""Fetch a page with retry logic"""
for attempt in range(retry_count):
try:
response = self.session.get(url, timeout=30)
response.raise_for_status()
return response.text
except Exception as e:
logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
if attempt < retry_count - 1:
time.sleep(2 ** attempt) # Exponential backoff
else:
self.stats['errors'].append({'url': url, 'error': str(e)})
logger.error(f"Failed to fetch {url} after {retry_count} attempts")
return None
    def parse_list_page(self, html: str) -> List[Dict]:
        """Parse one institution listing page into a list of dicts.

        Each dict may carry: 'detail_url', 'name', 'alternative_name',
        'status' ('active'/'inactive'), 'description', 'region', 'canton',
        'categories', and 'merged_into'.  Cards without a name are dropped.
        Per-card parse errors are logged and recorded in self.stats.
        """
        soup = BeautifulSoup(html, 'html.parser')
        institutions = []
        # Each result card is one <li> in the search-results list.
        institution_cards = soup.select('ul.search-results-list > li')
        logger.info(f"Found {len(institution_cards)} institution cards on page")
        for card in institution_cards:
            try:
                institution = {}
                # Detail page URL (href is site-relative).
                link = card.select_one('a.card__link')
                if link and link.get('href'):
                    institution['detail_url'] = self.BASE_URL + link['href']
                # Institution name lives in the card's <h3>.
                name_elem = card.select_one('h3')
                if name_elem:
                    # Extract primary name and alternative name.
                    name_text = name_elem.get_text(strip=True)
                    # A '/' separates the primary from an alternative name.
                    # NOTE(review): only parts[0] and parts[1] are used, so a
                    # name containing multiple '/' loses the remainder — confirm.
                    if '/' in name_text:
                        parts = name_text.split('/')
                        institution['name'] = parts[0].strip()
                        institution['alternative_name'] = parts[1].strip() if len(parts) > 1 else None
                    else:
                        institution['name'] = name_text
                    # Inactive institutions are rendered struck-through.
                    if name_elem.select_one('.line-through'):
                        institution['status'] = 'inactive'
                    else:
                        institution['status'] = 'active'
                # Short description text.
                desc_elem = card.select_one('.leading-snug')
                if desc_elem:
                    institution['description'] = desc_elem.get_text(strip=True)
                # Location: "region, canton" or a bare canton string.
                location_elem = card.select_one('span.inline-block.text-base.text-gray-500.font-bold.mr-4')
                if location_elem:
                    location_text = location_elem.get_text(strip=True)
                    if ',' in location_text:
                        parts = location_text.split(',')
                        institution['region'] = parts[0].strip()
                        institution['canton'] = parts[1].strip()
                    else:
                        institution['canton'] = location_text
                # Institution categories/types (badge spans on the card).
                categories = []
                category_elems = card.select('span.inline-flex.items-center.mr-4 span.inline-block.text-gray-500.text-base.font-bold')
                for cat in category_elems:
                    categories.append(cat.get_text(strip=True))
                if categories:
                    institution['categories'] = categories
                # Merged-institution info: link to the absorbing institution.
                merged_elem = card.select_one('p.text-base a')
                if merged_elem:
                    institution['merged_into'] = {
                        'name': merged_elem.get_text(strip=True),
                        'url': self.BASE_URL + merged_elem['href'] if merged_elem.get('href') else None
                    }
                # Keep only cards where a name could be extracted.
                if institution.get('name'):
                    institutions.append(institution)
            except Exception as e:
                logger.error(f"Error parsing institution card: {e}")
                self.stats['errors'].append({'context': 'parse_card', 'error': str(e)})
        return institutions
    def parse_detail_page(self, html: str) -> Dict:
        """Parse an institution detail page into a dict of extra fields.

        May yield: 'isil_code', 'address', 'contact', 'institution_type',
        'opening_hours', 'memberships', 'dewey_classifications'.  Labels
        are matched in English and German since the site is multilingual.
        Any parse error is logged and recorded; partial details are returned.
        """
        soup = BeautifulSoup(html, 'html.parser')
        details = {}
        try:
            # Swiss ISIL codes start with "CH-"; try a CSS match first.
            isil_elem = soup.select_one('dd:-soup-contains("CH-")')
            if not isil_elem:
                # Fallback: scan every <dd> for a CH- prefixed value.
                for dd in soup.select('dd'):
                    text = dd.get_text(strip=True)
                    if text.startswith('CH-'):
                        isil_elem = dd
                        break
            if isil_elem:
                details['isil_code'] = isil_elem.get_text(strip=True)
            # --- Address -----------------------------------------------------
            address_parts = {}
            # NOTE(review): these :-soup-contains selectors match <dd> VALUES
            # that literally contain "Street"/"Postal code"/"City" — verify
            # against the live markup; the <dt>-label fallback further down
            # may be the path that actually fires.
            street_elem = soup.select_one('dd:-soup-contains("Street")')
            if street_elem:
                address_parts['street'] = street_elem.get_text(strip=True)
            # Postal code and city.
            postal_elem = soup.select_one('dd:-soup-contains("Postal code")')
            if postal_elem:
                address_parts['postal_code'] = postal_elem.get_text(strip=True)
            city_elem = soup.select_one('dd:-soup-contains("City")')
            if city_elem:
                address_parts['city'] = city_elem.get_text(strip=True)
            # Fallback: a labelled Address/Adresse definition-list entry.
            address_section = soup.find('dt', string=re.compile(r'Address|Adresse'))
            if address_section:
                dd = address_section.find_next('dd')
                if dd:
                    # Join the <dd>'s text nodes with '|' so lines can be split.
                    address_text = dd.get_text(separator='|', strip=True)
                    lines = address_text.split('|')
                    if len(lines) >= 2:
                        address_parts['street'] = lines[0].strip()
                    # Parse postal code and city from the second line.
                    # NOTE(review): this only runs with >= 3 lines, so a
                    # two-line "street / postal+city" address skips postal
                    # parsing — confirm whether that is intentional.
                    if len(lines) >= 3:
                        postal_city = lines[1].strip()
                        # Swiss postal codes are exactly four digits.
                        match = re.match(r'(\d{4})\s+(.+)', postal_city)
                        if match:
                            address_parts['postal_code'] = match.group(1)
                            address_parts['city'] = match.group(2)
            if address_parts:
                details['address'] = address_parts
            # --- Contact information -----------------------------------------
            contact = {}
            # Phone number: value of the <dd> following the Phone/Telefon label.
            phone_elem = soup.find('dt', string=re.compile(r'Phone|Telefon'))
            if phone_elem:
                dd = phone_elem.find_next('dd')
                if dd:
                    contact['phone'] = dd.get_text(strip=True)
            # Email: first mailto: link anywhere on the page.
            email_elem = soup.find('a', href=re.compile(r'^mailto:'))
            if email_elem:
                contact['email'] = email_elem.get_text(strip=True)
            # Website: the anchor inside the Website/Homepage entry.
            website_elem = soup.find('dt', string=re.compile(r'Website|Homepage'))
            if website_elem:
                dd = website_elem.find_next('dd')
                if dd:
                    link = dd.find('a')
                    if link:
                        contact['website'] = link.get('href')
            if contact:
                details['contact'] = contact
            # Institution type/category.
            type_elem = soup.find('dt', string=re.compile(r'Institution type|Type'))
            if type_elem:
                dd = type_elem.find_next('dd')
                if dd:
                    details['institution_type'] = dd.get_text(strip=True)
            # Opening hours, flattened to a single ' | '-separated string.
            hours_elem = soup.find('dt', string=re.compile(r'Opening hours|Öffnungszeiten'))
            if hours_elem:
                dd = hours_elem.find_next('dd')
                if dd:
                    details['opening_hours'] = dd.get_text(separator=' | ', strip=True)
            # Membership information (badges or links under the entry).
            memberships = []
            member_section = soup.find('dt', string=re.compile(r'Member of|Mitglied'))
            if member_section:
                dd = member_section.find_next('dd')
                if dd:
                    member_tags = dd.select('span.badge, a')
                    for tag in member_tags:
                        memberships.append(tag.get_text(strip=True))
            if memberships:
                details['memberships'] = memberships
            # Dewey classification / subject areas.
            dewey = []
            dewey_section = soup.find('dt', string=re.compile(r'Dewey|Subject area'))
            if dewey_section:
                dd = dewey_section.find_next('dd')
                if dd:
                    dewey_tags = dd.select('span.badge, a')
                    for tag in dewey_tags:
                        dewey.append(tag.get_text(strip=True))
            if dewey:
                details['dewey_classifications'] = dewey
        except Exception as e:
            logger.error(f"Error parsing detail page: {e}")
            self.stats['errors'].append({'context': 'parse_detail', 'error': str(e)})
        return details
def scrape_institution_details(self, institution: Dict) -> Dict:
"""Scrape detailed information for a single institution"""
if not institution.get('detail_url'):
return institution
logger.info(f"Fetching details for: {institution.get('name')}")
html = self.fetch_page(institution['detail_url'])
if html:
details = self.parse_detail_page(html)
institution.update(details)
self.stats['detail_pages_scraped'] += 1
time.sleep(0.5) # Rate limiting
return institution
    def scrape_all_pages(self, max_pages: Optional[int] = None):
        """Walk the paginated listing, accumulating into self.institutions.

        Stops when *max_pages* is reached, a page fails to fetch, a page
        yields no institutions, or the site's last page is detected.
        """
        page = 1
        while True:
            if max_pages and page > max_pages:
                break
            url = f"{self.LIST_URL}?page={page}"
            logger.info(f"Scraping page {page}: {url}")
            html = self.fetch_page(url)
            if not html:
                logger.error(f"Failed to fetch page {page}")
                break
            institutions = self.parse_list_page(html)
            if not institutions:
                # An empty page is treated as the end of the listing.
                logger.info(f"No institutions found on page {page}, stopping")
                break
            logger.info(f"Found {len(institutions)} institutions on page {page}")
            self.institutions.extend(institutions)
            self.stats['pages_scraped'] += 1
            self.stats['institutions_found'] += len(institutions)
            # Check if there's a "Next" pagination link.
            # NOTE(review): the 96-page floor is a snapshot of the site at
            # scrape time; if the directory shrinks, a missing Next link
            # before page 96 will NOT stop the loop (the empty-page check
            # above is the actual terminator) — confirm this is intended.
            soup = BeautifulSoup(html, 'html.parser')
            next_link = soup.select_one('a[href*="page="]:-soup-contains("Next")')
            if not next_link and page >= 96: # We know there are 96 pages
                logger.info("Reached last page")
                break
            page += 1
            time.sleep(1) # Rate limiting between pages
        logger.info(f"Completed scraping {self.stats['pages_scraped']} pages")
        logger.info(f"Total institutions collected: {self.stats['institutions_found']}")
def scrape_all_details(self):
"""Scrape detailed information for all institutions"""
logger.info(f"Starting detailed scrape for {len(self.institutions)} institutions")
for i, institution in enumerate(self.institutions, 1):
logger.info(f"Processing institution {i}/{len(self.institutions)}")
self.scrape_institution_details(institution)
# Save intermediate results every 50 institutions
if i % 50 == 0:
self.save_results(suffix=f"_batch_{i}")
logger.info(f"Completed detailed scrape. Scraped {self.stats['detail_pages_scraped']} detail pages")
def save_results(self, suffix: str = ""):
"""Save scraped data to JSON files"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Save institutions data
output_file = self.OUTPUT_DIR / f"swiss_isil_complete{suffix}.json"
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(self.institutions, f, ensure_ascii=False, indent=2)
logger.info(f"Saved {len(self.institutions)} institutions to {output_file}")
# Save statistics
self.stats['end_time'] = datetime.now().isoformat()
self.stats['total_institutions'] = len(self.institutions)
stats_file = self.OUTPUT_DIR / f"scraping_stats_{timestamp}.json"
with open(stats_file, 'w', encoding='utf-8') as f:
json.dump(self.stats, f, ensure_ascii=False, indent=2)
logger.info(f"Saved statistics to {stats_file}")
# Generate summary report
self.generate_report()
def generate_report(self):
"""Generate a summary report of the scraped data"""
report = []
report.append("=" * 80)
report.append("SWISS ISIL DATABASE SCRAPING REPORT")
report.append("=" * 80)
report.append(f"Scraping started: {self.stats.get('start_time', 'N/A')}")
report.append(f"Scraping ended: {self.stats.get('end_time', 'N/A')}")
report.append(f"Pages scraped: {self.stats['pages_scraped']}")
report.append(f"Institutions found: {self.stats['institutions_found']}")
report.append(f"Detail pages scraped: {self.stats['detail_pages_scraped']}")
report.append(f"Errors encountered: {len(self.stats['errors'])}")
report.append("")
# Count by status
active = sum(1 for i in self.institutions if i.get('status') == 'active')
inactive = sum(1 for i in self.institutions if i.get('status') == 'inactive')
report.append(f"Active institutions: {active}")
report.append(f"Inactive institutions: {inactive}")
report.append("")
# Count by canton
canton_counts = {}
for inst in self.institutions:
canton = inst.get('canton', 'Unknown')
canton_counts[canton] = canton_counts.get(canton, 0) + 1
report.append("Institutions by Canton:")
for canton, count in sorted(canton_counts.items(), key=lambda x: x[1], reverse=True):
report.append(f" {canton}: {count}")
report.append("")
# Count by institution type
type_counts = {}
for inst in self.institutions:
categories = inst.get('categories', [])
for cat in categories:
type_counts[cat] = type_counts.get(cat, 0) + 1
report.append("Institutions by Type:")
for cat, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True)[:20]:
report.append(f" {cat}: {count}")
report.append("")
# ISIL codes found
with_isil = sum(1 for i in self.institutions if i.get('isil_code'))
report.append(f"Institutions with ISIL codes: {with_isil}")
report.append("")
report.append("=" * 80)
report_text = "\n".join(report)
print("\n" + report_text)
# Save report to file
report_file = self.OUTPUT_DIR / f"scraping_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
with open(report_file, 'w', encoding='utf-8') as f:
f.write(report_text)
logger.info(f"Saved report to {report_file}")
def main():
    """Run the full scrape: listings first, then per-institution details."""
    logger.info("Starting Swiss ISIL database scraper")
    scraper = SwissISILScraper()

    # Step 1: collect every institution from the paginated listing.
    logger.info("Step 1: Scraping institution listings")
    scraper.scrape_all_pages()
    # Checkpoint the listing-only data before the long detail phase.
    scraper.save_results(suffix="_listings_only")

    # Step 2: enrich each institution from its detail page.
    logger.info("Step 2: Scraping detailed information for each institution")
    scraper.scrape_all_details()
    scraper.save_results()
    logger.info("Scraping complete!")


if __name__ == "__main__":
    main()