#!/usr/bin/env python3
"""
Swiss ISIL Database Scraper

Scrapes complete heritage institution data from https://www.isil.nb.admin.ch/en/

Author: GLAM Data Extraction Project
Date: November 2025
"""
|
|
|
|
import json
import logging
import re
import time
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional

import requests
from bs4 import BeautifulSoup
|
|
|
|
# Setup logging: mirror every message to a logfile and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # NOTE(review): hard-coded absolute path — logging setup (and hence
        # the whole script) fails if this directory does not exist on the
        # current machine.
        logging.FileHandler('/Users/kempersc/apps/glam/data/isil/switzerland/scraper.log'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)
|
|
|
|
class SwissISILScraper:
    """Scraper for Swiss National Library ISIL directory"""

    BASE_URL = "https://www.isil.nb.admin.ch"
    LIST_URL = f"{BASE_URL}/en/"
    # NOTE(review): hard-coded output location; all artifacts land here.
    OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")

    def __init__(self):
        """Prepare the HTTP session, result accumulator and run statistics."""
        # Identify ourselves politely and prefer HTML responses.
        default_headers = {
            'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Research Project)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        self.session = requests.Session()
        self.session.headers.update(default_headers)

        # One dict per institution, filled in by the parse_* methods.
        self.institutions = []
        # Run bookkeeping for the final report; 'errors' collects
        # {'url'|'context', 'error'} dicts from every failure site.
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'pages_scraped': 0,
            'institutions_found': 0,
            'detail_pages_scraped': 0,
            'errors': [],
        }
|
|
|
|
def fetch_page(self, url: str, retry_count: int = 3) -> Optional[str]:
|
|
"""Fetch a page with retry logic"""
|
|
for attempt in range(retry_count):
|
|
try:
|
|
response = self.session.get(url, timeout=30)
|
|
response.raise_for_status()
|
|
return response.text
|
|
except Exception as e:
|
|
logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
|
|
if attempt < retry_count - 1:
|
|
time.sleep(2 ** attempt) # Exponential backoff
|
|
else:
|
|
self.stats['errors'].append({'url': url, 'error': str(e)})
|
|
logger.error(f"Failed to fetch {url} after {retry_count} attempts")
|
|
return None
|
|
|
|
def parse_list_page(self, html: str) -> List[Dict]:
|
|
"""Parse institution listing page"""
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
institutions = []
|
|
|
|
# Find all institution cards
|
|
institution_cards = soup.select('ul.search-results-list > li')
|
|
|
|
logger.info(f"Found {len(institution_cards)} institution cards on page")
|
|
|
|
for card in institution_cards:
|
|
try:
|
|
institution = {}
|
|
|
|
# Get detail page URL
|
|
link = card.select_one('a.card__link')
|
|
if link and link.get('href'):
|
|
institution['detail_url'] = self.BASE_URL + link['href']
|
|
|
|
# Get institution name
|
|
name_elem = card.select_one('h3')
|
|
if name_elem:
|
|
# Extract primary name and alternative name
|
|
name_text = name_elem.get_text(strip=True)
|
|
# Check for alternative name (indicated by /)
|
|
if '/' in name_text:
|
|
parts = name_text.split('/')
|
|
institution['name'] = parts[0].strip()
|
|
institution['alternative_name'] = parts[1].strip() if len(parts) > 1 else None
|
|
else:
|
|
institution['name'] = name_text
|
|
|
|
# Check if inactive (strikethrough)
|
|
if name_elem.select_one('.line-through'):
|
|
institution['status'] = 'inactive'
|
|
else:
|
|
institution['status'] = 'active'
|
|
|
|
# Get description
|
|
desc_elem = card.select_one('.leading-snug')
|
|
if desc_elem:
|
|
institution['description'] = desc_elem.get_text(strip=True)
|
|
|
|
# Get location (region and canton)
|
|
location_elem = card.select_one('span.inline-block.text-base.text-gray-500.font-bold.mr-4')
|
|
if location_elem:
|
|
location_text = location_elem.get_text(strip=True)
|
|
if ',' in location_text:
|
|
parts = location_text.split(',')
|
|
institution['region'] = parts[0].strip()
|
|
institution['canton'] = parts[1].strip()
|
|
else:
|
|
institution['canton'] = location_text
|
|
|
|
# Get institution categories/types
|
|
categories = []
|
|
category_elems = card.select('span.inline-flex.items-center.mr-4 span.inline-block.text-gray-500.text-base.font-bold')
|
|
for cat in category_elems:
|
|
categories.append(cat.get_text(strip=True))
|
|
if categories:
|
|
institution['categories'] = categories
|
|
|
|
# Get merged institution info
|
|
merged_elem = card.select_one('p.text-base a')
|
|
if merged_elem:
|
|
institution['merged_into'] = {
|
|
'name': merged_elem.get_text(strip=True),
|
|
'url': self.BASE_URL + merged_elem['href'] if merged_elem.get('href') else None
|
|
}
|
|
|
|
if institution.get('name'):
|
|
institutions.append(institution)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error parsing institution card: {e}")
|
|
self.stats['errors'].append({'context': 'parse_card', 'error': str(e)})
|
|
|
|
return institutions
|
|
|
|
    def parse_detail_page(self, html: str) -> Dict:
        """Parse an institution detail page into a dict of attributes.

        Extracts, when present: ISIL code, postal address, contact details,
        institution type, opening hours, memberships and Dewey
        classifications. A parse failure is logged and recorded in
        ``self.stats['errors']``; fields gathered before the failure are
        still returned.
        """
        soup = BeautifulSoup(html, 'html.parser')
        details = {}

        try:
            # Get ISIL code from page.
            # NOTE(review): ':-soup-contains' matches a <dd> whose text merely
            # CONTAINS "CH-"; the fallback loop below re-checks with a stricter
            # startswith test.
            isil_elem = soup.select_one('dd:-soup-contains("CH-")')
            if not isil_elem:
                # Try alternative selection methods
                for dd in soup.select('dd'):
                    text = dd.get_text(strip=True)
                    if text.startswith('CH-'):
                        isil_elem = dd
                        break

            if isil_elem:
                details['isil_code'] = isil_elem.get_text(strip=True)

            # Get full address
            address_parts = {}

            # Street address
            street_elem = soup.select_one('dd:-soup-contains("Street")')
            if street_elem:
                address_parts['street'] = street_elem.get_text(strip=True)

            # Postal code and city
            postal_elem = soup.select_one('dd:-soup-contains("Postal code")')
            if postal_elem:
                address_parts['postal_code'] = postal_elem.get_text(strip=True)

            city_elem = soup.select_one('dd:-soup-contains("City")')
            if city_elem:
                address_parts['city'] = city_elem.get_text(strip=True)

            # Try to find a dedicated address section; values parsed here
            # intentionally overwrite the looser matches above.
            address_section = soup.find('dt', string=re.compile(r'Address|Adresse'))
            if address_section:
                dd = address_section.find_next('dd')
                if dd:
                    # '|' separator keeps the visual line breaks of the block.
                    address_text = dd.get_text(separator='|', strip=True)
                    lines = address_text.split('|')
                    if len(lines) >= 2:
                        address_parts['street'] = lines[0].strip()
                        # Parse postal code and city ("NNNN City" — Swiss
                        # postal codes are 4 digits).
                        if len(lines) >= 3:
                            postal_city = lines[1].strip()
                            match = re.match(r'(\d{4})\s+(.+)', postal_city)
                            if match:
                                address_parts['postal_code'] = match.group(1)
                                address_parts['city'] = match.group(2)

            if address_parts:
                details['address'] = address_parts

            # Get contact information
            contact = {}

            # Phone
            phone_elem = soup.find('dt', string=re.compile(r'Phone|Telefon'))
            if phone_elem:
                dd = phone_elem.find_next('dd')
                if dd:
                    contact['phone'] = dd.get_text(strip=True)

            # Email — first mailto: link anywhere on the page.
            email_elem = soup.find('a', href=re.compile(r'^mailto:'))
            if email_elem:
                contact['email'] = email_elem.get_text(strip=True)

            # Website
            website_elem = soup.find('dt', string=re.compile(r'Website|Homepage'))
            if website_elem:
                dd = website_elem.find_next('dd')
                if dd:
                    link = dd.find('a')
                    if link:
                        contact['website'] = link.get('href')

            if contact:
                details['contact'] = contact

            # Get institution type/category
            type_elem = soup.find('dt', string=re.compile(r'Institution type|Type'))
            if type_elem:
                dd = type_elem.find_next('dd')
                if dd:
                    details['institution_type'] = dd.get_text(strip=True)

            # Get opening hours (one string, lines joined with ' | ')
            hours_elem = soup.find('dt', string=re.compile(r'Opening hours|Öffnungszeiten'))
            if hours_elem:
                dd = hours_elem.find_next('dd')
                if dd:
                    details['opening_hours'] = dd.get_text(separator=' | ', strip=True)

            # Get membership information
            memberships = []
            member_section = soup.find('dt', string=re.compile(r'Member of|Mitglied'))
            if member_section:
                dd = member_section.find_next('dd')
                if dd:
                    member_tags = dd.select('span.badge, a')
                    for tag in member_tags:
                        memberships.append(tag.get_text(strip=True))

            if memberships:
                details['memberships'] = memberships

            # Get Dewey classification
            dewey = []
            dewey_section = soup.find('dt', string=re.compile(r'Dewey|Subject area'))
            if dewey_section:
                dd = dewey_section.find_next('dd')
                if dd:
                    dewey_tags = dd.select('span.badge, a')
                    for tag in dewey_tags:
                        dewey.append(tag.get_text(strip=True))

            if dewey:
                details['dewey_classifications'] = dewey

        except Exception as e:
            logger.error(f"Error parsing detail page: {e}")
            self.stats['errors'].append({'context': 'parse_detail', 'error': str(e)})

        return details
|
|
|
|
def scrape_institution_details(self, institution: Dict) -> Dict:
|
|
"""Scrape detailed information for a single institution"""
|
|
if not institution.get('detail_url'):
|
|
return institution
|
|
|
|
logger.info(f"Fetching details for: {institution.get('name')}")
|
|
html = self.fetch_page(institution['detail_url'])
|
|
|
|
if html:
|
|
details = self.parse_detail_page(html)
|
|
institution.update(details)
|
|
self.stats['detail_pages_scraped'] += 1
|
|
|
|
time.sleep(0.5) # Rate limiting
|
|
return institution
|
|
|
|
def scrape_all_pages(self, max_pages: Optional[int] = None):
|
|
"""Scrape all pages of the institution listing"""
|
|
page = 1
|
|
|
|
while True:
|
|
if max_pages and page > max_pages:
|
|
break
|
|
|
|
url = f"{self.LIST_URL}?page={page}"
|
|
logger.info(f"Scraping page {page}: {url}")
|
|
|
|
html = self.fetch_page(url)
|
|
if not html:
|
|
logger.error(f"Failed to fetch page {page}")
|
|
break
|
|
|
|
institutions = self.parse_list_page(html)
|
|
|
|
if not institutions:
|
|
logger.info(f"No institutions found on page {page}, stopping")
|
|
break
|
|
|
|
logger.info(f"Found {len(institutions)} institutions on page {page}")
|
|
self.institutions.extend(institutions)
|
|
self.stats['pages_scraped'] += 1
|
|
self.stats['institutions_found'] += len(institutions)
|
|
|
|
# Check if there's a next page
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
next_link = soup.select_one('a[href*="page="]:-soup-contains("Next")')
|
|
|
|
if not next_link and page >= 96: # We know there are 96 pages
|
|
logger.info("Reached last page")
|
|
break
|
|
|
|
page += 1
|
|
time.sleep(1) # Rate limiting between pages
|
|
|
|
logger.info(f"Completed scraping {self.stats['pages_scraped']} pages")
|
|
logger.info(f"Total institutions collected: {self.stats['institutions_found']}")
|
|
|
|
def scrape_all_details(self):
|
|
"""Scrape detailed information for all institutions"""
|
|
logger.info(f"Starting detailed scrape for {len(self.institutions)} institutions")
|
|
|
|
for i, institution in enumerate(self.institutions, 1):
|
|
logger.info(f"Processing institution {i}/{len(self.institutions)}")
|
|
self.scrape_institution_details(institution)
|
|
|
|
# Save intermediate results every 50 institutions
|
|
if i % 50 == 0:
|
|
self.save_results(suffix=f"_batch_{i}")
|
|
|
|
logger.info(f"Completed detailed scrape. Scraped {self.stats['detail_pages_scraped']} detail pages")
|
|
|
|
def save_results(self, suffix: str = ""):
|
|
"""Save scraped data to JSON files"""
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
|
|
# Save institutions data
|
|
output_file = self.OUTPUT_DIR / f"swiss_isil_complete{suffix}.json"
|
|
with open(output_file, 'w', encoding='utf-8') as f:
|
|
json.dump(self.institutions, f, ensure_ascii=False, indent=2)
|
|
logger.info(f"Saved {len(self.institutions)} institutions to {output_file}")
|
|
|
|
# Save statistics
|
|
self.stats['end_time'] = datetime.now().isoformat()
|
|
self.stats['total_institutions'] = len(self.institutions)
|
|
stats_file = self.OUTPUT_DIR / f"scraping_stats_{timestamp}.json"
|
|
with open(stats_file, 'w', encoding='utf-8') as f:
|
|
json.dump(self.stats, f, ensure_ascii=False, indent=2)
|
|
logger.info(f"Saved statistics to {stats_file}")
|
|
|
|
# Generate summary report
|
|
self.generate_report()
|
|
|
|
def generate_report(self):
|
|
"""Generate a summary report of the scraped data"""
|
|
report = []
|
|
report.append("=" * 80)
|
|
report.append("SWISS ISIL DATABASE SCRAPING REPORT")
|
|
report.append("=" * 80)
|
|
report.append(f"Scraping started: {self.stats.get('start_time', 'N/A')}")
|
|
report.append(f"Scraping ended: {self.stats.get('end_time', 'N/A')}")
|
|
report.append(f"Pages scraped: {self.stats['pages_scraped']}")
|
|
report.append(f"Institutions found: {self.stats['institutions_found']}")
|
|
report.append(f"Detail pages scraped: {self.stats['detail_pages_scraped']}")
|
|
report.append(f"Errors encountered: {len(self.stats['errors'])}")
|
|
report.append("")
|
|
|
|
# Count by status
|
|
active = sum(1 for i in self.institutions if i.get('status') == 'active')
|
|
inactive = sum(1 for i in self.institutions if i.get('status') == 'inactive')
|
|
report.append(f"Active institutions: {active}")
|
|
report.append(f"Inactive institutions: {inactive}")
|
|
report.append("")
|
|
|
|
# Count by canton
|
|
canton_counts = {}
|
|
for inst in self.institutions:
|
|
canton = inst.get('canton', 'Unknown')
|
|
canton_counts[canton] = canton_counts.get(canton, 0) + 1
|
|
|
|
report.append("Institutions by Canton:")
|
|
for canton, count in sorted(canton_counts.items(), key=lambda x: x[1], reverse=True):
|
|
report.append(f" {canton}: {count}")
|
|
report.append("")
|
|
|
|
# Count by institution type
|
|
type_counts = {}
|
|
for inst in self.institutions:
|
|
categories = inst.get('categories', [])
|
|
for cat in categories:
|
|
type_counts[cat] = type_counts.get(cat, 0) + 1
|
|
|
|
report.append("Institutions by Type:")
|
|
for cat, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True)[:20]:
|
|
report.append(f" {cat}: {count}")
|
|
report.append("")
|
|
|
|
# ISIL codes found
|
|
with_isil = sum(1 for i in self.institutions if i.get('isil_code'))
|
|
report.append(f"Institutions with ISIL codes: {with_isil}")
|
|
report.append("")
|
|
|
|
report.append("=" * 80)
|
|
|
|
report_text = "\n".join(report)
|
|
print("\n" + report_text)
|
|
|
|
# Save report to file
|
|
report_file = self.OUTPUT_DIR / f"scraping_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
|
|
with open(report_file, 'w', encoding='utf-8') as f:
|
|
f.write(report_text)
|
|
logger.info(f"Saved report to {report_file}")
|
|
|
|
def main():
    """Run the two-phase scrape: listings first, then per-institution details."""
    logger.info("Starting Swiss ISIL database scraper")

    scraper = SwissISILScraper()

    # Phase 1: collect every institution stub from the paginated listing.
    logger.info("Step 1: Scraping institution listings")
    scraper.scrape_all_pages()

    # Checkpoint the listing data before the much slower detail phase.
    scraper.save_results(suffix="_listings_only")

    # Phase 2: enrich each institution from its detail page.
    logger.info("Step 2: Scraping detailed information for each institution")
    scraper.scrape_all_details()

    # Save final results
    scraper.save_results()

    logger.info("Scraping complete!")


if __name__ == "__main__":
    main()
|