# glam/scripts/scrapers/scrape_switzerland_isil_resume.py
# Retrieved 2025-11-19 23:25:22 +01:00 — 298 lines, 12 KiB, Python
#!/usr/bin/env python3
"""
Swiss ISIL Database Scraper - RESUMABLE VERSION
Continues scraping from where it left off
Author: GLAM Data Extraction Project
Date: November 2025
"""
import requests
from bs4 import BeautifulSoup
import json
import time
import re
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
import logging
import sys
# Logging setup: every message goes both to a persistent log file and to the
# console, with timestamp and severity prefixed.
_handlers = [
    logging.FileHandler('/Users/kempersc/apps/glam/data/isil/switzerland/scraper_resume.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_handlers,
)
logger = logging.getLogger(__name__)
class SwissISILScraperResumable:
    """Resumable scraper for the Swiss National Library ISIL directory.

    Loads previously harvested institution listings, locates the newest
    batch checkpoint on disk, and continues scraping detail pages from
    where the last run stopped.
    """

    BASE_URL = "https://www.isil.nb.admin.ch"
    LIST_URL = f"{BASE_URL}/en/"
    OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
    LISTINGS_FILE = OUTPUT_DIR / "swiss_isil_complete_listings_only.json"

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Research Project)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })
        # Institution dicts to enrich; filled by load_listings() and possibly
        # replaced by find_resume_point() when a checkpoint is adopted.
        self.institutions: List[Dict] = []
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'detail_pages_scraped': 0,
            'errors': []  # accumulated fetch/parse failures
        }

    def load_listings(self) -> List[Dict]:
        """Load institution listings from the existing listings file.

        Exits the process if the file is missing, since the resumable
        scraper cannot work without a prior full listing run.
        """
        if not self.LISTINGS_FILE.exists():
            logger.error(f"Listings file not found: {self.LISTINGS_FILE}")
            logger.error("Please run the full scraper first to generate listings.")
            sys.exit(1)
        logger.info(f"Loading listings from {self.LISTINGS_FILE}")
        with open(self.LISTINGS_FILE, 'r', encoding='utf-8') as f:
            institutions = json.load(f)
        logger.info(f"Loaded {len(institutions)} institutions from listings file")
        return institutions

    def find_resume_point(self) -> int:
        """Return the 0-based index to resume scraping from.

        Inspects the newest ``swiss_isil_complete_batch_N.json`` checkpoint.
        When the checkpoint is verified (institution N already carries an
        ISIL code), its contents replace ``self.institutions`` so the
        details scraped in the previous run survive into later saves, and
        N is returned as the next index to process. Otherwise scraping
        restarts from 0.
        """
        batch_files = list(self.OUTPUT_DIR.glob("swiss_isil_complete_batch_*.json"))
        if not batch_files:
            logger.info("No existing batch files found, starting from beginning")
            return 0
        # The most recently written checkpoint wins.
        latest_batch = max(batch_files, key=lambda p: p.stat().st_mtime)
        logger.info(f"Found latest batch file: {latest_batch.name}")
        match = re.search(r'batch_(\d+)\.json$', latest_batch.name)
        if match:
            last_processed = int(match.group(1))
            logger.info(f"Last processed index: {last_processed}")
            with open(latest_batch, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Institution N (1-based) must already carry detail data.
            # BUGFIX: guard last_processed > 0 so a stray "batch_0" file
            # cannot silently index data[-1] and fake a valid checkpoint.
            if 0 < last_processed <= len(data) and data[last_processed - 1].get('isil_code'):
                logger.info(f"Verified institution {last_processed} has detail data")
                # BUGFIX: adopt the checkpoint contents. Previously only the
                # index was returned, so main() resumed from the bare
                # listings and every detail scraped before the interruption
                # was lost in the final save.
                if len(data) >= len(self.institutions):
                    self.institutions = data
                return last_processed
        logger.info("Could not determine resume point, starting from beginning")
        return 0

    def fetch_page(self, url: str, retry_count: int = 3) -> Optional[str]:
        """Fetch *url* with retries and exponential backoff.

        Returns the response body, or None after *retry_count* failures;
        the final failure is recorded in ``self.stats['errors']``.
        """
        for attempt in range(retry_count):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return response.text
            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < retry_count - 1:
                    time.sleep(2 ** attempt)  # Exponential backoff
                else:
                    self.stats['errors'].append({'url': url, 'error': str(e)})
                    logger.error(f"Failed to fetch {url} after {retry_count} attempts")
        return None

    def parse_detail_page(self, html: str) -> Dict:
        """Parse an institution detail page into a dict of attributes.

        Extracted keys (all optional): isil_code, address, contact,
        institution_type, opening_hours, memberships,
        dewey_classifications. Parse failures are logged and recorded in
        ``self.stats['errors']``; a partial (possibly empty) dict is
        returned rather than raising.
        """
        soup = BeautifulSoup(html, 'html.parser')
        details: Dict = {}
        try:
            # ISIL code: first <dd> whose text carries the Swiss prefix.
            for dd in soup.select('dd'):
                text = dd.get_text(strip=True)
                if text.startswith('CH-'):
                    details['isil_code'] = text
                    break
            # Address: street line plus a "NNNN City" postal line.
            address_parts = {}
            address_section = soup.find('dt', string=re.compile(r'Address|Adresse'))
            if address_section:
                dd = address_section.find_next('dd')
                if dd:
                    address_text = dd.get_text(separator='|', strip=True)
                    lines = address_text.split('|')
                    if len(lines) >= 1:
                        address_parts['street'] = lines[0].strip()
                    if len(lines) >= 2:
                        postal_city = lines[1].strip()
                        # Swiss postal codes are four digits.
                        match = re.match(r'(\d{4})\s+(.+)', postal_city)
                        if match:
                            address_parts['postal_code'] = match.group(1)
                            address_parts['city'] = match.group(2)
            if address_parts:
                details['address'] = address_parts
            # Contact information: phone, email, website.
            contact = {}
            phone_elem = soup.find('dt', string=re.compile(r'Phone|Telefon'))
            if phone_elem:
                dd = phone_elem.find_next('dd')
                if dd:
                    contact['phone'] = dd.get_text(strip=True)
            email_elem = soup.find('a', href=re.compile(r'^mailto:'))
            if email_elem:
                contact['email'] = email_elem.get_text(strip=True)
            website_elem = soup.find('dt', string=re.compile(r'Website|Homepage'))
            if website_elem:
                dd = website_elem.find_next('dd')
                if dd:
                    link = dd.find('a')
                    if link:
                        contact['website'] = link.get('href')
            if contact:
                details['contact'] = contact
            # Institution type.
            type_elem = soup.find('dt', string=re.compile(r'Institution type|Type'))
            if type_elem:
                dd = type_elem.find_next('dd')
                if dd:
                    details['institution_type'] = dd.get_text(strip=True)
            # Opening hours, flattened into one " | "-separated string.
            hours_elem = soup.find('dt', string=re.compile(r'Opening hours|Öffnungszeiten'))
            if hours_elem:
                dd = hours_elem.find_next('dd')
                if dd:
                    details['opening_hours'] = dd.get_text(separator=' | ', strip=True)
            # Memberships: badge/link texts under "Member of".
            memberships = []
            member_section = soup.find('dt', string=re.compile(r'Member of|Mitglied'))
            if member_section:
                dd = member_section.find_next('dd')
                if dd:
                    for tag in dd.select('span.badge, a'):
                        memberships.append(tag.get_text(strip=True))
            if memberships:
                details['memberships'] = memberships
            # Dewey / subject-area classifications.
            dewey = []
            dewey_section = soup.find('dt', string=re.compile(r'Dewey|Subject area'))
            if dewey_section:
                dd = dewey_section.find_next('dd')
                if dd:
                    for tag in dd.select('span.badge, a'):
                        dewey.append(tag.get_text(strip=True))
            if dewey:
                details['dewey_classifications'] = dewey
        except Exception as e:
            logger.error(f"Error parsing detail page: {e}")
            self.stats['errors'].append({'context': 'parse_detail', 'error': str(e)})
        return details

    def scrape_institution_details(self, institution: Dict) -> Dict:
        """Enrich *institution* in place with data from its detail page.

        Institutions without a 'detail_url' are returned untouched; fetch
        failures leave the listing data as-is (errors already recorded by
        fetch_page).
        """
        if not institution.get('detail_url'):
            return institution
        html = self.fetch_page(institution['detail_url'])
        if html:
            institution.update(self.parse_detail_page(html))
            self.stats['detail_pages_scraped'] += 1
        time.sleep(0.5)  # Rate limiting — be polite to the server
        return institution

    def scrape_details_from(self, start_index: int = 0):
        """Scrape detail pages for every institution from *start_index* on.

        Writes a batch checkpoint every 50 institutions so that an
        interrupted run can be resumed via find_resume_point().
        """
        total = len(self.institutions)
        logger.info(f"Starting detailed scrape from institution {start_index + 1}/{total}")
        for i in range(start_index, total):
            institution_num = i + 1
            logger.info(f"Processing institution {institution_num}/{total}")
            self.scrape_institution_details(self.institutions[i])
            # Save intermediate results every 50 institutions
            if institution_num % 50 == 0:
                self.save_results(suffix=f"_batch_{institution_num}")
                logger.info(f"Saved batch at institution {institution_num}")
        logger.info(f"Completed detailed scrape. Scraped {self.stats['detail_pages_scraped']} detail pages")

    def save_results(self, suffix: str = ""):
        """Save institutions and run statistics as JSON in OUTPUT_DIR.

        *suffix* distinguishes batch checkpoints ("_batch_N") from the
        final output ("_final"). The stats file name carries a timestamp
        so successive saves never overwrite each other.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = self.OUTPUT_DIR / f"swiss_isil_complete{suffix}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.institutions, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved {len(self.institutions)} institutions to {output_file}")
        self.stats['end_time'] = datetime.now().isoformat()
        self.stats['total_institutions'] = len(self.institutions)
        stats_file = self.OUTPUT_DIR / f"scraping_stats_resume_{timestamp}.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved statistics to {stats_file}")
def main():
    """Entry point: load listings, locate the resume point, scrape, save."""
    logger.info("Starting resumable Swiss ISIL database scraper")
    scraper = SwissISILScraperResumable()

    # Listings first, then work out how far the previous run got.
    scraper.institutions = scraper.load_listings()
    start_index = scraper.find_resume_point()
    total = len(scraper.institutions)

    if start_index >= total:
        logger.info("All institutions already processed!")
        return

    logger.info(f"Resuming from institution {start_index + 1} of {total}")
    scraper.scrape_details_from(start_index)
    scraper.save_results(suffix="_final")
    logger.info("Scraping complete!")


if __name__ == "__main__":
    main()