#!/usr/bin/env python3
"""
Swiss ISIL Database Scraper - RESUMABLE VERSION

Continues scraping from where it left off.

Author: GLAM Data Extraction Project
Date: November 2025
"""
import json
import logging
import re
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup
# Configure root logging: mirror every message to a log file and to the console.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_log_handlers = [
    logging.FileHandler('/Users/kempersc/apps/glam/data/isil/switzerland/scraper_resume.log'),
    logging.StreamHandler(),
]
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO, handlers=_log_handlers)

logger = logging.getLogger(__name__)
class SwissISILScraperResumable:
    """Resumable scraper for the Swiss National Library ISIL directory.

    Loads the institution listings produced by a previous full run, works out
    where the last run stopped (from the newest ``*_batch_N.json`` file),
    restores the detail data already scraped, and continues from there.
    """

    BASE_URL = "https://www.isil.nb.admin.ch"
    LIST_URL = f"{BASE_URL}/en/"
    OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
    LISTINGS_FILE = OUTPUT_DIR / "swiss_isil_complete_listings_only.json"

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Research Project)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })
        # Full institution list: listing records, progressively enriched
        # in place with detail-page fields.
        self.institutions = []
        # Run statistics; 'errors' collects per-URL and per-parse failures.
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'detail_pages_scraped': 0,
            'errors': []
        }

    def load_listings(self) -> List[Dict]:
        """Load institution listings from the existing listings file.

        Returns:
            The list of institution dicts read from LISTINGS_FILE.

        Exits the process (status 1) if the listings file does not exist,
        since this script can only resume a run the full scraper started.
        """
        if not self.LISTINGS_FILE.exists():
            logger.error(f"Listings file not found: {self.LISTINGS_FILE}")
            logger.error("Please run the full scraper first to generate listings.")
            sys.exit(1)

        logger.info(f"Loading listings from {self.LISTINGS_FILE}")
        with open(self.LISTINGS_FILE, 'r', encoding='utf-8') as f:
            institutions = json.load(f)

        logger.info(f"Loaded {len(institutions)} institutions from listings file")
        return institutions

    def find_resume_point(self) -> int:
        """Find the index to resume scraping from.

        Locates the most recent ``*_batch_N.json`` file, verifies that it
        really contains detail data up to institution N, and restores that
        detail data into ``self.institutions``.

        Bug fix: the previous version only returned the index but kept the
        bare listings in ``self.institutions``, so every detail scraped
        before the interruption was silently dropped from subsequent saves.

        Returns:
            Index of the first institution still to be scraped
            (0 when starting from scratch).
        """
        batch_files = list(self.OUTPUT_DIR.glob("swiss_isil_complete_batch_*.json"))

        if not batch_files:
            logger.info("No existing batch files found, starting from beginning")
            return 0

        # Sort by modification time, get the most recent
        latest_batch = max(batch_files, key=lambda p: p.stat().st_mtime)
        logger.info(f"Found latest batch file: {latest_batch.name}")

        # The batch number in the filename is the count of institutions
        # already processed (batches are saved every 50 institutions).
        match = re.search(r'batch_(\d+)\.json$', latest_batch.name)
        if match:
            last_processed = int(match.group(1))
            logger.info(f"Last processed index: {last_processed}")

            # Load the batch file to verify it has detail data; a corrupt
            # or truncated file must not abort the resume — fall through
            # and start from the beginning instead.
            try:
                with open(latest_batch, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, json.JSONDecodeError) as e:
                logger.warning(f"Could not read batch file {latest_batch.name}: {e}")
                data = []

            # Check if the last institution has ISIL code (detail scraped)
            if len(data) >= last_processed and data[last_processed - 1].get('isil_code'):
                logger.info(f"Verified institution {last_processed} has detail data")
                # Restore the already-scraped details so they survive the
                # next save_results() call.
                if len(self.institutions) == len(data):
                    for i in range(last_processed):
                        self.institutions[i].update(data[i])
                else:
                    self.institutions = data
                logger.info(f"Restored detail data for {last_processed} institutions from batch file")
                return last_processed

        logger.info("Could not determine resume point, starting from beginning")
        return 0

    def fetch_page(self, url: str, retry_count: int = 3) -> Optional[str]:
        """Fetch a page with retry logic.

        Args:
            url: Absolute URL to fetch.
            retry_count: Maximum number of attempts before giving up.

        Returns:
            The response body as text, or None if every attempt failed
            (the failure is recorded in ``self.stats['errors']``).
        """
        for attempt in range(retry_count):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return response.text
            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < retry_count - 1:
                    time.sleep(2 ** attempt)  # Exponential backoff
                else:
                    self.stats['errors'].append({'url': url, 'error': str(e)})
                    logger.error(f"Failed to fetch {url} after {retry_count} attempts")
        return None

    @staticmethod
    def _dd_after(soup, label_pattern: str):
        """Return the <dd> following the first <dt> matching *label_pattern*, or None."""
        dt = soup.find('dt', string=re.compile(label_pattern))
        return dt.find_next('dd') if dt else None

    def _parse_address(self, soup) -> Dict:
        """Extract street / postal_code / city from the Address section."""
        address_parts = {}
        dd = self._dd_after(soup, r'Address|Adresse')
        if dd is not None:
            lines = dd.get_text(separator='|', strip=True).split('|')
            if len(lines) >= 1:
                address_parts['street'] = lines[0].strip()
            if len(lines) >= 2:
                # Swiss addresses: a 4-digit postal code followed by the city.
                match = re.match(r'(\d{4})\s+(.+)', lines[1].strip())
                if match:
                    address_parts['postal_code'] = match.group(1)
                    address_parts['city'] = match.group(2)
        return address_parts

    def _parse_contact(self, soup) -> Dict:
        """Extract phone, email and website, when present."""
        contact = {}
        dd = self._dd_after(soup, r'Phone|Telefon')
        if dd is not None:
            contact['phone'] = dd.get_text(strip=True)
        email_elem = soup.find('a', href=re.compile(r'^mailto:'))
        if email_elem:
            contact['email'] = email_elem.get_text(strip=True)
        dd = self._dd_after(soup, r'Website|Homepage')
        if dd is not None:
            link = dd.find('a')
            if link:
                contact['website'] = link.get('href')
        return contact

    def _badge_texts(self, soup, label_pattern: str) -> List[str]:
        """Collect badge/link texts from the <dd> following a matching <dt>."""
        dd = self._dd_after(soup, label_pattern)
        if dd is None:
            return []
        return [tag.get_text(strip=True) for tag in dd.select('span.badge, a')]

    def parse_detail_page(self, html: str) -> Dict:
        """Parse an institution detail page into a dict of fields.

        Only keys that were actually found appear in the result:
        'isil_code', 'address', 'contact', 'institution_type',
        'opening_hours', 'memberships', 'dewey_classifications'.
        Parse failures are logged and recorded; an empty/partial dict
        is returned rather than raising.
        """
        soup = BeautifulSoup(html, 'html.parser')
        details = {}

        try:
            # ISIL code: the first <dd> whose text carries the Swiss prefix.
            for dd in soup.select('dd'):
                text = dd.get_text(strip=True)
                if text.startswith('CH-'):
                    details['isil_code'] = text
                    break

            address_parts = self._parse_address(soup)
            if address_parts:
                details['address'] = address_parts

            contact = self._parse_contact(soup)
            if contact:
                details['contact'] = contact

            dd = self._dd_after(soup, r'Institution type|Type')
            if dd is not None:
                details['institution_type'] = dd.get_text(strip=True)

            dd = self._dd_after(soup, r'Opening hours|Öffnungszeiten')
            if dd is not None:
                details['opening_hours'] = dd.get_text(separator=' | ', strip=True)

            memberships = self._badge_texts(soup, r'Member of|Mitglied')
            if memberships:
                details['memberships'] = memberships

            dewey = self._badge_texts(soup, r'Dewey|Subject area')
            if dewey:
                details['dewey_classifications'] = dewey

        except Exception as e:
            logger.error(f"Error parsing detail page: {e}")
            self.stats['errors'].append({'context': 'parse_detail', 'error': str(e)})

        return details

    def scrape_institution_details(self, institution: Dict) -> Dict:
        """Scrape detailed information for a single institution.

        Mutates *institution* in place with the parsed detail fields and
        returns it. Institutions without a 'detail_url' are returned as-is.
        """
        if not institution.get('detail_url'):
            return institution

        html = self.fetch_page(institution['detail_url'])

        if html:
            details = self.parse_detail_page(html)
            institution.update(details)
            self.stats['detail_pages_scraped'] += 1

        time.sleep(0.5)  # Rate limiting — be polite to the ISIL server
        return institution

    def scrape_details_from(self, start_index: int = 0):
        """Scrape detailed information starting from a specific index.

        Saves an intermediate batch file every 50 institutions so a crash
        loses at most 50 institutions' worth of work.
        """
        total = len(self.institutions)
        logger.info(f"Starting detailed scrape from institution {start_index + 1}/{total}")

        for i in range(start_index, total):
            institution_num = i + 1
            logger.info(f"Processing institution {institution_num}/{total}")
            self.scrape_institution_details(self.institutions[i])

            # Save intermediate results every 50 institutions
            if institution_num % 50 == 0:
                self.save_results(suffix=f"_batch_{institution_num}")
                logger.info(f"Saved batch at institution {institution_num}")

        logger.info(f"Completed detailed scrape. Scraped {self.stats['detail_pages_scraped']} detail pages")

    def save_results(self, suffix: str = ""):
        """Save scraped data and run statistics to JSON files.

        Args:
            suffix: Appended to the institutions filename, e.g. "_batch_50"
                or "_final"; the stats filename is timestamped instead.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save institutions data
        output_file = self.OUTPUT_DIR / f"swiss_isil_complete{suffix}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.institutions, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved {len(self.institutions)} institutions to {output_file}")

        # Save statistics
        self.stats['end_time'] = datetime.now().isoformat()
        self.stats['total_institutions'] = len(self.institutions)
        stats_file = self.OUTPUT_DIR / f"scraping_stats_resume_{timestamp}.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved statistics to {stats_file}")
def main():
    """Entry point: load listings, locate the resume point, finish the scrape."""
    logger.info("Starting resumable Swiss ISIL database scraper")

    scraper = SwissISILScraperResumable()

    # Listings must already exist from a previous full-scraper run.
    scraper.institutions = scraper.load_listings()

    # Work out where the previous run stopped.
    start_index = scraper.find_resume_point()
    total = len(scraper.institutions)

    if start_index >= total:
        logger.info("All institutions already processed!")
        return

    logger.info(f"Resuming from institution {start_index + 1} of {total}")

    # Scrape the remaining detail pages, then persist everything.
    scraper.scrape_details_from(start_index)
    scraper.save_results(suffix="_final")

    logger.info("Scraping complete!")


if __name__ == "__main__":
    main()