#!/usr/bin/env python3
"""
Swiss ISIL Database Scraper - RESUMABLE VERSION

Continues scraping from where it left off.

Author: GLAM Data Extraction Project
Date: November 2025
"""
import json
import logging
import re
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup
# Configure root logging: mirror every message to a log file and to the console.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_log_handlers = [
    logging.FileHandler('/Users/kempersc/apps/glam/data/isil/switzerland/scraper_resume.log'),
    logging.StreamHandler(),
]
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO, handlers=_log_handlers)

logger = logging.getLogger(__name__)
class SwissISILScraperResumable:
    """Resumable scraper for the Swiss National Library ISIL directory.

    Loads the institution listings produced by a previous full run, works out
    where the last run stopped (from the newest ``*_batch_N.json`` file),
    restores the detail data already scraped, and continues from there.
    """

    BASE_URL = "https://www.isil.nb.admin.ch"
    LIST_URL = f"{BASE_URL}/en/"
    OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
    LISTINGS_FILE = OUTPUT_DIR / "swiss_isil_complete_listings_only.json"

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Research Project)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })
        # Full institution list: listing records, progressively enriched
        # in place with detail-page fields.
        self.institutions = []
        # Run statistics; 'errors' collects per-URL and per-parse failures.
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'detail_pages_scraped': 0,
            'errors': []
        }

    def load_listings(self) -> List[Dict]:
        """Load institution listings from the existing listings file.

        Returns:
            The list of institution dicts read from LISTINGS_FILE.

        Exits the process (status 1) if the listings file does not exist,
        since this script can only resume a run the full scraper started.
        """
        if not self.LISTINGS_FILE.exists():
            logger.error(f"Listings file not found: {self.LISTINGS_FILE}")
            logger.error("Please run the full scraper first to generate listings.")
            sys.exit(1)

        logger.info(f"Loading listings from {self.LISTINGS_FILE}")
        with open(self.LISTINGS_FILE, 'r', encoding='utf-8') as f:
            institutions = json.load(f)

        logger.info(f"Loaded {len(institutions)} institutions from listings file")
        return institutions

    def find_resume_point(self) -> int:
        """Find the index to resume scraping from.

        Locates the most recent ``*_batch_N.json`` file, verifies that it
        really contains detail data up to institution N, and restores that
        detail data into ``self.institutions``.

        Bug fix: the previous version only returned the index but kept the
        bare listings in ``self.institutions``, so every detail scraped
        before the interruption was silently dropped from subsequent saves.

        Returns:
            Index of the first institution still to be scraped
            (0 when starting from scratch).
        """
        batch_files = list(self.OUTPUT_DIR.glob("swiss_isil_complete_batch_*.json"))

        if not batch_files:
            logger.info("No existing batch files found, starting from beginning")
            return 0

        # Sort by modification time, get the most recent
        latest_batch = max(batch_files, key=lambda p: p.stat().st_mtime)
        logger.info(f"Found latest batch file: {latest_batch.name}")

        # The batch number in the filename is the count of institutions
        # already processed (batches are saved every 50 institutions).
        match = re.search(r'batch_(\d+)\.json$', latest_batch.name)
        if match:
            last_processed = int(match.group(1))
            logger.info(f"Last processed index: {last_processed}")

            # Load the batch file to verify it has detail data; a corrupt
            # or truncated file must not abort the resume — fall through
            # and start from the beginning instead.
            try:
                with open(latest_batch, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, json.JSONDecodeError) as e:
                logger.warning(f"Could not read batch file {latest_batch.name}: {e}")
                data = []

            # Check if the last institution has ISIL code (detail scraped)
            if len(data) >= last_processed and data[last_processed - 1].get('isil_code'):
                logger.info(f"Verified institution {last_processed} has detail data")
                # Restore the already-scraped details so they survive the
                # next save_results() call.
                if len(self.institutions) == len(data):
                    for i in range(last_processed):
                        self.institutions[i].update(data[i])
                else:
                    self.institutions = data
                logger.info(f"Restored detail data for {last_processed} institutions from batch file")
                return last_processed

        logger.info("Could not determine resume point, starting from beginning")
        return 0

    def fetch_page(self, url: str, retry_count: int = 3) -> Optional[str]:
        """Fetch a page with retry logic.

        Args:
            url: Absolute URL to fetch.
            retry_count: Maximum number of attempts before giving up.

        Returns:
            The response body as text, or None if every attempt failed
            (the failure is recorded in ``self.stats['errors']``).
        """
        for attempt in range(retry_count):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return response.text
            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < retry_count - 1:
                    time.sleep(2 ** attempt)  # Exponential backoff
                else:
                    self.stats['errors'].append({'url': url, 'error': str(e)})
                    logger.error(f"Failed to fetch {url} after {retry_count} attempts")
        return None

    @staticmethod
    def _dd_after(soup, label_pattern: str):
        """Return the <dd> following the first <dt> matching *label_pattern*, or None."""
        dt = soup.find('dt', string=re.compile(label_pattern))
        return dt.find_next('dd') if dt else None

    def _parse_address(self, soup) -> Dict:
        """Extract street / postal_code / city from the Address section."""
        address_parts = {}
        dd = self._dd_after(soup, r'Address|Adresse')
        if dd is not None:
            lines = dd.get_text(separator='|', strip=True).split('|')
            if len(lines) >= 1:
                address_parts['street'] = lines[0].strip()
            if len(lines) >= 2:
                # Swiss addresses: a 4-digit postal code followed by the city.
                match = re.match(r'(\d{4})\s+(.+)', lines[1].strip())
                if match:
                    address_parts['postal_code'] = match.group(1)
                    address_parts['city'] = match.group(2)
        return address_parts

    def _parse_contact(self, soup) -> Dict:
        """Extract phone, email and website, when present."""
        contact = {}
        dd = self._dd_after(soup, r'Phone|Telefon')
        if dd is not None:
            contact['phone'] = dd.get_text(strip=True)
        email_elem = soup.find('a', href=re.compile(r'^mailto:'))
        if email_elem:
            contact['email'] = email_elem.get_text(strip=True)
        dd = self._dd_after(soup, r'Website|Homepage')
        if dd is not None:
            link = dd.find('a')
            if link:
                contact['website'] = link.get('href')
        return contact

    def _badge_texts(self, soup, label_pattern: str) -> List[str]:
        """Collect badge/link texts from the <dd> following a matching <dt>."""
        dd = self._dd_after(soup, label_pattern)
        if dd is None:
            return []
        return [tag.get_text(strip=True) for tag in dd.select('span.badge, a')]

    def parse_detail_page(self, html: str) -> Dict:
        """Parse an institution detail page into a dict of fields.

        Only keys that were actually found appear in the result:
        'isil_code', 'address', 'contact', 'institution_type',
        'opening_hours', 'memberships', 'dewey_classifications'.
        Parse failures are logged and recorded; an empty/partial dict
        is returned rather than raising.
        """
        soup = BeautifulSoup(html, 'html.parser')
        details = {}

        try:
            # ISIL code: the first <dd> whose text carries the Swiss prefix.
            for dd in soup.select('dd'):
                text = dd.get_text(strip=True)
                if text.startswith('CH-'):
                    details['isil_code'] = text
                    break

            address_parts = self._parse_address(soup)
            if address_parts:
                details['address'] = address_parts

            contact = self._parse_contact(soup)
            if contact:
                details['contact'] = contact

            dd = self._dd_after(soup, r'Institution type|Type')
            if dd is not None:
                details['institution_type'] = dd.get_text(strip=True)

            dd = self._dd_after(soup, r'Opening hours|Öffnungszeiten')
            if dd is not None:
                details['opening_hours'] = dd.get_text(separator=' | ', strip=True)

            memberships = self._badge_texts(soup, r'Member of|Mitglied')
            if memberships:
                details['memberships'] = memberships

            dewey = self._badge_texts(soup, r'Dewey|Subject area')
            if dewey:
                details['dewey_classifications'] = dewey

        except Exception as e:
            logger.error(f"Error parsing detail page: {e}")
            self.stats['errors'].append({'context': 'parse_detail', 'error': str(e)})

        return details

    def scrape_institution_details(self, institution: Dict) -> Dict:
        """Scrape detailed information for a single institution.

        Mutates *institution* in place with the parsed detail fields and
        returns it. Institutions without a 'detail_url' are returned as-is.
        """
        if not institution.get('detail_url'):
            return institution

        html = self.fetch_page(institution['detail_url'])

        if html:
            details = self.parse_detail_page(html)
            institution.update(details)
            self.stats['detail_pages_scraped'] += 1

        time.sleep(0.5)  # Rate limiting — be polite to the ISIL server
        return institution

    def scrape_details_from(self, start_index: int = 0):
        """Scrape detailed information starting from a specific index.

        Saves an intermediate batch file every 50 institutions so a crash
        loses at most 50 institutions' worth of work.
        """
        total = len(self.institutions)
        logger.info(f"Starting detailed scrape from institution {start_index + 1}/{total}")

        for i in range(start_index, total):
            institution_num = i + 1
            logger.info(f"Processing institution {institution_num}/{total}")
            self.scrape_institution_details(self.institutions[i])

            # Save intermediate results every 50 institutions
            if institution_num % 50 == 0:
                self.save_results(suffix=f"_batch_{institution_num}")
                logger.info(f"Saved batch at institution {institution_num}")

        logger.info(f"Completed detailed scrape. Scraped {self.stats['detail_pages_scraped']} detail pages")

    def save_results(self, suffix: str = ""):
        """Save scraped data and run statistics to JSON files.

        Args:
            suffix: Appended to the institutions filename, e.g. "_batch_50"
                or "_final"; the stats filename is timestamped instead.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save institutions data
        output_file = self.OUTPUT_DIR / f"swiss_isil_complete{suffix}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.institutions, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved {len(self.institutions)} institutions to {output_file}")

        # Save statistics
        self.stats['end_time'] = datetime.now().isoformat()
        self.stats['total_institutions'] = len(self.institutions)
        stats_file = self.OUTPUT_DIR / f"scraping_stats_resume_{timestamp}.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved statistics to {stats_file}")
def main():
    """Entry point: load listings, locate the resume point, finish the scrape."""
    logger.info("Starting resumable Swiss ISIL database scraper")

    scraper = SwissISILScraperResumable()

    # Listings must already exist from a previous full-scraper run.
    scraper.institutions = scraper.load_listings()

    # Work out where the previous run stopped.
    start_index = scraper.find_resume_point()
    total = len(scraper.institutions)

    if start_index >= total:
        logger.info("All institutions already processed!")
        return

    logger.info(f"Resuming from institution {start_index + 1} of {total}")

    # Scrape the remaining detail pages, then persist everything.
    scraper.scrape_details_from(start_index)
    scraper.save_results(suffix="_final")

    logger.info("Scraping complete!")


if __name__ == "__main__":
    main()