#!/usr/bin/env python3
"""
Swiss ISIL Database Scraper

Scrapes complete heritage institution data from https://www.isil.nb.admin.ch/en/

Author: GLAM Data Extraction Project
Date: November 2025
"""
|
|
|
|
import json
import logging
import re
import time
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional

import requests
from bs4 import BeautifulSoup
|
|
|
|
# Setup logging: mirror every message to a logfile and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # NOTE(review): hard-coded absolute path — logging setup (and hence
        # the whole script) fails if this directory does not exist on the
        # current machine.
        logging.FileHandler('/Users/kempersc/apps/glam/data/isil/switzerland/scraper.log'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)
|
|
|
|
class SwissISILScraper:
    """Scraper for Swiss National Library ISIL directory"""

    BASE_URL = "https://www.isil.nb.admin.ch"
    LIST_URL = f"{BASE_URL}/en/"
    # NOTE(review): hard-coded output location; all artifacts land here.
    OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")

    def __init__(self):
        """Prepare the HTTP session, result accumulator and run statistics."""
        # Identify ourselves politely and prefer HTML responses.
        default_headers = {
            'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Research Project)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        self.session = requests.Session()
        self.session.headers.update(default_headers)

        # One dict per institution, filled in by the parse_* methods.
        self.institutions = []
        # Run bookkeeping for the final report; 'errors' collects
        # {'url'|'context', 'error'} dicts from every failure site.
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'pages_scraped': 0,
            'institutions_found': 0,
            'detail_pages_scraped': 0,
            'errors': [],
        }
|
|
|
|
def fetch_page(self, url: str, retry_count: int = 3) -> Optional[str]:
|
|
"""Fetch a page with retry logic"""
|
|
for attempt in range(retry_count):
|
|
try:
|
|
response = self.session.get(url, timeout=30)
|
|
response.raise_for_status()
|
|
return response.text
|
|
except Exception as e:
|
|
logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
|
|
if attempt < retry_count - 1:
|
|
time.sleep(2 ** attempt) # Exponential backoff
|
|
else:
|
|
self.stats['errors'].append({'url': url, 'error': str(e)})
|
|
logger.error(f"Failed to fetch {url} after {retry_count} attempts")
|
|
return None
|
|
|
|
def parse_list_page(self, html: str) -> List[Dict]:
|
|
"""Parse institution listing page"""
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
institutions = []
|
|
|
|
# Find all institution cards
|
|
institution_cards = soup.select('ul.search-results-list > li')
|
|
|
|
logger.info(f"Found {len(institution_cards)} institution cards on page")
|
|
|
|
for card in institution_cards:
|
|
try:
|
|
institution = {}
|
|
|
|
# Get detail page URL
|
|
link = card.select_one('a.card__link')
|
|
if link and link.get('href'):
|
|
institution['detail_url'] = self.BASE_URL + link['href']
|
|
|
|
# Get institution name
|
|
name_elem = card.select_one('h3')
|
|
if name_elem:
|
|
# Extract primary name and alternative name
|
|
name_text = name_elem.get_text(strip=True)
|
|
# Check for alternative name (indicated by /)
|
|
if '/' in name_text:
|
|
parts = name_text.split('/')
|
|
institution['name'] = parts[0].strip()
|
|
institution['alternative_name'] = parts[1].strip() if len(parts) > 1 else None
|
|
else:
|
|
institution['name'] = name_text
|
|
|
|
# Check if inactive (strikethrough)
|
|
if name_elem.select_one('.line-through'):
|
|
institution['status'] = 'inactive'
|
|
else:
|
|
institution['status'] = 'active'
|
|
|
|
# Get description
|
|
desc_elem = card.select_one('.leading-snug')
|
|
if desc_elem:
|
|
institution['description'] = desc_elem.get_text(strip=True)
|
|
|
|
# Get location (region and canton)
|
|
location_elem = card.select_one('span.inline-block.text-base.text-gray-500.font-bold.mr-4')
|
|
if location_elem:
|
|
location_text = location_elem.get_text(strip=True)
|
|
if ',' in location_text:
|
|
parts = location_text.split(',')
|
|
institution['region'] = parts[0].strip()
|
|
institution['canton'] = parts[1].strip()
|
|
else:
|
|
institution['canton'] = location_text
|
|
|
|
# Get institution categories/types
|
|
categories = []
|
|
category_elems = card.select('span.inline-flex.items-center.mr-4 span.inline-block.text-gray-500.text-base.font-bold')
|
|
for cat in category_elems:
|
|
categories.append(cat.get_text(strip=True))
|
|
if categories:
|
|
institution['categories'] = categories
|
|
|
|
# Get merged institution info
|
|
merged_elem = card.select_one('p.text-base a')
|
|
if merged_elem:
|
|
institution['merged_into'] = {
|
|
'name': merged_elem.get_text(strip=True),
|
|
'url': self.BASE_URL + merged_elem['href'] if merged_elem.get('href') else None
|
|
}
|
|
|
|
if institution.get('name'):
|
|
institutions.append(institution)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error parsing institution card: {e}")
|
|
self.stats['errors'].append({'context': 'parse_card', 'error': str(e)})
|
|
|
|
return institutions
|
|
|
|
    def parse_detail_page(self, html: str) -> Dict:
        """Parse an institution detail page into a dict of attributes.

        Extracts, when present: ISIL code, postal address, contact details,
        institution type, opening hours, memberships and Dewey
        classifications. A parse failure is logged and recorded in
        ``self.stats['errors']``; fields gathered before the failure are
        still returned.
        """
        soup = BeautifulSoup(html, 'html.parser')
        details = {}

        try:
            # Get ISIL code from page.
            # NOTE(review): ':-soup-contains' matches a <dd> whose text merely
            # CONTAINS "CH-"; the fallback loop below re-checks with a stricter
            # startswith test.
            isil_elem = soup.select_one('dd:-soup-contains("CH-")')
            if not isil_elem:
                # Try alternative selection methods
                for dd in soup.select('dd'):
                    text = dd.get_text(strip=True)
                    if text.startswith('CH-'):
                        isil_elem = dd
                        break

            if isil_elem:
                details['isil_code'] = isil_elem.get_text(strip=True)

            # Get full address
            address_parts = {}

            # Street address
            street_elem = soup.select_one('dd:-soup-contains("Street")')
            if street_elem:
                address_parts['street'] = street_elem.get_text(strip=True)

            # Postal code and city
            postal_elem = soup.select_one('dd:-soup-contains("Postal code")')
            if postal_elem:
                address_parts['postal_code'] = postal_elem.get_text(strip=True)

            city_elem = soup.select_one('dd:-soup-contains("City")')
            if city_elem:
                address_parts['city'] = city_elem.get_text(strip=True)

            # Try to find a dedicated address section; values parsed here
            # intentionally overwrite the looser matches above.
            address_section = soup.find('dt', string=re.compile(r'Address|Adresse'))
            if address_section:
                dd = address_section.find_next('dd')
                if dd:
                    # '|' separator keeps the visual line breaks of the block.
                    address_text = dd.get_text(separator='|', strip=True)
                    lines = address_text.split('|')
                    if len(lines) >= 2:
                        address_parts['street'] = lines[0].strip()
                        # Parse postal code and city ("NNNN City" — Swiss
                        # postal codes are 4 digits).
                        if len(lines) >= 3:
                            postal_city = lines[1].strip()
                            match = re.match(r'(\d{4})\s+(.+)', postal_city)
                            if match:
                                address_parts['postal_code'] = match.group(1)
                                address_parts['city'] = match.group(2)

            if address_parts:
                details['address'] = address_parts

            # Get contact information
            contact = {}

            # Phone
            phone_elem = soup.find('dt', string=re.compile(r'Phone|Telefon'))
            if phone_elem:
                dd = phone_elem.find_next('dd')
                if dd:
                    contact['phone'] = dd.get_text(strip=True)

            # Email — first mailto: link anywhere on the page.
            email_elem = soup.find('a', href=re.compile(r'^mailto:'))
            if email_elem:
                contact['email'] = email_elem.get_text(strip=True)

            # Website
            website_elem = soup.find('dt', string=re.compile(r'Website|Homepage'))
            if website_elem:
                dd = website_elem.find_next('dd')
                if dd:
                    link = dd.find('a')
                    if link:
                        contact['website'] = link.get('href')

            if contact:
                details['contact'] = contact

            # Get institution type/category
            type_elem = soup.find('dt', string=re.compile(r'Institution type|Type'))
            if type_elem:
                dd = type_elem.find_next('dd')
                if dd:
                    details['institution_type'] = dd.get_text(strip=True)

            # Get opening hours (one string, lines joined with ' | ')
            hours_elem = soup.find('dt', string=re.compile(r'Opening hours|Öffnungszeiten'))
            if hours_elem:
                dd = hours_elem.find_next('dd')
                if dd:
                    details['opening_hours'] = dd.get_text(separator=' | ', strip=True)

            # Get membership information
            memberships = []
            member_section = soup.find('dt', string=re.compile(r'Member of|Mitglied'))
            if member_section:
                dd = member_section.find_next('dd')
                if dd:
                    member_tags = dd.select('span.badge, a')
                    for tag in member_tags:
                        memberships.append(tag.get_text(strip=True))

            if memberships:
                details['memberships'] = memberships

            # Get Dewey classification
            dewey = []
            dewey_section = soup.find('dt', string=re.compile(r'Dewey|Subject area'))
            if dewey_section:
                dd = dewey_section.find_next('dd')
                if dd:
                    dewey_tags = dd.select('span.badge, a')
                    for tag in dewey_tags:
                        dewey.append(tag.get_text(strip=True))

            if dewey:
                details['dewey_classifications'] = dewey

        except Exception as e:
            logger.error(f"Error parsing detail page: {e}")
            self.stats['errors'].append({'context': 'parse_detail', 'error': str(e)})

        return details
|
|
|
|
def scrape_institution_details(self, institution: Dict) -> Dict:
|
|
"""Scrape detailed information for a single institution"""
|
|
if not institution.get('detail_url'):
|
|
return institution
|
|
|
|
logger.info(f"Fetching details for: {institution.get('name')}")
|
|
html = self.fetch_page(institution['detail_url'])
|
|
|
|
if html:
|
|
details = self.parse_detail_page(html)
|
|
institution.update(details)
|
|
self.stats['detail_pages_scraped'] += 1
|
|
|
|
time.sleep(0.5) # Rate limiting
|
|
return institution
|
|
|
|
def scrape_all_pages(self, max_pages: Optional[int] = None):
|
|
"""Scrape all pages of the institution listing"""
|
|
page = 1
|
|
|
|
while True:
|
|
if max_pages and page > max_pages:
|
|
break
|
|
|
|
url = f"{self.LIST_URL}?page={page}"
|
|
logger.info(f"Scraping page {page}: {url}")
|
|
|
|
html = self.fetch_page(url)
|
|
if not html:
|
|
logger.error(f"Failed to fetch page {page}")
|
|
break
|
|
|
|
institutions = self.parse_list_page(html)
|
|
|
|
if not institutions:
|
|
logger.info(f"No institutions found on page {page}, stopping")
|
|
break
|
|
|
|
logger.info(f"Found {len(institutions)} institutions on page {page}")
|
|
self.institutions.extend(institutions)
|
|
self.stats['pages_scraped'] += 1
|
|
self.stats['institutions_found'] += len(institutions)
|
|
|
|
# Check if there's a next page
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
next_link = soup.select_one('a[href*="page="]:-soup-contains("Next")')
|
|
|
|
if not next_link and page >= 96: # We know there are 96 pages
|
|
logger.info("Reached last page")
|
|
break
|
|
|
|
page += 1
|
|
time.sleep(1) # Rate limiting between pages
|
|
|
|
logger.info(f"Completed scraping {self.stats['pages_scraped']} pages")
|
|
logger.info(f"Total institutions collected: {self.stats['institutions_found']}")
|
|
|
|
def scrape_all_details(self):
|
|
"""Scrape detailed information for all institutions"""
|
|
logger.info(f"Starting detailed scrape for {len(self.institutions)} institutions")
|
|
|
|
for i, institution in enumerate(self.institutions, 1):
|
|
logger.info(f"Processing institution {i}/{len(self.institutions)}")
|
|
self.scrape_institution_details(institution)
|
|
|
|
# Save intermediate results every 50 institutions
|
|
if i % 50 == 0:
|
|
self.save_results(suffix=f"_batch_{i}")
|
|
|
|
logger.info(f"Completed detailed scrape. Scraped {self.stats['detail_pages_scraped']} detail pages")
|
|
|
|
def save_results(self, suffix: str = ""):
|
|
"""Save scraped data to JSON files"""
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
|
|
# Save institutions data
|
|
output_file = self.OUTPUT_DIR / f"swiss_isil_complete{suffix}.json"
|
|
with open(output_file, 'w', encoding='utf-8') as f:
|
|
json.dump(self.institutions, f, ensure_ascii=False, indent=2)
|
|
logger.info(f"Saved {len(self.institutions)} institutions to {output_file}")
|
|
|
|
# Save statistics
|
|
self.stats['end_time'] = datetime.now().isoformat()
|
|
self.stats['total_institutions'] = len(self.institutions)
|
|
stats_file = self.OUTPUT_DIR / f"scraping_stats_{timestamp}.json"
|
|
with open(stats_file, 'w', encoding='utf-8') as f:
|
|
json.dump(self.stats, f, ensure_ascii=False, indent=2)
|
|
logger.info(f"Saved statistics to {stats_file}")
|
|
|
|
# Generate summary report
|
|
self.generate_report()
|
|
|
|
def generate_report(self):
|
|
"""Generate a summary report of the scraped data"""
|
|
report = []
|
|
report.append("=" * 80)
|
|
report.append("SWISS ISIL DATABASE SCRAPING REPORT")
|
|
report.append("=" * 80)
|
|
report.append(f"Scraping started: {self.stats.get('start_time', 'N/A')}")
|
|
report.append(f"Scraping ended: {self.stats.get('end_time', 'N/A')}")
|
|
report.append(f"Pages scraped: {self.stats['pages_scraped']}")
|
|
report.append(f"Institutions found: {self.stats['institutions_found']}")
|
|
report.append(f"Detail pages scraped: {self.stats['detail_pages_scraped']}")
|
|
report.append(f"Errors encountered: {len(self.stats['errors'])}")
|
|
report.append("")
|
|
|
|
# Count by status
|
|
active = sum(1 for i in self.institutions if i.get('status') == 'active')
|
|
inactive = sum(1 for i in self.institutions if i.get('status') == 'inactive')
|
|
report.append(f"Active institutions: {active}")
|
|
report.append(f"Inactive institutions: {inactive}")
|
|
report.append("")
|
|
|
|
# Count by canton
|
|
canton_counts = {}
|
|
for inst in self.institutions:
|
|
canton = inst.get('canton', 'Unknown')
|
|
canton_counts[canton] = canton_counts.get(canton, 0) + 1
|
|
|
|
report.append("Institutions by Canton:")
|
|
for canton, count in sorted(canton_counts.items(), key=lambda x: x[1], reverse=True):
|
|
report.append(f" {canton}: {count}")
|
|
report.append("")
|
|
|
|
# Count by institution type
|
|
type_counts = {}
|
|
for inst in self.institutions:
|
|
categories = inst.get('categories', [])
|
|
for cat in categories:
|
|
type_counts[cat] = type_counts.get(cat, 0) + 1
|
|
|
|
report.append("Institutions by Type:")
|
|
for cat, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True)[:20]:
|
|
report.append(f" {cat}: {count}")
|
|
report.append("")
|
|
|
|
# ISIL codes found
|
|
with_isil = sum(1 for i in self.institutions if i.get('isil_code'))
|
|
report.append(f"Institutions with ISIL codes: {with_isil}")
|
|
report.append("")
|
|
|
|
report.append("=" * 80)
|
|
|
|
report_text = "\n".join(report)
|
|
print("\n" + report_text)
|
|
|
|
# Save report to file
|
|
report_file = self.OUTPUT_DIR / f"scraping_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
|
|
with open(report_file, 'w', encoding='utf-8') as f:
|
|
f.write(report_text)
|
|
logger.info(f"Saved report to {report_file}")
|
|
|
|
def main():
    """Run the two-phase scrape: listings first, then per-institution details."""
    logger.info("Starting Swiss ISIL database scraper")

    scraper = SwissISILScraper()

    # Phase 1: collect every institution stub from the paginated listing.
    logger.info("Step 1: Scraping institution listings")
    scraper.scrape_all_pages()

    # Checkpoint the listing data before the much slower detail phase.
    scraper.save_results(suffix="_listings_only")

    # Phase 2: enrich each institution from its detail page.
    logger.info("Step 2: Scraping detailed information for each institution")
    scraper.scrape_all_details()

    # Save final results
    scraper.save_results()

    logger.info("Scraping complete!")


if __name__ == "__main__":
    main()
|