# glam/scripts/scrapers/harvest_swiss_isil.py
# 2025-11-19 23:25:22 +01:00
# 312 lines · 9.4 KiB · Python · executable file
#!/usr/bin/env python3
"""
Swiss ISIL Database Harvester
This script harvests all Swiss and Liechtenstein ISIL records by scraping
the Swiss National Library's ISIL directory web interface.
Source: https://www.isil.nb.admin.ch/en/
Records: ~2,379 institutions (Switzerland + Liechtenstein)
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
import time
import re
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime
import requests
from bs4 import BeautifulSoup
# Configuration
BASE_URL = "https://www.isil.nb.admin.ch/en/"  # English entry point of the Swiss ISIL directory
# NOTE(review): hard-coded absolute path — only valid on the author's machine;
# consider deriving from an env var or CLI argument.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
RESULTS_PER_PAGE = 25 # Default pagination (used to compute total page count)
REQUEST_DELAY = 2.0 # Seconds between requests (be polite)
MAX_RETRIES = 3  # Attempts per page fetch before giving up
# Create output directory
# NOTE: runs at import time as a module side effect.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def fetch_page(page_num: int) -> Optional[BeautifulSoup]:
    """
    Fetch one listing page of ISIL records, retrying on network errors.

    Args:
        page_num: Page number (1-indexed)
    Returns:
        Parsed BeautifulSoup document, or None once all retries are exhausted
    """
    url = f"{BASE_URL}?page={page_num}"
    tries = 0
    while tries < MAX_RETRIES:
        print(f"Fetching page {page_num}...", end=' ')
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Attempt {tries + 1}/{MAX_RETRIES} failed: {e}")
            tries += 1
            if tries < MAX_RETRIES:
                # Linear backoff: wait a little longer after each failure.
                time.sleep(REQUEST_DELAY * tries)
            continue
        parsed = BeautifulSoup(response.content, 'html.parser')
        print("OK")
        return parsed
    return None
def extract_institution_links(soup: "BeautifulSoup") -> List[str]:
    """
    Extract institution detail page URLs from a listing page.

    Args:
        soup: BeautifulSoup object of the listing page (anything providing
            ``find_all`` works)
    Returns:
        List of absolute institution detail URLs, de-duplicated while
        preserving first-seen page order

    BUGFIX: the previous ``list(set(links))`` returned the URLs in a
    nondeterministic order, which made harvest runs non-reproducible;
    ``dict.fromkeys`` de-duplicates while keeping insertion order.
    (The annotation is a forward-ref string so importing this module does not
    require bs4 to be loaded first.)
    """
    links = []
    # Institution detail pages match the pattern /en/institutions/[slug]/
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/institutions/' in href and href.endswith('/'):
            # Make relative links absolute.
            if not href.startswith('http'):
                href = f"https://www.isil.nb.admin.ch{href}"
            links.append(href)
    # Ordered de-duplication (dicts preserve insertion order in Python 3.7+).
    return list(dict.fromkeys(links))
def fetch_institution_detail(url: str) -> Optional[Dict]:
    """
    Fetch and parse the detail page for a single institution.

    Args:
        url: Institution detail page URL
    Returns:
        Dictionary with institution data, or None if the request or
        parsing fails
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Skeleton record; fields not found on the page keep these defaults.
        record: Dict = {
            'source_url': url,
            'isil': None,
            'name': None,
            'alternative_names': [],
            'institution_type': None,
            'address': {},
            'contact': {},
            'urls': [],
            'parent_org': None,
            'status': 'active',
            'notes': None,
        }

        # The ISIL code is embedded in the URL slug,
        # e.g. /institutions/name-ch-123456-x/
        isil_match = re.search(r'-(ch-\d{6}-[\dx])', url.lower())
        if isil_match is not None:
            record['isil'] = isil_match.group(1).upper()

        # The first heading on the page is taken as the institution name.
        heading = soup.find(['h1', 'h2', 'h3'])
        if heading is not None:
            record['name'] = heading.get_text(strip=True)

        # Free-text description block, if the page has one.
        blurb = soup.find('div', class_=re.compile('description|content|detail'))
        if blurb is not None:
            record['notes'] = blurb.get_text(strip=True)

        # Scan generic text elements for institution-type and status hints
        # (English and German keywords).
        type_keywords = (
            'library', 'archive', 'museum', 'documentation',
            'bibliothek', 'archiv', 'dokumentation',
        )
        for elem in soup.find_all(['span', 'div', 'p']):
            text = elem.get_text(strip=True)
            lowered = text.lower()
            # NOTE: a later matching element overwrites an earlier one,
            # following the page's reading order.
            if any(keyword in lowered for keyword in type_keywords):
                record['institution_type'] = text
            if 'inactive' in lowered or 'nicht mehr' in lowered:
                record['status'] = 'inactive'

        return record
    except Exception as e:
        # Best-effort scraping: report the failure and skip this institution.
        print(f"Error fetching {url}: {e}")
        return None
def get_total_pages() -> int:
    """
    Determine how many listing pages exist by inspecting the first page.

    Returns:
        Total number of pages; 0 if the first page could not be fetched,
        1 if no count or pagination could be found
    """
    soup = fetch_page(1)
    if soup is None:
        return 0

    # Preferred source: the "<N> Search results" summary line.
    results_text = soup.find(text=re.compile(r'\d+\s+Search results'))
    if results_text:
        match = re.search(r'(\d+)\s+Search results', str(results_text))
        if match:
            total_results = int(match.group(1))
            # Ceiling division: round up to a whole number of pages.
            total_pages = -(-total_results // RESULTS_PER_PAGE)
            print(f"Found {total_results} total results across {total_pages} pages")
            return total_pages

    # Fallback: the highest page number referenced by pagination links.
    page_nums = []
    for link in soup.find_all('a', href=re.compile(r'\?page=\d+')):
        num_match = re.search(r'page=(\d+)', link['href'])
        if num_match:
            page_nums.append(int(num_match.group(1)))
    return max(page_nums, default=1)
def harvest_all_institutions() -> List[Dict]:
    """
    Harvest all ISIL records from the Swiss directory.

    Walks every listing page to collect institution detail URLs, then
    fetches and parses each detail page, pausing REQUEST_DELAY seconds
    between requests.

    Returns:
        List of institution records (failed detail fetches are skipped)
    """
    print("=" * 60)
    print("Swiss ISIL Database Harvester")
    print("=" * 60)
    print()
    total_pages = get_total_pages()
    print(f"Total pages to harvest: {total_pages}")
    print()
    all_institution_urls = []
    # Step 1: Collect all institution URLs from listing pages
    print("Step 1: Collecting institution URLs from listing pages...")
    for page_num in range(1, total_pages + 1):
        soup = fetch_page(page_num)
        if soup:
            links = extract_institution_links(soup)
            all_institution_urls.extend(links)
            print(f" Found {len(links)} institutions on page {page_num}")
        time.sleep(REQUEST_DELAY)
    # BUGFIX: links were only de-duplicated within a single page, so an
    # institution appearing on several listing pages was fetched repeatedly
    # (and the "unique" claim below was wrong). De-duplicate across pages
    # while preserving first-seen order.
    all_institution_urls = list(dict.fromkeys(all_institution_urls))
    print(f"\nCollected {len(all_institution_urls)} unique institution URLs")
    print()
    # Step 2: Fetch detailed information for each institution
    print("Step 2: Fetching detailed information...")
    institutions = []
    for idx, url in enumerate(all_institution_urls, 1):
        print(f"[{idx}/{len(all_institution_urls)}] ", end='')
        record = fetch_institution_detail(url)
        if record:
            institutions.append(record)
        time.sleep(REQUEST_DELAY)
    print()
    print(f"Successfully harvested {len(institutions)} institutions")
    return institutions
def save_results(institutions: List[Dict], timestamp: str):
    """
    Persist harvested data as JSON, JSONL, and a statistics summary.

    Args:
        institutions: List of institution records
        timestamp: Timestamp string used in output filenames
    """
    # Full dataset as a single pretty-printed JSON array.
    json_path = OUTPUT_DIR / f"swiss_isil_complete_{timestamp}.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, indent=2, ensure_ascii=False)
    print(f"\nSaved JSON: {json_path}")

    # Same dataset as JSON Lines (one record per line, easier to stream).
    jsonl_path = OUTPUT_DIR / f"swiss_isil_complete_{timestamp}.jsonl"
    with open(jsonl_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(inst, ensure_ascii=False) + '\n'
                     for inst in institutions)
    print(f"Saved JSONL: {jsonl_path}")

    # Headline statistics for this harvest run.
    status_tally = {'active': 0, 'inactive': 0}
    for inst in institutions:
        status = inst.get('status')
        if status in status_tally:
            status_tally[status] += 1
    stats = {
        'total_institutions': len(institutions),
        'active_institutions': status_tally['active'],
        'inactive_institutions': status_tally['inactive'],
        'harvest_date': datetime.now().isoformat(),
        'source': 'https://www.isil.nb.admin.ch/en/'
    }
    stats_path = OUTPUT_DIR / f"swiss_isil_stats_{timestamp}.json"
    with open(stats_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)
    print(f"Saved stats: {stats_path}")
def main():
    """Entry point: harvest all records, save them, and print a summary."""
    started = time.time()
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    try:
        # Harvest everything, then persist to the output directory.
        records = harvest_all_institutions()
        save_results(records, run_stamp)
        # Summary banner (kept inside the try so Ctrl-C here is handled too).
        elapsed = time.time() - started
        print()
        print("=" * 60)
        print("Harvest Complete!")
        print("=" * 60)
        print(f"Total institutions: {len(records)}")
        print(f"Time elapsed: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")
        print(f"Output directory: {OUTPUT_DIR}")
    except KeyboardInterrupt:
        # User chose to stop; partial results are not saved.
        print("\n\nHarvest interrupted by user")
    except Exception as e:
        # Report, then re-raise so the failure stays visible to the caller.
        print(f"\n\nError during harvest: {e}")
        raise


if __name__ == "__main__":
    main()