glam/scripts/scrapers/scrape_canadian_isil.py
2025-11-19 23:25:22 +01:00

323 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Scraper for Canadian ISIL database from Library and Archives Canada.
This script extracts all library records (active and closed/superseded) from:
https://sigles-symbols.bac-lac.gc.ca/eng/Search
Total expected records:
- Active libraries: 6,520
- Closed/Superseded: 3,046
- Total: 9,566
Output: JSON files in data/isil/canada/
"""
import asyncio
import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin

try:
    from playwright.async_api import async_playwright
except ImportError:
    print("Error: playwright is not installed. Install with: pip install playwright && playwright install")
    exit(1)
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Constants
BASE_URL = "https://sigles-symbols.bac-lac.gc.ca"
SEARCH_URL = f"{BASE_URL}/eng/Search/List"
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/canada")
PAGE_SIZE = 100 # Maximum results per page
class CanadianISILScraper:
"""Scraper for Canadian ISIL/Library Symbol database."""
def __init__(self):
self.browser = None
self.page = None
self.output_dir = OUTPUT_DIR
self.output_dir.mkdir(parents=True, exist_ok=True)
async def start_browser(self):
"""Initialize browser and page."""
playwright = await async_playwright().start()
self.browser = await playwright.chromium.launch(headless=True)
self.page = await self.browser.new_page()
logger.info("Browser started")
async def close_browser(self):
"""Close browser."""
if self.browser:
await self.browser.close()
logger.info("Browser closed")
async def fetch_list_page(
self,
page_num: int,
closed_superseded: bool = False
) -> List[Dict]:
"""
Fetch a single page of library listings.
Args:
page_num: Page number (1-indexed)
closed_superseded: If True, fetch closed/superseded libraries
Returns:
List of library records with basic info
"""
url = (
f"{SEARCH_URL}?Page={page_num}&PageSize={PAGE_SIZE}"
f"&q=&Scope=&LibraryType=&Province="
f"&LendingMonographs=false&PhotocopyMonographs=false"
f"&LendingSerials=false&PhotocopySerials=false"
f"&NoLendingCharge=false&RenewableMonographs=false"
f"&NoPhotocopyCharge=false&RenewableSerials=false"
)
if closed_superseded:
url += "&ClosedSuperceded=True"
logger.info(f"Fetching page {page_num} ({'closed' if closed_superseded else 'active'})...")
try:
await self.page.goto(url, wait_until="networkidle", timeout=60000)
# Extract table rows
rows = await self.page.locator("table tbody tr").all()
libraries = []
for row in rows:
cells = await row.locator("td").all()
if len(cells) >= 4:
symbol_cell = cells[0]
symbol_link = await symbol_cell.locator("a").get_attribute("href")
symbol_text = await symbol_cell.inner_text()
name = await cells[1].inner_text()
city = await cells[2].inner_text()
province = await cells[3].inner_text()
# Extract library ID from detail link
library_id = None
if symbol_link:
match = re.search(r'Id=(\d+)', symbol_link)
if match:
library_id = match.group(1)
libraries.append({
"library_symbol": symbol_text.strip(),
"name": name.strip(),
"city": city.strip(),
"province": province.strip(),
"library_id": library_id,
"detail_url": urljoin(BASE_URL, symbol_link) if symbol_link else None,
"status": "closed" if closed_superseded else "active"
})
logger.info(f" Extracted {len(libraries)} records")
return libraries
except Exception as e:
logger.error(f"Error fetching page {page_num}: {e}")
return []
async def fetch_library_details(self, library: Dict) -> Dict:
"""
Fetch detailed information for a single library.
Args:
library: Basic library info with detail_url
Returns:
Complete library record with all available fields
"""
if not library.get("detail_url"):
logger.warning(f"No detail URL for {library.get('library_symbol')}")
return library
try:
await self.page.goto(library["detail_url"], wait_until="networkidle", timeout=30000)
# Extract all detail fields (structure varies, we'll parse what we can)
# The detail page typically has definition lists (dt/dd pairs)
details = {}
# Try to find all key-value pairs
dts = await self.page.locator("dt").all()
dds = await self.page.locator("dd").all()
for dt, dd in zip(dts, dds):
key = await dt.inner_text()
value = await dd.inner_text()
details[key.strip().rstrip(":")] = value.strip()
# Merge details into library record
library.update(details)
logger.debug(f" Fetched details for {library['library_symbol']}")
return library
except Exception as e:
logger.error(f"Error fetching details for {library.get('library_symbol')}: {e}")
return library
async def scrape_all_libraries(
self,
closed_superseded: bool = False,
max_pages: Optional[int] = None
) -> List[Dict]:
"""
Scrape all libraries (active or closed/superseded).
Args:
closed_superseded: If True, scrape closed/superseded libraries
max_pages: Optional limit on number of pages (for testing)
Returns:
List of all library records
"""
all_libraries = []
# Determine total pages needed
if closed_superseded:
total_records = 3046
category = "closed/superseded"
else:
total_records = 6520
category = "active"
total_pages = (total_records + PAGE_SIZE - 1) // PAGE_SIZE # Ceiling division
if max_pages:
total_pages = min(total_pages, max_pages)
logger.info(f"Scraping {category} libraries: {total_pages} pages ({total_records} records)")
# Fetch list pages
for page_num in range(1, total_pages + 1):
libraries = await self.fetch_list_page(page_num, closed_superseded)
all_libraries.extend(libraries)
# Be polite - add delay between requests
await asyncio.sleep(1)
logger.info(f"Fetched {len(all_libraries)} {category} library records")
# Now fetch details for each library (this will take a while!)
logger.info(f"Fetching detailed information for {len(all_libraries)} libraries...")
for i, library in enumerate(all_libraries, 1):
if i % 100 == 0:
logger.info(f" Progress: {i}/{len(all_libraries)}")
await self.fetch_library_details(library)
await asyncio.sleep(0.5) # Be polite
return all_libraries
async def run(self, test_mode: bool = False):
"""
Main execution method.
Args:
test_mode: If True, only fetch first 2 pages of each category
"""
await self.start_browser()
try:
# Scrape active libraries
max_pages = 2 if test_mode else None
active_libraries = await self.scrape_all_libraries(
closed_superseded=False,
max_pages=max_pages
)
# Save active libraries
active_file = self.output_dir / "canadian_libraries_active.json"
with open(active_file, 'w', encoding='utf-8') as f:
json.dump({
"extraction_date": datetime.now().isoformat(),
"source": "Library and Archives Canada - Canadian Library Directory",
"source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
"status": "active",
"record_count": len(active_libraries),
"libraries": active_libraries
}, f, indent=2, ensure_ascii=False)
logger.info(f"Saved {len(active_libraries)} active libraries to {active_file}")
# Scrape closed/superseded libraries
closed_libraries = await self.scrape_all_libraries(
closed_superseded=True,
max_pages=max_pages
)
# Save closed libraries
closed_file = self.output_dir / "canadian_libraries_closed.json"
with open(closed_file, 'w', encoding='utf-8') as f:
json.dump({
"extraction_date": datetime.now().isoformat(),
"source": "Library and Archives Canada - Canadian Library Directory",
"source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
"status": "closed_superseded",
"record_count": len(closed_libraries),
"libraries": closed_libraries
}, f, indent=2, ensure_ascii=False)
logger.info(f"Saved {len(closed_libraries)} closed/superseded libraries to {closed_file}")
# Create combined file
combined_file = self.output_dir / "canadian_libraries_all.json"
with open(combined_file, 'w', encoding='utf-8') as f:
json.dump({
"extraction_date": datetime.now().isoformat(),
"source": "Library and Archives Canada - Canadian Library Directory",
"source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
"total_records": len(active_libraries) + len(closed_libraries),
"active_count": len(active_libraries),
"closed_count": len(closed_libraries),
"libraries": active_libraries + closed_libraries
}, f, indent=2, ensure_ascii=False)
logger.info(f"Saved combined dataset to {combined_file}")
finally:
await self.close_browser()
async def main():
"""Entry point."""
import argparse
parser = argparse.ArgumentParser(
description="Scrape Canadian ISIL database from Library and Archives Canada"
)
parser.add_argument(
"--test",
action="store_true",
help="Test mode: only scrape first 2 pages of each category"
)
args = parser.parse_args()
scraper = CanadianISILScraper()
await scraper.run(test_mode=args.test)
if __name__ == "__main__":
asyncio.run(main())