#!/usr/bin/env python3
"""
Scraper for Canadian ISIL database from Library and Archives Canada.

This script extracts all library records (active and closed/superseded) from:
https://sigles-symbols.bac-lac.gc.ca/eng/Search

Total expected records:
- Active libraries: 6,520
- Closed/Superseded: 3,046
- Total: 9,566

Output: JSON files in data/isil/canada/
"""

import asyncio
import json
import logging
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin

try:
    from playwright.async_api import async_playwright
except ImportError:
    print("Error: playwright is not installed. Install with: pip install playwright && playwright install")
    sys.exit(1)

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
BASE_URL = "https://sigles-symbols.bac-lac.gc.ca"
SEARCH_URL = f"{BASE_URL}/eng/Search/List"
# NOTE(review): user-specific absolute path — consider making this configurable.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/canada")
PAGE_SIZE = 100  # Maximum results per page

# Published record counts (see module docstring); used only to compute how
# many list pages to request for each category.
EXPECTED_ACTIVE_RECORDS = 6520
EXPECTED_CLOSED_RECORDS = 3046


class CanadianISILScraper:
    """Scraper for Canadian ISIL/Library Symbol database."""

    def __init__(self):
        # Playwright driver handle, browser, and the single page used for
        # all navigation. Populated by start_browser().
        self.playwright = None
        self.browser = None
        self.page = None
        self.output_dir = OUTPUT_DIR
        self.output_dir.mkdir(parents=True, exist_ok=True)

    async def start_browser(self):
        """Initialize the Playwright driver, browser, and page."""
        # Keep the driver handle on self so close_browser() can stop it —
        # the original discarded it, leaking the driver process.
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()
        logger.info("Browser started")

    async def close_browser(self):
        """Close the browser and stop the Playwright driver."""
        if self.browser:
            await self.browser.close()
            logger.info("Browser closed")
        if self.playwright:
            # Stop the driver subprocess as well (fixes resource leak).
            await self.playwright.stop()

    async def fetch_list_page(
        self,
        page_num: int,
        closed_superseded: bool = False
    ) -> List[Dict]:
        """
        Fetch a single page of library listings.

        Args:
            page_num: Page number (1-indexed)
            closed_superseded: If True, fetch closed/superseded libraries

        Returns:
            List of library records with basic info (empty list on error)
        """
        url = (
            f"{SEARCH_URL}?Page={page_num}&PageSize={PAGE_SIZE}"
            f"&q=&Scope=&LibraryType=&Province="
            f"&LendingMonographs=false&PhotocopyMonographs=false"
            f"&LendingSerials=false&PhotocopySerials=false"
            f"&NoLendingCharge=false&RenewableMonographs=false"
            f"&NoPhotocopyCharge=false&RenewableSerials=false"
        )
        if closed_superseded:
            # "Superceded" spelling matches the server-side parameter name —
            # do not "correct" it.
            url += "&ClosedSuperceded=True"

        logger.info(
            "Fetching page %s (%s)...",
            page_num,
            'closed' if closed_superseded else 'active'
        )

        try:
            await self.page.goto(url, wait_until="networkidle", timeout=60000)

            # Each result row is a <tr> with symbol / name / city / province cells.
            rows = await self.page.locator("table tbody tr").all()

            libraries = []
            for row in rows:
                cells = await row.locator("td").all()
                if len(cells) >= 4:
                    symbol_cell = cells[0]
                    symbol_link = await symbol_cell.locator("a").get_attribute("href")
                    symbol_text = await symbol_cell.inner_text()
                    name = await cells[1].inner_text()
                    city = await cells[2].inner_text()
                    province = await cells[3].inner_text()

                    # Extract numeric library ID from the detail link, if any.
                    library_id = None
                    if symbol_link:
                        match = re.search(r'Id=(\d+)', symbol_link)
                        if match:
                            library_id = match.group(1)

                    libraries.append({
                        "library_symbol": symbol_text.strip(),
                        "name": name.strip(),
                        "city": city.strip(),
                        "province": province.strip(),
                        "library_id": library_id,
                        "detail_url": urljoin(BASE_URL, symbol_link) if symbol_link else None,
                        "status": "closed" if closed_superseded else "active"
                    })

            logger.info("  Extracted %d records", len(libraries))
            return libraries

        except Exception as e:
            # Best-effort: a failed page yields no records but does not abort
            # the whole scrape.
            logger.error("Error fetching page %s: %s", page_num, e)
            return []

    async def fetch_library_details(self, library: Dict) -> Dict:
        """
        Fetch detailed information for a single library.

        Args:
            library: Basic library info with detail_url

        Returns:
            The same dict, updated in place with all detail fields found
            (returned unchanged on error or missing URL)
        """
        if not library.get("detail_url"):
            logger.warning("No detail URL for %s", library.get('library_symbol'))
            return library

        try:
            await self.page.goto(library["detail_url"], wait_until="networkidle", timeout=30000)

            # The detail page typically presents fields as <dt>/<dd> pairs;
            # parse whatever pairs are present rather than a fixed schema.
            details = {}
            dts = await self.page.locator("dt").all()
            dds = await self.page.locator("dd").all()

            for dt, dd in zip(dts, dds):
                key = await dt.inner_text()
                value = await dd.inner_text()
                details[key.strip().rstrip(":")] = value.strip()

            # Merge details into library record
            library.update(details)
            logger.debug("  Fetched details for %s", library['library_symbol'])
            return library

        except Exception as e:
            # Best-effort: keep the basic record even if the detail fetch fails.
            logger.error("Error fetching details for %s: %s", library.get('library_symbol'), e)
            return library

    async def scrape_all_libraries(
        self,
        closed_superseded: bool = False,
        max_pages: Optional[int] = None
    ) -> List[Dict]:
        """
        Scrape all libraries (active or closed/superseded).

        Args:
            closed_superseded: If True, scrape closed/superseded libraries
            max_pages: Optional limit on number of pages (for testing)

        Returns:
            List of all library records, each enriched with detail fields
        """
        all_libraries = []

        # Page count is derived from the published totals, since the site
        # does not expose a machine-readable count.
        if closed_superseded:
            total_records = EXPECTED_CLOSED_RECORDS
            category = "closed/superseded"
        else:
            total_records = EXPECTED_ACTIVE_RECORDS
            category = "active"

        total_pages = (total_records + PAGE_SIZE - 1) // PAGE_SIZE  # Ceiling division
        if max_pages:
            total_pages = min(total_pages, max_pages)

        logger.info(
            "Scraping %s libraries: %d pages (%d records)",
            category, total_pages, total_records
        )

        # Phase 1: collect basic records from the paginated list.
        for page_num in range(1, total_pages + 1):
            libraries = await self.fetch_list_page(page_num, closed_superseded)
            all_libraries.extend(libraries)
            # Be polite - add delay between requests
            await asyncio.sleep(1)

        logger.info("Fetched %d %s library records", len(all_libraries), category)

        # Phase 2: enrich each record with its detail page (slow: one request
        # per library plus a politeness delay).
        logger.info("Fetching detailed information for %d libraries...", len(all_libraries))
        for i, library in enumerate(all_libraries, 1):
            if i % 100 == 0:
                logger.info("  Progress: %d/%d", i, len(all_libraries))
            await self.fetch_library_details(library)
            await asyncio.sleep(0.5)  # Be polite

        return all_libraries

    def _save_json(self, filename: str, payload: Dict) -> Path:
        """Write payload as pretty-printed UTF-8 JSON under output_dir; return the path."""
        path = self.output_dir / filename
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        return path

    async def run(self, test_mode: bool = False):
        """
        Main execution method.

        Scrapes both categories and writes three JSON files (active, closed,
        combined) to the output directory.

        Args:
            test_mode: If True, only fetch first 2 pages of each category
        """
        await self.start_browser()

        try:
            max_pages = 2 if test_mode else None

            # Scrape and save active libraries
            active_libraries = await self.scrape_all_libraries(
                closed_superseded=False,
                max_pages=max_pages
            )
            active_file = self._save_json("canadian_libraries_active.json", {
                "extraction_date": datetime.now().isoformat(),
                "source": "Library and Archives Canada - Canadian Library Directory",
                "source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
                "status": "active",
                "record_count": len(active_libraries),
                "libraries": active_libraries
            })
            logger.info("Saved %d active libraries to %s", len(active_libraries), active_file)

            # Scrape and save closed/superseded libraries
            closed_libraries = await self.scrape_all_libraries(
                closed_superseded=True,
                max_pages=max_pages
            )
            closed_file = self._save_json("canadian_libraries_closed.json", {
                "extraction_date": datetime.now().isoformat(),
                "source": "Library and Archives Canada - Canadian Library Directory",
                "source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
                "status": "closed_superseded",
                "record_count": len(closed_libraries),
                "libraries": closed_libraries
            })
            logger.info(
                "Saved %d closed/superseded libraries to %s",
                len(closed_libraries), closed_file
            )

            # Create combined file
            combined_file = self._save_json("canadian_libraries_all.json", {
                "extraction_date": datetime.now().isoformat(),
                "source": "Library and Archives Canada - Canadian Library Directory",
                "source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
                "total_records": len(active_libraries) + len(closed_libraries),
                "active_count": len(active_libraries),
                "closed_count": len(closed_libraries),
                "libraries": active_libraries + closed_libraries
            })
            logger.info("Saved combined dataset to %s", combined_file)

        finally:
            # Always release browser/driver resources, even on failure.
            await self.close_browser()


async def main():
    """Entry point."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Scrape Canadian ISIL database from Library and Archives Canada"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Test mode: only scrape first 2 pages of each category"
    )
    args = parser.parse_args()

    scraper = CanadianISILScraper()
    await scraper.run(test_mode=args.test)


if __name__ == "__main__":
    asyncio.run(main())