#!/usr/bin/env python3
"""
Fast scraper for Canadian ISIL database from Library and Archives Canada.

This version only scrapes the list pages (not detail pages) for maximum speed.
A separate script can fetch details later if needed.

Total expected records:
- Active libraries: 6,520
- Closed/Superseded: 3,046
- Total: 9,566

Output: JSON files in data/isil/canada/
"""

import argparse
import asyncio
import json
import logging
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin

try:
    from playwright.async_api import async_playwright
except ImportError:
    # Do not kill the interpreter at import time: that makes the module
    # impossible to import (e.g. for its pure helpers). The failure is reported
    # by main() with the same message and exit code, and by start_browser().
    async_playwright = None

PLAYWRIGHT_MISSING_MESSAGE = (
    "Error: playwright is not installed. "
    "Install with: pip install playwright && playwright install"
)

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
BASE_URL = "https://sigles-symbols.bac-lac.gc.ca"
SEARCH_URL = f"{BASE_URL}/eng/Search/List"
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/canada")
PAGE_SIZE = 100  # Maximum results per page

# Record totals used to derive how many list pages to request. These are the
# figures quoted in the module docstring; they are a snapshot, not a live count.
EXPECTED_ACTIVE_RECORDS = 6520
EXPECTED_CLOSED_RECORDS = 3046

FETCH_ATTEMPTS = 3           # Tries per list page before giving up on it
RETRY_DELAY_SECONDS = 2.0    # Base delay between tries (multiplied by the attempt number)
REQUEST_DELAY_SECONDS = 0.5  # Politeness delay between consecutive list pages

# Metadata written into every output file.
SOURCE_NAME = "Library and Archives Canada - Canadian Library Directory"
SOURCE_URL = "https://sigles-symbols.bac-lac.gc.ca/eng/Search"

# Pulls the numeric record id out of a detail link such as "...?Id=123".
LIBRARY_ID_PATTERN = re.compile(r'Id=(\d+)')


class FastCanadianISILScraper:
    """Fast scraper for Canadian ISIL/Library Symbol database (list pages only)."""

    def __init__(self, output_dir: Path = OUTPUT_DIR):
        """
        Prepare the scraper and make sure the output directory exists.

        Args:
            output_dir: Directory the JSON files are written to. It is created
                (including parents) if missing. Defaults to OUTPUT_DIR.
        """
        self.playwright = None
        self.browser = None
        self.page = None
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    async def start_browser(self):
        """
        Initialize browser and page.

        Raises:
            RuntimeError: If the playwright package could not be imported.
        """
        if async_playwright is None:
            raise RuntimeError(PLAYWRIGHT_MISSING_MESSAGE)

        # Keep the driver handle: a driver obtained via start() has to be
        # stopped explicitly, which close_browser() does.
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()
        logger.info("Browser started")

    async def close_browser(self):
        """Close the browser and stop the Playwright driver, if they were started."""
        browser, driver = self.browser, self.playwright
        # Drop the references first so a second call is a harmless no-op.
        self.browser = self.page = self.playwright = None
        try:
            if browser:
                await browser.close()
                logger.info("Browser closed")
        finally:
            # Stop the driver even when closing the browser failed.
            if driver:
                await driver.stop()

    @staticmethod
    def _build_list_url(page_num: int, closed_superseded: bool = False) -> str:
        """Return the list-page URL for `page_num` (1-indexed) with all filters unset."""
        url = (
            f"{SEARCH_URL}?Page={page_num}&PageSize={PAGE_SIZE}"
            f"&q=&Scope=&LibraryType=&Province="
            f"&LendingMonographs=false&PhotocopyMonographs=false"
            f"&LendingSerials=false&PhotocopySerials=false"
            f"&NoLendingCharge=false&RenewableMonographs=false"
            f"&NoPhotocopyCharge=false&RenewableSerials=false"
        )
        if closed_superseded:
            # NOTE(review): spelling kept exactly as the request has always been
            # sent; presumably it is the site's own parameter name - verify
            # against the site before "correcting" it.
            url += "&ClosedSuperceded=True"
        return url

    @staticmethod
    def _page_count(total_records: int, max_pages: Optional[int] = None) -> int:
        """
        Return how many list pages to request.

        Args:
            total_records: Number of records expected in the category.
            max_pages: Optional cap on the result. Any falsy value (None or 0)
                means "no cap".
        """
        total_pages = (total_records + PAGE_SIZE - 1) // PAGE_SIZE  # Ceiling division
        if max_pages:
            total_pages = min(total_pages, max_pages)
        return total_pages

    @staticmethod
    def _build_record(
        symbol_text: str,
        name: str,
        city: str,
        province: str,
        symbol_link: Optional[str],
        closed_superseded: bool
    ) -> Dict:
        """
        Build one output record from the raw cell texts of a table row.

        Args:
            symbol_text: Text of the first cell (the library symbol).
            name: Text of the second cell.
            city: Text of the third cell.
            province: Text of the fourth cell.
            symbol_link: href of the link in the symbol cell, or None when the
                cell has no link.
            closed_superseded: Whether the row came from the closed/superseded list.

        Returns:
            Record dict; "library_id" and "detail_url" are None when they
            cannot be derived from `symbol_link`.
        """
        symbol = symbol_text.strip()

        # Extract library ID from detail link
        library_id = None
        if symbol_link:
            match = LIBRARY_ID_PATTERN.search(symbol_link)
            if match:
                library_id = match.group(1)

        return {
            # Generated ISIL code: "CA-" followed by the library symbol.
            "isil_code": f"CA-{symbol}",
            "library_symbol": symbol,
            "name": name.strip(),
            "city": city.strip(),
            "province": province.strip(),
            "country": "CA",
            "library_id": library_id,
            "detail_url": urljoin(BASE_URL, symbol_link) if symbol_link else None,
            "status": "closed" if closed_superseded else "active"
        }

    async def _extract_page_records(self, url: str, closed_superseded: bool) -> List[Dict]:
        """Load `url` and convert every table row with at least four cells into a record."""
        await self.page.goto(url, wait_until="networkidle", timeout=60000)

        # Extract table rows
        rows = await self.page.locator("table tbody tr").all()

        libraries = []
        for row in rows:
            cells = await row.locator("td").all()
            if len(cells) < 4:
                continue

            symbol_cell = cells[0]

            # Only read the href when a link exists: calling get_attribute on a
            # locator that matches nothing waits and then raises, which would
            # discard every row of the page instead of just this link.
            anchor = symbol_cell.locator("a")
            symbol_link = None
            if await anchor.count() > 0:
                symbol_link = await anchor.first.get_attribute("href")

            symbol_text = await symbol_cell.inner_text()
            name = await cells[1].inner_text()
            city = await cells[2].inner_text()
            province = await cells[3].inner_text()

            libraries.append(
                self._build_record(
                    symbol_text, name, city, province, symbol_link, closed_superseded
                )
            )

        return libraries

    async def fetch_list_page(
        self,
        page_num: int,
        closed_superseded: bool = False
    ) -> List[Dict]:
        """
        Fetch a single page of library listings.

        A failing page is retried up to FETCH_ATTEMPTS times; errors are logged,
        never raised.

        Args:
            page_num: Page number (1-indexed)
            closed_superseded: If True, fetch closed/superseded libraries

        Returns:
            List of library records with basic info; an empty list when the
            page has no rows or every attempt failed.
        """
        url = self._build_list_url(page_num, closed_superseded)

        logger.info(f"Fetching page {page_num} ({'closed' if closed_superseded else 'active'})...")

        for attempt in range(1, FETCH_ATTEMPTS + 1):
            try:
                libraries = await self._extract_page_records(url, closed_superseded)
            except Exception as e:
                # Best-effort boundary: one bad page must not abort the whole run.
                logger.error(
                    f"Error fetching page {page_num} "
                    f"(attempt {attempt}/{FETCH_ATTEMPTS}): {e}"
                )
                if attempt < FETCH_ATTEMPTS:
                    await asyncio.sleep(RETRY_DELAY_SECONDS * attempt)
            else:
                logger.info(f"  Extracted {len(libraries)} records")
                return libraries

        return []

    async def scrape_all_libraries(
        self,
        closed_superseded: bool = False,
        max_pages: Optional[int] = None
    ) -> List[Dict]:
        """
        Scrape all libraries (active or closed/superseded) from list pages only.

        The number of pages is derived from the expected record totals, so the
        method warns when the result looks incomplete: pages that produced no
        records, or a full final page (a hint that the totals are out of date).

        Args:
            closed_superseded: If True, scrape closed/superseded libraries
            max_pages: Optional limit on number of pages (for testing)

        Returns:
            List of all library records
        """
        # Determine total pages needed
        if closed_superseded:
            total_records = EXPECTED_CLOSED_RECORDS
            category = "closed/superseded"
        else:
            total_records = EXPECTED_ACTIVE_RECORDS
            category = "active"

        total_pages = self._page_count(total_records, max_pages)

        logger.info(f"Scraping {category} libraries: {total_pages} pages (~{total_records} records)")

        all_libraries = []
        empty_pages = []
        libraries = []

        # Fetch list pages
        for page_num in range(1, total_pages + 1):
            libraries = await self.fetch_list_page(page_num, closed_superseded)
            if not libraries:
                empty_pages.append(page_num)
            all_libraries.extend(libraries)

            # Be polite - add delay between requests
            await asyncio.sleep(REQUEST_DELAY_SECONDS)

        if empty_pages:
            logger.warning(
                f"No records from {category} page(s) {empty_pages}; "
                f"the dataset may be incomplete"
            )

        # Only meaningful when the run was not cut short by max_pages.
        if total_pages == self._page_count(total_records) and len(libraries) >= PAGE_SIZE:
            logger.warning(
                f"Last {category} page was full; the expected total of "
                f"{total_records} may be out of date and records may be missing"
            )

        logger.info(f"Fetched {len(all_libraries)} {category} library records")
        return all_libraries

    def _save_dataset(self, filename: str, summary: Dict, libraries: List[Dict]) -> Path:
        """
        Write one dataset file into the output directory and return its path.

        Args:
            filename: File name inside `self.output_dir`.
            summary: File-specific header fields, placed after the common
                metadata and before the records.
            libraries: Records stored under the "libraries" key.
        """
        target = self.output_dir / filename
        payload = {
            "extraction_date": datetime.now().isoformat(),
            "source": SOURCE_NAME,
            "source_url": SOURCE_URL,
            **summary,
            "libraries": libraries
        }
        with open(target, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        return target

    async def run(self, test_mode: bool = False):
        """
        Main execution method.

        Scrapes both categories and writes three files: active, closed and a
        combined dataset. The browser is always shut down, even on failure.

        Args:
            test_mode: If True, only fetch first 2 pages of each category
        """
        max_pages = 2 if test_mode else None

        try:
            # Started inside the try so a failed launch still gets cleaned up.
            await self.start_browser()

            # Scrape active libraries
            active_libraries = await self.scrape_all_libraries(
                closed_superseded=False,
                max_pages=max_pages
            )

            # Save active libraries
            active_file = self._save_dataset(
                "canadian_libraries_active.json",
                {"status": "active", "record_count": len(active_libraries)},
                active_libraries
            )
            logger.info(f"Saved {len(active_libraries)} active libraries to {active_file}")

            # Scrape closed/superseded libraries
            closed_libraries = await self.scrape_all_libraries(
                closed_superseded=True,
                max_pages=max_pages
            )

            # Save closed libraries
            closed_file = self._save_dataset(
                "canadian_libraries_closed.json",
                {"status": "closed_superseded", "record_count": len(closed_libraries)},
                closed_libraries
            )
            logger.info(f"Saved {len(closed_libraries)} closed/superseded libraries to {closed_file}")

            # Create combined file
            total_count = len(active_libraries) + len(closed_libraries)
            combined_file = self._save_dataset(
                "canadian_libraries_all.json",
                {
                    "total_records": total_count,
                    "active_count": len(active_libraries),
                    "closed_count": len(closed_libraries)
                },
                active_libraries + closed_libraries
            )
            logger.info(f"Saved combined dataset ({total_count} records) to {combined_file}")

            logger.info("\n" + "="*60)
            logger.info("EXTRACTION COMPLETE!")
            logger.info(f"Active libraries: {len(active_libraries)}")
            logger.info(f"Closed libraries: {len(closed_libraries)}")
            logger.info(f"Total: {total_count}")
            logger.info("="*60)

        finally:
            await self.close_browser()


async def main():
    """Entry point: parse command-line options and run the scraper."""
    parser = argparse.ArgumentParser(
        description="Fast scraper for Canadian ISIL database (list pages only)"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Test mode: only scrape first 2 pages of each category"
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=OUTPUT_DIR,
        help="Directory for the JSON output files (default: %(default)s)"
    )

    args = parser.parse_args()

    if async_playwright is None:
        print(PLAYWRIGHT_MISSING_MESSAGE)
        sys.exit(1)

    scraper = FastCanadianISILScraper(output_dir=args.output_dir)
    await scraper.run(test_mode=args.test)


if __name__ == "__main__":
    asyncio.run(main())