glam/scripts/scrapers/scrape_canadian_isil.py
2025-11-19 23:25:22 +01:00

323 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Scraper for Canadian ISIL database from Library and Archives Canada.
This script extracts all library records (active and closed/superseded) from:
https://sigles-symbols.bac-lac.gc.ca/eng/Search
Total expected records:
- Active libraries: 6,520
- Closed/Superseded: 3,046
- Total: 9,566
Output: JSON files in data/isil/canada/
"""
import asyncio
import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin

try:
    from playwright.async_api import async_playwright
except ImportError:
    print("Error: playwright is not installed. Install with: pip install playwright && playwright install")
    exit(1)
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Constants
BASE_URL = "https://sigles-symbols.bac-lac.gc.ca"
SEARCH_URL = f"{BASE_URL}/eng/Search/List"
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/canada")
PAGE_SIZE = 100 # Maximum results per page
class CanadianISILScraper:
"""Scraper for Canadian ISIL/Library Symbol database."""
def __init__(self):
self.browser = None
self.page = None
self.output_dir = OUTPUT_DIR
self.output_dir.mkdir(parents=True, exist_ok=True)
async def start_browser(self):
"""Initialize browser and page."""
playwright = await async_playwright().start()
self.browser = await playwright.chromium.launch(headless=True)
self.page = await self.browser.new_page()
logger.info("Browser started")
async def close_browser(self):
"""Close browser."""
if self.browser:
await self.browser.close()
logger.info("Browser closed")
async def fetch_list_page(
self,
page_num: int,
closed_superseded: bool = False
) -> List[Dict]:
"""
Fetch a single page of library listings.
Args:
page_num: Page number (1-indexed)
closed_superseded: If True, fetch closed/superseded libraries
Returns:
List of library records with basic info
"""
url = (
f"{SEARCH_URL}?Page={page_num}&PageSize={PAGE_SIZE}"
f"&q=&Scope=&LibraryType=&Province="
f"&LendingMonographs=false&PhotocopyMonographs=false"
f"&LendingSerials=false&PhotocopySerials=false"
f"&NoLendingCharge=false&RenewableMonographs=false"
f"&NoPhotocopyCharge=false&RenewableSerials=false"
)
if closed_superseded:
url += "&ClosedSuperceded=True"
logger.info(f"Fetching page {page_num} ({'closed' if closed_superseded else 'active'})...")
try:
await self.page.goto(url, wait_until="networkidle", timeout=60000)
# Extract table rows
rows = await self.page.locator("table tbody tr").all()
libraries = []
for row in rows:
cells = await row.locator("td").all()
if len(cells) >= 4:
symbol_cell = cells[0]
symbol_link = await symbol_cell.locator("a").get_attribute("href")
symbol_text = await symbol_cell.inner_text()
name = await cells[1].inner_text()
city = await cells[2].inner_text()
province = await cells[3].inner_text()
# Extract library ID from detail link
library_id = None
if symbol_link:
match = re.search(r'Id=(\d+)', symbol_link)
if match:
library_id = match.group(1)
libraries.append({
"library_symbol": symbol_text.strip(),
"name": name.strip(),
"city": city.strip(),
"province": province.strip(),
"library_id": library_id,
"detail_url": urljoin(BASE_URL, symbol_link) if symbol_link else None,
"status": "closed" if closed_superseded else "active"
})
logger.info(f" Extracted {len(libraries)} records")
return libraries
except Exception as e:
logger.error(f"Error fetching page {page_num}: {e}")
return []
async def fetch_library_details(self, library: Dict) -> Dict:
"""
Fetch detailed information for a single library.
Args:
library: Basic library info with detail_url
Returns:
Complete library record with all available fields
"""
if not library.get("detail_url"):
logger.warning(f"No detail URL for {library.get('library_symbol')}")
return library
try:
await self.page.goto(library["detail_url"], wait_until="networkidle", timeout=30000)
# Extract all detail fields (structure varies, we'll parse what we can)
# The detail page typically has definition lists (dt/dd pairs)
details = {}
# Try to find all key-value pairs
dts = await self.page.locator("dt").all()
dds = await self.page.locator("dd").all()
for dt, dd in zip(dts, dds):
key = await dt.inner_text()
value = await dd.inner_text()
details[key.strip().rstrip(":")] = value.strip()
# Merge details into library record
library.update(details)
logger.debug(f" Fetched details for {library['library_symbol']}")
return library
except Exception as e:
logger.error(f"Error fetching details for {library.get('library_symbol')}: {e}")
return library
async def scrape_all_libraries(
self,
closed_superseded: bool = False,
max_pages: Optional[int] = None
) -> List[Dict]:
"""
Scrape all libraries (active or closed/superseded).
Args:
closed_superseded: If True, scrape closed/superseded libraries
max_pages: Optional limit on number of pages (for testing)
Returns:
List of all library records
"""
all_libraries = []
# Determine total pages needed
if closed_superseded:
total_records = 3046
category = "closed/superseded"
else:
total_records = 6520
category = "active"
total_pages = (total_records + PAGE_SIZE - 1) // PAGE_SIZE # Ceiling division
if max_pages:
total_pages = min(total_pages, max_pages)
logger.info(f"Scraping {category} libraries: {total_pages} pages ({total_records} records)")
# Fetch list pages
for page_num in range(1, total_pages + 1):
libraries = await self.fetch_list_page(page_num, closed_superseded)
all_libraries.extend(libraries)
# Be polite - add delay between requests
await asyncio.sleep(1)
logger.info(f"Fetched {len(all_libraries)} {category} library records")
# Now fetch details for each library (this will take a while!)
logger.info(f"Fetching detailed information for {len(all_libraries)} libraries...")
for i, library in enumerate(all_libraries, 1):
if i % 100 == 0:
logger.info(f" Progress: {i}/{len(all_libraries)}")
await self.fetch_library_details(library)
await asyncio.sleep(0.5) # Be polite
return all_libraries
async def run(self, test_mode: bool = False):
"""
Main execution method.
Args:
test_mode: If True, only fetch first 2 pages of each category
"""
await self.start_browser()
try:
# Scrape active libraries
max_pages = 2 if test_mode else None
active_libraries = await self.scrape_all_libraries(
closed_superseded=False,
max_pages=max_pages
)
# Save active libraries
active_file = self.output_dir / "canadian_libraries_active.json"
with open(active_file, 'w', encoding='utf-8') as f:
json.dump({
"extraction_date": datetime.now().isoformat(),
"source": "Library and Archives Canada - Canadian Library Directory",
"source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
"status": "active",
"record_count": len(active_libraries),
"libraries": active_libraries
}, f, indent=2, ensure_ascii=False)
logger.info(f"Saved {len(active_libraries)} active libraries to {active_file}")
# Scrape closed/superseded libraries
closed_libraries = await self.scrape_all_libraries(
closed_superseded=True,
max_pages=max_pages
)
# Save closed libraries
closed_file = self.output_dir / "canadian_libraries_closed.json"
with open(closed_file, 'w', encoding='utf-8') as f:
json.dump({
"extraction_date": datetime.now().isoformat(),
"source": "Library and Archives Canada - Canadian Library Directory",
"source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
"status": "closed_superseded",
"record_count": len(closed_libraries),
"libraries": closed_libraries
}, f, indent=2, ensure_ascii=False)
logger.info(f"Saved {len(closed_libraries)} closed/superseded libraries to {closed_file}")
# Create combined file
combined_file = self.output_dir / "canadian_libraries_all.json"
with open(combined_file, 'w', encoding='utf-8') as f:
json.dump({
"extraction_date": datetime.now().isoformat(),
"source": "Library and Archives Canada - Canadian Library Directory",
"source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
"total_records": len(active_libraries) + len(closed_libraries),
"active_count": len(active_libraries),
"closed_count": len(closed_libraries),
"libraries": active_libraries + closed_libraries
}, f, indent=2, ensure_ascii=False)
logger.info(f"Saved combined dataset to {combined_file}")
finally:
await self.close_browser()
async def main():
"""Entry point."""
import argparse
parser = argparse.ArgumentParser(
description="Scrape Canadian ISIL database from Library and Archives Canada"
)
parser.add_argument(
"--test",
action="store_true",
help="Test mode: only scrape first 2 pages of each category"
)
args = parser.parse_args()
scraper = CanadianISILScraper()
await scraper.run(test_mode=args.test)
if __name__ == "__main__":
asyncio.run(main())