323 lines
12 KiB
Python
Executable file
323 lines
12 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Scraper for Canadian ISIL database from Library and Archives Canada.
|
|
|
|
This script extracts all library records (active and closed/superseded) from:
|
|
https://sigles-symbols.bac-lac.gc.ca/eng/Search
|
|
|
|
Total expected records:
|
|
- Active libraries: 6,520
|
|
- Closed/Superseded: 3,046
|
|
- Total: 9,566
|
|
|
|
Output: JSON files in data/isil/canada/
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
from datetime import datetime
|
|
from urllib.parse import urljoin
|
|
|
|
try:
    from playwright.async_api import async_playwright
except ImportError:
    # Playwright is a hard requirement for this scraper; fail fast with
    # install instructions rather than crashing later with a NameError.
    print("Error: playwright is not installed. Install with: pip install playwright && playwright install")
    # Use SystemExit instead of the site-provided exit() builtin: exit() is
    # intended for interactive use and is absent under `python -S`.
    raise SystemExit(1)
|
|
|
|
# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
BASE_URL = "https://sigles-symbols.bac-lac.gc.ca"
SEARCH_URL = f"{BASE_URL}/eng/Search/List"
# Relative output path, as documented in the module docstring. The previous
# value hard-coded one developer's absolute home-directory path
# (/Users/kempersc/...), which broke the script for anyone else.
OUTPUT_DIR = Path("data/isil/canada")
PAGE_SIZE = 100  # Maximum results per page accepted by the site
|
|
|
|
|
|
class CanadianISILScraper:
    """Scraper for Canadian ISIL/Library Symbol database.

    Drives a headless Chromium instance via Playwright: pages through the
    search-result listing tables, then visits each library's detail page to
    harvest its dt/dd fields, and finally writes the records to JSON files
    under ``output_dir``.
    """

    def __init__(self):
        # Playwright driver handle; kept on the instance so close_browser()
        # can stop the driver process, not just the browser (see below).
        self.playwright = None
        self.browser = None
        self.page = None
        self.output_dir = OUTPUT_DIR
        self.output_dir.mkdir(parents=True, exist_ok=True)

    async def start_browser(self):
        """Start the Playwright driver and open a headless Chromium page."""
        # Store the driver on self: the previous version kept it in a local
        # variable, so the driver process was never stopped (resource leak).
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()
        logger.info("Browser started")

    async def close_browser(self):
        """Close the browser and stop the Playwright driver."""
        if self.browser:
            await self.browser.close()
            logger.info("Browser closed")
        if self.playwright:
            # Stop the driver process as well; closing the browser alone
            # leaves it running.
            await self.playwright.stop()
            self.playwright = None

    async def fetch_list_page(
        self,
        page_num: int,
        closed_superseded: bool = False
    ) -> List[Dict]:
        """
        Fetch a single page of library listings.

        Args:
            page_num: Page number (1-indexed)
            closed_superseded: If True, fetch closed/superseded libraries

        Returns:
            List of library records with basic info; an empty list if the
            page could not be fetched or parsed.
        """
        url = (
            f"{SEARCH_URL}?Page={page_num}&PageSize={PAGE_SIZE}"
            f"&q=&Scope=&LibraryType=&Province="
            f"&LendingMonographs=false&PhotocopyMonographs=false"
            f"&LendingSerials=false&PhotocopySerials=false"
            f"&NoLendingCharge=false&RenewableMonographs=false"
            f"&NoPhotocopyCharge=false&RenewableSerials=false"
        )

        if closed_superseded:
            # NOTE: "Superceded" is the remote site's own spelling of this
            # query parameter -- do not "correct" it.
            url += "&ClosedSuperceded=True"

        logger.info(f"Fetching page {page_num} ({'closed' if closed_superseded else 'active'})...")

        try:
            await self.page.goto(url, wait_until="networkidle", timeout=60000)

            # Each search result is one row of the listing table.
            rows = await self.page.locator("table tbody tr").all()

            libraries = []
            for row in rows:
                cells = await row.locator("td").all()
                if len(cells) >= 4:
                    # Column layout: symbol (linked), name, city, province.
                    symbol_cell = cells[0]
                    symbol_link = await symbol_cell.locator("a").get_attribute("href")
                    symbol_text = await symbol_cell.inner_text()

                    name = await cells[1].inner_text()
                    city = await cells[2].inner_text()
                    province = await cells[3].inner_text()

                    # Extract library ID from detail link (e.g. "...?Id=123")
                    library_id = None
                    if symbol_link:
                        match = re.search(r'Id=(\d+)', symbol_link)
                        if match:
                            library_id = match.group(1)

                    libraries.append({
                        "library_symbol": symbol_text.strip(),
                        "name": name.strip(),
                        "city": city.strip(),
                        "province": province.strip(),
                        "library_id": library_id,
                        "detail_url": urljoin(BASE_URL, symbol_link) if symbol_link else None,
                        "status": "closed" if closed_superseded else "active"
                    })

            logger.info(f" Extracted {len(libraries)} records")
            return libraries

        except Exception as e:
            # Best effort: log and return an empty page so the overall
            # scrape keeps going past a transient failure.
            logger.error(f"Error fetching page {page_num}: {e}")
            return []

    async def fetch_library_details(self, library: Dict) -> Dict:
        """
        Fetch detailed information for a single library.

        The record is mutated in place (and also returned) with any dt/dd
        key-value pairs found on the detail page.

        Args:
            library: Basic library info with detail_url

        Returns:
            Complete library record with all available fields; unchanged on
            error or when no detail URL is available.
        """
        if not library.get("detail_url"):
            logger.warning(f"No detail URL for {library.get('library_symbol')}")
            return library

        try:
            await self.page.goto(library["detail_url"], wait_until="networkidle", timeout=30000)

            # The detail page presents its fields as definition lists
            # (dt/dd pairs); harvest every pair we can find.
            details = {}
            dts = await self.page.locator("dt").all()
            dds = await self.page.locator("dd").all()

            for dt, dd in zip(dts, dds):
                key = await dt.inner_text()
                value = await dd.inner_text()
                details[key.strip().rstrip(":")] = value.strip()

            # Merge details into library record
            library.update(details)

            logger.debug(f" Fetched details for {library['library_symbol']}")
            return library

        except Exception as e:
            logger.error(f"Error fetching details for {library.get('library_symbol')}: {e}")
            return library

    async def scrape_all_libraries(
        self,
        closed_superseded: bool = False,
        max_pages: Optional[int] = None
    ) -> List[Dict]:
        """
        Scrape all libraries (active or closed/superseded).

        Args:
            closed_superseded: If True, scrape closed/superseded libraries
            max_pages: Optional limit on number of pages (for testing)

        Returns:
            List of all library records, each enriched with detail fields.
        """
        all_libraries = []

        # Known record counts from the site (see module docstring); used only
        # to decide how many list pages to request.
        if closed_superseded:
            total_records = 3046
            category = "closed/superseded"
        else:
            total_records = 6520
            category = "active"

        total_pages = (total_records + PAGE_SIZE - 1) // PAGE_SIZE  # Ceiling division

        if max_pages:
            total_pages = min(total_pages, max_pages)

        logger.info(f"Scraping {category} libraries: {total_pages} pages ({total_records} records)")

        # Phase 1: collect the basic records from the listing pages.
        for page_num in range(1, total_pages + 1):
            libraries = await self.fetch_list_page(page_num, closed_superseded)
            all_libraries.extend(libraries)

            # Be polite - add delay between requests
            await asyncio.sleep(1)

        logger.info(f"Fetched {len(all_libraries)} {category} library records")

        # Phase 2: enrich each record from its detail page (slow!).
        logger.info(f"Fetching detailed information for {len(all_libraries)} libraries...")

        for i, library in enumerate(all_libraries, 1):
            if i % 100 == 0:
                logger.info(f" Progress: {i}/{len(all_libraries)}")

            await self.fetch_library_details(library)
            await asyncio.sleep(0.5)  # Be polite

        return all_libraries

    def _write_dataset(self, filename: str, payload: Dict) -> Path:
        """Write *payload* as pretty-printed UTF-8 JSON under output_dir.

        Returns the path of the file written. Factored out of run() where the
        same open/json.dump stanza was repeated three times.
        """
        path = self.output_dir / filename
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        return path

    async def run(self, test_mode: bool = False):
        """
        Main execution method.

        Scrapes active then closed/superseded libraries and writes three JSON
        files (active, closed, combined) into output_dir.

        Args:
            test_mode: If True, only fetch first 2 pages of each category
        """
        await self.start_browser()

        try:
            max_pages = 2 if test_mode else None

            # Scrape and save active libraries
            active_libraries = await self.scrape_all_libraries(
                closed_superseded=False,
                max_pages=max_pages
            )
            active_file = self._write_dataset("canadian_libraries_active.json", {
                "extraction_date": datetime.now().isoformat(),
                "source": "Library and Archives Canada - Canadian Library Directory",
                "source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
                "status": "active",
                "record_count": len(active_libraries),
                "libraries": active_libraries
            })
            logger.info(f"Saved {len(active_libraries)} active libraries to {active_file}")

            # Scrape and save closed/superseded libraries
            closed_libraries = await self.scrape_all_libraries(
                closed_superseded=True,
                max_pages=max_pages
            )
            closed_file = self._write_dataset("canadian_libraries_closed.json", {
                "extraction_date": datetime.now().isoformat(),
                "source": "Library and Archives Canada - Canadian Library Directory",
                "source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
                "status": "closed_superseded",
                "record_count": len(closed_libraries),
                "libraries": closed_libraries
            })
            logger.info(f"Saved {len(closed_libraries)} closed/superseded libraries to {closed_file}")

            # Create combined file
            combined_file = self._write_dataset("canadian_libraries_all.json", {
                "extraction_date": datetime.now().isoformat(),
                "source": "Library and Archives Canada - Canadian Library Directory",
                "source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
                "total_records": len(active_libraries) + len(closed_libraries),
                "active_count": len(active_libraries),
                "closed_count": len(closed_libraries),
                "libraries": active_libraries + closed_libraries
            })
            logger.info(f"Saved combined dataset to {combined_file}")

        finally:
            await self.close_browser()
|
|
|
|
|
|
async def main():
    """Entry point: parse CLI flags and run the scraper."""
    import argparse

    arg_parser = argparse.ArgumentParser(
        description="Scrape Canadian ISIL database from Library and Archives Canada"
    )
    arg_parser.add_argument(
        "--test",
        action="store_true",
        help="Test mode: only scrape first 2 pages of each category"
    )
    options = arg_parser.parse_args()

    # Drive the full scrape (active + closed/superseded categories).
    await CanadianISILScraper().run(test_mode=options.test)
|
|
|
|
|
|
# Script entry point: run the async scraper to completion.
if __name__ == "__main__":
    asyncio.run(main())
|