282 lines
10 KiB
Python
Executable file
282 lines
10 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Fast scraper for Canadian ISIL database from Library and Archives Canada.
|
|
|
|
This version only scrapes the list pages (not detail pages) for maximum speed.
|
|
A separate script can fetch details later if needed.
|
|
|
|
Total expected records:
|
|
- Active libraries: 6,520
|
|
- Closed/Superseded: 3,046
|
|
- Total: 9,566
|
|
|
|
Output: JSON files in data/isil/canada/
|
|
"""
|
|
|
|
import asyncio
import json
import logging
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin
|
|
|
|
try:
|
|
from playwright.async_api import async_playwright
|
|
except ImportError:
|
|
print("Error: playwright is not installed. Install with: pip install playwright && playwright install")
|
|
exit(1)
|
|
|
|
# Setup logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Constants
|
|
BASE_URL = "https://sigles-symbols.bac-lac.gc.ca"
|
|
SEARCH_URL = f"{BASE_URL}/eng/Search/List"
|
|
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/canada")
|
|
PAGE_SIZE = 100 # Maximum results per page
|
|
|
|
|
|
class FastCanadianISILScraper:
|
|
"""Fast scraper for Canadian ISIL/Library Symbol database (list pages only)."""
|
|
|
|
def __init__(self):
|
|
self.browser = None
|
|
self.page = None
|
|
self.output_dir = OUTPUT_DIR
|
|
self.output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
async def start_browser(self):
|
|
"""Initialize browser and page."""
|
|
playwright = await async_playwright().start()
|
|
self.browser = await playwright.chromium.launch(headless=True)
|
|
self.page = await self.browser.new_page()
|
|
logger.info("Browser started")
|
|
|
|
async def close_browser(self):
|
|
"""Close browser."""
|
|
if self.browser:
|
|
await self.browser.close()
|
|
logger.info("Browser closed")
|
|
|
|
async def fetch_list_page(
|
|
self,
|
|
page_num: int,
|
|
closed_superseded: bool = False
|
|
) -> List[Dict]:
|
|
"""
|
|
Fetch a single page of library listings.
|
|
|
|
Args:
|
|
page_num: Page number (1-indexed)
|
|
closed_superseded: If True, fetch closed/superseded libraries
|
|
|
|
Returns:
|
|
List of library records with basic info
|
|
"""
|
|
url = (
|
|
f"{SEARCH_URL}?Page={page_num}&PageSize={PAGE_SIZE}"
|
|
f"&q=&Scope=&LibraryType=&Province="
|
|
f"&LendingMonographs=false&PhotocopyMonographs=false"
|
|
f"&LendingSerials=false&PhotocopySerials=false"
|
|
f"&NoLendingCharge=false&RenewableMonographs=false"
|
|
f"&NoPhotocopyCharge=false&RenewableSerials=false"
|
|
)
|
|
|
|
if closed_superseded:
|
|
url += "&ClosedSuperceded=True"
|
|
|
|
logger.info(f"Fetching page {page_num} ({'closed' if closed_superseded else 'active'})...")
|
|
|
|
try:
|
|
await self.page.goto(url, wait_until="networkidle", timeout=60000)
|
|
|
|
# Extract table rows
|
|
rows = await self.page.locator("table tbody tr").all()
|
|
|
|
libraries = []
|
|
for row in rows:
|
|
cells = await row.locator("td").all()
|
|
if len(cells) >= 4:
|
|
symbol_cell = cells[0]
|
|
symbol_link = await symbol_cell.locator("a").get_attribute("href")
|
|
symbol_text = await symbol_cell.inner_text()
|
|
|
|
name = await cells[1].inner_text()
|
|
city = await cells[2].inner_text()
|
|
province = await cells[3].inner_text()
|
|
|
|
# Extract library ID from detail link
|
|
library_id = None
|
|
if symbol_link:
|
|
match = re.search(r'Id=(\d+)', symbol_link)
|
|
if match:
|
|
library_id = match.group(1)
|
|
|
|
# Generate ISIL code (Canadian format: CA-XXXX where XXXX is the symbol)
|
|
isil_code = f"CA-{symbol_text.strip()}"
|
|
|
|
libraries.append({
|
|
"isil_code": isil_code,
|
|
"library_symbol": symbol_text.strip(),
|
|
"name": name.strip(),
|
|
"city": city.strip(),
|
|
"province": province.strip(),
|
|
"country": "CA",
|
|
"library_id": library_id,
|
|
"detail_url": urljoin(BASE_URL, symbol_link) if symbol_link else None,
|
|
"status": "closed" if closed_superseded else "active"
|
|
})
|
|
|
|
logger.info(f" Extracted {len(libraries)} records")
|
|
return libraries
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error fetching page {page_num}: {e}")
|
|
return []
|
|
|
|
async def scrape_all_libraries(
|
|
self,
|
|
closed_superseded: bool = False,
|
|
max_pages = None
|
|
) -> List[Dict]:
|
|
"""
|
|
Scrape all libraries (active or closed/superseded) from list pages only.
|
|
|
|
Args:
|
|
closed_superseded: If True, scrape closed/superseded libraries
|
|
max_pages: Optional limit on number of pages (for testing)
|
|
|
|
Returns:
|
|
List of all library records
|
|
"""
|
|
all_libraries = []
|
|
|
|
# Determine total pages needed
|
|
if closed_superseded:
|
|
total_records = 3046
|
|
category = "closed/superseded"
|
|
else:
|
|
total_records = 6520
|
|
category = "active"
|
|
|
|
total_pages = (total_records + PAGE_SIZE - 1) // PAGE_SIZE # Ceiling division
|
|
|
|
if max_pages:
|
|
total_pages = min(total_pages, max_pages)
|
|
|
|
logger.info(f"Scraping {category} libraries: {total_pages} pages (~{total_records} records)")
|
|
|
|
# Fetch list pages
|
|
for page_num in range(1, total_pages + 1):
|
|
libraries = await self.fetch_list_page(page_num, closed_superseded)
|
|
all_libraries.extend(libraries)
|
|
|
|
# Be polite - add delay between requests
|
|
await asyncio.sleep(0.5)
|
|
|
|
logger.info(f"Fetched {len(all_libraries)} {category} library records")
|
|
return all_libraries
|
|
|
|
async def run(self, test_mode: bool = False):
|
|
"""
|
|
Main execution method.
|
|
|
|
Args:
|
|
test_mode: If True, only fetch first 2 pages of each category
|
|
"""
|
|
await self.start_browser()
|
|
|
|
try:
|
|
# Scrape active libraries
|
|
max_pages = 2 if test_mode else None
|
|
active_libraries = await self.scrape_all_libraries(
|
|
closed_superseded=False,
|
|
max_pages=max_pages
|
|
)
|
|
|
|
# Save active libraries
|
|
active_file = self.output_dir / "canadian_libraries_active.json"
|
|
with open(active_file, 'w', encoding='utf-8') as f:
|
|
json.dump({
|
|
"extraction_date": datetime.now().isoformat(),
|
|
"source": "Library and Archives Canada - Canadian Library Directory",
|
|
"source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
|
|
"status": "active",
|
|
"record_count": len(active_libraries),
|
|
"libraries": active_libraries
|
|
}, f, indent=2, ensure_ascii=False)
|
|
|
|
logger.info(f"Saved {len(active_libraries)} active libraries to {active_file}")
|
|
|
|
# Scrape closed/superseded libraries
|
|
closed_libraries = await self.scrape_all_libraries(
|
|
closed_superseded=True,
|
|
max_pages=max_pages
|
|
)
|
|
|
|
# Save closed libraries
|
|
closed_file = self.output_dir / "canadian_libraries_closed.json"
|
|
with open(closed_file, 'w', encoding='utf-8') as f:
|
|
json.dump({
|
|
"extraction_date": datetime.now().isoformat(),
|
|
"source": "Library and Archives Canada - Canadian Library Directory",
|
|
"source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
|
|
"status": "closed_superseded",
|
|
"record_count": len(closed_libraries),
|
|
"libraries": closed_libraries
|
|
}, f, indent=2, ensure_ascii=False)
|
|
|
|
logger.info(f"Saved {len(closed_libraries)} closed/superseded libraries to {closed_file}")
|
|
|
|
# Create combined file
|
|
combined_file = self.output_dir / "canadian_libraries_all.json"
|
|
with open(combined_file, 'w', encoding='utf-8') as f:
|
|
json.dump({
|
|
"extraction_date": datetime.now().isoformat(),
|
|
"source": "Library and Archives Canada - Canadian Library Directory",
|
|
"source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
|
|
"total_records": len(active_libraries) + len(closed_libraries),
|
|
"active_count": len(active_libraries),
|
|
"closed_count": len(closed_libraries),
|
|
"libraries": active_libraries + closed_libraries
|
|
}, f, indent=2, ensure_ascii=False)
|
|
|
|
logger.info(f"Saved combined dataset ({len(active_libraries) + len(closed_libraries)} records) to {combined_file}")
|
|
logger.info("\n" + "="*60)
|
|
logger.info("EXTRACTION COMPLETE!")
|
|
logger.info(f"Active libraries: {len(active_libraries)}")
|
|
logger.info(f"Closed libraries: {len(closed_libraries)}")
|
|
logger.info(f"Total: {len(active_libraries) + len(closed_libraries)}")
|
|
logger.info("="*60)
|
|
|
|
finally:
|
|
await self.close_browser()
|
|
|
|
|
|
async def main():
|
|
"""Entry point."""
|
|
import argparse
|
|
|
|
parser = argparse.ArgumentParser(
|
|
description="Fast scraper for Canadian ISIL database (list pages only)"
|
|
)
|
|
parser.add_argument(
|
|
"--test",
|
|
action="store_true",
|
|
help="Test mode: only scrape first 2 pages of each category"
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
scraper = FastCanadianISILScraper()
|
|
await scraper.run(test_mode=args.test)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(main())
|