#!/usr/bin/env python3
"""
Scraper for Canadian ISIL database from Library and Archives Canada.

This script extracts all library records (active and closed/superseded) from:
https://sigles-symbols.bac-lac.gc.ca/eng/Search

Total expected records:
- Active libraries: 6,520
- Closed/Superseded: 3,046
- Total: 9,566

Output: JSON files in data/isil/canada/
"""

import asyncio
import json
import logging
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin

try:
    from playwright.async_api import async_playwright
except ImportError:
    print("Error: playwright is not installed. Install with: pip install playwright && playwright install")
    sys.exit(1)

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
BASE_URL = "https://sigles-symbols.bac-lac.gc.ca"
SEARCH_URL = f"{BASE_URL}/eng/Search/List"
# NOTE(review): user-specific absolute path — consider making this configurable.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/canada")
PAGE_SIZE = 100  # Maximum results per page

# Published record counts (see module docstring); used only to compute how
# many list pages to request for each category.
EXPECTED_ACTIVE_RECORDS = 6520
EXPECTED_CLOSED_RECORDS = 3046


class CanadianISILScraper:
    """Scraper for Canadian ISIL/Library Symbol database."""

    def __init__(self):
        # Playwright driver handle, browser, and the single page used for
        # all navigation. Populated by start_browser().
        self.playwright = None
        self.browser = None
        self.page = None
        self.output_dir = OUTPUT_DIR
        self.output_dir.mkdir(parents=True, exist_ok=True)

    async def start_browser(self):
        """Initialize the Playwright driver, browser, and page."""
        # Keep the driver handle on self so close_browser() can stop it —
        # the original discarded it, leaking the driver process.
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()
        logger.info("Browser started")

    async def close_browser(self):
        """Close the browser and stop the Playwright driver."""
        if self.browser:
            await self.browser.close()
            logger.info("Browser closed")
        if self.playwright:
            # Stop the driver subprocess as well (fixes resource leak).
            await self.playwright.stop()

    async def fetch_list_page(
        self,
        page_num: int,
        closed_superseded: bool = False
    ) -> List[Dict]:
        """
        Fetch a single page of library listings.

        Args:
            page_num: Page number (1-indexed)
            closed_superseded: If True, fetch closed/superseded libraries

        Returns:
            List of library records with basic info (empty list on error)
        """
        url = (
            f"{SEARCH_URL}?Page={page_num}&PageSize={PAGE_SIZE}"
            f"&q=&Scope=&LibraryType=&Province="
            f"&LendingMonographs=false&PhotocopyMonographs=false"
            f"&LendingSerials=false&PhotocopySerials=false"
            f"&NoLendingCharge=false&RenewableMonographs=false"
            f"&NoPhotocopyCharge=false&RenewableSerials=false"
        )
        if closed_superseded:
            # "Superceded" spelling matches the server-side parameter name —
            # do not "correct" it.
            url += "&ClosedSuperceded=True"

        logger.info(
            "Fetching page %s (%s)...",
            page_num,
            'closed' if closed_superseded else 'active'
        )

        try:
            await self.page.goto(url, wait_until="networkidle", timeout=60000)

            # Each result row is a <tr> with symbol / name / city / province cells.
            rows = await self.page.locator("table tbody tr").all()

            libraries = []
            for row in rows:
                cells = await row.locator("td").all()
                if len(cells) >= 4:
                    symbol_cell = cells[0]
                    symbol_link = await symbol_cell.locator("a").get_attribute("href")
                    symbol_text = await symbol_cell.inner_text()
                    name = await cells[1].inner_text()
                    city = await cells[2].inner_text()
                    province = await cells[3].inner_text()

                    # Extract numeric library ID from the detail link, if any.
                    library_id = None
                    if symbol_link:
                        match = re.search(r'Id=(\d+)', symbol_link)
                        if match:
                            library_id = match.group(1)

                    libraries.append({
                        "library_symbol": symbol_text.strip(),
                        "name": name.strip(),
                        "city": city.strip(),
                        "province": province.strip(),
                        "library_id": library_id,
                        "detail_url": urljoin(BASE_URL, symbol_link) if symbol_link else None,
                        "status": "closed" if closed_superseded else "active"
                    })

            logger.info("  Extracted %d records", len(libraries))
            return libraries

        except Exception as e:
            # Best-effort: a failed page yields no records but does not abort
            # the whole scrape.
            logger.error("Error fetching page %s: %s", page_num, e)
            return []

    async def fetch_library_details(self, library: Dict) -> Dict:
        """
        Fetch detailed information for a single library.

        Args:
            library: Basic library info with detail_url

        Returns:
            The same dict, updated in place with all detail fields found
            (returned unchanged on error or missing URL)
        """
        if not library.get("detail_url"):
            logger.warning("No detail URL for %s", library.get('library_symbol'))
            return library

        try:
            await self.page.goto(library["detail_url"], wait_until="networkidle", timeout=30000)

            # The detail page typically presents fields as <dt>/<dd> pairs;
            # parse whatever pairs are present rather than a fixed schema.
            details = {}
            dts = await self.page.locator("dt").all()
            dds = await self.page.locator("dd").all()

            for dt, dd in zip(dts, dds):
                key = await dt.inner_text()
                value = await dd.inner_text()
                details[key.strip().rstrip(":")] = value.strip()

            # Merge details into library record
            library.update(details)
            logger.debug("  Fetched details for %s", library['library_symbol'])
            return library

        except Exception as e:
            # Best-effort: keep the basic record even if the detail fetch fails.
            logger.error("Error fetching details for %s: %s", library.get('library_symbol'), e)
            return library

    async def scrape_all_libraries(
        self,
        closed_superseded: bool = False,
        max_pages: Optional[int] = None
    ) -> List[Dict]:
        """
        Scrape all libraries (active or closed/superseded).

        Args:
            closed_superseded: If True, scrape closed/superseded libraries
            max_pages: Optional limit on number of pages (for testing)

        Returns:
            List of all library records, each enriched with detail fields
        """
        all_libraries = []

        # Page count is derived from the published totals, since the site
        # does not expose a machine-readable count.
        if closed_superseded:
            total_records = EXPECTED_CLOSED_RECORDS
            category = "closed/superseded"
        else:
            total_records = EXPECTED_ACTIVE_RECORDS
            category = "active"

        total_pages = (total_records + PAGE_SIZE - 1) // PAGE_SIZE  # Ceiling division
        if max_pages:
            total_pages = min(total_pages, max_pages)

        logger.info(
            "Scraping %s libraries: %d pages (%d records)",
            category, total_pages, total_records
        )

        # Phase 1: collect basic records from the paginated list.
        for page_num in range(1, total_pages + 1):
            libraries = await self.fetch_list_page(page_num, closed_superseded)
            all_libraries.extend(libraries)
            # Be polite - add delay between requests
            await asyncio.sleep(1)

        logger.info("Fetched %d %s library records", len(all_libraries), category)

        # Phase 2: enrich each record with its detail page (slow: one request
        # per library plus a politeness delay).
        logger.info("Fetching detailed information for %d libraries...", len(all_libraries))
        for i, library in enumerate(all_libraries, 1):
            if i % 100 == 0:
                logger.info("  Progress: %d/%d", i, len(all_libraries))
            await self.fetch_library_details(library)
            await asyncio.sleep(0.5)  # Be polite

        return all_libraries

    def _save_json(self, filename: str, payload: Dict) -> Path:
        """Write payload as pretty-printed UTF-8 JSON under output_dir; return the path."""
        path = self.output_dir / filename
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        return path

    async def run(self, test_mode: bool = False):
        """
        Main execution method.

        Scrapes both categories and writes three JSON files (active, closed,
        combined) to the output directory.

        Args:
            test_mode: If True, only fetch first 2 pages of each category
        """
        await self.start_browser()

        try:
            max_pages = 2 if test_mode else None

            # Scrape and save active libraries
            active_libraries = await self.scrape_all_libraries(
                closed_superseded=False,
                max_pages=max_pages
            )
            active_file = self._save_json("canadian_libraries_active.json", {
                "extraction_date": datetime.now().isoformat(),
                "source": "Library and Archives Canada - Canadian Library Directory",
                "source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
                "status": "active",
                "record_count": len(active_libraries),
                "libraries": active_libraries
            })
            logger.info("Saved %d active libraries to %s", len(active_libraries), active_file)

            # Scrape and save closed/superseded libraries
            closed_libraries = await self.scrape_all_libraries(
                closed_superseded=True,
                max_pages=max_pages
            )
            closed_file = self._save_json("canadian_libraries_closed.json", {
                "extraction_date": datetime.now().isoformat(),
                "source": "Library and Archives Canada - Canadian Library Directory",
                "source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
                "status": "closed_superseded",
                "record_count": len(closed_libraries),
                "libraries": closed_libraries
            })
            logger.info(
                "Saved %d closed/superseded libraries to %s",
                len(closed_libraries), closed_file
            )

            # Create combined file
            combined_file = self._save_json("canadian_libraries_all.json", {
                "extraction_date": datetime.now().isoformat(),
                "source": "Library and Archives Canada - Canadian Library Directory",
                "source_url": "https://sigles-symbols.bac-lac.gc.ca/eng/Search",
                "total_records": len(active_libraries) + len(closed_libraries),
                "active_count": len(active_libraries),
                "closed_count": len(closed_libraries),
                "libraries": active_libraries + closed_libraries
            })
            logger.info("Saved combined dataset to %s", combined_file)

        finally:
            # Always release browser/driver resources, even on failure.
            await self.close_browser()


async def main():
    """Entry point."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Scrape Canadian ISIL database from Library and Archives Canada"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Test mode: only scrape first 2 pages of each category"
    )
    args = parser.parse_args()

    scraper = CanadianISILScraper()
    await scraper.run(test_mode=args.test)


if __name__ == "__main__":
    asyncio.run(main())