# glam/scripts/scrape_austrian_isil_systematic.py
# Snapshot metadata from source viewer: 2025-11-19 23:25:22 +01:00 · 130 lines · 4 KiB · Python
#!/usr/bin/env python3
"""
Systematic scraper for Austrian ISIL database using Playwright MCP.
Scrapes pages sequentially to avoid navigation conflicts.
"""
import json
import time
from pathlib import Path
# Configuration
# Primo search URL for AT- ISIL codes; the numeric result offset is appended at request time.
BASE_URL = "https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset="
# NOTE(review): absolute, user-specific path — consider making this configurable.
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/austria")
RESULTS_PER_PAGE = 10
WAIT_BETWEEN_PAGES = 3 # seconds
# Extraction JavaScript function: run in the browser page; parses each
# result heading of the form "<name> <AT-code>" into {name, isil} pairs.
EXTRACT_JS = """() => {
const results = [];
const headings = document.querySelectorAll('h3.item-title');
headings.forEach((heading) => {
const fullText = heading.textContent.trim();
const match = fullText.match(/^(.*?)\\s+(AT-[A-Za-z0-9-]+)\\s*$/);
if (match) {
results.push({name: match[1].trim(), isil: match[2].trim()});
}
});
return {count: results.length, institutions: results};
}"""
def get_existing_pages():
    """Return the sorted page numbers that already have saved JSON files.

    Scans DATA_DIR for files matching ``page_*_data.json`` and parses the
    page number out of each filename.
    """
    # Stems look like "page_003_data"; the middle underscore token is the number.
    return sorted(
        int(path.stem.split('_')[1])
        for path in DATA_DIR.glob("page_*_data.json")
    )
def get_offset_for_page(page_num):
    """Translate a 1-based page number into the site's result offset.

    Page 1 starts at offset 0, page 2 at RESULTS_PER_PAGE, and so on.
    """
    zero_based = page_num - 1
    return zero_based * RESULTS_PER_PAGE
def save_page_data(page_num, offset, institutions):
    """Write one page's extracted institutions to a JSON file in DATA_DIR.

    The payload records the page number, its offset, the institution count,
    a UTC extraction timestamp, the source URL, and the institutions list.
    Prints a confirmation line naming the file written.
    """
    payload = {
        "page": page_num,
        "offset": offset,
        "count": len(institutions),
        # Timestamp in UTC, ISO-8601 with trailing Z.
        "extraction_date": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "source_url": f"{BASE_URL}{offset}",
        "institutions": institutions,
    }
    # Zero-padded filename (page_003_data.json) keeps listings sorted.
    target = DATA_DIR / f"page_{page_num:03d}_data.json"
    with open(target, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)
    print(f"✅ Saved page {page_num} ({len(institutions)} institutions) to {target.name}")
def scrape_pages(start_page, end_page):
    """Build the sequential scraping workflow for pages in [start_page, end_page].

    Pages that already have a saved data file are skipped. Returns a list of
    instruction dicts (one per outstanding page), each describing the manual
    Playwright MCP workflow: navigate, wait, run extraction JS, save JSON,
    close browser, sleep. Returns an empty list when nothing is outstanding.
    """
    existing_pages = get_existing_pages()
    print(f"Already scraped pages: {existing_pages}")
    print(f"Target range: {start_page}-{end_page}")

    # Only pages without a saved data file remain to be scraped.
    pages_to_scrape = [
        p for p in range(start_page, end_page + 1) if p not in existing_pages
    ]
    if not pages_to_scrape:
        print("✅ All pages in range already scraped!")
        return []

    print(f"\n📋 Pages to scrape: {pages_to_scrape}")
    print(f"Total: {len(pages_to_scrape)} pages")

    def _instruction_for(page_num):
        # One instruction dict = everything needed to process a single page.
        offset = get_offset_for_page(page_num)
        url = f"{BASE_URL}{offset}"
        return {
            "page": page_num,
            "offset": offset,
            "url": url,
            "workflow": [
                f"1. Navigate to: {url}",
                "2. Wait 5 seconds for page load",
                "3. Run extraction JavaScript",
                f"4. Save to page_{page_num:03d}_data.json",
                "5. Close browser",
                "6. Sleep 3 seconds",
            ],
        }

    return [_instruction_for(p) for p in pages_to_scrape]
if __name__ == "__main__":
    # Scrape pages 3-20 (next batch after current progress)
    instructions = scrape_pages(start_page=3, end_page=20)
    if instructions:
        banner = '=' * 60
        print(f"\n{banner}")
        print("SCRAPING WORKFLOW")
        print(f"{banner}\n")
        # Echo the per-page workflow so it can be executed step by step.
        for entry in instructions:
            print(f"\n📄 PAGE {entry['page']} (offset={entry['offset']}):")
            for step in entry['workflow']:
                print(f" {step}")
        print(f"\n{banner}")
        print(f"Next: Use Playwright MCP to execute workflow for {len(instructions)} pages")
        print(f"{banner}")