# glam/scripts/scrape_austrian_isil_complete.py
# 2025-11-19 23:25:22 +01:00
#
# 161 lines
# 5.6 KiB
# Python
#
#!/usr/bin/env python3
"""
Complete Austrian ISIL Database Scraper
Uses Playwright MCP tools to scrape all 194 pages of the Austrian ISIL database.
This script coordinates with OpenCODE's Playwright MCP server to:
1. Navigate to each results page (10 results per page, 194 pages total)
2. Wait for JavaScript rendering
3. Extract institution names and ISIL codes
4. Save progress after each page
Total institutions: 1,934
Pages to scrape: 194 (10 per page, offset by 10)
Estimated time: ~10 minutes with 3-second rate limiting
"""
import json
import time
from datetime import datetime, timezone
from pathlib import Path
def save_page_data(page_num: int, offset: int, institutions: list, output_dir: Path):
    """Persist the institutions scraped from one results page.

    Writes ``page_<NNN>_data.json`` (zero-padded page number) into
    *output_dir*, recording the page number, result offset, source URL,
    UTC extraction timestamp, and the institution records themselves.

    Returns the Path of the file that was written.
    """
    record = {
        "page": page_num,
        "offset": offset,
        "url": f"https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset={offset}",
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "count": len(institutions),
        "institutions": institutions,
    }
    target = output_dir / f"page_{page_num:03d}_data.json"
    with open(target, "w", encoding="utf-8") as fh:
        json.dump(record, fh, indent=2, ensure_ascii=False)
    print(f"✅ Page {page_num} saved: {len(institutions)} institutions")
    return target
def main(output_dir=None):
    """
    Main scraper orchestration.

    Note: This script is designed to work WITH OpenCODE's Playwright MCP tools.
    The actual browser automation happens through OpenCODE's MCP server.
    This script provides the logic and coordination.

    Args:
        output_dir: Directory that receives the per-page JSON files and the
            scraping manifest. Defaults to the original hard-coded location,
            so existing invocations behave exactly as before; pass a path to
            make the script portable/testable on other machines.
    """
    # Setup — the machine-specific absolute path is kept only as the default.
    if output_dir is None:
        output_dir = Path("/Users/kempersc/apps/glam/data/isil/austria")
    else:
        output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Configuration
    TOTAL_INSTITUTIONS = 1934
    RESULTS_PER_PAGE = 10  # Using 10 per page for more reliable extraction
    # Ceiling division -> 194 pages
    TOTAL_PAGES = (TOTAL_INSTITUTIONS + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE
    START_PAGE = 3  # Resume from page 3 (we already have 1, 2, and 11)
    RATE_LIMIT_SECONDS = 3

    print(f"🚀 Austrian ISIL Scraper")
    print(f"📊 Total pages to scrape: {TOTAL_PAGES}")
    print(f"📄 Results per page: {RESULTS_PER_PAGE}")
    print(f"⏱️ Estimated time: ~{TOTAL_PAGES * RATE_LIMIT_SECONDS / 60:.1f} minutes")
    print(f"📁 Output directory: {output_dir}")
    print(f"▶️ Starting from page {START_PAGE}")
    print()

    # Resume support: collect page numbers from filenames like
    # page_003_data.json already present in the output directory.
    existing_pages = set()
    for existing_file in output_dir.glob("page_*.json"):
        try:
            page_num = int(existing_file.stem.split('_')[1])
            existing_pages.add(page_num)
        except (IndexError, ValueError):
            pass  # ignore files that don't follow the naming scheme
    if existing_pages:
        print(f"✅ Found {len(existing_pages)} existing pages: {sorted(existing_pages)}")
        print()

    # Instructions for OpenCODE agent (printed verbatim for the operator)
    print("=" * 70)
    print("INSTRUCTIONS FOR OPENCODE AGENT:")
    print("=" * 70)
    print()
    print("For each page from {} to {}:".format(START_PAGE, TOTAL_PAGES))
    print()
    print("1. Calculate offset: offset = (page - 1) * 10")
    print("2. Navigate to URL:")
    print("   https://www.isil.at/primo-explore/search?query=any,contains,AT-&offset={offset}")
    print()
    print("3. Wait 5 seconds for JavaScript to render")
    print()
    print("4. Extract institutions with JavaScript:")
    print("""
const results = [];
const headings = document.querySelectorAll('h3.item-title');
headings.forEach((heading) => {
const fullText = heading.textContent.trim();
const match = fullText.match(/^(.*?)\\s+(AT-[A-Za-z0-9-]+)\\s*$/);
if (match) {
results.push({
name: match[1].trim(),
isil: match[2].trim()
});
}
});
return { count: results.length, institutions: results };
""")
    print()
    print("5. Save to: data/isil/austria/page_{:03d}_data.json".format(START_PAGE))
    print()
    print("6. Sleep 3 seconds (rate limiting)")
    print()
    print("7. Repeat for next page")
    print()
    print("=" * 70)
    print()

    # Generate manifest of pages still to scrape (skip completed ones)
    pages_to_scrape = []
    for page in range(START_PAGE, TOTAL_PAGES + 1):
        if page not in existing_pages:
            offset = (page - 1) * RESULTS_PER_PAGE
            pages_to_scrape.append({
                "page": page,
                "offset": offset,
                "url": f"https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset={offset}"
            })
    print(f"📋 Pages remaining to scrape: {len(pages_to_scrape)}")
    if len(pages_to_scrape) <= 10:
        print(f"   Next pages: {[p['page'] for p in pages_to_scrape]}")
    else:
        print(f"   Next 10 pages: {[p['page'] for p in pages_to_scrape[:10]]}")
    print()

    # Save manifest so the agent (and a later resume) knows where things stand
    manifest_file = output_dir / "scraping_manifest.json"
    with open(manifest_file, 'w', encoding='utf-8') as f:
        json.dump({
            "total_institutions": TOTAL_INSTITUTIONS,
            "results_per_page": RESULTS_PER_PAGE,
            "total_pages": TOTAL_PAGES,
            "pages_completed": sorted(existing_pages),
            "pages_remaining": [p['page'] for p in pages_to_scrape],
            "next_pages_to_scrape": pages_to_scrape[:20]  # First 20 for reference
        }, f, indent=2)
    print(f"✅ Scraping manifest saved to: {manifest_file}")
    print()
    print("🤖 Ready for OpenCODE agent to continue scraping!")
# Entry point: run the orchestration only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    main()