#!/usr/bin/env python3
"""
Systematic scraper for Austrian ISIL database using Playwright MCP.
Scrapes pages sequentially to avoid navigation conflicts.
"""
import json
import time
from pathlib import Path

# Configuration
BASE_URL = "https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset="
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/austria")
RESULTS_PER_PAGE = 10
WAIT_BETWEEN_PAGES = 3  # seconds

# Extraction JavaScript function, run in the browser page context by
# Playwright MCP. Pulls "<name> <ISIL>" pairs out of result headings.
EXTRACT_JS = """() => {
    const results = [];
    const headings = document.querySelectorAll('h3.item-title');
    headings.forEach((heading) => {
        const fullText = heading.textContent.trim();
        const match = fullText.match(/^(.*?)\\s+(AT-[A-Za-z0-9-]+)\\s*$/);
        if (match) {
            results.push({name: match[1].trim(), isil: match[2].trim()});
        }
    });
    return {count: results.length, institutions: results};
}"""


def get_existing_pages():
    """Return the sorted list of page numbers already scraped to disk.

    A page counts as scraped when a ``page_<NNN>_data.json`` file exists in
    DATA_DIR. Files matching the glob but whose middle token is not an
    integer are skipped instead of crashing the whole scan.
    """
    existing = []
    for file in DATA_DIR.glob("page_*_data.json"):
        try:
            page_num = int(file.stem.split('_')[1])
        except (IndexError, ValueError):
            # Ignore stray files that merely resemble the naming scheme.
            continue
        existing.append(page_num)
    return sorted(existing)


def get_offset_for_page(page_num):
    """Calculate the result offset for a 1-based page number."""
    return (page_num - 1) * RESULTS_PER_PAGE


def save_page_data(page_num, offset, institutions):
    """Save extracted data to a JSON file under DATA_DIR.

    Args:
        page_num: 1-based page number (used for the zero-padded filename).
        offset: result offset that was appended to BASE_URL.
        institutions: list of ``{"name": ..., "isil": ...}`` dicts.
    """
    data = {
        "page": page_num,
        "offset": offset,
        "count": len(institutions),
        "extraction_date": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "source_url": f"{BASE_URL}{offset}",
        "institutions": institutions,
    }
    # Create the target directory on first use so a fresh checkout works.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    output_file = DATA_DIR / f"page_{page_num:03d}_data.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"āœ… Saved page {page_num} ({len(institutions)} institutions) to {output_file.name}")


def scrape_pages(start_page, end_page):
    """
    Generate instructions for scraping pages sequentially.

    This function generates the workflow that should be executed using
    Playwright MCP browser tools:
    1. Navigate to URL
    2. Wait 5 seconds
    3. Run extraction JavaScript
    4. Save to JSON
    5. Close browser
    6. Sleep 3 seconds

    Args:
        start_page: first page number (inclusive, 1-based).
        end_page: last page number (inclusive).

    Returns:
        A list of instruction dicts (one per page still to scrape), each
        with ``page``, ``offset``, ``url`` and a human-readable ``workflow``
        step list. Pages already on disk are skipped; returns [] when the
        whole range is done.
    """
    existing_pages = get_existing_pages()
    print(f"Already scraped pages: {existing_pages}")
    print(f"Target range: {start_page}-{end_page}")

    pages_to_scrape = [p for p in range(start_page, end_page + 1)
                       if p not in existing_pages]

    if not pages_to_scrape:
        print("āœ… All pages in range already scraped!")
        return []

    print(f"\nšŸ“‹ Pages to scrape: {pages_to_scrape}")
    print(f"Total: {len(pages_to_scrape)} pages")

    instructions = []
    for page_num in pages_to_scrape:
        offset = get_offset_for_page(page_num)
        url = f"{BASE_URL}{offset}"
        instruction = {
            "page": page_num,
            "offset": offset,
            "url": url,
            "workflow": [
                f"1. Navigate to: {url}",
                f"2. Wait 5 seconds for page load",
                f"3. Run extraction JavaScript",
                f"4. Save to page_{page_num:03d}_data.json",
                f"5. Close browser",
                f"6. Sleep 3 seconds",
            ],
        }
        instructions.append(instruction)

    return instructions


if __name__ == "__main__":
    # Scrape pages 3-20 (next batch after current progress)
    instructions = scrape_pages(start_page=3, end_page=20)

    if instructions:
        print(f"\n{'='*60}")
        print("SCRAPING WORKFLOW")
        print(f"{'='*60}\n")
        for inst in instructions:
            print(f"\nšŸ“„ PAGE {inst['page']} (offset={inst['offset']}):")
            for step in inst['workflow']:
                print(f"   {step}")
        print(f"\n{'='*60}")
        print(f"Next: Use Playwright MCP to execute workflow for {len(instructions)} pages")
        print(f"{'='*60}")