#!/usr/bin/env python3
"""
Complete Austrian ISIL Database Scraper

Uses Playwright MCP tools to scrape all 194 pages of the Austrian ISIL database.

This script coordinates with OpenCODE's Playwright MCP server to:
1. Navigate to each results page (10 results per page, 194 pages total)
2. Wait for JavaScript rendering
3. Extract institution names and ISIL codes
4. Save progress after each page

Total institutions: 1,934
Pages to scrape: 194 (10 per page, offset by 10)
Estimated time: ~10 minutes with 3-second rate limiting
"""

import json
import time
from datetime import datetime, timezone
from pathlib import Path

# Default location for scraped page files and the manifest; override via
# main(output_dir=...) when running outside the original author's machine.
DEFAULT_OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/austria")

# Single source of truth for the search URL (previously duplicated in two places).
SEARCH_URL_TEMPLATE = (
    "https://www.isil.at/primo-explore/search"
    "?query=any,contains,AT-&tab=default_tab&search_scope=default_scope"
    "&vid=AIS&offset={offset}"
)


def save_page_data(page_num: int, offset: int, institutions: list, output_dir: Path) -> Path:
    """Save extracted data for a single page.

    Args:
        page_num: 1-based page number being saved.
        offset: Primo result offset used to fetch this page.
        institutions: List of {"name": ..., "isil": ...} dicts.
        output_dir: Directory that receives page_NNN_data.json.

    Returns:
        Path of the JSON file written.
    """
    data = {
        "page": page_num,
        "offset": offset,
        "url": SEARCH_URL_TEMPLATE.format(offset=offset),
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "count": len(institutions),
        "institutions": institutions,
    }

    output_file = output_dir / f"page_{page_num:03d}_data.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps German umlauts readable in the output files.
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"✅ Page {page_num} saved: {len(institutions)} institutions")
    return output_file


def main(output_dir: Path = DEFAULT_OUTPUT_DIR):
    """
    Main scraper orchestration.

    Note: This script is designed to work WITH OpenCODE's Playwright MCP tools.
    The actual browser automation happens through OpenCODE's MCP server.
    This script provides the logic and coordination.

    Args:
        output_dir: Directory for page data files and the scraping manifest.
            Defaults to the original hard-coded project path.
    """
    # Setup
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Configuration
    TOTAL_INSTITUTIONS = 1934
    RESULTS_PER_PAGE = 10  # Using 10 per page for more reliable extraction
    # Ceiling division -> 194 pages
    TOTAL_PAGES = (TOTAL_INSTITUTIONS + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE
    START_PAGE = 3  # Resume from page 3 (we already have 1, 2, and 11)
    RATE_LIMIT_SECONDS = 3

    print(f"🚀 Austrian ISIL Scraper")
    print(f"📊 Total pages to scrape: {TOTAL_PAGES}")
    print(f"📄 Results per page: {RESULTS_PER_PAGE}")
    print(f"⏱️ Estimated time: ~{TOTAL_PAGES * RATE_LIMIT_SECONDS / 60:.1f} minutes")
    print(f"📁 Output directory: {output_dir}")
    print(f"▶️ Starting from page {START_PAGE}")
    print()

    # Check which pages we already have (files named page_NNN_data.json)
    existing_pages = set()
    for existing_file in output_dir.glob("page_*.json"):
        try:
            page_num = int(existing_file.stem.split('_')[1])
            existing_pages.add(page_num)
        except (IndexError, ValueError):
            # Ignore files that don't match the expected naming pattern.
            pass

    if existing_pages:
        print(f"✅ Found {len(existing_pages)} existing pages: {sorted(existing_pages)}")
        print()

    # Instructions for OpenCODE agent
    print("=" * 70)
    print("INSTRUCTIONS FOR OPENCODE AGENT:")
    print("=" * 70)
    print()
    print("For each page from {} to {}:".format(START_PAGE, TOTAL_PAGES))
    print()
    print("1. Calculate offset: offset = (page - 1) * 10")
    print("2. Navigate to URL:")
    print("   https://www.isil.at/primo-explore/search?query=any,contains,AT-&offset={offset}")
    print()
    print("3. Wait 5 seconds for JavaScript to render")
    print()
    print("4. Extract institutions with JavaScript:")
    print("""
   const results = [];
   const headings = document.querySelectorAll('h3.item-title');
   headings.forEach((heading) => {
     const fullText = heading.textContent.trim();
     const match = fullText.match(/^(.*?)\\s+(AT-[A-Za-z0-9-]+)\\s*$/);
     if (match) {
       results.push({
         name: match[1].trim(),
         isil: match[2].trim()
       });
     }
   });
   return { count: results.length, institutions: results };
   """)
    print()
    # BUG FIX: the original formatted START_PAGE into this line, telling the
    # agent to save EVERY page as page_003_data.json. Show the pattern instead.
    print("5. Save to: data/isil/austria/page_{page:03d}_data.json")
    print()
    print("6. Sleep 3 seconds (rate limiting)")
    print()
    print("7. Repeat for next page")
    print()
    print("=" * 70)
    print()

    # Generate manifest of pages to scrape (skip pages already on disk)
    pages_to_scrape = []
    for page in range(START_PAGE, TOTAL_PAGES + 1):
        if page not in existing_pages:
            offset = (page - 1) * RESULTS_PER_PAGE
            pages_to_scrape.append({
                "page": page,
                "offset": offset,
                "url": SEARCH_URL_TEMPLATE.format(offset=offset),
            })

    print(f"📋 Pages remaining to scrape: {len(pages_to_scrape)}")
    if len(pages_to_scrape) <= 10:
        print(f"   Next pages: {[p['page'] for p in pages_to_scrape]}")
    else:
        print(f"   Next 10 pages: {[p['page'] for p in pages_to_scrape[:10]]}")
    print()

    # Save manifest
    manifest_file = output_dir / "scraping_manifest.json"
    with open(manifest_file, 'w', encoding='utf-8') as f:
        json.dump({
            "total_institutions": TOTAL_INSTITUTIONS,
            "results_per_page": RESULTS_PER_PAGE,
            "total_pages": TOTAL_PAGES,
            "pages_completed": sorted(existing_pages),
            "pages_remaining": [p['page'] for p in pages_to_scrape],
            "next_pages_to_scrape": pages_to_scrape[:20]  # First 20 for reference
        }, f, indent=2)

    print(f"✅ Scraping manifest saved to: {manifest_file}")
    print()
    print("🤖 Ready for OpenCODE agent to continue scraping!")


if __name__ == "__main__":
    main()