#!/usr/bin/env python3
"""
Fast batch scraper for the Austrian ISIL database.

Extracts all institutions from https://www.isil.at in one automated run.
"""

import argparse
import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

from playwright.sync_api import sync_playwright


def log(msg):
    """Print a timestamped message and flush immediately so progress is visible."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] {msg}")
    sys.stdout.flush()


def extract_institutions(page):
    """Extract institution data from the current page using JavaScript."""
    results = page.evaluate("""
        () => {
            const results = [];
            const items = document.querySelectorAll('prm-brief-result-container');

            items.forEach(item => {
                const headingEl = item.querySelector('h3.item-title');
                if (!headingEl) return;

                const fullText = headingEl.textContent.trim();

                // The heading ends with the ISIL code, separated from the
                // institution name by whitespace. The lazy (.+?) keeps the
                // name group as short as possible, so the code is matched
                // at the end of the string.
                // Examples:
                //   "Stadtarchiv Steyr AT-40201-AR"
                //   "Universität Wien | Bibliothek AT-UBW-097"
                const match = fullText.match(/^(.+?)\\s+(AT-[A-Za-z0-9\\-]+)$/);

                if (match) {
                    results.push({
                        name: match[1].trim(),
                        isil_code: match[2].trim()
                    });
                } else if (fullText.length > 0) {
                    // Keep institutions WITHOUT ISIL codes as well
                    results.push({
                        name: fullText,
                        isil_code: null
                    });
                }
            });

            return results;
        }
    """)
    return results


def scrape_all_pages(start_page=1, end_page=194):
    """
    Scrape all pages from the Austrian ISIL database.

    Args:
        start_page: Starting page number (1-based)
        end_page: Ending page number (inclusive)
    """
    output_dir = Path("data/isil/austria")
    output_dir.mkdir(parents=True, exist_ok=True)

    log(f"Starting batch scrape: pages {start_page} to {end_page}")
    log(f"Output directory: {output_dir}")

    stats = {
        "start_time": datetime.now(timezone.utc).isoformat(),
        "pages_scraped": 0,
        "institutions_extracted": 0,
        "failed_pages": [],
        "start_page": start_page,
        "end_page": end_page
    }

    with sync_playwright() as p:
        log("Launching browser...")
        browser = p.chromium.launch(headless=True)  # Headless for speed
        page = browser.new_page()
        page.set_default_timeout(30000)  # 30-second timeout

        try:
            for page_num in range(start_page, end_page + 1):
                offset = (page_num - 1) * 10
                url = f"https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset={offset}"

                log(f"\n=== Page {page_num}/{end_page} (offset={offset}) ===")

                try:
                    # Navigate to page, then wait for AngularJS to render
                    page.goto(url, wait_until="networkidle")
                    time.sleep(5)

                    # Extract institutions
                    institutions = extract_institutions(page)

                    if not institutions:
                        log(f"⚠️ WARNING: No institutions found on page {page_num}")
                        stats["failed_pages"].append({
                            "page": page_num,
                            "offset": offset,
                            "reason": "No institutions extracted"
                        })
                        continue

                    # Save to JSON
                    output_file = output_dir / f"page_{page_num:03d}_data.json"
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(institutions, f, ensure_ascii=False, indent=2)

                    log(f"✅ Extracted {len(institutions)} institutions")
                    log(f"   Saved to: {output_file.name}")

                    stats["pages_scraped"] += 1
                    stats["institutions_extracted"] += len(institutions)

                    # Rate limiting - be nice to the server
                    time.sleep(2)

                except Exception as e:
                    log(f"❌ ERROR on page {page_num}: {e}")
                    stats["failed_pages"].append({
                        "page": page_num,
                        "offset": offset,
                        "reason": str(e)
                    })
                    continue

                # Progress update every 10 pages
                if page_num % 10 == 0:
                    elapsed = (
                        datetime.now(timezone.utc)
                        - datetime.fromisoformat(stats["start_time"])
                    ).total_seconds()
                    rate = stats["pages_scraped"] / elapsed if elapsed > 0 else 0
                    remaining = (end_page - page_num) / rate if rate > 0 else 0

                    log(f"\n📊 Progress: {stats['pages_scraped']}/{end_page - start_page + 1} pages")
                    log(f"   Total institutions: {stats['institutions_extracted']}")
                    log(f"   Rate: {rate:.2f} pages/sec")
                    log(f"   Estimated time remaining: {remaining/60:.1f} minutes\n")

        finally:
            browser.close()

    # Final statistics
    stats["end_time"] = datetime.now(timezone.utc).isoformat()
    stats["duration_seconds"] = (
        datetime.fromisoformat(stats["end_time"])
        - datetime.fromisoformat(stats["start_time"])
    ).total_seconds()

    # Save statistics
    stats_file = output_dir / f"scraping_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)

    # Print summary
    log("\n" + "=" * 60)
    log("SCRAPING COMPLETE")
    log("=" * 60)
    log(f"Pages scraped: {stats['pages_scraped']}/{end_page - start_page + 1}")
    log(f"Institutions extracted: {stats['institutions_extracted']}")
    log(f"Failed pages: {len(stats['failed_pages'])}")
    log(f"Duration: {stats['duration_seconds']/60:.1f} minutes")
    log(f"Statistics saved to: {stats_file}")

    if stats["failed_pages"]:
        log("\n⚠️ Failed pages (need retry):")
        for fail in stats["failed_pages"]:
            log(f"  - Page {fail['page']}: {fail['reason']}")

    return stats


def main():
    parser = argparse.ArgumentParser(description="Scrape Austrian ISIL database")
    parser.add_argument("--start", type=int, default=1, help="Start page (default: 1)")
    parser.add_argument("--end", type=int, default=194, help="End page (default: 194)")
    parser.add_argument("--test", action="store_true", help="Test mode: scrape only 5 pages")
    args = parser.parse_args()

    if args.test:
        log("🧪 TEST MODE: Scraping first 5 pages only")
        scrape_all_pages(start_page=1, end_page=5)
    else:
        scrape_all_pages(start_page=args.start, end_page=args.end)


if __name__ == "__main__":
    main()
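

# The summary above flags failed pages as needing a retry. Below is a minimal
# sketch of what that retry could look like, assuming the stats JSON written
# by scrape_all_pages(); it simply re-runs each failed page as its own
# single-page scrape. `retry_failed_pages` is an illustrative name, not part
# of the original workflow.
def retry_failed_pages(stats_path):
    """Re-scrape every page listed under 'failed_pages' in a stats file."""
    with open(stats_path, encoding='utf-8') as f:
        stats = json.load(f)
    for fail in stats.get("failed_pages", []):
        page_num = fail["page"]
        log(f"Retrying page {page_num} (previous failure: {fail['reason']})")
        scrape_all_pages(start_page=page_num, end_page=page_num)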