#!/usr/bin/env python3
"""
Austrian ISIL Downloader using browser automation.

Uses Playwright to open the isil.at search for "AT-", step through the
result pages and save the Excel export offered for each page.
"""

import sys
import time
import traceback
from datetime import datetime
from pathlib import Path

from playwright.sync_api import sync_playwright

# Search for every record containing "AT-", starting at the first result.
SEARCH_URL = (
    "https://www.isil.at/primo-explore/search?query=any,contains,AT-"
    "&tab=default_tab&search_scope=default_scope&vid=AIS&offset=0"
)
DEFAULT_OUTPUT_DIR = "data/isil/austria_downloads"


def log(msg):
    """Print *msg* prefixed with the current local time (HH:MM:SS).

    stdout is flushed after every message so progress shows up immediately
    even when the output is piped or redirected.
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")
    sys.stdout.flush()


def _click_if_present(locator, pause):
    """Click *locator* if it matches right now, then sleep *pause* seconds.

    The check is a snapshot (``Locator.count()`` does not wait), which makes
    this a best-effort step: a missing element is skipped, not an error.

    Returns:
        True if the element was found and clicked, False otherwise.
    """
    if locator.count() == 0:
        return False
    locator.click()
    time.sleep(pause)
    return True


def _export_current_page(page, target):
    """Export the currently displayed result page to the file *target*.

    Args:
        page: The Playwright page showing the search results.
        target: Path the downloaded Excel file is saved to.

    Returns:
        True if a file was downloaded and saved, False if the export or
        download button could not be found and the page was skipped.
    """
    log("Selecting all items on page...")
    select_all = page.locator('input[aria-label*="Alle angezeigten"]').first
    if select_all.count() > 0:
        select_all.check()
        time.sleep(1)

    log("Clicking export button...")
    export_button = page.get_by_role('button', name='Alle Ergebnisse exportieren')
    if not _click_if_present(export_button, 2):
        log("Export button not found; skipping this page")
        return False

    # Switch to the Excel export option when it is offered.
    _click_if_present(page.get_by_label('Zu Excel exportieren'), 1)

    # Only wait for a download that is actually going to be triggered:
    # entering expect_download() without clicking anything would block for
    # the whole default timeout and then fail with an unhelpful error.
    download_button = page.get_by_role('button', name='Download')
    saved = False
    if download_button.count() > 0:
        with page.expect_download() as download_info:
            download_button.click()
        download_info.value.save_as(target)
        log(f"Downloaded: {target.name}")
        saved = True
    else:
        log("Download button not found; skipping this page")

    # Close the export dialog so the pagination controls can be used again.
    _click_if_present(page.get_by_role('button', name='Schließen'), 1)
    return saved


def download_isil_data(output_dir=DEFAULT_OUTPUT_DIR, *, max_pages=100, headless=False):
    """Download the Excel export of every result page of the ISIL search.

    Files are written as ``austria_isil_page_NNN.xlsx`` where ``NNN`` is the
    1-based result page number.

    Args:
        output_dir: Directory the files are saved to; created if missing.
        max_pages: Safety limit on the number of result pages visited.
        headless: Run the browser without a window. Defaults to False so
            the automation can be watched while debugging.

    Returns:
        The number of Excel files that were actually saved.

    Any exception raised during the automation is logged together with its
    traceback and not re-raised; the browser is closed in every case.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    downloaded = 0

    log("Starting browser automation...")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        context = browser.new_context(accept_downloads=True)
        page = context.new_page()

        # Use a longer timeout (60 s) for every Playwright wait on this page.
        page.set_default_timeout(60000)

        try:
            log(f"Navigating to: {SEARCH_URL}")
            page.goto(SEARCH_URL, wait_until="networkidle")
            time.sleep(3)

            # Wait for results
            page.wait_for_selector('text=/Ergebnisse/')

            # Log the "<from>-<to> von <total> Ergebnisse" summary line.
            results_text = page.locator(
                'text=/\\d+-\\d+ von \\d+ Ergebnisse/'
            ).first.text_content()
            log(f"Results: {results_text}")

            log("Changing to 50 results per page...")
            _click_if_present(
                page.get_by_role('button', name='Anzeige: 50 Ergebnisse pro Seite'),
                2,
            )

            for page_num in range(max_pages):
                log(f"\n=== Page {page_num + 1} ===")

                filename = f"austria_isil_page_{page_num + 1:03d}.xlsx"
                if _export_current_page(page, output_dir / filename):
                    downloaded += 1

                next_button = page.locator('button[aria-label*="nächste Seite"]')
                if next_button.count() > 0 and next_button.is_enabled():
                    log("Going to next page...")
                    next_button.click()
                    time.sleep(3)
                else:
                    log("No more pages")
                    break
            else:
                # The loop ran out without reaching the last result page.
                log(f"Reached the safety limit of {max_pages} pages; stopping")

            log("\n=== Download Complete ===")
            # Report files actually saved, not the number of pages visited.
            log(f"Downloaded {downloaded} Excel files to {output_dir}")

        except Exception as e:
            # Top-level boundary of the script: report the failure and fall
            # through to the cleanup below.
            log(f"Error: {e}")
            traceback.print_exc()

        finally:
            browser.close()

    return downloaded


if __name__ == "__main__":
    download_isil_data()