glam/scripts/download_austrian_isil_via_browser.py
2025-11-19 23:25:22 +01:00

120 lines
4.6 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Austrian ISIL Downloader using browser automation
Uses Playwright to click through and download Excel exports
"""
from playwright.sync_api import sync_playwright
import time
from pathlib import Path
import sys
def log(msg):
    """Print *msg* to stdout prefixed with the current local time as [HH:MM:SS].

    Stdout is flushed after every message so progress shows up immediately
    even when the output is piped or buffered.
    """
    from datetime import datetime

    stamp = f"{datetime.now():%H:%M:%S}"
    print(f"[{stamp}] {msg}", flush=True)
def _export_current_page(page, target):
    """Export the result page currently shown in *page* to the Excel file *target*.

    Args:
        page: Playwright page positioned on a search-result page.
        target: ``Path`` the downloaded Excel file is saved to.

    Returns:
        True when a file was saved, False when the export or Download
        control could not be found and nothing was downloaded.
    """
    # Select all items on current page
    log("Selecting all items on page...")
    select_all_checkbox = page.locator('input[aria-label*="Alle angezeigten"]').first
    if select_all_checkbox.count() > 0:
        select_all_checkbox.check()
        time.sleep(1)

    # Click export button
    log("Clicking export button...")
    export_button = page.get_by_role('button', name='Alle Ergebnisse exportieren')
    if export_button.count() == 0:
        log("Export button not found; skipping this page")
        return False
    export_button.click()
    time.sleep(2)

    # Click "Zu Excel exportieren"
    excel_tab = page.get_by_label('Zu Excel exportieren')
    if excel_tab.count() > 0:
        excel_tab.click()
        time.sleep(1)

    # Only wait for a download when there is a button to trigger it;
    # otherwise expect_download() would block until the page timeout
    # and abort the whole run with an opaque timeout error.
    saved = False
    download_button = page.get_by_role('button', name='Download')
    if download_button.count() > 0:
        with page.expect_download() as download_info:
            download_button.click()
        download_info.value.save_as(target)
        log(f"Downloaded: {target.name}")
        saved = True
    else:
        log("Download button not found; skipping this page")

    # Close export dialog
    close_button = page.get_by_role('button', name='Schließen')
    if close_button.count() > 0:
        close_button.click()
        time.sleep(1)

    return saved


def download_isil_data(output_dir="data/isil/austria_downloads", *, max_pages=100, headless=False):
    """Download the Austrian ISIL search results as one Excel file per result page.

    Opens the isil.at search for "AT-" in Chromium, switches to 50 results
    per page when that control is present, then walks the result pages,
    exporting each one to ``austria_isil_page_NNN.xlsx`` in *output_dir*.

    Args:
        output_dir: Directory the Excel files are written to; created if
            it does not exist.
        max_pages: Safety limit on the number of result pages visited.
        headless: Run the browser without a window. Defaults to False so
            the browser stays visible for debugging.

    Returns:
        None. Any exception raised while driving the browser is logged
        together with its traceback and not re-raised; the browser is
        closed in every case.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    log("Starting browser automation...")

    with sync_playwright() as p:
        # Launch browser
        browser = p.chromium.launch(headless=headless)
        context = browser.new_context(accept_downloads=True)
        page = context.new_page()

        # Set longer timeout
        page.set_default_timeout(60000)

        try:
            # Navigate to search for all AT- institutions
            search_url = "https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset=0"
            log(f"Navigating to: {search_url}")
            page.goto(search_url, wait_until="networkidle")
            time.sleep(3)

            # Wait for results
            page.wait_for_selector('text=/Ergebnisse/')

            # Get total count
            results_text = page.locator('text=/\\d+-\\d+ von \\d+ Ergebnisse/').first.text_content()
            log(f"Results: {results_text}")

            # Change to show 50 results per page
            log("Changing to 50 results per page...")
            button_50 = page.get_by_role('button', name='Anzeige: 50 Ergebnisse pro Seite')
            if button_50.count() > 0:
                button_50.click()
                time.sleep(2)

            # Now iterate through pages and download
            page_num = 0
            downloaded = 0  # files actually saved, reported in the summary
            while page_num < max_pages:
                log(f"\n=== Page {page_num + 1} ===")

                filename = f"austria_isil_page_{page_num + 1:03d}.xlsx"
                if _export_current_page(page, output_dir / filename):
                    downloaded += 1

                # Go to next page. `.first` keeps is_enabled() from failing
                # in strict mode if several pagination buttons match.
                next_button = page.locator('button[aria-label*="nächste Seite"]').first
                if next_button.count() > 0 and next_button.is_enabled():
                    log("Going to next page...")
                    next_button.click()
                    time.sleep(3)
                    page_num += 1
                else:
                    log("No more pages")
                    break
            else:
                # Loop ended without `break`: the page limit cut the run short.
                log(f"Stopped at safety limit of {max_pages} pages")

            log("\n=== Download Complete ===")
            log(f"Downloaded {downloaded} Excel files to {output_dir}")

        except Exception as e:
            # Top-level boundary of the script: report the failure and fall
            # through to the cleanup below instead of crashing.
            log(f"Error: {e}")
            import traceback
            traceback.print_exc()

        finally:
            browser.close()
# Script entry point: start the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_isil_data()