120 lines
4.6 KiB
Python
Executable file
120 lines
4.6 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Austrian ISIL Downloader using browser automation
|
|
Uses Playwright to click through and download Excel exports
|
|
"""
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
import time
|
|
from pathlib import Path
|
|
import sys
|
|
|
|
def log(msg):
    """Print *msg* to stdout, prefixed with the current local time.

    The emitted line has the form ``[HH:MM:SS] msg``. Stdout is flushed
    after every message.
    """
    from datetime import datetime

    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}")
    sys.stdout.flush()
|
|
|
|
def _export_current_page(page, target_path):
    """Export the result page currently shown in *page* to an Excel file.

    Selects all items on the page, opens the export dialog, chooses the
    Excel option, triggers the download, saves it to *target_path* and
    closes the dialog again.

    Args:
        page: Playwright sync ``Page`` that is showing a result list.
        target_path: ``pathlib.Path`` the downloaded workbook is saved to.

    Returns:
        True if a file was saved; False if the export or download button
        was not found and nothing was downloaded.
    """
    log("Selecting all items on page...")
    select_all_checkbox = page.locator('input[aria-label*="Alle angezeigten"]').first
    if select_all_checkbox.count() > 0:
        select_all_checkbox.check()
        page.wait_for_timeout(1000)

    log("Clicking export button...")
    export_button = page.get_by_role('button', name='Alle Ergebnisse exportieren')
    if export_button.count() == 0:
        log("Export button not found; skipping this page")
        return False
    export_button.click()
    page.wait_for_timeout(2000)

    # Switch the export dialog to its Excel option when it is offered.
    excel_tab = page.get_by_label('Zu Excel exportieren')
    if excel_tab.count() > 0:
        excel_tab.click()
        page.wait_for_timeout(1000)

    # Only start waiting for a download once we know we can trigger one;
    # otherwise expect_download() would just sit until it times out.
    saved = False
    download_button = page.get_by_role('button', name='Download')
    if download_button.count() > 0:
        with page.expect_download() as download_info:
            download_button.click()
        download_info.value.save_as(target_path)
        log(f"Downloaded: {target_path.name}")
        saved = True
    else:
        log("Download button not found; nothing saved for this page")

    # Close the export dialog so the pagination controls are reachable again.
    close_button = page.get_by_role('button', name='Schließen')
    if close_button.count() > 0:
        close_button.click()
        page.wait_for_timeout(1000)

    return saved


def download_isil_data(*, output_dir="data/isil/austria_downloads", max_pages=100, headless=False):
    """Download the Austrian ISIL search results as one Excel file per page.

    Opens the isil.at search for ``AT-`` in Chromium, switches to 50 results
    per page when that button is present, then exports every result page to
    ``austria_isil_page_NNN.xlsx`` inside *output_dir* until the "next page"
    button is missing or disabled, or *max_pages* pages have been processed.

    Any exception raised while driving the page is logged together with its
    traceback and is not re-raised; the browser is closed in every case.

    Args:
        output_dir: Directory the workbooks are written to; created if
            missing.
        max_pages: Safety limit on the number of result pages to process.
        headless: Run Chromium without a visible window. Defaults to False
            (visible) to make debugging easier.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    log("Starting browser automation...")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        context = browser.new_context(accept_downloads=True)
        page = context.new_page()

        # The site is slow; allow up to 60 s for each Playwright operation.
        page.set_default_timeout(60000)

        try:
            # Search for every institution whose identifier contains "AT-".
            search_url = "https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset=0"
            log(f"Navigating to: {search_url}")
            page.goto(search_url, wait_until="networkidle")
            # Fixed pauses use page.wait_for_timeout() rather than
            # time.sleep(): the Playwright docs warn that time.sleep() can
            # leave the sync API with outdated page state.
            page.wait_for_timeout(3000)

            # Wait until the result list has rendered.
            page.wait_for_selector('text=/Ergebnisse/')

            # Log the "<from>-<to> von <total> Ergebnisse" summary line.
            results_text = page.locator('text=/\\d+-\\d+ von \\d+ Ergebnisse/').first.text_content()
            log(f"Results: {results_text}")

            log("Changing to 50 results per page...")
            button_50 = page.get_by_role('button', name='Anzeige: 50 Ergebnisse pro Seite')
            if button_50.count() > 0:
                button_50.click()
                page.wait_for_timeout(2000)

            # Count files actually saved instead of deriving the total from
            # the page index, which is off by one when the safety limit is
            # reached and wrong whenever a page's export was skipped.
            downloaded = 0
            for page_num in range(1, max_pages + 1):
                log(f"\n=== Page {page_num} ===")

                filename = f"austria_isil_page_{page_num:03d}.xlsx"
                if _export_current_page(page, output_dir / filename):
                    downloaded += 1

                next_button = page.locator('button[aria-label*="nächste Seite"]')
                if next_button.count() == 0 or not next_button.is_enabled():
                    log("No more pages")
                    break

                log("Going to next page...")
                next_button.click()
                page.wait_for_timeout(3000)
            else:
                log(f"Stopped at safety limit of {max_pages} pages")

            log("\n=== Download Complete ===")
            log(f"Downloaded {downloaded} Excel files to {output_dir}")

        except Exception as e:
            # Top-level boundary of the script: report the failure and fall
            # through to the cleanup below instead of propagating.
            log(f"Error: {e}")
            import traceback
            traceback.print_exc()

        finally:
            browser.close()
|
|
|
|
if __name__ == "__main__":
    # Run the download only when executed as a script, not when imported.
    download_isil_data()
|