34 lines
1.2 KiB
Python
34 lines
1.2 KiB
Python
#!/usr/bin/env python3
"""
Quick script to scrape pages 14-20 of the Austrian ISIL database using Playwright.

This continues from where page 13 left off.
"""
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Note: This script expects to be run via OpenCode's Playwright browser tools.
# It generates the URLs and offsets for pages 14-20.
|
|
|
|
# Base directory under which the per-page JSON output file paths are built.
# NOTE(review): this is an absolute, user-specific path — adjust it when running
# on another machine.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/austria")
|
|
|
|
def generate_page_info(start_page=14, end_page=20, results_per_page=10, output_dir=None):
    """Generate page numbers, offsets, URLs and output paths for result pages.

    Called with no arguments this describes pages 14-20, using a stride of 10
    results per page and the module-level ``OUTPUT_DIR``.

    Args:
        start_page: First page number to include (1-based).
        end_page: Last page number to include (inclusive).
        results_per_page: Offset step between two consecutive pages.
        output_dir: Directory under which the output file paths are built.
            ``None`` (the default) selects the module-level ``OUTPUT_DIR``.

    Returns:
        A list with one dict per page, in ascending page order, holding the
        keys ``"page"``, ``"offset"``, ``"url"`` and ``"output_file"``.
        The list is empty when ``end_page`` is smaller than ``start_page``.

    Raises:
        ValueError: If ``start_page`` or ``results_per_page`` is less than 1,
            since either would produce negative or non-advancing offsets.
    """
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")
    if results_per_page < 1:
        raise ValueError(f"results_per_page must be >= 1, got {results_per_page}")

    # Resolve the default lazily so callers can supply their own directory.
    base_dir = OUTPUT_DIR if output_dir is None else Path(output_dir)

    # Everything except the offset is constant across pages.
    search_url = (
        "https://www.isil.at/primo-explore/search"
        "?query=any,contains,AT-"
        "&tab=default_tab"
        "&search_scope=default_scope"
        "&vid=AIS"
    )

    def describe(page_num):
        # Page 1 starts at offset 0, so page N starts at (N - 1) * stride.
        offset = (page_num - 1) * results_per_page
        return {
            "page": page_num,
            "offset": offset,
            "url": f"{search_url}&offset={offset}",
            "output_file": base_dir / f"page_{page_num:03d}_data.json",
        }

    return [describe(page_num) for page_num in range(start_page, end_page + 1)]
|
|
|
|
if __name__ == "__main__":
    # Dry-run listing: show what would be scraped, one line per page.
    page_entries = generate_page_info()
    print("Pages to scrape:")
    for entry in page_entries:
        number = entry["page"]
        start = entry["offset"]
        filename = entry["output_file"].name
        print(f" Page {number}: offset={start}, file={filename}")
    print(f"\nTotal: {len(page_entries)} pages")
|