glam/scripts/scrape_pages_14_to_20.py
2025-11-19 23:25:22 +01:00

34 lines
1.2 KiB
Python

#!/usr/bin/env python3
"""
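Quick script that prepares the scrape of pages 14-20 of the Austrian ISIL database.
It only generates the page numbers, offsets, URLs and output file paths; the
pages themselves are fetched separately with Playwright browser tools.
This continues from where page 13 left off.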
"""
import json
import time
from pathlib import Path
# Note: This script expects to be run via OpenCode's Playwright browser tools
# It generates the URLs and offsets for pages 14-20
# Directory under which the per-page JSON output file paths are built.
# NOTE(review): hard-coded absolute path -- confirm it is valid on the machine
# that runs this script.
OUTPUT_DIR: Path = Path("/Users/kempersc/apps/glam/data/isil/austria")
def generate_page_info(
    start_page: int = 14,
    end_page: int = 20,
    page_size: int = 10,
    output_dir: "Path | str | None" = None,
) -> "list[dict]":
    """Build the scrape plan for a contiguous range of result pages.

    Called without arguments this returns the plan for pages 14 through 20
    with 10 results per page, written under the module-level ``OUTPUT_DIR``.

    Args:
        start_page: First 1-based page number to include.
        end_page: Last page number to include (inclusive). A value below
            ``start_page`` yields an empty list.
        page_size: Number of results per page; the offset of a page is
            ``(page - 1) * page_size``.
        output_dir: Directory the output file paths are built under.
            ``None`` means the module-level ``OUTPUT_DIR``.

    Returns:
        One dict per page, in ascending page order, with the keys
        ``"page"``, ``"offset"``, ``"url"`` and ``"output_file"`` (a ``Path``).

    Raises:
        ValueError: If ``start_page`` or ``page_size`` is smaller than 1,
            which would produce a negative or constant offset.
    """
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")
    if page_size < 1:
        raise ValueError(f"page_size must be >= 1, got {page_size}")

    # Resolve the default at call time so a later change to OUTPUT_DIR is honoured.
    base_dir = OUTPUT_DIR if output_dir is None else Path(output_dir)

    # Everything except the trailing offset value is identical for every page.
    url_prefix = (
        "https://www.isil.at/primo-explore/search"
        "?query=any,contains,AT-&tab=default_tab"
        "&search_scope=default_scope&vid=AIS&offset="
    )

    return [
        {
            "page": page_num,
            "offset": (page_num - 1) * page_size,
            "url": f"{url_prefix}{(page_num - 1) * page_size}",
            "output_file": base_dir / f"page_{page_num:03d}_data.json",
        }
        for page_num in range(start_page, end_page + 1)
    ]
if __name__ == "__main__":
    # Dry run: print the plan for each page; nothing is fetched or written here.
    pages = generate_page_info()
    print("Pages to scrape:")
    for entry in pages:
        page_no = entry["page"]
        start_at = entry["offset"]
        target_name = entry["output_file"].name
        print(f" Page {page_no}: offset={start_at}, file={target_name}")
    print(f"\nTotal: {len(pages)} pages")