#!/usr/bin/env python3
"""
Batch scraper for Austrian ISIL database.

This script will be called by the OpenCODE agent in batches. It does not
fetch anything itself: it computes the page URLs and output file names for
one batch and writes them to a JSON instruction file.

Usage:
    python3 batch_scrape_austrian_isil.py --start-page N --batch-size M
"""

import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Union

# Directory that receives both the batch instruction file and the per-page
# output files named in it.
DEFAULT_OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/austria")

# Number of results per search page; used to turn a page number into the
# ``offset`` query parameter.
RESULTS_PER_PAGE = 10

SEARCH_URL_TEMPLATE = (
    "https://www.isil.at/primo-explore/search?query=any,contains,AT-"
    "&tab=default_tab&search_scope=default_scope&vid=AIS&offset={offset}"
)

# How many pages are echoed to the console before the list is summarised.
PREVIEW_LIMIT = 5


def create_batch_instructions(
    start_page: int,
    batch_size: int,
    output_dir: Optional[Union[str, Path]] = None,
    results_per_page: int = RESULTS_PER_PAGE,
) -> List[Dict[str, Any]]:
    """Generate instructions for scraping a batch of pages.

    Args:
        start_page: 1-based number of the first page in the batch.
        batch_size: Number of consecutive pages to describe. Zero or a
            negative value yields an empty list.
        output_dir: Directory used to build each page's ``output_file``
            path. Defaults to ``DEFAULT_OUTPUT_DIR``.
        results_per_page: Page size used to compute the ``offset`` of each
            page as ``(page - 1) * results_per_page``.

    Returns:
        One dict per page, in page order, with the keys ``page``,
        ``offset``, ``url`` and ``output_file``.
    """
    target_dir = DEFAULT_OUTPUT_DIR if output_dir is None else Path(output_dir)

    pages = []
    for page_num in range(start_page, start_page + batch_size):
        offset = (page_num - 1) * results_per_page
        pages.append({
            "page": page_num,
            "offset": offset,
            "url": SEARCH_URL_TEMPLATE.format(offset=offset),
            "output_file": str(target_dir / f"page_{page_num:03d}_data.json"),
        })
    return pages


def _positive_int(value: str) -> int:
    """Parse *value* as an integer >= 1 for use as an argparse ``type``."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid integer value: {value!r}"
        ) from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be >= 1, got {number}")
    return number


def main(argv: Optional[Sequence[str]] = None) -> Path:
    """Write the instruction file for one batch and print a short summary.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv``.

    Returns:
        Path of the JSON batch instruction file that was written.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including a
            start page or batch size below 1.
        OSError: If the output directory cannot be created or the file
            cannot be written.
    """
    parser = argparse.ArgumentParser(
        description="Generate batch scraping instructions"
    )
    parser.add_argument("--start-page", type=_positive_int, default=3,
                        help="Start page number")
    parser.add_argument("--batch-size", type=_positive_int, default=20,
                        help="Number of pages to scrape")
    parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR,
                        help="Directory for the batch file and page outputs")
    args = parser.parse_args(argv)

    end_page = args.start_page + args.batch_size - 1
    rule = "=" * 70

    print("\n📦 Batch Scraping Instructions")
    print(rule)
    print(f"Start Page: {args.start_page}")
    print(f"Batch Size: {args.batch_size}")
    print(f"End Page: {end_page}")
    print(rule)
    print()

    pages = create_batch_instructions(
        args.start_page, args.batch_size, output_dir=args.output_dir
    )

    # Save batch instructions. The directory is created first so a fresh
    # checkout or machine does not fail with FileNotFoundError.
    output_dir = args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)
    batch_file = output_dir / f"batch_{args.start_page}_{end_page}.json"

    payload = {
        "start_page": args.start_page,
        "batch_size": args.batch_size,
        "pages": pages,
        "created_at": datetime.now(timezone.utc).isoformat(),
    }
    # Explicit encoding keeps the file independent of the platform locale.
    with open(batch_file, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

    print(f"✅ Batch instructions saved to: {batch_file}")
    print()
    print("📋 Pages in this batch:")
    for p in pages[:PREVIEW_LIMIT]:
        print(f"  - Page {p['page']}: offset {p['offset']}")
    if len(pages) > PREVIEW_LIMIT:
        print(f"  ... and {len(pages) - PREVIEW_LIMIT} more pages")
    print()

    return batch_file


if __name__ == "__main__":
    main()