glam/scripts/batch_scrape_austrian_isil.py
2025-11-19 23:25:22 +01:00

76 lines
2.5 KiB
Python

#!/usr/bin/env python3
"""
Batch scraper for Austrian ISIL database.
This script will be called by the OpenCODE agent in batches.
Usage: python3 batch_scrape_austrian_isil.py --start-page N --batch-size M
"""
import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
def create_batch_instructions(start_page: int, batch_size: int, *,
                              output_dir=None, results_per_page: int = 10) -> list:
    """Build the scraping instructions for a contiguous batch of result pages.

    Nothing is fetched or written here; the function only computes, for each
    page, the result offset, the search URL and the path of the file the
    scraped data is meant to be stored in.

    Args:
        start_page: 1-based number of the first page in the batch.
        batch_size: Number of consecutive pages to describe. Zero or a
            negative value yields an empty list.
        output_dir: Directory the per-page ``output_file`` paths point into
            (``str`` or ``Path``). Defaults to the project's Austrian ISIL
            data directory.
        results_per_page: Number of search results per page, used to turn a
            page number into a result offset.

    Returns:
        A list of dicts, one per page, with the keys ``page``, ``offset``,
        ``url`` and ``output_file`` (a string path).

    Raises:
        ValueError: If ``start_page`` or ``results_per_page`` is smaller
            than 1.
    """
    # Page 1 maps to offset 0, so anything below 1 would produce a negative
    # offset in the generated URL.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")
    if results_per_page < 1:
        raise ValueError(f"results_per_page must be >= 1, got {results_per_page}")

    if output_dir is None:
        output_dir = "/Users/kempersc/apps/glam/data/isil/austria"
    output_dir = Path(output_dir)

    pages = []
    for page_num in range(start_page, start_page + batch_size):
        offset = (page_num - 1) * results_per_page
        pages.append({
            "page": page_num,
            "offset": offset,
            "url": f"https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset={offset}",
            "output_file": str(output_dir / f"page_{page_num:03d}_data.json"),
        })
    return pages
def main(argv=None):
    """Parse the command line, build one batch and save it as JSON.

    Prints a short summary of the batch, writes the instructions returned by
    ``create_batch_instructions`` to ``batch_<start>_<end>.json`` in the
    Austrian ISIL data directory and lists the first pages of the batch.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Returns:
        Path of the batch file that was written.

    Raises:
        SystemExit: Raised by argparse when the arguments cannot be parsed
            or when ``--start-page`` / ``--batch-size`` is smaller than 1.
    """
    parser = argparse.ArgumentParser(description="Generate batch scraping instructions")
    parser.add_argument("--start-page", type=int, default=3, help="Start page number")
    parser.add_argument("--batch-size", type=int, default=20, help="Number of pages to scrape")
    args = parser.parse_args(argv)

    # Pages are 1-based and a batch needs at least one page; anything else
    # would yield negative offsets or an end page before the start page.
    if args.start_page < 1:
        parser.error("--start-page must be >= 1")
    if args.batch_size < 1:
        parser.error("--batch-size must be >= 1")

    end_page = args.start_page + args.batch_size - 1
    separator = "=" * 70

    print("\n📦 Batch Scraping Instructions")
    print(separator)
    print(f"Start Page: {args.start_page}")
    print(f"Batch Size: {args.batch_size}")
    print(f"End Page: {end_page}")
    print(separator)
    print()

    pages = create_batch_instructions(args.start_page, args.batch_size)

    # Save batch instructions
    output_dir = Path("/Users/kempersc/apps/glam/data/isil/austria")
    # Create the directory on first use so the write below cannot fail with
    # FileNotFoundError on a fresh checkout.
    output_dir.mkdir(parents=True, exist_ok=True)
    batch_file = output_dir / f"batch_{args.start_page}_{end_page}.json"
    with open(batch_file, "w", encoding="utf-8") as f:
        json.dump({
            "start_page": args.start_page,
            "batch_size": args.batch_size,
            "pages": pages,
            "created_at": datetime.now(timezone.utc).isoformat(),
        }, f, indent=2)

    print(f"✅ Batch instructions saved to: {batch_file}")
    print()
    print("📋 Pages in this batch:")
    # Only preview the first five pages to keep the console output short.
    for p in pages[:5]:
        print(f" - Page {p['page']}: offset {p['offset']}")
    if len(pages) > 5:
        print(f" ... and {len(pages) - 5} more pages")
    print()
    return batch_file
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()