#!/bin/bash
#
# Batch scraper for Austrian ISIL pages.
#
# This script does not drive a browser itself. It prints instructions for
# manual Playwright MCP scraping: the per-page workflow, one URL and output
# filename per page still to be scraped, and the JavaScript snippet to pass
# to playwright_browser_evaluate.
#
# Usage: run without arguments; all output goes to stdout.

set -euo pipefail

# Search URL; the numeric result offset is appended directly to it.
readonly BASE_URL="https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset="

# NOTE(review): not referenced anywhere in this script. Presumably the
# directory the page_XXX_data.json files are saved into -- confirm.
# shellcheck disable=SC2034
readonly DATA_DIR="/Users/kempersc/apps/glam/data/isil/austria"

# Page numbers to print instructions for (matches "4, 6-10, 12-20" below).
PAGES=(4 6 7 8 9 10 12 13 14 15 16 17 18 19 20)
readonly PAGES

#######################################
# Compute the result offset for a 1-based page number.
# Arguments: $1 - page number
# Outputs:   (page - 1) * 10 on stdout
#######################################
page_offset() {
  local page=$1
  printf '%s\n' "$(( (page - 1) * 10 ))"
}

#######################################
# Build the JSON filename for a page (number zero-padded to 3 digits).
# Arguments: $1 - page number
# Outputs:   page_NNN_data.json on stdout
#######################################
page_file() {
  local page=$1
  local padded
  printf -v padded '%03d' "$page"
  printf '%s\n' "page_${padded}_data.json"
}

#######################################
# Print the banner and the per-page workflow steps.
# Outputs: static instruction text on stdout
#######################################
print_header() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'HEADER'
==========================================
AUSTRIAN ISIL BATCH SCRAPING INSTRUCTIONS
==========================================

Pages to scrape: 4, 6-10, 12-20

For each page, execute this workflow:
1. playwright_browser_navigate(url)
2. playwright_browser_wait_for(time=5)
3. playwright_browser_evaluate(EXTRACT_JS)
4. Save JSON to page_XXX_data.json
5. playwright_browser_close()
6. sleep 3

HEADER
}

#######################################
# Print the offset, URL and output filename for every page in PAGES.
# Globals: PAGES (read), BASE_URL (read)
# Outputs: one four-line entry per page on stdout
#######################################
print_pages() {
  local page offset url file
  for page in "${PAGES[@]}"; do
    offset=$(page_offset "$page")
    url="${BASE_URL}${offset}"
    file=$(page_file "$page")

    echo "PAGE $page (offset=$offset)"
    echo " URL: $url"
    echo " Output: $file"
    echo ""
  done
}

#######################################
# Print the JavaScript to hand to playwright_browser_evaluate.
# The snippet collects every h3.item-title heading whose text ends in an
# "AT-..." identifier and returns {count, institutions: [{name, isil}]}.
# Outputs: banner plus the JavaScript source on stdout
#######################################
print_extraction_js() {
  echo ""
  echo "=========================================="
  echo "EXTRACTION JAVASCRIPT:"
  echo "=========================================="
  # Quoted delimiter keeps the regex backslashes and JS syntax untouched.
  cat <<'JSEND'
() => {
  const results = [];
  const headings = document.querySelectorAll('h3.item-title');

  headings.forEach((heading) => {
    const fullText = heading.textContent.trim();
    const match = fullText.match(/^(.*?)\s+(AT-[A-Za-z0-9-]+)\s*$/);
    if (match) {
      results.push({name: match[1].trim(), isil: match[2].trim()});
    }
  });

  return {count: results.length, institutions: results};
}
JSEND
}

#######################################
# Entry point: emit the full instruction sheet.
#######################################
main() {
  print_header
  print_pages
  print_extraction_js
}

main "$@"