glam/scripts/batch_scrape_austrian_pages.sh
2025-11-19 23:25:22 +01:00

53 lines
1.6 KiB
Bash

#!/bin/bash
# Batch scraper for Austrian ISIL pages.
# Prints a checklist for scraping the pages manually through the
# Playwright MCP tools; nothing here performs a network request.

# Search URL prefix; a numeric result offset is appended to its end.
BASE_URL='https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset='
# NOTE(review): presumably the directory the scraped JSON is saved to;
# it is not referenced anywhere in the visible part of this script.
DATA_DIR='/Users/kempersc/apps/glam/data/isil/austria'

# Banner and per-page workflow, emitted verbatim. The quoted delimiter
# turns off every expansion inside the here-document.
cat <<'BANNER'
==========================================
AUSTRIAN ISIL BATCH SCRAPING INSTRUCTIONS
==========================================

Pages to scrape: 4, 6-10, 12-20

For each page, execute this workflow:
1. playwright_browser_navigate(url)
2. playwright_browser_wait_for(time=5)
3. playwright_browser_evaluate(EXTRACT_JS)
4. Save JSON to page_XXX_data.json
5. playwright_browser_close()
6. sleep 3

BANNER
# Page numbers to list; the search offset for page N is (N - 1) * 10.
declare -a PAGES=(4 6 7 8 9 10 12 13 14 15 16 17 18 19 20)

# For every page print its number, offset, full search URL and the name
# of the JSON file its results should be saved to.
for PAGE in "${PAGES[@]}"; do
  # Variables inside $(( )) need no '$' prefix.
  OFFSET=$(( (PAGE - 1) * 10 ))
  URL="${BASE_URL}${OFFSET}"
  # printf -v writes straight into FILE: no subshell per iteration and
  # the page number is passed as a quoted argument.
  printf -v FILE 'page_%03d_data.json' "$PAGE"
  # printf with fixed format strings keeps variable data out of echo.
  printf 'PAGE %s (offset=%s)\n' "$PAGE" "$OFFSET"
  printf ' URL: %s\n' "$URL"
  printf ' Output: %s\n' "$FILE"
  printf '\n'
done
# Trailing section: a blank line, a titled separator, then the JavaScript
# snippet to pass to playwright_browser_evaluate. Everything is emitted
# from one quoted here-document, so nothing inside it is expanded by the
# shell. The snippet collects each h3.item-title whose text ends in an
# "AT-..." code and returns the name/code pairs together with their count.
cat <<'JSEND'

==========================================
EXTRACTION JAVASCRIPT:
==========================================
() => {
const results = [];
const headings = document.querySelectorAll('h3.item-title');
headings.forEach((heading) => {
const fullText = heading.textContent.trim();
const match = fullText.match(/^(.*?)\s+(AT-[A-Za-z0-9-]+)\s*$/);
if (match) {
results.push({name: match[1].trim(), isil: match[2].trim()});
}
});
return {count: results.length, institutions: results};
}
JSEND