#!/usr/bin/env bash
#
# Batch scraper helper for Austrian ISIL pages.
#
# This script does not scrape anything itself: it prints, to stdout, the
# instructions for a manual Playwright MCP scraping session -- the workflow
# steps, one URL/output-file pair per result page still to be scraped, and
# the JavaScript snippet to pass to playwright_browser_evaluate.
#
# Usage: run with no arguments.

set -euo pipefail

# Search URL of the ISIL catalogue; the result offset is appended to it.
readonly BASE_URL="https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset="

# NOTE(review): not referenced anywhere in this script; presumably the
# directory the operator saves the page_XXX_data.json files into -- confirm.
# shellcheck disable=SC2034
readonly DATA_DIR="/Users/kempersc/apps/glam/data/isil/austria"

# Result pages (1-based) that still need scraping. Keep the human-readable
# "Pages to scrape" line in print_header in sync with this list.
readonly -a PAGES=(4 6 7 8 9 10 12 13 14 15 16 17 18 19 20)

#######################################
# Validate a 1-based page number and print it in plain decimal.
# Forces base 10 so zero-padded input such as "08" is not parsed as octal.
# Arguments: $1 - page number (digits only, >= 1)
# Outputs:   normalized number on stdout; error message on stderr
# Returns:   0 on success, 1 if the argument is not a valid page number
#######################################
normalize_page() {
  local raw=${1-}
  if [[ ! "$raw" =~ ^[0-9]+$ ]] || (( 10#$raw < 1 )); then
    printf 'error: invalid page number: %s\n' "$raw" >&2
    return 1
  fi
  printf '%d\n' "$(( 10#$raw ))"
}

#######################################
# Compute the search offset for a page: (page - 1) * 10.
# Arguments: $1 - page number
# Outputs:   offset on stdout
# Returns:   non-zero if the page number is invalid
#######################################
page_offset() {
  local page
  page=$(normalize_page "${1-}") || return
  printf '%d\n' "$(( (page - 1) * 10 ))"
}

#######################################
# Build the full search URL for a page.
# Globals:   BASE_URL (read)
# Arguments: $1 - page number
# Outputs:   URL on stdout
# Returns:   non-zero if the page number is invalid
#######################################
page_url() {
  local offset
  offset=$(page_offset "${1-}") || return
  printf '%s%s\n' "$BASE_URL" "$offset"
}

#######################################
# Build the output file name for a page (page number zero-padded to 3 digits).
# Arguments: $1 - page number
# Outputs:   file name on stdout
# Returns:   non-zero if the page number is invalid
#######################################
page_file() {
  local page
  page=$(normalize_page "${1-}") || return
  printf 'page_%03d_data.json\n' "$page"
}

#######################################
# Print the banner and the per-page workflow steps.
# Outputs: writes the header to stdout
#######################################
print_header() {
  printf '%s\n' \
    "==========================================" \
    "AUSTRIAN ISIL BATCH SCRAPING INSTRUCTIONS" \
    "==========================================" \
    "" \
    "Pages to scrape: 4, 6-10, 12-20" \
    "" \
    "For each page, execute this workflow:" \
    "1. playwright_browser_navigate(url)" \
    "2. playwright_browser_wait_for(time=5)" \
    "3. playwright_browser_evaluate(EXTRACT_JS)" \
    "4. Save JSON to page_XXX_data.json" \
    "5. playwright_browser_close()" \
    "6. sleep 3" \
    ""
}

#######################################
# Print the offset, URL and output file name for each given page.
# Arguments: $@ - page numbers
# Outputs:   one four-line entry per page on stdout
# Returns:   non-zero on the first invalid page number
#######################################
print_page_plan() {
  local page offset url file
  for page in "$@"; do
    # Assign separately from use so a validation failure is not masked.
    offset=$(page_offset "$page") || return
    url=$(page_url "$page") || return
    file=$(page_file "$page") || return

    printf 'PAGE %s (offset=%s)\n' "$page" "$offset"
    printf ' URL: %s\n' "$url"
    printf ' Output: %s\n' "$file"
    printf '\n'
  done
}

#######################################
# Print the JavaScript to pass to playwright_browser_evaluate.
# The snippet collects every h3.item-title heading whose text ends in an
# "AT-..." code and returns {count, institutions: [{name, isil}, ...]}.
# Outputs: writes the banner and the snippet to stdout
#######################################
print_extraction_js() {
  printf '%s\n' \
    "==========================================" \
    "EXTRACTION JAVASCRIPT:" \
    "=========================================="
  # Quoted delimiter: the snippet is emitted literally, with no shell expansion.
  cat <<'JSEND'
() => {
const results = [];
const headings = document.querySelectorAll('h3.item-title');
headings.forEach((heading) => {
const fullText = heading.textContent.trim();
const match = fullText.match(/^(.*?)\s+(AT-[A-Za-z0-9-]+)\s*$/);
if (match) {
results.push({name: match[1].trim(), isil: match[2].trim()});
}
});
return {count: results.length, institutions: results};
}
JSEND
}

#######################################
# Entry point: print the header, the per-page plan, then the JS snippet.
# Globals: PAGES (read)
#######################################
main() {
  print_header
  print_page_plan "${PAGES[@]}"
  printf '\n'
  print_extraction_js
}

main "$@"