# glam/scripts/scrape_austrian_isil_complete.py
# 2025-11-19 23:25:22 +01:00
#
# 161 lines
# 5.6 KiB
# Python
#
#!/usr/bin/env python3
"""
Complete Austrian ISIL Database Scraper
Uses Playwright MCP tools to scrape all 194 pages of the Austrian ISIL database.
This script coordinates with OpenCODE's Playwright MCP server to:
1. Navigate to each results page (10 results per page, 194 pages total)
2. Wait for JavaScript rendering
3. Extract institution names and ISIL codes
4. Save progress after each page
Total institutions: 1,934
Pages to scrape: 194 (10 per page, offset by 10)
Estimated time: ~10 minutes with 3-second rate limiting
"""
import json
import time
from datetime import datetime, timezone
from pathlib import Path
def save_page_data(page_num: int, offset: int, institutions: list, output_dir: Path):
    """Persist the institutions scraped from one results page.

    Writes ``page_<NNN>_data.json`` (zero-padded page number) into
    *output_dir*, recording the page number, result offset, source URL,
    UTC extraction timestamp, and the institution records themselves.

    Returns the Path of the file that was written.
    """
    record = {
        "page": page_num,
        "offset": offset,
        "url": f"https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset={offset}",
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "count": len(institutions),
        "institutions": institutions,
    }
    target = output_dir / f"page_{page_num:03d}_data.json"
    with open(target, "w", encoding="utf-8") as fh:
        json.dump(record, fh, indent=2, ensure_ascii=False)
    print(f"✅ Page {page_num} saved: {len(institutions)} institutions")
    return target
def main(output_dir=None):
    """
    Main scraper orchestration.

    Note: This script is designed to work WITH OpenCODE's Playwright MCP tools.
    The actual browser automation happens through OpenCODE's MCP server.
    This script provides the logic and coordination.

    Args:
        output_dir: Directory that receives the per-page JSON files and the
            scraping manifest. Defaults to the original hard-coded location,
            so existing invocations behave exactly as before; pass a path to
            make the script portable/testable on other machines.
    """
    # Setup — the machine-specific absolute path is kept only as the default.
    if output_dir is None:
        output_dir = Path("/Users/kempersc/apps/glam/data/isil/austria")
    else:
        output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Configuration
    TOTAL_INSTITUTIONS = 1934
    RESULTS_PER_PAGE = 10  # Using 10 per page for more reliable extraction
    # Ceiling division -> 194 pages
    TOTAL_PAGES = (TOTAL_INSTITUTIONS + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE
    START_PAGE = 3  # Resume from page 3 (we already have 1, 2, and 11)
    RATE_LIMIT_SECONDS = 3

    print(f"🚀 Austrian ISIL Scraper")
    print(f"📊 Total pages to scrape: {TOTAL_PAGES}")
    print(f"📄 Results per page: {RESULTS_PER_PAGE}")
    print(f"⏱️ Estimated time: ~{TOTAL_PAGES * RATE_LIMIT_SECONDS / 60:.1f} minutes")
    print(f"📁 Output directory: {output_dir}")
    print(f"▶️ Starting from page {START_PAGE}")
    print()

    # Resume support: collect page numbers from filenames like
    # page_003_data.json already present in the output directory.
    existing_pages = set()
    for existing_file in output_dir.glob("page_*.json"):
        try:
            page_num = int(existing_file.stem.split('_')[1])
            existing_pages.add(page_num)
        except (IndexError, ValueError):
            pass  # ignore files that don't follow the naming scheme
    if existing_pages:
        print(f"✅ Found {len(existing_pages)} existing pages: {sorted(existing_pages)}")
        print()

    # Instructions for OpenCODE agent (printed verbatim for the operator)
    print("=" * 70)
    print("INSTRUCTIONS FOR OPENCODE AGENT:")
    print("=" * 70)
    print()
    print("For each page from {} to {}:".format(START_PAGE, TOTAL_PAGES))
    print()
    print("1. Calculate offset: offset = (page - 1) * 10")
    print("2. Navigate to URL:")
    print("   https://www.isil.at/primo-explore/search?query=any,contains,AT-&offset={offset}")
    print()
    print("3. Wait 5 seconds for JavaScript to render")
    print()
    print("4. Extract institutions with JavaScript:")
    print("""
const results = [];
const headings = document.querySelectorAll('h3.item-title');
headings.forEach((heading) => {
const fullText = heading.textContent.trim();
const match = fullText.match(/^(.*?)\\s+(AT-[A-Za-z0-9-]+)\\s*$/);
if (match) {
results.push({
name: match[1].trim(),
isil: match[2].trim()
});
}
});
return { count: results.length, institutions: results };
""")
    print()
    print("5. Save to: data/isil/austria/page_{:03d}_data.json".format(START_PAGE))
    print()
    print("6. Sleep 3 seconds (rate limiting)")
    print()
    print("7. Repeat for next page")
    print()
    print("=" * 70)
    print()

    # Generate manifest of pages still to scrape (skip completed ones)
    pages_to_scrape = []
    for page in range(START_PAGE, TOTAL_PAGES + 1):
        if page not in existing_pages:
            offset = (page - 1) * RESULTS_PER_PAGE
            pages_to_scrape.append({
                "page": page,
                "offset": offset,
                "url": f"https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset={offset}"
            })
    print(f"📋 Pages remaining to scrape: {len(pages_to_scrape)}")
    if len(pages_to_scrape) <= 10:
        print(f"   Next pages: {[p['page'] for p in pages_to_scrape]}")
    else:
        print(f"   Next 10 pages: {[p['page'] for p in pages_to_scrape[:10]]}")
    print()

    # Save manifest so the agent (and a later resume) knows where things stand
    manifest_file = output_dir / "scraping_manifest.json"
    with open(manifest_file, 'w', encoding='utf-8') as f:
        json.dump({
            "total_institutions": TOTAL_INSTITUTIONS,
            "results_per_page": RESULTS_PER_PAGE,
            "total_pages": TOTAL_PAGES,
            "pages_completed": sorted(existing_pages),
            "pages_remaining": [p['page'] for p in pages_to_scrape],
            "next_pages_to_scrape": pages_to_scrape[:20]  # First 20 for reference
        }, f, indent=2)
    print(f"✅ Scraping manifest saved to: {manifest_file}")
    print()
    print("🤖 Ready for OpenCODE agent to continue scraping!")
# Entry point: run the orchestration only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    main()