#!/usr/bin/env python3
|
|
"""
|
|
Complete Austrian ISIL Database Scraper
|
|
Uses Playwright MCP tools to scrape all 194 pages of the Austrian ISIL database.
|
|
|
|
This script coordinates with OpenCODE's Playwright MCP server to:
|
|
1. Navigate to each results page (10 results per page, 194 pages total)
|
|
2. Wait for JavaScript rendering
|
|
3. Extract institution names and ISIL codes
|
|
4. Save progress after each page
|
|
|
|
Total institutions: 1,934
|
|
Pages to scrape: 194 (10 per page, offset by 10)
|
|
Estimated time: ~10 minutes with 3-second rate limiting
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
|
|
def save_page_data(page_num: int, offset: int, institutions: list, output_dir: Path):
    """Persist the institutions extracted from one results page as JSON.

    Args:
        page_num: 1-based page number within the result set.
        offset: Result offset embedded in the source URL for this page.
        institutions: Extracted records (name/ISIL dicts) for this page.
        output_dir: Directory the ``page_NNN_data.json`` file is written to.

    Returns:
        Path of the JSON file that was written.
    """
    record = {
        "page": page_num,
        "offset": offset,
        "url": f"https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset={offset}",
        # Timezone-aware UTC timestamp so extraction runs are comparable.
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "count": len(institutions),
        "institutions": institutions,
    }

    # Zero-padded page number keeps the files lexically sorted on disk.
    destination = output_dir / f"page_{page_num:03d}_data.json"
    destination.write_text(
        json.dumps(record, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    print(f"✅ Page {page_num} saved: {len(institutions)} institutions")
    return destination
|
|
|
|
|
|
def _find_existing_pages(output_dir: Path) -> set:
    """Return the set of page numbers already saved as page_*.json in output_dir."""
    found = set()
    for existing_file in output_dir.glob("page_*.json"):
        try:
            # Files are named page_NNN_data.json; field 1 of the stem is NNN.
            found.add(int(existing_file.stem.split('_')[1]))
        except (IndexError, ValueError):
            pass  # Skip files that don't follow the naming scheme.
    return found


def _print_agent_instructions(start_page: int, total_pages: int):
    """Print the step-by-step scraping recipe for the OpenCODE agent."""
    print("=" * 70)
    print("INSTRUCTIONS FOR OPENCODE AGENT:")
    print("=" * 70)
    print()
    print("For each page from {} to {}:".format(start_page, total_pages))
    print()
    print("1. Calculate offset: offset = (page - 1) * 10")
    print("2. Navigate to URL:")
    print("   https://www.isil.at/primo-explore/search?query=any,contains,AT-&offset={offset}")
    print()
    print("3. Wait 5 seconds for JavaScript to render")
    print()
    print("4. Extract institutions with JavaScript:")
    print("""
   const results = [];
   const headings = document.querySelectorAll('h3.item-title');

   headings.forEach((heading) => {
       const fullText = heading.textContent.trim();
       const match = fullText.match(/^(.*?)\\s+(AT-[A-Za-z0-9-]+)\\s*$/);

       if (match) {
           results.push({
               name: match[1].trim(),
               isil: match[2].trim()
           });
       }
   });

   return { count: results.length, institutions: results };
   """)
    print()
    print("5. Save to: data/isil/austria/page_{:03d}_data.json".format(start_page))
    print()
    print("6. Sleep 3 seconds (rate limiting)")
    print()
    print("7. Repeat for next page")
    print()
    print("=" * 70)
    print()


def main(output_dir: Path = Path("/Users/kempersc/apps/glam/data/isil/austria")):
    """
    Main scraper orchestration.

    Note: This script is designed to work WITH OpenCODE's Playwright MCP tools.
    The actual browser automation happens through OpenCODE's MCP server.
    This script provides the logic and coordination.

    Args:
        output_dir: Directory where page data files and the scraping manifest
            are written. Defaults to the original hard-coded location so
            existing callers (``main()``) are unaffected.
    """
    # Setup
    output_dir.mkdir(parents=True, exist_ok=True)

    # Configuration
    TOTAL_INSTITUTIONS = 1934
    RESULTS_PER_PAGE = 10  # Using 10 per page for more reliable extraction
    TOTAL_PAGES = (TOTAL_INSTITUTIONS + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE  # 194 pages
    START_PAGE = 3  # First page shown in the printed agent instructions
    RATE_LIMIT_SECONDS = 3

    print(f"🚀 Austrian ISIL Scraper")
    print(f"📊 Total pages to scrape: {TOTAL_PAGES}")
    print(f"📄 Results per page: {RESULTS_PER_PAGE}")
    print(f"⏱️ Estimated time: ~{TOTAL_PAGES * RATE_LIMIT_SECONDS / 60:.1f} minutes")
    print(f"📁 Output directory: {output_dir}")
    print(f"▶️ Starting from page {START_PAGE}")
    print()

    # Check which pages we already have on disk.
    existing_pages = _find_existing_pages(output_dir)

    if existing_pages:
        print(f"✅ Found {len(existing_pages)} existing pages: {sorted(existing_pages)}")
        print()

    # Instructions for OpenCODE agent
    _print_agent_instructions(START_PAGE, TOTAL_PAGES)

    # Generate manifest of pages to scrape.
    # Fix: the original loop started at START_PAGE on the stale assumption that
    # pages 1..START_PAGE-1 were already saved, so a missing early page could
    # never be re-queued. Scanning from page 1 and trusting the on-disk check
    # makes the manifest self-correcting.
    pages_to_scrape = []
    for page in range(1, TOTAL_PAGES + 1):
        if page not in existing_pages:
            offset = (page - 1) * RESULTS_PER_PAGE
            pages_to_scrape.append({
                "page": page,
                "offset": offset,
                "url": f"https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset={offset}"
            })

    print(f"📋 Pages remaining to scrape: {len(pages_to_scrape)}")
    if len(pages_to_scrape) <= 10:
        print(f"   Next pages: {[p['page'] for p in pages_to_scrape]}")
    else:
        print(f"   Next 10 pages: {[p['page'] for p in pages_to_scrape[:10]]}")
    print()

    # Save manifest
    manifest_file = output_dir / "scraping_manifest.json"
    with open(manifest_file, 'w', encoding='utf-8') as f:
        json.dump({
            "total_institutions": TOTAL_INSTITUTIONS,
            "results_per_page": RESULTS_PER_PAGE,
            "total_pages": TOTAL_PAGES,
            "pages_completed": sorted(existing_pages),
            "pages_remaining": [p['page'] for p in pages_to_scrape],
            "next_pages_to_scrape": pages_to_scrape[:20]  # First 20 for reference
        }, f, indent=2)

    print(f"✅ Scraping manifest saved to: {manifest_file}")
    print()
    print("🤖 Ready for OpenCODE agent to continue scraping!")


if __name__ == "__main__":
    main()
|