#!/usr/bin/env python3
"""
Systematic scraper for Austrian ISIL database using Playwright MCP.

Scrapes pages sequentially to avoid navigation conflicts.
"""
|
|
|
|
import json
import time
from pathlib import Path

# --- Configuration ---------------------------------------------------------

# Primo search URL for all AT- ISIL codes; the numeric offset is appended.
BASE_URL = "https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset="

# Where per-page JSON result files are written.
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/austria")

# Primo returns this many hits per results page.
RESULTS_PER_PAGE = 10

# Polite delay between page fetches, in seconds.
WAIT_BETWEEN_PAGES = 3
|
|
|
|
# JavaScript snippet run in the browser page (via Playwright's evaluate).
# It scans every result heading, splits off the trailing ISIL code
# (pattern "AT-…"), and returns {count, institutions:[{name, isil}, …]}.
# NOTE: the "\\s" below is the Python-source escape for a literal "\s"
# in the JS regex — do not "fix" it to a single backslash.
EXTRACT_JS = """() => {
const results = [];
const headings = document.querySelectorAll('h3.item-title');
headings.forEach((heading) => {
const fullText = heading.textContent.trim();
const match = fullText.match(/^(.*?)\\s+(AT-[A-Za-z0-9-]+)\\s*$/);
if (match) {
results.push({name: match[1].trim(), isil: match[2].trim()});
}
});
return {count: results.length, institutions: results};
}"""
|
|
|
|
|
|
def get_existing_pages():
    """Return the sorted page numbers that already have a saved JSON file.

    A saved file is named ``page_<NNN>_data.json``; the page number is the
    middle underscore-separated component of the stem.
    """
    return sorted(
        int(path.stem.split('_')[1])
        for path in DATA_DIR.glob("page_*_data.json")
    )
|
|
|
|
|
|
def get_offset_for_page(page_num):
    """Translate a 1-based page number into the Primo search offset."""
    zero_based_index = page_num - 1
    return zero_based_index * RESULTS_PER_PAGE
|
|
|
|
|
|
def save_page_data(page_num, offset, institutions):
    """Write one page's scraped institutions to a JSON file in DATA_DIR.

    Args:
        page_num: 1-based page number; used for the zero-padded filename.
        offset: Search-result offset the page was fetched at.
        institutions: List of ``{"name": ..., "isil": ...}`` dicts.
    """
    data = {
        "page": page_num,
        "offset": offset,
        "count": len(institutions),
        # UTC timestamp so runs from different machines stay comparable.
        "extraction_date": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "source_url": f"{BASE_URL}{offset}",
        "institutions": institutions
    }

    # FIX: create the output directory on first use so a fresh checkout
    # (or a wiped data dir) doesn't make open() fail with FileNotFoundError.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    output_file = DATA_DIR / f"page_{page_num:03d}_data.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps umlauts etc. readable in the output.
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved page {page_num} ({len(institutions)} institutions) to {output_file.name}")
|
|
|
|
|
|
def scrape_pages(start_page, end_page):
    """Build the per-page Playwright MCP workflow for pages not yet scraped.

    This function does not itself drive a browser; it generates instruction
    dicts describing the workflow to execute with Playwright MCP tools:
    1. Navigate to URL
    2. Wait 5 seconds
    3. Run extraction JavaScript
    4. Save to JSON
    5. Close browser
    6. Sleep WAIT_BETWEEN_PAGES seconds

    Args:
        start_page: First page number (inclusive, 1-based).
        end_page: Last page number (inclusive).

    Returns:
        List of instruction dicts with keys page/offset/url/workflow;
        empty list when every page in the range is already scraped.
    """
    existing_pages = get_existing_pages()
    print(f"Already scraped pages: {existing_pages}")
    print(f"Target range: {start_page}-{end_page}")

    # Skip pages that already have a saved JSON file (resumable runs).
    pages_to_scrape = [p for p in range(start_page, end_page + 1) if p not in existing_pages]

    if not pages_to_scrape:
        print("✅ All pages in range already scraped!")
        return []

    print(f"\n📋 Pages to scrape: {pages_to_scrape}")
    print(f"Total: {len(pages_to_scrape)} pages")

    instructions = []

    for page_num in pages_to_scrape:
        offset = get_offset_for_page(page_num)
        url = f"{BASE_URL}{offset}"

        instruction = {
            "page": page_num,
            "offset": offset,
            "url": url,
            "workflow": [
                f"1. Navigate to: {url}",
                "2. Wait 5 seconds for page load",
                "3. Run extraction JavaScript",
                f"4. Save to page_{page_num:03d}_data.json",
                "5. Close browser",
                # FIX: derive the delay from WAIT_BETWEEN_PAGES instead of a
                # hard-coded "3" so the text can never drift from the config.
                f"6. Sleep {WAIT_BETWEEN_PAGES} seconds"
            ]
        }
        instructions.append(instruction)

    return instructions
|
|
|
|
|
|
if __name__ == "__main__":
    # Scrape pages 3-20 (next batch after current progress)
    instructions = scrape_pages(start_page=3, end_page=20)

    if instructions:
        # Pretty-print the generated workflow for the operator to follow.
        banner = '=' * 60
        print(f"\n{banner}")
        print("SCRAPING WORKFLOW")
        print(f"{banner}\n")

        for inst in instructions:
            print(f"\n📄 PAGE {inst['page']} (offset={inst['offset']}):")
            for step in inst['workflow']:
                print(f"   {step}")

        print(f"\n{banner}")
        print(f"Next: Use Playwright MCP to execute workflow for {len(instructions)} pages")
        print(f"{banner}")