# glam/scripts/scrape_austrian_isil_systematic.py
# Snapshot metadata from source viewer: 2025-11-19 23:25:22 +01:00 · 130 lines · 4 KiB · Python
#!/usr/bin/env python3
"""
Systematic scraper for Austrian ISIL database using Playwright MCP.
Scrapes pages sequentially to avoid navigation conflicts.
"""
import json
import time
from pathlib import Path
# Configuration
# Primo search URL for AT- ISIL codes; the numeric result offset is appended at request time.
BASE_URL = "https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset="
# NOTE(review): absolute, user-specific path — consider making this configurable.
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/austria")
RESULTS_PER_PAGE = 10
WAIT_BETWEEN_PAGES = 3 # seconds
# Extraction JavaScript function: run in the browser page; parses each
# result heading of the form "<name> <AT-code>" into {name, isil} pairs.
EXTRACT_JS = """() => {
const results = [];
const headings = document.querySelectorAll('h3.item-title');
headings.forEach((heading) => {
const fullText = heading.textContent.trim();
const match = fullText.match(/^(.*?)\\s+(AT-[A-Za-z0-9-]+)\\s*$/);
if (match) {
results.push({name: match[1].trim(), isil: match[2].trim()});
}
});
return {count: results.length, institutions: results};
}"""
def get_existing_pages():
    """Return the sorted page numbers that already have saved JSON files.

    Scans DATA_DIR for files matching ``page_*_data.json`` and parses the
    page number out of each filename.
    """
    # Stems look like "page_003_data"; the middle underscore token is the number.
    return sorted(
        int(path.stem.split('_')[1])
        for path in DATA_DIR.glob("page_*_data.json")
    )
def get_offset_for_page(page_num):
    """Translate a 1-based page number into the site's result offset.

    Page 1 starts at offset 0, page 2 at RESULTS_PER_PAGE, and so on.
    """
    zero_based = page_num - 1
    return zero_based * RESULTS_PER_PAGE
def save_page_data(page_num, offset, institutions):
    """Write one page's extracted institutions to a JSON file in DATA_DIR.

    The payload records the page number, its offset, the institution count,
    a UTC extraction timestamp, the source URL, and the institutions list.
    Prints a confirmation line naming the file written.
    """
    payload = {
        "page": page_num,
        "offset": offset,
        "count": len(institutions),
        # Timestamp in UTC, ISO-8601 with trailing Z.
        "extraction_date": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "source_url": f"{BASE_URL}{offset}",
        "institutions": institutions,
    }
    # Zero-padded filename (page_003_data.json) keeps listings sorted.
    target = DATA_DIR / f"page_{page_num:03d}_data.json"
    with open(target, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)
    print(f"✅ Saved page {page_num} ({len(institutions)} institutions) to {target.name}")
def scrape_pages(start_page, end_page):
    """Build the sequential scraping workflow for pages in [start_page, end_page].

    Pages that already have a saved data file are skipped. Returns a list of
    instruction dicts (one per outstanding page), each describing the manual
    Playwright MCP workflow: navigate, wait, run extraction JS, save JSON,
    close browser, sleep. Returns an empty list when nothing is outstanding.
    """
    existing_pages = get_existing_pages()
    print(f"Already scraped pages: {existing_pages}")
    print(f"Target range: {start_page}-{end_page}")

    # Only pages without a saved data file remain to be scraped.
    pages_to_scrape = [
        p for p in range(start_page, end_page + 1) if p not in existing_pages
    ]
    if not pages_to_scrape:
        print("✅ All pages in range already scraped!")
        return []

    print(f"\n📋 Pages to scrape: {pages_to_scrape}")
    print(f"Total: {len(pages_to_scrape)} pages")

    def _instruction_for(page_num):
        # One instruction dict = everything needed to process a single page.
        offset = get_offset_for_page(page_num)
        url = f"{BASE_URL}{offset}"
        return {
            "page": page_num,
            "offset": offset,
            "url": url,
            "workflow": [
                f"1. Navigate to: {url}",
                "2. Wait 5 seconds for page load",
                "3. Run extraction JavaScript",
                f"4. Save to page_{page_num:03d}_data.json",
                "5. Close browser",
                "6. Sleep 3 seconds",
            ],
        }

    return [_instruction_for(p) for p in pages_to_scrape]
if __name__ == "__main__":
    # Scrape pages 3-20 (next batch after current progress)
    instructions = scrape_pages(start_page=3, end_page=20)
    if instructions:
        banner = '=' * 60
        print(f"\n{banner}")
        print("SCRAPING WORKFLOW")
        print(f"{banner}\n")
        # Echo the per-page workflow so it can be executed step by step.
        for entry in instructions:
            print(f"\n📄 PAGE {entry['page']} (offset={entry['offset']}):")
            for step in entry['workflow']:
                print(f" {step}")
        print(f"\n{banner}")
        print(f"Next: Use Playwright MCP to execute workflow for {len(instructions)} pages")
        print(f"{banner}")