#!/usr/bin/env python3
"""
Austrian ISIL Scraper using Playwright MCP integration
Scrapes all Austrian ISIL codes from https://www.isil.at
"""

import json
import csv
from pathlib import Path
from datetime import datetime
import time

# NOTE: This script is designed to work with Playwright MCP server
# Run via OpenCODE which provides the MCP integration

def main(
    *,
    output_dir="data/isil/austria",
    total_pages=39,
    results_per_page=50,
    delay=3.0,
):
    """Scrape Austrian ISIL codes page by page and save JSON + CSV snapshots.

    Args:
        output_dir: Directory (str or Path) where the result files are written.
        total_pages: Number of paginated result pages to visit
            (site showed ~1,934 results / 50 per page = 39 pages as of Nov 2024).
        results_per_page: Page size used to compute the pagination offset.
        delay: Seconds to sleep between pages (rate limiting).
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Capture ONE timestamp so the output filenames and the JSON
    # 'extraction_date' metadata agree (previously datetime.now() was
    # called twice and could straddle a second/day boundary).
    started_at = datetime.now()
    stamp = started_at.strftime("%Y%m%d_%H%M%S")
    json_output = output_dir / f"austrian_isil_{stamp}.json"
    csv_output = output_dir / f"austrian_isil_{stamp}.csv"

    all_institutions = []

    # Base URL with pagination; the numeric offset is appended per page.
    base_url = (
        "https://www.isil.at/primo-explore/search?query=any,contains,AT-"
        "&tab=default_tab&search_scope=default_scope&vid=AIS&offset="
    )

    print("Starting Austrian ISIL scraper...")
    print(f"Total estimated pages: {total_pages}")
    print(f"Results per page: {results_per_page}")
    print("Expected total: ~1,934 institutions\n")

    for page_num in range(total_pages):
        offset = page_num * results_per_page
        url = f"{base_url}{offset}"

        print(f"\n{'='*60}")
        print(f"Page {page_num + 1}/{total_pages} (offset {offset})")
        print(f"URL: {url}")
        print(f"{'='*60}")

        # Extraction is delegated to the MCP-backed helper; in real runs
        # OpenCODE's Playwright MCP integration performs the browser work.
        page_data = extract_page_data_mcp(url)

        if page_data:
            all_institutions.extend(page_data)
            print(f"✓ Extracted {len(page_data)} institutions (Total: {len(all_institutions)})")
        else:
            print(f"✗ No data extracted from page {page_num + 1}")

        # Rate limiting - be respectful; skip the wait after the last page.
        if page_num < total_pages - 1:
            print(f"Waiting {delay} seconds before next page...")
            time.sleep(delay)

    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f"Total institutions extracted: {len(all_institutions)}")
    print(f"{'='*60}\n")

    _save_json(json_output, all_institutions, started_at)
    print(f"✓ Saved JSON: {json_output}")

    _save_csv(csv_output, all_institutions)
    print(f"✓ Saved CSV: {csv_output}")

    print("\nNext steps:")
    print(f"1. Review extracted data in: {output_dir}")
    print("2. Parse into LinkML format using: scripts/parse_austrian_isil.py")
    print("3. Enrich with Wikidata using: scripts/enrich_austrian_institutions.py")


def _save_json(path, institutions, extracted_at):
    """Write institutions plus extraction metadata as UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({
            'extraction_date': extracted_at.isoformat(),
            'source_url': 'https://www.isil.at',
            'total_count': len(institutions),
            'institutions': institutions,
        }, f, ensure_ascii=False, indent=2)


def _save_csv(path, institutions):
    """Write the name/isil columns as CSV, ignoring any extra dict keys."""
    with open(path, 'w', encoding='utf-8', newline='') as f:
        # extrasaction='ignore' keeps the write robust if the scraper ever
        # returns additional fields (DictWriter raises ValueError otherwise).
        writer = csv.DictWriter(f, fieldnames=['name', 'isil'], extrasaction='ignore')
        writer.writeheader()
        writer.writerows(institutions)
def extract_page_data_mcp(url):
    """Extract institution records from a single search-results page.

    NOTE: In actual use, this function would be replaced by direct
    MCP tool calls (playwright_browser_navigate, playwright_browser_evaluate)
    via OpenCODE's MCP integration.

    Args:
        url: Page URL to scrape.

    Returns:
        List of dicts with 'name' and 'isil' keys (empty in this placeholder).
    """
    # Placeholder only — the real work happens via OpenCODE MCP tools:
    #   playwright_browser_navigate(url)
    #   playwright_browser_wait_for(time=5)
    #   playwright_browser_evaluate(function="...")
    institutions = []
    return institutions
# Script entry point: run the full scraping workflow when executed directly.
if __name__ == "__main__":
    main()