glam/scripts/scrape_austrian_isil_mcp.py

#!/usr/bin/env python3
"""
Austrian ISIL Scraper using Playwright MCP integration
Scrapes all Austrian ISIL codes from https://www.isil.at
"""
import json
import csv
from pathlib import Path
from datetime import datetime
import time


# NOTE: This script is designed to work with the Playwright MCP server.
# Run via OpenCODE, which provides the MCP integration.
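
# Expected output shape (the values below are illustrative placeholders only,
# derived from the keys used in main(); no scraped data is shown here):
#
#   austrian_isil_<timestamp>.json
#     {
#       "extraction_date": "<ISO timestamp>",
#       "source_url": "https://www.isil.at",
#       "total_count": 1934,
#       "institutions": [{"name": "<institution name>", "isil": "AT-<code>"}]
#     }
#
#   austrian_isil_<timestamp>.csv with columns: name, isil
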
def main():
"""Main scraping workflow"""
output_dir = Path("data/isil/austria")
output_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
json_output = output_dir / f"austrian_isil_{timestamp}.json"
csv_output = output_dir / f"austrian_isil_{timestamp}.csv"
all_institutions = []
# Base URL with pagination
base_url = "https://www.isil.at/primo-explore/search?query=any,contains,AT-&tab=default_tab&search_scope=default_scope&vid=AIS&offset="
# Total results: 1,934 (as of Nov 2024)
# 50 results per page = 39 pages
total_pages = 39
results_per_page = 50
print(f"Starting Austrian ISIL scraper...")
print(f"Total estimated pages: {total_pages}")
print(f"Results per page: {results_per_page}")
print(f"Expected total: ~1,934 institutions\n")
for page_num in range(total_pages):
offset = page_num * results_per_page
url = f"{base_url}{offset}"
print(f"\n{'='*60}")
print(f"Page {page_num + 1}/{total_pages} (offset {offset})")
print(f"URL: {url}")
print(f"{'='*60}")
# This would be called via MCP - placeholder for structure
# In actual use, OpenCODE MCP integration handles this
# Extract data from current page
# (In real implementation, this uses playwright_browser_evaluate via MCP)
page_data = extract_page_data_mcp(url)
if page_data:
all_institutions.extend(page_data)
print(f"✓ Extracted {len(page_data)} institutions (Total: {len(all_institutions)})")
else:
print(f"✗ No data extracted from page {page_num + 1}")
# Rate limiting - be respectful
if page_num < total_pages - 1:
print(f"Waiting 3 seconds before next page...")
time.sleep(3)
print(f"\n{'='*60}")
print(f"Scraping complete!")
print(f"Total institutions extracted: {len(all_institutions)}")
print(f"{'='*60}\n")
# Save to JSON
with open(json_output, 'w', encoding='utf-8') as f:
json.dump({
'extraction_date': datetime.now().isoformat(),
'source_url': 'https://www.isil.at',
'total_count': len(all_institutions),
'institutions': all_institutions
}, f, ensure_ascii=False, indent=2)
print(f"✓ Saved JSON: {json_output}")
# Save to CSV
with open(csv_output, 'w', encoding='utf-8', newline='') as f:
writer = csv.DictWriter(f, fieldnames=['name', 'isil'])
writer.writeheader()
writer.writerows(all_institutions)
print(f"✓ Saved CSV: {csv_output}")
print(f"\nNext steps:")
print(f"1. Review extracted data in: {output_dir}")
print(f"2. Parse into LinkML format using: scripts/parse_austrian_isil.py")
print(f"3. Enrich with Wikidata using: scripts/enrich_austrian_institutions.py")
def extract_page_data_mcp(url):
"""
Extract institution data from a single page
NOTE: In actual use, this function would be replaced by direct
MCP tool calls (playwright_browser_navigate, playwright_browser_evaluate)
via OpenCODE's MCP integration.
Args:
url: Page URL to scrape
Returns:
List of dicts with 'name' and 'isil' keys
"""
# Placeholder - in real use, this is handled by OpenCODE MCP tools
# See: playwright_browser_navigate(url)
# playwright_browser_wait_for(time=5)
# playwright_browser_evaluate(function="...")
return []
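

# ---------------------------------------------------------------------------
# Optional reference sketch (NOT part of the MCP workflow): a minimal,
# hypothetical example of how extract_page_data_mcp() could be implemented
# with the plain Playwright Python API when running outside OpenCODE. The
# "prm-brief-result-container" selector and the "AT-..." regex are assumptions
# about the Primo markup on isil.at; verify them against the live page before
# trusting the output.
# ---------------------------------------------------------------------------
def extract_page_data_playwright(url):
    """Hypothetical standalone variant of extract_page_data_mcp() (untested)."""
    import re
    from playwright.sync_api import sync_playwright  # optional dependency

    institutions = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url)
        page.wait_for_timeout(5000)  # give Primo time to render its results

        # ASSUMPTION: each result row is a <prm-brief-result-container> element
        for row in page.query_selector_all("prm-brief-result-container"):
            text = row.inner_text()
            if not text.strip():
                continue
            name = text.strip().splitlines()[0].strip()
            # ASSUMPTION: the ISIL appears somewhere in the row text as "AT-..."
            match = re.search(r"AT-[A-Za-z0-9/-]+", text)
            institutions.append({
                'name': name,
                'isil': match.group(0) if match else ''
            })
        browser.close()
    return institutions
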
if __name__ == "__main__":
    main()