#!/usr/bin/env python3
"""
Austrian ISIL Database Scraper - Simple approach using requests
Downloads complete ISIL dataset with respectful rate limiting
"""

import csv
import json
import re
import sys
import time
import urllib.parse
from datetime import datetime
from pathlib import Path

# Try to use requests, fallback to urllib if needed
try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    import urllib.request
    HAS_REQUESTS = False
    print("Warning: requests library not available, using urllib")


class AustrianISILScraper:
    """Scrape the Austrian ISIL registry (isil.at) by searching per federal state.

    Results are accumulated in ``self.institutions`` and written to JSON and
    CSV under ``data/isil/``.
    """

    def __init__(self):
        self.base_url = "https://www.isil.at/primo-explore/search"
        self.output_dir = Path("data/isil")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.institutions = []
        self.rate_limit = 3  # seconds between requests (be polite to the server)

    def log(self, msg):
        """Print a timestamped message and flush immediately (useful when piped)."""
        print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")
        sys.stdout.flush()

    def fetch_url(self, url):
        """Fetch *url* and return the response body as text, or None on any failure.

        Uses ``requests`` when available, otherwise falls back to
        ``urllib.request``. HTTP error statuses (4xx/5xx) are treated as
        failures rather than returning an error page that would be parsed
        as search results.
        """
        try:
            if HAS_REQUESTS:
                response = requests.get(url, timeout=30)
                # FIX: without this, 4xx/5xx error pages were returned and
                # fed into the result parser as if they were search results.
                response.raise_for_status()
                return response.text
            with urllib.request.urlopen(url, timeout=30) as response:
                return response.read().decode('utf-8')
        except Exception as e:
            self.log(f"Error fetching {url}: {e}")
            return None

    def search_all_institutions(self):
        """Search for all Austrian institutions, one federal state at a time.

        Searching per state avoids pagination limits on a single huge result
        set. Returns the combined list and stores it on ``self.institutions``.
        """
        self.log("Searching for all AT- institutions...")

        # Strategy: Search by federal states to get complete coverage
        # This avoids pagination limits
        states = [
            "Wien", "Niederösterreich", "Oberösterreich", "Steiermark",
            "Tirol", "Salzburg", "Kärnten", "Vorarlberg", "Burgenland"
        ]

        all_results = []

        for state in states:
            self.log(f"\nSearching institutions in {state}...")

            offset = 0
            page = 1

            while True:
                # FIX: percent-encode the query — state names such as
                # "Niederösterreich" contain non-ASCII characters, and
                # urllib.request.urlopen raises UnicodeEncodeError on
                # non-ASCII URLs (silently dropping those states before).
                query = urllib.parse.quote(f"any,contains,{state}")
                url = (
                    f"{self.base_url}?query={query}&tab=default_tab"
                    f"&search_scope=default_scope&vid=AIS&offset={offset}"
                )

                self.log(f" Page {page}, offset {offset}")
                content = self.fetch_url(url)
                if not content:
                    break

                # Extract institution data from HTML
                institutions = self.parse_search_results(content, state)
                if not institutions:
                    self.log(f" No more results for {state}")
                    break

                self.log(f" Found {len(institutions)} institutions")
                all_results.extend(institutions)

                # Check if there are more pages
                if len(institutions) < 10:  # Less than full page = last page
                    break

                offset += 10
                page += 1
                time.sleep(self.rate_limit)

        self.log(f"\nTotal institutions found: {len(all_results)}")
        self.institutions = all_results
        return all_results

    def parse_search_results(self, html_content, region):
        """Parse institution records out of a search-results HTML page.

        Looks for "Institution Name AT-CODE" patterns, deduplicates by ISIL
        code, and returns a list of dicts with name/isil/region/extraction_date.
        Heuristic regex scraping — assumes the name immediately precedes the
        ISIL code in the markup; verify against the live page layout.
        """
        institutions = []

        # Extract institution names followed by an ISIL code, e.g.
        # "Institution Name AT-CODE" (name may not contain <, > or ").
        name_pattern = r'([^<>"]+)\s+(AT-[A-Za-z0-9\-/:]+)'
        matches = re.findall(name_pattern, html_content)

        seen = set()
        for name, isil in matches:
            name = name.strip()
            isil = isil.strip()

            # Deduplicate by ISIL code (the same entry can appear
            # multiple times on one page)
            if isil in seen:
                continue
            seen.add(isil)

            # Collapse internal whitespace left over from the HTML
            name = re.sub(r'\s+', ' ', name)

            if 3 < len(name) < 200:  # Reasonable name length
                institutions.append({
                    'name': name,
                    'isil': isil,
                    'region': region,
                    'extraction_date': datetime.now().isoformat()
                })

        return institutions

    def save_data(self):
        """Save the collected institutions to JSON and CSV files."""
        # Save JSON
        json_file = self.output_dir / "austria_isil_scraped.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(self.institutions, f, ensure_ascii=False, indent=2)
        self.log(f"Saved {len(self.institutions)} institutions to {json_file}")

        # Save CSV — use the csv module so every field is quoted/escaped
        # correctly (the previous hand-rolled writer only escaped the name).
        csv_file = self.output_dir / "austria_isil_scraped.csv"
        with open(csv_file, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['name', 'isil', 'region', 'extraction_date'])
            for inst in self.institutions:
                writer.writerow([
                    inst['name'], inst['isil'],
                    inst['region'], inst['extraction_date']
                ])
        self.log(f"Saved CSV to {csv_file}")

    def run(self):
        """Main execution: search all states, then persist the results."""
        self.log("=== Austrian ISIL Scraper ===")
        self.log(f"Output directory: {self.output_dir}")
        self.log("\nNote: This scraper uses respectful rate limiting (3s delay)")
        self.log("and searches by region to avoid pagination limits.\n")

        self.search_all_institutions()
        self.save_data()

        self.log("\n=== Scraping Complete ===")
        self.log(f"Total institutions: {len(self.institutions)}")


if __name__ == "__main__":
    scraper = AustrianISILScraper()
    scraper.run()