glam/scripts/scrape_austrian_isil.py
2025-11-19 23:25:22 +01:00

166 lines
5.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Austrian ISIL Database Scraper - Simple approach using requests
Downloads complete ISIL dataset with respectful rate limiting
"""
import csv
import json
import re
import sys
import time
import urllib.parse
from datetime import datetime
from pathlib import Path
# Try to use requests, fallback to urllib if needed
try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    # requests is third-party and may be absent; degrade gracefully to the
    # stdlib urllib so the scraper still runs. HAS_REQUESTS gates which HTTP
    # path fetch_url() takes at runtime.
    import urllib.request
    import urllib.parse
    HAS_REQUESTS = False
    print("Warning: requests library not available, using urllib")
class AustrianISILScraper:
    """Scrape the Austrian ISIL registry (isil.at) via its Primo search UI.

    Institutions are collected by paging through one search per federal
    state (keeps each result set small enough to page through completely),
    accumulated in ``self.institutions``, and written to JSON and CSV
    under ``data/isil``.
    """

    def __init__(self):
        self.base_url = "https://www.isil.at/primo-explore/search"
        self.output_dir = Path("data/isil")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.institutions = []
        self.rate_limit = 3  # seconds between requests (be polite to the host)
        self.page_size = 10  # results per Primo result page

    def log(self, msg):
        """Print a timestamped progress line and flush stdout immediately."""
        print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")
        sys.stdout.flush()

    def fetch_url(self, url):
        """Fetch *url* and return the response body as text, or None on error.

        HTTP 4xx/5xx responses are treated as failures: previously the error
        page's HTML was returned and then scraped as if it were results.
        """
        try:
            if HAS_REQUESTS:
                response = requests.get(url, timeout=30)
                response.raise_for_status()  # don't scrape error pages
                return response.text
            else:
                # urlopen raises HTTPError on 4xx/5xx by itself.
                with urllib.request.urlopen(url, timeout=30) as response:
                    return response.read().decode('utf-8')
        except Exception as e:
            self.log(f"Error fetching {url}: {e}")
            return None

    def search_all_institutions(self):
        """Collect all AT- institutions, searching state by state.

        Returns the combined list and also stores it in
        ``self.institutions``.
        """
        self.log("Searching for all AT- institutions...")
        # Strategy: Search by federal states to get complete coverage
        # This avoids pagination limits
        states = [
            "Wien", "Niederösterreich", "Oberösterreich", "Steiermark",
            "Tirol", "Salzburg", "Kärnten", "Vorarlberg", "Burgenland"
        ]
        all_results = []
        for state in states:
            self.log(f"\nSearching institutions in {state}...")
            offset = 0
            page = 1
            while True:
                # Percent-encode the query: state names contain non-ASCII
                # characters (ö, ä) that are invalid in a raw URL and make
                # urllib.request.urlopen raise UnicodeEncodeError. Commas
                # are kept literal — Primo parses "field,op,value".
                query = urllib.parse.quote(f"any,contains,{state}", safe=",")
                url = (f"{self.base_url}?query={query}&tab=default_tab"
                       f"&search_scope=default_scope&vid=AIS&offset={offset}")
                self.log(f" Page {page}, offset {offset}")
                content = self.fetch_url(url)
                if not content:
                    break
                institutions = self.parse_search_results(content, state)
                if not institutions:
                    self.log(f" No more results for {state}")
                    break
                self.log(f" Found {len(institutions)} institutions")
                all_results.extend(institutions)
                # A short page means this was the last page for the state.
                if len(institutions) < self.page_size:
                    break
                offset += self.page_size
                page += 1
                time.sleep(self.rate_limit)
        self.log(f"\nTotal institutions found: {len(all_results)}")
        self.institutions = all_results
        return all_results

    def parse_search_results(self, html_content, region):
        """Extract (name, ISIL) records from one page of result HTML.

        Best-effort regex scrape: looks for text shaped like
        "Institution Name AT-CODE", deduplicates by ISIL, and filters
        implausibly short/long names. Returns a list of dicts with
        name/isil/region/extraction_date keys.
        """
        institutions = []
        # Group 1: institution name (no tags/quotes); group 2: ISIL code.
        # (The previous standalone findall of bare ISILs was dead code.)
        name_pattern = r'([^<>"]+)\s+(AT-[A-Za-z0-9\-/:]+)'
        matches = re.findall(name_pattern, html_content)
        seen = set()
        for raw_name, isil in matches:
            if isil in seen:  # deduplicate by ISIL
                continue
            seen.add(isil)
            # Collapse internal whitespace runs left over from the markup.
            name = re.sub(r'\s+', ' ', raw_name.strip())
            if 3 < len(name) < 200:  # filter obvious regex noise
                institutions.append({
                    'name': name,
                    'isil': isil,
                    'region': region,
                    'extraction_date': datetime.now().isoformat()
                })
        return institutions

    def save_data(self):
        """Write collected institutions to JSON and CSV in the output dir."""
        json_file = self.output_dir / "austria_isil_scraped.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(self.institutions, f, ensure_ascii=False, indent=2)
        self.log(f"Saved {len(self.institutions)} institutions to {json_file}")
        csv_file = self.output_dir / "austria_isil_scraped.csv"
        # csv.DictWriter quotes/escapes every field; the hand-rolled writer
        # only escaped quotes in the name column. newline='' per csv docs.
        with open(csv_file, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(
                f, fieldnames=['name', 'isil', 'region', 'extraction_date'])
            writer.writeheader()
            writer.writerows(self.institutions)
        self.log(f"Saved CSV to {csv_file}")

    def run(self):
        """Main execution: search everything, then persist the results."""
        self.log("=== Austrian ISIL Scraper ===")
        self.log(f"Output directory: {self.output_dir}")
        self.log("\nNote: This scraper uses respectful rate limiting (3s delay)")
        self.log("and searches by region to avoid pagination limits.\n")
        self.search_all_institutions()
        self.save_data()
        self.log("\n=== Scraping Complete ===")
        self.log(f"Total institutions: {len(self.institutions)}")
if __name__ == "__main__":
    # Script entry point: build a scraper and execute the full pipeline.
    AustrianISILScraper().run()