#!/usr/bin/env python3
"""
Austrian ISIL Database Scraper - Simple approach using requests

Downloads complete ISIL dataset with respectful rate limiting
"""
import csv
import json
import re
import sys
import time
from datetime import datetime
from pathlib import Path

# Try to use requests, fallback to urllib if needed
try:
    import requests

    HAS_REQUESTS = True
except ImportError:
    import urllib.request
    import urllib.parse

    HAS_REQUESTS = False
    print("Warning: requests library not available, using urllib")
class AustrianISILScraper:
    """Scraper for the Austrian ISIL registry (isil.at Primo interface).

    Searches the registry one federal state at a time (to stay under
    pagination limits), extracts ``AT-*`` ISIL codes plus institution
    names from the result HTML, and writes the collected records to
    JSON and CSV files under ``data/isil``.
    """

    # "<name> <AT-code>" pairs inside HTML text nodes; group 1 is the
    # institution name, group 2 the ISIL code. Compiled once at class
    # level instead of on every parse call.
    _NAME_ISIL_RE = re.compile(r'([^<>"]+)\s+(AT-[A-Za-z0-9\-/:]+)')
    # Runs of whitespace, collapsed to a single space in names.
    _WS_RE = re.compile(r'\s+')

    def __init__(self):
        self.base_url = "https://www.isil.at/primo-explore/search"
        self.output_dir = Path("data/isil")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.institutions = []  # accumulated institution dicts
        self.rate_limit = 3     # seconds between requests (be respectful)

    def log(self, msg):
        """Print *msg* with an HH:MM:SS timestamp and flush immediately."""
        print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")
        sys.stdout.flush()

    def fetch_url(self, url):
        """Fetch *url* and return the response text, or None on any error.

        FIX: the requests branch now calls ``raise_for_status()`` so HTTP
        error pages (404/500/...) are logged and skipped instead of being
        parsed as results — matching the urllib branch, where ``urlopen``
        raises ``HTTPError`` on non-2xx responses.
        """
        try:
            if HAS_REQUESTS:
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                return response.text
            else:
                with urllib.request.urlopen(url, timeout=30) as response:
                    return response.read().decode('utf-8')
        except Exception as e:  # network, HTTP, or decoding failure
            self.log(f"Error fetching {url}: {e}")
            return None

    def search_all_institutions(self):
        """Search every federal state and collect all institutions.

        Stores the combined list on ``self.institutions`` and returns it.
        Pagination stops when a page is empty or short (fewer than 10
        results = last page).
        """
        self.log("Searching for all AT- institutions...")

        # Strategy: Search by federal states to get complete coverage.
        # This avoids pagination limits.
        states = [
            "Wien", "Niederösterreich", "Oberösterreich", "Steiermark",
            "Tirol", "Salzburg", "Kärnten", "Vorarlberg", "Burgenland",
        ]

        all_results = []

        for state in states:
            self.log(f"\nSearching institutions in {state}...")
            offset = 0
            page = 1

            while True:
                query = f"any,contains,{state}"
                url = (f"{self.base_url}?query={query}&tab=default_tab"
                       f"&search_scope=default_scope&vid=AIS&offset={offset}")

                self.log(f"  Page {page}, offset {offset}")
                content = self.fetch_url(url)

                if not content:
                    break

                # Extract institution data from HTML
                institutions = self.parse_search_results(content, state)

                if not institutions:
                    self.log(f"  No more results for {state}")
                    break

                self.log(f"  Found {len(institutions)} institutions")
                all_results.extend(institutions)

                # Less than a full page of 10 means this was the last page.
                if len(institutions) < 10:
                    break

                offset += 10
                page += 1
                time.sleep(self.rate_limit)  # polite delay between requests

        self.log(f"\nTotal institutions found: {len(all_results)}")
        self.institutions = all_results
        return all_results

    def parse_search_results(self, html_content, region):
        """Parse institution records out of raw search-result HTML.

        Returns a list of dicts with keys ``name``, ``isil``, ``region``
        and ``extraction_date``. Duplicate ISIL codes within one page are
        dropped; implausibly short or long names are rejected.

        FIX: removed a dead ``re.findall`` over a standalone ISIL pattern
        whose result was never used; patterns are precompiled class-level.
        """
        institutions = []
        seen = set()

        for raw_name, isil in self._NAME_ISIL_RE.findall(html_content):
            name = raw_name.strip()

            # Deduplicate by ISIL code.
            if isil in seen:
                continue
            seen.add(isil)

            # Collapse internal whitespace runs in the name.
            name = self._WS_RE.sub(' ', name)
            if 3 < len(name) < 200:  # reasonable name length
                institutions.append({
                    'name': name,
                    'isil': isil,
                    'region': region,
                    'extraction_date': datetime.now().isoformat(),
                })

        return institutions

    def save_data(self):
        """Save the collected institutions to JSON and CSV files."""
        # Save JSON
        json_file = self.output_dir / "austria_isil_scraped.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(self.institutions, f, ensure_ascii=False, indent=2)
        self.log(f"Saved {len(self.institutions)} institutions to {json_file}")

        # Save CSV. FIX: use the stdlib csv module instead of hand-rolled
        # escaping (the original only quoted the name column).
        csv_file = self.output_dir / "austria_isil_scraped.csv"
        fieldnames = ['name', 'isil', 'region', 'extraction_date']
        with open(csv_file, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            writer.writerows(self.institutions)
        self.log(f"Saved CSV to {csv_file}")

    def run(self):
        """Main execution: search every state, then persist the results."""
        self.log("=== Austrian ISIL Scraper ===")
        self.log(f"Output directory: {self.output_dir}")
        self.log("\nNote: This scraper uses respectful rate limiting (3s delay)")
        self.log("and searches by region to avoid pagination limits.\n")

        self.search_all_institutions()
        self.save_data()

        self.log("\n=== Scraping Complete ===")
        self.log(f"Total institutions: {len(self.institutions)}")
if __name__ == "__main__":
    # Build the scraper and run the full search-and-save pipeline.
    AustrianISILScraper().run()