glam/scripts/scrape_austrian_isil.py
2025-11-19 23:25:22 +01:00

166 lines
5.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Austrian ISIL Database Scraper - Simple approach using requests
Downloads complete ISIL dataset with respectful rate limiting
"""
import csv
import json
import re
import sys
import time
import urllib.parse
from datetime import datetime
from pathlib import Path
# Try to use requests, fallback to urllib if needed
try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    # requests is third-party and may be absent; degrade gracefully to the
    # stdlib urllib so the scraper still runs. HAS_REQUESTS gates which HTTP
    # path fetch_url() takes at runtime.
    import urllib.request
    import urllib.parse
    HAS_REQUESTS = False
    print("Warning: requests library not available, using urllib")
class AustrianISILScraper:
    """Scrape the Austrian ISIL registry (isil.at) via its Primo search UI.

    Institutions are collected by paging through one search per federal
    state (keeps each result set small enough to page through completely),
    accumulated in ``self.institutions``, and written to JSON and CSV
    under ``data/isil``.
    """

    def __init__(self):
        self.base_url = "https://www.isil.at/primo-explore/search"
        self.output_dir = Path("data/isil")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.institutions = []
        self.rate_limit = 3  # seconds between requests (be polite to the host)
        self.page_size = 10  # results per Primo result page

    def log(self, msg):
        """Print a timestamped progress line and flush stdout immediately."""
        print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")
        sys.stdout.flush()

    def fetch_url(self, url):
        """Fetch *url* and return the response body as text, or None on error.

        HTTP 4xx/5xx responses are treated as failures: previously the error
        page's HTML was returned and then scraped as if it were results.
        """
        try:
            if HAS_REQUESTS:
                response = requests.get(url, timeout=30)
                response.raise_for_status()  # don't scrape error pages
                return response.text
            else:
                # urlopen raises HTTPError on 4xx/5xx by itself.
                with urllib.request.urlopen(url, timeout=30) as response:
                    return response.read().decode('utf-8')
        except Exception as e:
            self.log(f"Error fetching {url}: {e}")
            return None

    def search_all_institutions(self):
        """Collect all AT- institutions, searching state by state.

        Returns the combined list and also stores it in
        ``self.institutions``.
        """
        self.log("Searching for all AT- institutions...")
        # Strategy: Search by federal states to get complete coverage
        # This avoids pagination limits
        states = [
            "Wien", "Niederösterreich", "Oberösterreich", "Steiermark",
            "Tirol", "Salzburg", "Kärnten", "Vorarlberg", "Burgenland"
        ]
        all_results = []
        for state in states:
            self.log(f"\nSearching institutions in {state}...")
            offset = 0
            page = 1
            while True:
                # Percent-encode the query: state names contain non-ASCII
                # characters (ö, ä) that are invalid in a raw URL and make
                # urllib.request.urlopen raise UnicodeEncodeError. Commas
                # are kept literal — Primo parses "field,op,value".
                query = urllib.parse.quote(f"any,contains,{state}", safe=",")
                url = (f"{self.base_url}?query={query}&tab=default_tab"
                       f"&search_scope=default_scope&vid=AIS&offset={offset}")
                self.log(f" Page {page}, offset {offset}")
                content = self.fetch_url(url)
                if not content:
                    break
                institutions = self.parse_search_results(content, state)
                if not institutions:
                    self.log(f" No more results for {state}")
                    break
                self.log(f" Found {len(institutions)} institutions")
                all_results.extend(institutions)
                # A short page means this was the last page for the state.
                if len(institutions) < self.page_size:
                    break
                offset += self.page_size
                page += 1
                time.sleep(self.rate_limit)
        self.log(f"\nTotal institutions found: {len(all_results)}")
        self.institutions = all_results
        return all_results

    def parse_search_results(self, html_content, region):
        """Extract (name, ISIL) records from one page of result HTML.

        Best-effort regex scrape: looks for text shaped like
        "Institution Name AT-CODE", deduplicates by ISIL, and filters
        implausibly short/long names. Returns a list of dicts with
        name/isil/region/extraction_date keys.
        """
        institutions = []
        # Group 1: institution name (no tags/quotes); group 2: ISIL code.
        # (The previous standalone findall of bare ISILs was dead code.)
        name_pattern = r'([^<>"]+)\s+(AT-[A-Za-z0-9\-/:]+)'
        matches = re.findall(name_pattern, html_content)
        seen = set()
        for raw_name, isil in matches:
            if isil in seen:  # deduplicate by ISIL
                continue
            seen.add(isil)
            # Collapse internal whitespace runs left over from the markup.
            name = re.sub(r'\s+', ' ', raw_name.strip())
            if 3 < len(name) < 200:  # filter obvious regex noise
                institutions.append({
                    'name': name,
                    'isil': isil,
                    'region': region,
                    'extraction_date': datetime.now().isoformat()
                })
        return institutions

    def save_data(self):
        """Write collected institutions to JSON and CSV in the output dir."""
        json_file = self.output_dir / "austria_isil_scraped.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(self.institutions, f, ensure_ascii=False, indent=2)
        self.log(f"Saved {len(self.institutions)} institutions to {json_file}")
        csv_file = self.output_dir / "austria_isil_scraped.csv"
        # csv.DictWriter quotes/escapes every field; the hand-rolled writer
        # only escaped quotes in the name column. newline='' per csv docs.
        with open(csv_file, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(
                f, fieldnames=['name', 'isil', 'region', 'extraction_date'])
            writer.writeheader()
            writer.writerows(self.institutions)
        self.log(f"Saved CSV to {csv_file}")

    def run(self):
        """Main execution: search everything, then persist the results."""
        self.log("=== Austrian ISIL Scraper ===")
        self.log(f"Output directory: {self.output_dir}")
        self.log("\nNote: This scraper uses respectful rate limiting (3s delay)")
        self.log("and searches by region to avoid pagination limits.\n")
        self.search_all_institutions()
        self.save_data()
        self.log("\n=== Scraping Complete ===")
        self.log(f"Total institutions: {len(self.institutions)}")
if __name__ == "__main__":
    # Script entry point: build a scraper and execute the full pipeline.
    AustrianISILScraper().run()