#!/usr/bin/env python3
"""
Austrian ISIL Scraper using direct HTTP requests to find API endpoints
"""

import json
import time
import re
from datetime import datetime
from pathlib import Path
# Optional-dependency probing.
# NOTE(review): the original wrapped ``requests`` and ``bs4`` in a single
# try-block, so a missing ``requests`` was silently misreported as a missing
# BeautifulSoup4.  Probe the two libraries separately instead.
try:
    import requests
except ImportError:
    # Surfaced with an install hint by the __main__ guard at the bottom.
    requests = None

try:
    from bs4 import BeautifulSoup
    HAS_BS4 = True  # True only when BeautifulSoup4 itself imported cleanly
except ImportError:
    HAS_BS4 = False
    print("BeautifulSoup4 not available, using regex parsing")
def log(msg):
    """Print *msg* prefixed with the current wall-clock time (HH:MM:SS)."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] {msg}")
def try_primo_api():
    """Probe a few likely Ex Libris Primo REST endpoints on isil.at.

    Returns:
        A ``(url, parsed_json)`` tuple for the first candidate endpoint that
        answers HTTP 200, or ``(None, None)`` when none of them respond.
    """
    log("Attempting to find Primo API endpoints...")

    # Try different potential API URLs
    api_attempts = [
        "https://www.isil.at/primo_library/libweb/webservices/rest/primo-explore/v1/pnxs",
        "https://www.isil.at/primo_library/libweb/webservices/rest/v1/pnxs",
        "https://www.isil.at/primaws/rest/pub/pnxs",
    ]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Research/AcademicProject)',
        'Accept': 'application/json'
    }

    # The query parameters are identical for every candidate URL, so build
    # the dict once instead of rebuilding it on each loop iteration.
    params = {
        'blendFacetsSeparately': 'false',
        'disableCache': 'false',
        'getMore': '0',
        'inst': 'AIS',
        'lang': 'de',
        'limit': '10',
        'mode': 'advanced',
        'newspapersActive': 'false',
        'newspapersSearch': 'false',
        'offset': '0',
        'pcAvailability': 'false',
        'q': 'any,contains,AT-',
        'qExclude': '',
        'qInclude': '',
        'rapido': 'false',
        'refEntryActive': 'false',
        'rtaLinks': 'true',
        'scope': 'default_scope',
        'searchInFulltextUserSelection': 'false',
        'skipDelivery': 'Y',
        'sort': 'rank',
        'tab': 'default_tab',
        'vid': 'AIS'
    }

    for url in api_attempts:
        log(f"Trying: {url}")
        try:
            response = requests.get(url, params=params, headers=headers, timeout=10)
            if response.status_code == 200:
                log(f"SUCCESS! Found API at: {url}")
                log(f"Response preview: {response.text[:500]}")
                # .json() may raise on a non-JSON 200 body; that is caught
                # below and treated like any other failed attempt.
                return url, response.json()
            log(f" Status {response.status_code}")
        except Exception as e:
            # Broad by design: any connection / parse problem just moves
            # us on to the next candidate endpoint.
            log(f" Error: {e}")

    return None, None
def scrape_with_session():
    """Try scraping with a session that preserves cookies.

    Loads the Primo landing page to establish a session, runs one search for
    ``AT-`` and regex-extracts ISIL codes from the raw HTML.  Saves both the
    HTML page and the extracted codes under ``data/isil/``.

    Returns:
        Sorted list of unique ISIL strings (empty on HTTP failure).
    """
    log("\nTrying to scrape search results page...")

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    })

    # First, load the main page to get cookies; the response body itself is
    # not needed, only the Set-Cookie side effect on the session.
    log("Loading main page to establish session...")
    main_url = "https://www.isil.at/primo-explore/search?vid=AIS"
    session.get(main_url, timeout=30)
    time.sleep(2)  # be polite / let any server-side session state settle

    # Now try the search
    search_url = "https://www.isil.at/primo-explore/search"
    params = {
        'query': 'any,contains,AT-',
        'tab': 'default_tab',
        'search_scope': 'default_scope',
        'vid': 'AIS',
        'offset': '0',
        'limit': '50'
    }

    log(f"Searching: {search_url}")
    # Explicit timeout so a stalled connection cannot hang the scraper forever.
    response = session.get(search_url, params=params, timeout=30)

    if response.status_code != 200:
        log(f"Error: Status {response.status_code}")
        return []

    log(f"Got response, length: {len(response.text)}")

    # Try to extract ISIL codes (AT- prefix followed by code characters,
    # terminated by a quote, whitespace or a tag bracket).
    isil_pattern = r'AT-[A-Za-z0-9\-/:]+(?=["\s<])'
    isils = sorted(set(re.findall(isil_pattern, response.text)))

    log(f"Found {len(isils)} unique ISIL codes")

    if isils:
        log("Sample ISILs:")
        for isil in isils[:20]:
            log(f" {isil}")

    # Save the HTML for analysis
    output_dir = Path("data/isil")
    output_dir.mkdir(parents=True, exist_ok=True)

    html_file = output_dir / "austria_search_page.html"
    html_file.write_text(response.text, encoding='utf-8')
    log(f"Saved HTML to: {html_file}")

    # Save ISILs, one code per line
    isil_file = output_dir / "austria_isils_extracted.txt"
    isil_file.write_text("".join(f"{isil}\n" for isil in isils))
    log(f"Saved ISIL codes to: {isil_file}")

    return isils
def main():
    """Entry point: try the Primo REST API first, then fall back to scraping."""
    log("=== Austrian ISIL Data Extraction ===\n")

    # Try API first
    _api_url, api_data = try_primo_api()

    if api_data:
        log("\nAPI approach successful!")
        # Process API data
        return

    log("\nAPI approach failed, trying HTTP scraping...")
    isils = scrape_with_session()

    if isils:
        log(f"\nExtracted {len(isils)} ISIL codes")
    else:
        log("\nNo ISILs extracted. The site requires JavaScript rendering.")
        log("Recommendation: Use Playwright-based scraper instead.")
if __name__ == "__main__":
    # Keep the try body minimal: only the dependency check lives inside it,
    # so an ImportError raised from within main() itself is not mistaken
    # for a missing 'requests' installation.
    try:
        import requests
    except ImportError:
        log("Error: 'requests' library required. Install with: pip install requests")
    else:
        main()