#!/usr/bin/env python3
"""Austrian ISIL Scraper using direct HTTP requests to find API endpoints."""

import json
import re
import time
from datetime import datetime
from pathlib import Path

# Import the two third-party dependencies separately: the original wrapped
# both in one try/except, so a missing 'requests' was misreported with the
# BeautifulSoup message and left the name 'requests' undefined.
try:
    import requests
except ImportError:
    requests = None  # detected again in the __main__ guard with a clear message

try:
    from bs4 import BeautifulSoup
    HAS_BS4 = True
except ImportError:
    HAS_BS4 = False
    print("BeautifulSoup4 not available, using regex parsing")


def log(msg):
    """Print *msg* prefixed with the current HH:MM:SS timestamp."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")


def try_primo_api():
    """Try to find Primo API endpoints.

    Probes a list of likely Primo REST URLs with the query parameters the
    Primo UI sends for an advanced "any contains AT-" search (AT- is the
    Austrian ISIL prefix).

    Returns:
        tuple: (url, parsed_json) for the first endpoint that answers
        HTTP 200, or (None, None) when every attempt fails.
    """
    log("Attempting to find Primo API endpoints...")

    # Candidate REST paths used by different Primo deployment generations.
    api_attempts = [
        "https://www.isil.at/primo_library/libweb/webservices/rest/primo-explore/v1/pnxs",
        "https://www.isil.at/primo_library/libweb/webservices/rest/v1/pnxs",
        "https://www.isil.at/primaws/rest/pub/pnxs",
    ]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Research/AcademicProject)',
        'Accept': 'application/json'
    }

    # Loop-invariant: the same query is sent to every candidate URL, so the
    # dict is built once (the original rebuilt it on each iteration).
    params = {
        'blendFacetsSeparately': 'false',
        'disableCache': 'false',
        'getMore': '0',
        'inst': 'AIS',
        'lang': 'de',
        'limit': '10',
        'mode': 'advanced',
        'newspapersActive': 'false',
        'newspapersSearch': 'false',
        'offset': '0',
        'pcAvailability': 'false',
        'q': 'any,contains,AT-',
        'qExclude': '',
        'qInclude': '',
        'rapido': 'false',
        'refEntryActive': 'false',
        'rtaLinks': 'true',
        'scope': 'default_scope',
        'searchInFulltextUserSelection': 'false',
        'skipDelivery': 'Y',
        'sort': 'rank',
        'tab': 'default_tab',
        'vid': 'AIS'
    }

    for url in api_attempts:
        try:
            log(f"Trying: {url}")
            response = requests.get(url, params=params, headers=headers, timeout=10)
            if response.status_code == 200:
                log(f"SUCCESS! Found API at: {url}")
                log(f"Response preview: {response.text[:500]}")
                return url, response.json()
            log(f" Status {response.status_code}")
        except Exception as e:
            # Best-effort probing: any failure just moves on to the next URL.
            log(f" Error: {e}")

    return None, None


def scrape_with_session():
    """Scrape the Primo search results page with a cookie-preserving session.

    Loads the main page first to establish session cookies, then runs the
    search, extracts "AT-..." ISIL codes with a regex, and saves both the raw
    HTML and the extracted codes under data/isil/.

    Returns:
        list[str]: sorted unique ISIL codes found in the page (empty on a
        non-200 response or when nothing matched).
    """
    log("\nTrying to scrape search results page...")

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    })

    # First, load the main page to get cookies. Timeouts added so the script
    # cannot hang indefinitely on an unresponsive server.
    log("Loading main page to establish session...")
    main_url = "https://www.isil.at/primo-explore/search?vid=AIS"
    session.get(main_url, timeout=30)
    time.sleep(2)

    # Now try the search.
    search_url = "https://www.isil.at/primo-explore/search"
    params = {
        'query': 'any,contains,AT-',
        'tab': 'default_tab',
        'search_scope': 'default_scope',
        'vid': 'AIS',
        'offset': '0',
        'limit': '50'
    }

    log(f"Searching: {search_url}")
    response = session.get(search_url, params=params, timeout=30)

    if response.status_code != 200:
        log(f"Error: Status {response.status_code}")
        return []

    log(f"Got response, length: {len(response.text)}")

    # Extract ISIL codes; the lookahead keeps the closing quote, whitespace,
    # or tag delimiter out of the match.
    isil_pattern = r'AT-[A-Za-z0-9\-/:]+(?=["\s<])'
    isils = sorted(set(re.findall(isil_pattern, response.text)))
    log(f"Found {len(isils)} unique ISIL codes")

    if isils:
        log("Sample ISILs:")
        for isil in isils[:20]:
            log(f" {isil}")

    # Save the HTML for analysis.
    output_dir = Path("data/isil")
    output_dir.mkdir(parents=True, exist_ok=True)

    html_file = output_dir / "austria_search_page.html"
    with open(html_file, 'w', encoding='utf-8') as f:
        f.write(response.text)
    log(f"Saved HTML to: {html_file}")

    # Save ISILs, one code per line.
    isil_file = output_dir / "austria_isils_extracted.txt"
    with open(isil_file, 'w') as f:
        for isil in isils:
            f.write(f"{isil}\n")
    log(f"Saved ISIL codes to: {isil_file}")

    return isils


def main():
    """Entry point: try the Primo API first, fall back to HTML scraping."""
    log("=== Austrian ISIL Data Extraction ===\n")

    # Try API first
    api_url, api_data = try_primo_api()

    if api_data:
        log("\nAPI approach successful!")
        # Process API data
    else:
        log("\nAPI approach failed, trying HTTP scraping...")
        isils = scrape_with_session()
        if isils:
            log(f"\nExtracted {len(isils)} ISIL codes")
        else:
            log("\nNo ISILs extracted. The site requires JavaScript rendering.")
            log("Recommendation: Use Playwright-based scraper instead.")


if __name__ == "__main__":
    try:
        import requests
        main()
    except ImportError:
        log("Error: 'requests' library required. Install with: pip install requests")