# glam/scripts/scrape_austrian_isil_requests.py
# Exported 2025-11-19 23:25:22 +01:00 — 171 lines, 5.2 KiB, executable file.
# NOTE(review): the lines above were file-viewer metadata, not Python source;
# kept here as comments so the file parses. For direct execution the shebang
# below should be the first line of the file — confirm before shipping.
#!/usr/bin/env python3
"""
Austrian ISIL Scraper using direct HTTP requests to find API endpoints
"""
import json
import time
import re
from datetime import datetime
from pathlib import Path
# `requests` is required; its availability is re-checked (with a friendly
# install hint) in the __main__ guard, so a failure here is deferred rather
# than fatal at import time.
try:
    import requests
except ImportError:
    pass

# BeautifulSoup4 is optional — without it we fall back to regex parsing.
# BUG FIXED: previously `import requests` lived inside this same try, so a
# missing `requests` set HAS_BS4 = False and printed the misleading
# "BeautifulSoup4 not available" message.
try:
    from bs4 import BeautifulSoup
    HAS_BS4 = True
except ImportError:
    HAS_BS4 = False
    print("BeautifulSoup4 not available, using regex parsing")
def log(msg):
    """Print *msg* prefixed with the current wall-clock time as [HH:MM:SS]."""
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}")
def try_primo_api():
    """Probe candidate Primo REST endpoints for a working search API.

    Tries each known endpoint URL with a fixed Primo search query
    (any,contains,AT-) and a 10-second timeout.

    Returns:
        tuple: ``(url, json_data)`` for the first endpoint that answers
        HTTP 200, or ``(None, None)`` if none respond successfully.
    """
    log("Attempting to find Primo API endpoints...")
    # Try different potential API URLs
    api_attempts = [
        "https://www.isil.at/primo_library/libweb/webservices/rest/primo-explore/v1/pnxs",
        "https://www.isil.at/primo_library/libweb/webservices/rest/v1/pnxs",
        "https://www.isil.at/primaws/rest/pub/pnxs",
    ]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Research/AcademicProject)',
        'Accept': 'application/json'
    }
    # The query parameters are identical for every attempt, so build the
    # dict once instead of re-creating it on each loop iteration (the
    # original rebuilt it inside the loop).
    params = {
        'blendFacetsSeparately': 'false',
        'disableCache': 'false',
        'getMore': '0',
        'inst': 'AIS',
        'lang': 'de',
        'limit': '10',
        'mode': 'advanced',
        'newspapersActive': 'false',
        'newspapersSearch': 'false',
        'offset': '0',
        'pcAvailability': 'false',
        'q': 'any,contains,AT-',
        'qExclude': '',
        'qInclude': '',
        'rapido': 'false',
        'refEntryActive': 'false',
        'rtaLinks': 'true',
        'scope': 'default_scope',
        'searchInFulltextUserSelection': 'false',
        'skipDelivery': 'Y',
        'sort': 'rank',
        'tab': 'default_tab',
        'vid': 'AIS'
    }
    for url in api_attempts:
        try:
            log(f"Trying: {url}")
            response = requests.get(url, params=params, headers=headers, timeout=10)
            if response.status_code == 200:
                log(f"SUCCESS! Found API at: {url}")
                log(f"Response preview: {response.text[:500]}")
                return url, response.json()
            log(f" Status {response.status_code}")
        except Exception as e:
            # Best-effort probing: log and move on to the next candidate URL.
            log(f" Error: {e}")
    return None, None
def scrape_with_session():
    """Scrape the Primo search results page and extract Austrian ISIL codes.

    Establishes a cookie-bearing session, fetches the search results HTML,
    and pulls ``AT-...`` ISIL codes out of the raw markup with a regex.
    Side effects: saves the raw HTML and the extracted codes under
    ``data/isil/``.

    Returns:
        list[str]: sorted, de-duplicated ISIL codes (empty on failure —
        the page is an Angular app, so a 200 response may still contain
        no codes without JavaScript rendering).
    """
    log("\nTrying to scrape search results page...")
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    })
    # First, load the main page to get cookies.
    # FIX: both GETs now carry a timeout — the originals had none and could
    # hang indefinitely (the API probe in try_primo_api already used
    # timeout=10, so this is also a consistency fix).
    log("Loading main page to establish session...")
    main_url = "https://www.isil.at/primo-explore/search?vid=AIS"
    response = session.get(main_url, timeout=30)
    time.sleep(2)  # brief pause so the request pattern looks less bot-like
    # Now try the search
    search_url = "https://www.isil.at/primo-explore/search"
    params = {
        'query': 'any,contains,AT-',
        'tab': 'default_tab',
        'search_scope': 'default_scope',
        'vid': 'AIS',
        'offset': '0',
        'limit': '50'
    }
    log(f"Searching: {search_url}")
    response = session.get(search_url, params=params, timeout=30)
    if response.status_code == 200:
        log(f"Got response, length: {len(response.text)}")
        # Try to extract ISIL codes: "AT-" followed by code chars, terminated
        # by a quote, whitespace, or tag bracket (lookahead, not consumed).
        isil_pattern = r'AT-[A-Za-z0-9\-/:]+(?=["\s<])'
        isils = list(set(re.findall(isil_pattern, response.text)))
        isils.sort()
        log(f"Found {len(isils)} unique ISIL codes")
        if isils:
            log("Sample ISILs:")
            for isil in isils[:20]:
                log(f"  {isil}")
        # Save the HTML for analysis
        output_dir = Path("data/isil")
        output_dir.mkdir(parents=True, exist_ok=True)
        html_file = output_dir / "austria_search_page.html"
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(response.text)
        log(f"Saved HTML to: {html_file}")
        # Save ISILs, one code per line
        isil_file = output_dir / "austria_isils_extracted.txt"
        with open(isil_file, 'w') as f:
            for isil in isils:
                f.write(f"{isil}\n")
        log(f"Saved ISIL codes to: {isil_file}")
        return isils
    else:
        log(f"Error: Status {response.status_code}")
        return []
def main():
    """Entry point: try the Primo API first, then fall back to HTTP scraping."""
    log("=== Austrian ISIL Data Extraction ===\n")
    # Preferred path: a direct JSON API, if one of the candidate URLs works.
    api_url, api_data = try_primo_api()
    if api_data:
        log("\nAPI approach successful!")
        # Process API data
        return
    # Fallback path: scrape the rendered-less HTML and regex out the codes.
    log("\nAPI approach failed, trying HTTP scraping...")
    isils = scrape_with_session()
    if not isils:
        log("\nNo ISILs extracted. The site requires JavaScript rendering.")
        log("Recommendation: Use Playwright-based scraper instead.")
        return
    log(f"\nExtracted {len(isils)} ISIL codes")
if __name__ == "__main__":
    # Re-check the hard dependency here so a missing `requests` produces an
    # actionable message instead of a traceback (the top-of-file import is
    # allowed to fail silently).
    try:
        import requests  # noqa: F811 — deliberate availability re-check
    except ImportError:
        log("Error: 'requests' library required. Install with: pip install requests")
    else:
        # FIX: main() now runs in the `else` branch. The original wrapped
        # main() inside the try, so ANY ImportError raised during main()
        # was misreported as a missing `requests` install.
        main()