#!/usr/bin/env python3
"""
Austrian ISIL Scraper using direct HTTP requests to find API endpoints
"""

import json
import time
import re
from datetime import datetime
from pathlib import Path
# Optional-dependency probing.
# NOTE(review): the original wrapped ``requests`` and ``bs4`` in a single
# try-block, so a missing ``requests`` was silently misreported as a missing
# BeautifulSoup4.  Probe the two libraries separately instead.
try:
    import requests
except ImportError:
    # Surfaced with an install hint by the __main__ guard at the bottom.
    requests = None

try:
    from bs4 import BeautifulSoup
    HAS_BS4 = True  # True only when BeautifulSoup4 itself imported cleanly
except ImportError:
    HAS_BS4 = False
    print("BeautifulSoup4 not available, using regex parsing")
def log(msg):
    """Print *msg* prefixed with the current wall-clock time (HH:MM:SS)."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] {msg}")
def try_primo_api():
    """Probe a few likely Ex Libris Primo REST endpoints on isil.at.

    Returns:
        A ``(url, parsed_json)`` tuple for the first candidate endpoint that
        answers HTTP 200, or ``(None, None)`` when none of them respond.
    """
    log("Attempting to find Primo API endpoints...")

    # Try different potential API URLs
    api_attempts = [
        "https://www.isil.at/primo_library/libweb/webservices/rest/primo-explore/v1/pnxs",
        "https://www.isil.at/primo_library/libweb/webservices/rest/v1/pnxs",
        "https://www.isil.at/primaws/rest/pub/pnxs",
    ]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Research/AcademicProject)',
        'Accept': 'application/json'
    }

    # The query parameters are identical for every candidate URL, so build
    # the dict once instead of rebuilding it on each loop iteration.
    params = {
        'blendFacetsSeparately': 'false',
        'disableCache': 'false',
        'getMore': '0',
        'inst': 'AIS',
        'lang': 'de',
        'limit': '10',
        'mode': 'advanced',
        'newspapersActive': 'false',
        'newspapersSearch': 'false',
        'offset': '0',
        'pcAvailability': 'false',
        'q': 'any,contains,AT-',
        'qExclude': '',
        'qInclude': '',
        'rapido': 'false',
        'refEntryActive': 'false',
        'rtaLinks': 'true',
        'scope': 'default_scope',
        'searchInFulltextUserSelection': 'false',
        'skipDelivery': 'Y',
        'sort': 'rank',
        'tab': 'default_tab',
        'vid': 'AIS'
    }

    for url in api_attempts:
        log(f"Trying: {url}")
        try:
            response = requests.get(url, params=params, headers=headers, timeout=10)
            if response.status_code == 200:
                log(f"SUCCESS! Found API at: {url}")
                log(f"Response preview: {response.text[:500]}")
                # .json() may raise on a non-JSON 200 body; that is caught
                # below and treated like any other failed attempt.
                return url, response.json()
            log(f" Status {response.status_code}")
        except Exception as e:
            # Broad by design: any connection / parse problem just moves
            # us on to the next candidate endpoint.
            log(f" Error: {e}")

    return None, None
def scrape_with_session():
    """Try scraping with a session that preserves cookies.

    Loads the Primo landing page to establish a session, runs one search for
    ``AT-`` and regex-extracts ISIL codes from the raw HTML.  Saves both the
    HTML page and the extracted codes under ``data/isil/``.

    Returns:
        Sorted list of unique ISIL strings (empty on HTTP failure).
    """
    log("\nTrying to scrape search results page...")

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    })

    # First, load the main page to get cookies; the response body itself is
    # not needed, only the Set-Cookie side effect on the session.
    log("Loading main page to establish session...")
    main_url = "https://www.isil.at/primo-explore/search?vid=AIS"
    session.get(main_url, timeout=30)
    time.sleep(2)  # be polite / let any server-side session state settle

    # Now try the search
    search_url = "https://www.isil.at/primo-explore/search"
    params = {
        'query': 'any,contains,AT-',
        'tab': 'default_tab',
        'search_scope': 'default_scope',
        'vid': 'AIS',
        'offset': '0',
        'limit': '50'
    }

    log(f"Searching: {search_url}")
    # Explicit timeout so a stalled connection cannot hang the scraper forever.
    response = session.get(search_url, params=params, timeout=30)

    if response.status_code != 200:
        log(f"Error: Status {response.status_code}")
        return []

    log(f"Got response, length: {len(response.text)}")

    # Try to extract ISIL codes (AT- prefix followed by code characters,
    # terminated by a quote, whitespace or a tag bracket).
    isil_pattern = r'AT-[A-Za-z0-9\-/:]+(?=["\s<])'
    isils = sorted(set(re.findall(isil_pattern, response.text)))

    log(f"Found {len(isils)} unique ISIL codes")

    if isils:
        log("Sample ISILs:")
        for isil in isils[:20]:
            log(f" {isil}")

    # Save the HTML for analysis
    output_dir = Path("data/isil")
    output_dir.mkdir(parents=True, exist_ok=True)

    html_file = output_dir / "austria_search_page.html"
    html_file.write_text(response.text, encoding='utf-8')
    log(f"Saved HTML to: {html_file}")

    # Save ISILs, one code per line
    isil_file = output_dir / "austria_isils_extracted.txt"
    isil_file.write_text("".join(f"{isil}\n" for isil in isils))
    log(f"Saved ISIL codes to: {isil_file}")

    return isils
def main():
    """Entry point: try the Primo REST API first, then fall back to scraping."""
    log("=== Austrian ISIL Data Extraction ===\n")

    # Try API first
    _api_url, api_data = try_primo_api()

    if api_data:
        log("\nAPI approach successful!")
        # Process API data
        return

    log("\nAPI approach failed, trying HTTP scraping...")
    isils = scrape_with_session()

    if isils:
        log(f"\nExtracted {len(isils)} ISIL codes")
    else:
        log("\nNo ISILs extracted. The site requires JavaScript rendering.")
        log("Recommendation: Use Playwright-based scraper instead.")
if __name__ == "__main__":
    # Keep the try body minimal: only the dependency check lives inside it,
    # so an ImportError raised from within main() itself is not mistaken
    # for a missing 'requests' installation.
    try:
        import requests
    except ImportError:
        log("Error: 'requests' library required. Install with: pip install requests")
    else:
        main()