glam/scripts/batch_httpx_scrape_aggressive.py

#!/usr/bin/env python3
"""
Aggressive web scraper with SSL verification disabled and User-Agent rotation.
For sites that block normal requests.
"""
from __future__ import annotations
import argparse
import re
import ssl
import sys
import time
import random
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.parse import urljoin, urlparse
import httpx
import yaml
from bs4 import BeautifulSoup
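
# Third-party dependencies (assumed to be installed alongside this script):
# httpx, beautifulsoup4 (bs4) and PyYAML; httpx additionally needs the 'h2'
# extra because the client below is created with http2=True.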

# Configuration
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
FAILED_URLS_FILE = Path("/Users/kempersc/apps/glam/data/failed_crawl_urls.txt")

# Multiple user agents to rotate
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
]

# Platform type detection patterns
PLATFORM_PATTERNS: dict[str, list[str]] = {
    'DISCOVERY_PORTAL': [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/archief', r'/archive',
        r'/beeldbank', r'/images', r'/foto', r'/photo',
    ],
    'DIGITAL_ARCHIVE': [
        r'archieven\.nl', r'archief', r'archive',
        r'/inventaris', r'/inventory', r'/toegang',
    ],
    'EDUCATION': [
        r'/educatie', r'/education', r'/onderwijs', r'/leren',
        r'/scholen', r'/schools', r'/lesmateriaal',
    ],
    'INSTITUTIONAL_WEBSITE': [
        r'/over-ons', r'/about', r'/contact', r'/bezoek',
        r'/visit', r'/openingstijden', r'/hours',
    ],
}


def detect_platform_type(url: str, links: list[str] | None = None) -> str:
    """Detect the platform type based on URL patterns and extracted links."""
    url_lower = url.lower()
    all_urls = [url_lower] + [link.lower() for link in (links or [])]
    for platform_type, patterns in PLATFORM_PATTERNS.items():
        for pattern in patterns:
            for check_url in all_urls:
                if re.search(pattern, check_url):
                    return platform_type
    return 'INSTITUTIONAL_WEBSITE'
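
# The lookup above is first-match-wins in PLATFORM_PATTERNS insertion order,
# so a URL that matches both DISCOVERY_PORTAL and DIGITAL_ARCHIVE patterns
# (e.g. one containing '/archief') is reported as DISCOVERY_PORTAL.
# Quick sanity check (illustrative, left commented out):
# assert detect_platform_type("https://example.org/archief/zoeken") == "DISCOVERY_PORTAL"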


def extract_collection_urls(links: list[str], base_url: str) -> list[str]:
    """Extract URLs that appear to be collection/catalog pages."""
    collection_patterns = [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/beeldbank', r'/inventaris',
        r'/archief(?!en\.)', r'/archiefstukken', r'/toegangen',
    ]
    collection_urls: list[str] = []
    base_domain = urlparse(base_url).netloc
    for link in links:
        try:
            parsed = urlparse(link)
            if base_domain in parsed.netloc or parsed.netloc in base_domain:
                for pattern in collection_patterns:
                    if re.search(pattern, link.lower()):
                        if link not in collection_urls:
                            collection_urls.append(link)
                        break
        except Exception:
            continue
    return collection_urls[:5]


def fetch_with_httpx(url: str, timeout: int = 20) -> tuple[str | None, int | None]:
    """Fetch URL content using httpx with SSL verification DISABLED."""
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9,nl;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "max-age=0",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
    try:
        with httpx.Client(
            timeout=timeout,
            follow_redirects=True,
            verify=False,  # DISABLE SSL VERIFICATION
            http2=True,
        ) as client:
            response = client.get(url, headers=headers)
            return response.text, response.status_code
    except httpx.ConnectError as e:
        print(f" FAILED: Connection error: {e}")
        return None, None
    except httpx.TimeoutException:
        print(f" FAILED: Timeout after {timeout}s")
        return None, None
    except Exception as e:
        print(f" FAILED: {type(e).__name__}: {e}")
        return None, None
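
# Minimal sketch of how this helper is used further down (illustrative only;
# "https://example.org" is a placeholder URL):
#
#   html, status = fetch_with_httpx("https://example.org", timeout=10)
#   if html is not None and status == 200:
#       data = parse_html_content(html, "https://example.org")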


def parse_html_content(html: str, url: str) -> dict[str, Any]:
    """Parse HTML content and extract metadata."""
    soup = BeautifulSoup(html, 'html.parser')
    base_domain = urlparse(url).netloc

    # Extract title
    title = None
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    # Extract meta description
    description = None
    meta_desc = soup.find('meta', attrs={'name': 'description'})
    if meta_desc and meta_desc.get('content'):
        description = meta_desc.get('content').strip()

    # Extract Open Graph data
    og_title = None
    og_desc = None
    og_image = None
    og_meta = soup.find('meta', property='og:title')
    if og_meta:
        og_title = og_meta.get('content')
    og_meta = soup.find('meta', property='og:description')
    if og_meta:
        og_desc = og_meta.get('content')
    og_meta = soup.find('meta', property='og:image')
    if og_meta:
        og_image = og_meta.get('content')

    # Extract language
    language = None
    html_tag = soup.find('html')
    if html_tag and html_tag.get('lang'):
        language = html_tag.get('lang')[:2]

    # Extract favicon
    favicon = None
    link_icon = soup.find('link', rel=lambda x: x and 'icon' in x.lower() if isinstance(x, str) else False)
    if not link_icon:
        link_icon = soup.find('link', rel=lambda x: 'icon' in x if isinstance(x, list) else False)
    if link_icon and link_icon.get('href'):
        favicon = urljoin(url, link_icon.get('href'))

    # Extract all links
    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith(('http://', 'https://')):
            links.append(href)
        elif href.startswith('/') and not href.startswith('//'):
            links.append(urljoin(url, href))

    # Extract navigation links (first 20)
    nav_links = list(set(links))[:20]

    # Extract collection URLs
    collection_urls = extract_collection_urls(links, url)

    return {
        'title': og_title or title,
        'description': og_desc or description,
        'og_image': og_image,
        'language': language,
        'favicon': favicon,
        'links': nav_links,
        'collection_urls': collection_urls,
    }


def transform_to_digital_platform_v2(
    url: str,
    parsed_data: dict[str, Any],
    status_code: int,
) -> dict[str, Any]:
    """Transform parsed data to digital_platform_v2 format."""
    domain = urlparse(url).netloc.replace('www.', '')
    platform_id = f"primary_website_{domain.replace('.', '_')}"
    platform_type = detect_platform_type(url, parsed_data.get('links', []))

    result = {
        'transformation_metadata': {
            'transformed_from': 'httpx_beautifulsoup_aggressive',
            'transformation_date': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
            'transformation_version': '2.2',
            'source_status_code': status_code,
        },
        'primary_platform': {
            'platform_id': platform_id,
            'platform_name': parsed_data.get('title') or f"{domain} Website",
            'platform_url': url,
            'platform_type': platform_type,
        }
    }

    # Add optional fields if present
    if parsed_data.get('description'):
        result['primary_platform']['description'] = parsed_data['description'][:500]
    if parsed_data.get('language'):
        result['primary_platform']['language'] = parsed_data['language']
    if parsed_data.get('og_image'):
        result['primary_platform']['og_image'] = parsed_data['og_image']
    if parsed_data.get('favicon'):
        result['primary_platform']['favicon'] = parsed_data['favicon']
    if parsed_data.get('collection_urls'):
        result['primary_platform']['collection_urls'] = parsed_data['collection_urls']

    # Add navigation links if present
    if parsed_data.get('links'):
        result['navigation_links'] = parsed_data['links'][:10]

    return result
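
# Illustrative shape of the mapping produced above (values are placeholders,
# not taken from real data):
#
#   {
#       'transformation_metadata': {
#           'transformed_from': 'httpx_beautifulsoup_aggressive',
#           'transformation_date': '2024-01-01T00:00:00Z',
#           'transformation_version': '2.2',
#           'source_status_code': 200,
#       },
#       'primary_platform': {
#           'platform_id': 'primary_website_example_org',
#           'platform_name': 'Example Archive',
#           'platform_url': 'https://example.org',
#           'platform_type': 'INSTITUTIONAL_WEBSITE',
#       },
#   }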


def update_custodian_yaml(filename: str, digital_platform_v2: dict[str, Any]) -> bool:
    """Update the custodian YAML file with digital_platform_v2 data."""
    filepath = CUSTODIAN_DIR / filename
    if not filepath.exists():
        print(f" File not found: {filepath}")
        return False
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        data = yaml.safe_load(content)
        if data is None:
            data = {}

        # Check if already has digital_platform_v2
        if 'digital_platform_v2' in data:
            print(" SKIP: Already has digital_platform_v2")
            return False

        # Add digital_platform_v2
        data['digital_platform_v2'] = digital_platform_v2

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        return True
    except Exception as e:
        print(f" ERROR updating YAML: {e}")
        return False
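
# Note: the safe_load/dump round trip above rewrites the whole file, so any
# comments or hand formatting in the original YAML are lost; sort_keys=False
# only preserves key order. This is standard PyYAML behaviour and is assumed
# to be acceptable for these generated custodian files.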


def main():
    parser = argparse.ArgumentParser(description='Aggressive batch web scraper')
    parser.add_argument('--start', type=int, default=0, help='Start index')
    parser.add_argument('--limit', type=int, default=None, help='Max URLs to process')
    parser.add_argument('--delay', type=float, default=1.5, help='Delay between requests (seconds)')
    parser.add_argument('--timeout', type=int, default=20, help='Request timeout (seconds)')
    parser.add_argument('--dry-run', action='store_true', help='Do not update YAML files')
    args = parser.parse_args()

    # Suppress SSL warnings
    import warnings
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    warnings.filterwarnings('ignore', message='Unverified HTTPS request')

    # Load URLs
    if not FAILED_URLS_FILE.exists():
        print(f"Error: {FAILED_URLS_FILE} not found")
        sys.exit(1)
    urls = []
    with open(FAILED_URLS_FILE, 'r') as f:
        for line in f:
            line = line.strip()
            if line and '\t' in line:
                parts = line.split('\t', 1)
                if len(parts) == 2:
                    urls.append((parts[0], parts[1]))
    print(f"Loaded {len(urls)} failed URLs from {FAILED_URLS_FILE}")

    # Apply start/limit
    end_idx = len(urls) if args.limit is None else args.start + args.limit
    urls_to_process = urls[args.start:end_idx]
    print(f"Processing {len(urls_to_process)} URLs (start={args.start}, limit={args.limit})")

    # Process URLs
    success_count = 0
    skip_count = 0
    fail_count = 0
    for i, (filename, url) in enumerate(urls_to_process):
        print(f"\n[{i+1}/{len(urls_to_process)}] {filename}")
        print(f" URL: {url}")

        # Fetch content
        html, status_code = fetch_with_httpx(url, timeout=args.timeout)
        if html is None:
            fail_count += 1
            continue
        if status_code != 200:
            print(f" FAILED: HTTP {status_code}")
            fail_count += 1
            continue

        # Parse HTML
        parsed_data = parse_html_content(html, url)

        # Transform to digital_platform_v2
        digital_platform_v2 = transform_to_digital_platform_v2(url, parsed_data, status_code)

        if args.dry_run:
            print(f" DRY RUN: Would update with platform_type={digital_platform_v2['primary_platform']['platform_type']}")
            success_count += 1
        else:
            # Update YAML
            if update_custodian_yaml(filename, digital_platform_v2):
                print(f" SUCCESS: Updated with platform_type={digital_platform_v2['primary_platform']['platform_type']}")
                success_count += 1
            else:
                skip_count += 1

        # Delay between requests
        if i < len(urls_to_process) - 1:
            time.sleep(args.delay)

    print("\n=== Final Results ===")
    print(f"Success: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Failed: {fail_count}")
    print(f"Total: {len(urls_to_process)}")


if __name__ == '__main__':
    main()
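
# Example invocations (illustrative):
#   ./batch_httpx_scrape_aggressive.py --dry-run --limit 5
#   ./batch_httpx_scrape_aggressive.py --start 50 --limit 25 --delay 2.0 --timeout 30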