#!/usr/bin/env python3
"""
Aggressive web scraper with SSL verification disabled and multiple User-Agent rotation.
For sites that block normal requests.
"""
from __future__ import annotations

import argparse
import re
import ssl
import sys
import time
import random
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.parse import urljoin, urlparse

import httpx
import yaml
from bs4 import BeautifulSoup

# Configuration
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
FAILED_URLS_FILE = Path("/Users/kempersc/apps/glam/data/failed_crawl_urls.txt")

# Multiple user agents to rotate
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
]

# Platform type detection patterns
PLATFORM_PATTERNS: dict[str, list[str]] = {
    'DISCOVERY_PORTAL': [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/archief', r'/archive',
        r'/beeldbank', r'/images', r'/foto', r'/photo',
    ],
    'DIGITAL_ARCHIVE': [
        r'archieven\.nl', r'archief', r'archive',
        r'/inventaris', r'/inventory', r'/toegang',
    ],
    'EDUCATION': [
        r'/educatie', r'/education', r'/onderwijs', r'/leren',
        r'/scholen', r'/schools', r'/lesmateriaal',
    ],
    'INSTITUTIONAL_WEBSITE': [
        r'/over-ons', r'/about', r'/contact', r'/bezoek', r'/visit',
        r'/openingstijden', r'/hours',
    ],
}


def detect_platform_type(url: str, links: list[str] | None = None) -> str:
    """Detect the platform type based on URL patterns and extracted links."""
    url_lower = url.lower()
    all_urls = [url_lower] + [link.lower() for link in (links or [])]

    for platform_type, patterns in PLATFORM_PATTERNS.items():
        for pattern in patterns:
            for check_url in all_urls:
                if re.search(pattern, check_url):
                    return platform_type

    return 'INSTITUTIONAL_WEBSITE'


def extract_collection_urls(links: list[str], base_url: str) -> list[str]:
    """Extract URLs that appear to be collection/catalog pages."""
    collection_patterns = [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/beeldbank', r'/inventaris',
        r'/archief(?!en\.)', r'/archiefstukken', r'/toegangen',
    ]

    collection_urls: list[str] = []
    base_domain = urlparse(base_url).netloc

    for link in links:
        try:
            parsed = urlparse(link)
            if base_domain in parsed.netloc or parsed.netloc in base_domain:
                for pattern in collection_patterns:
                    if re.search(pattern, link.lower()):
                        if link not in collection_urls:
                            collection_urls.append(link)
                        break
        except Exception:
            continue

    return collection_urls[:5]


def fetch_with_httpx(url: str, timeout: int = 20) -> tuple[str | None, int | None]:
    """Fetch URL content using httpx with SSL verification DISABLED."""
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9,nl;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "max-age=0",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }

    try:
        with httpx.Client(
            timeout=timeout,
            follow_redirects=True,
            verify=False,  # DISABLE SSL VERIFICATION
            http2=True,
        ) as client:
            response = client.get(url, headers=headers)
            return response.text, response.status_code
    except httpx.ConnectError as e:
        print(f"  FAILED: Connection error: {e}")
        return None, None
    except httpx.TimeoutException:
        print(f"  FAILED: Timeout after {timeout}s")
        return None, None
    except Exception as e:
        print(f"  FAILED: {type(e).__name__}: {e}")
        return None, None


def parse_html_content(html: str, url: str) -> dict[str, Any]:
    """Parse HTML content and extract metadata."""
    soup = BeautifulSoup(html, 'html.parser')
    base_domain = urlparse(url).netloc

    # Extract title
    title = None
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    # Extract meta description
    description = None
    meta_desc = soup.find('meta', attrs={'name': 'description'})
    if meta_desc and meta_desc.get('content'):
        description = meta_desc.get('content').strip()

    # Extract Open Graph data
    og_title = None
    og_desc = None
    og_image = None
    og_meta = soup.find('meta', property='og:title')
    if og_meta:
        og_title = og_meta.get('content')
    og_meta = soup.find('meta', property='og:description')
    if og_meta:
        og_desc = og_meta.get('content')
    og_meta = soup.find('meta', property='og:image')
    if og_meta:
        og_image = og_meta.get('content')

    # Extract language
    language = None
    html_tag = soup.find('html')
    if html_tag and html_tag.get('lang'):
        language = html_tag.get('lang')[:2]

    # Extract favicon
    favicon = None
    link_icon = soup.find('link', rel=lambda x: x and 'icon' in x.lower() if isinstance(x, str) else False)
    if not link_icon:
        link_icon = soup.find('link', rel=lambda x: 'icon' in x if isinstance(x, list) else False)
    if link_icon and link_icon.get('href'):
        favicon = urljoin(url, link_icon.get('href'))

    # Extract all links
    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith(('http://', 'https://')):
            links.append(href)
        elif href.startswith('/') and not href.startswith('//'):
            links.append(urljoin(url, href))

    # Extract navigation links (first 20)
    nav_links = list(set(links))[:20]

    # Extract collection URLs
    collection_urls = extract_collection_urls(links, url)

    return {
        'title': og_title or title,
        'description': og_desc or description,
        'og_image': og_image,
        'language': language,
        'favicon': favicon,
        'links': nav_links,
        'collection_urls': collection_urls,
    }


def transform_to_digital_platform_v2(
    url: str,
    parsed_data: dict[str, Any],
    status_code: int,
) -> dict[str, Any]:
    """Transform parsed data to digital_platform_v2 format."""
    domain = urlparse(url).netloc.replace('www.', '')
    platform_id = f"primary_website_{domain.replace('.', '_')}"
    platform_type = detect_platform_type(url, parsed_data.get('links', []))

    result = {
        'transformation_metadata': {
            'transformed_from': 'httpx_beautifulsoup_aggressive',
            'transformation_date': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
            'transformation_version': '2.2',
            'source_status_code': status_code,
        },
        'primary_platform': {
            'platform_id': platform_id,
            'platform_name': parsed_data.get('title') or f"{domain} Website",
            'platform_url': url,
            'platform_type': platform_type,
        }
    }

    # Add optional fields if present
    if parsed_data.get('description'):
        result['primary_platform']['description'] = parsed_data['description'][:500]
    if parsed_data.get('language'):
        result['primary_platform']['language'] = parsed_data['language']
    if parsed_data.get('og_image'):
        result['primary_platform']['og_image'] = parsed_data['og_image']
    if parsed_data.get('favicon'):
        result['primary_platform']['favicon'] = parsed_data['favicon']
    if parsed_data.get('collection_urls'):
        result['primary_platform']['collection_urls'] = parsed_data['collection_urls']

    # Add navigation links if present
    if parsed_data.get('links'):
        result['navigation_links'] = parsed_data['links'][:10]

    return result


def update_custodian_yaml(filename: str, digital_platform_v2: dict[str, Any]) -> bool:
    """Update the custodian YAML file with digital_platform_v2 data."""
    filepath = CUSTODIAN_DIR / filename

    if not filepath.exists():
        print(f"  File not found: {filepath}")
        return False

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        data = yaml.safe_load(content)
        if data is None:
            data = {}

        # Check if already has digital_platform_v2
        if 'digital_platform_v2' in data:
            print("  SKIP: Already has digital_platform_v2")
            return False

        # Add digital_platform_v2
        data['digital_platform_v2'] = digital_platform_v2

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        return True
    except Exception as e:
        print(f"  ERROR updating YAML: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description='Aggressive batch web scraper')
    parser.add_argument('--start', type=int, default=0, help='Start index')
    parser.add_argument('--limit', type=int, default=None, help='Max URLs to process')
    parser.add_argument('--delay', type=float, default=1.5, help='Delay between requests (seconds)')
    parser.add_argument('--timeout', type=int, default=20, help='Request timeout (seconds)')
    parser.add_argument('--dry-run', action='store_true', help='Do not update YAML files')
    args = parser.parse_args()

    # Suppress SSL warnings
    import warnings
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    warnings.filterwarnings('ignore', message='Unverified HTTPS request')

    # Load URLs
    if not FAILED_URLS_FILE.exists():
        print(f"Error: {FAILED_URLS_FILE} not found")
        sys.exit(1)

    urls = []
    with open(FAILED_URLS_FILE, 'r') as f:
        for line in f:
            line = line.strip()
            if line and '\t' in line:
                parts = line.split('\t', 1)
                if len(parts) == 2:
                    urls.append((parts[0], parts[1]))

    print(f"Loaded {len(urls)} failed URLs from {FAILED_URLS_FILE}")

    # Apply start/limit
    end_idx = len(urls) if args.limit is None else args.start + args.limit
    urls_to_process = urls[args.start:end_idx]
    print(f"Processing {len(urls_to_process)} URLs (start={args.start}, limit={args.limit})")

    # Process URLs
    success_count = 0
    skip_count = 0
    fail_count = 0

    for i, (filename, url) in enumerate(urls_to_process):
        print(f"\n[{i+1}/{len(urls_to_process)}] {filename}")
        print(f"  URL: {url}")

        # Fetch content
        html, status_code = fetch_with_httpx(url, timeout=args.timeout)

        if html is None:
            fail_count += 1
            continue

        if status_code != 200:
            print(f"  FAILED: HTTP {status_code}")
            fail_count += 1
            continue

        # Parse HTML
        parsed_data = parse_html_content(html, url)

        # Transform to digital_platform_v2
        digital_platform_v2 = transform_to_digital_platform_v2(url, parsed_data, status_code)

        if args.dry_run:
            print(f"  DRY RUN: Would update with platform_type={digital_platform_v2['primary_platform']['platform_type']}")
            success_count += 1
        else:
            # Update YAML
            if update_custodian_yaml(filename, digital_platform_v2):
                print(f"  SUCCESS: Updated with platform_type={digital_platform_v2['primary_platform']['platform_type']}")
                success_count += 1
            else:
                skip_count += 1

        # Delay between requests
        if i < len(urls_to_process) - 1:
            time.sleep(args.delay)

    print("\n=== Final Results ===")
    print(f"Success: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Failed: {fail_count}")
    print(f"Total: {len(urls_to_process)}")


if __name__ == '__main__':
    main()