#!/usr/bin/env python3
|
|
"""
|
|
Aggressive web scraper with SSL verification disabled and multiple User-Agent rotation.
|
|
For sites that block normal requests.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import re
|
|
import ssl
|
|
import sys
|
|
import time
|
|
import random
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
import httpx
|
|
import yaml
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Configuration
# Directory holding per-institution custodian YAML files that this script updates.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# Tab-separated "filename<TAB>url" list of URLs that failed a previous crawl.
FAILED_URLS_FILE = Path("/Users/kempersc/apps/glam/data/failed_crawl_urls.txt")


# Multiple user agents to rotate
# One is picked at random per request (see fetch_with_httpx) to look like a
# regular browser; covers Chrome (macOS/Windows), Safari, and Firefox.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
]
|
|
|
|
# Platform type detection patterns
|
|
PLATFORM_PATTERNS: dict[str, list[str]] = {
|
|
'DISCOVERY_PORTAL': [
|
|
r'/collectie', r'/collection', r'/catalogus', r'/catalog',
|
|
r'/zoeken', r'/search', r'/archief', r'/archive',
|
|
r'/beeldbank', r'/images', r'/foto', r'/photo',
|
|
],
|
|
'DIGITAL_ARCHIVE': [
|
|
r'archieven\.nl', r'archief', r'archive',
|
|
r'/inventaris', r'/inventory', r'/toegang',
|
|
],
|
|
'EDUCATION': [
|
|
r'/educatie', r'/education', r'/onderwijs', r'/leren',
|
|
r'/scholen', r'/schools', r'/lesmateriaal',
|
|
],
|
|
'INSTITUTIONAL_WEBSITE': [
|
|
r'/over-ons', r'/about', r'/contact', r'/bezoek',
|
|
r'/visit', r'/openingstijden', r'/hours',
|
|
],
|
|
}
|
|
|
|
|
|
def detect_platform_type(url: str, links: list[str] | None = None) -> str:
|
|
"""Detect the platform type based on URL patterns and extracted links."""
|
|
url_lower = url.lower()
|
|
all_urls = [url_lower] + [link.lower() for link in (links or [])]
|
|
|
|
for platform_type, patterns in PLATFORM_PATTERNS.items():
|
|
for pattern in patterns:
|
|
for check_url in all_urls:
|
|
if re.search(pattern, check_url):
|
|
return platform_type
|
|
|
|
return 'INSTITUTIONAL_WEBSITE'
|
|
|
|
|
|
def extract_collection_urls(links: list[str], base_url: str) -> list[str]:
    """Extract URLs that appear to be collection/catalog pages.

    Args:
        links: Candidate (absolute) URLs harvested from the page.
        base_url: The page URL; only links on the same (or a related
            sub-/super-) domain are kept.

    Returns:
        Up to 5 deduplicated matching URLs, in input order.
    """
    collection_patterns = [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/beeldbank', r'/inventaris',
        r'/archief(?!en\.)', r'/archiefstukken', r'/toegangen',
    ]
    # One combined, pre-compiled pattern: equivalent to "any pattern matches".
    matcher = re.compile('|'.join(collection_patterns))

    collection_urls: list[str] = []
    base_domain = urlparse(base_url).netloc

    for link in links:
        try:
            netloc = urlparse(link).netloc
        except ValueError:
            # Malformed URL (e.g. invalid IPv6 literal): skip it.
            continue
        # BUG FIX: an empty netloc (relative or mailto: links) made
        # `netloc in base_domain` trivially true ("" is in every string),
        # letting off-site junk through. Require a real host.
        if not netloc:
            continue
        # Same-domain check, tolerant of www./subdomain variations.
        if base_domain in netloc or netloc in base_domain:
            if matcher.search(link.lower()) and link not in collection_urls:
                collection_urls.append(link)

    return collection_urls[:5]
|
|
|
|
|
|
def fetch_with_httpx(url: str, timeout: int = 20) -> tuple[str | None, int | None]:
    """Fetch URL content using httpx with SSL verification DISABLED.

    Sends browser-like headers with a randomly chosen User-Agent.
    Returns (body_text, status_code) on success; (None, None) on any
    failure, after printing a diagnostic line.
    """
    request_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9,nl;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "max-age=0",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }

    try:
        client = httpx.Client(
            timeout=timeout,
            follow_redirects=True,
            verify=False,  # DISABLE SSL VERIFICATION
            http2=True,
        )
        with client:
            resp = client.get(url, headers=request_headers)
            return resp.text, resp.status_code
    except httpx.ConnectError as e:
        print(f" FAILED: Connection error: {e}")
    except httpx.TimeoutException:
        print(f" FAILED: Timeout after {timeout}s")
    except Exception as e:
        print(f" FAILED: {type(e).__name__}: {e}")
    return None, None
|
|
|
|
|
|
def parse_html_content(html: str, url: str) -> dict[str, Any]:
    """Parse an HTML page and extract presentation metadata.

    Args:
        html: Raw HTML document text.
        url: The page URL, used to resolve relative links and the favicon.

    Returns:
        Dict with keys: title, description, og_image, language, favicon,
        links (up to 20 deduplicated absolute URLs), collection_urls.
    """
    # FIX: removed unused local `base_domain` (urlparse(url).netloc was
    # computed but never read in this function).
    soup = BeautifulSoup(html, 'html.parser')

    # <title> text, if present.
    title = None
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    # <meta name="description">
    description = None
    meta_desc = soup.find('meta', attrs={'name': 'description'})
    if meta_desc and meta_desc.get('content'):
        description = meta_desc.get('content').strip()

    def _og_content(prop: str) -> Any:
        # Content of an Open Graph <meta property=...> tag, or None.
        tag = soup.find('meta', property=prop)
        return tag.get('content') if tag else None

    # Open Graph data is preferred over plain title/description by the caller.
    og_title = _og_content('og:title')
    og_desc = _og_content('og:description')
    og_image = _og_content('og:image')

    # Two-letter language code from <html lang="...">, if declared.
    language = None
    html_tag = soup.find('html')
    if html_tag and html_tag.get('lang'):
        language = html_tag.get('lang')[:2]

    # Favicon: rel may be a plain string or (BeautifulSoup multi-valued
    # attribute) a list of tokens — try both shapes.
    favicon = None
    link_icon = soup.find('link', rel=lambda x: x and 'icon' in x.lower() if isinstance(x, str) else False)
    if not link_icon:
        link_icon = soup.find('link', rel=lambda x: 'icon' in x if isinstance(x, list) else False)
    if link_icon and link_icon.get('href'):
        favicon = urljoin(url, link_icon.get('href'))

    # Collect absolute links; site-relative paths (but not protocol-relative
    # //host paths) are resolved against the page URL.
    links: list[str] = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith(('http://', 'https://')):
            links.append(href)
        elif href.startswith('/') and not href.startswith('//'):
            links.append(urljoin(url, href))

    # Deduplicate and cap at 20. NOTE(review): set() makes the order of
    # nav_links arbitrary run-to-run — confirm callers don't rely on order.
    nav_links = list(set(links))[:20]

    # Likely collection/catalog pages on the same domain.
    collection_urls = extract_collection_urls(links, url)

    return {
        'title': og_title or title,
        'description': og_desc or description,
        'og_image': og_image,
        'language': language,
        'favicon': favicon,
        'links': nav_links,
        'collection_urls': collection_urls,
    }
|
|
|
|
|
|
def transform_to_digital_platform_v2(
    url: str,
    parsed_data: dict[str, Any],
    status_code: int,
) -> dict[str, Any]:
    """Transform parsed data to digital_platform_v2 format.

    Args:
        url: The platform's URL.
        parsed_data: Output of parse_html_content().
        status_code: HTTP status of the successful fetch (recorded in metadata).

    Returns:
        A dict with 'transformation_metadata', 'primary_platform', and
        (when links were found) 'navigation_links'.
    """
    domain = urlparse(url).netloc.replace('www.', '')
    platform_id = f"primary_website_{domain.replace('.', '_')}"
    platform_type = detect_platform_type(url, parsed_data.get('links', []))

    primary: dict[str, Any] = {
        'platform_id': platform_id,
        'platform_name': parsed_data.get('title') or f"{domain} Website",
        'platform_url': url,
        'platform_type': platform_type,
    }

    # Optional fields: copied only when the scrape produced them.
    if parsed_data.get('description'):
        primary['description'] = parsed_data['description'][:500]
    for key in ('language', 'og_image', 'favicon', 'collection_urls'):
        if parsed_data.get(key):
            primary[key] = parsed_data[key]

    result: dict[str, Any] = {
        'transformation_metadata': {
            'transformed_from': 'httpx_beautifulsoup_aggressive',
            'transformation_date': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
            'transformation_version': '2.2',
            'source_status_code': status_code,
        },
        'primary_platform': primary,
    }

    # Keep the first 10 navigation links, if any were extracted.
    if parsed_data.get('links'):
        result['navigation_links'] = parsed_data['links'][:10]

    return result
|
|
|
|
|
|
def update_custodian_yaml(filename: str, digital_platform_v2: dict[str, Any]) -> bool:
    """Write a digital_platform_v2 section into a custodian YAML file.

    Returns True when the file was updated; False when it is missing,
    already has the section, or the read/write fails (error printed).
    """
    filepath = CUSTODIAN_DIR / filename
    if not filepath.exists():
        print(f" File not found: {filepath}")
        return False

    try:
        data = yaml.safe_load(filepath.read_text(encoding='utf-8'))
        if data is None:
            # Empty file: start from a fresh mapping.
            data = {}

        # Idempotency guard: never overwrite an existing section.
        if 'digital_platform_v2' in data:
            print(" SKIP: Already has digital_platform_v2")
            return False

        data['digital_platform_v2'] = digital_platform_v2

        with open(filepath, 'w', encoding='utf-8') as fh:
            yaml.dump(data, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)
        return True

    except Exception as e:
        print(f" ERROR updating YAML: {e}")
        return False
|
|
|
|
|
|
def main():
    """CLI entry point: re-crawl previously failed URLs and update custodian YAML files."""
    parser = argparse.ArgumentParser(description='Aggressive batch web scraper')
    parser.add_argument('--start', type=int, default=0, help='Start index')
    parser.add_argument('--limit', type=int, default=None, help='Max URLs to process')
    parser.add_argument('--delay', type=float, default=1.5, help='Delay between requests (seconds)')
    parser.add_argument('--timeout', type=int, default=20, help='Request timeout (seconds)')
    parser.add_argument('--dry-run', action='store_true', help='Do not update YAML files')
    args = parser.parse_args()

    # Suppress SSL warnings — we deliberately crawl with verify=False.
    import warnings
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    warnings.filterwarnings('ignore', message='Unverified HTTPS request')

    # Load the tab-separated "filename<TAB>url" pairs of failed crawls.
    if not FAILED_URLS_FILE.exists():
        print(f"Error: {FAILED_URLS_FILE} not found")
        sys.exit(1)

    urls: list[tuple[str, str]] = []
    with open(FAILED_URLS_FILE, 'r') as f:
        for line in f:
            line = line.strip()
            if line and '\t' in line:
                parts = line.split('\t', 1)
                if len(parts) == 2:
                    urls.append((parts[0], parts[1]))

    print(f"Loaded {len(urls)} failed URLs from {FAILED_URLS_FILE}")

    # Apply the start/limit window.
    end_idx = len(urls) if args.limit is None else args.start + args.limit
    urls_to_process = urls[args.start:end_idx]
    print(f"Processing {len(urls_to_process)} URLs (start={args.start}, limit={args.limit})")

    success_count = 0
    skip_count = 0
    fail_count = 0

    for i, (filename, url) in enumerate(urls_to_process):
        # BUG FIX: this line previously printed the literal "(unknown)"
        # instead of the custodian filename being processed.
        print(f"\n[{i+1}/{len(urls_to_process)}] {filename}")
        print(f" URL: {url}")

        # Fetch content; None body means the request itself failed.
        html, status_code = fetch_with_httpx(url, timeout=args.timeout)
        if html is None:
            fail_count += 1
            continue

        if status_code != 200:
            print(f" FAILED: HTTP {status_code}")
            fail_count += 1
            continue

        # Parse and transform into the digital_platform_v2 structure.
        parsed_data = parse_html_content(html, url)
        digital_platform_v2 = transform_to_digital_platform_v2(url, parsed_data, status_code)

        if args.dry_run:
            print(f" DRY RUN: Would update with platform_type={digital_platform_v2['primary_platform']['platform_type']}")
            success_count += 1
        else:
            if update_custodian_yaml(filename, digital_platform_v2):
                print(f" SUCCESS: Updated with platform_type={digital_platform_v2['primary_platform']['platform_type']}")
                success_count += 1
            else:
                skip_count += 1

        # Politeness delay between requests (skipped after the last one).
        if i < len(urls_to_process) - 1:
            time.sleep(args.delay)

    print("\n=== Final Results ===")
    print(f"Success: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Failed: {fail_count}")
    print(f"Total: {len(urls_to_process)}")


if __name__ == '__main__':
    main()
|