#!/usr/bin/env python3 """ Retry archiving for entries that previously failed. Uses longer timeouts and different strategies for different failure types. Usage: python scripts/retry_archive_failures.py [--dry-run] """ import argparse import re import sys import time from datetime import datetime, timezone from pathlib import Path from urllib.parse import urlparse, unquote import yaml try: from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout from bs4 import BeautifulSoup from markdownify import markdownify as md HAS_DEPS = True except ImportError as e: HAS_DEPS = False MISSING_DEP = str(e) ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries') WEB_DIR = ENTRIES_DIR / 'web' # URLs that are definitely not archivable (skip these) SKIP_URLS = { "http://www.aflegverenigingrachel@gmail.com", # Email, not URL "http://Schuttersbroederschap%20St.%20Sebastianus", # Invalid URL "http://marcdenelzen.nl/assets/ontwijken-of-samenleven-(column-74).pdf", # PDF on dead domain } def sanitize_dirname(url: str) -> str: """Create a safe directory name from a URL.""" parsed = urlparse(url) name = parsed.netloc.replace('www.', '') name = re.sub(r'[^\w\-.]', '_', name) return name[:50] def clean_html_for_markdown(html: str) -> str: """Clean HTML before markdown conversion.""" soup = BeautifulSoup(html, 'html.parser') for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header', 'aside', 'form', 'iframe', 'noscript', 'svg', 'button', 'input', 'select', 'textarea']): element.decompose() return str(soup) def fetch_with_playwright(url: str, browser, timeout: int = 60000) -> dict: """Fetch URL with longer timeout.""" result = { 'url': url, 'fetch_timestamp': datetime.now(timezone.utc).isoformat(), 'raw_html': None, 'rendered_html': None, 'markdown': None, 'error': None } try: context = browser.new_context( user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', viewport={'width': 1920, 'height': 1080}, ignore_https_errors=True # Ignore SSL errors ) page = context.new_page() # Try with longer timeout response = page.goto(url, wait_until='domcontentloaded', timeout=timeout) if not response: result['error'] = "No response" context.close() return result if response.status >= 400: result['error'] = f"HTTP {response.status}" context.close() return result # Wait for page to settle page.wait_for_timeout(3000) result['raw_html'] = page.content() result['rendered_html'] = page.content() # Convert to markdown cleaned = clean_html_for_markdown(result['rendered_html']) markdown = md(cleaned, heading_style='atx', bullets='-') markdown = re.sub(r'\n{3,}', '\n\n', markdown) result['markdown'] = markdown.strip() context.close() except PlaywrightTimeout: result['error'] = "Timeout" except Exception as e: error_msg = str(e)[:100] result['error'] = error_msg return result def archive_url(url: str, entry_num: str, browser) -> tuple[bool, str, dict | None]: """Archive a single URL.""" # Skip known bad URLs if url in SKIP_URLS or unquote(url) in SKIP_URLS: return False, "Permanently invalid URL", None dirname = sanitize_dirname(url) url_dir = WEB_DIR / entry_num / dirname # Check if already archived if (url_dir / 'rendered.html').exists(): return False, "Already archived", None result = fetch_with_playwright(url, browser) if result['error']: return False, result['error'], None if not result['rendered_html']: return False, "No content", None # Save files url_dir.mkdir(parents=True, exist_ok=True) with open(url_dir / 'index.html', 'w', encoding='utf-8') as f: f.write(result['raw_html']) with open(url_dir / 'rendered.html', 'w', encoding='utf-8') as f: f.write(result['rendered_html']) with open(url_dir / 'content.md', 'w', encoding='utf-8') as f: f.write(f"---\nsource_url: {url}\nfetch_timestamp: {result['fetch_timestamp']}\n---\n\n") f.write(result['markdown'] or '') return True, "Success", { 'url': url, 'directory': f'web/{entry_num}/{dirname}', 'archive_timestamp': result['fetch_timestamp'], 'status': 'retry_success' } def process_entry(filepath: Path, browser) -> tuple[int, int, list]: """Process an entry with previous failures.""" with open(filepath, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) if not data: return 0, 0, [] failures = data.get('web_enrichment', {}).get('archive_failures', []) if not failures: return 0, 0, [] entry_num = filepath.name.split('_')[0] archived = 0 still_failed = 0 new_archives = [] updated_failures = [] for failure in failures: url = failure.get('url', '') if not url: continue print(f" Retrying: {url[:60]}...") success, status, archive_info = archive_url(url, entry_num, browser) if success: archived += 1 print(f" ✓ {status}") if archive_info: new_archives.append(archive_info) else: still_failed += 1 print(f" ✗ {status}") # Update failure record with new attempt failure['last_retry'] = datetime.now(timezone.utc).isoformat() failure['retry_result'] = status updated_failures.append(failure) # Update entry if new_archives or updated_failures: if 'web_enrichment' not in data: data['web_enrichment'] = {} # Add successful archives if new_archives: existing = data['web_enrichment'].get('web_archives', []) existing_urls = {a.get('url', '').lower().rstrip('/') for a in existing} for archive in new_archives: if archive['url'].lower().rstrip('/') not in existing_urls: existing.append(archive) data['web_enrichment']['web_archives'] = existing # Update failures list (only keep still-failed) data['web_enrichment']['archive_failures'] = updated_failures data['web_enrichment']['retry_timestamp'] = datetime.now(timezone.utc).isoformat() with open(filepath, 'w', encoding='utf-8') as f: yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) return archived, still_failed, new_archives def main(): parser = argparse.ArgumentParser(description='Retry failed archives') parser.add_argument('--dry-run', action='store_true', help='Show what would be done') args = parser.parse_args() if not HAS_DEPS: print(f"Error: {MISSING_DEP}") return 1 # Find entries with failures print("Finding entries with archive failures...") entries = [] for filepath in sorted(ENTRIES_DIR.glob('*.yaml')): if not filepath.is_file(): continue try: with open(filepath, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) if data and data.get('web_enrichment', {}).get('archive_failures'): entries.append(filepath) except Exception: continue print(f"Found {len(entries)} entries with failures to retry") print() if args.dry_run: for filepath in entries: print(f"Would retry: {filepath.name}") return 0 total_archived = 0 total_failed = 0 with sync_playwright() as p: browser = p.chromium.launch(headless=True) for i, filepath in enumerate(entries): print(f"[{i+1}/{len(entries)}] {filepath.name}") archived, failed, _ = process_entry(filepath, browser) total_archived += archived total_failed += failed # Delay between entries if archived > 0: time.sleep(2) browser.close() print() print("=" * 60) print("RETRY SUMMARY") print("=" * 60) print(f" Successfully archived: {total_archived}") print(f" Still failing: {total_failed}") return 0 if __name__ == '__main__': sys.exit(main())