glam/scripts/retry_archive_failures.py
#!/usr/bin/env python3
"""
Retry archiving for entries that previously failed.
Uses longer timeouts and different strategies for different failure types.

Usage:
    python scripts/retry_archive_failures.py [--dry-run]
"""
import argparse
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse, unquote
import yaml
try:
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    MISSING_DEP = str(e)
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'
# URLs that are definitely not archivable (skip these)
SKIP_URLS = {
    "http://www.aflegverenigingrachel@gmail.com",  # Email, not URL
    "http://Schuttersbroederschap%20St.%20Sebastianus",  # Invalid URL
    "http://marcdenelzen.nl/assets/ontwijken-of-samenleven-(column-74).pdf",  # PDF on dead domain
}


def sanitize_dirname(url: str) -> str:
    """Create a safe directory name from a URL."""
    parsed = urlparse(url)
    name = parsed.netloc.replace('www.', '')
    name = re.sub(r'[^\w\-.]', '_', name)
    return name[:50]
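# Illustration (hypothetical URL): sanitize_dirname('https://www.example.nl/over-ons')
# returns 'example.nl' -- the path is ignored, 'www.' is dropped, any character
# outside letters/digits/underscore/dot/hyphen becomes '_', and the result is
# capped at 50 characters.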


def clean_html_for_markdown(html: str) -> str:
    """Clean HTML before markdown conversion."""
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                  'aside', 'form', 'iframe', 'noscript', 'svg',
                                  'button', 'input', 'select', 'textarea']):
        element.decompose()
    return str(soup)
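# Rationale: dropping script/style/navigation/form elements before conversion
# keeps the generated content.md focused on the page's main text rather than
# page boilerplate.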


def fetch_with_playwright(url: str, browser, timeout: int = 60000) -> dict:
    """Fetch URL with longer timeout."""
    result = {
        'url': url,
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'raw_html': None,
        'rendered_html': None,
        'markdown': None,
        'error': None
    }
    try:
        context = browser.new_context(
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            viewport={'width': 1920, 'height': 1080},
            ignore_https_errors=True  # Ignore SSL errors
        )
        page = context.new_page()

        # Try with longer timeout
        response = page.goto(url, wait_until='domcontentloaded', timeout=timeout)
        if not response:
            result['error'] = "No response"
            context.close()
            return result
        if response.status >= 400:
            result['error'] = f"HTTP {response.status}"
            context.close()
            return result

        # Wait for page to settle
        page.wait_for_timeout(3000)

        result['raw_html'] = page.content()
        result['rendered_html'] = page.content()

        # Convert to markdown
        cleaned = clean_html_for_markdown(result['rendered_html'])
        markdown = md(cleaned, heading_style='atx', bullets='-')
        markdown = re.sub(r'\n{3,}', '\n\n', markdown)
        result['markdown'] = markdown.strip()

        context.close()
    except PlaywrightTimeout:
        result['error'] = "Timeout"
    except Exception as e:
        error_msg = str(e)[:100]
        result['error'] = error_msg
    return result
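# Note: both snapshots above are taken with page.content() after rendering and
# the 3-second settle wait, so in this retry pass 'raw_html' and 'rendered_html'
# hold the same post-JavaScript DOM.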


def archive_url(url: str, entry_num: str, browser) -> tuple[bool, str, dict | None]:
    """Archive a single URL."""
    # Skip known bad URLs
    if url in SKIP_URLS or unquote(url) in SKIP_URLS:
        return False, "Permanently invalid URL", None

    dirname = sanitize_dirname(url)
    url_dir = WEB_DIR / entry_num / dirname

    # Check if already archived
    if (url_dir / 'rendered.html').exists():
        return False, "Already archived", None

    result = fetch_with_playwright(url, browser)
    if result['error']:
        return False, result['error'], None
    if not result['rendered_html']:
        return False, "No content", None

    # Save files
    url_dir.mkdir(parents=True, exist_ok=True)
    with open(url_dir / 'index.html', 'w', encoding='utf-8') as f:
        f.write(result['raw_html'])
    with open(url_dir / 'rendered.html', 'w', encoding='utf-8') as f:
        f.write(result['rendered_html'])
    with open(url_dir / 'content.md', 'w', encoding='utf-8') as f:
        f.write(f"---\nsource_url: {url}\nfetch_timestamp: {result['fetch_timestamp']}\n---\n\n")
        f.write(result['markdown'] or '')

    return True, "Success", {
        'url': url,
        'directory': f'web/{entry_num}/{dirname}',
        'archive_timestamp': result['fetch_timestamp'],
        'status': 'retry_success'
    }
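# Files written per successfully archived URL, relative to WEB_DIR:
#   <entry_num>/<sanitized_host>/index.html     - HTML snapshot (same DOM as rendered.html here)
#   <entry_num>/<sanitized_host>/rendered.html  - HTML after JavaScript rendering
#   <entry_num>/<sanitized_host>/content.md     - markdown body with a YAML front matter block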


def process_entry(filepath: Path, browser) -> tuple[int, int, list]:
    """Process an entry with previous failures."""
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    if not data:
        return 0, 0, []

    failures = data.get('web_enrichment', {}).get('archive_failures', [])
    if not failures:
        return 0, 0, []

    entry_num = filepath.name.split('_')[0]
    archived = 0
    still_failed = 0
    new_archives = []
    updated_failures = []

    for failure in failures:
        url = failure.get('url', '')
        if not url:
            continue
        print(f"  Retrying: {url[:60]}...")
        success, status, archive_info = archive_url(url, entry_num, browser)
        if success:
            archived += 1
            print(f"    OK: {status}")
            if archive_info:
                new_archives.append(archive_info)
        else:
            still_failed += 1
            print(f"    Failed: {status}")
            # Update failure record with new attempt
            failure['last_retry'] = datetime.now(timezone.utc).isoformat()
            failure['retry_result'] = status
            updated_failures.append(failure)

    # Update entry
    if new_archives or updated_failures:
        if 'web_enrichment' not in data:
            data['web_enrichment'] = {}
        # Add successful archives
        if new_archives:
            existing = data['web_enrichment'].get('web_archives', [])
            existing_urls = {a.get('url', '').lower().rstrip('/') for a in existing}
            for archive in new_archives:
                if archive['url'].lower().rstrip('/') not in existing_urls:
                    existing.append(archive)
            data['web_enrichment']['web_archives'] = existing
        # Update failures list (only keep still-failed)
        data['web_enrichment']['archive_failures'] = updated_failures
        data['web_enrichment']['retry_timestamp'] = datetime.now(timezone.utc).isoformat()

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return archived, still_failed, new_archives
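# Sketch of the entry YAML this function reads and updates (field names taken
# from the code above; values are illustrative placeholders, and any extra
# details on pre-existing failure records are left untouched):
#
#   web_enrichment:
#     web_archives:
#       - url: https://example.nl
#         directory: web/0042/example.nl
#         archive_timestamp: <ISO-8601 UTC timestamp>
#         status: retry_success
#     archive_failures:
#       - url: https://example.nl/dead-page
#         last_retry: <ISO-8601 UTC timestamp>
#         retry_result: HTTP 404
#     retry_timestamp: <ISO-8601 UTC timestamp>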


def main():
    parser = argparse.ArgumentParser(description='Retry failed archives')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    args = parser.parse_args()

    if not HAS_DEPS:
        print(f"Error: {MISSING_DEP}")
        return 1

    # Find entries with failures
    print("Finding entries with archive failures...")
    entries = []
    for filepath in sorted(ENTRIES_DIR.glob('*.yaml')):
        if not filepath.is_file():
            continue
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if data and data.get('web_enrichment', {}).get('archive_failures'):
                entries.append(filepath)
        except Exception:
            continue

    print(f"Found {len(entries)} entries with failures to retry")
    print()

    if args.dry_run:
        for filepath in entries:
            print(f"Would retry: {filepath.name}")
        return 0

    total_archived = 0
    total_failed = 0
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        for i, filepath in enumerate(entries):
            print(f"[{i+1}/{len(entries)}] {filepath.name}")
            archived, failed, _ = process_entry(filepath, browser)
            total_archived += archived
            total_failed += failed
            # Delay between entries
            if archived > 0:
                time.sleep(2)
        browser.close()

    print()
    print("=" * 60)
    print("RETRY SUMMARY")
    print("=" * 60)
    print(f"  Successfully archived: {total_archived}")
    print(f"  Still failing: {total_failed}")
    return 0


if __name__ == '__main__':
    sys.exit(main())