#!/usr/bin/env python3
"""
Fast batch archiving of digital platform websites.

Reads a list of files needing archiving from stdin or a file.

Usage:
    # Generate list and pipe:
    comm -23 <(rg -l "platform_url:" data/nde/enriched/entries/*.yaml | sort) \
             <(rg -l "web_archives:" data/nde/enriched/entries/*.yaml | sort) | \
        python scripts/archive_platforms_batch.py [--limit N] [--dry-run]

    # Or from file:
    python scripts/archive_platforms_batch.py --input /tmp/needs_archiving.txt
"""
import argparse
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import yaml

try:
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md
    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    MISSING_DEP = str(e)

# Directories
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'


def sanitize_dirname(url: str) -> str:
    """Create a safe directory name from a URL."""
    parsed = urlparse(url)
    name = parsed.netloc.replace('www.', '')
    name = re.sub(r'[^\w\-.]', '_', name)
    return name[:50]  # Limit length
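
# A hypothetical example of what sanitize_dirname produces:
# 'https://www.collectie-voorbeeld.nl/zoeken?q=x' becomes 'collectie-voorbeeld.nl'.
# Only the hostname is used, so every page on the same host shares one archive
# directory per entry.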


def get_platform_urls(data: dict) -> list[str]:
    """Extract all platform URLs from digital_platforms."""
    urls = []
    platforms = data.get('digital_platforms', [])
    if not platforms:
        return urls
    for platform in platforms:
        url = platform.get('platform_url')
        if url:
            url = url.strip()
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url
            urls.append(url)
    return urls
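
# Illustrative sketch of the entry YAML that get_platform_urls above expects;
# the field name comes from the code, the values are made up:
#
#   digital_platforms:
#     - platform_url: collectie-voorbeeld.nl        # scheme is added if missing
#     - platform_url: https://voorbeeld.org/zoeken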


def clean_html_for_markdown(html: str) -> str:
    """Clean HTML before markdown conversion."""
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                  'aside', 'form', 'iframe', 'noscript', 'svg',
                                  'button', 'input', 'select', 'textarea']):
        element.decompose()
    return str(soup)


def fetch_with_playwright(url: str, browser, timeout: int = 30000) -> dict:
    """Fetch URL using existing browser instance."""
    result = {
        'url': url,
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'raw_html': None,
        'rendered_html': None,
        'markdown': None,
        'error': None
    }
    try:
        context = browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            viewport={'width': 1920, 'height': 1080}
        )
        page = context.new_page()
        response = page.goto(url, wait_until='networkidle', timeout=timeout)
        if not response or response.status >= 400:
            result['error'] = f"HTTP {response.status if response else 'No response'}"
            context.close()
            return result
        result['raw_html'] = page.content()
        # Give late-loading JavaScript a moment to settle, then capture again
        page.wait_for_timeout(1500)
        result['rendered_html'] = page.content()
        # Convert to markdown
        cleaned = clean_html_for_markdown(result['rendered_html'])
        markdown = md(cleaned, heading_style='atx', bullets='-')
        markdown = re.sub(r'\n{3,}', '\n\n', markdown)
        result['markdown'] = markdown.strip()
        context.close()
    except PlaywrightTimeout:
        result['error'] = "Timeout"
    except Exception as e:
        result['error'] = str(e)[:100]
    return result
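
# fetch_with_playwright always returns the same keys: 'url', 'fetch_timestamp',
# 'raw_html' (DOM right after load), 'rendered_html' (DOM after the extra settle
# delay), 'markdown', and 'error'. On failure 'error' holds "HTTP <status>",
# "Timeout", or the first 100 characters of the exception text.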


def archive_url(url: str, entry_num: str, browser, dry_run: bool = False) -> tuple[bool, str, dict | None]:
    """Archive a single URL."""
    dirname = sanitize_dirname(url)
    url_dir = WEB_DIR / entry_num / dirname

    if (url_dir / 'rendered.html').exists():
        return False, "exists", {
            'url': url,
            'directory': f'web/{entry_num}/{dirname}',
            'archive_timestamp': datetime.now(timezone.utc).isoformat(),
            'status': 'existing'
        }

    if dry_run:
        return True, "would_fetch", None

    result = fetch_with_playwright(url, browser)
    if result['error']:
        return False, result['error'], None
    if not result['rendered_html']:
        return False, "no_content", None

    url_dir.mkdir(parents=True, exist_ok=True)
    with open(url_dir / 'index.html', 'w', encoding='utf-8') as f:
        f.write(result['raw_html'])
    with open(url_dir / 'rendered.html', 'w', encoding='utf-8') as f:
        f.write(result['rendered_html'])
    with open(url_dir / 'content.md', 'w', encoding='utf-8') as f:
        f.write(f"---\nsource_url: {url}\nfetch_timestamp: {result['fetch_timestamp']}\n---\n\n")
        f.write(result['markdown'] or '')

    return True, "archived", {
        'url': url,
        'directory': f'web/{entry_num}/{dirname}',
        'archive_timestamp': result['fetch_timestamp'],
        'status': 'new'
    }
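
# archive_url above writes three files per URL under WEB_DIR/<entry_num>/<dirname>/:
#   index.html     - HTML captured right after page load
#   rendered.html  - HTML after the settle delay (also used as the "already
#                    archived" marker)
#   content.md     - markdown conversion with a small front-matter header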


def process_entry(filepath: Path, browser, dry_run: bool = False) -> tuple[int, int, list]:
    """Process a single entry."""
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    if not data:
        return 0, 0, []

    platform_urls = get_platform_urls(data)
    if not platform_urls:
        return 0, 0, []

    entry_num = filepath.name.split('_')[0]
    archived = 0
    failed = 0
    new_archives = []

    for url in platform_urls:
        success, status, archive_info = archive_url(url, entry_num, browser, dry_run)
        if success:
            archived += 1
            if archive_info:
                new_archives.append(archive_info)
        elif status == "exists":
            if archive_info:
                new_archives.append(archive_info)
        else:
            failed += 1
            print(f"      {url[:50]}... - {status}")

    # Update entry with any archives not yet recorded in its YAML
    if new_archives and not dry_run:
        if 'web_enrichment' not in data:
            data['web_enrichment'] = {}
        existing = data['web_enrichment'].get('web_archives', [])
        existing_urls = {a.get('url', '').lower().rstrip('/') for a in existing}
        for archive in new_archives:
            if archive['url'].lower().rstrip('/') not in existing_urls:
                existing.append(archive)
        data['web_enrichment']['web_archives'] = existing
        data['web_enrichment']['platform_archive_timestamp'] = datetime.now(timezone.utc).isoformat()
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return archived, failed, new_archives
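
# Sketch of the web_enrichment block process_entry writes back into the entry
# YAML; keys match the code above, the values are illustrative:
#
#   web_enrichment:
#     web_archives:
#       - url: https://collectie-voorbeeld.nl
#         directory: web/0001/collectie-voorbeeld.nl
#         archive_timestamp: '2025-12-05T12:00:00+00:00'
#         status: new
#     platform_archive_timestamp: '2025-12-05T12:00:05+00:00'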


def main():
    parser = argparse.ArgumentParser(description='Batch archive digital platform websites')
    parser.add_argument('--input', type=str, help='File with list of entries to process')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    args = parser.parse_args()

    if not HAS_DEPS:
        print(f"Error: {MISSING_DEP}")
        print("Run: pip install playwright beautifulsoup4 markdownify && playwright install chromium")
        return 1

    # Read file list
    if args.input:
        with open(args.input, 'r') as f:
            files = [Path(line.strip()) for line in f if line.strip()]
    else:
        files = [Path(line.strip()) for line in sys.stdin if line.strip()]
    if args.limit:
        files = files[:args.limit]

    print(f"Processing {len(files)} entries...")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    total_archived = 0
    total_failed = 0

    if args.dry_run:
        # No browser needed for dry run
        for i, filepath in enumerate(files):
            print(f"[{i+1}/{len(files)}] {filepath.name}")
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            urls = get_platform_urls(data) if data else []
            for url in urls:
                print(f" Would archive: {url[:60]}...")
                total_archived += 1
    else:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            for i, filepath in enumerate(files):
                print(f"[{i+1}/{len(files)}] {filepath.name}", end=" ", flush=True)
                archived, failed, archives = process_entry(filepath, browser, dry_run=False)
                total_archived += archived
                total_failed += failed
                if archived > 0:
                    print(f"{archived} archived")
                elif failed > 0:
                    print(f"{failed} failed")
                else:
                    print("- skipped (already archived)")
                # Small delay between entries
                if archived > 0:
                    time.sleep(1)
            browser.close()

    print()
    print("=" * 60)
    print(f"{'DRY RUN - ' if args.dry_run else ''}SUMMARY")
    print("=" * 60)
    print(f" Entries processed: {len(files)}")
    print(f" URLs archived: {total_archived}")
    print(f" URLs failed: {total_failed}")
    return 0


if __name__ == '__main__':
    sys.exit(main())