#!/usr/bin/env python3 """ Fast batch archiving of digital platform websites. Reads a list of files needing archiving from stdin or a file. Usage: # Generate list and pipe: comm -23 <(rg -l "platform_url:" data/nde/enriched/entries/*.yaml | sort) \ <(rg -l "web_archives:" data/nde/enriched/entries/*.yaml | sort) | \ python scripts/archive_platforms_batch.py [--limit N] [--dry-run] # Or from file: python scripts/archive_platforms_batch.py --input /tmp/needs_archiving.txt """ import argparse import re import sys import time from datetime import datetime, timezone from pathlib import Path from urllib.parse import urlparse import yaml try: from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout from bs4 import BeautifulSoup from markdownify import markdownify as md HAS_DEPS = True except ImportError as e: HAS_DEPS = False MISSING_DEP = str(e) # Directories ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries') WEB_DIR = ENTRIES_DIR / 'web' def sanitize_dirname(url: str) -> str: """Create a safe directory name from a URL.""" parsed = urlparse(url) name = parsed.netloc.replace('www.', '') name = re.sub(r'[^\w\-.]', '_', name) return name[:50] # Limit length def get_platform_urls(data: dict) -> list[str]: """Extract all platform URLs from digital_platforms.""" urls = [] platforms = data.get('digital_platforms', []) if not platforms: return urls for platform in platforms: url = platform.get('platform_url') if url: url = url.strip() if not url.startswith(('http://', 'https://')): url = 'https://' + url urls.append(url) return urls def clean_html_for_markdown(html: str) -> str: """Clean HTML before markdown conversion.""" soup = BeautifulSoup(html, 'html.parser') for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header', 'aside', 'form', 'iframe', 'noscript', 'svg', 'button', 'input', 'select', 'textarea']): element.decompose() return str(soup) def fetch_with_playwright(url: str, browser, timeout: int = 30000) -> dict: """Fetch URL using existing browser instance.""" result = { 'url': url, 'fetch_timestamp': datetime.now(timezone.utc).isoformat(), 'raw_html': None, 'rendered_html': None, 'markdown': None, 'error': None } try: context = browser.new_context( user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', viewport={'width': 1920, 'height': 1080} ) page = context.new_page() response = page.goto(url, wait_until='networkidle', timeout=timeout) if not response or response.status >= 400: result['error'] = f"HTTP {response.status if response else 'No response'}" context.close() return result result['raw_html'] = page.content() page.wait_for_timeout(1500) result['rendered_html'] = page.content() # Convert to markdown cleaned = clean_html_for_markdown(result['rendered_html']) markdown = md(cleaned, heading_style='atx', bullets='-') markdown = re.sub(r'\n{3,}', '\n\n', markdown) result['markdown'] = markdown.strip() context.close() except PlaywrightTimeout: result['error'] = f"Timeout" except Exception as e: result['error'] = str(e)[:100] return result def archive_url(url: str, entry_num: str, browser, dry_run: bool = False) -> tuple[bool, str, dict | None]: """Archive a single URL.""" dirname = sanitize_dirname(url) url_dir = WEB_DIR / entry_num / dirname if (url_dir / 'rendered.html').exists(): return False, "exists", { 'url': url, 'directory': f'web/{entry_num}/{dirname}', 'archive_timestamp': datetime.now(timezone.utc).isoformat(), 'status': 'existing' } if dry_run: return True, "would_fetch", None result = fetch_with_playwright(url, browser) if result['error']: return False, result['error'], None if not result['rendered_html']: return False, "no_content", None url_dir.mkdir(parents=True, exist_ok=True) with open(url_dir / 'index.html', 'w', encoding='utf-8') as f: f.write(result['raw_html']) with open(url_dir / 'rendered.html', 'w', encoding='utf-8') as f: f.write(result['rendered_html']) with open(url_dir / 'content.md', 'w', encoding='utf-8') as f: f.write(f"---\nsource_url: {url}\nfetch_timestamp: {result['fetch_timestamp']}\n---\n\n") f.write(result['markdown'] or '') return True, "archived", { 'url': url, 'directory': f'web/{entry_num}/{dirname}', 'archive_timestamp': result['fetch_timestamp'], 'status': 'new' } def process_entry(filepath: Path, browser, dry_run: bool = False) -> tuple[int, int, list]: """Process a single entry.""" with open(filepath, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) if not data: return 0, 0, [] platform_urls = get_platform_urls(data) if not platform_urls: return 0, 0, [] entry_num = filepath.name.split('_')[0] archived = 0 failed = 0 new_archives = [] for url in platform_urls: success, status, archive_info = archive_url(url, entry_num, browser, dry_run) if success: archived += 1 elif status == "exists": if archive_info: new_archives.append(archive_info) else: failed += 1 print(f" ✗ {url[:50]}... - {status}") if archive_info: new_archives.append(archive_info) # Update entry if new_archives and not dry_run: if 'web_enrichment' not in data: data['web_enrichment'] = {} existing = data['web_enrichment'].get('web_archives', []) existing_urls = {a.get('url', '').lower().rstrip('/') for a in existing} for archive in new_archives: if archive['url'].lower().rstrip('/') not in existing_urls: existing.append(archive) data['web_enrichment']['web_archives'] = existing data['web_enrichment']['platform_archive_timestamp'] = datetime.now(timezone.utc).isoformat() with open(filepath, 'w', encoding='utf-8') as f: yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) return archived, failed, new_archives def main(): parser = argparse.ArgumentParser(description='Batch archive digital platform websites') parser.add_argument('--input', type=str, help='File with list of entries to process') parser.add_argument('--limit', type=int, default=None, help='Limit number of entries') parser.add_argument('--dry-run', action='store_true', help='Show what would be done') args = parser.parse_args() if not HAS_DEPS: print(f"Error: {MISSING_DEP}") print("Run: pip install playwright beautifulsoup4 markdownify && playwright install chromium") return 1 # Read file list if args.input: with open(args.input, 'r') as f: files = [Path(line.strip()) for line in f if line.strip()] else: files = [Path(line.strip()) for line in sys.stdin if line.strip()] if args.limit: files = files[:args.limit] print(f"Processing {len(files)} entries...") print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}") print() total_archived = 0 total_failed = 0 if args.dry_run: # No browser needed for dry run for i, filepath in enumerate(files): print(f"[{i+1}/{len(files)}] {filepath.name}") with open(filepath, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) urls = get_platform_urls(data) if data else [] for url in urls: print(f" Would archive: {url[:60]}...") total_archived += 1 else: with sync_playwright() as p: browser = p.chromium.launch(headless=True) for i, filepath in enumerate(files): print(f"[{i+1}/{len(files)}] {filepath.name}", end=" ", flush=True) archived, failed, archives = process_entry(filepath, browser, dry_run=False) total_archived += archived total_failed += failed if archived > 0: print(f"✓ {archived} archived") elif failed > 0: print(f"✗ {failed} failed") else: print("- skipped (already archived)") # Small delay between entries if archived > 0: time.sleep(1) browser.close() print() print("=" * 60) print(f"{'DRY RUN - ' if args.dry_run else ''}SUMMARY") print("=" * 60) print(f" Entries processed: {len(files)}") print(f" URLs archived: {total_archived}") print(f" URLs failed: {total_failed}") return 0 if __name__ == '__main__': sys.exit(main())