#!/usr/bin/env python3 """ Archive websites for entries that have digital_platforms but no web_archives. This script: 1. Finds entries with digital_platforms but missing web_archives 2. Extracts platform_url from digital_platforms 3. Fetches and archives using Playwright 4. Updates entries with web archive references Usage: python scripts/archive_digital_platforms.py [--limit N] [--dry-run] [--start N] """ import argparse import re import sys import time from datetime import datetime, timezone from pathlib import Path from urllib.parse import urlparse import yaml try: from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout from bs4 import BeautifulSoup from markdownify import markdownify as md HAS_DEPS = True except ImportError as e: HAS_DEPS = False MISSING_DEP = str(e) # Directories ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries') WEB_DIR = ENTRIES_DIR / 'web' def sanitize_dirname(url: str) -> str: """Create a safe directory name from a URL.""" parsed = urlparse(url) name = parsed.netloc.replace('www.', '') name = re.sub(r'[^\w\-.]', '_', name) return name def get_platform_urls(data: dict) -> list[str]: """Extract all platform URLs from digital_platforms.""" urls = [] platforms = data.get('digital_platforms', []) if not platforms: return urls for platform in platforms: url = platform.get('platform_url') if url: url = url.strip() if not url.startswith(('http://', 'https://')): url = 'https://' + url urls.append(url) return urls def get_archived_urls(data: dict) -> set[str]: """Get URLs already archived.""" archived = set() web_enrichment = data.get('web_enrichment', {}) web_archives = web_enrichment.get('web_archives', []) for archive in web_archives: url = archive.get('url', '') if url: archived.add(url.lower().rstrip('/')) return archived def clean_html_for_markdown(html: str) -> str: """Clean HTML before markdown conversion.""" soup = BeautifulSoup(html, 'html.parser') # Remove unwanted elements for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header', 'aside', 'form', 'iframe', 'noscript', 'svg', 'button', 'input', 'select', 'textarea', 'meta', 'link']): element.decompose() return str(soup) def fetch_with_playwright(url: str, timeout: int = 30000) -> dict: """Fetch URL using Playwright.""" result = { 'url': url, 'fetch_timestamp': datetime.now(timezone.utc).isoformat(), 'raw_html': None, 'rendered_html': None, 'markdown': None, 'error': None } try: with sync_playwright() as p: browser = p.chromium.launch(headless=True) context = browser.new_context( user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', viewport={'width': 1920, 'height': 1080} ) page = context.new_page() response = page.goto(url, wait_until='networkidle', timeout=timeout) if not response or response.status >= 400: result['error'] = f"HTTP {response.status if response else 'No response'}" browser.close() return result result['raw_html'] = page.content() page.wait_for_timeout(2000) result['rendered_html'] = page.content() # Convert to markdown cleaned = clean_html_for_markdown(result['rendered_html']) markdown = md(cleaned, heading_style='atx', bullets='-') markdown = re.sub(r'\n{3,}', '\n\n', markdown) result['markdown'] = markdown.strip() browser.close() except PlaywrightTimeout: result['error'] = f"Timeout loading {url}" except Exception as e: result['error'] = f"Error: {str(e)}" return result def archive_url(url: str, entry_num: str, dry_run: bool = False) -> tuple[bool, str, dict | None]: """Archive a single URL.""" dirname = sanitize_dirname(url) url_dir = WEB_DIR / entry_num / dirname # Skip if already archived (directory exists with rendered.html) if (url_dir / 'rendered.html').exists(): return False, "Already archived on disk", None if dry_run: return True, f"Would fetch: {url}", None # Fetch the website result = fetch_with_playwright(url) if result['error']: return False, result['error'], None if not result['rendered_html']: return False, "No content", None # Create directory and save files url_dir.mkdir(parents=True, exist_ok=True) # Save raw HTML with open(url_dir / 'index.html', 'w', encoding='utf-8') as f: f.write(result['raw_html']) # Save rendered HTML with open(url_dir / 'rendered.html', 'w', encoding='utf-8') as f: f.write(result['rendered_html']) # Save markdown md_header = f"""--- source_url: {url} fetch_timestamp: {result['fetch_timestamp']} --- """ with open(url_dir / 'content.md', 'w', encoding='utf-8') as f: f.write(md_header + (result['markdown'] or '')) # Save metadata metadata = { 'url': url, 'fetch_timestamp': result['fetch_timestamp'], 'files': { 'raw_html': 'index.html', 'rendered_html': 'rendered.html', 'markdown': 'content.md' } } with open(url_dir / 'metadata.yaml', 'w', encoding='utf-8') as f: yaml.dump(metadata, f, default_flow_style=False, allow_unicode=True) archive_info = { 'url': url, 'directory': f'web/{entry_num}/{dirname}', 'archive_timestamp': result['fetch_timestamp'] } return True, f"Archived: {url}", archive_info def process_entry(filepath: Path, dry_run: bool = False) -> tuple[int, int, int]: """Process a single entry and archive any missing platform URLs.""" with open(filepath, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) if not data: return 0, 0, 0 # Get platform URLs platform_urls = get_platform_urls(data) if not platform_urls: return 0, 0, 0 # Get already archived URLs archived_urls = get_archived_urls(data) entry_num = filepath.name.split('_')[0] archived = 0 failed = 0 skipped = 0 new_archives = [] for url in platform_urls: url_normalized = url.lower().rstrip('/') # Skip if already in web_archives if url_normalized in archived_urls: skipped += 1 continue success, message, archive_info = archive_url(url, entry_num, dry_run) if success: archived += 1 print(f" ✓ {message}") if archive_info: new_archives.append(archive_info) elif 'Already archived' in message: skipped += 1 # Still add to entry metadata if disk archive exists dirname = sanitize_dirname(url) new_archives.append({ 'url': url, 'directory': f'web/{entry_num}/{dirname}', 'archive_timestamp': datetime.now(timezone.utc).isoformat() }) else: failed += 1 print(f" ✗ {message}") # Update entry with new archives if new_archives and not dry_run: if 'web_enrichment' not in data: data['web_enrichment'] = {} existing = data['web_enrichment'].get('web_archives', []) data['web_enrichment']['web_archives'] = existing + new_archives data['web_enrichment']['platform_archive_timestamp'] = datetime.now(timezone.utc).isoformat() with open(filepath, 'w', encoding='utf-8') as f: yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) return archived, failed, skipped def main(): parser = argparse.ArgumentParser(description='Archive digital platform websites') parser.add_argument('--limit', type=int, default=None, help='Limit number of entries') parser.add_argument('--dry-run', action='store_true', help='Show what would be done') parser.add_argument('--start', type=int, default=0, help='Start from entry index') args = parser.parse_args() if not HAS_DEPS: print(f"Error: Required dependencies not installed: {MISSING_DEP}") print("Run: pip install playwright beautifulsoup4 markdownify && playwright install chromium") return 1 # Find entries with digital_platforms but missing/incomplete web_archives print("Finding entries with digital_platforms needing archiving...") entries_to_process = [] for filepath in sorted(ENTRIES_DIR.glob('*.yaml')): if filepath.name == 'web': # Skip 'web' if it's somehow a file continue if not filepath.is_file(): continue try: with open(filepath, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) except Exception: continue if not data: continue # Check for digital_platforms with URLs platform_urls = get_platform_urls(data) if not platform_urls: continue # Check if all URLs are already archived archived_urls = get_archived_urls(data) needs_archiving = False for url in platform_urls: url_normalized = url.lower().rstrip('/') if url_normalized not in archived_urls: # Also check if on-disk archive exists entry_num = filepath.name.split('_')[0] dirname = sanitize_dirname(url) if not (WEB_DIR / entry_num / dirname / 'rendered.html').exists(): needs_archiving = True break if needs_archiving: entries_to_process.append(filepath) print(f"Found {len(entries_to_process)} entries needing archiving") # Apply start and limit if args.start: entries_to_process = entries_to_process[args.start:] if args.limit: entries_to_process = entries_to_process[:args.limit] print(f"Processing {len(entries_to_process)} entries...") print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}") print() total_archived = 0 total_failed = 0 total_skipped = 0 for i, filepath in enumerate(entries_to_process): print(f"[{i+1}/{len(entries_to_process)}] {filepath.name}") archived, failed, skipped = process_entry(filepath, dry_run=args.dry_run) total_archived += archived total_failed += failed total_skipped += skipped # Rate limiting (2 seconds between successful fetches) if archived > 0 and not args.dry_run: time.sleep(2) print() print("=" * 60) print(f"{'DRY RUN - ' if args.dry_run else ''}SUMMARY") print("=" * 60) print(f" Entries processed: {len(entries_to_process)}") print(f" URLs archived: {total_archived}") print(f" URLs failed: {total_failed}") print(f" URLs skipped: {total_skipped}") return 0 if __name__ == '__main__': sys.exit(main())