#!/usr/bin/env python3
"""
Archive websites for entries that have digital_platforms but no web_archives.

This script:
1. Finds entries with digital_platforms but missing web_archives
2. Extracts platform_url from digital_platforms
3. Fetches and archives using Playwright
4. Updates entries with web archive references

Usage:
    python scripts/archive_digital_platforms.py [--limit N] [--dry-run] [--start N]
"""

import argparse
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import yaml

try:
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md
    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    MISSING_DEP = str(e)

# Filesystem layout: entry YAML files are read from ENTRIES_DIR, and
# archived site snapshots are written beneath its 'web' subdirectory.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR.joinpath('web')


def sanitize_dirname(url: str) -> str:
    """Create a filesystem-safe directory name from a URL's network location.

    Only the netloc (host, plus any port or credentials) is used; the path
    and query are ignored, so every URL on one host maps to the same name.

    Args:
        url: Absolute URL, e.g. ``https://www.example.com/page``.

    Returns:
        The netloc with a leading ``www.`` removed and every character other
        than word characters, ``-`` and ``.`` replaced by ``_``.
    """
    host = urlparse(url).netloc
    # Strip only a *leading* "www." -- a blanket replace() would also mangle
    # hosts such as "awww.example.com" or "sub.www.example.com" and could make
    # two different hosts collide on the same directory.
    host = host.removeprefix('www.')
    return re.sub(r'[^\w\-.]', '_', host)


def get_platform_urls(data: dict) -> list[str]:
    """Extract all usable platform URLs from an entry's ``digital_platforms``.

    Args:
        data: Parsed entry mapping. ``digital_platforms`` is expected to be a
            list of mappings, each optionally carrying ``platform_url``.

    Returns:
        The URLs in their original order, stripped of surrounding whitespace
        and prefixed with ``https://`` when no http(s) scheme is present.
        Missing, empty, whitespace-only or non-string URLs are skipped, as
        are platform items that are not mappings.
    """
    platforms = data.get('digital_platforms')
    if not isinstance(platforms, (list, tuple)):
        return []

    urls = []
    for platform in platforms:
        if not isinstance(platform, dict):
            continue
        raw_url = platform.get('platform_url')
        if not isinstance(raw_url, str):
            continue
        url = raw_url.strip()
        if not url:
            # Whitespace-only values would otherwise become a bare "https://".
            continue
        # Schemes are case-insensitive; do not prepend one to "HTTP://...".
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url
        urls.append(url)

    return urls


def get_archived_urls(data: dict) -> set[str]:
    """Collect the normalized URLs already listed under ``web_archives``.

    Args:
        data: Parsed entry mapping; archives are read from
            ``data['web_enrichment']['web_archives']``.

    Returns:
        A set of archive URLs, lower-cased and with trailing slashes removed.
        Empty when the keys are missing, null, or not of the expected shape.
    """
    # ``.get(key, default)`` returns None for a key that is present but null
    # (e.g. a bare ``web_enrichment:`` line in YAML), hence the ``or`` fallback.
    web_enrichment = data.get('web_enrichment') or {}
    if not isinstance(web_enrichment, dict):
        return set()

    web_archives = web_enrichment.get('web_archives') or []
    if not isinstance(web_archives, (list, tuple)):
        return set()

    archived = set()
    for archive in web_archives:
        if not isinstance(archive, dict):
            continue
        url = archive.get('url')
        if isinstance(url, str) and url:
            archived.add(url.lower().rstrip('/'))

    return archived


def clean_html_for_markdown(html: str) -> str:
    """Strip non-content markup from *html* ahead of Markdown conversion.

    Scripts, styling, navigation chrome, form controls and embedded media
    elements are removed together with everything nested inside them.

    Args:
        html: HTML document or fragment as text.

    Returns:
        The remaining document serialized back to an HTML string.
    """
    unwanted_tags = [
        'script', 'style', 'nav', 'footer', 'header',
        'aside', 'form', 'iframe', 'noscript', 'svg',
        'button', 'input', 'select', 'textarea', 'meta',
        'link',
    ]

    document = BeautifulSoup(html, 'html.parser')
    for tag in document.find_all(unwanted_tags):
        tag.decompose()

    return str(document)


def fetch_with_playwright(url: str, timeout: int = 30000) -> dict:
    """Fetch *url* in headless Chromium and capture HTML plus Markdown.

    Failures are never raised to the caller; they are reported through the
    ``error`` key of the returned mapping.

    Args:
        url: Absolute URL to load.
        timeout: Navigation timeout passed to ``page.goto`` (milliseconds).

    Returns:
        A dict with the keys ``url``, ``fetch_timestamp`` (UTC, ISO 8601),
        ``raw_html``, ``rendered_html``, ``markdown`` and ``error``. The
        content keys stay ``None`` when the fetch fails before they are
        captured; ``error`` is ``None`` on success.
    """
    result = {
        'url': url,
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'raw_html': None,
        'rendered_html': None,
        'markdown': None,
        'error': None
    }

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            # A single ``finally`` closes the browser on every path: success,
            # HTTP error, timeout, or any other exception raised mid-fetch.
            try:
                context = browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    viewport={'width': 1920, 'height': 1080}
                )
                page = context.new_page()

                response = page.goto(url, wait_until='networkidle', timeout=timeout)

                if not response or response.status >= 400:
                    result['error'] = f"HTTP {response.status if response else 'No response'}"
                    return result

                # NOTE(review): this is the serialized DOM right after the
                # network went idle, not the unmodified response body --
                # confirm that is what 'raw_html' is meant to hold.
                result['raw_html'] = page.content()
                # Allow late-running scripts to finish before the second snapshot.
                page.wait_for_timeout(2000)
                result['rendered_html'] = page.content()

                # Convert to markdown, collapsing runs of blank lines.
                cleaned = clean_html_for_markdown(result['rendered_html'])
                markdown = md(cleaned, heading_style='atx', bullets='-')
                markdown = re.sub(r'\n{3,}', '\n\n', markdown)
                result['markdown'] = markdown.strip()
            finally:
                browser.close()

    except PlaywrightTimeout:
        result['error'] = f"Timeout loading {url}"
    except Exception as e:
        # Boundary handler: one bad site must not abort the whole batch run.
        result['error'] = f"Error: {str(e)}"

    return result


def archive_url(url: str, entry_num: str, dry_run: bool = False) -> tuple[bool, str, dict | None]:
    """Fetch one URL and store its snapshot under ``WEB_DIR/<entry_num>/<host>``.

    Args:
        url: Absolute URL to archive.
        entry_num: Entry identifier used as the parent directory name.
        dry_run: When true, report what would be fetched without fetching.

    Returns:
        ``(success, message, archive_info)``. ``archive_info`` is a mapping
        with ``url``, ``directory`` and ``archive_timestamp`` after a real
        archive, and ``None`` for dry runs, skips and failures.
    """
    site_dir_name = sanitize_dirname(url)
    target_dir = WEB_DIR / entry_num / site_dir_name

    # An existing rendered.html marks this host as already archived.
    if (target_dir / 'rendered.html').exists():
        return False, "Already archived on disk", None

    if dry_run:
        return True, f"Would fetch: {url}", None

    fetched = fetch_with_playwright(url)

    if fetched['error']:
        return False, fetched['error'], None
    if not fetched['rendered_html']:
        return False, "No content", None

    fetched_at = fetched['fetch_timestamp']
    target_dir.mkdir(parents=True, exist_ok=True)

    # Front matter placed ahead of the Markdown body.
    front_matter = (
        "---\n"
        f"source_url: {url}\n"
        f"fetch_timestamp: {fetched_at}\n"
        "---\n"
        "\n"
    )

    # Written in this order: raw HTML, rendered HTML, then Markdown.
    text_files = (
        ('index.html', fetched['raw_html']),
        ('rendered.html', fetched['rendered_html']),
        ('content.md', front_matter + (fetched['markdown'] or '')),
    )
    for filename, text in text_files:
        with (target_dir / filename).open('w', encoding='utf-8') as handle:
            handle.write(text)

    # Sidecar file describing the snapshot and the files that belong to it.
    snapshot_metadata = {
        'url': url,
        'fetch_timestamp': fetched_at,
        'files': {
            'raw_html': 'index.html',
            'rendered_html': 'rendered.html',
            'markdown': 'content.md'
        }
    }
    with (target_dir / 'metadata.yaml').open('w', encoding='utf-8') as handle:
        yaml.dump(snapshot_metadata, handle, default_flow_style=False, allow_unicode=True)

    return True, f"Archived: {url}", {
        'url': url,
        'directory': f'web/{entry_num}/{site_dir_name}',
        'archive_timestamp': fetched_at
    }


def process_entry(filepath: Path, dry_run: bool = False) -> tuple[int, int, int]:
    """Archive the not-yet-archived platform URLs of a single entry file.

    Each URL from ``digital_platforms`` that is not already listed under
    ``web_enrichment.web_archives`` is archived. New archive records are
    appended to the entry and the file is rewritten, except in dry-run mode.

    Args:
        filepath: Entry YAML file; the text before the first ``_`` in its
            name is used as the entry number.
        dry_run: When true, nothing is fetched and the file is not modified.

    Returns:
        ``(archived, failed, skipped)`` URL counts for this entry.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Empty documents and non-mapping YAML (e.g. a top-level list) have no
    # platforms to archive.
    if not isinstance(data, dict):
        return 0, 0, 0

    platform_urls = get_platform_urls(data)
    if not platform_urls:
        return 0, 0, 0

    archived_urls = get_archived_urls(data)
    entry_num = filepath.name.split('_')[0]

    archived = 0
    failed = 0
    skipped = 0
    new_archives = []
    # Normalized URLs handled in this run, so a URL listed twice in
    # digital_platforms is neither fetched nor recorded a second time.
    attempted = set()

    for url in platform_urls:
        url_normalized = url.lower().rstrip('/')

        # Skip if already in web_archives or already handled above.
        if url_normalized in archived_urls or url_normalized in attempted:
            skipped += 1
            continue
        attempted.add(url_normalized)

        success, message, archive_info = archive_url(url, entry_num, dry_run)

        if success:
            archived += 1
            print(f"    ✓ {message}")
            if archive_info:
                new_archives.append(archive_info)
        elif 'Already archived' in message:
            skipped += 1
            # Still add to entry metadata if disk archive exists
            dirname = sanitize_dirname(url)
            new_archives.append({
                'url': url,
                'directory': f'web/{entry_num}/{dirname}',
                'archive_timestamp': datetime.now(timezone.utc).isoformat()
            })
        else:
            failed += 1
            print(f"    ✗ {message}")

    # Update entry with new archives
    if new_archives and not dry_run:
        # The key may be present but null in the YAML, so a membership test
        # alone is not enough before calling .get() on the value.
        if data.get('web_enrichment') is None:
            data['web_enrichment'] = {}
        web_enrichment = data['web_enrichment']

        existing = web_enrichment.get('web_archives') or []
        web_enrichment['web_archives'] = existing + new_archives
        web_enrichment['platform_archive_timestamp'] = datetime.now(timezone.utc).isoformat()

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return archived, failed, skipped


def main():
    """Command-line entry point: find entries needing archives and process them.

    Scans ``ENTRIES_DIR`` for ``*.yaml`` entries that have at least one
    platform URL which is neither listed in ``web_archives`` nor present on
    disk, applies ``--start`` / ``--limit`` to that list, processes each
    entry and prints a summary.

    Returns:
        Exit status: 0 on completion, 1 when the optional fetch/convert
        dependencies could not be imported.
    """
    parser = argparse.ArgumentParser(description='Archive digital platform websites')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--start', type=int, default=0, help='Start from entry index')
    args = parser.parse_args()

    if not HAS_DEPS:
        print(f"Error: Required dependencies not installed: {MISSING_DEP}")
        print("Run: pip install playwright beautifulsoup4 markdownify && playwright install chromium")
        return 1

    # Find entries with digital_platforms but missing/incomplete web_archives
    print("Finding entries with digital_platforms needing archiving...")
    entries_to_process = []

    for filepath in sorted(ENTRIES_DIR.glob('*.yaml')):
        if not filepath.is_file():
            continue

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as exc:
            # Best effort: an unreadable entry must not stop the scan, but
            # it should be reported rather than skipped silently.
            print(f"  Warning: skipping unreadable entry {filepath.name}: {exc}", file=sys.stderr)
            continue

        # Empty documents and non-mapping YAML carry no digital_platforms.
        if not isinstance(data, dict):
            continue

        # Check for digital_platforms with URLs
        platform_urls = get_platform_urls(data)
        if not platform_urls:
            continue

        # An entry needs work when at least one URL is neither recorded in
        # web_archives nor already present as an on-disk archive.
        archived_urls = get_archived_urls(data)
        entry_num = filepath.name.split('_')[0]
        needs_archiving = any(
            url.lower().rstrip('/') not in archived_urls
            and not (WEB_DIR / entry_num / sanitize_dirname(url) / 'rendered.html').exists()
            for url in platform_urls
        )

        if needs_archiving:
            entries_to_process.append(filepath)

    print(f"Found {len(entries_to_process)} entries needing archiving")

    # Apply start and limit. Compare against None so that an explicit
    # ``--limit 0`` means "process nothing" instead of "no limit".
    if args.start:
        entries_to_process = entries_to_process[args.start:]
    if args.limit is not None:
        entries_to_process = entries_to_process[:args.limit]

    print(f"Processing {len(entries_to_process)} entries...")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    total_archived = 0
    total_failed = 0
    total_skipped = 0

    for i, filepath in enumerate(entries_to_process, start=1):
        print(f"[{i}/{len(entries_to_process)}] {filepath.name}")

        archived, failed, skipped = process_entry(filepath, dry_run=args.dry_run)

        total_archived += archived
        total_failed += failed
        total_skipped += skipped

        # Rate limiting: pause 2 seconds after an entry with successful fetches
        if archived > 0 and not args.dry_run:
            time.sleep(2)

    print()
    print("=" * 60)
    print(f"{'DRY RUN - ' if args.dry_run else ''}SUMMARY")
    print("=" * 60)
    print(f"  Entries processed: {len(entries_to_process)}")
    print(f"  URLs archived:     {total_archived}")
    print(f"  URLs failed:       {total_failed}")
    print(f"  URLs skipped:      {total_skipped}")

    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())