glam/scripts/batch_extract_web_claims.py
2025-12-03 17:38:46 +01:00

385 lines
11 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Batch runner for web claims extraction with logging and progress tracking.
This script processes all entries in the NDE enriched entries directory,
extracting web claims from archived HTML files.
Features:
- Progress logging to file and console
- --force flag to re-extract already processed entries
- Resume capability (skips already processed entries by default)
- Detailed statistics and timing
- Safe interruption handling
Usage:
# Process all unprocessed entries
python scripts/batch_extract_web_claims.py
# Force re-extraction of all entries
python scripts/batch_extract_web_claims.py --force
# Limit to first N entries
python scripts/batch_extract_web_claims.py --limit 100
# Start from a specific entry number
python scripts/batch_extract_web_claims.py --start 0500
# Run in background with nohup
nohup python scripts/batch_extract_web_claims.py > batch_extract.out 2>&1 &
"""
import argparse
import logging
import os
import signal
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List, Dict, Any

import yaml

# Add scripts directory to path for imports, so extract_html_claims can be
# imported regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent))

# Import the extraction module; fail fast with a readable message if the
# sibling script is missing (a plain ImportError traceback is less helpful).
try:
    from extract_html_claims import (
        process_entry,
        ENTRIES_DIR,
        WEB_DIR,
        HAS_DEPS,
        NER_ENABLED,
        NER_MODEL,
        NER_CONVENTION_VERSION,
    )
except ImportError as e:
    print(f"Error importing extract_html_claims: {e}")
    print("Make sure extract_html_claims.py is in the same directory")
    sys.exit(1)
# Configure logging
# NOTE(review): hard-coded absolute user path — confirm this is intentional
# and not meant to be derived from the repo root.
LOG_DIR = Path('/Users/kempersc/apps/glam/logs')
LOG_DIR.mkdir(exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_file = LOG_DIR / f'batch_extract_{timestamp}.log'

# Create formatters: the file log keeps the level column, the console is terser.
file_formatter = logging.Formatter(
    '%(asctime)s | %(levelname)-8s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
console_formatter = logging.Formatter(
    '%(asctime)s | %(message)s',
    datefmt='%H:%M:%S'
)

# File handler (detailed): captures DEBUG and above.
file_handler = logging.FileHandler(log_file, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(file_formatter)

# Console handler (summary): INFO and above only, so per-entry skip messages
# (logged at DEBUG) stay out of the terminal.
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(console_formatter)

# Named logger used throughout this script (not the root logger).
logger = logging.getLogger('batch_extract')
logger.setLevel(logging.DEBUG)
logger.addHandler(file_handler)
logger.addHandler(console_handler)

# Statistics tracking; mutated in place by run_batch().
stats = {
    'total_entries': 0,
    'processed': 0,
    'skipped': 0,
    'failed': 0,
    'claims_extracted': 0,
    'claims_validated': 0,
    'claims_rejected': 0,
    'start_time': None,
    'end_time': None,
}

# Graceful shutdown flag, set by signal_handler and polled by run_batch.
shutdown_requested = False
def signal_handler(signum, frame):
    """Translate SIGINT/SIGTERM into a graceful-stop request.

    The first signal sets the module-level ``shutdown_requested`` flag so
    the batch loop can finish the entry currently in flight; a second
    signal exits the process immediately.
    """
    global shutdown_requested
    if not shutdown_requested:
        # First signal: ask the batch loop to stop after the current entry.
        logger.warning("\nShutdown requested. Finishing current entry...")
        logger.warning("Press Ctrl+C again to force quit.")
        shutdown_requested = True
        return
    # Second signal: hard exit.
    logger.warning("Force quit requested, exiting immediately...")
    sys.exit(1)
def get_entry_files() -> List[Path]:
    """Return all entry YAML files in ENTRIES_DIR, sorted by entry number.

    Only non-hidden files whose stem starts with a numeric prefix before
    the first underscore (e.g. ``0022_Q123.yaml`` -> ``0022``) are
    included. Sorting is numeric rather than lexicographic so mixed-width
    prefixes (e.g. ``0999`` vs ``10000``) still order correctly; for
    uniformly zero-padded names the order is identical to a string sort.
    """
    entries = []
    for entry_file in ENTRIES_DIR.glob('*.yaml'):
        if entry_file.name.startswith('.'):
            continue
        # The numeric prefix before the first underscore identifies an entry.
        prefix = entry_file.stem.split('_')[0]
        if prefix.isdigit():
            entries.append(entry_file)
    return sorted(entries, key=lambda x: int(x.stem.split('_')[0]))
def has_web_claims(entry_file: Path) -> bool:
    """Return True if the entry YAML already contains a ``web_claims`` key.

    Any read or parse failure is deliberately treated as "not yet
    processed" so the batch will simply attempt extraction again.
    """
    try:
        with entry_file.open('r', encoding='utf-8') as fh:
            parsed = yaml.safe_load(fh)
    except Exception:
        return False
    return isinstance(parsed, dict) and 'web_claims' in parsed
def has_html_files(entry_file: Path) -> bool:
    """Return True if the entry has at least one archived HTML file.

    Looks under ``WEB_DIR/<entry_num>``, where ``<entry_num>`` is the
    numeric prefix of the entry filename (e.g. ``0022_Q123.yaml`` -> 0022).
    """
    entry_num = entry_file.stem.split('_')[0]
    web_entry_dir = WEB_DIR / entry_num
    if not web_entry_dir.exists():
        return False
    # Stop at the first match instead of materializing every HTML path --
    # rglob is a generator, so this short-circuits on large archives.
    return next(iter(web_entry_dir.rglob('*.html')), None) is not None
def process_single_entry(
    entry_file: Path,
    force: bool = False,
    verbose: bool = False,
) -> Dict[str, Any]:
    """Process a single entry file and return a result record.

    Returns a dict with keys:
    - entry: zero-padded entry number (from the filename prefix)
    - status: 'processed' | 'skipped' | 'failed' | 'no_html'
    - claims_count: int
    - valid_count: int
    - invalid_count: int
    - error: str or None
    - duration: float (seconds)
    """
    entry_num = entry_file.stem.split('_')[0]
    started = time.time()
    result: Dict[str, Any] = {
        'entry': entry_num,
        'status': 'unknown',
        'claims_count': 0,
        'valid_count': 0,
        'invalid_count': 0,
        'error': None,
        'duration': 0.0,
    }
    try:
        # Short-circuit keeps the YAML read from happening in force mode.
        if not force and has_web_claims(entry_file):
            result['status'] = 'skipped'
        elif not has_html_files(entry_file):
            result['status'] = 'no_html'
        else:
            claims_count, errors = process_entry(
                filepath=entry_file,
                dry_run=False,
            )
            # Per-claim errors are non-fatal; record them at DEBUG level.
            for err in errors or ():
                logger.debug(f" Entry {entry_num}: {err}")
            result['status'] = 'processed'
            result['claims_count'] = claims_count
            # All claims that made it through are valid.
            result['valid_count'] = claims_count
    except Exception as exc:
        result['status'] = 'failed'
        result['error'] = str(exc)
        logger.error(f"Entry {entry_num} failed: {exc}")
    finally:
        result['duration'] = time.time() - started
    return result
def run_batch(
    force: bool = False,
    limit: Optional[int] = None,
    start: Optional[str] = None,
    verbose: bool = False,
) -> None:
    """Run batch extraction on all entries and log a final summary.

    Args:
        force: Re-extract entries even if they already have ``web_claims``.
        limit: Process at most this many entries (falsy means no limit).
        start: Skip entries whose number sorts before this string
            (e.g. ``"0500"``). Comparison is lexicographic, which assumes
            uniformly zero-padded entry numbers.
        verbose: Passed through to ``process_single_entry``.

    Mutates the module-level ``stats`` dict and honors the
    ``shutdown_requested`` flag set by ``signal_handler``.
    """
    global stats, shutdown_requested
    logger.info("=" * 70)
    logger.info("BATCH WEB CLAIMS EXTRACTION")
    logger.info("=" * 70)
    logger.info(f"Log file: {log_file}")
    logger.info(f"Force mode: {force}")
    logger.info(f"LLM validation: {NER_ENABLED}")
    if NER_ENABLED:
        logger.info(f"Model: {NER_MODEL}")
        logger.info(f"Convention: {NER_CONVENTION_VERSION}")
    logger.info("")
    # Get all entries (sorted by entry number)
    entries = get_entry_files()
    stats['total_entries'] = len(entries)
    # Filter by start (lexicographic string compare on the numeric prefix)
    if start:
        entries = [e for e in entries if e.stem.split('_')[0] >= start]
    # Apply limit
    if limit:
        entries = entries[:limit]
    logger.info(f"Entries to process: {len(entries)}")
    logger.info("=" * 70)
    logger.info("")
    stats['start_time'] = datetime.now(timezone.utc)
    for i, entry_file in enumerate(entries):
        # Stop between entries when Ctrl+C / SIGTERM was received.
        if shutdown_requested:
            logger.warning("Shutdown requested, stopping batch processing")
            break
        entry_num = entry_file.stem.split('_')[0]
        progress = f"[{i+1}/{len(entries)}]"
        result = process_single_entry(entry_file, force=force, verbose=verbose)
        if result['status'] == 'processed':
            stats['processed'] += 1
            stats['claims_extracted'] += result['claims_count']
            stats['claims_validated'] += result['valid_count']
            stats['claims_rejected'] += result['invalid_count']
            logger.info(
                f"{progress} Entry {entry_num}: "
                f"{result['valid_count']} claims, "
                f"{result['invalid_count']} rejected "
                f"({result['duration']:.1f}s)"
            )
        elif result['status'] == 'skipped':
            # DEBUG level: visible in the log file, not on the console.
            stats['skipped'] += 1
            logger.debug(f"{progress} Entry {entry_num}: Skipped (already has web_claims)")
        elif result['status'] == 'no_html':
            stats['skipped'] += 1
            logger.debug(f"{progress} Entry {entry_num}: Skipped (no HTML files)")
        elif result['status'] == 'failed':
            stats['failed'] += 1
            logger.error(f"{progress} Entry {entry_num}: FAILED - {result['error']}")
    stats['end_time'] = datetime.now(timezone.utc)
    # Print summary
    logger.info("")
    logger.info("=" * 70)
    logger.info("BATCH COMPLETE")
    logger.info("=" * 70)
    duration = (stats['end_time'] - stats['start_time']).total_seconds()
    hours, remainder = divmod(int(duration), 3600)
    minutes, seconds = divmod(remainder, 60)
    logger.info(f"Total entries: {stats['total_entries']}")
    logger.info(f"Processed: {stats['processed']}")
    logger.info(f"Skipped: {stats['skipped']}")
    logger.info(f"Failed: {stats['failed']}")
    logger.info(f"Claims extracted: {stats['claims_extracted']}")
    logger.info(f"Claims validated: {stats['claims_validated']}")
    logger.info(f"Claims rejected: {stats['claims_rejected']}")
    logger.info(f"Duration: {hours}h {minutes}m {seconds}s")
    # max(...) guards against division by zero when nothing was processed.
    logger.info(f"Avg per entry: {duration/max(stats['processed'],1):.1f}s")
    logger.info("")
    logger.info(f"Log file: {log_file}")
    logger.info("=" * 70)
def main():
    """Main entry point: parse CLI arguments, verify dependencies,
    install signal handlers, and run the batch."""
    parser = argparse.ArgumentParser(
        description='Batch extract web claims from archived HTML files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python scripts/batch_extract_web_claims.py
python scripts/batch_extract_web_claims.py --force
python scripts/batch_extract_web_claims.py --limit 100
python scripts/batch_extract_web_claims.py --start 0500 --limit 50
Background execution:
nohup python scripts/batch_extract_web_claims.py > batch.out 2>&1 &
"""
    )
    parser.add_argument(
        '--force', '-f',
        action='store_true',
        help='Force re-extraction even if web_claims already exist'
    )
    parser.add_argument(
        '--limit', '-l',
        type=int,
        help='Limit number of entries to process'
    )
    parser.add_argument(
        '--start', '-s',
        type=str,
        help='Start from entry number (e.g., 0500)'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose output for each entry'
    )
    args = parser.parse_args()
    # Check dependencies (HAS_DEPS is exported by extract_html_claims)
    if not HAS_DEPS:
        logger.error("Missing required dependencies (lxml)")
        logger.error("Install with: pip install lxml beautifulsoup4")
        sys.exit(1)
    # Register signal handlers for graceful shutdown (Ctrl+C / SIGTERM)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    # Run batch; log the full traceback on any unexpected failure
    try:
        run_batch(
            force=args.force,
            limit=args.limit,
            start=args.start,
            verbose=args.verbose,
        )
    except Exception as e:
        logger.exception(f"Batch processing failed: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()