#!/usr/bin/env python3
"""
Batch runner for web claims extraction with logging and progress tracking.

This script processes all entries in the NDE enriched entries directory,
extracting web claims from archived HTML files.

Features:
- Progress logging to file and console
- --force flag to re-extract already processed entries
- Resume capability (skips already processed entries by default)
- Detailed statistics and timing
- Safe interruption handling

Usage:
    # Process all unprocessed entries
    python scripts/batch_extract_web_claims.py

    # Force re-extraction of all entries
    python scripts/batch_extract_web_claims.py --force

    # Limit to first N entries
    python scripts/batch_extract_web_claims.py --limit 100

    # Start from a specific entry number
    python scripts/batch_extract_web_claims.py --start 0500

    # Run in background with nohup
    nohup python scripts/batch_extract_web_claims.py > batch_extract.out 2>&1 &
"""

import argparse
import logging
import os
import signal
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List, Dict, Any

import yaml

# Add scripts directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

# Import the extraction module; the whole script is useless without it,
# so fail fast with an actionable message.
try:
    from extract_html_claims import (
        process_entry,
        ENTRIES_DIR,
        WEB_DIR,
        HAS_DEPS,
        NER_ENABLED,
        NER_MODEL,
        NER_CONVENTION_VERSION,
    )
except ImportError as e:
    print(f"Error importing extract_html_claims: {e}")
    print("Make sure extract_html_claims.py is in the same directory")
    sys.exit(1)

# Configure logging.
# NOTE(review): hard-coded, machine-specific absolute path — consider an
# env var or a path relative to the repo root for portability.
LOG_DIR = Path('/Users/kempersc/apps/glam/logs')
# FIX: parents=True so a missing intermediate directory does not raise
# FileNotFoundError (mkdir(exist_ok=True) alone only tolerates the leaf).
LOG_DIR.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_file = LOG_DIR / f'batch_extract_{timestamp}.log'

# Create formatters: the file log keeps level names, the console log is terser.
file_formatter = logging.Formatter(
    '%(asctime)s | %(levelname)-8s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
console_formatter = logging.Formatter(
    '%(asctime)s | %(message)s',
    datefmt='%H:%M:%S'
)

# File handler (detailed: DEBUG and up)
file_handler = logging.FileHandler(log_file, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(file_formatter)

# Console handler (summary: INFO and up)
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(console_formatter)

# Named logger so we don't touch the root logger's configuration.
logger = logging.getLogger('batch_extract')
logger.setLevel(logging.DEBUG)
logger.addHandler(file_handler)
logger.addHandler(console_handler)

# Statistics tracking, mutated in place by run_batch().
stats = {
    'total_entries': 0,
    'processed': 0,
    'skipped': 0,
    'failed': 0,
    'claims_extracted': 0,
    'claims_validated': 0,
    'claims_rejected': 0,
    'start_time': None,
    'end_time': None,
}

# Graceful shutdown flag, set by the signal handler and polled by run_batch().
shutdown_requested = False


def signal_handler(signum, frame):
    """Handle interrupt signals gracefully.

    First signal requests a graceful stop (finish the current entry);
    a second signal exits immediately.
    """
    global shutdown_requested
    if shutdown_requested:
        logger.warning("Force quit requested, exiting immediately...")
        sys.exit(1)
    logger.warning("\nShutdown requested. Finishing current entry...")
    logger.warning("Press Ctrl+C again to force quit.")
    shutdown_requested = True


def get_entry_files() -> List[Path]:
    """Get all entry YAML files sorted by number.

    Only files whose names start with a numeric prefix
    (e.g. ``0022_Q123.yaml``) are included; hidden files are skipped.
    """
    entries = []
    for entry_file in ENTRIES_DIR.glob('*.yaml'):
        if entry_file.name.startswith('.'):
            continue
        # Extract entry number from filename (e.g., 0022_Q123.yaml -> 0022)
        name_parts = entry_file.stem.split('_')
        if name_parts and name_parts[0].isdigit():
            entries.append(entry_file)
    # Zero-padded prefixes sort correctly as strings.
    return sorted(entries, key=lambda x: x.stem.split('_')[0])


def has_web_claims(entry_file: Path) -> bool:
    """Check if entry already has web_claims in its YAML file.

    Best-effort: any read/parse error is treated as "no claims" so the
    entry gets (re-)processed rather than crashing the batch.
    """
    try:
        with open(entry_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        if data and isinstance(data, dict):
            return 'web_claims' in data
    except Exception:
        pass
    return False


def has_html_files(entry_file: Path) -> bool:
    """Check if entry has archived HTML files to process."""
    # Extract entry number from filename (e.g., 0022_Q123.yaml -> 0022)
    entry_num = entry_file.stem.split('_')[0]
    web_entry_dir = WEB_DIR / entry_num
    if not web_entry_dir.exists():
        return False
    # Check for any .html files anywhere under the entry's web directory.
    html_files = list(web_entry_dir.rglob('*.html'))
    return len(html_files) > 0


def process_single_entry(
    entry_file: Path,
    force: bool = False,
    verbose: bool = False,
) -> Dict[str, Any]:
    """
    Process a single entry file and return results.

    Args:
        entry_file: Path to the entry YAML file.
        force: Re-extract even if the entry already has ``web_claims``.
        verbose: Accepted for interface compatibility; currently unused
            here (per-entry detail already goes to the DEBUG log).

    Returns dict with keys:
    - status: 'processed' | 'skipped' | 'failed' | 'no_html'
    - claims_count: int
    - valid_count: int
    - invalid_count: int  (NOTE(review): never set non-zero — process_entry
      does not report rejected claims, so 'claims_rejected' stays 0)
    - error: str or None
    - duration: float (seconds)
    """
    entry_num = entry_file.stem.split('_')[0]
    result = {
        'entry': entry_num,
        'status': 'unknown',
        'claims_count': 0,
        'valid_count': 0,
        'invalid_count': 0,
        'error': None,
        'duration': 0.0,
    }
    start_time = time.time()
    try:
        # Check if already processed (resume capability)
        if not force and has_web_claims(entry_file):
            result['status'] = 'skipped'
            result['duration'] = time.time() - start_time
            return result

        # Check if has HTML files
        if not has_html_files(entry_file):
            result['status'] = 'no_html'
            result['duration'] = time.time() - start_time
            return result

        # Process the entry
        claims_count, errors = process_entry(
            filepath=entry_file,
            dry_run=False,
        )
        if errors:
            for err in errors:
                logger.debug(f"  Entry {entry_num}: {err}")
        result['status'] = 'processed'
        result['claims_count'] = claims_count
        result['valid_count'] = claims_count  # All claims that made it through are valid
    except Exception as e:
        # One bad entry must not abort the whole batch; record and continue.
        result['status'] = 'failed'
        result['error'] = str(e)
        logger.error(f"Entry {entry_num} failed: {e}")
    result['duration'] = time.time() - start_time
    return result


def run_batch(
    force: bool = False,
    limit: Optional[int] = None,
    start: Optional[str] = None,
    verbose: bool = False,
) -> None:
    """Run batch extraction on all entries.

    Args:
        force: Re-extract entries that already have ``web_claims``.
        limit: Process at most this many entries (after ``start`` filtering).
        start: Zero-padded entry number to start from (inclusive),
            compared lexicographically against the filename prefix.
        verbose: Passed through to process_single_entry().
    """
    logger.info("=" * 70)
    logger.info("BATCH WEB CLAIMS EXTRACTION")
    logger.info("=" * 70)
    logger.info(f"Log file: {log_file}")
    logger.info(f"Force mode: {force}")
    logger.info(f"LLM validation: {NER_ENABLED}")
    if NER_ENABLED:
        logger.info(f"Model: {NER_MODEL}")
        logger.info(f"Convention: {NER_CONVENTION_VERSION}")
    logger.info("")

    # Get all entries
    entries = get_entry_files()
    stats['total_entries'] = len(entries)

    # Filter by start (string comparison works because prefixes are zero-padded)
    if start:
        entries = [e for e in entries if e.stem.split('_')[0] >= start]

    # Apply limit.
    # FIX: compare against None so an explicit --limit 0 is honored
    # instead of being silently ignored as falsy.
    if limit is not None:
        entries = entries[:limit]

    logger.info(f"Entries to process: {len(entries)}")
    logger.info("=" * 70)
    logger.info("")

    stats['start_time'] = datetime.now(timezone.utc)

    for i, entry_file in enumerate(entries):
        # Poll the flag set by signal_handler between entries so a single
        # Ctrl+C finishes the in-flight entry before stopping.
        if shutdown_requested:
            logger.warning("Shutdown requested, stopping batch processing")
            break

        entry_num = entry_file.stem.split('_')[0]
        progress = f"[{i+1}/{len(entries)}]"

        result = process_single_entry(entry_file, force=force, verbose=verbose)

        if result['status'] == 'processed':
            stats['processed'] += 1
            stats['claims_extracted'] += result['claims_count']
            stats['claims_validated'] += result['valid_count']
            stats['claims_rejected'] += result['invalid_count']
            logger.info(
                f"{progress} Entry {entry_num}: "
                f"{result['valid_count']} claims, "
                f"{result['invalid_count']} rejected "
                f"({result['duration']:.1f}s)"
            )
        elif result['status'] == 'skipped':
            stats['skipped'] += 1
            logger.debug(f"{progress} Entry {entry_num}: Skipped (already has web_claims)")
        elif result['status'] == 'no_html':
            stats['skipped'] += 1
            logger.debug(f"{progress} Entry {entry_num}: Skipped (no HTML files)")
        elif result['status'] == 'failed':
            stats['failed'] += 1
            logger.error(f"{progress} Entry {entry_num}: FAILED - {result['error']}")

    stats['end_time'] = datetime.now(timezone.utc)

    # Print summary
    logger.info("")
    logger.info("=" * 70)
    logger.info("BATCH COMPLETE")
    logger.info("=" * 70)
    duration = (stats['end_time'] - stats['start_time']).total_seconds()
    hours, remainder = divmod(int(duration), 3600)
    minutes, seconds = divmod(remainder, 60)
    logger.info(f"Total entries: {stats['total_entries']}")
    logger.info(f"Processed: {stats['processed']}")
    logger.info(f"Skipped: {stats['skipped']}")
    logger.info(f"Failed: {stats['failed']}")
    logger.info(f"Claims extracted: {stats['claims_extracted']}")
    logger.info(f"Claims validated: {stats['claims_validated']}")
    logger.info(f"Claims rejected: {stats['claims_rejected']}")
    logger.info(f"Duration: {hours}h {minutes}m {seconds}s")
    # max(..., 1) guards against division by zero when nothing was processed.
    logger.info(f"Avg per entry: {duration/max(stats['processed'],1):.1f}s")
    logger.info("")
    logger.info(f"Log file: {log_file}")
    logger.info("=" * 70)


def main():
    """Main entry point: parse CLI args, check deps, install signal handlers, run."""
    parser = argparse.ArgumentParser(
        description='Batch extract web claims from archived HTML files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python scripts/batch_extract_web_claims.py
  python scripts/batch_extract_web_claims.py --force
  python scripts/batch_extract_web_claims.py --limit 100
  python scripts/batch_extract_web_claims.py --start 0500 --limit 50

Background execution:
  nohup python scripts/batch_extract_web_claims.py > batch.out 2>&1 &
"""
    )
    parser.add_argument(
        '--force', '-f',
        action='store_true',
        help='Force re-extraction even if web_claims already exist'
    )
    parser.add_argument(
        '--limit', '-l',
        type=int,
        help='Limit number of entries to process'
    )
    parser.add_argument(
        '--start', '-s',
        type=str,
        help='Start from entry number (e.g., 0500)'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose output for each entry'
    )
    args = parser.parse_args()

    # Check dependencies
    if not HAS_DEPS:
        logger.error("Missing required dependencies (lxml)")
        logger.error("Install with: pip install lxml beautifulsoup4")
        sys.exit(1)

    # Register signal handlers for graceful Ctrl+C / kill handling
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # Run batch
    try:
        run_batch(
            force=args.force,
            limit=args.limit,
            start=args.start,
            verbose=args.verbose,
        )
    except Exception as e:
        # Top-level boundary: log full traceback, then exit non-zero.
        logger.exception(f"Batch processing failed: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()