385 lines
11 KiB
Python
Executable file
385 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Batch runner for web claims extraction with logging and progress tracking.
|
|
|
|
This script processes all entries in the NDE enriched entries directory,
|
|
extracting web claims from archived HTML files.
|
|
|
|
Features:
|
|
- Progress logging to file and console
|
|
- --force flag to re-extract already processed entries
|
|
- Resume capability (skips already processed entries by default)
|
|
- Detailed statistics and timing
|
|
- Safe interruption handling
|
|
|
|
Usage:
|
|
# Process all unprocessed entries
|
|
python scripts/batch_extract_web_claims.py
|
|
|
|
# Force re-extraction of all entries
|
|
python scripts/batch_extract_web_claims.py --force
|
|
|
|
# Limit to first N entries
|
|
python scripts/batch_extract_web_claims.py --limit 100
|
|
|
|
# Start from a specific entry number
|
|
python scripts/batch_extract_web_claims.py --start 0500
|
|
|
|
# Run in background with nohup
|
|
nohup python scripts/batch_extract_web_claims.py > batch_extract.out 2>&1 &
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import signal
|
|
import sys
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, List, Dict, Any
|
|
|
|
import yaml
|
|
|
|
# Add scripts directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
# Import the extraction module.
# These names come from the sibling script extract_html_claims.py (made
# importable by the sys.path.insert above):
#   process_entry          -- does the actual per-entry extraction
#   ENTRIES_DIR / WEB_DIR  -- input YAML dir and archived-HTML root
#   HAS_DEPS               -- whether optional parsing deps (lxml) are present
#   NER_*                  -- LLM/NER validation configuration for logging
try:
    from extract_html_claims import (
        process_entry,
        ENTRIES_DIR,
        WEB_DIR,
        HAS_DEPS,
        NER_ENABLED,
        NER_MODEL,
        NER_CONVENTION_VERSION,
    )
except ImportError as e:
    # Fail fast with a hint: this script is unusable without the extractor.
    print(f"Error importing extract_html_claims: {e}")
    print("Make sure extract_html_claims.py is in the same directory")
    sys.exit(1)
|
|
|
|
# Configure logging: each run writes its own timestamped log file.
# NOTE(review): hard-coded absolute user path; breaks on other machines —
# consider an environment variable or a path relative to the repo.
LOG_DIR = Path('/Users/kempersc/apps/glam/logs')
LOG_DIR.mkdir(exist_ok=True)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_file = LOG_DIR / f'batch_extract_{timestamp}.log'

# Create formatters: the file gets full timestamps + level, the console a
# shorter time-only prefix.
file_formatter = logging.Formatter(
    '%(asctime)s | %(levelname)-8s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
console_formatter = logging.Formatter(
    '%(asctime)s | %(message)s',
    datefmt='%H:%M:%S'
)

# File handler (detailed): DEBUG level captures per-entry skip/error detail.
file_handler = logging.FileHandler(log_file, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(file_formatter)

# Console handler (summary): INFO and above only.
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(console_formatter)

# Script logger: set to DEBUG so the handlers above decide verbosity.
logger = logging.getLogger('batch_extract')
logger.setLevel(logging.DEBUG)
logger.addHandler(file_handler)
logger.addHandler(console_handler)

# Statistics tracking; mutated in place by run_batch().
stats: Dict[str, Any] = {
    'total_entries': 0,
    'processed': 0,
    'skipped': 0,       # includes both already-processed and no-HTML entries
    'failed': 0,
    'claims_extracted': 0,
    'claims_validated': 0,
    'claims_rejected': 0,
    'start_time': None,  # aware UTC datetime, set when the batch starts
    'end_time': None,    # aware UTC datetime, set when the batch finishes
}

# Graceful shutdown flag; flipped by signal_handler() on SIGINT/SIGTERM and
# checked between entries by run_batch().
shutdown_requested = False
|
|
|
|
|
|
def signal_handler(signum, frame):
    """Handle SIGINT/SIGTERM: request a graceful stop; force-exit on repeat."""
    global shutdown_requested
    if not shutdown_requested:
        # First signal: let the current entry finish, then stop the loop.
        logger.warning("\nShutdown requested. Finishing current entry...")
        logger.warning("Press Ctrl+C again to force quit.")
        shutdown_requested = True
        return
    # Second signal: the user wants out right now.
    logger.warning("Force quit requested, exiting immediately...")
    sys.exit(1)
|
|
|
|
|
|
def get_entry_files() -> List[Path]:
    """Return entry YAML files with a numeric prefix, sorted by that prefix.

    Filenames look like ``0022_Q123.yaml``; hidden files and files without a
    leading numeric id are ignored.
    """
    def _entry_num(path: Path) -> str:
        # Zero-padded numeric prefix of the filename (e.g. "0022").
        return path.stem.split('_')[0]

    candidates = [
        p for p in ENTRIES_DIR.glob('*.yaml')
        if not p.name.startswith('.') and _entry_num(p).isdigit()
    ]
    # String sort is correct because the prefixes are zero-padded.
    return sorted(candidates, key=_entry_num)
|
|
|
|
|
|
def has_web_claims(entry_file: Path) -> bool:
    """Return True when the entry's YAML already contains a 'web_claims' key.

    Best-effort: unreadable or malformed files are treated as not yet
    processed (False) rather than raising.
    """
    try:
        with open(entry_file, 'r', encoding='utf-8') as fh:
            parsed = yaml.safe_load(fh)
    except Exception:
        return False
    return isinstance(parsed, dict) and bool(parsed) and 'web_claims' in parsed
|
|
|
|
|
|
def has_html_files(entry_file: Path) -> bool:
    """Check if entry has archived HTML files to process.

    Args:
        entry_file: Entry YAML path whose numeric prefix (e.g.
            ``0022_Q123.yaml`` -> ``0022``) names the web archive directory.

    Returns:
        True if ``WEB_DIR/<entry_num>`` exists and contains at least one
        ``.html`` file anywhere beneath it.
    """
    # Extract entry number from filename (e.g., 0022_Q123.yaml -> 0022)
    entry_num = entry_file.stem.split('_')[0]
    web_entry_dir = WEB_DIR / entry_num
    if not web_entry_dir.exists():
        return False

    # any() stops at the first match instead of materializing every matching
    # path the way list(rglob(...)) did — cheaper for large archives.
    return any(web_entry_dir.rglob('*.html'))
|
|
|
|
|
|
def process_single_entry(
    entry_file: Path,
    force: bool = False,
    verbose: bool = False,
) -> Dict[str, Any]:
    """
    Process a single entry file and return results.

    Returns dict with keys:
    - status: 'processed' | 'skipped' | 'failed' | 'no_html'
    - claims_count: int
    - valid_count: int
    - invalid_count: int
    - error: str or None
    - duration: float (seconds)
    """
    entry_num = entry_file.stem.split('_')[0]
    result = {
        'entry': entry_num,
        'status': 'unknown',
        'claims_count': 0,
        'valid_count': 0,
        'invalid_count': 0,
        'error': None,
        'duration': 0.0,
    }

    started = time.time()

    try:
        # Order matters: has_web_claims() is only consulted when not forcing.
        if not force and has_web_claims(entry_file):
            result['status'] = 'skipped'
        elif not has_html_files(entry_file):
            result['status'] = 'no_html'
        else:
            claims_count, errors = process_entry(
                filepath=entry_file,
                dry_run=False,
            )
            for err in errors:
                # Per-claim issues go to the DEBUG (file) log only.
                logger.debug(f"  Entry {entry_num}: {err}")
            result['status'] = 'processed'
            result['claims_count'] = claims_count
            result['valid_count'] = claims_count  # All claims that made it through are valid
    except Exception as e:
        result['status'] = 'failed'
        result['error'] = str(e)
        logger.error(f"Entry {entry_num} failed: {e}")

    result['duration'] = time.time() - started
    return result
|
|
|
|
|
|
def run_batch(
    force: bool = False,
    limit: Optional[int] = None,
    start: Optional[str] = None,
    verbose: bool = False,
) -> None:
    """Run batch extraction on all entries.

    Args:
        force: Re-extract entries even if they already have web_claims.
        limit: Process at most this many entries (applied after ``start``).
        start: Only process entries whose zero-padded number is >= this value.
        verbose: Forwarded to process_single_entry.

    Side effects: mutates the module-level ``stats`` dict and writes progress
    and a final summary to ``logger``. Stops early (between entries) when
    ``shutdown_requested`` is set by the signal handler.
    """
    global stats, shutdown_requested

    # Banner with the run configuration so log files are self-describing.
    logger.info("=" * 70)
    logger.info("BATCH WEB CLAIMS EXTRACTION")
    logger.info("=" * 70)
    logger.info(f"Log file: {log_file}")
    logger.info(f"Force mode: {force}")
    logger.info(f"LLM validation: {NER_ENABLED}")
    if NER_ENABLED:
        logger.info(f"Model: {NER_MODEL}")
        logger.info(f"Convention: {NER_CONVENTION_VERSION}")
    logger.info("")

    # Get all entries
    entries = get_entry_files()
    stats['total_entries'] = len(entries)

    # Filter by start — plain string comparison is correct because entry
    # numbers are zero-padded.
    if start:
        entries = [e for e in entries if e.stem.split('_')[0] >= start]

    # Apply limit
    if limit:
        entries = entries[:limit]

    logger.info(f"Entries to process: {len(entries)}")
    logger.info("=" * 70)
    logger.info("")

    stats['start_time'] = datetime.now(timezone.utc)

    for i, entry_file in enumerate(entries):
        # Honor Ctrl+C / SIGTERM between entries (flag set by signal_handler).
        if shutdown_requested:
            logger.warning("Shutdown requested, stopping batch processing")
            break

        entry_num = entry_file.stem.split('_')[0]
        progress = f"[{i+1}/{len(entries)}]"

        result = process_single_entry(entry_file, force=force, verbose=verbose)

        # Fold the per-entry outcome into the global stats; skipped and
        # no-HTML entries both count as 'skipped'.
        if result['status'] == 'processed':
            stats['processed'] += 1
            stats['claims_extracted'] += result['claims_count']
            stats['claims_validated'] += result['valid_count']
            stats['claims_rejected'] += result['invalid_count']
            logger.info(
                f"{progress} Entry {entry_num}: "
                f"{result['valid_count']} claims, "
                f"{result['invalid_count']} rejected "
                f"({result['duration']:.1f}s)"
            )
        elif result['status'] == 'skipped':
            stats['skipped'] += 1
            logger.debug(f"{progress} Entry {entry_num}: Skipped (already has web_claims)")
        elif result['status'] == 'no_html':
            stats['skipped'] += 1
            logger.debug(f"{progress} Entry {entry_num}: Skipped (no HTML files)")
        elif result['status'] == 'failed':
            stats['failed'] += 1
            logger.error(f"{progress} Entry {entry_num}: FAILED - {result['error']}")

    stats['end_time'] = datetime.now(timezone.utc)

    # Print summary
    logger.info("")
    logger.info("=" * 70)
    logger.info("BATCH COMPLETE")
    logger.info("=" * 70)

    duration = (stats['end_time'] - stats['start_time']).total_seconds()
    hours, remainder = divmod(int(duration), 3600)
    minutes, seconds = divmod(remainder, 60)

    logger.info(f"Total entries: {stats['total_entries']}")
    logger.info(f"Processed: {stats['processed']}")
    logger.info(f"Skipped: {stats['skipped']}")
    logger.info(f"Failed: {stats['failed']}")
    logger.info(f"Claims extracted: {stats['claims_extracted']}")
    logger.info(f"Claims validated: {stats['claims_validated']}")
    logger.info(f"Claims rejected: {stats['claims_rejected']}")
    logger.info(f"Duration: {hours}h {minutes}m {seconds}s")
    # max(..., 1) avoids ZeroDivisionError when nothing was processed.
    logger.info(f"Avg per entry: {duration/max(stats['processed'],1):.1f}s")
    logger.info("")
    logger.info(f"Log file: {log_file}")
    logger.info("=" * 70)
|
|
|
|
|
|
def main():
    """Parse CLI arguments, verify dependencies, and launch the batch run."""
    parser = argparse.ArgumentParser(
        description='Batch extract web claims from archived HTML files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python scripts/batch_extract_web_claims.py
  python scripts/batch_extract_web_claims.py --force
  python scripts/batch_extract_web_claims.py --limit 100
  python scripts/batch_extract_web_claims.py --start 0500 --limit 50

Background execution:
  nohup python scripts/batch_extract_web_claims.py > batch.out 2>&1 &
"""
    )
    # Table-driven option registration keeps flags/help in one place.
    for flags, opts in (
        (('--force', '-f'),
         dict(action='store_true',
              help='Force re-extraction even if web_claims already exist')),
        (('--limit', '-l'),
         dict(type=int,
              help='Limit number of entries to process')),
        (('--start', '-s'),
         dict(type=str,
              help='Start from entry number (e.g., 0500)')),
        (('--verbose', '-v'),
         dict(action='store_true',
              help='Enable verbose output for each entry')),
    ):
        parser.add_argument(*flags, **opts)

    args = parser.parse_args()

    # Check dependencies before doing any work.
    if not HAS_DEPS:
        logger.error("Missing required dependencies (lxml)")
        logger.error("Install with: pip install lxml beautifulsoup4")
        sys.exit(1)

    # Register signal handlers for graceful interruption.
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, signal_handler)

    # Run batch
    try:
        run_batch(
            force=args.force,
            limit=args.limit,
            start=args.start,
            verbose=args.verbose,
        )
    except Exception as exc:
        logger.exception(f"Batch processing failed: {exc}")
        sys.exit(1)
|
|
|
|
|
|
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
    main()
|