glam/scripts/batch_extract_web_claims.py
2025-12-03 17:38:46 +01:00

385 lines
11 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Batch runner for web claims extraction with logging and progress tracking.
This script processes all entries in the NDE enriched entries directory,
extracting web claims from archived HTML files.
Features:
- Progress logging to file and console
- --force flag to re-extract already processed entries
- Resume capability (skips already processed entries by default)
- Detailed statistics and timing
- Safe interruption handling
Usage:
# Process all unprocessed entries
python scripts/batch_extract_web_claims.py
# Force re-extraction of all entries
python scripts/batch_extract_web_claims.py --force
# Limit to first N entries
python scripts/batch_extract_web_claims.py --limit 100
# Start from a specific entry number
python scripts/batch_extract_web_claims.py --start 0500
# Run in background with nohup
nohup python scripts/batch_extract_web_claims.py > batch_extract.out 2>&1 &
"""
import argparse
import logging
import os
import signal
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List, Dict, Any

import yaml

# Add scripts directory to path for imports, so extract_html_claims can be
# imported regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent))

# Import the extraction module; fail fast with a readable message if the
# sibling script is missing (a plain ImportError traceback is less helpful).
try:
    from extract_html_claims import (
        process_entry,
        ENTRIES_DIR,
        WEB_DIR,
        HAS_DEPS,
        NER_ENABLED,
        NER_MODEL,
        NER_CONVENTION_VERSION,
    )
except ImportError as e:
    print(f"Error importing extract_html_claims: {e}")
    print("Make sure extract_html_claims.py is in the same directory")
    sys.exit(1)
# Configure logging
# NOTE(review): hard-coded absolute user path — confirm this is intentional
# and not meant to be derived from the repo root.
LOG_DIR = Path('/Users/kempersc/apps/glam/logs')
LOG_DIR.mkdir(exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_file = LOG_DIR / f'batch_extract_{timestamp}.log'

# Create formatters: the file log keeps the level column, the console is terser.
file_formatter = logging.Formatter(
    '%(asctime)s | %(levelname)-8s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
console_formatter = logging.Formatter(
    '%(asctime)s | %(message)s',
    datefmt='%H:%M:%S'
)

# File handler (detailed): captures DEBUG and above.
file_handler = logging.FileHandler(log_file, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(file_formatter)

# Console handler (summary): INFO and above only, so per-entry skip messages
# (logged at DEBUG) stay out of the terminal.
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(console_formatter)

# Named logger used throughout this script (not the root logger).
logger = logging.getLogger('batch_extract')
logger.setLevel(logging.DEBUG)
logger.addHandler(file_handler)
logger.addHandler(console_handler)

# Statistics tracking; mutated in place by run_batch().
stats = {
    'total_entries': 0,
    'processed': 0,
    'skipped': 0,
    'failed': 0,
    'claims_extracted': 0,
    'claims_validated': 0,
    'claims_rejected': 0,
    'start_time': None,
    'end_time': None,
}

# Graceful shutdown flag, set by signal_handler and polled by run_batch.
shutdown_requested = False
def signal_handler(signum, frame):
    """Translate SIGINT/SIGTERM into a graceful-stop request.

    The first signal sets the module-level ``shutdown_requested`` flag so
    the batch loop can finish the entry currently in flight; a second
    signal exits the process immediately.
    """
    global shutdown_requested
    if not shutdown_requested:
        # First signal: ask the batch loop to stop after the current entry.
        logger.warning("\nShutdown requested. Finishing current entry...")
        logger.warning("Press Ctrl+C again to force quit.")
        shutdown_requested = True
        return
    # Second signal: hard exit.
    logger.warning("Force quit requested, exiting immediately...")
    sys.exit(1)
def get_entry_files() -> List[Path]:
    """Return all entry YAML files in ENTRIES_DIR, sorted by entry number.

    Only non-hidden files whose stem starts with a numeric prefix before
    the first underscore (e.g. ``0022_Q123.yaml`` -> ``0022``) are
    included. Sorting is numeric rather than lexicographic so mixed-width
    prefixes (e.g. ``0999`` vs ``10000``) still order correctly; for
    uniformly zero-padded names the order is identical to a string sort.
    """
    entries = []
    for entry_file in ENTRIES_DIR.glob('*.yaml'):
        if entry_file.name.startswith('.'):
            continue
        # The numeric prefix before the first underscore identifies an entry.
        prefix = entry_file.stem.split('_')[0]
        if prefix.isdigit():
            entries.append(entry_file)
    return sorted(entries, key=lambda x: int(x.stem.split('_')[0]))
def has_web_claims(entry_file: Path) -> bool:
    """Return True if the entry YAML already contains a ``web_claims`` key.

    Any read or parse failure is deliberately treated as "not yet
    processed" so the batch will simply attempt extraction again.
    """
    try:
        with entry_file.open('r', encoding='utf-8') as fh:
            parsed = yaml.safe_load(fh)
    except Exception:
        return False
    return isinstance(parsed, dict) and 'web_claims' in parsed
def has_html_files(entry_file: Path) -> bool:
    """Return True if the entry has at least one archived HTML file.

    Looks under ``WEB_DIR/<entry_num>``, where ``<entry_num>`` is the
    numeric prefix of the entry filename (e.g. ``0022_Q123.yaml`` -> 0022).
    """
    entry_num = entry_file.stem.split('_')[0]
    web_entry_dir = WEB_DIR / entry_num
    if not web_entry_dir.exists():
        return False
    # Stop at the first match instead of materializing every HTML path --
    # rglob is a generator, so this short-circuits on large archives.
    return next(iter(web_entry_dir.rglob('*.html')), None) is not None
def process_single_entry(
    entry_file: Path,
    force: bool = False,
    verbose: bool = False,
) -> Dict[str, Any]:
    """Process a single entry file and return a result record.

    Returns a dict with keys:
    - entry: zero-padded entry number (from the filename prefix)
    - status: 'processed' | 'skipped' | 'failed' | 'no_html'
    - claims_count: int
    - valid_count: int
    - invalid_count: int
    - error: str or None
    - duration: float (seconds)
    """
    entry_num = entry_file.stem.split('_')[0]
    started = time.time()
    result: Dict[str, Any] = {
        'entry': entry_num,
        'status': 'unknown',
        'claims_count': 0,
        'valid_count': 0,
        'invalid_count': 0,
        'error': None,
        'duration': 0.0,
    }
    try:
        # Short-circuit keeps the YAML read from happening in force mode.
        if not force and has_web_claims(entry_file):
            result['status'] = 'skipped'
        elif not has_html_files(entry_file):
            result['status'] = 'no_html'
        else:
            claims_count, errors = process_entry(
                filepath=entry_file,
                dry_run=False,
            )
            # Per-claim errors are non-fatal; record them at DEBUG level.
            for err in errors or ():
                logger.debug(f" Entry {entry_num}: {err}")
            result['status'] = 'processed'
            result['claims_count'] = claims_count
            # All claims that made it through are valid.
            result['valid_count'] = claims_count
    except Exception as exc:
        result['status'] = 'failed'
        result['error'] = str(exc)
        logger.error(f"Entry {entry_num} failed: {exc}")
    finally:
        result['duration'] = time.time() - started
    return result
def run_batch(
    force: bool = False,
    limit: Optional[int] = None,
    start: Optional[str] = None,
    verbose: bool = False,
) -> None:
    """Run batch extraction on all entries and log a final summary.

    Args:
        force: Re-extract entries even if they already have ``web_claims``.
        limit: Process at most this many entries (falsy means no limit).
        start: Skip entries whose number sorts before this string
            (e.g. ``"0500"``). Comparison is lexicographic, which assumes
            uniformly zero-padded entry numbers.
        verbose: Passed through to ``process_single_entry``.

    Mutates the module-level ``stats`` dict and honors the
    ``shutdown_requested`` flag set by ``signal_handler``.
    """
    global stats, shutdown_requested
    logger.info("=" * 70)
    logger.info("BATCH WEB CLAIMS EXTRACTION")
    logger.info("=" * 70)
    logger.info(f"Log file: {log_file}")
    logger.info(f"Force mode: {force}")
    logger.info(f"LLM validation: {NER_ENABLED}")
    if NER_ENABLED:
        logger.info(f"Model: {NER_MODEL}")
        logger.info(f"Convention: {NER_CONVENTION_VERSION}")
    logger.info("")
    # Get all entries (sorted by entry number)
    entries = get_entry_files()
    stats['total_entries'] = len(entries)
    # Filter by start (lexicographic string compare on the numeric prefix)
    if start:
        entries = [e for e in entries if e.stem.split('_')[0] >= start]
    # Apply limit
    if limit:
        entries = entries[:limit]
    logger.info(f"Entries to process: {len(entries)}")
    logger.info("=" * 70)
    logger.info("")
    stats['start_time'] = datetime.now(timezone.utc)
    for i, entry_file in enumerate(entries):
        # Stop between entries when Ctrl+C / SIGTERM was received.
        if shutdown_requested:
            logger.warning("Shutdown requested, stopping batch processing")
            break
        entry_num = entry_file.stem.split('_')[0]
        progress = f"[{i+1}/{len(entries)}]"
        result = process_single_entry(entry_file, force=force, verbose=verbose)
        if result['status'] == 'processed':
            stats['processed'] += 1
            stats['claims_extracted'] += result['claims_count']
            stats['claims_validated'] += result['valid_count']
            stats['claims_rejected'] += result['invalid_count']
            logger.info(
                f"{progress} Entry {entry_num}: "
                f"{result['valid_count']} claims, "
                f"{result['invalid_count']} rejected "
                f"({result['duration']:.1f}s)"
            )
        elif result['status'] == 'skipped':
            # DEBUG level: visible in the log file, not on the console.
            stats['skipped'] += 1
            logger.debug(f"{progress} Entry {entry_num}: Skipped (already has web_claims)")
        elif result['status'] == 'no_html':
            stats['skipped'] += 1
            logger.debug(f"{progress} Entry {entry_num}: Skipped (no HTML files)")
        elif result['status'] == 'failed':
            stats['failed'] += 1
            logger.error(f"{progress} Entry {entry_num}: FAILED - {result['error']}")
    stats['end_time'] = datetime.now(timezone.utc)
    # Print summary
    logger.info("")
    logger.info("=" * 70)
    logger.info("BATCH COMPLETE")
    logger.info("=" * 70)
    duration = (stats['end_time'] - stats['start_time']).total_seconds()
    hours, remainder = divmod(int(duration), 3600)
    minutes, seconds = divmod(remainder, 60)
    logger.info(f"Total entries: {stats['total_entries']}")
    logger.info(f"Processed: {stats['processed']}")
    logger.info(f"Skipped: {stats['skipped']}")
    logger.info(f"Failed: {stats['failed']}")
    logger.info(f"Claims extracted: {stats['claims_extracted']}")
    logger.info(f"Claims validated: {stats['claims_validated']}")
    logger.info(f"Claims rejected: {stats['claims_rejected']}")
    logger.info(f"Duration: {hours}h {minutes}m {seconds}s")
    # max(...) guards against division by zero when nothing was processed.
    logger.info(f"Avg per entry: {duration/max(stats['processed'],1):.1f}s")
    logger.info("")
    logger.info(f"Log file: {log_file}")
    logger.info("=" * 70)
def main():
    """Main entry point: parse CLI arguments, verify dependencies,
    install signal handlers, and run the batch."""
    parser = argparse.ArgumentParser(
        description='Batch extract web claims from archived HTML files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python scripts/batch_extract_web_claims.py
python scripts/batch_extract_web_claims.py --force
python scripts/batch_extract_web_claims.py --limit 100
python scripts/batch_extract_web_claims.py --start 0500 --limit 50
Background execution:
nohup python scripts/batch_extract_web_claims.py > batch.out 2>&1 &
"""
    )
    parser.add_argument(
        '--force', '-f',
        action='store_true',
        help='Force re-extraction even if web_claims already exist'
    )
    parser.add_argument(
        '--limit', '-l',
        type=int,
        help='Limit number of entries to process'
    )
    parser.add_argument(
        '--start', '-s',
        type=str,
        help='Start from entry number (e.g., 0500)'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose output for each entry'
    )
    args = parser.parse_args()
    # Check dependencies (HAS_DEPS is exported by extract_html_claims)
    if not HAS_DEPS:
        logger.error("Missing required dependencies (lxml)")
        logger.error("Install with: pip install lxml beautifulsoup4")
        sys.exit(1)
    # Register signal handlers for graceful shutdown (Ctrl+C / SIGTERM)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    # Run batch; log the full traceback on any unexpected failure
    try:
        run_batch(
            force=args.force,
            limit=args.limit,
            start=args.start,
            verbose=args.verbose,
        )
    except Exception as e:
        logger.exception(f"Batch processing failed: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()