# glam/scripts/batch_extract_web_annotations.py
# 2025-12-05 15:30:23 +01:00
# 517 lines, 18 KiB, Python
#!/usr/bin/env python3
"""
Batch Web Annotation Extractor using GLAM-NER v1.7.0.
Extracts annotations with claims and provenance from archived web pages
for all entries in data/nde/enriched/entries/web/.
Uses the LLM annotator with the updated GLAM-NER v1.7.0 system prompt
that includes new entity subcategories for TMP, ROL, WRK, THG, APP.
Features:
- Processes all web entries with archived HTML pages
- Generates claims with XPath provenance
- Supports parallel processing with rate limiting
- Saves annotations to YAML files alongside existing metadata
- Generates summary reports
Usage:
python scripts/batch_extract_web_annotations.py --test # Test on 5 entries
python scripts/batch_extract_web_annotations.py --batch 100 # Process 100 entries
python scripts/batch_extract_web_annotations.py --all # Process all entries
python scripts/batch_extract_web_annotations.py --resume # Resume from last checkpoint
"""
import argparse
import asyncio
import json
import logging
import os
import sys
import traceback
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import yaml
# Environment bootstrap: pull variables from the project-level .env when
# python-dotenv is available; otherwise rely on the process environment.
try:
    from dotenv import load_dotenv

    env_path = Path(__file__).parent.parent / ".env"
    if env_path.exists():
        load_dotenv(env_path)
except ImportError:
    # python-dotenv not installed — fall back to os.environ as-is.
    pass

# Make the project's src/ directory importable before package imports below.
src_path = Path(__file__).parent.parent / "src"
sys.path.insert(0, str(src_path))
# Import the annotator stack from the project package. On failure, print
# diagnostic context (where we looked, current sys.path head) before
# re-raising, so a misconfigured checkout is easy to debug.
try:
    from glam_extractor.annotators.llm_annotator import (
        LLMAnnotator,
        LLMAnnotatorConfig,
        LLMProvider,
        RetryConfig,
        create_llm_annotator,
    )
    from glam_extractor.annotators.base import AnnotationSession
    from glam_extractor.annotators.uncertainty import UncertaintyDetector
except ImportError as e:
    print(f"Import error: {e}")
    print(f"Looking in: {src_path}")
    print(f"sys.path: {sys.path[:3]}")
    raise
# Configure logging: messages go both to the console and to
# batch_extraction.log in the current working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('batch_extraction.log'),
    ]
)
logger = logging.getLogger(__name__)
@dataclass
class ExtractionResult:
    """Result of processing a single entry."""
    # Entry identifier, e.g. "0141/airbornemuseum.nl" (entry id + domain).
    entry_id: str
    # Name of the domain directory the entry was read from.
    domain: str
    # True only when annotation ran and the output YAML was written.
    success: bool
    # Counts taken from the annotation session's claim lists.
    entity_count: int = 0
    claim_count: int = 0
    relationship_count: int = 0
    layout_count: int = 0
    # Error messages (including tracebacks) accumulated during processing.
    errors: List[str] = field(default_factory=list)
    # Wall-clock time spent on this entry, in seconds.
    processing_time_seconds: float = 0.0
    # Path of the written annotations YAML file, when successful.
    output_file: Optional[str] = None
@dataclass
class BatchReport:
    """Summary report for batch processing."""
    # ISO-8601 UTC timestamps for the run boundaries.
    started_at: str
    completed_at: Optional[str] = None
    # Entry accounting: total found vs. processed / succeeded / failed,
    # plus entries skipped because a checkpoint already recorded them.
    total_entries: int = 0
    processed_entries: int = 0
    successful_entries: int = 0
    failed_entries: int = 0
    skipped_entries: int = 0
    # Aggregate annotation counts summed over successful entries.
    total_entities: int = 0
    total_claims: int = 0
    total_relationships: int = 0
    total_layout_regions: int = 0
    # Mean per-entry processing time in seconds.
    avg_processing_time: float = 0.0
    # One {'entry_id': ..., 'errors': [...]} record per failed entry.
    errors: List[Dict[str, Any]] = field(default_factory=list)
    # Full per-entry results (omitted from the saved summary report).
    results: List[ExtractionResult] = field(default_factory=list)
def find_web_entries(base_path: Path) -> List[Tuple[str, Path]]:
    """Locate web entry directories that contain archived HTML pages.

    Entries are laid out as web/{entry_id}/{domain}/pages/*.html; a domain
    qualifies only if its pages/ directory holds at least one .html file.

    Returns:
        List of ("entry_id/domain", domain_path) tuples, ordered by entry id.
    """
    found: List[Tuple[str, Path]] = []
    for entry_dir in sorted(base_path.iterdir()):
        if not entry_dir.is_dir():
            continue
        for domain_dir in entry_dir.iterdir():
            if not domain_dir.is_dir():
                continue
            pages_dir = domain_dir / "pages"
            # any() short-circuits on the first .html hit.
            if pages_dir.exists() and any(pages_dir.glob("*.html")):
                found.append((f"{entry_dir.name}/{domain_dir.name}", domain_dir))
    return found
def load_existing_metadata(entry_path: Path) -> Optional[Dict[str, Any]]:
    """Return the parsed metadata.yaml for *entry_path*, or None if absent."""
    metadata_file = entry_path / "metadata.yaml"
    if not metadata_file.exists():
        return None
    with open(metadata_file, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
def get_html_content(entry_path: Path) -> Tuple[Optional[str], Optional[str]]:
    """Read the archived HTML for an entry.

    Prefers pages/index.html; otherwise falls back to the first *.html file
    found in the pages/ directory.

    Returns:
        Tuple of (html_content, html_file_path), or (None, None) when no
        HTML exists or the chosen file cannot be read.
    """
    pages_dir = entry_path / "pages"
    if not pages_dir.exists():
        return None, None
    html_files = list(pages_dir.glob("*.html"))
    if not html_files:
        return None, None
    candidate = pages_dir / "index.html"
    if not candidate.exists():
        candidate = html_files[0]
    try:
        # errors='replace' keeps malformed archives readable instead of raising.
        with open(candidate, 'r', encoding='utf-8', errors='replace') as fh:
            return fh.read(), str(candidate)
    except Exception as e:
        logger.error(f"Failed to read {candidate}: {e}")
        return None, None
def session_to_dict(session: AnnotationSession) -> Dict[str, Any]:
    """Serialize *session* by delegating to its own to_dict() method."""
    return session.to_dict()
async def process_entry(
    entry_id: str,
    entry_path: Path,
    annotator: LLMAnnotator,
    uncertainty_detector: UncertaintyDetector,
    output_dir: Path,
) -> ExtractionResult:
    """
    Process a single web entry and extract annotations.

    Args:
        entry_id: Entry identifier (e.g., "0141/airbornemuseum.nl")
        entry_path: Path to entry directory
        annotator: LLM annotator instance
        uncertainty_detector: Uncertainty/hedging detector
        output_dir: Accepted for interface compatibility; output is actually
            written next to the entry's metadata (see output_file below).

    Returns:
        ExtractionResult with processing status. processing_time_seconds is
        now populated on every exit path (previously the early "no HTML"
        return left it at 0.0).
    """
    import time
    start_time = time.time()
    result = ExtractionResult(
        entry_id=entry_id,
        domain=entry_path.name,
        success=False,
    )
    try:
        # Existing metadata (if any) supplies the original source URL.
        metadata = load_existing_metadata(entry_path)
        source_url = metadata.get('url') if metadata else None

        html_content, html_file = get_html_content(entry_path)
        if not html_content:
            result.errors.append("No HTML content found")
            return result

        # Run LLM annotation over the raw archived HTML.
        logger.info(f"Processing {entry_id}...")
        session = await annotator.annotate(
            document=html_content,
            source_url=source_url,
        )

        # Count results from the session's claim lists.
        result.entity_count = len(session.entity_claims)
        result.claim_count = len(session.aggregate_claims)
        result.relationship_count = len(session.relationship_claims)
        result.layout_count = len(session.layout_claims)

        # Re-score aggregate-claim provenance confidence based on
        # hedging/uncertainty cues detected in the claim text.
        for claim in session.aggregate_claims:
            if claim.claim_value:
                text = str(claim.claim_value)
                confidence = uncertainty_detector.get_claim_confidence(text)
                if claim.provenance:
                    claim.provenance.confidence = confidence

        # Prepare serializable output payload.
        output_data = {
            'extraction_version': 'GLAM-NER v1.7.0',
            'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
            'source_url': source_url,
            'html_file': html_file,
            'session': session_to_dict(session),
            'summary': {
                'entity_count': result.entity_count,
                'claim_count': result.claim_count,
                'relationship_count': result.relationship_count,
                'layout_count': result.layout_count,
                'errors': session.errors,
            }
        }

        # Save annotations alongside the entry's existing metadata.
        output_file = entry_path / "annotations_v1.7.0.yaml"
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(output_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        result.output_file = str(output_file)
        result.success = True
        result.errors = session.errors
    except Exception as e:
        result.errors.append(f"Processing failed: {str(e)}")
        result.errors.append(traceback.format_exc())
        logger.error(f"Failed to process {entry_id}: {e}")
    finally:
        # Fix: record elapsed time on every exit path — the early return for
        # missing HTML previously skipped this assignment entirely.
        result.processing_time_seconds = time.time() - start_time
    return result
async def process_batch(
    entries: List[Tuple[str, Path]],
    annotator: LLMAnnotator,
    uncertainty_detector: UncertaintyDetector,
    output_dir: Path,
    concurrency: int = 3,
    checkpoint_interval: int = 10,
    checkpoint_file: Optional[Path] = None,
) -> BatchReport:
    """
    Process a batch of entries with bounded concurrency and rate limiting.

    Fixes over the previous revision:
    - Entries are dispatched in waves of ``concurrency`` via asyncio.gather,
      so requests actually overlap (previously each entry was awaited
      sequentially, making the semaphore and ``concurrency`` ineffective).
    - Every processed entry id is recorded for resume, not only the ids that
      landed exactly on a checkpoint boundary, and a final checkpoint is
      flushed at the end of the run so a partial last interval is not lost.

    Args:
        entries: List of (entry_id, entry_path) tuples
        annotator: LLM annotator
        uncertainty_detector: Uncertainty detector
        output_dir: Output directory (forwarded to process_entry)
        concurrency: Max concurrent requests
        checkpoint_interval: Save checkpoint every N entries
        checkpoint_file: File to save progress checkpoints (None disables)

    Returns:
        BatchReport with summary
    """
    report = BatchReport(
        started_at=datetime.now(timezone.utc).isoformat(),
        total_entries=len(entries),
    )

    # Resume support: skip anything recorded in an existing checkpoint.
    processed_ids = set()
    if checkpoint_file and checkpoint_file.exists():
        with open(checkpoint_file, 'r') as f:
            checkpoint = json.load(f)
        processed_ids = set(checkpoint.get('processed_ids', []))
        logger.info(f"Loaded checkpoint with {len(processed_ids)} processed entries")

    remaining_entries = [
        (eid, path) for eid, path in entries
        if eid not in processed_ids
    ]
    report.skipped_entries = len(processed_ids)
    logger.info(f"Processing {len(remaining_entries)} entries ({len(processed_ids)} already done)")

    semaphore = asyncio.Semaphore(concurrency)

    async def process_with_semaphore(entry_id: str, entry_path: Path):
        # The semaphore bounds in-flight requests; the short sleep spaces
        # them out as a crude rate limit.
        async with semaphore:
            await asyncio.sleep(0.5)
            return await process_entry(
                entry_id=entry_id,
                entry_path=entry_path,
                annotator=annotator,
                uncertainty_detector=uncertainty_detector,
                output_dir=output_dir,
            )

    def write_checkpoint():
        # Persist resume state; processed_ids includes failures on purpose
        # so a resumed run does not retry known-bad entries.
        checkpoint_data = {
            'processed_ids': list(processed_ids),
            'last_updated': datetime.now(timezone.utc).isoformat(),
            'processed_count': report.processed_entries,
            'successful_count': report.successful_entries,
        }
        with open(checkpoint_file, 'w') as f:
            json.dump(checkpoint_data, f)

    total_time = 0.0
    done = 0
    # Dispatch in waves so up to `concurrency` requests run at once.
    for start in range(0, len(remaining_entries), concurrency):
        wave = remaining_entries[start:start + concurrency]
        results = await asyncio.gather(
            *(process_with_semaphore(eid, path) for eid, path in wave)
        )
        for (entry_id, _), result in zip(wave, results):
            done += 1
            report.processed_entries += 1
            report.results.append(result)
            if result.success:
                report.successful_entries += 1
                report.total_entities += result.entity_count
                report.total_claims += result.claim_count
                report.total_relationships += result.relationship_count
                report.total_layout_regions += result.layout_count
            else:
                report.failed_entries += 1
                report.errors.append({
                    'entry_id': entry_id,
                    'errors': result.errors,
                })
            total_time += result.processing_time_seconds
            # Fix: record EVERY entry for resume, not only the entries that
            # happened to fall on a checkpoint boundary.
            processed_ids.add(entry_id)

            # Progress logging
            if done % 10 == 0:
                pct = (done / len(remaining_entries)) * 100
                logger.info(
                    f"Progress: {done}/{len(remaining_entries)} ({pct:.1f}%) - "
                    f"Success: {report.successful_entries}, Failed: {report.failed_entries}"
                )
            # Periodic checkpoint
            if checkpoint_file and done % checkpoint_interval == 0:
                write_checkpoint()
                logger.info(f"Saved checkpoint at {done} entries")

    # Final flush so entries after the last full interval are recorded.
    if checkpoint_file and done > 0:
        write_checkpoint()

    # Final stats
    if report.processed_entries > 0:
        report.avg_processing_time = total_time / report.processed_entries
    report.completed_at = datetime.now(timezone.utc).isoformat()
    return report
def save_report(report: BatchReport, output_path: Path):
    """Write a summary of *report* to *output_path* as YAML.

    Per-entry results are deliberately omitted and the error list is capped
    at 100 entries to keep the summary file small.
    """
    summary_fields = (
        'started_at',
        'completed_at',
        'total_entries',
        'processed_entries',
        'successful_entries',
        'failed_entries',
        'skipped_entries',
        'total_entities',
        'total_claims',
        'total_relationships',
        'total_layout_regions',
    )
    report_dict = {name: getattr(report, name) for name in summary_fields}
    report_dict['avg_processing_time_seconds'] = report.avg_processing_time
    report_dict['errors'] = report.errors[:100]  # Limit to first 100 errors
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(report_dict, f, default_flow_style=False, allow_unicode=True)
    logger.info(f"Saved report to {output_path}")
async def main():
    """CLI driver: parse arguments, scan entries, run the batch, and report."""
    parser = argparse.ArgumentParser(
        description='Batch extract web annotations using GLAM-NER v1.7.0'
    )
    parser.add_argument('--test', action='store_true', help='Test on 5 entries')
    parser.add_argument('--batch', type=int, default=0, help='Process N entries')
    parser.add_argument('--all', action='store_true', help='Process all entries')
    parser.add_argument('--resume', action='store_true', help='Resume from checkpoint')
    parser.add_argument('--concurrency', type=int, default=2, help='Max concurrent requests')
    parser.add_argument('--provider', default='zai', choices=['zai', 'anthropic', 'openai'])
    parser.add_argument('--start-offset', type=int, default=0, help='Start from entry N')
    parser.add_argument('--output-dir', type=str, default='data/nde/enriched/entries/web')
    args = parser.parse_args()
    # Paths, resolved relative to the project root (parent of scripts/).
    # NOTE(review): an absolute --output-dir replaces the project root due to
    # pathlib's "/" semantics — confirm whether that is intended.
    base_path = Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries" / "web"
    output_dir = Path(__file__).parent.parent / args.output_dir
    checkpoint_file = Path(__file__).parent.parent / "data" / "extraction_checkpoint.json"
    report_dir = Path(__file__).parent.parent / "reports"
    report_dir.mkdir(exist_ok=True)
    # Find entries
    logger.info("Scanning for web entries...")
    entries = find_web_entries(base_path)
    logger.info(f"Found {len(entries)} web entries with HTML content")
    # Apply offset (before any batch-size limiting below).
    if args.start_offset > 0:
        entries = entries[args.start_offset:]
        logger.info(f"Starting from offset {args.start_offset}, {len(entries)} remaining")
    # Determine batch size: --test and --batch trim the list; --all keeps it;
    # anything else is a usage error.
    if args.test:
        entries = entries[:5]
        logger.info("Test mode: processing 5 entries")
    elif args.batch > 0:
        entries = entries[:args.batch]
        logger.info(f"Batch mode: processing {len(entries)} entries")
    elif not args.all:
        logger.error("Please specify --test, --batch N, or --all")
        sys.exit(1)
    # Initialize annotator; requires an API key for the chosen provider.
    logger.info(f"Initializing LLM annotator with provider: {args.provider}")
    try:
        annotator = create_llm_annotator(
            provider=args.provider,
            enable_fallback=True,
            max_retries=5,
        )
    except ValueError as e:
        logger.error(f"Failed to initialize annotator: {e}")
        logger.error("Make sure ZAI_API_TOKEN, ANTHROPIC_API_KEY, or OPENAI_API_KEY is set")
        sys.exit(1)
    # Initialize uncertainty detector
    uncertainty_detector = UncertaintyDetector()
    # Process entries; checkpointing is enabled only for --resume/--all runs.
    report = await process_batch(
        entries=entries,
        annotator=annotator,
        uncertainty_detector=uncertainty_detector,
        output_dir=output_dir,
        concurrency=args.concurrency,
        checkpoint_interval=10,
        checkpoint_file=checkpoint_file if args.resume or args.all else None,
    )
    # Save report (timestamped filename; local time, matching log output).
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_file = report_dir / f"web_extraction_report_{timestamp}.yaml"
    save_report(report, report_file)
    # Summary
    logger.info("=" * 60)
    logger.info("EXTRACTION COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total entries: {report.total_entries}")
    logger.info(f"Processed: {report.processed_entries}")
    logger.info(f"Successful: {report.successful_entries}")
    logger.info(f"Failed: {report.failed_entries}")
    logger.info(f"Skipped (checkpoint):{report.skipped_entries}")
    logger.info("-" * 60)
    logger.info(f"Total entities: {report.total_entities}")
    logger.info(f"Total claims: {report.total_claims}")
    logger.info(f"Total relationships: {report.total_relationships}")
    logger.info(f"Total layout regions:{report.total_layout_regions}")
    logger.info(f"Avg processing time: {report.avg_processing_time:.2f}s")
    logger.info("-" * 60)
    logger.info(f"Report saved to: {report_file}")
if __name__ == '__main__':
    # Script entry point: run the async driver to completion.
    asyncio.run(main())