# glam/scripts/batch_extract_web_annotations.py
# 2025-12-05 15:30:23 +01:00
# 517 lines, 18 KiB, Python
#!/usr/bin/env python3
"""
Batch Web Annotation Extractor using GLAM-NER v1.7.0.
Extracts annotations with claims and provenance from archived web pages
for all entries in data/nde/enriched/entries/web/.
Uses the LLM annotator with the updated GLAM-NER v1.7.0 system prompt
that includes new entity subcategories for TMP, ROL, WRK, THG, APP.
Features:
- Processes all web entries with archived HTML pages
- Generates claims with XPath provenance
- Supports parallel processing with rate limiting
- Saves annotations to YAML files alongside existing metadata
- Generates summary reports
Usage:
python scripts/batch_extract_web_annotations.py --test # Test on 5 entries
python scripts/batch_extract_web_annotations.py --batch 100 # Process 100 entries
python scripts/batch_extract_web_annotations.py --all # Process all entries
python scripts/batch_extract_web_annotations.py --resume # Resume from last checkpoint
"""
import argparse
import asyncio
import json
import logging
import os
import sys
import traceback
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import yaml
# Environment bootstrap: pull variables from the project-level .env when
# python-dotenv is available; otherwise rely on the process environment.
try:
    from dotenv import load_dotenv

    env_path = Path(__file__).parent.parent / ".env"
    if env_path.exists():
        load_dotenv(env_path)
except ImportError:
    # python-dotenv not installed — fall back to os.environ as-is.
    pass

# Make the project's src/ directory importable before package imports below.
src_path = Path(__file__).parent.parent / "src"
sys.path.insert(0, str(src_path))
# Import the annotator stack from the project package. On failure, print
# diagnostic context (where we looked, current sys.path head) before
# re-raising, so a misconfigured checkout is easy to debug.
try:
    from glam_extractor.annotators.llm_annotator import (
        LLMAnnotator,
        LLMAnnotatorConfig,
        LLMProvider,
        RetryConfig,
        create_llm_annotator,
    )
    from glam_extractor.annotators.base import AnnotationSession
    from glam_extractor.annotators.uncertainty import UncertaintyDetector
except ImportError as e:
    print(f"Import error: {e}")
    print(f"Looking in: {src_path}")
    print(f"sys.path: {sys.path[:3]}")
    raise
# Configure logging: messages go both to the console and to
# batch_extraction.log in the current working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('batch_extraction.log'),
    ]
)
logger = logging.getLogger(__name__)
@dataclass
class ExtractionResult:
    """Result of processing a single entry."""
    # Entry identifier, e.g. "0141/airbornemuseum.nl" (entry id + domain).
    entry_id: str
    # Name of the domain directory the entry was read from.
    domain: str
    # True only when annotation ran and the output YAML was written.
    success: bool
    # Counts taken from the annotation session's claim lists.
    entity_count: int = 0
    claim_count: int = 0
    relationship_count: int = 0
    layout_count: int = 0
    # Error messages (including tracebacks) accumulated during processing.
    errors: List[str] = field(default_factory=list)
    # Wall-clock time spent on this entry, in seconds.
    processing_time_seconds: float = 0.0
    # Path of the written annotations YAML file, when successful.
    output_file: Optional[str] = None
@dataclass
class BatchReport:
    """Summary report for batch processing."""
    # ISO-8601 UTC timestamps for the run boundaries.
    started_at: str
    completed_at: Optional[str] = None
    # Entry accounting: total found vs. processed / succeeded / failed,
    # plus entries skipped because a checkpoint already recorded them.
    total_entries: int = 0
    processed_entries: int = 0
    successful_entries: int = 0
    failed_entries: int = 0
    skipped_entries: int = 0
    # Aggregate annotation counts summed over successful entries.
    total_entities: int = 0
    total_claims: int = 0
    total_relationships: int = 0
    total_layout_regions: int = 0
    # Mean per-entry processing time in seconds.
    avg_processing_time: float = 0.0
    # One {'entry_id': ..., 'errors': [...]} record per failed entry.
    errors: List[Dict[str, Any]] = field(default_factory=list)
    # Full per-entry results (omitted from the saved summary report).
    results: List[ExtractionResult] = field(default_factory=list)
def find_web_entries(base_path: Path) -> List[Tuple[str, Path]]:
    """Locate web entry directories that contain archived HTML pages.

    Entries are laid out as web/{entry_id}/{domain}/pages/*.html; a domain
    qualifies only if its pages/ directory holds at least one .html file.

    Returns:
        List of ("entry_id/domain", domain_path) tuples, ordered by entry id.
    """
    found: List[Tuple[str, Path]] = []
    for entry_dir in sorted(base_path.iterdir()):
        if not entry_dir.is_dir():
            continue
        for domain_dir in entry_dir.iterdir():
            if not domain_dir.is_dir():
                continue
            pages_dir = domain_dir / "pages"
            # any() short-circuits on the first .html hit.
            if pages_dir.exists() and any(pages_dir.glob("*.html")):
                found.append((f"{entry_dir.name}/{domain_dir.name}", domain_dir))
    return found
def load_existing_metadata(entry_path: Path) -> Optional[Dict[str, Any]]:
    """Return the parsed metadata.yaml for *entry_path*, or None if absent."""
    metadata_file = entry_path / "metadata.yaml"
    if not metadata_file.exists():
        return None
    with open(metadata_file, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
def get_html_content(entry_path: Path) -> Tuple[Optional[str], Optional[str]]:
    """Read the archived HTML for an entry.

    Prefers pages/index.html; otherwise falls back to the first *.html file
    found in the pages/ directory.

    Returns:
        Tuple of (html_content, html_file_path), or (None, None) when no
        HTML exists or the chosen file cannot be read.
    """
    pages_dir = entry_path / "pages"
    if not pages_dir.exists():
        return None, None
    html_files = list(pages_dir.glob("*.html"))
    if not html_files:
        return None, None
    candidate = pages_dir / "index.html"
    if not candidate.exists():
        candidate = html_files[0]
    try:
        # errors='replace' keeps malformed archives readable instead of raising.
        with open(candidate, 'r', encoding='utf-8', errors='replace') as fh:
            return fh.read(), str(candidate)
    except Exception as e:
        logger.error(f"Failed to read {candidate}: {e}")
        return None, None
def session_to_dict(session: AnnotationSession) -> Dict[str, Any]:
    """Serialize *session* by delegating to its own to_dict() method."""
    return session.to_dict()
async def process_entry(
    entry_id: str,
    entry_path: Path,
    annotator: LLMAnnotator,
    uncertainty_detector: UncertaintyDetector,
    output_dir: Path,
) -> ExtractionResult:
    """
    Process a single web entry and extract annotations.

    Args:
        entry_id: Entry identifier (e.g., "0141/airbornemuseum.nl")
        entry_path: Path to entry directory
        annotator: LLM annotator instance
        uncertainty_detector: Uncertainty/hedging detector
        output_dir: Accepted for interface compatibility; output is actually
            written next to the entry's metadata (see output_file below).

    Returns:
        ExtractionResult with processing status. processing_time_seconds is
        now populated on every exit path (previously the early "no HTML"
        return left it at 0.0).
    """
    import time
    start_time = time.time()
    result = ExtractionResult(
        entry_id=entry_id,
        domain=entry_path.name,
        success=False,
    )
    try:
        # Existing metadata (if any) supplies the original source URL.
        metadata = load_existing_metadata(entry_path)
        source_url = metadata.get('url') if metadata else None

        html_content, html_file = get_html_content(entry_path)
        if not html_content:
            result.errors.append("No HTML content found")
            return result

        # Run LLM annotation over the raw archived HTML.
        logger.info(f"Processing {entry_id}...")
        session = await annotator.annotate(
            document=html_content,
            source_url=source_url,
        )

        # Count results from the session's claim lists.
        result.entity_count = len(session.entity_claims)
        result.claim_count = len(session.aggregate_claims)
        result.relationship_count = len(session.relationship_claims)
        result.layout_count = len(session.layout_claims)

        # Re-score aggregate-claim provenance confidence based on
        # hedging/uncertainty cues detected in the claim text.
        for claim in session.aggregate_claims:
            if claim.claim_value:
                text = str(claim.claim_value)
                confidence = uncertainty_detector.get_claim_confidence(text)
                if claim.provenance:
                    claim.provenance.confidence = confidence

        # Prepare serializable output payload.
        output_data = {
            'extraction_version': 'GLAM-NER v1.7.0',
            'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
            'source_url': source_url,
            'html_file': html_file,
            'session': session_to_dict(session),
            'summary': {
                'entity_count': result.entity_count,
                'claim_count': result.claim_count,
                'relationship_count': result.relationship_count,
                'layout_count': result.layout_count,
                'errors': session.errors,
            }
        }

        # Save annotations alongside the entry's existing metadata.
        output_file = entry_path / "annotations_v1.7.0.yaml"
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(output_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        result.output_file = str(output_file)
        result.success = True
        result.errors = session.errors
    except Exception as e:
        result.errors.append(f"Processing failed: {str(e)}")
        result.errors.append(traceback.format_exc())
        logger.error(f"Failed to process {entry_id}: {e}")
    finally:
        # Fix: record elapsed time on every exit path — the early return for
        # missing HTML previously skipped this assignment entirely.
        result.processing_time_seconds = time.time() - start_time
    return result
async def process_batch(
    entries: List[Tuple[str, Path]],
    annotator: LLMAnnotator,
    uncertainty_detector: UncertaintyDetector,
    output_dir: Path,
    concurrency: int = 3,
    checkpoint_interval: int = 10,
    checkpoint_file: Optional[Path] = None,
) -> BatchReport:
    """
    Process a batch of entries with bounded concurrency and rate limiting.

    Fixes over the previous revision:
    - Entries are dispatched in waves of ``concurrency`` via asyncio.gather,
      so requests actually overlap (previously each entry was awaited
      sequentially, making the semaphore and ``concurrency`` ineffective).
    - Every processed entry id is recorded for resume, not only the ids that
      landed exactly on a checkpoint boundary, and a final checkpoint is
      flushed at the end of the run so a partial last interval is not lost.

    Args:
        entries: List of (entry_id, entry_path) tuples
        annotator: LLM annotator
        uncertainty_detector: Uncertainty detector
        output_dir: Output directory (forwarded to process_entry)
        concurrency: Max concurrent requests
        checkpoint_interval: Save checkpoint every N entries
        checkpoint_file: File to save progress checkpoints (None disables)

    Returns:
        BatchReport with summary
    """
    report = BatchReport(
        started_at=datetime.now(timezone.utc).isoformat(),
        total_entries=len(entries),
    )

    # Resume support: skip anything recorded in an existing checkpoint.
    processed_ids = set()
    if checkpoint_file and checkpoint_file.exists():
        with open(checkpoint_file, 'r') as f:
            checkpoint = json.load(f)
        processed_ids = set(checkpoint.get('processed_ids', []))
        logger.info(f"Loaded checkpoint with {len(processed_ids)} processed entries")

    remaining_entries = [
        (eid, path) for eid, path in entries
        if eid not in processed_ids
    ]
    report.skipped_entries = len(processed_ids)
    logger.info(f"Processing {len(remaining_entries)} entries ({len(processed_ids)} already done)")

    semaphore = asyncio.Semaphore(concurrency)

    async def process_with_semaphore(entry_id: str, entry_path: Path):
        # The semaphore bounds in-flight requests; the short sleep spaces
        # them out as a crude rate limit.
        async with semaphore:
            await asyncio.sleep(0.5)
            return await process_entry(
                entry_id=entry_id,
                entry_path=entry_path,
                annotator=annotator,
                uncertainty_detector=uncertainty_detector,
                output_dir=output_dir,
            )

    def write_checkpoint():
        # Persist resume state; processed_ids includes failures on purpose
        # so a resumed run does not retry known-bad entries.
        checkpoint_data = {
            'processed_ids': list(processed_ids),
            'last_updated': datetime.now(timezone.utc).isoformat(),
            'processed_count': report.processed_entries,
            'successful_count': report.successful_entries,
        }
        with open(checkpoint_file, 'w') as f:
            json.dump(checkpoint_data, f)

    total_time = 0.0
    done = 0
    # Dispatch in waves so up to `concurrency` requests run at once.
    for start in range(0, len(remaining_entries), concurrency):
        wave = remaining_entries[start:start + concurrency]
        results = await asyncio.gather(
            *(process_with_semaphore(eid, path) for eid, path in wave)
        )
        for (entry_id, _), result in zip(wave, results):
            done += 1
            report.processed_entries += 1
            report.results.append(result)
            if result.success:
                report.successful_entries += 1
                report.total_entities += result.entity_count
                report.total_claims += result.claim_count
                report.total_relationships += result.relationship_count
                report.total_layout_regions += result.layout_count
            else:
                report.failed_entries += 1
                report.errors.append({
                    'entry_id': entry_id,
                    'errors': result.errors,
                })
            total_time += result.processing_time_seconds
            # Fix: record EVERY entry for resume, not only the entries that
            # happened to fall on a checkpoint boundary.
            processed_ids.add(entry_id)

            # Progress logging
            if done % 10 == 0:
                pct = (done / len(remaining_entries)) * 100
                logger.info(
                    f"Progress: {done}/{len(remaining_entries)} ({pct:.1f}%) - "
                    f"Success: {report.successful_entries}, Failed: {report.failed_entries}"
                )
            # Periodic checkpoint
            if checkpoint_file and done % checkpoint_interval == 0:
                write_checkpoint()
                logger.info(f"Saved checkpoint at {done} entries")

    # Final flush so entries after the last full interval are recorded.
    if checkpoint_file and done > 0:
        write_checkpoint()

    # Final stats
    if report.processed_entries > 0:
        report.avg_processing_time = total_time / report.processed_entries
    report.completed_at = datetime.now(timezone.utc).isoformat()
    return report
def save_report(report: BatchReport, output_path: Path):
    """Write a summary of *report* to *output_path* as YAML.

    Per-entry results are deliberately omitted and the error list is capped
    at 100 entries to keep the summary file small.
    """
    summary_fields = (
        'started_at',
        'completed_at',
        'total_entries',
        'processed_entries',
        'successful_entries',
        'failed_entries',
        'skipped_entries',
        'total_entities',
        'total_claims',
        'total_relationships',
        'total_layout_regions',
    )
    report_dict = {name: getattr(report, name) for name in summary_fields}
    report_dict['avg_processing_time_seconds'] = report.avg_processing_time
    report_dict['errors'] = report.errors[:100]  # Limit to first 100 errors
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(report_dict, f, default_flow_style=False, allow_unicode=True)
    logger.info(f"Saved report to {output_path}")
async def main():
    """CLI driver: parse arguments, scan entries, run the batch, and report."""
    parser = argparse.ArgumentParser(
        description='Batch extract web annotations using GLAM-NER v1.7.0'
    )
    parser.add_argument('--test', action='store_true', help='Test on 5 entries')
    parser.add_argument('--batch', type=int, default=0, help='Process N entries')
    parser.add_argument('--all', action='store_true', help='Process all entries')
    parser.add_argument('--resume', action='store_true', help='Resume from checkpoint')
    parser.add_argument('--concurrency', type=int, default=2, help='Max concurrent requests')
    parser.add_argument('--provider', default='zai', choices=['zai', 'anthropic', 'openai'])
    parser.add_argument('--start-offset', type=int, default=0, help='Start from entry N')
    parser.add_argument('--output-dir', type=str, default='data/nde/enriched/entries/web')
    args = parser.parse_args()
    # Paths, resolved relative to the project root (parent of scripts/).
    # NOTE(review): an absolute --output-dir replaces the project root due to
    # pathlib's "/" semantics — confirm whether that is intended.
    base_path = Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries" / "web"
    output_dir = Path(__file__).parent.parent / args.output_dir
    checkpoint_file = Path(__file__).parent.parent / "data" / "extraction_checkpoint.json"
    report_dir = Path(__file__).parent.parent / "reports"
    report_dir.mkdir(exist_ok=True)
    # Find entries
    logger.info("Scanning for web entries...")
    entries = find_web_entries(base_path)
    logger.info(f"Found {len(entries)} web entries with HTML content")
    # Apply offset (before any batch-size limiting below).
    if args.start_offset > 0:
        entries = entries[args.start_offset:]
        logger.info(f"Starting from offset {args.start_offset}, {len(entries)} remaining")
    # Determine batch size: --test and --batch trim the list; --all keeps it;
    # anything else is a usage error.
    if args.test:
        entries = entries[:5]
        logger.info("Test mode: processing 5 entries")
    elif args.batch > 0:
        entries = entries[:args.batch]
        logger.info(f"Batch mode: processing {len(entries)} entries")
    elif not args.all:
        logger.error("Please specify --test, --batch N, or --all")
        sys.exit(1)
    # Initialize annotator; requires an API key for the chosen provider.
    logger.info(f"Initializing LLM annotator with provider: {args.provider}")
    try:
        annotator = create_llm_annotator(
            provider=args.provider,
            enable_fallback=True,
            max_retries=5,
        )
    except ValueError as e:
        logger.error(f"Failed to initialize annotator: {e}")
        logger.error("Make sure ZAI_API_TOKEN, ANTHROPIC_API_KEY, or OPENAI_API_KEY is set")
        sys.exit(1)
    # Initialize uncertainty detector
    uncertainty_detector = UncertaintyDetector()
    # Process entries; checkpointing is enabled only for --resume/--all runs.
    report = await process_batch(
        entries=entries,
        annotator=annotator,
        uncertainty_detector=uncertainty_detector,
        output_dir=output_dir,
        concurrency=args.concurrency,
        checkpoint_interval=10,
        checkpoint_file=checkpoint_file if args.resume or args.all else None,
    )
    # Save report (timestamped filename; local time, matching log output).
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_file = report_dir / f"web_extraction_report_{timestamp}.yaml"
    save_report(report, report_file)
    # Summary
    logger.info("=" * 60)
    logger.info("EXTRACTION COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total entries: {report.total_entries}")
    logger.info(f"Processed: {report.processed_entries}")
    logger.info(f"Successful: {report.successful_entries}")
    logger.info(f"Failed: {report.failed_entries}")
    logger.info(f"Skipped (checkpoint):{report.skipped_entries}")
    logger.info("-" * 60)
    logger.info(f"Total entities: {report.total_entities}")
    logger.info(f"Total claims: {report.total_claims}")
    logger.info(f"Total relationships: {report.total_relationships}")
    logger.info(f"Total layout regions:{report.total_layout_regions}")
    logger.info(f"Avg processing time: {report.avg_processing_time:.2f}s")
    logger.info("-" * 60)
    logger.info(f"Report saved to: {report_file}")
if __name__ == '__main__':
    # Script entry point: run the async driver to completion.
    asyncio.run(main())