#!/usr/bin/env python3
|
|
"""
|
|
Batch Web Annotation Extractor using GLAM-NER v1.7.0.
|
|
|
|
Extracts annotations with claims and provenance from archived web pages
|
|
for all entries in data/nde/enriched/entries/web/.
|
|
|
|
Uses the LLM annotator with the updated GLAM-NER v1.7.0 system prompt
|
|
that includes new entity subcategories for TMP, ROL, WRK, THG, APP.
|
|
|
|
Features:
|
|
- Processes all web entries with archived HTML pages
|
|
- Generates claims with XPath provenance
|
|
- Supports parallel processing with rate limiting
|
|
- Saves annotations to YAML files alongside existing metadata
|
|
- Generates summary reports
|
|
|
|
Usage:
|
|
python scripts/batch_extract_web_annotations.py --test # Test on 5 entries
|
|
python scripts/batch_extract_web_annotations.py --batch 100 # Process 100 entries
|
|
python scripts/batch_extract_web_annotations.py --all # Process all entries
|
|
python scripts/batch_extract_web_annotations.py --resume # Resume from last checkpoint
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
import traceback
|
|
from dataclasses import dataclass, field, asdict
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
import yaml
|
|
|
|
# Load environment variables from .env file
|
|
try:
|
|
from dotenv import load_dotenv
|
|
env_path = Path(__file__).parent.parent / ".env"
|
|
if env_path.exists():
|
|
load_dotenv(env_path)
|
|
except ImportError:
|
|
pass # python-dotenv not installed, rely on environment
|
|
|
|
# Add src to path for imports
|
|
src_path = Path(__file__).parent.parent / "src"
|
|
sys.path.insert(0, str(src_path))
|
|
|
|
# Now import from glam_extractor
|
|
try:
|
|
from glam_extractor.annotators.llm_annotator import (
|
|
LLMAnnotator,
|
|
LLMAnnotatorConfig,
|
|
LLMProvider,
|
|
RetryConfig,
|
|
create_llm_annotator,
|
|
)
|
|
from glam_extractor.annotators.base import AnnotationSession
|
|
from glam_extractor.annotators.uncertainty import UncertaintyDetector
|
|
except ImportError as e:
|
|
print(f"Import error: {e}")
|
|
print(f"Looking in: {src_path}")
|
|
print(f"sys.path: {sys.path[:3]}")
|
|
raise
|
|
|
|
# Configure logging: stream to the console AND append to a local log file
# so long-running batch extractions leave a persistent record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('batch_extraction.log'),
    ]
)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ExtractionResult:
    """Result of processing a single entry."""
    # Entry identifier, e.g. "0141/airbornemuseum.nl".
    entry_id: str
    # Domain directory name (last component of the entry path).
    domain: str
    # True when annotation completed and the output YAML was written.
    success: bool
    # Counts of items extracted from the annotation session.
    entity_count: int = 0
    claim_count: int = 0
    relationship_count: int = 0
    layout_count: int = 0
    # Error messages (including tracebacks) accumulated during processing.
    errors: List[str] = field(default_factory=list)
    # Wall-clock duration spent processing this entry.
    processing_time_seconds: float = 0.0
    # Path to the written annotations YAML file, when successful.
    output_file: Optional[str] = None
|
|
|
|
|
|
@dataclass
class BatchReport:
    """Summary report for batch processing."""
    # ISO-8601 UTC timestamps bracketing the run.
    started_at: str
    completed_at: Optional[str] = None
    # Entry counters: total found vs. processed / succeeded / failed,
    # plus entries skipped because a checkpoint marked them done.
    total_entries: int = 0
    processed_entries: int = 0
    successful_entries: int = 0
    failed_entries: int = 0
    skipped_entries: int = 0
    # Aggregate annotation counts across all successful entries.
    total_entities: int = 0
    total_claims: int = 0
    total_relationships: int = 0
    total_layout_regions: int = 0
    # Mean per-entry wall-clock time in seconds.
    avg_processing_time: float = 0.0
    # Per-entry error records of the form {'entry_id': ..., 'errors': [...]}.
    errors: List[Dict[str, Any]] = field(default_factory=list)
    # Full per-entry results (omitted from the saved summary report).
    results: List[ExtractionResult] = field(default_factory=list)
|
|
|
|
|
|
def find_web_entries(base_path: Path) -> List[Tuple[str, Path]]:
    """
    Locate web entry directories that contain archived HTML pages.

    The expected layout is web/{entry_id}/{domain}/pages/*.html.

    Returns:
        List of (entry_id, entry_path) tuples, where entry_id is
        "{entry_id}/{domain}" and entry_path is the domain directory.
    """
    found: List[Tuple[str, Path]] = []

    for entry_dir in sorted(base_path.iterdir()):
        if not entry_dir.is_dir():
            continue

        # Each entry may contain one or more domain subdirectories.
        for domain_dir in (d for d in entry_dir.iterdir() if d.is_dir()):
            pages_dir = domain_dir / "pages"
            # Only keep domains that actually archived at least one page.
            if pages_dir.exists() and any(pages_dir.glob("*.html")):
                found.append((f"{entry_dir.name}/{domain_dir.name}", domain_dir))

    return found
|
|
|
|
|
|
def load_existing_metadata(entry_path: Path) -> Optional[Dict[str, Any]]:
    """Load an entry's metadata.yaml, or return None when it is absent."""
    metadata_file = entry_path / "metadata.yaml"
    if not metadata_file.exists():
        return None
    with open(metadata_file, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
|
|
|
|
|
|
def get_html_content(entry_path: Path) -> Tuple[Optional[str], Optional[str]]:
    """
    Read HTML content from the entry's pages directory.

    index.html is preferred when present; otherwise the first *.html file
    found is used. Read failures are logged and reported as (None, None).

    Returns:
        Tuple of (html_content, html_file_path), or (None, None) when no
        page could be read.
    """
    pages_dir = entry_path / "pages"
    if not pages_dir.exists():
        return None, None

    candidates = list(pages_dir.glob("*.html"))
    if not candidates:
        return None, None

    # Prefer index.html over any other archived page.
    preferred = pages_dir / "index.html"
    chosen = preferred if preferred.exists() else candidates[0]

    try:
        # errors='replace' tolerates mis-encoded archived pages.
        with open(chosen, 'r', encoding='utf-8', errors='replace') as f:
            return f.read(), str(chosen)
    except Exception as e:
        logger.error(f"Failed to read {chosen}: {e}")
        return None, None
|
|
|
|
|
|
def session_to_dict(session: AnnotationSession) -> Dict[str, Any]:
    """Serialize an AnnotationSession using its own to_dict() hook."""
    return session.to_dict()
|
|
|
|
|
|
async def process_entry(
    entry_id: str,
    entry_path: Path,
    annotator: LLMAnnotator,
    uncertainty_detector: UncertaintyDetector,
    output_dir: Path,
) -> ExtractionResult:
    """
    Process a single web entry and extract annotations.

    Reads the entry's archived HTML, runs the LLM annotator on it,
    attaches uncertainty-based confidence to aggregate claims, and
    writes the full session to annotations_v1.7.0.yaml inside the
    entry directory. Never raises: failures are captured on the
    returned result's errors list.

    Args:
        entry_id: Entry identifier (e.g., "0141/airbornemuseum.nl")
        entry_path: Path to entry directory
        annotator: LLM annotator instance
        uncertainty_detector: Uncertainty/hedging detector
        output_dir: Directory to save annotation output
            NOTE(review): currently unused — output is written next to
            the entry's metadata in entry_path instead. Confirm intent.

    Returns:
        ExtractionResult with processing status
    """
    import time
    start_time = time.time()

    result = ExtractionResult(
        entry_id=entry_id,
        domain=entry_path.name,
        success=False,
    )

    try:
        # Load existing metadata; the source URL (if present) is passed
        # to the annotator and recorded in the output.
        metadata = load_existing_metadata(entry_path)
        source_url = metadata.get('url') if metadata else None

        # Get HTML content; without it there is nothing to annotate.
        html_content, html_file = get_html_content(entry_path)
        if not html_content:
            result.errors.append("No HTML content found")
            return result

        # Run LLM annotation
        logger.info(f"Processing {entry_id}...")
        session = await annotator.annotate(
            document=html_content,
            source_url=source_url,
        )

        # Count results
        result.entity_count = len(session.entity_claims)
        result.claim_count = len(session.aggregate_claims)
        result.relationship_count = len(session.relationship_claims)
        result.layout_count = len(session.layout_claims)

        # Add uncertainty analysis for aggregate claims: hedging
        # detected in the claim text adjusts the provenance confidence.
        for claim in session.aggregate_claims:
            if claim.claim_value:
                text = str(claim.claim_value)
                confidence = uncertainty_detector.get_claim_confidence(text)
                # Update provenance confidence
                if claim.provenance:
                    claim.provenance.confidence = confidence

        # Prepare output document: version stamp, provenance, the full
        # serialized session, and a quick-look summary block.
        output_data = {
            'extraction_version': 'GLAM-NER v1.7.0',
            'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
            'source_url': source_url,
            'html_file': html_file,
            'session': session_to_dict(session),
            'summary': {
                'entity_count': result.entity_count,
                'claim_count': result.claim_count,
                'relationship_count': result.relationship_count,
                'layout_count': result.layout_count,
                'errors': session.errors,
            }
        }

        # Save annotations alongside the entry's existing metadata.
        output_file = entry_path / "annotations_v1.7.0.yaml"
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(output_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        result.output_file = str(output_file)
        result.success = True
        # Surface any non-fatal annotator errors on the result.
        result.errors = session.errors

    except Exception as e:
        # Best-effort batch processing: record the failure (with full
        # traceback for debugging) and let the caller move on.
        result.errors.append(f"Processing failed: {str(e)}")
        result.errors.append(traceback.format_exc())
        logger.error(f"Failed to process {entry_id}: {e}")

    result.processing_time_seconds = time.time() - start_time
    return result
|
|
|
|
|
|
async def process_batch(
    entries: List[Tuple[str, Path]],
    annotator: LLMAnnotator,
    uncertainty_detector: UncertaintyDetector,
    output_dir: Path,
    concurrency: int = 3,
    checkpoint_interval: int = 10,
    checkpoint_file: Optional[Path] = None,
) -> BatchReport:
    """
    Process a batch of entries with bounded concurrency and rate limiting.

    Entries already recorded in the checkpoint file are skipped. Results
    are collected in completion order.

    Args:
        entries: List of (entry_id, entry_path) tuples
        annotator: LLM annotator
        uncertainty_detector: Uncertainty detector
        output_dir: Output directory (forwarded to process_entry)
        concurrency: Max concurrent requests
        checkpoint_interval: Save checkpoint every N entries
        checkpoint_file: File to save progress checkpoints

    Returns:
        BatchReport with summary
    """
    report = BatchReport(
        started_at=datetime.now(timezone.utc).isoformat(),
        total_entries=len(entries),
    )

    # Load checkpoint if it exists so a resumed run skips finished work.
    processed_ids = set()
    if checkpoint_file and checkpoint_file.exists():
        with open(checkpoint_file, 'r') as f:
            checkpoint = json.load(f)
            processed_ids = set(checkpoint.get('processed_ids', []))
        logger.info(f"Loaded checkpoint with {len(processed_ids)} processed entries")

    # Filter already processed entries
    remaining_entries = [
        (eid, path) for eid, path in entries
        if eid not in processed_ids
    ]
    report.skipped_entries = len(processed_ids)

    logger.info(f"Processing {len(remaining_entries)} entries ({len(processed_ids)} already done)")

    semaphore = asyncio.Semaphore(concurrency)

    async def process_with_semaphore(entry_id: str, entry_path: Path) -> ExtractionResult:
        """Run process_entry under the concurrency limit with a small delay."""
        async with semaphore:
            # Rate limiting delay between API requests.
            await asyncio.sleep(0.5)
            return await process_entry(
                entry_id=entry_id,
                entry_path=entry_path,
                annotator=annotator,
                uncertainty_detector=uncertainty_detector,
                output_dir=output_dir,
            )

    def _save_checkpoint(done: int) -> None:
        """Persist progress so an interrupted run can be resumed."""
        checkpoint_data = {
            'processed_ids': list(processed_ids),
            'last_updated': datetime.now(timezone.utc).isoformat(),
            'processed_count': report.processed_entries,
            'successful_count': report.successful_entries,
        }
        with open(checkpoint_file, 'w') as f:
            json.dump(checkpoint_data, f)
        logger.info(f"Saved checkpoint at {done} entries")

    # BUG FIX: the original awaited each entry sequentially inside the
    # loop, so the semaphore never had contention and `concurrency` had
    # no effect. Schedule all entries as tasks up front and consume them
    # as they complete; the semaphore caps in-flight requests.
    tasks = [
        asyncio.ensure_future(process_with_semaphore(eid, path))
        for eid, path in remaining_entries
    ]

    total_time = 0.0
    for i, future in enumerate(asyncio.as_completed(tasks)):
        result = await future

        report.processed_entries += 1
        report.results.append(result)

        if result.success:
            report.successful_entries += 1
            report.total_entities += result.entity_count
            report.total_claims += result.claim_count
            report.total_relationships += result.relationship_count
            report.total_layout_regions += result.layout_count
        else:
            report.failed_entries += 1
            report.errors.append({
                'entry_id': result.entry_id,
                'errors': result.errors,
            })

        total_time += result.processing_time_seconds

        # BUG FIX: the original only added an entry to processed_ids on
        # checkpoint iterations, so ~(interval-1)/interval of completed
        # entries were re-processed on --resume. Record every entry.
        processed_ids.add(result.entry_id)

        # Progress logging every 10 completed entries.
        if (i + 1) % 10 == 0:
            pct = ((i + 1) / len(remaining_entries)) * 100
            logger.info(
                f"Progress: {i+1}/{len(remaining_entries)} ({pct:.1f}%) - "
                f"Success: {report.successful_entries}, Failed: {report.failed_entries}"
            )

        # Periodic checkpoint of everything completed so far.
        if checkpoint_file and (i + 1) % checkpoint_interval == 0:
            _save_checkpoint(i + 1)

    # Final stats
    if report.processed_entries > 0:
        report.avg_processing_time = total_time / report.processed_entries

    report.completed_at = datetime.now(timezone.utc).isoformat()

    return report
|
|
|
|
|
|
def save_report(report: BatchReport, output_path: Path):
    """Write a summary of the batch run to a YAML file.

    Per-entry results are omitted and error records are capped at the
    first 100 to keep the summary file small.
    """
    summary_fields = (
        'started_at',
        'completed_at',
        'total_entries',
        'processed_entries',
        'successful_entries',
        'failed_entries',
        'skipped_entries',
        'total_entities',
        'total_claims',
        'total_relationships',
        'total_layout_regions',
    )
    report_dict = {name: getattr(report, name) for name in summary_fields}
    report_dict['avg_processing_time_seconds'] = report.avg_processing_time
    report_dict['errors'] = report.errors[:100]  # Limit to first 100 errors

    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(report_dict, f, default_flow_style=False, allow_unicode=True)

    logger.info(f"Saved report to {output_path}")
|
|
|
|
|
|
async def main():
    """CLI entry point: scan entries, run batch extraction, save a report."""
    parser = argparse.ArgumentParser(
        description='Batch extract web annotations using GLAM-NER v1.7.0'
    )
    parser.add_argument('--test', action='store_true', help='Test on 5 entries')
    parser.add_argument('--batch', type=int, default=0, help='Process N entries')
    parser.add_argument('--all', action='store_true', help='Process all entries')
    parser.add_argument('--resume', action='store_true', help='Resume from checkpoint')
    parser.add_argument('--concurrency', type=int, default=2, help='Max concurrent requests')
    parser.add_argument('--provider', default='zai', choices=['zai', 'anthropic', 'openai'])
    parser.add_argument('--start-offset', type=int, default=0, help='Start from entry N')
    parser.add_argument('--output-dir', type=str, default='data/nde/enriched/entries/web')

    args = parser.parse_args()

    # Paths, all anchored at the repository root (one level above scripts/).
    base_path = Path(__file__).parent.parent / "data" / "nde" / "enriched" / "entries" / "web"
    output_dir = Path(__file__).parent.parent / args.output_dir
    checkpoint_file = Path(__file__).parent.parent / "data" / "extraction_checkpoint.json"
    report_dir = Path(__file__).parent.parent / "reports"
    report_dir.mkdir(exist_ok=True)

    # Find entries with archived HTML to annotate.
    logger.info("Scanning for web entries...")
    entries = find_web_entries(base_path)
    logger.info(f"Found {len(entries)} web entries with HTML content")

    # Apply offset (useful for manually sharding a large run).
    if args.start_offset > 0:
        entries = entries[args.start_offset:]
        logger.info(f"Starting from offset {args.start_offset}, {len(entries)} remaining")

    # Determine batch size / run mode.
    if args.test:
        entries = entries[:5]
        logger.info("Test mode: processing 5 entries")
    elif args.batch > 0:
        entries = entries[:args.batch]
        logger.info(f"Batch mode: processing {len(entries)} entries")
    elif not (args.all or args.resume):
        # BUG FIX: the module docstring documents `--resume` as a
        # standalone mode, but the original check rejected it unless
        # --all was also given. Treat --resume like --all; the
        # checkpoint filter in process_batch skips completed entries.
        logger.error("Please specify --test, --batch N, --all, or --resume")
        sys.exit(1)

    # Initialize annotator (falls back across providers when enabled).
    logger.info(f"Initializing LLM annotator with provider: {args.provider}")
    try:
        annotator = create_llm_annotator(
            provider=args.provider,
            enable_fallback=True,
            max_retries=5,
        )
    except ValueError as e:
        logger.error(f"Failed to initialize annotator: {e}")
        logger.error("Make sure ZAI_API_TOKEN, ANTHROPIC_API_KEY, or OPENAI_API_KEY is set")
        sys.exit(1)

    # Initialize uncertainty detector
    uncertainty_detector = UncertaintyDetector()

    # Process entries. The checkpoint file is only used for --resume or
    # --all runs; short test/batch runs do not touch it.
    report = await process_batch(
        entries=entries,
        annotator=annotator,
        uncertainty_detector=uncertainty_detector,
        output_dir=output_dir,
        concurrency=args.concurrency,
        checkpoint_interval=10,
        checkpoint_file=checkpoint_file if args.resume or args.all else None,
    )

    # Save timestamped report so successive runs don't overwrite it.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_file = report_dir / f"web_extraction_report_{timestamp}.yaml"
    save_report(report, report_file)

    # Human-readable summary to the log.
    logger.info("=" * 60)
    logger.info("EXTRACTION COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Total entries: {report.total_entries}")
    logger.info(f"Processed: {report.processed_entries}")
    logger.info(f"Successful: {report.successful_entries}")
    logger.info(f"Failed: {report.failed_entries}")
    logger.info(f"Skipped (checkpoint):{report.skipped_entries}")
    logger.info("-" * 60)
    logger.info(f"Total entities: {report.total_entities}")
    logger.info(f"Total claims: {report.total_claims}")
    logger.info(f"Total relationships: {report.total_relationships}")
    logger.info(f"Total layout regions:{report.total_layout_regions}")
    logger.info(f"Avg processing time: {report.avg_processing_time:.2f}s")
    logger.info("-" * 60)
    logger.info(f"Report saved to: {report_file}")
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the async CLI entry point on a fresh event loop.
    asyncio.run(main())
|