# glam/scripts/agentic_annotator.py

#!/usr/bin/env python3
"""
Agentic Annotator CLI - Annotate heritage documents with entities and layout.

Usage:
    python scripts/agentic_annotator.py <input_path> [options]

Examples:
    # Annotate a single HTML file
    python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/mirror/www.rhc-eindhoven.nl/index.html

    # Annotate a WARC archive
    python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/archive.warc.gz

    # Annotate a mirror directory
    python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/mirror/ --recursive

    # Output as JSON instead of YAML
    python scripts/agentic_annotator.py input.html --format json

Based on GLAM-NER v1.7.0-unified Entity Annotation Convention.
"""

import argparse
import json
import sys
from pathlib import Path
from typing import List, Optional

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.annotators import (
    AgenticAnnotator,
    AnnotationConfig,
    AnnotationSession,
    HTMLParser,
    create_annotator,
)


def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the annotator command line."""
    parser = argparse.ArgumentParser(
        description="Annotate heritage documents with entities and layout regions.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Annotate HTML file
    %(prog)s data/web/page.html

    # Annotate WARC archive
    %(prog)s data/web/archive.warc.gz

    # Annotate directory recursively
    %(prog)s data/web/mirror/ --recursive

    # Export as JSON
    %(prog)s data/web/page.html --format json --output annotations.json
        """,
    )

    parser.add_argument(
        "input",
        type=str,
        help="Input file or directory to annotate",
    )

    parser.add_argument(
        "-o", "--output",
        type=str,
        help="Output file path (auto-generated if not specified)",
    )

    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/annotations",
        help="Output directory for annotations (default: data/annotations)",
    )

    parser.add_argument(
        "-f", "--format",
        choices=["yaml", "json", "jsonld"],
        default="yaml",
        help="Output format (default: yaml)",
    )

    parser.add_argument(
        "--recursive", "-r",
        action="store_true",
        help="Process directory recursively",
    )

    parser.add_argument(
        "--no-entities",
        action="store_true",
        help="Skip entity recognition",
    )

    parser.add_argument(
        "--no-layout",
        action="store_true",
        help="Skip layout analysis",
    )

    parser.add_argument(
        "--no-aggregates",
        action="store_true",
        help="Skip aggregate claim creation",
    )

    parser.add_argument(
        "--use-llm",
        action="store_true",
        help="Use LLM for enhanced entity recognition",
    )

    parser.add_argument(
        "--llm-model",
        type=str,
        default="glm-4-flash",
        help="LLM model to use (default: glm-4-flash)",
    )

    parser.add_argument(
        "--source-url",
        type=str,
        help="Source URL for provenance tracking",
    )

    parser.add_argument(
        "--summary",
        action="store_true",
        help="Print summary statistics only",
    )

    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Verbose output",
    )

    return parser


def _is_warc_archive(path: Path) -> bool:
    """Return True when *path* should be handed to the WARC reader.

    Files named ``*.warc`` or ``*.warc.gz`` (case-insensitive) are recognised
    directly, so an uncompressed archive is no longer parsed as HTML. The
    older heuristic -- any ``.gz`` file whose stem mentions "warc" -- is kept
    so that every path accepted before is still accepted.
    """
    if path.name.lower().endswith((".warc", ".warc.gz")):
        return True
    return path.suffix.lower() == ".gz" and "warc" in path.stem.lower()


def _collect_sessions(
    annotator: "AgenticAnnotator",
    input_path: Path,
    args: argparse.Namespace,
) -> "List[AnnotationSession]":
    """Run the annotator over *input_path* and return the resulting sessions.

    Dispatch:
      * WARC archive           -> ``annotator.annotate_warc``
      * any other file         -> ``annotator.annotate_file`` (one session)
      * directory, --recursive -> ``annotator.annotate_mirror_directory``
      * directory, flat        -> ``annotate_file`` for each ``*.html`` entry

    ``--source-url`` is only forwarded on the ``annotate_file`` paths. A path
    that is neither a regular file nor a directory yields an empty list.
    """
    if input_path.is_file():
        if _is_warc_archive(input_path):
            if args.verbose:
                print(f"Processing WARC archive: {input_path}")
            return annotator.annotate_warc(input_path)

        if args.verbose:
            print(f"Processing HTML file: {input_path}")
        return [annotator.annotate_file(input_path, source_url=args.source_url)]

    if input_path.is_dir():
        if args.recursive:
            if args.verbose:
                print(f"Processing directory recursively: {input_path}")
            return annotator.annotate_mirror_directory(input_path)

        if args.verbose:
            print(f"Processing directory: {input_path}")
        # Sort the matches: glob order depends on the filesystem, and a
        # stable order keeps console output and exports reproducible.
        return [
            annotator.annotate_file(html_file, source_url=args.source_url)
            for html_file in sorted(input_path.glob("*.html"))
        ]

    return []


def _print_session_summary(session: "AnnotationSession", summary: dict) -> None:
    """Print the per-session statistics block used by ``--summary``."""
    print(f"\n--- Session {session.session_id[:8]} ---")
    print(f"  Source: {summary['source_file'] or summary['source_url']}")
    print(f"  Entity claims: {summary['entity_claims']}")
    if summary['entity_counts']:
        for hypernym, count in sorted(summary['entity_counts'].items()):
            print(f"    {hypernym}: {count}")
    print(f"  Layout claims: {summary['layout_claims']}")
    if summary['layout_counts']:
        for region, count in sorted(summary['layout_counts'].items()):
            print(f"    {region}: {count}")
    print(f"  Aggregate claims: {summary['aggregate_claims']}")
    if summary['errors']:
        print(f"  Errors: {len(summary['errors'])}")


def main():
    """Command-line entry point: annotate the input and export or summarise.

    Reads arguments from ``sys.argv``. Exits with status 1 when the input
    path does not exist or when no document was processed; argparse itself
    exits with status 2 on invalid arguments.
    """
    args = _build_parser().parse_args()

    # Validate input before constructing the annotator.
    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
        sys.exit(1)

    config = AnnotationConfig(
        annotate_entities=not args.no_entities,
        annotate_layout=not args.no_layout,
        create_aggregates=not args.no_aggregates,
        use_llm_entities=args.use_llm,
        llm_model=args.llm_model,
        output_format=args.format,
        output_dir=args.output_dir,
    )
    annotator = AgenticAnnotator(config)

    sessions = _collect_sessions(annotator, input_path, args)
    if not sessions:
        print("No documents processed.", file=sys.stderr)
        sys.exit(1)

    print(f"\nProcessed {len(sessions)} document(s)")

    # An explicit --output only makes sense for a single exported document.
    # Say so instead of silently dropping the option.
    explicit_output = args.output if len(sessions) == 1 else None
    if args.output:
        if args.summary:
            print(
                "Warning: --output is ignored with --summary (nothing is exported)",
                file=sys.stderr,
            )
        elif explicit_output is None:
            print(
                f"Warning: --output is ignored because {len(sessions)} documents "
                "were processed; export paths are chosen by the annotator",
                file=sys.stderr,
            )

    for session in sessions:
        if args.summary:
            # The summary is only needed (and only computed) in this mode.
            _print_session_summary(session, annotator.get_session_summary(session))
        else:
            output_path = annotator.export_session(
                session,
                output_path=explicit_output,
            )
            if args.verbose:
                print(f"  Exported: {output_path}")

    # Final totals across every processed document.
    total_entities = sum(len(s.entity_claims) for s in sessions)
    total_layout = sum(len(s.layout_claims) for s in sessions)
    total_aggregates = sum(len(s.aggregate_claims) for s in sessions)

    print(f"\nTotal: {total_entities} entities, {total_layout} layout regions, {total_aggregates} aggregates")


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()