#!/usr/bin/env python3
"""
Agentic Annotator CLI - Annotate heritage documents with entities and layout.

Usage:
    python scripts/agentic_annotator.py <input> [options]

Examples:
    # Annotate a single HTML file
    python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/mirror/www.rhc-eindhoven.nl/index.html

    # Annotate a WARC archive
    python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/archive.warc.gz

    # Annotate a mirror directory
    python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/mirror/ --recursive

    # Output as JSON instead of YAML
    python scripts/agentic_annotator.py input.html --format json

Based on GLAM-NER v1.7.0-unified Entity Annotation Convention.
"""

import argparse
import json
import sys
from pathlib import Path
from typing import List, Optional

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.annotators import (
    AgenticAnnotator,
    AnnotationConfig,
    AnnotationSession,
    HTMLParser,
    create_annotator,
)


def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser describing every CLI option."""
    parser = argparse.ArgumentParser(
        description="Annotate heritage documents with entities and layout regions.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Annotate HTML file
  %(prog)s data/web/page.html

  # Annotate WARC archive
  %(prog)s data/web/archive.warc.gz

  # Annotate directory recursively
  %(prog)s data/web/mirror/ --recursive

  # Export as JSON
  %(prog)s data/web/page.html --format json --output annotations.json
""",
    )

    parser.add_argument(
        "input",
        type=str,
        help="Input file or directory to annotate",
    )
    parser.add_argument(
        "-o", "--output",
        type=str,
        help="Output file path (auto-generated if not specified)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/annotations",
        help="Output directory for annotations (default: data/annotations)",
    )
    parser.add_argument(
        "-f", "--format",
        choices=["yaml", "json", "jsonld"],
        default="yaml",
        help="Output format (default: yaml)",
    )
    parser.add_argument(
        "--recursive", "-r",
        action="store_true",
        help="Process directory recursively",
    )
    parser.add_argument(
        "--no-entities",
        action="store_true",
        help="Skip entity recognition",
    )
    parser.add_argument(
        "--no-layout",
        action="store_true",
        help="Skip layout analysis",
    )
    parser.add_argument(
        "--no-aggregates",
        action="store_true",
        help="Skip aggregate claim creation",
    )
    parser.add_argument(
        "--use-llm",
        action="store_true",
        help="Use LLM for enhanced entity recognition",
    )
    parser.add_argument(
        "--llm-model",
        type=str,
        default="glm-4-flash",
        help="LLM model to use (default: glm-4-flash)",
    )
    parser.add_argument(
        "--source-url",
        type=str,
        help="Source URL for provenance tracking",
    )
    parser.add_argument(
        "--summary",
        action="store_true",
        help="Print summary statistics only",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Verbose output",
    )
    return parser


def _collect_sessions(annotator, input_path: Path, args) -> List[AnnotationSession]:
    """Annotate ``input_path`` and return the resulting sessions.

    Dispatches on the kind of input:

    * a ``.gz`` file whose stem contains "warc" goes to ``annotate_warc``;
    * any other file goes to ``annotate_file``;
    * a directory with ``--recursive`` goes to ``annotate_mirror_directory``;
    * a directory without it has its top-level ``*.html`` files annotated.

    ``args.source_url`` is forwarded only to ``annotate_file``; the WARC and
    mirror-directory calls are made without it.

    Returns an empty list when ``input_path`` is neither a file nor a
    directory, or when a non-recursive directory contains no ``*.html`` files.
    """
    if input_path.is_file():
        if input_path.suffix == '.gz' and 'warc' in input_path.stem.lower():
            # WARC archive
            if args.verbose:
                print(f"Processing WARC archive: {input_path}")
            return annotator.annotate_warc(input_path)

        # Single HTML file
        if args.verbose:
            print(f"Processing HTML file: {input_path}")
        return [annotator.annotate_file(input_path, source_url=args.source_url)]

    if input_path.is_dir():
        if args.recursive:
            # Recursive directory
            if args.verbose:
                print(f"Processing directory recursively: {input_path}")
            return annotator.annotate_mirror_directory(input_path)

        # Non-recursive - find HTML files in directory
        if args.verbose:
            print(f"Processing directory: {input_path}")
        # glob() yields entries in arbitrary order; sort so that repeated runs
        # process (and report) the files in the same sequence.
        return [
            annotator.annotate_file(html_file, source_url=args.source_url)
            for html_file in sorted(input_path.glob("*.html"))
        ]

    return []


def _print_session_summary(session, summary) -> None:
    """Print the per-session statistics block shown by ``--summary``."""
    print(f"\n--- Session {session.session_id[:8]} ---")
    print(f" Source: {summary['source_file'] or summary['source_url']}")
    print(f" Entity claims: {summary['entity_claims']}")
    if summary['entity_counts']:
        for hypernym, count in sorted(summary['entity_counts'].items()):
            print(f" {hypernym}: {count}")
    print(f" Layout claims: {summary['layout_claims']}")
    if summary['layout_counts']:
        for region, count in sorted(summary['layout_counts'].items()):
            print(f" {region}: {count}")
    print(f" Aggregate claims: {summary['aggregate_claims']}")
    if summary['errors']:
        print(f" Errors: {len(summary['errors'])}")


def main():
    """Run the annotator CLI.

    Parses the command line, annotates the given file, WARC archive or
    directory, then either prints per-session statistics (``--summary``) or
    exports each session through the annotator. A grand total is always
    printed at the end.

    Exits with status 1 when the input path does not exist or when no
    documents were processed.
    """
    args = _build_parser().parse_args()

    # Validate input
    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
        sys.exit(1)

    # Create configuration
    config = AnnotationConfig(
        annotate_entities=not args.no_entities,
        annotate_layout=not args.no_layout,
        create_aggregates=not args.no_aggregates,
        use_llm_entities=args.use_llm,
        llm_model=args.llm_model,
        output_format=args.format,
        output_dir=args.output_dir,
    )

    # Create annotator
    annotator = AgenticAnnotator(config)

    # Process input
    sessions: List[AnnotationSession] = _collect_sessions(annotator, input_path, args)

    # Report results
    if not sessions:
        print("No documents processed.", file=sys.stderr)
        sys.exit(1)

    print(f"\nProcessed {len(sessions)} document(s)")

    # A single explicit output path cannot hold several sessions, so it is
    # only honoured for exactly one session. Tell the user instead of
    # dropping the option silently.
    single_session = len(sessions) == 1
    if args.output and not single_session and not args.summary:
        print(
            "Warning: --output is ignored when multiple documents are "
            "processed; output paths are auto-generated.",
            file=sys.stderr,
        )

    # Export or summarize
    for session in sessions:
        if args.summary:
            # The summary is only needed in this mode, so compute it here.
            _print_session_summary(session, annotator.get_session_summary(session))
        else:
            # Export to file
            output_path = annotator.export_session(
                session,
                output_path=args.output if single_session else None,
            )
            if args.verbose:
                print(f" Exported: {output_path}")

    # Final summary
    total_entities = sum(len(s.entity_claims) for s in sessions)
    total_layout = sum(len(s.layout_claims) for s in sessions)
    total_aggregates = sum(len(s.aggregate_claims) for s in sessions)

    print(f"\nTotal: {total_entities} entities, {total_layout} layout regions, {total_aggregates} aggregates")


if __name__ == "__main__":
    main()