glam/scripts/agentic_annotator.py
2025-12-05 15:30:23 +01:00

238 lines
7.1 KiB
Python

#!/usr/bin/env python3
"""
Agentic Annotator CLI - Annotate heritage documents with entities and layout.
Usage:
python scripts/agentic_annotator.py <input_path> [options]
Examples:
# Annotate a single HTML file
python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/mirror/www.rhc-eindhoven.nl/index.html
# Annotate a WARC archive
python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/archive.warc.gz
# Annotate a mirror directory
python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/mirror/ --recursive
# Output as JSON instead of YAML
python scripts/agentic_annotator.py input.html --format json
Based on GLAM-NER v1.7.0-unified Entity Annotation Convention.
"""
import argparse
import json
import sys
from pathlib import Path
from typing import List, Optional
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.annotators import (
AgenticAnnotator,
AnnotationConfig,
AnnotationSession,
HTMLParser,
create_annotator,
)
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the annotator command line."""
    parser = argparse.ArgumentParser(
        description="Annotate heritage documents with entities and layout regions.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Annotate HTML file
%(prog)s data/web/page.html
# Annotate WARC archive
%(prog)s data/web/archive.warc.gz
# Annotate directory recursively
%(prog)s data/web/mirror/ --recursive
# Export as JSON
%(prog)s data/web/page.html --format json --output annotations.json
""",
    )
    parser.add_argument(
        "input",
        type=str,
        help="Input file or directory to annotate",
    )
    parser.add_argument(
        "-o", "--output",
        type=str,
        help="Output file path (auto-generated if not specified)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/annotations",
        help="Output directory for annotations (default: data/annotations)",
    )
    parser.add_argument(
        "-f", "--format",
        choices=["yaml", "json", "jsonld"],
        default="yaml",
        help="Output format (default: yaml)",
    )
    parser.add_argument(
        "--recursive", "-r",
        action="store_true",
        help="Process directory recursively",
    )
    parser.add_argument(
        "--no-entities",
        action="store_true",
        help="Skip entity recognition",
    )
    parser.add_argument(
        "--no-layout",
        action="store_true",
        help="Skip layout analysis",
    )
    parser.add_argument(
        "--no-aggregates",
        action="store_true",
        help="Skip aggregate claim creation",
    )
    parser.add_argument(
        "--use-llm",
        action="store_true",
        help="Use LLM for enhanced entity recognition",
    )
    parser.add_argument(
        "--llm-model",
        type=str,
        default="glm-4-flash",
        help="LLM model to use (default: glm-4-flash)",
    )
    parser.add_argument(
        "--source-url",
        type=str,
        help="Source URL for provenance tracking",
    )
    parser.add_argument(
        "--summary",
        action="store_true",
        help="Print summary statistics only",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Verbose output",
    )
    return parser


def _is_warc_archive(path: Path) -> bool:
    """Return True when *path* is named like a gzip-compressed WARC archive."""
    # NOTE(review): an uncompressed ``.warc`` file does not match and is
    # therefore handed to ``annotate_file`` -- confirm whether
    # ``annotate_warc`` accepts uncompressed archives before widening this.
    return path.suffix == '.gz' and 'warc' in path.stem.lower()


def _collect_sessions(
    annotator: "AgenticAnnotator",
    input_path: Path,
    args: argparse.Namespace,
) -> "List[AnnotationSession]":
    """Run the annotator over *input_path* and return the resulting sessions.

    Dispatches on the kind of input: WARC archive, single file, recursive
    mirror directory, or the ``*.html`` files directly inside a directory.
    Returns an empty list when the path is neither a file nor a directory.
    """
    if input_path.is_file():
        if _is_warc_archive(input_path):
            if args.verbose:
                print(f"Processing WARC archive: {input_path}")
            return annotator.annotate_warc(input_path)
        if args.verbose:
            print(f"Processing HTML file: {input_path}")
        return [annotator.annotate_file(input_path, source_url=args.source_url)]

    if input_path.is_dir():
        if args.recursive:
            if args.verbose:
                print(f"Processing directory recursively: {input_path}")
            return annotator.annotate_mirror_directory(input_path)
        if args.verbose:
            print(f"Processing directory: {input_path}")
        # Path.glob yields entries in arbitrary filesystem order; sort so the
        # processing order (and the printed report) is reproducible.
        return [
            annotator.annotate_file(html_file, source_url=args.source_url)
            for html_file in sorted(input_path.glob("*.html"))
        ]

    return []


def _print_session_summary(session: "AnnotationSession", summary: dict) -> None:
    """Print the per-session statistics contained in *summary*."""
    print(f"\n--- Session {session.session_id[:8]} ---")
    print(f" Source: {summary['source_file'] or summary['source_url']}")
    print(f" Entity claims: {summary['entity_claims']}")
    if summary['entity_counts']:
        for hypernym, count in sorted(summary['entity_counts'].items()):
            print(f" {hypernym}: {count}")
    print(f" Layout claims: {summary['layout_claims']}")
    if summary['layout_counts']:
        for region, count in sorted(summary['layout_counts'].items()):
            print(f" {region}: {count}")
    print(f" Aggregate claims: {summary['aggregate_claims']}")
    if summary['errors']:
        print(f" Errors: {len(summary['errors'])}")


def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: annotate the input and export or summarize.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behavior.

    Exits with status 1 (via ``sys.exit``) when the input path does not exist
    or when no document was processed.
    """
    args = _build_parser().parse_args(argv)

    # Validate input before constructing the annotator.
    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
        sys.exit(1)

    config = AnnotationConfig(
        annotate_entities=not args.no_entities,
        annotate_layout=not args.no_layout,
        create_aggregates=not args.no_aggregates,
        use_llm_entities=args.use_llm,
        llm_model=args.llm_model,
        output_format=args.format,
        output_dir=args.output_dir,
    )
    annotator = AgenticAnnotator(config)

    sessions = _collect_sessions(annotator, input_path, args)
    if not sessions:
        print("No documents processed.", file=sys.stderr)
        sys.exit(1)
    print(f"\nProcessed {len(sessions)} document(s)")

    # An explicit --output can only name one file, so it is honored only for
    # a single session. Say so instead of dropping the option silently.
    single_session = len(sessions) == 1
    if args.output and not single_session and not args.summary:
        print(
            f"Warning: --output is ignored because {len(sessions)} documents "
            "were processed; output paths are auto-generated.",
            file=sys.stderr,
        )

    for session in sessions:
        if args.summary:
            # The summary is only needed in this mode, so compute it here.
            _print_session_summary(session, annotator.get_session_summary(session))
        else:
            output_path = annotator.export_session(
                session,
                output_path=args.output if single_session else None,
            )
            if args.verbose:
                print(f" Exported: {output_path}")

    # Final totals across all sessions.
    total_entities = sum(len(s.entity_claims) for s in sessions)
    total_layout = sum(len(s.layout_claims) for s in sessions)
    total_aggregates = sum(len(s.aggregate_claims) for s in sessions)
    print(f"\nTotal: {total_entities} entities, {total_layout} layout regions, {total_aggregates} aggregates")
# Run the CLI only when this file is executed as a script, so importing the
# module does not trigger argument parsing.
if __name__ == "__main__":
    main()