238 lines
7.1 KiB
Python
238 lines
7.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Agentic Annotator CLI - Annotate heritage documents with entities and layout.
|
|
|
|
Usage:
|
|
python scripts/agentic_annotator.py <input_path> [options]
|
|
|
|
Examples:
|
|
# Annotate a single HTML file
|
|
python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/mirror/www.rhc-eindhoven.nl/index.html
|
|
|
|
# Annotate a WARC archive
|
|
python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/archive.warc.gz
|
|
|
|
# Annotate a mirror directory
|
|
python scripts/agentic_annotator.py data/nde/enriched/entries/web/0576/rhc-eindhoven.nl/mirror/ --recursive
|
|
|
|
# Output as JSON instead of YAML
|
|
python scripts/agentic_annotator.py input.html --format json
|
|
|
|
Based on GLAM-NER v1.7.0-unified Entity Annotation Convention.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Optional
|
|
|
|
# Add src to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from glam_extractor.annotators import (
|
|
AgenticAnnotator,
|
|
AnnotationConfig,
|
|
AnnotationSession,
|
|
HTMLParser,
|
|
create_annotator,
|
|
)
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the annotator command line."""
    parser = argparse.ArgumentParser(
        description="Annotate heritage documents with entities and layout regions.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Annotate HTML file
%(prog)s data/web/page.html

# Annotate WARC archive
%(prog)s data/web/archive.warc.gz

# Annotate directory recursively
%(prog)s data/web/mirror/ --recursive

# Export as JSON
%(prog)s data/web/page.html --format json --output annotations.json
""",
    )

    parser.add_argument(
        "input",
        type=str,
        help="Input file or directory to annotate",
    )
    parser.add_argument(
        "-o", "--output",
        type=str,
        help="Output file path (auto-generated if not specified)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/annotations",
        help="Output directory for annotations (default: data/annotations)",
    )
    parser.add_argument(
        "-f", "--format",
        choices=["yaml", "json", "jsonld"],
        default="yaml",
        help="Output format (default: yaml)",
    )
    parser.add_argument(
        "--recursive", "-r",
        action="store_true",
        help="Process directory recursively",
    )
    parser.add_argument(
        "--no-entities",
        action="store_true",
        help="Skip entity recognition",
    )
    parser.add_argument(
        "--no-layout",
        action="store_true",
        help="Skip layout analysis",
    )
    parser.add_argument(
        "--no-aggregates",
        action="store_true",
        help="Skip aggregate claim creation",
    )
    parser.add_argument(
        "--use-llm",
        action="store_true",
        help="Use LLM for enhanced entity recognition",
    )
    parser.add_argument(
        "--llm-model",
        type=str,
        default="glm-4-flash",
        help="LLM model to use (default: glm-4-flash)",
    )
    parser.add_argument(
        "--source-url",
        type=str,
        help="Source URL for provenance tracking",
    )
    parser.add_argument(
        "--summary",
        action="store_true",
        help="Print summary statistics only",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Verbose output",
    )
    return parser


def _is_warc_archive(path: Path) -> bool:
    """Return True when *path* should be routed to the WARC annotator.

    Keeps the original rule (a ``.gz`` file whose stem mentions "warc") and
    additionally accepts an uncompressed ``.warc`` file, which previously fell
    through to the single-HTML-file branch.
    """
    # NOTE(review): assumes annotate_warc can read uncompressed WARC - confirm.
    if path.suffix.lower() == ".warc":
        return True
    return path.suffix == ".gz" and "warc" in path.stem.lower()


def _collect_sessions(
    annotator: "AgenticAnnotator",
    input_path: Path,
    args: argparse.Namespace,
) -> "List[AnnotationSession]":
    """Annotate *input_path* and return the resulting sessions.

    Dispatches on the kind of input: WARC archive, single file, directory
    (recursive) or directory (top-level ``*.html`` files only). Returns an
    empty list when the path is neither a regular file nor a directory.
    """
    if input_path.is_file():
        if _is_warc_archive(input_path):
            if args.verbose:
                print(f"Processing WARC archive: {input_path}")
            return annotator.annotate_warc(input_path)

        if args.verbose:
            print(f"Processing HTML file: {input_path}")
        return [annotator.annotate_file(input_path, source_url=args.source_url)]

    if input_path.is_dir():
        if args.recursive:
            if args.verbose:
                print(f"Processing directory recursively: {input_path}")
            return annotator.annotate_mirror_directory(input_path)

        if args.verbose:
            print(f"Processing directory: {input_path}")
        # Sort so that processing (and therefore output) order is stable
        # across runs and filesystems; glob() itself gives no ordering.
        return [
            annotator.annotate_file(html_file, source_url=args.source_url)
            for html_file in sorted(input_path.glob("*.html"))
        ]

    return []


def _print_session_summary(session: "AnnotationSession", summary: dict) -> None:
    """Print the per-session statistics block shown by ``--summary``."""
    print(f"\n--- Session {session.session_id[:8]} ---")
    print(f" Source: {summary['source_file'] or summary['source_url']}")
    print(f" Entity claims: {summary['entity_claims']}")
    if summary['entity_counts']:
        for hypernym, count in sorted(summary['entity_counts'].items()):
            print(f" {hypernym}: {count}")
    print(f" Layout claims: {summary['layout_claims']}")
    if summary['layout_counts']:
        for region, count in sorted(summary['layout_counts'].items()):
            print(f" {region}: {count}")
    print(f" Aggregate claims: {summary['aggregate_claims']}")
    if summary['errors']:
        print(f" Errors: {len(summary['errors'])}")


def main(argv: Optional[List[str]] = None) -> None:
    """Run the annotator command-line interface.

    Args:
        argv: Argument list to parse, without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``, which is what
            the script entry point relies on.

    Raises:
        SystemExit: with code 1 when the input path does not exist or no
            document was processed; argparse itself exits with code 2 on
            invalid arguments and 0 for ``--help``.
    """
    args = _build_parser().parse_args(argv)

    # Validate input before constructing the annotator.
    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
        sys.exit(1)

    config = AnnotationConfig(
        annotate_entities=not args.no_entities,
        annotate_layout=not args.no_layout,
        create_aggregates=not args.no_aggregates,
        use_llm_entities=args.use_llm,
        llm_model=args.llm_model,
        output_format=args.format,
        output_dir=args.output_dir,
    )
    annotator = AgenticAnnotator(config)

    sessions = _collect_sessions(annotator, input_path, args)
    if not sessions:
        print("No documents processed.", file=sys.stderr)
        sys.exit(1)

    print(f"\nProcessed {len(sessions)} document(s)")

    # An explicit --output can only name one file, so it is honoured only
    # when exactly one session is exported. Say so instead of dropping it
    # silently.
    explicit_output = args.output if len(sessions) == 1 else None
    if args.output and args.summary:
        print(
            "Warning: --output is ignored with --summary (nothing is exported).",
            file=sys.stderr,
        )
    elif args.output and explicit_output is None:
        print(
            "Warning: --output is ignored when more than one document is "
            "processed; output paths are auto-generated.",
            file=sys.stderr,
        )

    for session in sessions:
        if args.summary:
            _print_session_summary(session, annotator.get_session_summary(session))
        else:
            output_path = annotator.export_session(
                session,
                output_path=explicit_output,
            )
            if args.verbose:
                print(f" Exported: {output_path}")

    # Totals across every processed document.
    total_entities = sum(len(s.entity_claims) for s in sessions)
    total_layout = sum(len(s.layout_claims) for s in sessions)
    total_aggregates = sum(len(s.aggregate_claims) for s in sessions)

    print(f"\nTotal: {total_entities} entities, {total_layout} layout regions, {total_aggregates} aggregates")
|
|
|
|
|
|
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|