380 lines
12 KiB
Python
380 lines
12 KiB
Python
"""
|
|
Agentic Annotator - Main orchestration for multi-pass document annotation.
|
|
|
|
Combines entity recognition and layout analysis with optional LLM-based
|
|
enhancement using Z.AI GLM4.6 or other models.
|
|
|
|
This is the primary interface for the annotation system.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import yaml
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Union
|
|
|
|
from .base import (
|
|
AnnotationSession,
|
|
AggregateClaim,
|
|
EntityClaim,
|
|
LayoutClaim,
|
|
Provenance,
|
|
)
|
|
from .html_parser import HTMLParser, HTMLDocument
|
|
from .entity_annotator import EntityAnnotator, create_heritage_entity_annotator
|
|
from .layout_annotator import LayoutAnnotator, create_heritage_layout_annotator
|
|
|
|
|
|
@dataclass
class AnnotationConfig:
    """Settings bundle consumed by the agentic annotator.

    Every field has a default, so ``AnnotationConfig()`` yields a config
    with the layout, entity and aggregate passes switched on, LLM use
    switched off, and YAML as the export format.
    """

    # --- Which passes to run -------------------------------------------
    annotate_entities: bool = True
    annotate_layout: bool = True
    create_aggregates: bool = True

    # --- Entity annotator ----------------------------------------------
    use_llm_entities: bool = False
    entity_patterns: Optional[List] = None

    # --- Layout annotator ----------------------------------------------
    include_all_elements: bool = False
    layout_patterns: Optional[List] = None

    # --- LLM backend ---------------------------------------------------
    llm_model: str = "glm-4-flash"
    llm_api_key: Optional[str] = None
    llm_base_url: Optional[str] = None

    # --- Export --------------------------------------------------------
    # Documented values: "yaml", "json", "jsonld".
    output_format: str = "yaml"
    # Directory used for auto-named exports; None leaves the choice to
    # the exporter.
    output_dir: Optional[str] = None

    # --- Session -------------------------------------------------------
    session_name: Optional[str] = None
|
|
|
|
|
|
class AgenticAnnotator:
    """
    Main agentic annotator for heritage documents.

    Orchestrates multiple annotation passes:
    1. Layout analysis (document structure)
    2. Entity recognition (named entities)
    3. Aggregate claim creation (linking entities to regions)
    4. Optional LLM enhancement pass

    Supports multi-session annotation where different agents/models
    can annotate the same document in separate passes.

    NOTE(review): no LLM pass is executed by the code in this class; the
    LLM settings are only recorded on the session (``model_id`` and the
    ``use_llm_entities`` flag).
    """

    def __init__(
        self,
        config: Optional[AnnotationConfig] = None,
    ):
        """
        Initialize agentic annotator.

        Args:
            config: Annotation configuration; a default
                ``AnnotationConfig()`` is used when omitted.
        """
        self.config = config or AnnotationConfig()

        # Initialize sub-annotators.
        # NOTE(review): the factories are called without arguments, so
        # entity_patterns / layout_patterns / include_all_elements from the
        # config are not forwarded here - confirm whether that is intended.
        self.html_parser = HTMLParser()
        self.entity_annotator = create_heritage_entity_annotator()
        self.layout_annotator = create_heritage_layout_annotator()

        # Session history: every session produced by annotate_document().
        self.sessions: List[AnnotationSession] = []

    def annotate_file(
        self,
        file_path: Union[str, Path],
        source_url: Optional[str] = None,
    ) -> AnnotationSession:
        """
        Annotate an HTML file.

        Args:
            file_path: Path to HTML file
            source_url: Optional source URL for provenance; when given it
                overrides the parsed document's ``source_url``

        Returns:
            AnnotationSession with all claims
        """
        file_path = Path(file_path)

        # Parse HTML
        document = self.html_parser.parse_file(file_path)
        if source_url:
            document.source_url = source_url

        return self.annotate_document(document)

    def annotate_document(
        self,
        document: HTMLDocument,
    ) -> AnnotationSession:
        """
        Annotate a parsed HTML document.

        Runs the enabled passes in order (layout, entities, aggregates),
        marks the session complete and appends it to ``self.sessions``.

        Args:
            document: Parsed HTMLDocument

        Returns:
            AnnotationSession with all claims
        """
        # Create main session
        session = AnnotationSession(
            agent_name="AgenticAnnotator",
            agent_version="1.0.0",
            # The model id is only recorded when LLM entities are enabled.
            model_id=self.config.llm_model if self.config.use_llm_entities else None,
            source_url=document.source_url,
            source_file=document.source_file,
            document_hash=document.content_hash,
            config={
                "annotate_entities": self.config.annotate_entities,
                "annotate_layout": self.config.annotate_layout,
                "use_llm_entities": self.config.use_llm_entities,
            },
        )

        # Pass 1: Layout analysis
        if self.config.annotate_layout:
            self._annotate_layout(document, session)

        # Pass 2: Entity recognition
        if self.config.annotate_entities:
            self._annotate_entities(document, session)

        # Pass 3: Create aggregate claims (needs the results of 1 and 2)
        if self.config.create_aggregates:
            self._create_aggregates(session)

        # Complete session
        session.complete()
        self.sessions.append(session)

        return session

    def _annotate_layout(
        self,
        document: HTMLDocument,
        session: AnnotationSession,
    ) -> None:
        """Run the layout pass and copy its layout claims into ``session``."""
        layout_session = self.layout_annotator.annotate(document)

        # Copy claims to main session
        for claim in layout_session.layout_claims:
            session.add_layout_claim(claim)

    def _annotate_entities(
        self,
        document: HTMLDocument,
        session: AnnotationSession,
    ) -> None:
        """Run the entity pass and copy its entity claims into ``session``."""
        entity_session = self.entity_annotator.annotate(document)

        # Copy claims to main session
        for claim in entity_session.entity_claims:
            session.add_entity_claim(claim)

    def _create_aggregates(
        self,
        session: AnnotationSession,
    ) -> None:
        """
        Create aggregate claims linking entities to layout regions.

        An entity is attached to a layout claim when the entity's
        provenance path equals that layout claim's XPath exactly. One
        aggregate is added to ``session`` per layout region that received
        at least one entity.

        Args:
            session: Session whose layout and entity claims are linked;
                aggregates are added to it in place.
        """
        # Index layout claims once so no later step has to re-scan the list.
        # - by XPath: the last claim with a given XPath wins
        # - by claim id: the first claim with a given id wins
        layout_by_xpath: Dict[str, LayoutClaim] = {}
        layout_by_id: Dict[str, LayoutClaim] = {}
        for claim in session.layout_claims:
            layout_by_id.setdefault(claim.claim_id, claim)
            if claim.xpath:
                layout_by_xpath[claim.xpath] = claim

        # Group entity claims by their containing layout region
        entities_by_region: Dict[str, List[EntityClaim]] = {}
        for entity in session.entity_claims:
            if not (entity.provenance and entity.provenance.path):
                continue

            # Find containing layout region (exact XPath match only)
            region = layout_by_xpath.get(entity.provenance.path)
            if not region:
                continue

            entities_by_region.setdefault(region.claim_id, []).append(entity)

        # Create aggregate claims
        for region_id, entities in entities_by_region.items():
            layout_claim = layout_by_id.get(region_id)
            if not (layout_claim and entities):
                continue

            # The aggregate is only as trustworthy as its weakest member;
            # claims without provenance count as fully confident (1.0).
            layout_confidence = (
                layout_claim.provenance.confidence if layout_claim.provenance else 1.0
            )
            entity_confidence = min(
                e.provenance.confidence if e.provenance else 1.0 for e in entities
            )

            aggregate = AggregateClaim(
                layout_claim=layout_claim,
                entity_claims=entities,
                provenance=Provenance.create(
                    namespace="glam",
                    path=layout_claim.xpath or "",
                    agent="AgenticAnnotator/1.0.0",
                    confidence=min(layout_confidence, entity_confidence),
                ),
            )
            session.add_aggregate_claim(aggregate)

    def annotate_warc(
        self,
        warc_path: Union[str, Path],
    ) -> List[AnnotationSession]:
        """
        Annotate all HTML documents in a WARC archive.

        Args:
            warc_path: Path to WARC file

        Returns:
            List of AnnotationSession objects, one per parsed document
        """
        documents = self.html_parser.parse_warc(warc_path)
        return [self.annotate_document(doc) for doc in documents]

    def annotate_mirror_directory(
        self,
        mirror_path: Union[str, Path],
    ) -> List[AnnotationSession]:
        """
        Annotate all HTML files in a mirror directory.

        Args:
            mirror_path: Path to mirror directory

        Returns:
            List of AnnotationSession objects, one per parsed document
        """
        documents = self.html_parser.parse_mirror_directory(mirror_path)
        return [self.annotate_document(doc) for doc in documents]

    def export_session(
        self,
        session: AnnotationSession,
        output_path: Optional[Union[str, Path]] = None,
    ) -> str:
        """
        Export annotation session to file.

        The session is serialised with ``session.to_dict()`` and written
        as YAML when ``config.output_format`` is ``"yaml"``; any other
        value is written as JSON. The target directory is created when it
        does not exist, for both auto-generated and explicit paths.

        Args:
            session: Session to export
            output_path: Output file path. When omitted, a file named
                ``annotations_<first 8 chars of session id>_<timestamp>``
                is created in ``config.output_dir`` (or the current
                directory).

        Returns:
            Path to exported file
        """
        data = session.to_dict()
        as_yaml = self.config.output_format == "yaml"

        if output_path is None:
            output_dir = Path(self.config.output_dir or ".")
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"annotations_{session.session_id[:8]}_{timestamp}"
            suffix = "yaml" if as_yaml else "json"
            output_path = output_dir / f"{filename}.{suffix}"

        output_path = Path(output_path)
        # Covers the auto-generated directory and explicitly supplied paths
        # alike, so a missing parent directory no longer aborts the export.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            if as_yaml:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True)
            else:
                json.dump(data, f, indent=2, ensure_ascii=False)

        return str(output_path)

    def get_session_summary(
        self,
        session: AnnotationSession,
    ) -> Dict[str, Any]:
        """
        Get summary statistics for a session.

        Args:
            session: Session to summarise

        Returns:
            Dict with the session's identifying fields, the number of
            entity / layout / aggregate claims, per-hypernym and
            per-region counts (claims lacking the attribute are counted
            under ``"UNKNOWN"``), and the session's errors.
        """
        entity_counts: Dict[str, int] = {}
        for claim in session.entity_claims:
            hypernym = claim.hypernym.value if claim.hypernym else "UNKNOWN"
            entity_counts[hypernym] = entity_counts.get(hypernym, 0) + 1

        layout_counts: Dict[str, int] = {}
        for claim in session.layout_claims:
            region = claim.region.value if claim.region else "UNKNOWN"
            layout_counts[region] = layout_counts.get(region, 0) + 1

        return {
            "session_id": session.session_id,
            "source_url": session.source_url,
            "source_file": session.source_file,
            "started_at": session.started_at,
            "completed_at": session.completed_at,
            "entity_claims": len(session.entity_claims),
            "layout_claims": len(session.layout_claims),
            "aggregate_claims": len(session.aggregate_claims),
            "entity_counts": entity_counts,
            "layout_counts": layout_counts,
            "errors": session.errors,
        }
|
|
|
|
|
|
def create_annotator(
    annotate_entities: bool = True,
    annotate_layout: bool = True,
    use_llm: bool = False,
    output_format: str = "yaml",
    output_dir: Optional[str] = None,
) -> AgenticAnnotator:
    """
    Build an AgenticAnnotator from a handful of common settings.

    The arguments are packed into an ``AnnotationConfig``; every config
    field not listed here keeps its dataclass default.

    Args:
        annotate_entities: Enable entity recognition
        annotate_layout: Enable layout analysis
        use_llm: Stored as the config's ``use_llm_entities`` flag
        output_format: Output format (yaml, json)
        output_dir: Output directory for exports

    Returns:
        Configured AgenticAnnotator
    """
    return AgenticAnnotator(
        AnnotationConfig(
            annotate_entities=annotate_entities,
            annotate_layout=annotate_layout,
            use_llm_entities=use_llm,
            output_format=output_format,
            output_dir=output_dir,
        )
    )
|