# glam/archive/deprecated_heuristic_annotators/agentic_annotator.py
# 2025-12-05 15:30:23 +01:00
#
# 380 lines
# 12 KiB
# Python

"""
Agentic Annotator - Main orchestration for multi-pass document annotation.
Combines entity recognition and layout analysis with optional LLM-based
enhancement using Z.AI GLM4.6 or other models.
This is the primary interface for the annotation system.
"""
import json
import os
import yaml
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from .base import (
AnnotationSession,
AggregateClaim,
EntityClaim,
LayoutClaim,
Provenance,
)
from .html_parser import HTMLParser, HTMLDocument
from .entity_annotator import EntityAnnotator, create_heritage_entity_annotator
from .layout_annotator import LayoutAnnotator, create_heritage_layout_annotator
@dataclass
class AnnotationConfig:
    """Settings bundle that controls one agentic annotation run.

    Attributes:
        annotate_entities: Run the entity recognition pass.
        annotate_layout: Run the layout analysis pass.
        create_aggregates: Link entity claims to layout regions after the
            other passes.
        use_llm_entities: When set, the configured ``llm_model`` is recorded
            as the session's model id.
        entity_patterns: Optional entity pattern list.
            NOTE(review): not read by the visible code in this module.
        include_all_elements: Layout annotator switch.
            NOTE(review): not read by the visible code in this module.
        layout_patterns: Optional layout pattern list.
            NOTE(review): not read by the visible code in this module.
        llm_model: Model identifier for LLM-based annotation.
        llm_api_key: API key for the LLM service.
            NOTE(review): not read by the visible code in this module.
        llm_base_url: Base URL for the LLM service.
            NOTE(review): not read by the visible code in this module.
        output_format: Export format; documented values are ``yaml``,
            ``json`` and ``jsonld``.
        output_dir: Directory for auto-named exports.
        session_name: Optional label for the session.
            NOTE(review): not read by the visible code in this module.
    """

    # --- which passes run ---
    annotate_entities: bool = True
    annotate_layout: bool = True
    create_aggregates: bool = True

    # --- entity annotator ---
    use_llm_entities: bool = False
    entity_patterns: Optional[List] = None

    # --- layout annotator ---
    include_all_elements: bool = False
    layout_patterns: Optional[List] = None

    # --- LLM backend ---
    llm_model: str = "glm-4-flash"
    llm_api_key: Optional[str] = None
    llm_base_url: Optional[str] = None

    # --- export ---
    output_format: str = "yaml"  # yaml, json, jsonld
    output_dir: Optional[str] = None

    # --- session ---
    session_name: Optional[str] = None
class AgenticAnnotator:
    """
    Main agentic annotator for heritage documents.

    Orchestrates multiple annotation passes:
    1. Layout analysis (document structure)
    2. Entity recognition (named entities)
    3. Aggregate claim creation (linking entities to regions)
    4. Optional LLM enhancement pass

    Supports multi-session annotation where different agents/models
    can annotate the same document in separate passes.

    NOTE(review): no LLM call is made by the code in this class; the
    ``use_llm_entities`` option only decides whether ``llm_model`` is
    recorded as the session's ``model_id``.
    """

    def __init__(
        self,
        config: Optional[AnnotationConfig] = None,
    ):
        """
        Initialize agentic annotator.

        Args:
            config: Annotation configuration; a default ``AnnotationConfig``
                is used when omitted.
        """
        self.config = config if config is not None else AnnotationConfig()

        # Sub-annotators come from the heritage factory functions.
        self.html_parser = HTMLParser()
        self.entity_annotator = create_heritage_entity_annotator()
        self.layout_annotator = create_heritage_layout_annotator()

        # Every session returned by annotate_document is also kept here.
        self.sessions: List[AnnotationSession] = []

    def annotate_file(
        self,
        file_path: Union[str, Path],
        source_url: Optional[str] = None,
    ) -> AnnotationSession:
        """
        Annotate an HTML file.

        Args:
            file_path: Path to HTML file
            source_url: Optional source URL for provenance; when given (and
                non-empty) it overrides the parsed document's ``source_url``.

        Returns:
            AnnotationSession with all claims
        """
        document = self.html_parser.parse_file(Path(file_path))
        if source_url:
            document.source_url = source_url
        return self.annotate_document(document)

    def annotate_document(
        self,
        document: HTMLDocument,
    ) -> AnnotationSession:
        """
        Annotate a parsed HTML document.

        Runs the enabled passes in order (layout, entities, aggregates),
        calls ``session.complete()`` and appends the session to
        ``self.sessions``.

        Args:
            document: Parsed HTMLDocument

        Returns:
            AnnotationSession with all claims
        """
        session = AnnotationSession(
            agent_name="AgenticAnnotator",
            agent_version="1.0.0",
            model_id=self.config.llm_model if self.config.use_llm_entities else None,
            source_url=document.source_url,
            source_file=document.source_file,
            document_hash=document.content_hash,
            config={
                "annotate_entities": self.config.annotate_entities,
                "annotate_layout": self.config.annotate_layout,
                "use_llm_entities": self.config.use_llm_entities,
            },
        )

        # Pass 1: Layout analysis
        if self.config.annotate_layout:
            self._annotate_layout(document, session)

        # Pass 2: Entity recognition
        if self.config.annotate_entities:
            self._annotate_entities(document, session)

        # Pass 3: Aggregates need the claims produced by passes 1 and 2.
        if self.config.create_aggregates:
            self._create_aggregates(session)

        session.complete()
        self.sessions.append(session)
        return session

    def _annotate_layout(
        self,
        document: HTMLDocument,
        session: AnnotationSession,
    ):
        """Run the layout annotator and copy its layout claims into ``session``."""
        layout_session = self.layout_annotator.annotate(document)
        for claim in layout_session.layout_claims:
            session.add_layout_claim(claim)

    def _annotate_entities(
        self,
        document: HTMLDocument,
        session: AnnotationSession,
    ):
        """Run the entity annotator and copy its entity claims into ``session``."""
        entity_session = self.entity_annotator.annotate(document)
        for claim in entity_session.entity_claims:
            session.add_entity_claim(claim)

    def _create_aggregates(
        self,
        session: AnnotationSession,
    ):
        """
        Create aggregate claims linking entities to layout regions.

        An entity is attached to a layout claim only when the entity's
        provenance path is exactly equal to that layout claim's XPath; no
        ancestor/descendant matching is attempted. One aggregate is added
        per layout claim that received at least one entity, and its
        confidence is the minimum over the layout claim and its entities.

        Args:
            session: Session whose layout and entity claims are linked; the
                aggregates are added to the same session.
        """
        # Index layout claims once so the loops below are plain dict lookups.
        layout_by_xpath: Dict[str, LayoutClaim] = {}
        layout_by_id: Dict[str, LayoutClaim] = {}
        for claim in session.layout_claims:
            if claim.xpath:
                # Later claims with the same XPath replace earlier ones.
                layout_by_xpath[claim.xpath] = claim
            # First claim wins per id, matching a front-to-back scan.
            layout_by_id.setdefault(claim.claim_id, claim)

        # Group entity claims by the id of the layout claim they sit in.
        entities_by_region: Dict[str, List[EntityClaim]] = {}
        for entity in session.entity_claims:
            provenance = entity.provenance
            if not (provenance and provenance.path):
                continue
            region = layout_by_xpath.get(provenance.path)
            if not region:
                continue
            entities_by_region.setdefault(region.claim_id, []).append(entity)

        for region_id, entities in entities_by_region.items():
            layout_claim = layout_by_id.get(region_id)
            if not (layout_claim and entities):
                continue

            # A layout claim without provenance counts as fully confident.
            layout_confidence = (
                layout_claim.provenance.confidence if layout_claim.provenance else 1.0
            )
            # Grouped entities always carry provenance (filtered above).
            confidence = min(
                layout_confidence,
                min(entity.provenance.confidence for entity in entities),
            )

            aggregate = AggregateClaim(
                layout_claim=layout_claim,
                entity_claims=entities,
                provenance=Provenance.create(
                    namespace="glam",
                    path=layout_claim.xpath or "",
                    agent="AgenticAnnotator/1.0.0",
                    confidence=confidence,
                ),
            )
            session.add_aggregate_claim(aggregate)

    def annotate_warc(
        self,
        warc_path: Union[str, Path],
    ) -> List[AnnotationSession]:
        """
        Annotate all HTML documents in a WARC archive.

        Args:
            warc_path: Path to WARC file

        Returns:
            List of AnnotationSession objects, one per parsed document
        """
        documents = self.html_parser.parse_warc(warc_path)
        return [self.annotate_document(doc) for doc in documents]

    def annotate_mirror_directory(
        self,
        mirror_path: Union[str, Path],
    ) -> List[AnnotationSession]:
        """
        Annotate all HTML files in a mirror directory.

        Args:
            mirror_path: Path to mirror directory

        Returns:
            List of AnnotationSession objects, one per parsed document
        """
        documents = self.html_parser.parse_mirror_directory(mirror_path)
        return [self.annotate_document(doc) for doc in documents]

    def export_session(
        self,
        session: AnnotationSession,
        output_path: Optional[Union[str, Path]] = None,
    ) -> str:
        """
        Export annotation session to file.

        The serialization is chosen by ``config.output_format``: ``"yaml"``
        writes YAML, every other value (including ``"jsonld"``) writes plain
        JSON. The format is not derived from the suffix of ``output_path``.

        Args:
            session: Session to export
            output_path: Output file path. When omitted, a file named
                ``annotations_<first 8 chars of session id>_<timestamp>`` is
                created in ``config.output_dir`` (current directory if unset).

        Returns:
            Path to exported file
        """
        data = session.to_dict()
        as_yaml = self.config.output_format == "yaml"

        if output_path is None:
            # Local wall-clock time, second resolution.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            suffix = ".yaml" if as_yaml else ".json"
            output_path = (
                Path(self.config.output_dir or ".")
                / f"annotations_{session.session_id[:8]}_{timestamp}{suffix}"
            )

        output_path = Path(output_path)
        # Create the target directory for explicit paths too, so exporting to
        # a not-yet-existing folder does not fail with FileNotFoundError.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            if as_yaml:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True)
            else:
                json.dump(data, f, indent=2, ensure_ascii=False)

        return str(output_path)

    @staticmethod
    def _count_by_enum(claims, attr: str) -> Dict[str, int]:
        """Count claims per ``.value`` of the attribute ``attr``; falsy attributes count as "UNKNOWN"."""
        counts: Dict[str, int] = {}
        for claim in claims:
            member = getattr(claim, attr)
            label = member.value if member else "UNKNOWN"
            counts[label] = counts.get(label, 0) + 1
        return counts

    def get_session_summary(
        self,
        session: AnnotationSession,
    ) -> Dict[str, Any]:
        """
        Get summary statistics for a session.

        Args:
            session: Session to summarize

        Returns:
            Dict with the session's identifying fields, the number of entity,
            layout and aggregate claims, per-hypernym and per-region counts,
            and the session's errors.
        """
        return {
            "session_id": session.session_id,
            "source_url": session.source_url,
            "source_file": session.source_file,
            "started_at": session.started_at,
            "completed_at": session.completed_at,
            "entity_claims": len(session.entity_claims),
            "layout_claims": len(session.layout_claims),
            "aggregate_claims": len(session.aggregate_claims),
            "entity_counts": self._count_by_enum(session.entity_claims, "hypernym"),
            "layout_counts": self._count_by_enum(session.layout_claims, "region"),
            "errors": session.errors,
        }
def create_annotator(
    annotate_entities: bool = True,
    annotate_layout: bool = True,
    use_llm: bool = False,
    output_format: str = "yaml",
    output_dir: Optional[str] = None,
) -> AgenticAnnotator:
    """
    Build an AgenticAnnotator from a handful of common options.

    All other AnnotationConfig fields keep their defaults.

    Args:
        annotate_entities: Enable entity recognition
        annotate_layout: Enable layout analysis
        use_llm: Stored as the config's ``use_llm_entities`` flag
        output_format: Output format (yaml, json)
        output_dir: Output directory for exports

    Returns:
        Configured AgenticAnnotator
    """
    # Map the factory's argument names onto the config's field names.
    settings = {
        "annotate_entities": annotate_entities,
        "annotate_layout": annotate_layout,
        "use_llm_entities": use_llm,
        "output_format": output_format,
        "output_dir": output_dir,
    }
    return AgenticAnnotator(AnnotationConfig(**settings))