"""
Agentic Annotator - Main orchestration for multi-pass document annotation.

Combines entity recognition and layout analysis with optional LLM-based
enhancement using Z.AI GLM4.6 or other models.

This is the primary interface for the annotation system.
"""

import json
import os
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import yaml

from .base import (
    AnnotationSession,
    AggregateClaim,
    EntityClaim,
    LayoutClaim,
    Provenance,
)
from .html_parser import HTMLParser, HTMLDocument
from .entity_annotator import EntityAnnotator, create_heritage_entity_annotator
from .layout_annotator import LayoutAnnotator, create_heritage_layout_annotator


@dataclass
class AnnotationConfig:
    """Configuration for agentic annotation."""

    # Annotation options: which passes annotate_document() runs
    annotate_entities: bool = True
    annotate_layout: bool = True
    create_aggregates: bool = True

    # Entity annotator options
    use_llm_entities: bool = False
    entity_patterns: Optional[List] = None

    # Layout annotator options
    include_all_elements: bool = False
    layout_patterns: Optional[List] = None

    # LLM options
    llm_model: str = "glm-4-flash"
    llm_api_key: Optional[str] = None
    llm_base_url: Optional[str] = None

    # Output options
    output_format: str = "yaml"  # yaml, json, jsonld
    output_dir: Optional[str] = None

    # Session options
    session_name: Optional[str] = None


class AgenticAnnotator:
    """
    Main agentic annotator for heritage documents.

    Orchestrates multiple annotation passes:
    1. Layout analysis (document structure)
    2. Entity recognition (named entities)
    3. Aggregate claim creation (linking entities to regions)
    4. Optional LLM enhancement pass

    Supports multi-session annotation where different agents/models
    can annotate the same document in separate passes.

    NOTE(review): in the code visible in this module, ``use_llm_entities``
    only controls the ``model_id`` recorded on the session and the stored
    config; no LLM call is issued from this class itself.
    """

    def __init__(
        self,
        config: Optional[AnnotationConfig] = None,
    ):
        """
        Initialize agentic annotator.

        Args:
            config: Annotation configuration; a default AnnotationConfig
                is used when omitted.
        """
        self.config = config or AnnotationConfig()

        # Initialize sub-annotators
        self.html_parser = HTMLParser()
        self.entity_annotator = create_heritage_entity_annotator()
        self.layout_annotator = create_heritage_layout_annotator()

        # Session history: every session produced by annotate_document()
        self.sessions: List[AnnotationSession] = []

    def annotate_file(
        self,
        file_path: Union[str, Path],
        source_url: Optional[str] = None,
    ) -> AnnotationSession:
        """
        Annotate an HTML file.

        Args:
            file_path: Path to HTML file
            source_url: Optional source URL for provenance; when given it
                overrides the parsed document's ``source_url``.

        Returns:
            AnnotationSession with all claims
        """
        file_path = Path(file_path)

        # Parse HTML
        document = self.html_parser.parse_file(file_path)
        if source_url:
            document.source_url = source_url

        return self.annotate_document(document)

    def annotate_document(
        self,
        document: HTMLDocument,
    ) -> AnnotationSession:
        """
        Annotate a parsed HTML document.

        Runs the passes enabled in the configuration, marks the session
        complete and records it in ``self.sessions``.

        Args:
            document: Parsed HTMLDocument

        Returns:
            AnnotationSession with all claims
        """
        # Create main session
        session = AnnotationSession(
            agent_name="AgenticAnnotator",
            agent_version="1.0.0",
            model_id=self.config.llm_model if self.config.use_llm_entities else None,
            source_url=document.source_url,
            source_file=document.source_file,
            document_hash=document.content_hash,
            config={
                "annotate_entities": self.config.annotate_entities,
                "annotate_layout": self.config.annotate_layout,
                "use_llm_entities": self.config.use_llm_entities,
            },
        )

        # Pass 1: Layout analysis
        if self.config.annotate_layout:
            self._annotate_layout(document, session)

        # Pass 2: Entity recognition
        if self.config.annotate_entities:
            self._annotate_entities(document, session)

        # Pass 3: Create aggregate claims
        if self.config.create_aggregates:
            self._create_aggregates(session)

        # Complete session
        session.complete()
        self.sessions.append(session)

        return session

    def _annotate_layout(
        self,
        document: HTMLDocument,
        session: AnnotationSession,
    ) -> None:
        """Run layout annotation pass and copy its claims into ``session``."""
        layout_session = self.layout_annotator.annotate(document)

        # Copy claims to main session
        for claim in layout_session.layout_claims:
            session.add_layout_claim(claim)

    def _annotate_entities(
        self,
        document: HTMLDocument,
        session: AnnotationSession,
    ) -> None:
        """Run entity annotation pass and copy its claims into ``session``."""
        entity_session = self.entity_annotator.annotate(document)

        # Copy claims to main session
        for claim in entity_session.entity_claims:
            session.add_entity_claim(claim)

    def _create_aggregates(
        self,
        session: AnnotationSession,
    ) -> None:
        """
        Create aggregate claims linking entities to layout regions.

        An entity is attached to a layout claim when the entity's
        provenance path is exactly equal to the layout claim's XPath.
        One AggregateClaim is added per layout region that received at
        least one entity; its confidence is the minimum over the layout
        claim and all of its entities (claims without provenance count
        as 1.0).
        """
        # XPath -> layout claim; if several claims share an XPath the
        # last one wins.
        layout_by_xpath: Dict[str, LayoutClaim] = {
            claim.xpath: claim for claim in session.layout_claims if claim.xpath
        }

        # claim_id -> layout claim, first occurrence wins. This replaces a
        # linear scan over all layout claims per region while returning
        # the same claim that scan would have found.
        layout_by_id: Dict[str, LayoutClaim] = {}
        for claim in session.layout_claims:
            layout_by_id.setdefault(claim.claim_id, claim)

        # Group entity claims by their containing layout region
        entities_by_region: Dict[str, List[EntityClaim]] = {}
        for entity in session.entity_claims:
            if not (entity.provenance and entity.provenance.path):
                continue
            layout_claim = layout_by_xpath.get(entity.provenance.path)
            if layout_claim:
                entities_by_region.setdefault(layout_claim.claim_id, []).append(entity)

        # Create aggregate claims
        for region_id, entities in entities_by_region.items():
            layout_claim = layout_by_id.get(region_id)
            if not (layout_claim and entities):
                continue

            layout_confidence = (
                layout_claim.provenance.confidence if layout_claim.provenance else 1.0
            )
            entity_confidence = min(
                e.provenance.confidence if e.provenance else 1.0 for e in entities
            )
            aggregate = AggregateClaim(
                layout_claim=layout_claim,
                entity_claims=entities,
                provenance=Provenance.create(
                    namespace="glam",
                    path=layout_claim.xpath or "",
                    agent="AgenticAnnotator/1.0.0",
                    confidence=min(layout_confidence, entity_confidence),
                ),
            )
            session.add_aggregate_claim(aggregate)

    def annotate_warc(
        self,
        warc_path: Union[str, Path],
    ) -> List[AnnotationSession]:
        """
        Annotate all HTML documents in a WARC archive.

        Args:
            warc_path: Path to WARC file

        Returns:
            List of AnnotationSession objects
        """
        documents = self.html_parser.parse_warc(warc_path)
        return [self.annotate_document(doc) for doc in documents]

    def annotate_mirror_directory(
        self,
        mirror_path: Union[str, Path],
    ) -> List[AnnotationSession]:
        """
        Annotate all HTML files in a mirror directory.

        Args:
            mirror_path: Path to mirror directory

        Returns:
            List of AnnotationSession objects
        """
        documents = self.html_parser.parse_mirror_directory(mirror_path)
        return [self.annotate_document(doc) for doc in documents]

    def export_session(
        self,
        session: AnnotationSession,
        output_path: Optional[Union[str, Path]] = None,
    ) -> str:
        """
        Export annotation session to file.

        The serialization format follows ``config.output_format``: YAML
        when it equals "yaml", JSON for any other value. This also holds
        when ``output_path`` is given explicitly; its extension is not
        inspected.

        Args:
            session: Session to export
            output_path: Output file path (auto-generated under
                ``config.output_dir``, or the current directory, if not
                provided)

        Returns:
            Path to exported file
        """
        data = session.to_dict()
        as_yaml = self.config.output_format == "yaml"

        if output_path is None:
            output_dir = Path(self.config.output_dir or ".")
            # Local-time timestamp keeps generated names unique per second.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"annotations_{session.session_id[:8]}_{timestamp}"
            suffix = ".yaml" if as_yaml else ".json"
            output_path = output_dir / f"{filename}{suffix}"

        output_path = Path(output_path)
        # Create the target directory for explicit paths too, not only for
        # auto-generated ones, so a fresh output location does not fail.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            if as_yaml:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True)
            else:
                json.dump(data, f, indent=2, ensure_ascii=False)

        return str(output_path)

    def get_session_summary(
        self,
        session: AnnotationSession,
    ) -> Dict[str, Any]:
        """
        Get summary statistics for a session.

        Returns:
            Dict with session identifiers and timestamps, total claim
            counts, per-hypernym entity counts, per-region layout counts
            (claims lacking a hypernym/region are counted as "UNKNOWN"),
            and the session's errors.
        """
        entity_counts: Dict[str, int] = {}
        for claim in session.entity_claims:
            hypernym = claim.hypernym.value if claim.hypernym else "UNKNOWN"
            entity_counts[hypernym] = entity_counts.get(hypernym, 0) + 1

        layout_counts: Dict[str, int] = {}
        for claim in session.layout_claims:
            region = claim.region.value if claim.region else "UNKNOWN"
            layout_counts[region] = layout_counts.get(region, 0) + 1

        return {
            "session_id": session.session_id,
            "source_url": session.source_url,
            "source_file": session.source_file,
            "started_at": session.started_at,
            "completed_at": session.completed_at,
            "entity_claims": len(session.entity_claims),
            "layout_claims": len(session.layout_claims),
            "aggregate_claims": len(session.aggregate_claims),
            "entity_counts": entity_counts,
            "layout_counts": layout_counts,
            "errors": session.errors,
        }


def create_annotator(
    annotate_entities: bool = True,
    annotate_layout: bool = True,
    use_llm: bool = False,
    output_format: str = "yaml",
    output_dir: Optional[str] = None,
) -> AgenticAnnotator:
    """
    Factory function to create an agentic annotator.

    Args:
        annotate_entities: Enable entity recognition
        annotate_layout: Enable layout analysis
        use_llm: Use LLM for enhanced recognition (stored as
            ``use_llm_entities`` on the configuration)
        output_format: Output format (yaml, json)
        output_dir: Output directory for exports

    Returns:
        Configured AgenticAnnotator
    """
    config = AnnotationConfig(
        annotate_entities=annotate_entities,
        annotate_layout=annotate_layout,
        use_llm_entities=use_llm,
        output_format=output_format,
        output_dir=output_dir,
    )
    return AgenticAnnotator(config)