""" Layout Annotator for GLAM documents. Analyzes document structure and creates layout region claims following the DOC hypernym from GLAM-NER v1.7.0 Section 15. Regions include: - Primary: HDR, PAR, SEN, LST, TBL - Media: GAL, MAP, AUD, VID, EMB - Navigation: NAV, TOC, IDX - Front/Back matter: TTP, DED, COL, BIB, APP, GLO - Commercial: ADV, LOG NO COMPUTER VISION - uses text/structure patterns only. """ import re from dataclasses import dataclass from typing import Any, Dict, List, Optional, Set, Tuple from .base import ( BaseAnnotator, AnnotationSession, LayoutClaim, LayoutRegion, SemanticRole, Provenance, ) from .html_parser import HTMLDocument, HTMLElement # ============================================================================= # LAYOUT DETECTION PATTERNS # ============================================================================= @dataclass class LayoutPattern: """Pattern for layout region detection.""" region: LayoutRegion semantic_role: SemanticRole # Detection criteria (any match triggers) html_tags: Optional[Set[str]] = None css_classes: Optional[Set[str]] = None css_ids: Optional[Set[str]] = None text_patterns: Optional[List[str]] = None # Additional constraints min_text_length: int = 0 max_text_length: int = 100000 priority: int = 5 def __post_init__(self): if self.html_tags is None: self.html_tags = set() if self.css_classes is None: self.css_classes = set() if self.css_ids is None: self.css_ids = set() if self.text_patterns is None: self.text_patterns = [] # Layout patterns for heritage documents LAYOUT_PATTERNS: List[LayoutPattern] = [ # Headings (H1-H6) LayoutPattern( region=LayoutRegion.HDR, semantic_role=SemanticRole.STRC, html_tags={'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}, priority=10, ), # Paragraphs LayoutPattern( region=LayoutRegion.PAR, semantic_role=SemanticRole.PRIM, html_tags={'p'}, min_text_length=10, priority=5, ), # Lists LayoutPattern( region=LayoutRegion.LST, semantic_role=SemanticRole.PRIM, html_tags={'ul', 'ol', 'dl'}, priority=7, ), LayoutPattern( region=LayoutRegion.LIT, semantic_role=SemanticRole.PRIM, html_tags={'li', 'dt', 'dd'}, priority=6, ), # Tables LayoutPattern( region=LayoutRegion.TBL, semantic_role=SemanticRole.PRIM, html_tags={'table'}, priority=8, ), # Navigation LayoutPattern( region=LayoutRegion.NAV, semantic_role=SemanticRole.NAV, html_tags={'nav'}, css_classes={'navigation', 'nav', 'menu', 'navbar', 'sidebar-nav'}, priority=9, ), # Figures and captions LayoutPattern( region=LayoutRegion.FIG, semantic_role=SemanticRole.VIS, html_tags={'figure', 'img'}, priority=7, ), LayoutPattern( region=LayoutRegion.CAP, semantic_role=SemanticRole.SUPP, html_tags={'figcaption', 'caption'}, priority=8, ), # Block quotes LayoutPattern( region=LayoutRegion.BLK, semantic_role=SemanticRole.PRIM, html_tags={'blockquote', 'q'}, priority=7, ), # Sidebar content LayoutPattern( region=LayoutRegion.SDB, semantic_role=SemanticRole.SUPP, html_tags={'aside'}, css_classes={'sidebar', 'aside', 'infobox', 'callout'}, priority=8, ), # Footnotes LayoutPattern( region=LayoutRegion.FTN, semantic_role=SemanticRole.REF, css_classes={'footnote', 'footnotes', 'endnote', 'fn'}, css_ids={'footnotes', 'endnotes'}, priority=8, ), # Bibliography LayoutPattern( region=LayoutRegion.BIB, semantic_role=SemanticRole.BACK, css_classes={'bibliography', 'references', 'literature', 'literatuur', 'bronnen'}, text_patterns=[r'(?i)bibliografie|bibliography|references|literatuur|bronnen'], priority=9, ), # Table of contents LayoutPattern( region=LayoutRegion.TOC, semantic_role=SemanticRole.NAV, css_classes={'toc', 'table-of-contents', 'inhoud', 'inhoudsopgave'}, css_ids={'toc', 'table-of-contents'}, text_patterns=[r'(?i)inhoudsopgave|table of contents|contents'], priority=9, ), # Embedded content LayoutPattern( region=LayoutRegion.EMB, semantic_role=SemanticRole.INT, html_tags={'iframe', 'embed', 'object'}, priority=8, ), # Gallery/image collections LayoutPattern( region=LayoutRegion.GAL, semantic_role=SemanticRole.VIS, css_classes={'gallery', 'image-gallery', 'photo-gallery', 'carousel', 'slider'}, priority=8, ), # Maps LayoutPattern( region=LayoutRegion.MAP, semantic_role=SemanticRole.SPAT, css_classes={'map', 'leaflet', 'mapbox', 'google-map', 'kaart'}, priority=8, ), # Footer (often contains colophon) LayoutPattern( region=LayoutRegion.COL, semantic_role=SemanticRole.BACK, html_tags={'footer'}, css_classes={'footer', 'colophon', 'site-info'}, priority=6, ), # Header (site header, not headings) LayoutPattern( region=LayoutRegion.TTP, semantic_role=SemanticRole.FRNT, html_tags={'header'}, css_classes={'site-header', 'page-header', 'masthead'}, priority=6, ), # Advertisements LayoutPattern( region=LayoutRegion.ADV, semantic_role=SemanticRole.COMM, css_classes={'ad', 'advertisement', 'advertentie', 'sponsor', 'banner-ad'}, priority=7, ), # Logos LayoutPattern( region=LayoutRegion.LOG, semantic_role=SemanticRole.COMM, css_classes={'logo', 'site-logo', 'brand'}, priority=7, ), # Metadata blocks LayoutPattern( region=LayoutRegion.MTD, semantic_role=SemanticRole.META, css_classes={'metadata', 'meta-info', 'post-meta', 'entry-meta'}, priority=7, ), ] class LayoutAnnotator(BaseAnnotator): """ Layout annotator for heritage documents. Analyzes document structure using HTML semantics and CSS patterns (NO computer vision). Creates layout region claims following GLAM-NER v1.7.0 DOC hypernym taxonomy. """ def __init__( self, patterns: Optional[List[LayoutPattern]] = None, include_all_elements: bool = False, ): """ Initialize layout annotator. Args: patterns: Custom layout patterns (defaults to LAYOUT_PATTERNS) include_all_elements: Annotate all elements (not just matched ones) """ super().__init__( agent_name="LayoutAnnotator", agent_version="1.0.0", ) self.patterns = patterns or LAYOUT_PATTERNS self.include_all_elements = include_all_elements # Pre-compile text patterns (use id as key since dataclass not hashable) self._compiled_text_patterns: Dict[int, List[re.Pattern]] = {} for pattern in self.patterns: if pattern.text_patterns: self._compiled_text_patterns[id(pattern)] = [ re.compile(p) for p in pattern.text_patterns ] def annotate( self, document: Any, session: Optional[AnnotationSession] = None, ) -> AnnotationSession: """ Annotate layout regions in a document. Args: document: HTMLDocument to annotate session: Existing session to add claims to Returns: AnnotationSession with layout claims """ if not isinstance(document, HTMLDocument): raise ValueError("LayoutAnnotator requires HTMLDocument") if session is None: session = self.create_session( source_url=document.source_url, source_file=document.source_file, ) # Track parent-child relationships claim_map: Dict[str, LayoutClaim] = {} for element in document.elements: claim = self._classify_element(element, document) if claim: # Link to parent if element.parent_xpath and element.parent_xpath in claim_map: parent_claim = claim_map[element.parent_xpath] claim.parent_claim_id = parent_claim.claim_id parent_claim.child_claim_ids.append(claim.claim_id) claim_map[element.xpath] = claim session.add_layout_claim(claim) return session def _classify_element( self, element: HTMLElement, document: HTMLDocument, ) -> Optional[LayoutClaim]: """ Classify an HTML element as a layout region. Args: element: HTML element to classify document: Parent document Returns: LayoutClaim if classified, None otherwise """ best_match: Optional[Tuple[LayoutPattern, float]] = None for pattern in self.patterns: score = self._match_pattern(element, pattern) if score > 0: if best_match is None or score > best_match[1]: best_match = (pattern, score) if best_match is None: if self.include_all_elements: # Default to paragraph for unmatched block elements return self._create_default_claim(element, document) return None pattern, confidence = best_match # Create provenance provenance = self.create_provenance( namespace="glam-doc", path=element.xpath, confidence=min(confidence / 10, 1.0), # Normalize to 0-1 source_url=document.source_url, source_file=document.source_file, ) # Create claim claim = LayoutClaim( region=pattern.region, semantic_role=pattern.semantic_role, xpath=element.xpath, text_content=element.text_content[:500] if element.text_content else None, start_offset=element.start_offset, end_offset=element.end_offset, provenance=provenance, ) # Add heading level if applicable if pattern.region == LayoutRegion.HDR and element.heading_level: claim.heading_level = element.heading_level return claim def _match_pattern( self, element: HTMLElement, pattern: LayoutPattern, ) -> float: """ Calculate match score between element and pattern. Returns score >= 0 (0 = no match, higher = better match) """ score = 0.0 # Check HTML tag if pattern.html_tags and element.tag.lower() in pattern.html_tags: score += 5.0 # Check CSS classes elem_classes = set(element.attributes.get('class', '').lower().split()) if pattern.css_classes and elem_classes & pattern.css_classes: score += 3.0 # Check CSS IDs elem_id = element.attributes.get('id', '').lower() if pattern.css_ids and elem_id in pattern.css_ids: score += 3.0 # Check text patterns if id(pattern) in self._compiled_text_patterns and element.text_content: for regex in self._compiled_text_patterns[id(pattern)]: if regex.search(element.text_content): score += 2.0 break # Check text length constraints text_len = len(element.text_content) if element.text_content else 0 if text_len < pattern.min_text_length or text_len > pattern.max_text_length: return 0.0 # Fails constraint # Apply priority multiplier if score > 0: score += pattern.priority * 0.1 return score def _create_default_claim( self, element: HTMLElement, document: HTMLDocument, ) -> Optional[LayoutClaim]: """Create default claim for unmatched elements.""" if not element.is_block: return None provenance = self.create_provenance( namespace="glam-doc", path=element.xpath, confidence=0.3, # Low confidence for defaults source_url=document.source_url, source_file=document.source_file, ) # Determine default region if element.is_heading: region = LayoutRegion.HDR semantic_role = SemanticRole.STRC else: region = LayoutRegion.PAR semantic_role = SemanticRole.PRIM return LayoutClaim( region=region, semantic_role=semantic_role, xpath=element.xpath, text_content=element.text_content[:500] if element.text_content else None, start_offset=element.start_offset, end_offset=element.end_offset, heading_level=element.heading_level, provenance=provenance, ) def add_pattern(self, pattern: LayoutPattern): """Add a new layout pattern.""" self.patterns.append(pattern) if pattern.text_patterns: self._compiled_text_patterns[id(pattern)] = [ re.compile(p) for p in pattern.text_patterns ] def create_heritage_layout_annotator() -> LayoutAnnotator: """ Create a layout annotator optimized for heritage documents. Includes patterns for: - Standard document structure (headings, paragraphs, lists, tables) - Heritage-specific regions (galleries, maps, bibliographies) - Dutch language patterns (inhoudsopgave, literatuur, etc.) """ return LayoutAnnotator( patterns=LAYOUT_PATTERNS, include_all_elements=False, )