478 lines
14 KiB
Python
478 lines
14 KiB
Python
"""
Layout Annotator for GLAM documents.

Analyzes document structure and creates layout region claims
following the DOC hypernym from GLAM-NER v1.7.0 Section 15.

Regions include:
- Primary: HDR, PAR, SEN, LST, TBL
- Media: GAL, MAP, AUD, VID, EMB
- Navigation: NAV, TOC, IDX
- Front/Back matter: TTP, DED, COL, BIB, APP, GLO
- Commercial: ADV, LOG

NO COMPUTER VISION - uses text/structure patterns only.
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
|
from .base import (
|
|
BaseAnnotator,
|
|
AnnotationSession,
|
|
LayoutClaim,
|
|
LayoutRegion,
|
|
SemanticRole,
|
|
Provenance,
|
|
)
|
|
from .html_parser import HTMLDocument, HTMLElement
|
|
|
|
|
|
# =============================================================================
|
|
# LAYOUT DETECTION PATTERNS
|
|
# =============================================================================
|
|
|
|
@dataclass
class LayoutPattern:
    """
    Declarative rule that maps an HTML element to a layout region.

    A pattern names the region and semantic role to assign, plus the
    signals (tag names, CSS classes, element ids, text regexes) that
    count as evidence for that region. Collection-valued criteria may
    be omitted or passed as ``None``; they are replaced by empty
    containers after initialization.
    """

    # Region and role assigned to an element that matches this pattern.
    region: LayoutRegion
    semantic_role: SemanticRole

    # Detection criteria; each criterion that matches adds to the score,
    # so a single matching criterion is enough to produce a match.
    html_tags: Optional[Set[str]] = None      # tag names, compared lowercase
    css_classes: Optional[Set[str]] = None    # class tokens, compared lowercase
    css_ids: Optional[Set[str]] = None        # element ids, compared lowercase
    text_patterns: Optional[List[str]] = None  # regex sources searched in the text

    # Hard constraints: an element whose text length falls outside
    # [min_text_length, max_text_length] never matches this pattern.
    min_text_length: int = 0
    max_text_length: int = 100000

    # Small additive bonus (priority * 0.1) applied to a non-zero score.
    priority: int = 5

    def __post_init__(self) -> None:
        """Replace omitted (``None``) criteria with fresh empty containers."""
        self.html_tags = set() if self.html_tags is None else self.html_tags
        self.css_classes = set() if self.css_classes is None else self.css_classes
        self.css_ids = set() if self.css_ids is None else self.css_ids
        self.text_patterns = [] if self.text_patterns is None else self.text_patterns
|
|
|
|
|
|
# Layout patterns for heritage documents
|
|
# Default detection rules for heritage documents.
#
# The order of this list is significant: when two patterns produce the same
# score for an element, the one that appears first is kept.
#
# NOTE(review): the text regexes for BIB and TOC are unanchored, so any element
# whose text merely contains e.g. "contents" or "references" collects the
# text-pattern score. Confirm this is intended before tightening them.
LAYOUT_PATTERNS: List[LayoutPattern] = [
    # --- Primary text structure -------------------------------------------
    # Headings (h1-h6)
    LayoutPattern(
        region=LayoutRegion.HDR,
        semantic_role=SemanticRole.STRC,
        html_tags={"h1", "h2", "h3", "h4", "h5", "h6"},
        priority=10,
    ),
    # Paragraphs (very short <p> elements are ignored)
    LayoutPattern(
        region=LayoutRegion.PAR,
        semantic_role=SemanticRole.PRIM,
        html_tags={"p"},
        min_text_length=10,
        priority=5,
    ),
    # List containers
    LayoutPattern(
        region=LayoutRegion.LST,
        semantic_role=SemanticRole.PRIM,
        html_tags={"dl", "ol", "ul"},
        priority=7,
    ),
    # List items / definition terms and descriptions
    LayoutPattern(
        region=LayoutRegion.LIT,
        semantic_role=SemanticRole.PRIM,
        html_tags={"dd", "dt", "li"},
        priority=6,
    ),
    # Tables
    LayoutPattern(
        region=LayoutRegion.TBL,
        semantic_role=SemanticRole.PRIM,
        html_tags={"table"},
        priority=8,
    ),

    # --- Navigation ---------------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.NAV,
        semantic_role=SemanticRole.NAV,
        html_tags={"nav"},
        css_classes={"menu", "nav", "navbar", "navigation", "sidebar-nav"},
        priority=9,
    ),

    # --- Figures and captions -----------------------------------------------
    LayoutPattern(
        region=LayoutRegion.FIG,
        semantic_role=SemanticRole.VIS,
        html_tags={"figure", "img"},
        priority=7,
    ),
    LayoutPattern(
        region=LayoutRegion.CAP,
        semantic_role=SemanticRole.SUPP,
        html_tags={"caption", "figcaption"},
        priority=8,
    ),

    # --- Block quotes -------------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.BLK,
        semantic_role=SemanticRole.PRIM,
        html_tags={"blockquote", "q"},
        priority=7,
    ),

    # --- Sidebar content ----------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.SDB,
        semantic_role=SemanticRole.SUPP,
        html_tags={"aside"},
        css_classes={"aside", "callout", "infobox", "sidebar"},
        priority=8,
    ),

    # --- Footnotes ----------------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.FTN,
        semantic_role=SemanticRole.REF,
        css_classes={"endnote", "fn", "footnote", "footnotes"},
        css_ids={"endnotes", "footnotes"},
        priority=8,
    ),

    # --- Bibliography (English and Dutch markers) ---------------------------
    LayoutPattern(
        region=LayoutRegion.BIB,
        semantic_role=SemanticRole.BACK,
        css_classes={
            "bibliography",
            "bronnen",
            "literature",
            "literatuur",
            "references",
        },
        text_patterns=[r"(?i)bibliografie|bibliography|references|literatuur|bronnen"],
        priority=9,
    ),

    # --- Table of contents (English and Dutch markers) ----------------------
    LayoutPattern(
        region=LayoutRegion.TOC,
        semantic_role=SemanticRole.NAV,
        css_classes={"inhoud", "inhoudsopgave", "table-of-contents", "toc"},
        css_ids={"table-of-contents", "toc"},
        text_patterns=[r"(?i)inhoudsopgave|table of contents|contents"],
        priority=9,
    ),

    # --- Embedded content ---------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.EMB,
        semantic_role=SemanticRole.INT,
        html_tags={"embed", "iframe", "object"},
        priority=8,
    ),

    # --- Galleries / image collections --------------------------------------
    LayoutPattern(
        region=LayoutRegion.GAL,
        semantic_role=SemanticRole.VIS,
        css_classes={
            "carousel",
            "gallery",
            "image-gallery",
            "photo-gallery",
            "slider",
        },
        priority=8,
    ),

    # --- Maps ---------------------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.MAP,
        semantic_role=SemanticRole.SPAT,
        css_classes={"google-map", "kaart", "leaflet", "map", "mapbox"},
        priority=8,
    ),

    # --- Footer (often contains the colophon) -------------------------------
    LayoutPattern(
        region=LayoutRegion.COL,
        semantic_role=SemanticRole.BACK,
        html_tags={"footer"},
        css_classes={"colophon", "footer", "site-info"},
        priority=6,
    ),

    # --- Site header (not headings) -----------------------------------------
    LayoutPattern(
        region=LayoutRegion.TTP,
        semantic_role=SemanticRole.FRNT,
        html_tags={"header"},
        css_classes={"masthead", "page-header", "site-header"},
        priority=6,
    ),

    # --- Advertisements -----------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.ADV,
        semantic_role=SemanticRole.COMM,
        css_classes={
            "ad",
            "advertentie",
            "advertisement",
            "banner-ad",
            "sponsor",
        },
        priority=7,
    ),

    # --- Logos --------------------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.LOG,
        semantic_role=SemanticRole.COMM,
        css_classes={"brand", "logo", "site-logo"},
        priority=7,
    ),

    # --- Metadata blocks ----------------------------------------------------
    LayoutPattern(
        region=LayoutRegion.MTD,
        semantic_role=SemanticRole.META,
        css_classes={"entry-meta", "meta-info", "metadata", "post-meta"},
        priority=7,
    ),
]
|
|
|
|
|
|
class LayoutAnnotator(BaseAnnotator):
    """
    Layout annotator for heritage documents.

    Analyzes document structure using HTML semantics and CSS patterns
    (NO computer vision). Creates layout region claims following
    GLAM-NER v1.7.0 DOC hypernym taxonomy.

    Every element of an ``HTMLDocument`` is scored against each configured
    ``LayoutPattern``. The highest-scoring pattern supplies the region and
    semantic role of the resulting ``LayoutClaim``; on equal scores the
    pattern that comes first in ``self.patterns`` wins.
    """

    # Maximum number of characters of element text copied onto a claim.
    _MAX_CLAIM_TEXT = 500

    def __init__(
        self,
        patterns: Optional[List[LayoutPattern]] = None,
        include_all_elements: bool = False,
    ):
        """
        Initialize layout annotator.

        Args:
            patterns: Custom layout patterns (defaults to LAYOUT_PATTERNS;
                an empty list also falls back to the defaults)
            include_all_elements: Annotate all elements (not just matched ones)
        """
        super().__init__(
            agent_name="LayoutAnnotator",
            agent_version="1.0.0",
        )
        # Take a private copy: add_pattern() appends to this list, and it must
        # not leak new patterns into the module-level LAYOUT_PATTERNS (shared
        # by every annotator) or into the caller's own list.
        self.patterns: List[LayoutPattern] = list(patterns or LAYOUT_PATTERNS)
        self.include_all_elements = include_all_elements

        # Pre-compiled text regexes, keyed by id(pattern) because the
        # LayoutPattern dataclass is not hashable. The pattern objects stay
        # referenced from self.patterns, so their ids remain valid.
        self._compiled_text_patterns: Dict[int, List[re.Pattern]] = {}
        for pattern in self.patterns:
            self._compile_text_patterns(pattern)

    def _compile_text_patterns(self, pattern: LayoutPattern) -> None:
        """Compile and cache the text regexes of ``pattern`` (no-op if none)."""
        if pattern.text_patterns:
            self._compiled_text_patterns[id(pattern)] = [
                re.compile(source) for source in pattern.text_patterns
            ]

    @staticmethod
    def _attribute_text(value: Any) -> str:
        """
        Return an attribute value as a lowercase string.

        String values are simply lowercased. ``None`` becomes ``""`` and
        list-like values are joined with spaces, so a parser that exposes
        multi-valued attributes as sequences does not cause an
        ``AttributeError``.
        """
        if value is None:
            return ""
        if isinstance(value, (list, tuple, set, frozenset)):
            return " ".join(str(item) for item in value).lower()
        return str(value).lower()

    def annotate(
        self,
        document: Any,
        session: Optional[AnnotationSession] = None,
    ) -> AnnotationSession:
        """
        Annotate layout regions in a document.

        Args:
            document: HTMLDocument to annotate
            session: Existing session to add claims to

        Returns:
            AnnotationSession with layout claims

        Raises:
            ValueError: If ``document`` is not an HTMLDocument
        """
        if not isinstance(document, HTMLDocument):
            raise ValueError("LayoutAnnotator requires HTMLDocument")

        if session is None:
            session = self.create_session(
                source_url=document.source_url,
                source_file=document.source_file,
            )

        # Claims created so far, keyed by element xpath, used to wire up
        # parent/child relationships.
        claim_map: Dict[str, LayoutClaim] = {}

        for element in document.elements:
            claim = self._classify_element(element, document)
            if not claim:
                continue

            # A parent link is only made when the parent element already
            # produced a claim earlier in this iteration.
            parent_claim = (
                claim_map.get(element.parent_xpath)
                if element.parent_xpath
                else None
            )
            if parent_claim is not None:
                claim.parent_claim_id = parent_claim.claim_id
                parent_claim.child_claim_ids.append(claim.claim_id)

            claim_map[element.xpath] = claim
            session.add_layout_claim(claim)

        return session

    def _classify_element(
        self,
        element: HTMLElement,
        document: HTMLDocument,
    ) -> Optional[LayoutClaim]:
        """
        Classify an HTML element as a layout region.

        Args:
            element: HTML element to classify
            document: Parent document

        Returns:
            LayoutClaim if classified, None otherwise
        """
        best_match: Optional[Tuple[LayoutPattern, float]] = None

        for pattern in self.patterns:
            score = self._match_pattern(element, pattern)
            # Strict comparison: the first pattern reaching a score keeps it.
            if score > 0 and (best_match is None or score > best_match[1]):
                best_match = (pattern, score)

        if best_match is None:
            if self.include_all_elements:
                # Fall back to a low-confidence default for block elements.
                return self._create_default_claim(element, document)
            return None

        pattern, confidence = best_match

        provenance = self.create_provenance(
            namespace="glam-doc",
            path=element.xpath,
            confidence=min(confidence / 10, 1.0),  # Normalize to 0-1
            source_url=document.source_url,
            source_file=document.source_file,
        )

        claim = LayoutClaim(
            region=pattern.region,
            semantic_role=pattern.semantic_role,
            xpath=element.xpath,
            text_content=(
                element.text_content[:self._MAX_CLAIM_TEXT]
                if element.text_content
                else None
            ),
            start_offset=element.start_offset,
            end_offset=element.end_offset,
            provenance=provenance,
        )

        # Add heading level if applicable
        if pattern.region == LayoutRegion.HDR and element.heading_level:
            claim.heading_level = element.heading_level

        return claim

    def _match_pattern(
        self,
        element: HTMLElement,
        pattern: LayoutPattern,
    ) -> float:
        """
        Calculate match score between element and pattern.

        Scoring: +5 for a tag match, +3 for a CSS class match, +3 for an id
        match, +2 if any text regex matches, plus ``priority * 0.1`` when at
        least one criterion matched.

        Returns score >= 0 (0 = no match, higher = better match)
        """
        text = element.text_content or ""

        # Text-length constraints are hard filters, so check them first and
        # skip the regex searches entirely when they fail.
        text_len = len(text)
        if text_len < pattern.min_text_length or text_len > pattern.max_text_length:
            return 0.0

        score = 0.0

        # HTML tag
        if pattern.html_tags and element.tag.lower() in pattern.html_tags:
            score += 5.0

        # CSS classes (any shared token counts)
        elem_classes = set(
            self._attribute_text(element.attributes.get('class')).split()
        )
        if pattern.css_classes and elem_classes & pattern.css_classes:
            score += 3.0

        # Element id
        elem_id = self._attribute_text(element.attributes.get('id'))
        if pattern.css_ids and elem_id in pattern.css_ids:
            score += 3.0

        # Text regexes (a single hit is enough)
        regexes = self._compiled_text_patterns.get(id(pattern), ())
        if text and any(regex.search(text) for regex in regexes):
            score += 2.0

        # Priority is an additive bonus that separates otherwise close
        # scores; it never turns a non-match into a match.
        if score > 0:
            score += pattern.priority * 0.1

        return score

    def _create_default_claim(
        self,
        element: HTMLElement,
        document: HTMLDocument,
    ) -> Optional[LayoutClaim]:
        """
        Create default claim for unmatched elements.

        Non-block elements yield ``None``. Headings become HDR/STRC claims and
        every other block element becomes a PAR/PRIM claim, both with a fixed
        low confidence of 0.3.
        """
        if not element.is_block:
            return None

        provenance = self.create_provenance(
            namespace="glam-doc",
            path=element.xpath,
            confidence=0.3,  # Low confidence for defaults
            source_url=document.source_url,
            source_file=document.source_file,
        )

        if element.is_heading:
            region = LayoutRegion.HDR
            semantic_role = SemanticRole.STRC
        else:
            region = LayoutRegion.PAR
            semantic_role = SemanticRole.PRIM

        return LayoutClaim(
            region=region,
            semantic_role=semantic_role,
            xpath=element.xpath,
            text_content=(
                element.text_content[:self._MAX_CLAIM_TEXT]
                if element.text_content
                else None
            ),
            start_offset=element.start_offset,
            end_offset=element.end_offset,
            heading_level=element.heading_level,
            provenance=provenance,
        )

    def add_pattern(self, pattern: LayoutPattern) -> None:
        """
        Add a new layout pattern to this annotator only.

        The pattern is appended after the existing ones, so it loses ties
        against earlier patterns with the same score.
        """
        self.patterns.append(pattern)
        self._compile_text_patterns(pattern)
|
|
|
|
|
|
def create_heritage_layout_annotator() -> LayoutAnnotator:
    """
    Build a layout annotator preconfigured for heritage documents.

    The annotator uses the module-level ``LAYOUT_PATTERNS`` and only emits
    claims for elements that match a pattern. Those patterns cover:
    - Standard document structure (headings, paragraphs, lists, tables)
    - Heritage-specific regions (galleries, maps, bibliographies)
    - Dutch language patterns (inhoudsopgave, literatuur, etc.)

    Returns:
        A new ``LayoutAnnotator`` instance.
    """
    annotator = LayoutAnnotator(
        include_all_elements=False,
        patterns=LAYOUT_PATTERNS,
    )
    return annotator
|