"""
HTML Parser for web archive documents.

Parses HTML files from web archives and creates structured document
representations suitable for annotation.

Supports:
- WARC archives
- HTML mirror directories
- Rendered HTML from Playwright
"""

import hashlib
import logging
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from lxml import html, etree
from lxml.html import HtmlElement

logger = logging.getLogger(__name__)

# Extracts the ``charset`` parameter from a Content-Type header value,
# tolerating optional quotes and whitespace around the ``=``.
_CHARSET_RE = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)', re.IGNORECASE)


def _decode_payload(payload: bytes, content_type: str) -> Tuple[str, str]:
    """
    Decode an HTTP payload using the charset declared in its Content-Type.

    Args:
        payload: Raw response body
        content_type: Value of the Content-Type header (may lack a charset)

    Returns:
        Tuple of (decoded text, name of the encoding actually used).
        Falls back to UTF-8 when no charset is declared or the declared
        one is not a known text encoding. Undecodable bytes are replaced.
    """
    match = _CHARSET_RE.search(content_type)
    if match:
        charset = match.group(1)
        try:
            return payload.decode(charset, errors='replace'), charset.lower()
        except (LookupError, ValueError):
            # Unknown / non-text codec name in the header: fall back below.
            pass
    return payload.decode('utf-8', errors='replace'), 'utf-8'


@dataclass
class HTMLElement:
    """
    Represents an HTML element with XPath and content.
    """
    tag: str
    xpath: str
    text_content: str  # The element's own leading text (lxml ``.text``)
    tail_text: str = ""  # Text following the element's end tag (lxml ``.tail``)
    attributes: Dict[str, str] = field(default_factory=dict)

    # Position of ``text_content`` in the document's full text.
    # Both stay None when the element has no leading text.
    start_offset: Optional[int] = None
    end_offset: Optional[int] = None

    # Nesting
    parent_xpath: Optional[str] = None
    children_xpaths: List[str] = field(default_factory=list)

    # Layout hints
    is_block: bool = True
    is_heading: bool = False
    heading_level: Optional[int] = None

    @property
    def full_text(self) -> str:
        """Get text content including tail."""
        return self.text_content + self.tail_text


@dataclass
class HTMLDocument:
    """
    Parsed HTML document ready for annotation.
    """
    # Source information
    source_url: Optional[str] = None
    source_file: Optional[str] = None
    retrieved_at: Optional[str] = None

    # Document metadata
    title: str = ""
    language: Optional[str] = None
    encoding: str = "utf-8"

    # Content hash for deduplication
    content_hash: str = ""

    # Raw content
    raw_html: str = ""

    # Parsed structure
    elements: List[HTMLElement] = field(default_factory=list)

    # Full text (for offset calculations)
    full_text: str = ""

    # XPath to element mapping
    xpath_map: Dict[str, HTMLElement] = field(default_factory=dict)

    def get_element_by_xpath(self, xpath: str) -> Optional[HTMLElement]:
        """Get element by XPath, or None when the XPath is unknown."""
        return self.xpath_map.get(xpath)

    def get_text_at_offset(self, start: int, end: int) -> str:
        """Get text slice ``full_text[start:end]`` by character offsets."""
        return self.full_text[start:end]


class HTMLParser:
    """
    Parser for HTML documents from web archives.

    Creates HTMLDocument objects suitable for annotation with
    XPath-based provenance tracking.
    """

    # Block-level elements
    BLOCK_ELEMENTS = {
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'ul', 'ol', 'li', 'table', 'tr', 'td', 'th',
        'blockquote', 'pre', 'article', 'section', 'aside',
        'header', 'footer', 'nav', 'main', 'figure', 'figcaption',
        'address', 'form', 'fieldset', 'legend', 'dl', 'dt', 'dd',
    }

    # Heading elements
    HEADING_ELEMENTS = {'h1': 1, 'h2': 2, 'h3': 3, 'h4': 4, 'h5': 5, 'h6': 6}

    # Elements to skip (no semantic content)
    SKIP_ELEMENTS = {'script', 'style', 'noscript', 'svg', 'path', 'meta', 'link'}

    # Inline-style declarations that hide an element. Whitespace around the
    # colon is allowed so that "display: none" is caught as well.
    _HIDDEN_STYLE_RE = re.compile(
        r'(?:display\s*:\s*none|visibility\s*:\s*hidden)',
        re.IGNORECASE,
    )

    def __init__(self, include_invisible: bool = False):
        """
        Initialize HTML parser.

        Args:
            include_invisible: Include hidden/invisible elements
        """
        self.include_invisible = include_invisible

    def parse_file(self, file_path: str | Path) -> HTMLDocument:
        """
        Parse an HTML file.

        The file is read as UTF-8; undecodable bytes are replaced.

        Args:
            file_path: Path to HTML file

        Returns:
            Parsed HTMLDocument with ``source_file`` set
        """
        file_path = Path(file_path)
        html_content = file_path.read_text(encoding='utf-8', errors='replace')

        doc = self.parse_string(html_content)
        doc.source_file = str(file_path)
        return doc

    def parse_string(self, html_content: str) -> HTMLDocument:
        """
        Parse HTML from string.

        Args:
            html_content: HTML content as string

        Returns:
            Parsed HTMLDocument

        Raises:
            lxml.etree.ParserError: If the content cannot be parsed at all
                (for example, an empty document).
        """
        encoded = html_content.encode('utf-8')

        doc = HTMLDocument()
        doc.raw_html = html_content
        doc.content_hash = hashlib.sha256(encoded).hexdigest()

        # Parse with lxml
        try:
            tree = html.fromstring(html_content)
        except (etree.LxmlError, ValueError):
            # lxml refuses str input that carries an XML encoding declaration
            # (ValueError). Re-parsing the same str would fail identically, so
            # retry on the UTF-8 bytes with the encoding forced to match.
            utf8_parser = html.HTMLParser(encoding='utf-8')
            tree = html.document_fromstring(encoded, parser=utf8_parser)

        # Extract metadata
        doc.title = self._extract_title(tree)
        doc.language = self._extract_language(tree)

        # Build element list with XPaths
        doc.elements, doc.full_text = self._extract_elements(tree)

        # Build XPath map
        doc.xpath_map = {elem.xpath: elem for elem in doc.elements}

        return doc

    def _extract_title(self, tree: HtmlElement) -> str:
        """
        Extract document title.

        Prefers ``<head><title>``, then any ``<title>``, then the first
        non-empty ``<h1>``. Returns "" when none of them carries text.
        """
        # '//head/title' first so that a <title> inside inline SVG does not
        # shadow the real document title.
        for query in ('//head/title', '//title', '//h1'):
            for node in tree.xpath(query):
                # text_content() also picks up text nested in child tags,
                # e.g. <h1><a>Title</a></h1>.
                text = node.text_content().strip()
                if text:
                    return text
        return ""

    def _extract_language(self, tree: HtmlElement) -> Optional[str]:
        """
        Extract document language.

        Checks the ``lang`` attribute of the parsed node and of the document
        root, then a ``<meta http-equiv="content-language">`` tag (matched
        case-insensitively). Returns None when no language is declared.
        """
        # html.fromstring() may hand back a fragment rather than <html>, so
        # also look at the real document root.
        candidates = [tree]
        doc_root = tree.getroottree().getroot()
        if doc_root is not None and doc_root is not tree:
            candidates.append(doc_root)

        for node in candidates:
            lang = (node.get('lang') or '').strip()
            if lang:
                return lang

        # XPath string comparison is case-sensitive, but pages commonly write
        # "Content-Language"; compare in Python instead.
        for meta in tree.xpath('//meta[@http-equiv and @content]'):
            if meta.get('http-equiv', '').strip().lower() == 'content-language':
                value = meta.get('content', '').strip()
                if value:
                    return value

        return None

    def _is_hidden(self, elem: HtmlElement) -> bool:
        """Return True if the element is hidden via inline style or ``hidden``."""
        if elem.get('hidden') is not None:
            return True
        style = elem.get('style', '')
        return bool(style and self._HIDDEN_STYLE_RE.search(style))

    def _is_recordable(self, elem: HtmlElement) -> bool:
        """Return True if the node should become an HTMLElement."""
        tag = elem.tag
        # Comments and processing instructions have a non-string tag.
        if not isinstance(tag, str):
            return False
        if tag.lower() in self.SKIP_ELEMENTS:
            return False
        if not self.include_invisible and self._is_hidden(elem):
            return False
        return True

    def _extract_elements(
        self,
        tree: HtmlElement,
    ) -> Tuple[List[HTMLElement], str]:
        """
        Extract elements with XPaths and text content.

        Walks the tree depth-first starting at ``<body>`` (or at ``tree``
        when there is no body). Skipped nodes (script/style/..., comments,
        hidden elements) contribute neither an HTMLElement nor their inner
        text, but the text that *follows* them still belongs to the visible
        parent and is kept in the full text.

        Returns:
            Tuple of (element list, full text)
        """
        elements: List[HTMLElement] = []
        text_parts: List[str] = []
        current_offset = 0
        root_tree = tree.getroottree()  # Loop-invariant; used for getpath()

        def process_element(
            elem: HtmlElement,
            parent_xpath: Optional[str] = None,
        ) -> Optional[str]:
            """Record ``elem`` and its subtree; return its XPath if recorded."""
            nonlocal current_offset

            recorded_xpath: Optional[str] = None

            if self._is_recordable(elem):
                tag = elem.tag
                tag_key = tag.lower()
                xpath = root_tree.getpath(elem)
                text = elem.text or ""

                html_elem = HTMLElement(
                    tag=tag,
                    xpath=xpath,
                    text_content=text,
                    tail_text=elem.tail or "",
                    attributes=dict(elem.attrib),
                    parent_xpath=parent_xpath,
                    is_block=tag_key in self.BLOCK_ELEMENTS,
                    is_heading=tag_key in self.HEADING_ELEMENTS,
                    heading_level=self.HEADING_ELEMENTS.get(tag_key),
                )

                # Offsets are only assigned when the element has leading text.
                if text:
                    html_elem.start_offset = current_offset
                    text_parts.append(text)
                    current_offset += len(text)
                    html_elem.end_offset = current_offset

                # Append before descending so parents precede their children.
                elements.append(html_elem)

                # Only children that were actually recorded are listed, so
                # every entry resolves through the document's xpath_map.
                for child in elem:
                    child_xpath = process_element(child, xpath)
                    if child_xpath is not None:
                        html_elem.children_xpaths.append(child_xpath)

                recorded_xpath = xpath

            # Tail text sits after the node's end tag, i.e. inside the parent,
            # so it is tracked even when the node itself was skipped.
            tail = elem.tail or ""
            if tail:
                text_parts.append(tail)
                current_offset += len(tail)

            return recorded_xpath

        # Start processing from body or root
        body = tree.xpath('//body')
        process_element(body[0] if body else tree)

        return elements, ''.join(text_parts)

    def parse_warc(self, warc_path: str | Path) -> List[HTMLDocument]:
        """
        Parse HTML documents from a WARC archive.

        Only ``response`` records with an HTTP ``text/html`` Content-Type are
        parsed. Records with an empty body (e.g. redirects) are skipped, and
        a record that fails to parse is logged and skipped instead of
        aborting the whole archive.

        Args:
            warc_path: Path to WARC file

        Returns:
            List of parsed HTMLDocument objects

        Raises:
            ImportError: If ``warcio`` is not installed
        """
        # Import warcio only when needed
        try:
            from warcio.archiveiterator import ArchiveIterator
        except ImportError as exc:
            raise ImportError(
                "warcio required for WARC parsing: pip install warcio"
            ) from exc

        warc_path = Path(warc_path)
        documents: List[HTMLDocument] = []

        with open(warc_path, 'rb') as f:
            for record in ArchiveIterator(f):
                if record.rec_type != 'response':
                    continue

                # Non-HTTP responses (e.g. dns:) carry no HTTP headers.
                http_headers = record.http_headers
                if http_headers is None:
                    continue

                content_type = http_headers.get_header('Content-Type', '') or ''
                if 'text/html' not in content_type.lower():
                    continue

                url = record.rec_headers.get_header('WARC-Target-URI')
                payload = record.content_stream().read()
                content, encoding = _decode_payload(payload, content_type)

                # Redirects and similar responses often have an empty body.
                if not content.strip():
                    continue

                try:
                    doc = self.parse_string(content)
                except Exception as exc:
                    # Best effort: one bad record must not lose the archive.
                    logger.warning(
                        "Error parsing WARC record %s in %s: %s",
                        url, warc_path, exc,
                    )
                    continue

                doc.source_url = url
                doc.source_file = str(warc_path)
                doc.encoding = encoding

                # Extract date from WARC header
                warc_date = record.rec_headers.get_header('WARC-Date')
                if warc_date:
                    doc.retrieved_at = warc_date

                documents.append(doc)

        return documents

    def parse_mirror_directory(
        self,
        mirror_path: str | Path,
        patterns: Tuple[str, ...] = ('*.html',),
    ) -> List[HTMLDocument]:
        """
        Parse all HTML files in a mirror directory.

        Files are processed in sorted path order. A file that cannot be
        read or parsed is logged and skipped.

        Args:
            mirror_path: Path to mirror directory
            patterns: Glob patterns (searched recursively) selecting the
                files to parse, e.g. ``('*.html', '*.htm')``

        Returns:
            List of parsed HTMLDocument objects
        """
        mirror_path = Path(mirror_path)
        documents: List[HTMLDocument] = []

        # A set removes files matched by more than one pattern; sorting makes
        # the output order independent of the filesystem.
        candidates = set()
        for pattern in patterns:
            candidates.update(mirror_path.rglob(pattern))

        for html_file in sorted(candidates):
            # rglob also matches directories whose name fits the pattern.
            if not html_file.is_file():
                continue
            try:
                doc = self.parse_file(html_file)
            except Exception as e:
                # Best effort: keep going with the remaining files.
                logger.warning("Error parsing %s: %s", html_file, e)
                continue

            # Infer URL from directory structure; as_posix() keeps forward
            # slashes in the URL on every platform.
            relative_path = html_file.relative_to(mirror_path)
            doc.source_url = f"https://{relative_path.as_posix()}"
            documents.append(doc)

        return documents


def get_xpath_for_text(
    document: HTMLDocument,
    search_text: str,
    fuzzy: bool = False,
) -> List[Tuple[str, int, int]]:
    """
    Find XPath(s) containing specific text.

    Only each element's own leading text (``text_content``) is searched.
    Every non-overlapping occurrence is reported, in document order.

    Args:
        document: Parsed HTML document
        search_text: Text to search for; an empty string yields no results
        fuzzy: When True, match case-insensitively and treat any run of
            whitespace in ``search_text`` as matching any run of whitespace
            in the document. When False, match the exact string.

    Returns:
        List of (xpath, start_offset, end_offset) tuples. Offsets are
        document-level when the element has a known ``start_offset``,
        otherwise relative to the element's text.
    """
    if not search_text:
        return []

    if fuzzy:
        tokens = search_text.split()
        if not tokens:
            return []
        pattern = re.compile(
            r'\s+'.join(re.escape(token) for token in tokens),
            re.IGNORECASE,
        )
    else:
        pattern = re.compile(re.escape(search_text))

    results: List[Tuple[str, int, int]] = []

    for elem in document.elements:
        base = elem.start_offset if elem.start_offset is not None else 0
        for match in pattern.finditer(elem.text_content):
            results.append((elem.xpath, base + match.start(), base + match.end()))

    return results