#!/usr/bin/env python3
"""
Extract website layout features using Docling.

This script processes archived websites and extracts structural information
that can be used to establish basic web claims with high confidence:

BASIC FEATURES (high confidence, pattern-based):
- page_title: HTML content
- page_count: Number of HTML pages archived
- h1_headers: All H1-level headers found
- nav_items: Navigation menu items
- image_count: Total images on main page
- has_footer: Whether site has detectable footer
- has_contact_section: Whether contact info section exists
- language: Detected language from content

These features establish a FOUNDATION for claims before doing any NLP
extraction.
"""

import argparse
import json
import logging
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import yaml

# Docling imports
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class WebsiteLayoutExtractor:
    """Extract layout features from archived websites using Docling."""

    def __init__(self, web_archive_dir: Path):
        """Create an extractor rooted at *web_archive_dir*.

        Args:
            web_archive_dir: Directory containing one subdirectory per
                archived entry.
        """
        self.web_archive_dir = web_archive_dir
        self.converter = DocumentConverter()

    def find_entry_archives(self) -> list[tuple[str, Path]]:
        """Find all entry archives with HTML files.

        Returns:
            Sorted list of ``(entry_id, archive_dir)`` pairs. Directories
            whose name starts with ``_`` or that contain no ``*.html``
            files are skipped.
        """
        archives: list[tuple[str, Path]] = []
        for entry_dir in sorted(self.web_archive_dir.iterdir()):
            if not entry_dir.is_dir() or entry_dir.name.startswith('_'):
                continue
            # Find HTML files in this archive
            html_files = list(entry_dir.rglob('*.html'))
            if html_files:
                archives.append((entry_dir.name, entry_dir))
        return archives

    def find_main_page(self, archive_dir: Path) -> Path | None:
        """Find the main index.html page in an archive.

        Tries common landing-page names first; among multiple matches the
        shallowest path wins (the root index is preferred over copies
        nested inside crawled subpages).

        Returns:
            Path to the most likely main page, or ``None`` if the archive
            contains no HTML at all.
        """
        # Look for index.html at various levels
        for pattern in ['**/index.html', '**/home.html', '**/default.html']:
            matches = list(archive_dir.glob(pattern))
            if matches:
                # Prefer shortest path (root index)
                return min(matches, key=lambda p: len(p.parts))
        # Fallback: any HTML file
        html_files = list(archive_dir.rglob('*.html'))
        if html_files:
            return min(html_files, key=lambda p: len(p.parts))
        return None

    def count_pages(self, archive_dir: Path) -> int:
        """Count unique HTML pages in archive."""
        return len(list(archive_dir.rglob('*.html')))

    def extract_layout_features(self, html_path: Path) -> dict[str, Any]:
        """Extract layout features from a single HTML page using Docling.

        Returns a dict that always contains ``success`` / ``error`` keys;
        all other keys are only present when conversion succeeded. Any
        exception from Docling is caught and recorded rather than raised.
        """
        features: dict[str, Any] = {
            'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
            'html_file': str(html_path),
            'success': False,
            'error': None,
        }
        try:
            result = self.converter.convert(str(html_path))
            doc = result.document
            features['success'] = True

            # 1. Page title
            features['page_title'] = None
            if hasattr(doc, 'texts'):
                for txt in doc.texts:
                    if hasattr(txt, 'label') and txt.label == 'title':
                        features['page_title'] = txt.text.strip()
                        break

            # 2. All headers by level
            features['headers'] = {
                'h1': [],
                'h2': [],
                'h3': [],
                'section_headers': [],
            }
            if hasattr(doc, 'texts'):
                for txt in doc.texts:
                    if hasattr(txt, 'label') and txt.label == 'section_header':
                        text = txt.text.strip()
                        if text:
                            features['headers']['section_headers'].append(text)
            # Docling doesn't preserve heading level from HTML
            # We'll analyze the first few as likely H1/H2
            # First section header is often the main H1
            if features['headers']['section_headers']:
                features['headers']['h1'] = [features['headers']['section_headers'][0]]
                if len(features['headers']['section_headers']) > 1:
                    features['headers']['h2'] = features['headers']['section_headers'][1:10]

            # 3. Text statistics
            features['text_stats'] = {
                'total_text_elements': 0,
                'total_list_items': 0,
                'total_paragraphs': 0,
                'total_captions': 0,
            }
            if hasattr(doc, 'texts'):
                for txt in doc.texts:
                    label = getattr(txt, 'label', 'unknown')
                    if label == 'text':
                        features['text_stats']['total_paragraphs'] += 1
                    elif label == 'list_item':
                        features['text_stats']['total_list_items'] += 1
                    elif label == 'caption':
                        features['text_stats']['total_captions'] += 1
                    features['text_stats']['total_text_elements'] += 1

            # 4. Image count
            features['image_count'] = len(doc.pictures) if hasattr(doc, 'pictures') else 0

            # 5. Table count
            features['table_count'] = len(doc.tables) if hasattr(doc, 'tables') else 0

            # 6. Navigation items (look for common patterns in section headers)
            # Patterns cover both Dutch and English menu labels.
            nav_patterns = [
                'home', 'contact', 'about', 'over ons', 'nieuws', 'news',
                'collectie', 'collection', 'bezoek', 'visit', 'agenda',
                'tickets', 'shop', 'winkel', 'educatie', 'education',
            ]
            features['nav_items'] = []
            for header in features['headers']['section_headers'][:20]:
                header_lower = header.lower()
                if any(pat in header_lower for pat in nav_patterns):
                    features['nav_items'].append(header)

            # 7. Has footer indicators
            footer_patterns = ['footer', 'copyright', '©', 'contact', 'adres', 'address']
            features['has_footer_indicators'] = False
            if hasattr(doc, 'texts'):
                for txt in doc.texts[-20:]:  # Check last 20 text elements
                    text_lower = txt.text.lower()
                    if any(pat in text_lower for pat in footer_patterns):
                        features['has_footer_indicators'] = True
                        break

            # 8. Has contact section
            contact_patterns = ['email', 'e-mail', 'telefoon', 'phone', 'tel:', '@', 'contact']
            features['has_contact_indicators'] = False
            if hasattr(doc, 'texts'):
                for txt in doc.texts:
                    text_lower = txt.text.lower()
                    if any(pat in text_lower for pat in contact_patterns):
                        features['has_contact_indicators'] = True
                        break

            # 9. Markdown export for full text analysis
            features['markdown_length'] = len(doc.export_to_markdown())

            # 10. Origin metadata
            if hasattr(doc, 'origin'):
                features['origin'] = {
                    'mimetype': doc.origin.mimetype if hasattr(doc.origin, 'mimetype') else None,
                    'filename': doc.origin.filename if hasattr(doc.origin, 'filename') else None,
                }

        except Exception as e:
            features['success'] = False
            features['error'] = str(e)
            logger.error(f"Error processing {html_path}: {e}")

        return features

    def process_archive(self, entry_id: str, archive_dir: Path) -> dict[str, Any]:
        """Process a complete website archive.

        Runs full feature extraction on the main page and records basic
        path/size info for every HTML file found.
        """
        result: dict[str, Any] = {
            'entry_id': entry_id,
            'archive_path': str(archive_dir),
            'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
            'page_count': self.count_pages(archive_dir),
            'main_page': None,
            'all_pages': [],
        }

        # Find and process main page
        main_page = self.find_main_page(archive_dir)
        if main_page:
            result['main_page'] = self.extract_layout_features(main_page)

        # List all pages with basic info
        for html_file in archive_dir.rglob('*.html'):
            rel_path = html_file.relative_to(archive_dir)
            result['all_pages'].append({
                'path': str(rel_path),
                'size_bytes': html_file.stat().st_size,
            })

        return result

    def generate_basic_claims(self, layout_features: dict) -> list[dict]:
        """Generate basic claims from layout features.

        Returns an empty list when the main page is missing or its
        extraction failed; otherwise one claim dict per detectable
        feature, each carrying a confidence score.
        """
        claims: list[dict] = []
        main_page = layout_features.get('main_page', {})
        if not main_page or not main_page.get('success'):
            return claims

        # Claim: page_title
        if main_page.get('page_title'):
            claims.append({
                'claim_type': 'page_title',
                'claim_value': main_page['page_title'],
                'extraction_method': 'docling_layout',
                'confidence': 1.0,  # Direct extraction, high confidence
            })

        # Claim: main_h1 (likely organization name)
        h1_headers = main_page.get('headers', {}).get('h1', [])
        if h1_headers:
            claims.append({
                'claim_type': 'main_h1',
                'claim_value': h1_headers[0],
                'extraction_method': 'docling_layout',
                'confidence': 0.9,  # First section header, usually H1
            })

        # Claim: page_count
        claims.append({
            'claim_type': 'page_count',
            'claim_value': str(layout_features.get('page_count', 0)),
            'extraction_method': 'docling_layout',
            'confidence': 1.0,
        })

        # Claim: image_count
        if main_page.get('image_count', 0) > 0:
            claims.append({
                'claim_type': 'image_count',
                'claim_value': str(main_page['image_count']),
                'extraction_method': 'docling_layout',
                'confidence': 1.0,
            })

        # Claim: has_contact_section
        if main_page.get('has_contact_indicators'):
            claims.append({
                'claim_type': 'has_contact_section',
                'claim_value': 'true',
                'extraction_method': 'docling_layout',
                'confidence': 0.8,  # Pattern-based detection
            })

        # Claim: nav_items (as JSON list)
        nav_items = main_page.get('nav_items', [])
        if nav_items:
            claims.append({
                'claim_type': 'nav_items',
                'claim_value': json.dumps(nav_items),
                'extraction_method': 'docling_layout',
                'confidence': 0.85,
            })

        return claims


def main():
    """CLI entry point: extract layout features for selected archives and
    write a YAML report."""
    parser = argparse.ArgumentParser(description='Extract website layout features using Docling')
    parser.add_argument('--web-archive-dir', type=Path,
                        default=Path('data/nde/enriched/entries/web'),
                        help='Path to web archive directory')
    parser.add_argument('--output', type=Path,
                        default=Path('data/nde/layout_features.yaml'),
                        help='Output file for layout features')
    parser.add_argument('--entry', type=str,
                        help='Process single entry ID (e.g., 0001)')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of entries to process')
    parser.add_argument('--sample', action='store_true',
                        help='Just sample 10 entries for testing')
    args = parser.parse_args()

    if not args.web_archive_dir.exists():
        logger.error(f"Web archive directory not found: {args.web_archive_dir}")
        sys.exit(1)

    extractor = WebsiteLayoutExtractor(args.web_archive_dir)

    # Find archives to process
    archives = extractor.find_entry_archives()
    logger.info(f"Found {len(archives)} entry archives")

    if args.entry:
        archives = [(e, p) for e, p in archives if e == args.entry]
        if not archives:
            logger.error(f"Entry {args.entry} not found")
            sys.exit(1)

    if args.sample:
        archives = archives[:10]
    # FIX: compare against None so an explicit `--limit 0` is honored
    # instead of being treated (falsy int) as "no limit".
    elif args.limit is not None:
        archives = archives[:args.limit]

    # Process archives
    results: dict[str, Any] = {
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'total_archives': len(archives),
        'entries': {},
        'summary': {
            'successful': 0,
            'failed': 0,
            'total_pages': 0,
            'total_images': 0,
        }
    }

    for entry_id, archive_dir in archives:
        logger.info(f"Processing {entry_id}...")
        try:
            layout = extractor.process_archive(entry_id, archive_dir)
            claims = extractor.generate_basic_claims(layout)
            results['entries'][entry_id] = {
                'layout': layout,
                'basic_claims': claims,
            }
            results['summary']['successful'] += 1
            results['summary']['total_pages'] += layout.get('page_count', 0)
            # FIX: 'main_page' is stored as None when no page was found, so
            # dict.get('main_page', {}) would return None (the key exists)
            # and .get('success') would raise AttributeError. Coalesce to {}.
            main_page = layout.get('main_page') or {}
            if main_page.get('success'):
                results['summary']['total_images'] += main_page.get('image_count', 0)
        except Exception as e:
            logger.error(f"Failed to process {entry_id}: {e}")
            results['entries'][entry_id] = {'error': str(e)}
            results['summary']['failed'] += 1

    # Save results (ensure the output directory exists first)
    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, 'w', encoding='utf-8') as f:
        yaml.dump(results, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    logger.info(f"\nResults saved to: {args.output}")
    logger.info("Summary:")
    logger.info(f"  - Successful: {results['summary']['successful']}")
    logger.info(f"  - Failed: {results['summary']['failed']}")
    logger.info(f"  - Total pages: {results['summary']['total_pages']}")
    logger.info(f"  - Total images: {results['summary']['total_images']}")


if __name__ == '__main__':
    main()