#!/usr/bin/env python3
"""
Extract website layout features using Docling.

This script processes archived websites and extracts structural information
that can be used to establish basic web claims with high confidence:

BASIC FEATURES (high confidence, pattern-based):
- page_title: HTML <title> content
- page_count: Number of HTML pages archived
- h1_headers: All H1-level headers found
- nav_items: Navigation menu items
- image_count: Total images on main page
- has_footer: Whether site has detectable footer
- has_contact_section: Whether contact info section exists
- language: Detected language from content

These features establish a FOUNDATION for claims before doing any NLP extraction.
"""
import argparse
import json
import logging
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import yaml

# Docling imports
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat

# Module-wide logging: timestamped, INFO-level messages.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class WebsiteLayoutExtractor:
|
|
"""Extract layout features from archived websites using Docling."""
|
|
|
|
def __init__(self, web_archive_dir: Path):
|
|
self.web_archive_dir = web_archive_dir
|
|
self.converter = DocumentConverter()
|
|
|
|
def find_entry_archives(self) -> list[tuple[str, Path]]:
|
|
"""Find all entry archives with HTML files."""
|
|
archives = []
|
|
|
|
for entry_dir in sorted(self.web_archive_dir.iterdir()):
|
|
if not entry_dir.is_dir() or entry_dir.name.startswith('_'):
|
|
continue
|
|
|
|
# Find HTML files in this archive
|
|
html_files = list(entry_dir.rglob('*.html'))
|
|
if html_files:
|
|
archives.append((entry_dir.name, entry_dir))
|
|
|
|
return archives
|
|
|
|
def find_main_page(self, archive_dir: Path) -> Path | None:
|
|
"""Find the main index.html page in an archive."""
|
|
# Look for index.html at various levels
|
|
for pattern in ['**/index.html', '**/home.html', '**/default.html']:
|
|
matches = list(archive_dir.glob(pattern))
|
|
if matches:
|
|
# Prefer shortest path (root index)
|
|
return min(matches, key=lambda p: len(p.parts))
|
|
|
|
# Fallback: any HTML file
|
|
html_files = list(archive_dir.rglob('*.html'))
|
|
if html_files:
|
|
return min(html_files, key=lambda p: len(p.parts))
|
|
|
|
return None
|
|
|
|
def count_pages(self, archive_dir: Path) -> int:
|
|
"""Count unique HTML pages in archive."""
|
|
return len(list(archive_dir.rglob('*.html')))
|
|
|
|
def extract_layout_features(self, html_path: Path) -> dict[str, Any]:
|
|
"""Extract layout features from a single HTML page using Docling."""
|
|
features = {
|
|
'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
'html_file': str(html_path),
|
|
'success': False,
|
|
'error': None,
|
|
}
|
|
|
|
try:
|
|
result = self.converter.convert(str(html_path))
|
|
doc = result.document
|
|
|
|
features['success'] = True
|
|
|
|
# 1. Page title
|
|
features['page_title'] = None
|
|
if hasattr(doc, 'texts'):
|
|
for txt in doc.texts:
|
|
if hasattr(txt, 'label') and txt.label == 'title':
|
|
features['page_title'] = txt.text.strip()
|
|
break
|
|
|
|
# 2. All headers by level
|
|
features['headers'] = {
|
|
'h1': [],
|
|
'h2': [],
|
|
'h3': [],
|
|
'section_headers': [],
|
|
}
|
|
|
|
if hasattr(doc, 'texts'):
|
|
for txt in doc.texts:
|
|
if hasattr(txt, 'label') and txt.label == 'section_header':
|
|
text = txt.text.strip()
|
|
if text:
|
|
features['headers']['section_headers'].append(text)
|
|
# Docling doesn't preserve heading level from HTML
|
|
# We'll analyze the first few as likely H1/H2
|
|
|
|
# First section header is often the main H1
|
|
if features['headers']['section_headers']:
|
|
features['headers']['h1'] = [features['headers']['section_headers'][0]]
|
|
if len(features['headers']['section_headers']) > 1:
|
|
features['headers']['h2'] = features['headers']['section_headers'][1:10]
|
|
|
|
# 3. Text statistics
|
|
features['text_stats'] = {
|
|
'total_text_elements': 0,
|
|
'total_list_items': 0,
|
|
'total_paragraphs': 0,
|
|
'total_captions': 0,
|
|
}
|
|
|
|
if hasattr(doc, 'texts'):
|
|
for txt in doc.texts:
|
|
label = getattr(txt, 'label', 'unknown')
|
|
if label == 'text':
|
|
features['text_stats']['total_paragraphs'] += 1
|
|
elif label == 'list_item':
|
|
features['text_stats']['total_list_items'] += 1
|
|
elif label == 'caption':
|
|
features['text_stats']['total_captions'] += 1
|
|
features['text_stats']['total_text_elements'] += 1
|
|
|
|
# 4. Image count
|
|
features['image_count'] = len(doc.pictures) if hasattr(doc, 'pictures') else 0
|
|
|
|
# 5. Table count
|
|
features['table_count'] = len(doc.tables) if hasattr(doc, 'tables') else 0
|
|
|
|
# 6. Navigation items (look for common patterns in section headers)
|
|
nav_patterns = [
|
|
'home', 'contact', 'about', 'over ons', 'nieuws', 'news',
|
|
'collectie', 'collection', 'bezoek', 'visit', 'agenda',
|
|
'tickets', 'shop', 'winkel', 'educatie', 'education',
|
|
]
|
|
features['nav_items'] = []
|
|
for header in features['headers']['section_headers'][:20]:
|
|
header_lower = header.lower()
|
|
if any(pat in header_lower for pat in nav_patterns):
|
|
features['nav_items'].append(header)
|
|
|
|
# 7. Has footer indicators
|
|
footer_patterns = ['footer', 'copyright', '©', 'contact', 'adres', 'address']
|
|
features['has_footer_indicators'] = False
|
|
if hasattr(doc, 'texts'):
|
|
for txt in doc.texts[-20:]: # Check last 20 text elements
|
|
text_lower = txt.text.lower()
|
|
if any(pat in text_lower for pat in footer_patterns):
|
|
features['has_footer_indicators'] = True
|
|
break
|
|
|
|
# 8. Has contact section
|
|
contact_patterns = ['email', 'e-mail', 'telefoon', 'phone', 'tel:', '@', 'contact']
|
|
features['has_contact_indicators'] = False
|
|
if hasattr(doc, 'texts'):
|
|
for txt in doc.texts:
|
|
text_lower = txt.text.lower()
|
|
if any(pat in text_lower for pat in contact_patterns):
|
|
features['has_contact_indicators'] = True
|
|
break
|
|
|
|
# 9. Markdown export for full text analysis
|
|
features['markdown_length'] = len(doc.export_to_markdown())
|
|
|
|
# 10. Origin metadata
|
|
if hasattr(doc, 'origin'):
|
|
features['origin'] = {
|
|
'mimetype': doc.origin.mimetype if hasattr(doc.origin, 'mimetype') else None,
|
|
'filename': doc.origin.filename if hasattr(doc.origin, 'filename') else None,
|
|
}
|
|
|
|
except Exception as e:
|
|
features['success'] = False
|
|
features['error'] = str(e)
|
|
logger.error(f"Error processing {html_path}: {e}")
|
|
|
|
return features
|
|
|
|
def process_archive(self, entry_id: str, archive_dir: Path) -> dict[str, Any]:
|
|
"""Process a complete website archive."""
|
|
result = {
|
|
'entry_id': entry_id,
|
|
'archive_path': str(archive_dir),
|
|
'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
'page_count': self.count_pages(archive_dir),
|
|
'main_page': None,
|
|
'all_pages': [],
|
|
}
|
|
|
|
# Find and process main page
|
|
main_page = self.find_main_page(archive_dir)
|
|
if main_page:
|
|
result['main_page'] = self.extract_layout_features(main_page)
|
|
|
|
# List all pages with basic info
|
|
for html_file in archive_dir.rglob('*.html'):
|
|
rel_path = html_file.relative_to(archive_dir)
|
|
result['all_pages'].append({
|
|
'path': str(rel_path),
|
|
'size_bytes': html_file.stat().st_size,
|
|
})
|
|
|
|
return result
|
|
|
|
def generate_basic_claims(self, layout_features: dict) -> list[dict]:
|
|
"""Generate basic claims from layout features."""
|
|
claims = []
|
|
|
|
main_page = layout_features.get('main_page', {})
|
|
if not main_page or not main_page.get('success'):
|
|
return claims
|
|
|
|
# Claim: page_title
|
|
if main_page.get('page_title'):
|
|
claims.append({
|
|
'claim_type': 'page_title',
|
|
'claim_value': main_page['page_title'],
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 1.0, # Direct extraction, high confidence
|
|
})
|
|
|
|
# Claim: main_h1 (likely organization name)
|
|
h1_headers = main_page.get('headers', {}).get('h1', [])
|
|
if h1_headers:
|
|
claims.append({
|
|
'claim_type': 'main_h1',
|
|
'claim_value': h1_headers[0],
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 0.9, # First section header, usually H1
|
|
})
|
|
|
|
# Claim: page_count
|
|
claims.append({
|
|
'claim_type': 'page_count',
|
|
'claim_value': str(layout_features.get('page_count', 0)),
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 1.0,
|
|
})
|
|
|
|
# Claim: image_count
|
|
if main_page.get('image_count', 0) > 0:
|
|
claims.append({
|
|
'claim_type': 'image_count',
|
|
'claim_value': str(main_page['image_count']),
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 1.0,
|
|
})
|
|
|
|
# Claim: has_contact_section
|
|
if main_page.get('has_contact_indicators'):
|
|
claims.append({
|
|
'claim_type': 'has_contact_section',
|
|
'claim_value': 'true',
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 0.8, # Pattern-based detection
|
|
})
|
|
|
|
# Claim: nav_items (as JSON list)
|
|
nav_items = main_page.get('nav_items', [])
|
|
if nav_items:
|
|
claims.append({
|
|
'claim_type': 'nav_items',
|
|
'claim_value': json.dumps(nav_items),
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 0.85,
|
|
})
|
|
|
|
return claims
|
|
|
|
|
|
def main():
    """CLI entry point: extract layout features for archived entries.

    Parses command-line options, walks the web archive directory, extracts
    layout features plus basic claims for each entry, and writes the
    aggregate results (with a success/failure summary) to a YAML file.
    Exits with status 1 when the archive directory or a requested entry is
    missing.
    """
    parser = argparse.ArgumentParser(description='Extract website layout features using Docling')
    parser.add_argument('--web-archive-dir', type=Path,
                        default=Path('data/nde/enriched/entries/web'),
                        help='Path to web archive directory')
    parser.add_argument('--output', type=Path,
                        default=Path('data/nde/layout_features.yaml'),
                        help='Output file for layout features')
    parser.add_argument('--entry', type=str,
                        help='Process single entry ID (e.g., 0001)')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of entries to process')
    parser.add_argument('--sample', action='store_true',
                        help='Just sample 10 entries for testing')

    args = parser.parse_args()

    if not args.web_archive_dir.exists():
        logger.error("Web archive directory not found: %s", args.web_archive_dir)
        sys.exit(1)

    extractor = WebsiteLayoutExtractor(args.web_archive_dir)

    # Find archives to process.
    archives = extractor.find_entry_archives()
    logger.info("Found %d entry archives", len(archives))

    if args.entry:
        archives = [(e, p) for e, p in archives if e == args.entry]
        if not archives:
            logger.error("Entry %s not found", args.entry)
            sys.exit(1)

    if args.sample:
        archives = archives[:10]
    elif args.limit is not None:
        # 'is not None' so an explicit '--limit 0' is honoured instead of
        # being silently ignored (0 is falsy).
        archives = archives[:args.limit]

    # Process archives, accumulating per-entry results and a summary.
    results = {
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'total_archives': len(archives),
        'entries': {},
        'summary': {
            'successful': 0,
            'failed': 0,
            'total_pages': 0,
            'total_images': 0,
        }
    }

    for entry_id, archive_dir in archives:
        logger.info("Processing %s...", entry_id)

        try:
            layout = extractor.process_archive(entry_id, archive_dir)
            claims = extractor.generate_basic_claims(layout)

            results['entries'][entry_id] = {
                'layout': layout,
                'basic_claims': claims,
            }

            results['summary']['successful'] += 1
            results['summary']['total_pages'] += layout.get('page_count', 0)

            if layout.get('main_page', {}).get('success'):
                results['summary']['total_images'] += layout['main_page'].get('image_count', 0)

        except Exception as e:
            # Record the failure and keep going; one bad archive must not
            # abort the whole batch.
            logger.error("Failed to process %s: %s", entry_id, e)
            results['entries'][entry_id] = {'error': str(e)}
            results['summary']['failed'] += 1

    # Save results as human-readable YAML (insertion order preserved).
    with open(args.output, 'w', encoding='utf-8') as f:
        yaml.dump(results, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    logger.info("\nResults saved to: %s", args.output)
    logger.info("Summary:")
    logger.info("  - Successful: %s", results['summary']['successful'])
    logger.info("  - Failed: %s", results['summary']['failed'])
    logger.info("  - Total pages: %s", results['summary']['total_pages'])
    logger.info("  - Total images: %s", results['summary']['total_images'])


if __name__ == '__main__':
    main()