#!/usr/bin/env python3
"""
Extract website layout features using Docling.
This script processes archived websites and extracts structural information
that can be used to establish basic web claims with high confidence:
BASIC FEATURES (high confidence, pattern-based):
- page_title: HTML <title> content
- page_count: Number of HTML pages archived
- h1_headers: All H1-level headers found
- nav_items: Navigation menu items
- image_count: Total images on main page
- has_footer: Whether site has detectable footer
- has_contact_section: Whether contact info section exists
- language: Detected language from content
These features establish a FOUNDATION for claims before doing any NLP extraction.
"""
import argparse
import json
import logging
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import yaml
# Docling imports
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
# Configure root logging once at import time; all loggers in this process
# inherit this level and format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class WebsiteLayoutExtractor:
"""Extract layout features from archived websites using Docling."""
def __init__(self, web_archive_dir: Path):
self.web_archive_dir = web_archive_dir
self.converter = DocumentConverter()
def find_entry_archives(self) -> list[tuple[str, Path]]:
"""Find all entry archives with HTML files."""
archives = []
for entry_dir in sorted(self.web_archive_dir.iterdir()):
if not entry_dir.is_dir() or entry_dir.name.startswith('_'):
continue
# Find HTML files in this archive
html_files = list(entry_dir.rglob('*.html'))
if html_files:
archives.append((entry_dir.name, entry_dir))
return archives
def find_main_page(self, archive_dir: Path) -> Path | None:
"""Find the main index.html page in an archive."""
# Look for index.html at various levels
for pattern in ['**/index.html', '**/home.html', '**/default.html']:
matches = list(archive_dir.glob(pattern))
if matches:
# Prefer shortest path (root index)
return min(matches, key=lambda p: len(p.parts))
# Fallback: any HTML file
html_files = list(archive_dir.rglob('*.html'))
if html_files:
return min(html_files, key=lambda p: len(p.parts))
return None
def count_pages(self, archive_dir: Path) -> int:
"""Count unique HTML pages in archive."""
return len(list(archive_dir.rglob('*.html')))
def extract_layout_features(self, html_path: Path) -> dict[str, Any]:
"""Extract layout features from a single HTML page using Docling."""
features = {
'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
'html_file': str(html_path),
'success': False,
'error': None,
}
try:
result = self.converter.convert(str(html_path))
doc = result.document
features['success'] = True
# 1. Page title
features['page_title'] = None
if hasattr(doc, 'texts'):
for txt in doc.texts:
if hasattr(txt, 'label') and txt.label == 'title':
features['page_title'] = txt.text.strip()
break
# 2. All headers by level
features['headers'] = {
'h1': [],
'h2': [],
'h3': [],
'section_headers': [],
}
if hasattr(doc, 'texts'):
for txt in doc.texts:
if hasattr(txt, 'label') and txt.label == 'section_header':
text = txt.text.strip()
if text:
features['headers']['section_headers'].append(text)
# Docling doesn't preserve heading level from HTML
# We'll analyze the first few as likely H1/H2
# First section header is often the main H1
if features['headers']['section_headers']:
features['headers']['h1'] = [features['headers']['section_headers'][0]]
if len(features['headers']['section_headers']) > 1:
features['headers']['h2'] = features['headers']['section_headers'][1:10]
# 3. Text statistics
features['text_stats'] = {
'total_text_elements': 0,
'total_list_items': 0,
'total_paragraphs': 0,
'total_captions': 0,
}
if hasattr(doc, 'texts'):
for txt in doc.texts:
label = getattr(txt, 'label', 'unknown')
if label == 'text':
features['text_stats']['total_paragraphs'] += 1
elif label == 'list_item':
features['text_stats']['total_list_items'] += 1
elif label == 'caption':
features['text_stats']['total_captions'] += 1
features['text_stats']['total_text_elements'] += 1
# 4. Image count
features['image_count'] = len(doc.pictures) if hasattr(doc, 'pictures') else 0
# 5. Table count
features['table_count'] = len(doc.tables) if hasattr(doc, 'tables') else 0
# 6. Navigation items (look for common patterns in section headers)
nav_patterns = [
'home', 'contact', 'about', 'over ons', 'nieuws', 'news',
'collectie', 'collection', 'bezoek', 'visit', 'agenda',
'tickets', 'shop', 'winkel', 'educatie', 'education',
]
features['nav_items'] = []
for header in features['headers']['section_headers'][:20]:
header_lower = header.lower()
if any(pat in header_lower for pat in nav_patterns):
features['nav_items'].append(header)
# 7. Has footer indicators
footer_patterns = ['footer', 'copyright', '©', 'contact', 'adres', 'address']
features['has_footer_indicators'] = False
if hasattr(doc, 'texts'):
for txt in doc.texts[-20:]: # Check last 20 text elements
text_lower = txt.text.lower()
if any(pat in text_lower for pat in footer_patterns):
features['has_footer_indicators'] = True
break
# 8. Has contact section
contact_patterns = ['email', 'e-mail', 'telefoon', 'phone', 'tel:', '@', 'contact']
features['has_contact_indicators'] = False
if hasattr(doc, 'texts'):
for txt in doc.texts:
text_lower = txt.text.lower()
if any(pat in text_lower for pat in contact_patterns):
features['has_contact_indicators'] = True
break
# 9. Markdown export for full text analysis
features['markdown_length'] = len(doc.export_to_markdown())
# 10. Origin metadata
if hasattr(doc, 'origin'):
features['origin'] = {
'mimetype': doc.origin.mimetype if hasattr(doc.origin, 'mimetype') else None,
'filename': doc.origin.filename if hasattr(doc.origin, 'filename') else None,
}
except Exception as e:
features['success'] = False
features['error'] = str(e)
logger.error(f"Error processing {html_path}: {e}")
return features
def process_archive(self, entry_id: str, archive_dir: Path) -> dict[str, Any]:
"""Process a complete website archive."""
result = {
'entry_id': entry_id,
'archive_path': str(archive_dir),
'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
'page_count': self.count_pages(archive_dir),
'main_page': None,
'all_pages': [],
}
# Find and process main page
main_page = self.find_main_page(archive_dir)
if main_page:
result['main_page'] = self.extract_layout_features(main_page)
# List all pages with basic info
for html_file in archive_dir.rglob('*.html'):
rel_path = html_file.relative_to(archive_dir)
result['all_pages'].append({
'path': str(rel_path),
'size_bytes': html_file.stat().st_size,
})
return result
def generate_basic_claims(self, layout_features: dict) -> list[dict]:
"""Generate basic claims from layout features."""
claims = []
main_page = layout_features.get('main_page', {})
if not main_page or not main_page.get('success'):
return claims
# Claim: page_title
if main_page.get('page_title'):
claims.append({
'claim_type': 'page_title',
'claim_value': main_page['page_title'],
'extraction_method': 'docling_layout',
'confidence': 1.0, # Direct extraction, high confidence
})
# Claim: main_h1 (likely organization name)
h1_headers = main_page.get('headers', {}).get('h1', [])
if h1_headers:
claims.append({
'claim_type': 'main_h1',
'claim_value': h1_headers[0],
'extraction_method': 'docling_layout',
'confidence': 0.9, # First section header, usually H1
})
# Claim: page_count
claims.append({
'claim_type': 'page_count',
'claim_value': str(layout_features.get('page_count', 0)),
'extraction_method': 'docling_layout',
'confidence': 1.0,
})
# Claim: image_count
if main_page.get('image_count', 0) > 0:
claims.append({
'claim_type': 'image_count',
'claim_value': str(main_page['image_count']),
'extraction_method': 'docling_layout',
'confidence': 1.0,
})
# Claim: has_contact_section
if main_page.get('has_contact_indicators'):
claims.append({
'claim_type': 'has_contact_section',
'claim_value': 'true',
'extraction_method': 'docling_layout',
'confidence': 0.8, # Pattern-based detection
})
# Claim: nav_items (as JSON list)
nav_items = main_page.get('nav_items', [])
if nav_items:
claims.append({
'claim_type': 'nav_items',
'claim_value': json.dumps(nav_items),
'extraction_method': 'docling_layout',
'confidence': 0.85,
})
return claims
def main():
    """CLI entry point: extract layout features for all archived websites.

    Reads archives from --web-archive-dir, processes each entry (optionally
    filtered by --entry / --limit / --sample), and writes a YAML report of
    per-entry layout features, basic claims, and an overall summary.
    """
    parser = argparse.ArgumentParser(description='Extract website layout features using Docling')
    parser.add_argument('--web-archive-dir', type=Path,
                        default=Path('data/nde/enriched/entries/web'),
                        help='Path to web archive directory')
    parser.add_argument('--output', type=Path,
                        default=Path('data/nde/layout_features.yaml'),
                        help='Output file for layout features')
    parser.add_argument('--entry', type=str,
                        help='Process single entry ID (e.g., 0001)')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of entries to process')
    parser.add_argument('--sample', action='store_true',
                        help='Just sample 10 entries for testing')
    args = parser.parse_args()

    if not args.web_archive_dir.exists():
        logger.error("Web archive directory not found: %s", args.web_archive_dir)
        sys.exit(1)

    extractor = WebsiteLayoutExtractor(args.web_archive_dir)

    # Find archives to process.
    archives = extractor.find_entry_archives()
    logger.info("Found %d entry archives", len(archives))
    if args.entry:
        archives = [(e, p) for e, p in archives if e == args.entry]
        if not archives:
            logger.error("Entry %s not found", args.entry)
            sys.exit(1)
    if args.sample:
        archives = archives[:10]
    elif args.limit is not None:
        # `is not None` (not truthiness) so that --limit 0 is honored.
        archives = archives[:args.limit]

    results = {
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'total_archives': len(archives),
        'entries': {},
        'summary': {
            'successful': 0,
            'failed': 0,
            'total_pages': 0,
            'total_images': 0,
        }
    }
    for entry_id, archive_dir in archives:
        logger.info(f"Processing {entry_id}...")
        try:
            layout = extractor.process_archive(entry_id, archive_dir)
            claims = extractor.generate_basic_claims(layout)
            results['entries'][entry_id] = {
                'layout': layout,
                'basic_claims': claims,
            }
            results['summary']['successful'] += 1
            results['summary']['total_pages'] += layout.get('page_count', 0)
            # `or {}` guards against main_page being present but None
            # (a plain .get default would not, and .get on None raises).
            main_page = layout.get('main_page') or {}
            if main_page.get('success'):
                results['summary']['total_images'] += main_page.get('image_count', 0)
        except Exception as e:
            # One bad archive should not abort the whole run; record it.
            logger.error(f"Failed to process {entry_id}: {e}")
            results['entries'][entry_id] = {'error': str(e)}
            results['summary']['failed'] += 1

    # Save results, creating the output directory if it does not exist yet.
    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, 'w', encoding='utf-8') as f:
        yaml.dump(results, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    logger.info("\nResults saved to: %s", args.output)
    logger.info("Summary:")
    logger.info("  - Successful: %d", results['summary']['successful'])
    logger.info("  - Failed: %d", results['summary']['failed'])
    logger.info("  - Total pages: %d", results['summary']['total_pages'])
    logger.info("  - Total images: %d", results['summary']['total_images'])
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()