#!/usr/bin/env python3
"""
Extract website layout features using Docling.

This script processes archived websites and extracts structural information
that can be used to establish basic web claims with high confidence:

BASIC FEATURES (high confidence, pattern-based):
- page_title: HTML <title> content
- page_count: Number of HTML pages archived
- h1_headers: All H1-level headers found
- nav_items: Navigation menu items
- image_count: Total images on main page
- has_footer: Whether site has detectable footer
- has_contact_section: Whether contact info section exists
- language: Detected language from content

These features establish a FOUNDATION for claims before doing any NLP extraction.
"""
import argparse
import json
import logging
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import yaml

# Docling imports
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat

# Module-wide logging: timestamped, INFO-level messages.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class WebsiteLayoutExtractor:
|
|
"""Extract layout features from archived websites using Docling."""
|
|
|
|
def __init__(self, web_archive_dir: Path):
|
|
self.web_archive_dir = web_archive_dir
|
|
self.converter = DocumentConverter()
|
|
|
|
def find_entry_archives(self) -> list[tuple[str, Path]]:
|
|
"""Find all entry archives with HTML files."""
|
|
archives = []
|
|
|
|
for entry_dir in sorted(self.web_archive_dir.iterdir()):
|
|
if not entry_dir.is_dir() or entry_dir.name.startswith('_'):
|
|
continue
|
|
|
|
# Find HTML files in this archive
|
|
html_files = list(entry_dir.rglob('*.html'))
|
|
if html_files:
|
|
archives.append((entry_dir.name, entry_dir))
|
|
|
|
return archives
|
|
|
|
def find_main_page(self, archive_dir: Path) -> Path | None:
|
|
"""Find the main index.html page in an archive."""
|
|
# Look for index.html at various levels
|
|
for pattern in ['**/index.html', '**/home.html', '**/default.html']:
|
|
matches = list(archive_dir.glob(pattern))
|
|
if matches:
|
|
# Prefer shortest path (root index)
|
|
return min(matches, key=lambda p: len(p.parts))
|
|
|
|
# Fallback: any HTML file
|
|
html_files = list(archive_dir.rglob('*.html'))
|
|
if html_files:
|
|
return min(html_files, key=lambda p: len(p.parts))
|
|
|
|
return None
|
|
|
|
def count_pages(self, archive_dir: Path) -> int:
|
|
"""Count unique HTML pages in archive."""
|
|
return len(list(archive_dir.rglob('*.html')))
|
|
|
|
def extract_layout_features(self, html_path: Path) -> dict[str, Any]:
|
|
"""Extract layout features from a single HTML page using Docling."""
|
|
features = {
|
|
'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
'html_file': str(html_path),
|
|
'success': False,
|
|
'error': None,
|
|
}
|
|
|
|
try:
|
|
result = self.converter.convert(str(html_path))
|
|
doc = result.document
|
|
|
|
features['success'] = True
|
|
|
|
# 1. Page title
|
|
features['page_title'] = None
|
|
if hasattr(doc, 'texts'):
|
|
for txt in doc.texts:
|
|
if hasattr(txt, 'label') and txt.label == 'title':
|
|
features['page_title'] = txt.text.strip()
|
|
break
|
|
|
|
# 2. All headers by level
|
|
features['headers'] = {
|
|
'h1': [],
|
|
'h2': [],
|
|
'h3': [],
|
|
'section_headers': [],
|
|
}
|
|
|
|
if hasattr(doc, 'texts'):
|
|
for txt in doc.texts:
|
|
if hasattr(txt, 'label') and txt.label == 'section_header':
|
|
text = txt.text.strip()
|
|
if text:
|
|
features['headers']['section_headers'].append(text)
|
|
# Docling doesn't preserve heading level from HTML
|
|
# We'll analyze the first few as likely H1/H2
|
|
|
|
# First section header is often the main H1
|
|
if features['headers']['section_headers']:
|
|
features['headers']['h1'] = [features['headers']['section_headers'][0]]
|
|
if len(features['headers']['section_headers']) > 1:
|
|
features['headers']['h2'] = features['headers']['section_headers'][1:10]
|
|
|
|
# 3. Text statistics
|
|
features['text_stats'] = {
|
|
'total_text_elements': 0,
|
|
'total_list_items': 0,
|
|
'total_paragraphs': 0,
|
|
'total_captions': 0,
|
|
}
|
|
|
|
if hasattr(doc, 'texts'):
|
|
for txt in doc.texts:
|
|
label = getattr(txt, 'label', 'unknown')
|
|
if label == 'text':
|
|
features['text_stats']['total_paragraphs'] += 1
|
|
elif label == 'list_item':
|
|
features['text_stats']['total_list_items'] += 1
|
|
elif label == 'caption':
|
|
features['text_stats']['total_captions'] += 1
|
|
features['text_stats']['total_text_elements'] += 1
|
|
|
|
# 4. Image count
|
|
features['image_count'] = len(doc.pictures) if hasattr(doc, 'pictures') else 0
|
|
|
|
# 5. Table count
|
|
features['table_count'] = len(doc.tables) if hasattr(doc, 'tables') else 0
|
|
|
|
# 6. Navigation items (look for common patterns in section headers)
|
|
nav_patterns = [
|
|
'home', 'contact', 'about', 'over ons', 'nieuws', 'news',
|
|
'collectie', 'collection', 'bezoek', 'visit', 'agenda',
|
|
'tickets', 'shop', 'winkel', 'educatie', 'education',
|
|
]
|
|
features['nav_items'] = []
|
|
for header in features['headers']['section_headers'][:20]:
|
|
header_lower = header.lower()
|
|
if any(pat in header_lower for pat in nav_patterns):
|
|
features['nav_items'].append(header)
|
|
|
|
# 7. Has footer indicators
|
|
footer_patterns = ['footer', 'copyright', '©', 'contact', 'adres', 'address']
|
|
features['has_footer_indicators'] = False
|
|
if hasattr(doc, 'texts'):
|
|
for txt in doc.texts[-20:]: # Check last 20 text elements
|
|
text_lower = txt.text.lower()
|
|
if any(pat in text_lower for pat in footer_patterns):
|
|
features['has_footer_indicators'] = True
|
|
break
|
|
|
|
# 8. Has contact section
|
|
contact_patterns = ['email', 'e-mail', 'telefoon', 'phone', 'tel:', '@', 'contact']
|
|
features['has_contact_indicators'] = False
|
|
if hasattr(doc, 'texts'):
|
|
for txt in doc.texts:
|
|
text_lower = txt.text.lower()
|
|
if any(pat in text_lower for pat in contact_patterns):
|
|
features['has_contact_indicators'] = True
|
|
break
|
|
|
|
# 9. Markdown export for full text analysis
|
|
features['markdown_length'] = len(doc.export_to_markdown())
|
|
|
|
# 10. Origin metadata
|
|
if hasattr(doc, 'origin'):
|
|
features['origin'] = {
|
|
'mimetype': doc.origin.mimetype if hasattr(doc.origin, 'mimetype') else None,
|
|
'filename': doc.origin.filename if hasattr(doc.origin, 'filename') else None,
|
|
}
|
|
|
|
except Exception as e:
|
|
features['success'] = False
|
|
features['error'] = str(e)
|
|
logger.error(f"Error processing {html_path}: {e}")
|
|
|
|
return features
|
|
|
|
def process_archive(self, entry_id: str, archive_dir: Path) -> dict[str, Any]:
|
|
"""Process a complete website archive."""
|
|
result = {
|
|
'entry_id': entry_id,
|
|
'archive_path': str(archive_dir),
|
|
'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
'page_count': self.count_pages(archive_dir),
|
|
'main_page': None,
|
|
'all_pages': [],
|
|
}
|
|
|
|
# Find and process main page
|
|
main_page = self.find_main_page(archive_dir)
|
|
if main_page:
|
|
result['main_page'] = self.extract_layout_features(main_page)
|
|
|
|
# List all pages with basic info
|
|
for html_file in archive_dir.rglob('*.html'):
|
|
rel_path = html_file.relative_to(archive_dir)
|
|
result['all_pages'].append({
|
|
'path': str(rel_path),
|
|
'size_bytes': html_file.stat().st_size,
|
|
})
|
|
|
|
return result
|
|
|
|
def generate_basic_claims(self, layout_features: dict) -> list[dict]:
|
|
"""Generate basic claims from layout features."""
|
|
claims = []
|
|
|
|
main_page = layout_features.get('main_page', {})
|
|
if not main_page or not main_page.get('success'):
|
|
return claims
|
|
|
|
# Claim: page_title
|
|
if main_page.get('page_title'):
|
|
claims.append({
|
|
'claim_type': 'page_title',
|
|
'claim_value': main_page['page_title'],
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 1.0, # Direct extraction, high confidence
|
|
})
|
|
|
|
# Claim: main_h1 (likely organization name)
|
|
h1_headers = main_page.get('headers', {}).get('h1', [])
|
|
if h1_headers:
|
|
claims.append({
|
|
'claim_type': 'main_h1',
|
|
'claim_value': h1_headers[0],
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 0.9, # First section header, usually H1
|
|
})
|
|
|
|
# Claim: page_count
|
|
claims.append({
|
|
'claim_type': 'page_count',
|
|
'claim_value': str(layout_features.get('page_count', 0)),
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 1.0,
|
|
})
|
|
|
|
# Claim: image_count
|
|
if main_page.get('image_count', 0) > 0:
|
|
claims.append({
|
|
'claim_type': 'image_count',
|
|
'claim_value': str(main_page['image_count']),
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 1.0,
|
|
})
|
|
|
|
# Claim: has_contact_section
|
|
if main_page.get('has_contact_indicators'):
|
|
claims.append({
|
|
'claim_type': 'has_contact_section',
|
|
'claim_value': 'true',
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 0.8, # Pattern-based detection
|
|
})
|
|
|
|
# Claim: nav_items (as JSON list)
|
|
nav_items = main_page.get('nav_items', [])
|
|
if nav_items:
|
|
claims.append({
|
|
'claim_type': 'nav_items',
|
|
'claim_value': json.dumps(nav_items),
|
|
'extraction_method': 'docling_layout',
|
|
'confidence': 0.85,
|
|
})
|
|
|
|
return claims
|
|
|
|
|
|
def main():
    """CLI entry point: extract layout features for archived entries.

    Parses command-line options, walks the web archive directory, extracts
    layout features plus basic claims for each entry, and writes the
    aggregate results (with a success/failure summary) to a YAML file.
    Exits with status 1 when the archive directory or a requested entry is
    missing.
    """
    parser = argparse.ArgumentParser(description='Extract website layout features using Docling')
    parser.add_argument('--web-archive-dir', type=Path,
                        default=Path('data/nde/enriched/entries/web'),
                        help='Path to web archive directory')
    parser.add_argument('--output', type=Path,
                        default=Path('data/nde/layout_features.yaml'),
                        help='Output file for layout features')
    parser.add_argument('--entry', type=str,
                        help='Process single entry ID (e.g., 0001)')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of entries to process')
    parser.add_argument('--sample', action='store_true',
                        help='Just sample 10 entries for testing')

    args = parser.parse_args()

    if not args.web_archive_dir.exists():
        logger.error("Web archive directory not found: %s", args.web_archive_dir)
        sys.exit(1)

    extractor = WebsiteLayoutExtractor(args.web_archive_dir)

    # Find archives to process.
    archives = extractor.find_entry_archives()
    logger.info("Found %d entry archives", len(archives))

    if args.entry:
        archives = [(e, p) for e, p in archives if e == args.entry]
        if not archives:
            logger.error("Entry %s not found", args.entry)
            sys.exit(1)

    if args.sample:
        archives = archives[:10]
    elif args.limit is not None:
        # 'is not None' so an explicit '--limit 0' is honoured instead of
        # being silently ignored (0 is falsy).
        archives = archives[:args.limit]

    # Process archives, accumulating per-entry results and a summary.
    results = {
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'total_archives': len(archives),
        'entries': {},
        'summary': {
            'successful': 0,
            'failed': 0,
            'total_pages': 0,
            'total_images': 0,
        }
    }

    for entry_id, archive_dir in archives:
        logger.info("Processing %s...", entry_id)

        try:
            layout = extractor.process_archive(entry_id, archive_dir)
            claims = extractor.generate_basic_claims(layout)

            results['entries'][entry_id] = {
                'layout': layout,
                'basic_claims': claims,
            }

            results['summary']['successful'] += 1
            results['summary']['total_pages'] += layout.get('page_count', 0)

            if layout.get('main_page', {}).get('success'):
                results['summary']['total_images'] += layout['main_page'].get('image_count', 0)

        except Exception as e:
            # Record the failure and keep going; one bad archive must not
            # abort the whole batch.
            logger.error("Failed to process %s: %s", entry_id, e)
            results['entries'][entry_id] = {'error': str(e)}
            results['summary']['failed'] += 1

    # Save results as human-readable YAML (insertion order preserved).
    with open(args.output, 'w', encoding='utf-8') as f:
        yaml.dump(results, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    logger.info("\nResults saved to: %s", args.output)
    logger.info("Summary:")
    logger.info("  - Successful: %s", results['summary']['successful'])
    logger.info("  - Failed: %s", results['summary']['failed'])
    logger.info("  - Total pages: %s", results['summary']['total_pages'])
    logger.info("  - Total images: %s", results['summary']['total_images'])


if __name__ == '__main__':
    main()