#!/usr/bin/env python3
"""
Integrate docling layout features into NDE entry YAML files.

This script:
1. Reads layout features from layout_features_full.yaml
2. Adds structural claims (TIER 1) and pattern-based claims (TIER 2) to each
   entry's web_claims
3. Records layout metadata (page count, archive path, extraction timestamp)
   on each entry

Usage:
    # Dry run (analyze only)
    python scripts/integrate_layout_features.py --dry-run

    # Integrate all entries
    python scripts/integrate_layout_features.py

    # Single entry
    python scripts/integrate_layout_features.py --entry 0001
"""

import argparse
import logging
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import yaml

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def _as_dict(value: Any) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict.

    YAML keys that are present but null (or of an unexpected type) would
    otherwise break chained ``.get()`` calls.
    """
    return value if isinstance(value, dict) else {}


def _is_positive_number(value: Any) -> bool:
    """Return True when *value* is an int/float strictly greater than zero."""
    return isinstance(value, (int, float)) and value > 0


class LayoutIntegrator:
    """Integrate layout features into entry YAML files.

    Attributes:
        entries_dir: Directory that is globbed for ``*.yaml`` entry files.
        layout_file: YAML file holding the layout features.
        dry_run: When true, entries are analyzed but never written back.
        layout_data: Parsed layout file; ``None`` until
            ``load_layout_features`` has run.
        stats: Counters printed by ``report``. ``entries_processed`` counts
            every entry file examined, ``entries_updated`` the entries that
            were changed (and, outside dry-run mode, successfully written),
            ``entries_skipped`` the entries left untouched, and
            ``claims_added`` the claims appended to updated entries.
    """

    def __init__(self, entries_dir: Path, layout_file: Path, dry_run: bool = False):
        self.entries_dir = entries_dir
        self.layout_file = layout_file
        self.dry_run = dry_run
        self.layout_data: dict[str, Any] | None = None
        self.stats: dict[str, int] = {
            'entries_processed': 0,
            'entries_updated': 0,
            'entries_skipped': 0,
            'claims_added': 0,
        }

    def load_layout_features(self) -> None:
        """Load the layout features file into ``self.layout_data``.

        An empty file is treated as "no layout entries".

        Raises:
            ValueError: If the file's top level is not a mapping.
        """
        logger.info("Loading layout features from %s", self.layout_file)
        with open(self.layout_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # safe_load returns None for an empty document.
        if data is None:
            data = {}
        if not isinstance(data, dict):
            raise ValueError(
                f"Layout file {self.layout_file} must contain a mapping at the "
                f"top level, got {type(data).__name__}"
            )

        self.layout_data = data
        logger.info("Loaded %d layout entries", len(_as_dict(data.get('entries'))))

    def find_entry_files(self) -> list[Path]:
        """Find all entry YAML files, sorted by path."""
        return sorted(self.entries_dir.glob('*.yaml'))

    def get_entry_id(self, path: Path) -> str:
        """Extract entry ID from filename (e.g., '0001_Q123.yaml' -> '0001')."""
        return path.stem.split('_')[0]

    def create_structural_claims(self, layout_entry: dict) -> list[dict]:
        """Create claims from the layout features of one entry.

        TIER 1 claims (``docling_structural``, confidence 1.0) are emitted for
        ``page_title``, ``page_count``, ``image_count``, ``table_count`` and
        ``markdown_length``. TIER 2 claims (``docling_pattern``) are emitted
        for ``main_h1`` (0.9), ``has_contact_section`` and ``has_footer``
        (0.8 each). A claim is only created when its source value is truthy
        (for counts: a positive number).

        Args:
            layout_entry: The layout record for one entry; its ``layout``
                mapping and nested ``main_page`` mapping are read.

        Returns:
            The list of claim dicts, empty when the main page extraction was
            not successful.
        """
        claims: list[dict] = []
        layout = _as_dict(_as_dict(layout_entry).get('layout'))
        main_page = _as_dict(layout.get('main_page'))

        if not main_page.get('success'):
            return claims

        timestamp = datetime.now(timezone.utc).isoformat()
        html_file = main_page.get('html_file', '')

        def add(claim_type: str, value: Any, *, tier: int, confidence: float,
                page_scoped: bool = True) -> None:
            # Key order is kept stable because entries are dumped with
            # sort_keys=False.
            claim: dict[str, Any] = {
                'claim_type': claim_type,
                'claim_value': value,
                'extraction_method': 'docling_structural' if tier == 1 else 'docling_pattern',
                'confidence': confidence,
                'tier': tier,
            }
            if page_scoped:
                claim['html_file'] = html_file
            claim['extraction_timestamp'] = timestamp
            claims.append(claim)

        # page_title (TIER 1, confidence 1.0)
        if main_page.get('page_title'):
            add('page_title', main_page['page_title'], tier=1, confidence=1.0)

        # Numeric TIER 1 claims. page_count comes from the layout as a whole,
        # so it carries no html_file; the others describe the main page.
        numeric_sources = (
            ('page_count', layout, False),
            ('image_count', main_page, True),
            ('table_count', main_page, True),
            ('markdown_length', main_page, True),
        )
        for claim_type, source, page_scoped in numeric_sources:
            value = source.get(claim_type, 0)
            # Null or non-numeric values are skipped instead of raising on `>`.
            if _is_positive_number(value):
                add(claim_type, str(value), tier=1, confidence=1.0,
                    page_scoped=page_scoped)

        # main_h1 (TIER 2, confidence 0.9)
        h1_list = _as_dict(main_page.get('headers')).get('h1')
        if isinstance(h1_list, list) and h1_list:
            add('main_h1', h1_list[0], tier=2, confidence=0.9)

        # has_contact_section (TIER 2, confidence 0.8)
        if main_page.get('has_contact_indicators'):
            add('has_contact_section', 'true', tier=2, confidence=0.8)

        # has_footer (TIER 2, confidence 0.8)
        if main_page.get('has_footer_indicators'):
            add('has_footer', 'true', tier=2, confidence=0.8)

        return claims

    def merge_claims(self, existing_claims: list, new_claims: list) -> tuple[list, int]:
        """Merge new claims with existing, avoiding duplicates by claim_type.

        Existing claims always win: a new claim is appended only when no
        claim of the same ``claim_type`` is already present. Tier and
        confidence are not compared.

        Args:
            existing_claims: Claims already on the entry (``None`` is treated
                as empty). Not mutated.
            new_claims: Candidate claims to add.

        Returns:
            ``(merged, added)``: the merged list and the number of new claims
            that were appended.
        """
        merged = list(existing_claims or [])
        seen_types = {c.get('claim_type') for c in merged if isinstance(c, dict)}

        added = 0
        for claim in new_claims:
            claim_type = claim.get('claim_type')
            if claim_type in seen_types:
                continue
            # Track the type so duplicates within new_claims are dropped too.
            seen_types.add(claim_type)
            merged.append(claim)
            added += 1

        return merged, added

    def process_entry(self, entry_path: Path) -> bool:
        """Process a single entry file.

        Looks up the layout record matching the entry ID, builds claims from
        it, merges them into the entry's ``web_claims`` and (unless in
        dry-run mode) writes the entry back to ``entry_path``.

        Args:
            entry_path: Path of the entry YAML file.

        Returns:
            True if the entry was updated (or would be, in dry-run mode);
            False if it was skipped or could not be read or written.
        """
        entry_id = self.get_entry_id(entry_path)

        # Layout data has not been loaded; nothing can be matched.
        if self.layout_data is None:
            return False

        self.stats['entries_processed'] += 1

        # Get layout data for this entry
        entries = _as_dict(self.layout_data.get('entries'))
        layout_entry = entries.get(entry_id)
        if not layout_entry:
            self.stats['entries_skipped'] += 1
            return False

        # Load entry
        try:
            with open(entry_path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
        except (OSError, ValueError, yaml.YAMLError) as e:
            logger.error("Error loading %s: %s", entry_path, e)
            return False

        if not entry or not isinstance(entry, dict):
            logger.warning("Skipping %s - entry is empty or not a mapping", entry_id)
            self.stats['entries_skipped'] += 1
            return False

        web_claims = entry.get('web_claims')
        if web_claims is None:
            web_claims = {}
        elif not isinstance(web_claims, dict):
            # Never overwrite data whose structure is not understood.
            logger.warning("Skipping %s - web_claims is not a mapping", entry_id)
            self.stats['entries_skipped'] += 1
            return False

        # Check if already integrated
        if web_claims.get('layout_integrated'):
            logger.debug("Skipping %s - layout already integrated", entry_id)
            self.stats['entries_skipped'] += 1
            return False

        # Create structural claims
        new_claims = self.create_structural_claims(layout_entry)
        if not new_claims:
            self.stats['entries_skipped'] += 1
            return False

        # Merge with existing claims
        merged_claims, added = self.merge_claims(web_claims.get('claims') or [], new_claims)

        # Update entry
        web_claims['claims'] = merged_claims
        web_claims['layout_integrated'] = True
        web_claims['layout_integration_timestamp'] = datetime.now(timezone.utc).isoformat()

        # Add layout metadata
        layout = _as_dict(_as_dict(layout_entry).get('layout'))
        web_claims['layout_metadata'] = {
            'page_count': layout.get('page_count', 0),
            'archive_path': layout.get('archive_path'),
            'extraction_timestamp': layout.get('extraction_timestamp'),
        }
        entry['web_claims'] = web_claims

        # Write if not dry run
        if not self.dry_run:
            try:
                # Serialize before touching the file so that a dump failure
                # cannot leave a truncated entry on disk.
                text = yaml.dump(entry, default_flow_style=False,
                                 allow_unicode=True, sort_keys=False)
                entry_path.write_text(text, encoding='utf-8')
            except (OSError, ValueError, yaml.YAMLError) as e:
                logger.error("Error writing %s: %s", entry_path, e)
                return False

        # Count the update only once it has actually succeeded.
        self.stats['entries_updated'] += 1
        self.stats['claims_added'] += added
        return True

    def _filter_entries(self, files: list[Path], entry_filter: str) -> list[Path]:
        """Select the files matching *entry_filter*.

        Files whose entry ID equals the filter are preferred, so that
        ``0001`` does not also select e.g. ``0456_Q100012.yaml``. When no ID
        matches exactly, fall back to substring matching on the file name.
        """
        exact = [f for f in files if self.get_entry_id(f) == entry_filter]
        if exact:
            return exact
        return [f for f in files if entry_filter in f.name]

    def run(self, entry_filter: str | None = None) -> None:
        """Run integration on all entries and print the report.

        Args:
            entry_filter: Optional entry ID (or file-name fragment) that
                restricts which entry files are processed.
        """
        # Load layout features
        self.load_layout_features()

        # Find entry files
        files = self.find_entry_files()
        if entry_filter:
            files = self._filter_entries(files, entry_filter)

        logger.info("Processing %d entry files", len(files))

        for count, path in enumerate(files, start=1):
            self.process_entry(path)
            if count % 100 == 0:
                logger.info("Processed %d/%d entries...", count, len(files))

        self.report()

    def report(self) -> None:
        """Print integration report."""
        print("\n" + "=" * 60)
        print("LAYOUT INTEGRATION REPORT")
        print("=" * 60)
        mode = "DRY RUN" if self.dry_run else "INTEGRATION"
        print(f"\nMode: {mode}")
        print("\nEntries:")
        print(f"  - Processed: {self.stats['entries_processed']}")
        print(f"  - Updated: {self.stats['entries_updated']}")
        print(f"  - Skipped: {self.stats['entries_skipped']}")
        print(f"\nClaims added: {self.stats['claims_added']}")


def main() -> None:
    """Parse command-line arguments and run the layout integration."""
    parser = argparse.ArgumentParser(description='Integrate layout features into entry files')
    parser.add_argument('--entries-dir', type=Path,
                        default=Path('data/nde/enriched/entries'),
                        help='Path to entries directory')
    parser.add_argument('--layout-file', type=Path,
                        default=Path('data/nde/layout_features_full.yaml'),
                        help='Path to layout features file')
    parser.add_argument('--entry', type=str,
                        help='Filter to specific entry ID (e.g., 0001)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Analyze without writing changes')
    args = parser.parse_args()

    # A regular file at this path would otherwise glob to nothing silently.
    if not args.entries_dir.is_dir():
        logger.error("Entries directory not found: %s", args.entries_dir)
        sys.exit(1)

    if not args.layout_file.exists():
        logger.error("Layout file not found: %s", args.layout_file)
        sys.exit(1)

    integrator = LayoutIntegrator(args.entries_dir, args.layout_file, dry_run=args.dry_run)
    integrator.run(entry_filter=args.entry)


if __name__ == '__main__':
    main()