# glam/scripts/integrate_layout_features.py
# Last modified: 2025-12-02 14:36:01 +01:00 (334 lines, 12 KiB, Python)
#!/usr/bin/env python3
"""
Integrate docling layout features into NDE entry YAML files.
This script:
1. Reads layout features from layout_features_full.yaml
2. Adds structural claims (TIER 1) to each entry's web_claims
3. Updates entry metadata with page counts and image counts
Usage:
# Dry run (analyze only)
python scripts/integrate_layout_features.py --dry-run
# Integrate all entries
python scripts/integrate_layout_features.py
# Single entry
python scripts/integrate_layout_features.py --entry 0001
"""
import argparse
import logging
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import yaml
# Configure the root logger once at import time; all module output goes
# through the named logger below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class LayoutIntegrator:
"""Integrate layout features into entry YAML files."""
def __init__(self, entries_dir: Path, layout_file: Path, dry_run: bool = False):
self.entries_dir = entries_dir
self.layout_file = layout_file
self.dry_run = dry_run
self.layout_data = None
self.stats = {
'entries_processed': 0,
'entries_updated': 0,
'entries_skipped': 0,
'claims_added': 0,
}
def load_layout_features(self):
"""Load the layout features file."""
logger.info(f"Loading layout features from {self.layout_file}")
with open(self.layout_file, 'r', encoding='utf-8') as f:
self.layout_data = yaml.safe_load(f)
logger.info(f"Loaded {len(self.layout_data.get('entries', {}))} layout entries")
def find_entry_files(self) -> list[Path]:
"""Find all entry YAML files."""
return sorted(self.entries_dir.glob('*.yaml'))
def get_entry_id(self, path: Path) -> str:
"""Extract entry ID from filename (e.g., '0001_Q123.yaml' -> '0001')."""
return path.stem.split('_')[0]
def create_structural_claims(self, layout_entry: dict) -> list[dict]:
"""Create TIER 1 structural claims from layout features."""
claims = []
layout = layout_entry.get('layout', {})
main_page = layout.get('main_page', {})
if not main_page.get('success'):
return claims
timestamp = datetime.now(timezone.utc).isoformat()
html_file = main_page.get('html_file', '')
# page_title (TIER 1, confidence 1.0)
if main_page.get('page_title'):
claims.append({
'claim_type': 'page_title',
'claim_value': main_page['page_title'],
'extraction_method': 'docling_structural',
'confidence': 1.0,
'tier': 1,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# page_count (TIER 1, confidence 1.0)
page_count = layout.get('page_count', 0)
if page_count > 0:
claims.append({
'claim_type': 'page_count',
'claim_value': str(page_count),
'extraction_method': 'docling_structural',
'confidence': 1.0,
'tier': 1,
'extraction_timestamp': timestamp,
})
# image_count (TIER 1, confidence 1.0)
image_count = main_page.get('image_count', 0)
if image_count > 0:
claims.append({
'claim_type': 'image_count',
'claim_value': str(image_count),
'extraction_method': 'docling_structural',
'confidence': 1.0,
'tier': 1,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# table_count (TIER 1, confidence 1.0)
table_count = main_page.get('table_count', 0)
if table_count > 0:
claims.append({
'claim_type': 'table_count',
'claim_value': str(table_count),
'extraction_method': 'docling_structural',
'confidence': 1.0,
'tier': 1,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# markdown_length (TIER 1, confidence 1.0)
markdown_length = main_page.get('markdown_length', 0)
if markdown_length > 0:
claims.append({
'claim_type': 'markdown_length',
'claim_value': str(markdown_length),
'extraction_method': 'docling_structural',
'confidence': 1.0,
'tier': 1,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# main_h1 (TIER 2, confidence 0.9)
headers = main_page.get('headers', {})
h1_list = headers.get('h1', [])
if h1_list:
claims.append({
'claim_type': 'main_h1',
'claim_value': h1_list[0],
'extraction_method': 'docling_pattern',
'confidence': 0.9,
'tier': 2,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# has_contact_section (TIER 2, confidence 0.8)
if main_page.get('has_contact_indicators'):
claims.append({
'claim_type': 'has_contact_section',
'claim_value': 'true',
'extraction_method': 'docling_pattern',
'confidence': 0.8,
'tier': 2,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# has_footer (TIER 2, confidence 0.8)
if main_page.get('has_footer_indicators'):
claims.append({
'claim_type': 'has_footer',
'claim_value': 'true',
'extraction_method': 'docling_pattern',
'confidence': 0.8,
'tier': 2,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
return claims
def merge_claims(self, existing_claims: list, new_claims: list) -> tuple[list, int]:
"""Merge new claims with existing, avoiding duplicates by claim_type."""
# Get existing claim types
existing_types = {c.get('claim_type') for c in existing_claims}
# Add new claims that don't conflict
merged = list(existing_claims)
added = 0
for claim in new_claims:
claim_type = claim.get('claim_type')
# Only add if type doesn't exist OR if new claim is higher tier
if claim_type not in existing_types:
merged.append(claim)
added += 1
return merged, added
def process_entry(self, entry_path: Path) -> bool:
"""Process a single entry file."""
entry_id = self.get_entry_id(entry_path)
# Get layout data for this entry
if not self.layout_data:
return False
entries = self.layout_data.get('entries', {})
layout_entry = entries.get(entry_id) if entries else None
if not layout_entry:
self.stats['entries_skipped'] += 1
return False
# Load entry
try:
with open(entry_path, 'r', encoding='utf-8') as f:
entry = yaml.safe_load(f)
except Exception as e:
logger.error(f"Error loading {entry_path}: {e}")
return False
if not entry:
return False
# Check if already integrated
web_claims = entry.get('web_claims', {})
if web_claims.get('layout_integrated'):
logger.debug(f"Skipping {entry_id} - layout already integrated")
self.stats['entries_skipped'] += 1
return False
# Create structural claims
new_claims = self.create_structural_claims(layout_entry)
if not new_claims:
self.stats['entries_skipped'] += 1
return False
# Merge with existing claims
existing_claims = web_claims.get('claims', [])
merged_claims, added = self.merge_claims(existing_claims, new_claims)
# Update entry
if 'web_claims' not in entry:
entry['web_claims'] = {}
entry['web_claims']['claims'] = merged_claims
entry['web_claims']['layout_integrated'] = True
entry['web_claims']['layout_integration_timestamp'] = datetime.now(timezone.utc).isoformat()
# Add layout metadata
layout = layout_entry.get('layout', {})
entry['web_claims']['layout_metadata'] = {
'page_count': layout.get('page_count', 0),
'archive_path': layout.get('archive_path'),
'extraction_timestamp': layout.get('extraction_timestamp'),
}
self.stats['entries_processed'] += 1
self.stats['entries_updated'] += 1
self.stats['claims_added'] += added
# Write if not dry run
if not self.dry_run:
try:
with open(entry_path, 'w', encoding='utf-8') as f:
yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
except Exception as e:
logger.error(f"Error writing {entry_path}: {e}")
return False
return True
def run(self, entry_filter: str | None = None):
"""Run integration on all entries."""
# Load layout features
self.load_layout_features()
# Find entry files
files = self.find_entry_files()
if entry_filter:
files = [f for f in files if entry_filter in f.name]
logger.info(f"Processing {len(files)} entry files")
for i, path in enumerate(files):
self.process_entry(path)
if (i + 1) % 100 == 0:
logger.info(f"Processed {i + 1}/{len(files)} entries...")
self.report()
def report(self):
"""Print integration report."""
print("\n" + "=" * 60)
print("LAYOUT INTEGRATION REPORT")
print("=" * 60)
mode = "DRY RUN" if self.dry_run else "INTEGRATION"
print(f"\nMode: {mode}")
print(f"\nEntries:")
print(f" - Processed: {self.stats['entries_processed']}")
print(f" - Updated: {self.stats['entries_updated']}")
print(f" - Skipped: {self.stats['entries_skipped']}")
print(f"\nClaims added: {self.stats['claims_added']}")
def main():
    """CLI entry point: parse arguments, validate paths, run integration."""
    parser = argparse.ArgumentParser(
        description='Integrate layout features into entry files')
    parser.add_argument('--entries-dir', type=Path,
                        default=Path('data/nde/enriched/entries'),
                        help='Path to entries directory')
    parser.add_argument('--layout-file', type=Path,
                        default=Path('data/nde/layout_features_full.yaml'),
                        help='Path to layout features file')
    parser.add_argument('--entry', type=str,
                        help='Filter to specific entry ID (e.g., 0001)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Analyze without writing changes')
    args = parser.parse_args()

    # Fail fast on missing inputs before touching any entry file.
    for path, label in ((args.entries_dir, 'Entries directory'),
                        (args.layout_file, 'Layout file')):
        if not path.exists():
            logger.error(f"{label} not found: {path}")
            sys.exit(1)

    integrator = LayoutIntegrator(args.entries_dir, args.layout_file,
                                  dry_run=args.dry_run)
    integrator.run(entry_filter=args.entry)


if __name__ == '__main__':
    main()