#!/usr/bin/env python3
"""
Integrate docling layout features into NDE entry YAML files.

This script:
1. Reads layout features from layout_features_full.yaml
2. Adds structural claims (TIER 1) to each entry's web_claims
3. Updates entry metadata with page counts and image counts

Usage:
    # Dry run (analyze only)
    python scripts/integrate_layout_features.py --dry-run

    # Integrate all entries
    python scripts/integrate_layout_features.py

    # Single entry
    python scripts/integrate_layout_features.py --entry 0001
"""

import argparse
import logging
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import yaml

# Timestamped log lines help track progress when processing many entries.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
class LayoutIntegrator:
|
|
"""Integrate layout features into entry YAML files."""
|
|
|
|
def __init__(self, entries_dir: Path, layout_file: Path, dry_run: bool = False):
|
|
self.entries_dir = entries_dir
|
|
self.layout_file = layout_file
|
|
self.dry_run = dry_run
|
|
self.layout_data = None
|
|
self.stats = {
|
|
'entries_processed': 0,
|
|
'entries_updated': 0,
|
|
'entries_skipped': 0,
|
|
'claims_added': 0,
|
|
}
|
|
|
|
def load_layout_features(self):
|
|
"""Load the layout features file."""
|
|
logger.info(f"Loading layout features from {self.layout_file}")
|
|
with open(self.layout_file, 'r', encoding='utf-8') as f:
|
|
self.layout_data = yaml.safe_load(f)
|
|
logger.info(f"Loaded {len(self.layout_data.get('entries', {}))} layout entries")
|
|
|
|
def find_entry_files(self) -> list[Path]:
|
|
"""Find all entry YAML files."""
|
|
return sorted(self.entries_dir.glob('*.yaml'))
|
|
|
|
def get_entry_id(self, path: Path) -> str:
|
|
"""Extract entry ID from filename (e.g., '0001_Q123.yaml' -> '0001')."""
|
|
return path.stem.split('_')[0]
|
|
|
|
def create_structural_claims(self, layout_entry: dict) -> list[dict]:
|
|
"""Create TIER 1 structural claims from layout features."""
|
|
claims = []
|
|
layout = layout_entry.get('layout', {})
|
|
main_page = layout.get('main_page', {})
|
|
|
|
if not main_page.get('success'):
|
|
return claims
|
|
|
|
timestamp = datetime.now(timezone.utc).isoformat()
|
|
html_file = main_page.get('html_file', '')
|
|
|
|
# page_title (TIER 1, confidence 1.0)
|
|
if main_page.get('page_title'):
|
|
claims.append({
|
|
'claim_type': 'page_title',
|
|
'claim_value': main_page['page_title'],
|
|
'extraction_method': 'docling_structural',
|
|
'confidence': 1.0,
|
|
'tier': 1,
|
|
'html_file': html_file,
|
|
'extraction_timestamp': timestamp,
|
|
})
|
|
|
|
# page_count (TIER 1, confidence 1.0)
|
|
page_count = layout.get('page_count', 0)
|
|
if page_count > 0:
|
|
claims.append({
|
|
'claim_type': 'page_count',
|
|
'claim_value': str(page_count),
|
|
'extraction_method': 'docling_structural',
|
|
'confidence': 1.0,
|
|
'tier': 1,
|
|
'extraction_timestamp': timestamp,
|
|
})
|
|
|
|
# image_count (TIER 1, confidence 1.0)
|
|
image_count = main_page.get('image_count', 0)
|
|
if image_count > 0:
|
|
claims.append({
|
|
'claim_type': 'image_count',
|
|
'claim_value': str(image_count),
|
|
'extraction_method': 'docling_structural',
|
|
'confidence': 1.0,
|
|
'tier': 1,
|
|
'html_file': html_file,
|
|
'extraction_timestamp': timestamp,
|
|
})
|
|
|
|
# table_count (TIER 1, confidence 1.0)
|
|
table_count = main_page.get('table_count', 0)
|
|
if table_count > 0:
|
|
claims.append({
|
|
'claim_type': 'table_count',
|
|
'claim_value': str(table_count),
|
|
'extraction_method': 'docling_structural',
|
|
'confidence': 1.0,
|
|
'tier': 1,
|
|
'html_file': html_file,
|
|
'extraction_timestamp': timestamp,
|
|
})
|
|
|
|
# markdown_length (TIER 1, confidence 1.0)
|
|
markdown_length = main_page.get('markdown_length', 0)
|
|
if markdown_length > 0:
|
|
claims.append({
|
|
'claim_type': 'markdown_length',
|
|
'claim_value': str(markdown_length),
|
|
'extraction_method': 'docling_structural',
|
|
'confidence': 1.0,
|
|
'tier': 1,
|
|
'html_file': html_file,
|
|
'extraction_timestamp': timestamp,
|
|
})
|
|
|
|
# main_h1 (TIER 2, confidence 0.9)
|
|
headers = main_page.get('headers', {})
|
|
h1_list = headers.get('h1', [])
|
|
if h1_list:
|
|
claims.append({
|
|
'claim_type': 'main_h1',
|
|
'claim_value': h1_list[0],
|
|
'extraction_method': 'docling_pattern',
|
|
'confidence': 0.9,
|
|
'tier': 2,
|
|
'html_file': html_file,
|
|
'extraction_timestamp': timestamp,
|
|
})
|
|
|
|
# has_contact_section (TIER 2, confidence 0.8)
|
|
if main_page.get('has_contact_indicators'):
|
|
claims.append({
|
|
'claim_type': 'has_contact_section',
|
|
'claim_value': 'true',
|
|
'extraction_method': 'docling_pattern',
|
|
'confidence': 0.8,
|
|
'tier': 2,
|
|
'html_file': html_file,
|
|
'extraction_timestamp': timestamp,
|
|
})
|
|
|
|
# has_footer (TIER 2, confidence 0.8)
|
|
if main_page.get('has_footer_indicators'):
|
|
claims.append({
|
|
'claim_type': 'has_footer',
|
|
'claim_value': 'true',
|
|
'extraction_method': 'docling_pattern',
|
|
'confidence': 0.8,
|
|
'tier': 2,
|
|
'html_file': html_file,
|
|
'extraction_timestamp': timestamp,
|
|
})
|
|
|
|
return claims
|
|
|
|
def merge_claims(self, existing_claims: list, new_claims: list) -> tuple[list, int]:
|
|
"""Merge new claims with existing, avoiding duplicates by claim_type."""
|
|
# Get existing claim types
|
|
existing_types = {c.get('claim_type') for c in existing_claims}
|
|
|
|
# Add new claims that don't conflict
|
|
merged = list(existing_claims)
|
|
added = 0
|
|
for claim in new_claims:
|
|
claim_type = claim.get('claim_type')
|
|
# Only add if type doesn't exist OR if new claim is higher tier
|
|
if claim_type not in existing_types:
|
|
merged.append(claim)
|
|
added += 1
|
|
|
|
return merged, added
|
|
|
|
def process_entry(self, entry_path: Path) -> bool:
|
|
"""Process a single entry file."""
|
|
entry_id = self.get_entry_id(entry_path)
|
|
|
|
# Get layout data for this entry
|
|
if not self.layout_data:
|
|
return False
|
|
entries = self.layout_data.get('entries', {})
|
|
layout_entry = entries.get(entry_id) if entries else None
|
|
if not layout_entry:
|
|
self.stats['entries_skipped'] += 1
|
|
return False
|
|
|
|
# Load entry
|
|
try:
|
|
with open(entry_path, 'r', encoding='utf-8') as f:
|
|
entry = yaml.safe_load(f)
|
|
except Exception as e:
|
|
logger.error(f"Error loading {entry_path}: {e}")
|
|
return False
|
|
|
|
if not entry:
|
|
return False
|
|
|
|
# Check if already integrated
|
|
web_claims = entry.get('web_claims', {})
|
|
if web_claims.get('layout_integrated'):
|
|
logger.debug(f"Skipping {entry_id} - layout already integrated")
|
|
self.stats['entries_skipped'] += 1
|
|
return False
|
|
|
|
# Create structural claims
|
|
new_claims = self.create_structural_claims(layout_entry)
|
|
if not new_claims:
|
|
self.stats['entries_skipped'] += 1
|
|
return False
|
|
|
|
# Merge with existing claims
|
|
existing_claims = web_claims.get('claims', [])
|
|
merged_claims, added = self.merge_claims(existing_claims, new_claims)
|
|
|
|
# Update entry
|
|
if 'web_claims' not in entry:
|
|
entry['web_claims'] = {}
|
|
|
|
entry['web_claims']['claims'] = merged_claims
|
|
entry['web_claims']['layout_integrated'] = True
|
|
entry['web_claims']['layout_integration_timestamp'] = datetime.now(timezone.utc).isoformat()
|
|
|
|
# Add layout metadata
|
|
layout = layout_entry.get('layout', {})
|
|
entry['web_claims']['layout_metadata'] = {
|
|
'page_count': layout.get('page_count', 0),
|
|
'archive_path': layout.get('archive_path'),
|
|
'extraction_timestamp': layout.get('extraction_timestamp'),
|
|
}
|
|
|
|
self.stats['entries_processed'] += 1
|
|
self.stats['entries_updated'] += 1
|
|
self.stats['claims_added'] += added
|
|
|
|
# Write if not dry run
|
|
if not self.dry_run:
|
|
try:
|
|
with open(entry_path, 'w', encoding='utf-8') as f:
|
|
yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
except Exception as e:
|
|
logger.error(f"Error writing {entry_path}: {e}")
|
|
return False
|
|
|
|
return True
|
|
|
|
def run(self, entry_filter: str | None = None):
|
|
"""Run integration on all entries."""
|
|
# Load layout features
|
|
self.load_layout_features()
|
|
|
|
# Find entry files
|
|
files = self.find_entry_files()
|
|
|
|
if entry_filter:
|
|
files = [f for f in files if entry_filter in f.name]
|
|
|
|
logger.info(f"Processing {len(files)} entry files")
|
|
|
|
for i, path in enumerate(files):
|
|
self.process_entry(path)
|
|
if (i + 1) % 100 == 0:
|
|
logger.info(f"Processed {i + 1}/{len(files)} entries...")
|
|
|
|
self.report()
|
|
|
|
def report(self):
|
|
"""Print integration report."""
|
|
print("\n" + "=" * 60)
|
|
print("LAYOUT INTEGRATION REPORT")
|
|
print("=" * 60)
|
|
|
|
mode = "DRY RUN" if self.dry_run else "INTEGRATION"
|
|
print(f"\nMode: {mode}")
|
|
print(f"\nEntries:")
|
|
print(f" - Processed: {self.stats['entries_processed']}")
|
|
print(f" - Updated: {self.stats['entries_updated']}")
|
|
print(f" - Skipped: {self.stats['entries_skipped']}")
|
|
print(f"\nClaims added: {self.stats['claims_added']}")
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, validate input paths, run integration."""
    parser = argparse.ArgumentParser(
        description='Integrate layout features into entry files')
    parser.add_argument(
        '--entries-dir', type=Path,
        default=Path('data/nde/enriched/entries'),
        help='Path to entries directory')
    parser.add_argument(
        '--layout-file', type=Path,
        default=Path('data/nde/layout_features_full.yaml'),
        help='Path to layout features file')
    parser.add_argument(
        '--entry', type=str,
        help='Filter to specific entry ID (e.g., 0001)')
    parser.add_argument(
        '--dry-run', action='store_true',
        help='Analyze without writing changes')

    opts = parser.parse_args()

    # Fail fast when either required input path is missing.
    required_paths = (
        ('Entries directory', opts.entries_dir),
        ('Layout file', opts.layout_file),
    )
    for label, candidate in required_paths:
        if not candidate.exists():
            logger.error(f"{label} not found: {candidate}")
            sys.exit(1)

    LayoutIntegrator(
        opts.entries_dir, opts.layout_file, dry_run=opts.dry_run,
    ).run(entry_filter=opts.entry)


if __name__ == '__main__':
    main()
|