# glam/scripts/integrate_layout_features.py
# Last modified: 2025-12-02 14:36:01 +01:00 (334 lines, 12 KiB, Python)
#!/usr/bin/env python3
"""
Integrate docling layout features into NDE entry YAML files.
This script:
1. Reads layout features from layout_features_full.yaml
2. Adds structural claims (TIER 1) to each entry's web_claims
3. Updates entry metadata with page counts and image counts
Usage:
# Dry run (analyze only)
python scripts/integrate_layout_features.py --dry-run
# Integrate all entries
python scripts/integrate_layout_features.py
# Single entry
python scripts/integrate_layout_features.py --entry 0001
"""
import argparse
import logging
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import yaml
# Configure the root logger once at import time; all module output goes
# through the named logger below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class LayoutIntegrator:
"""Integrate layout features into entry YAML files."""
def __init__(self, entries_dir: Path, layout_file: Path, dry_run: bool = False):
self.entries_dir = entries_dir
self.layout_file = layout_file
self.dry_run = dry_run
self.layout_data = None
self.stats = {
'entries_processed': 0,
'entries_updated': 0,
'entries_skipped': 0,
'claims_added': 0,
}
def load_layout_features(self):
"""Load the layout features file."""
logger.info(f"Loading layout features from {self.layout_file}")
with open(self.layout_file, 'r', encoding='utf-8') as f:
self.layout_data = yaml.safe_load(f)
logger.info(f"Loaded {len(self.layout_data.get('entries', {}))} layout entries")
def find_entry_files(self) -> list[Path]:
"""Find all entry YAML files."""
return sorted(self.entries_dir.glob('*.yaml'))
def get_entry_id(self, path: Path) -> str:
"""Extract entry ID from filename (e.g., '0001_Q123.yaml' -> '0001')."""
return path.stem.split('_')[0]
def create_structural_claims(self, layout_entry: dict) -> list[dict]:
"""Create TIER 1 structural claims from layout features."""
claims = []
layout = layout_entry.get('layout', {})
main_page = layout.get('main_page', {})
if not main_page.get('success'):
return claims
timestamp = datetime.now(timezone.utc).isoformat()
html_file = main_page.get('html_file', '')
# page_title (TIER 1, confidence 1.0)
if main_page.get('page_title'):
claims.append({
'claim_type': 'page_title',
'claim_value': main_page['page_title'],
'extraction_method': 'docling_structural',
'confidence': 1.0,
'tier': 1,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# page_count (TIER 1, confidence 1.0)
page_count = layout.get('page_count', 0)
if page_count > 0:
claims.append({
'claim_type': 'page_count',
'claim_value': str(page_count),
'extraction_method': 'docling_structural',
'confidence': 1.0,
'tier': 1,
'extraction_timestamp': timestamp,
})
# image_count (TIER 1, confidence 1.0)
image_count = main_page.get('image_count', 0)
if image_count > 0:
claims.append({
'claim_type': 'image_count',
'claim_value': str(image_count),
'extraction_method': 'docling_structural',
'confidence': 1.0,
'tier': 1,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# table_count (TIER 1, confidence 1.0)
table_count = main_page.get('table_count', 0)
if table_count > 0:
claims.append({
'claim_type': 'table_count',
'claim_value': str(table_count),
'extraction_method': 'docling_structural',
'confidence': 1.0,
'tier': 1,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# markdown_length (TIER 1, confidence 1.0)
markdown_length = main_page.get('markdown_length', 0)
if markdown_length > 0:
claims.append({
'claim_type': 'markdown_length',
'claim_value': str(markdown_length),
'extraction_method': 'docling_structural',
'confidence': 1.0,
'tier': 1,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# main_h1 (TIER 2, confidence 0.9)
headers = main_page.get('headers', {})
h1_list = headers.get('h1', [])
if h1_list:
claims.append({
'claim_type': 'main_h1',
'claim_value': h1_list[0],
'extraction_method': 'docling_pattern',
'confidence': 0.9,
'tier': 2,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# has_contact_section (TIER 2, confidence 0.8)
if main_page.get('has_contact_indicators'):
claims.append({
'claim_type': 'has_contact_section',
'claim_value': 'true',
'extraction_method': 'docling_pattern',
'confidence': 0.8,
'tier': 2,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
# has_footer (TIER 2, confidence 0.8)
if main_page.get('has_footer_indicators'):
claims.append({
'claim_type': 'has_footer',
'claim_value': 'true',
'extraction_method': 'docling_pattern',
'confidence': 0.8,
'tier': 2,
'html_file': html_file,
'extraction_timestamp': timestamp,
})
return claims
def merge_claims(self, existing_claims: list, new_claims: list) -> tuple[list, int]:
"""Merge new claims with existing, avoiding duplicates by claim_type."""
# Get existing claim types
existing_types = {c.get('claim_type') for c in existing_claims}
# Add new claims that don't conflict
merged = list(existing_claims)
added = 0
for claim in new_claims:
claim_type = claim.get('claim_type')
# Only add if type doesn't exist OR if new claim is higher tier
if claim_type not in existing_types:
merged.append(claim)
added += 1
return merged, added
def process_entry(self, entry_path: Path) -> bool:
"""Process a single entry file."""
entry_id = self.get_entry_id(entry_path)
# Get layout data for this entry
if not self.layout_data:
return False
entries = self.layout_data.get('entries', {})
layout_entry = entries.get(entry_id) if entries else None
if not layout_entry:
self.stats['entries_skipped'] += 1
return False
# Load entry
try:
with open(entry_path, 'r', encoding='utf-8') as f:
entry = yaml.safe_load(f)
except Exception as e:
logger.error(f"Error loading {entry_path}: {e}")
return False
if not entry:
return False
# Check if already integrated
web_claims = entry.get('web_claims', {})
if web_claims.get('layout_integrated'):
logger.debug(f"Skipping {entry_id} - layout already integrated")
self.stats['entries_skipped'] += 1
return False
# Create structural claims
new_claims = self.create_structural_claims(layout_entry)
if not new_claims:
self.stats['entries_skipped'] += 1
return False
# Merge with existing claims
existing_claims = web_claims.get('claims', [])
merged_claims, added = self.merge_claims(existing_claims, new_claims)
# Update entry
if 'web_claims' not in entry:
entry['web_claims'] = {}
entry['web_claims']['claims'] = merged_claims
entry['web_claims']['layout_integrated'] = True
entry['web_claims']['layout_integration_timestamp'] = datetime.now(timezone.utc).isoformat()
# Add layout metadata
layout = layout_entry.get('layout', {})
entry['web_claims']['layout_metadata'] = {
'page_count': layout.get('page_count', 0),
'archive_path': layout.get('archive_path'),
'extraction_timestamp': layout.get('extraction_timestamp'),
}
self.stats['entries_processed'] += 1
self.stats['entries_updated'] += 1
self.stats['claims_added'] += added
# Write if not dry run
if not self.dry_run:
try:
with open(entry_path, 'w', encoding='utf-8') as f:
yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
except Exception as e:
logger.error(f"Error writing {entry_path}: {e}")
return False
return True
def run(self, entry_filter: str | None = None):
"""Run integration on all entries."""
# Load layout features
self.load_layout_features()
# Find entry files
files = self.find_entry_files()
if entry_filter:
files = [f for f in files if entry_filter in f.name]
logger.info(f"Processing {len(files)} entry files")
for i, path in enumerate(files):
self.process_entry(path)
if (i + 1) % 100 == 0:
logger.info(f"Processed {i + 1}/{len(files)} entries...")
self.report()
def report(self):
"""Print integration report."""
print("\n" + "=" * 60)
print("LAYOUT INTEGRATION REPORT")
print("=" * 60)
mode = "DRY RUN" if self.dry_run else "INTEGRATION"
print(f"\nMode: {mode}")
print(f"\nEntries:")
print(f" - Processed: {self.stats['entries_processed']}")
print(f" - Updated: {self.stats['entries_updated']}")
print(f" - Skipped: {self.stats['entries_skipped']}")
print(f"\nClaims added: {self.stats['claims_added']}")
def main():
    """CLI entry point: parse arguments, validate paths, run integration."""
    parser = argparse.ArgumentParser(
        description='Integrate layout features into entry files')
    parser.add_argument('--entries-dir', type=Path,
                        default=Path('data/nde/enriched/entries'),
                        help='Path to entries directory')
    parser.add_argument('--layout-file', type=Path,
                        default=Path('data/nde/layout_features_full.yaml'),
                        help='Path to layout features file')
    parser.add_argument('--entry', type=str,
                        help='Filter to specific entry ID (e.g., 0001)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Analyze without writing changes')
    args = parser.parse_args()

    # Fail fast on missing inputs before touching any entry file.
    for path, label in ((args.entries_dir, 'Entries directory'),
                        (args.layout_file, 'Layout file')):
        if not path.exists():
            logger.error(f"{label} not found: {path}")
            sys.exit(1)

    integrator = LayoutIntegrator(args.entries_dir, args.layout_file,
                                  dry_run=args.dry_run)
    integrator.run(entry_filter=args.entry)


if __name__ == '__main__':
    main()