glam/scripts/extract_website_headers.py

#!/usr/bin/env python3
"""
Extract and analyze headers (h1-h6, nav items) from all archived websites.

This script:
1. Extracts all headers from archived HTML files
2. Identifies common patterns across heritage institution websites
3. Generates comparison sets for quality control
4. Helps identify which headers are meaningful vs. generic UI

Usage:
    python scripts/extract_website_headers.py [--limit N] [--output FILE]
"""

import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Optional, Any

try:
    from lxml import etree
    HAS_LXML = True
except ImportError:
    HAS_LXML = False
    print("Error: lxml is required. Install with: pip install lxml")

import yaml


ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'


def extract_headers_from_html(html_content: str) -> Dict[str, List[str]]:
    """Extract all headers from HTML content."""
    headers = {
        'h1': [],
        'h2': [],
        'h3': [],
        'h4': [],
        'h5': [],
        'h6': [],
        'nav_items': [],
        'title': None,
        'meta_description': None,
        'og_site_name': None,
    }

    try:
        tree = etree.HTML(html_content)

        # Extract h1-h6 headers
        for level in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            elements = tree.xpath(f'//{level}')
            for el in elements:
                text = ''.join(el.itertext()).strip()
                if text and len(text) > 1 and len(text) < 500:
                    headers[level].append(text)

        # Extract title
        titles = tree.xpath('//title/text()')
        if titles:
            headers['title'] = titles[0].strip()

        # Extract meta description
        meta_desc = tree.xpath('//meta[@name="description"]/@content')
        if meta_desc:
            headers['meta_description'] = meta_desc[0].strip()

        # Extract og:site_name
        og_name = tree.xpath('//meta[@property="og:site_name"]/@content')
        if og_name:
            headers['og_site_name'] = og_name[0].strip()

        # Extract navigation items
        nav_links = tree.xpath('//nav//a/text() | //nav//span/text() | //*[@class[contains(., "nav")]]//a/text()')
        for text in nav_links:
            text = text.strip()
            if text and len(text) > 1 and len(text) < 100:
                headers['nav_items'].append(text)

    except Exception as e:
        print(f"  Warning: Failed to parse HTML: {e}")

    return headers


def find_html_file(archive_path: Path) -> Optional[Path]:
    """Find the main HTML file in an archive directory."""
    # Priority order
    candidates = [
        archive_path / 'rendered.html',
        archive_path / 'pages' / 'index.html',
    ]

    for candidate in candidates:
        if candidate.exists():
            return candidate

    # Fallback: find first HTML in pages/
    pages_dir = archive_path / 'pages'
    if pages_dir.exists():
        html_files = list(pages_dir.glob('*.html'))
        if html_files:
            return html_files[0]

    return None


def process_entry(entry_path: Path) -> Optional[Dict]:
    """Process a single entry and extract headers from its website."""
    with open(entry_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return None

    # Get entry number from filename
    entry_num = entry_path.name.split('_')[0]

    # Find web archive directory
    web_enrichment = data.get('web_enrichment', {})
    web_archives = web_enrichment.get('web_archives', [])

    archive_path = None
    if web_archives:
        archive = web_archives[0]
        directory = archive.get('directory')
        if directory:
            archive_path = ENTRIES_DIR / directory

    if not archive_path:
        # Fallback: check web/{entry_num}/
        entry_web_dir = WEB_DIR / entry_num
        if entry_web_dir.exists():
            subdirs = [d for d in entry_web_dir.iterdir() if d.is_dir()]
            if subdirs:
                archive_path = subdirs[0]

    if not archive_path or not archive_path.exists():
        return None

    # Find and read HTML file
    html_file = find_html_file(archive_path)
    if not html_file:
        return None

    try:
        with open(html_file, 'r', encoding='utf-8', errors='replace') as f:
            html_content = f.read()
    except Exception as e:
        print(f"  Warning: Failed to read {html_file}: {e}")
        return None

    # Extract headers
    headers = extract_headers_from_html(html_content)

    # Get custodian name for comparison
    custodian_name = data.get('custodian_name', {}).get('claim_value', '')
    google_name = data.get('google_maps_enrichment', {}).get('name', '')
    wikidata_name = data.get('wikidata_enrichment', {}).get('wikidata_label_nl', '')

    return {
        'entry_file': entry_path.name,
        'entry_num': entry_num,
        'custodian_name': custodian_name,
        'google_name': google_name,
        'wikidata_name': wikidata_name,
        'website_url': data.get('wikidata_enrichment', {}).get('wikidata_official_website', ''),
        'headers': headers,
    }


def analyze_patterns(results: List[Dict]) -> Dict:
    """Analyze common patterns across all websites."""

    # Collect all header values
    all_h1 = defaultdict(int)
    all_h2 = defaultdict(int)
    all_nav = defaultdict(int)
    all_titles = defaultdict(int)

    # Compare headers with known names
    h1_matches_name = 0
    title_matches_name = 0
    og_matches_name = 0
    total_with_headers = 0

    for result in results:
        headers = result.get('headers', {})
        known_names = {
            result.get('custodian_name', '').lower(),
            result.get('google_name', '').lower(),
            result.get('wikidata_name', '').lower(),
        }
        known_names.discard('')

        # Count header occurrences
        for h1 in headers.get('h1', []):
            all_h1[h1] += 1

        for h2 in headers.get('h2', []):
            all_h2[h2] += 1

        for nav in headers.get('nav_items', []):
            all_nav[nav] += 1

        title = headers.get('title')
        if title:
            all_titles[title] += 1

        # Check if headers match known names
        if headers.get('h1'):
            total_with_headers += 1
            h1_text = headers['h1'][0].lower() if headers['h1'] else ''
            if any(name in h1_text or h1_text in name for name in known_names if name):
                h1_matches_name += 1

        if title:
            title_lower = title.lower()
            if any(name in title_lower or title_lower in name for name in known_names if name):
                title_matches_name += 1

        og_name = headers.get('og_site_name', '')
        if og_name:
            og_lower = og_name.lower()
            if any(name in og_lower or og_lower in name for name in known_names if name):
                og_matches_name += 1

    return {
        'total_entries': len(results),
        'entries_with_headers': total_with_headers,
        'h1_name_match_rate': h1_matches_name / total_with_headers if total_with_headers else 0,
        'title_name_match_rate': title_matches_name / len(results) if results else 0,
        'og_name_match_rate': og_matches_name / len(results) if results else 0,
        'common_h1_values': sorted(all_h1.items(), key=lambda x: -x[1])[:50],
        'common_h2_values': sorted(all_h2.items(), key=lambda x: -x[1])[:50],
        'common_nav_items': sorted(all_nav.items(), key=lambda x: -x[1])[:50],
        'common_titles': sorted(all_titles.items(), key=lambda x: -x[1])[:30],
    }


def main():
    parser = argparse.ArgumentParser(description='Extract and analyze website headers')
    parser.add_argument('--limit', type=int, default=None, help='Limit entries to process')
    parser.add_argument('--output', type=str, default=None, help='Output JSON file')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output')
    args = parser.parse_args()

    if not HAS_LXML:
        return 1

    # Find entry files
    files = sorted([f for f in ENTRIES_DIR.glob('*.yaml') if f.is_file() and not f.name.startswith('.')])

    if args.limit:
        files = files[:args.limit]

    print(f"Processing {len(files)} entries...")

    results = []
    processed = 0

    for filepath in files:
        result = process_entry(filepath)
        if result:
            results.append(result)
            processed += 1
            if args.verbose and processed % 100 == 0:
                print(f"  Processed {processed} entries with headers")

    print(f"\nExtracted headers from {len(results)} entries")

    # Analyze patterns
    analysis = analyze_patterns(results)

    # Print summary
    print("\n" + "=" * 60)
    print("HEADER ANALYSIS SUMMARY")
    print("=" * 60)
    print(f"Total entries processed: {analysis['total_entries']}")
    print(f"Entries with H1 headers: {analysis['entries_with_headers']}")
    print(f"H1 matches known name: {analysis['h1_name_match_rate']:.1%}")
    print(f"Title matches known name: {analysis['title_name_match_rate']:.1%}")
    print(f"OG:site_name matches known name: {analysis['og_name_match_rate']:.1%}")

    print("\n" + "-" * 40)
    print("MOST COMMON H1 VALUES (likely generic):")
    for value, count in analysis['common_h1_values'][:20]:
        if count > 1:
            print(f"  {count:4d}x  {value[:60]}")

    print("\n" + "-" * 40)
    print("MOST COMMON NAV ITEMS:")
    for value, count in analysis['common_nav_items'][:20]:
        if count > 5:
            print(f"  {count:4d}x  {value[:40]}")

    # Save output
    if args.output:
        output_data = {
            'analysis': analysis,
            'entries': results,
        }
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)
        print(f"\nSaved detailed output to {args.output}")

    return 0


if __name__ == '__main__':
    sys.exit(main())