glam/scripts/analyze_layout_patterns.py
2025-12-14 17:09:55 +01:00

621 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Analyze layout patterns across archived heritage institution websites.
This script examines the relationship between:
1. DOM structure patterns (XPath locations)
2. String content patterns found at those locations
3. Entity types extracted from each location
The goal is to identify common layout patterns across Dutch heritage websites
and map them to the entity extraction patterns in dutch_web_patterns.yaml.
Usage:
python scripts/analyze_layout_patterns.py [--limit N] [--output FILE]
python scripts/analyze_layout_patterns.py --limit 100 --output reports/layout_analysis.md
"""
import argparse
import os
import re
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional

import yaml
@dataclass
class LayoutCategoryStats:
    """Aggregated stats for a layout category.

    Mutable accumulator filled in by analyze_all_archives(); one instance
    per DOM-location category (e.g. 'nav', 'footer', 'meta_title').
    """
    # Number of layout claims seen in this category
    count: int = 0
    # Distinct website domains that contributed to this category
    websites: set[str] = field(default_factory=set)
    # Simplified XPath pattern -> occurrence count
    xpath_patterns: dict[str, int] = field(default_factory=dict)
    # String-pattern name -> occurrence count
    string_patterns: dict[str, int] = field(default_factory=dict)
    # Example claims (website/text/patterns dicts); caller caps the length
    samples: list[dict] = field(default_factory=list)
@dataclass
class EntityTypeStats:
    """Aggregated stats for an entity type.

    Mutable accumulator filled in by analyze_all_archives(); one instance
    per extracted entity type (hyponym/hypernym label).
    """
    # Number of entity claims of this type
    count: int = 0
    # Distinct website domains where this entity type was found
    websites: set[str] = field(default_factory=set)
    # DOM-location category -> occurrence count
    categories: dict[str, int] = field(default_factory=dict)
    # Simplified XPath pattern -> occurrence count
    xpath_patterns: dict[str, int] = field(default_factory=dict)
    # Example claims (website/text/xpath dicts); caller caps the length
    samples: list[dict] = field(default_factory=list)
def simplify_xpath(xpath: str) -> str:
    """
    Simplify an XPath to a generic pattern for comparison.

    /html/body/div[4]/section/div/div/div[1]/div/h1 -> body/*/h1
    /html/head/title -> head/title
    /html/body/div[2]/div/div[2]/div/nav -> body/*/nav

    Attribute steps (e.g. @content) and text() steps are dropped, and runs
    of generic containers (div/span/section/article/main/aside) collapse
    into a single '*'.

    Args:
        xpath: Absolute XPath string; may be empty.

    Returns:
        The simplified pattern, or '' for a falsy input.
    """
    if not xpath:
        return ""
    # Remove the /html prefix
    xpath = re.sub(r'^/html/?', '', xpath)
    # Remove positional predicates but keep element names: div[4] -> div
    xpath = re.sub(r'\[\d+\]', '', xpath)
    simplified: list[str] = []
    prev_generic = False
    for part in xpath.split('/'):
        if not part:
            continue
        # Drop attribute and text() steps. NOTE: after split('/') a part
        # contains no '/', so the original patterns r'/@.*$' and
        # r'/text\(\)$' could never match -- that was a bug which let
        # '@content' / 'text()' leak into the output.
        if part.startswith('@') or part == 'text()':
            continue
        # Collapse consecutive generic container elements into one '*'
        if part in ('div', 'span', 'section', 'article', 'main', 'aside'):
            if not prev_generic:
                simplified.append('*')
                prev_generic = True
        else:
            simplified.append(part)
            prev_generic = False
    return '/'.join(simplified)
def get_xpath_category(xpath: Optional[str]) -> str:
    """
    Categorize an XPath into a semantic page region.

    Args:
        xpath: Absolute XPath string, or None/''.

    Returns:
        One of: meta_title, meta_tag, meta_other, nav, header, footer,
        main_heading, sub_heading, contact, main_content, paragraph,
        list, sidebar, other, unknown.
    """
    if not xpath:
        return 'unknown'
    xpath_lower = xpath.lower()
    simplified = simplify_xpath(xpath)
    # Meta information (document head)
    if 'head/' in simplified or simplified.startswith('head'):
        if 'title' in simplified:
            return 'meta_title'
        if 'meta' in simplified:
            return 'meta_tag'
        return 'meta_other'
    # Navigation
    if '/nav' in simplified or '/menu' in simplified:
        return 'nav'
    if 'header' in simplified:
        return 'header'
    # Footer
    if 'footer' in simplified:
        return 'footer'
    # Main headings
    if simplified.endswith('/h1'):
        return 'main_heading'
    if re.search(r'/h[2-6]$', simplified):
        return 'sub_heading'
    # Contact/address sections (detected on the raw path string)
    if 'contact' in xpath_lower or 'address' in xpath_lower:
        return 'contact'
    # Main content areas. Checked on the RAW xpath: simplify_xpath()
    # collapses main/article/aside into '*', so the original checks on
    # `simplified` were unreachable dead branches.
    if re.search(r'/(main|article)\b', xpath_lower):
        return 'main_content'
    if '/p' in simplified or simplified.endswith('/p'):
        return 'paragraph'
    # Lists
    if '/ul' in simplified or '/ol' in simplified or '/li' in simplified:
        return 'list'
    # Sidebar (same dead-branch fix for <aside>)
    if re.search(r'/aside\b', xpath_lower) or 'sidebar' in xpath_lower:
        return 'sidebar'
    return 'other'
def detect_string_pattern(text: str) -> list[str]:
    """Detect which known string patterns match the given text.

    Case-insensitive rules are applied to the stripped, lower-cased text;
    case-sensitive rules (postal codes, street names, names, times, email)
    are applied to the raw text.

    Returns:
        List of matching pattern names, or ['unknown'] if nothing matched.
    """
    lowered = text.strip().lower()
    # (pattern name, regex, apply to lower-cased text?)
    rules = (
        # Organization patterns
        ('org:foundation_or_association', r'\b(stichting|vereniging|genootschap|kring)\b', True),
        ('heritage:institution_type', r'\b(museum|archief|bibliotheek)\b', True),
        ('gov:municipality', r'\bgemeente\s+\w+', True),
        # Location patterns (Dutch postal code, street suffixes)
        ('loc:postal_code_nl', r'\b\d{4}\s*[A-Z]{2}\b', False),
        ('loc:street_address', r'\b[A-Z][a-z]+weg\b|\b[A-Z][a-z]+straat\b|\b[A-Z][a-z]+laan\b', False),
        # Contact patterns
        ('contact:email', r'\b[\w.-]+@[\w.-]+\.\w+\b', False),
        ('contact:phone', r'\b(?:tel|telefoon|phone)[:.]?\s*[\d\s-]+', True),
        # Navigation patterns
        ('nav:menu_item', r'\b(home|contact|over ons|collectie|bezoek|programma)\b', True),
        # Opening hours (Dutch weekday names, time ranges)
        ('info:opening_hours', r'\b(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\b', True),
        ('info:time_range', r'\b\d{1,2}[:\.]\d{2}\s*[-]\s*\d{1,2}[:\.]\d{2}\b', False),
        # Person names (simplified Dutch-name heuristic)
        ('person:dutch_name', r'\b[A-Z][a-z]+\s+(?:van\s+(?:de\s+|het\s+)?)?[A-Z][a-z]+\b', False),
    )
    hits = [name for name, regex, use_lower in rules
            if re.search(regex, lowered if use_lower else text)]
    return hits if hits else ['unknown']
def analyze_annotation_file(annotation_path: Path) -> Optional[dict]:
    """
    Analyze a single annotation file and extract layout/entity patterns.

    Args:
        annotation_path: Path to an annotations YAML file.

    Returns:
        dict with:
        - website: domain name (scheme and leading 'www.' stripped)
        - source_url: original URL from the file
        - layouts: list of dicts (xpath, xpath_simplified, category,
          text_sample, patterns)
        - entities: list of dicts (entity_type, xpath, xpath_simplified,
          category, text, patterns)
        Returns None if the file cannot be parsed or has no session data.
    """
    try:
        with open(annotation_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception:
        # Best-effort batch scan: unreadable or malformed files are skipped.
        return None
    if not data or 'session' not in data:
        return None
    source_url = data.get('source_url') or ''
    # Reduce the URL to a bare domain: strip scheme and leading "www.".
    domain = re.sub(r'^https?://(?:www\.)?', '', source_url).split('/')[0] if source_url else 'unknown'
    result = {
        'website': domain,
        'source_url': source_url,
        'layouts': [],
        'entities': [],
    }
    # `or {}` / `or []` guards: explicit nulls in the YAML (e.g.
    # "claims:" with no value) would otherwise crash the .get() chains.
    claims = (data.get('session') or {}).get('claims') or {}
    # Process layout claims
    for layout in claims.get('layout') or []:
        xpath = layout.get('xpath') or (layout.get('provenance') or {}).get('path', '')
        text = (layout.get('text_content') or '')[:200]  # truncate long text
        result['layouts'].append({
            'xpath': xpath,
            'xpath_simplified': simplify_xpath(xpath),
            'category': get_xpath_category(xpath),
            'text_sample': text,
            'patterns': detect_string_pattern(text),
        })
    # Process entity claims
    for entity in claims.get('entity') or []:
        xpath = (entity.get('provenance') or {}).get('path', '')
        text = (entity.get('text_content') or '')[:200]
        # Prefer the specific label; fall back through null values too.
        entity_type = entity.get('hyponym') or entity.get('hypernym') or 'UNKNOWN'
        result['entities'].append({
            'entity_type': entity_type,
            'xpath': xpath,
            'xpath_simplified': simplify_xpath(xpath),
            'category': get_xpath_category(xpath),
            'text': text,
            'patterns': detect_string_pattern(text),
        })
    return result
def analyze_all_archives(web_dir: Path, limit: Optional[int] = None) -> dict:
    """
    Analyze all annotation files in the web archive directory.

    Args:
        web_dir: Root directory; files are found via the glob
            '*/*/annotations_v1.7.0.yaml'.
        limit: Optional cap on the number of files processed.

    Returns:
        dict with keys 'summary', 'layout_by_category', 'entity_by_type',
        'xpath_to_entities', 'pattern_cooccurrence' -- all plain
        dict/set/list structures suitable for generate_report().
    """
    # Find all annotation files
    annotation_files = list(web_dir.glob('*/*/annotations_v1.7.0.yaml'))
    if limit:
        annotation_files = annotation_files[:limit]
    print(f"Found {len(annotation_files)} annotation files to analyze")

    def _bump(counter: dict, key: str) -> None:
        # Increment a plain-dict counter (dataclass fields are plain dicts).
        counter[key] = counter.get(key, 0) + 1

    # Aggregation structures using typed dataclasses
    layout_by_category: dict[str, LayoutCategoryStats] = {}
    entity_by_type: dict[str, EntityTypeStats] = {}
    # simplified xpath -> {entity type -> count}; defaultdicts replace the
    # original's manual "if key not in ..." initialization boilerplate.
    xpath_to_entities: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    # entity type -> {string pattern -> count}
    pattern_cooccurrence: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    websites_analyzed: set[str] = set()
    total_layouts = 0
    total_entities = 0
    for i, ann_file in enumerate(annotation_files):
        if i % 100 == 0:
            print(f" Processing {i}/{len(annotation_files)}...")
        result = analyze_annotation_file(ann_file)
        if not result:
            continue
        website = result['website']
        websites_analyzed.add(website)
        # Aggregate layouts
        for layout in result['layouts']:
            stats = layout_by_category.setdefault(layout['category'], LayoutCategoryStats())
            stats.count += 1
            stats.websites.add(website)
            _bump(stats.xpath_patterns, layout['xpath_simplified'])
            for pattern in layout['patterns']:
                _bump(stats.string_patterns, pattern)
            if len(stats.samples) < 20:  # keep a bounded sample per category
                stats.samples.append({
                    'website': website,
                    'text': layout['text_sample'][:100],
                    'patterns': layout['patterns'],
                })
            total_layouts += 1
        # Aggregate entities
        for entity in result['entities']:
            etype = entity['entity_type']
            stats = entity_by_type.setdefault(etype, EntityTypeStats())
            stats.count += 1
            stats.websites.add(website)
            _bump(stats.categories, entity['category'])
            xpath_key = entity['xpath_simplified']
            _bump(stats.xpath_patterns, xpath_key)
            # Track xpath -> entity type mapping
            xpath_to_entities[xpath_key][etype] += 1
            # Track pattern co-occurrence
            for pattern in entity['patterns']:
                pattern_cooccurrence[etype][pattern] += 1
            if len(stats.samples) < 10:  # keep a bounded sample per type
                stats.samples.append({
                    'website': website,
                    'text': entity['text'][:100],
                    'xpath': entity['xpath_simplified'],
                })
            total_entities += 1
    # Convert dataclasses to dicts for report generation
    layout_dict = {
        cat: {
            'count': s.count,
            'websites': s.websites,
            'xpath_patterns': s.xpath_patterns,
            'string_patterns': s.string_patterns,
            'samples': s.samples,
        }
        for cat, s in layout_by_category.items()
    }
    entity_dict = {
        etype: {
            'count': s.count,
            'websites': s.websites,
            'categories': s.categories,
            'xpath_patterns': s.xpath_patterns,
            'samples': s.samples,
        }
        for etype, s in entity_by_type.items()
    }
    return {
        'summary': {
            'annotation_files': len(annotation_files),
            'websites_analyzed': len(websites_analyzed),
            'total_layouts': total_layouts,
            'total_entities': total_entities,
        },
        'layout_by_category': layout_dict,
        'entity_by_type': entity_dict,
        # Flatten the defaultdicts back to plain dicts for a clean contract.
        'xpath_to_entities': {k: dict(v) for k, v in xpath_to_entities.items()},
        'pattern_cooccurrence': {k: dict(v) for k, v in pattern_cooccurrence.items()},
    }
def _top(counter: dict, k: int) -> list:
    """Return the k highest-count (key, count) pairs, descending by count."""
    return sorted(counter.items(), key=lambda kv: kv[1], reverse=True)[:k]


def generate_report(analysis: dict, output_path: Optional[Path] = None) -> str:
    """Generate a markdown report from the analysis results.

    Args:
        analysis: Aggregated stats as returned by analyze_all_archives().
        output_path: Optional file to write the report to; parent
            directories are created as needed.

    Returns:
        The full report as a markdown string.
    """
    lines = []
    lines.append("# Web Archive Layout Pattern Analysis")
    lines.append(f"\nGenerated: {datetime.now().isoformat()}")
    lines.append("")
    # Summary
    summary = analysis['summary']
    lines.append("## Summary")
    lines.append("")
    lines.append(f"- **Annotation files analyzed**: {summary['annotation_files']}")
    lines.append(f"- **Unique websites**: {summary['websites_analyzed']}")
    lines.append(f"- **Total layout claims**: {summary['total_layouts']}")
    lines.append(f"- **Total entity claims**: {summary['total_entities']}")
    lines.append("")
    # Layout categories
    lines.append("## Layout Categories")
    lines.append("")
    lines.append("Distribution of content by DOM location category:")
    lines.append("")
    layout_data = analysis['layout_by_category']
    sorted_categories = sorted(layout_data.items(), key=lambda x: x[1]['count'], reverse=True)
    for category, data in sorted_categories:
        lines.append(f"### {category}")
        lines.append(f"- **Occurrences**: {data['count']}")
        lines.append(f"- **Websites**: {len(data['websites'])}")
        lines.append("")
        lines.append("**Top XPath patterns:**")
        for xpath, count in _top(data['xpath_patterns'], 5):
            lines.append(f"- `{xpath}` ({count})")
        lines.append("")
        lines.append("**String patterns found:**")
        for pattern, count in _top(data['string_patterns'], 5):
            lines.append(f"- `{pattern}` ({count})")
        lines.append("")
        if data['samples']:
            lines.append("**Samples:**")
            for sample in data['samples'][:3]:
                text = sample['text'].replace('\n', ' ')[:80]
                lines.append(f"- \"{text}...\" ({sample['website']})")
            lines.append("")
    # Entity types
    lines.append("## Entity Type Distribution")
    lines.append("")
    entity_data = analysis['entity_by_type']
    sorted_entities = sorted(entity_data.items(), key=lambda x: x[1]['count'], reverse=True)
    lines.append("| Entity Type | Count | Websites | Primary Location |")
    lines.append("|-------------|-------|----------|------------------|")
    for etype, data in sorted_entities[:20]:
        top_category = max(data['categories'].items(), key=lambda x: x[1])[0] if data['categories'] else '-'
        lines.append(f"| {etype} | {data['count']} | {len(data['websites'])} | {top_category} |")
    lines.append("")
    # Entity type details
    lines.append("### Entity Type Details")
    lines.append("")
    for etype, data in sorted_entities[:10]:
        lines.append(f"#### {etype}")
        lines.append(f"- **Total occurrences**: {data['count']}")
        lines.append(f"- **Unique websites**: {len(data['websites'])}")
        lines.append("")
        lines.append("**Found in DOM categories:**")
        for cat, count in _top(data['categories'], 5):
            pct = (count / data['count']) * 100
            lines.append(f"- {cat}: {count} ({pct:.1f}%)")
        lines.append("")
        lines.append("**Common XPath patterns:**")
        for xpath, count in _top(data['xpath_patterns'], 5):
            lines.append(f"- `{xpath}` ({count})")
        lines.append("")
        if data['samples']:
            lines.append("**Samples:**")
            for sample in data['samples'][:3]:
                lines.append(f"- \"{sample['text']}\" @ `{sample['xpath']}`")
            lines.append("")
    # XPath to Entity mapping
    lines.append("## XPath → Entity Type Mapping")
    lines.append("")
    lines.append("Which entity types are typically found at which DOM locations:")
    lines.append("")
    xpath_entities = analysis['xpath_to_entities']
    # Sort by total entity count across all types at that location
    sorted_xpaths = sorted(
        xpath_entities.items(),
        key=lambda x: sum(x[1].values()),
        reverse=True
    )[:20]
    for xpath, entities in sorted_xpaths:
        total = sum(entities.values())
        lines.append(f"### `{xpath}` ({total} entities)")
        for etype, count in _top(entities, 5):
            pct = (count / total) * 100
            lines.append(f"- {etype}: {count} ({pct:.1f}%)")
        lines.append("")
    # Pattern co-occurrence
    lines.append("## String Pattern → Entity Type Correlation")
    lines.append("")
    lines.append("Which string patterns are most associated with which entity types:")
    lines.append("")
    cooccurrence = analysis['pattern_cooccurrence']
    # Build reverse mapping: pattern -> {entity type -> count}
    pattern_to_entities = defaultdict(dict)
    for etype, patterns in cooccurrence.items():
        for pattern, count in patterns.items():
            pattern_to_entities[pattern][etype] = count
    sorted_patterns = sorted(
        pattern_to_entities.items(),
        key=lambda x: sum(x[1].values()),
        reverse=True
    )[:15]
    for pattern, entities in sorted_patterns:
        total = sum(entities.values())
        lines.append(f"### `{pattern}` ({total} occurrences)")
        for etype, count in _top(entities, 5):
            pct = (count / total) * 100
            lines.append(f"- {etype}: {count} ({pct:.1f}%)")
        lines.append("")
    # Recommendations for pattern file
    lines.append("## Recommendations for dutch_web_patterns.yaml")
    lines.append("")
    lines.append("Based on the analysis, the following layout-aware patterns could be added:")
    lines.append("")
    lines.append("### High-Value XPath Targets")
    lines.append("")
    lines.append("1. **`head/title`** - Almost always contains institution name")
    lines.append("2. **`body/*/h1`** - Primary heading, usually institution name")
    lines.append("3. **`body/*/nav`** - Navigation menu (discard patterns)")
    lines.append("4. **`body/*/footer`** - Contact info, address, social links")
    lines.append("")
    lines.append("### Suggested Pattern Additions")
    lines.append("")
    # NOTE(review): the YAML snippet below uses single-space indents, which
    # looks like collapsed whitespace from a paste -- confirm the intended
    # nesting before relying on it as copy-paste-able YAML.
    lines.append("```yaml")
    lines.append("# Add xpath_hint to patterns for better precision")
    lines.append("entity_patterns:")
    lines.append(" organizations:")
    lines.append(" heritage_institutions:")
    lines.append(" patterns:")
    lines.append(" - pattern: '^(het|de)\\s+(\\w+)\\s*(museum|archief|bibliotheek)$'")
    lines.append(" xpath_hints:")
    lines.append(" - 'head/title'")
    lines.append(" - 'body/*/h1'")
    lines.append(" confidence_boost: 0.2 # Higher confidence when found at expected location")
    lines.append("```")
    lines.append("")
    report = '\n'.join(lines)
    if output_path:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"Report written to {output_path}")
    return report
def main() -> int:
    """CLI entry point: parse arguments, run the analysis, emit the report.

    Returns:
        Process exit code: 0 on success, 1 if the web directory is missing.
    """
    parser = argparse.ArgumentParser(description='Analyze layout patterns in web archives')
    parser.add_argument('--limit', type=int, help='Limit number of files to analyze')
    parser.add_argument('--output', '-o', type=str, help='Output file path for report')
    # NOTE(review): user-specific absolute default path; consider deriving
    # it from an environment variable or a path relative to the repo root.
    parser.add_argument('--web-dir', type=str,
                        default='/Users/kempersc/apps/glam/data/custodian/web',
                        help='Path to web archive directory')
    args = parser.parse_args()
    web_dir = Path(args.web_dir)
    if not web_dir.exists():
        print(f"Error: Web directory not found: {web_dir}")
        return 1
    print(f"Analyzing web archives in {web_dir}")
    analysis = analyze_all_archives(web_dir, limit=args.limit)
    output_path = Path(args.output) if args.output else None
    report = generate_report(analysis, output_path)
    if not output_path:
        # No output file requested: dump the report to stdout instead.
        print(report)
    return 0


if __name__ == '__main__':
    # sys.exit, not the site-injected exit() builtin: exit() is meant for
    # interactive sessions and is not guaranteed to exist (e.g. under -S).
    sys.exit(main())