#!/usr/bin/env python3
"""
Analyze layout patterns across archived heritage institution websites.

This script examines the relationship between:
1. DOM structure patterns (XPath locations)
2. String content patterns found at those locations
3. Entity types extracted from each location

The goal is to identify common layout patterns across Dutch heritage websites
and map them to the entity extraction patterns in dutch_web_patterns.yaml.

Usage:
    python scripts/analyze_layout_patterns.py [--limit N] [--output FILE] [--web-dir DIR]
    python scripts/analyze_layout_patterns.py --limit 100 --output reports/layout_analysis.md
"""

import argparse
import os
import re
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional

try:
    import yaml
except ImportError:
    # Only analyze_annotation_file() needs PyYAML; it raises a clear error when
    # the package is missing, while the pure helpers stay importable.
    yaml = None


# Default location of the web archive (overridable with --web-dir).
DEFAULT_WEB_DIR = '/Users/kempersc/apps/glam/data/custodian/web'

# Annotation files are looked up two directory levels below the web dir.
ANNOTATION_GLOB = '*/*/annotations_v1.7.0.yaml'

# Container elements that carry no layout meaning of their own; runs of them
# are collapsed to a single '*' by simplify_xpath().
_GENERIC_CONTAINERS = frozenset(
    {'div', 'span', 'section', 'article', 'main', 'aside'}
)

# (pattern name, compiled regex, match against the lower-cased text?)
# Order matters: detect_string_pattern() reports matches in this order.
_STRING_PATTERNS = (
    # Organization patterns
    ('org:foundation_or_association',
     re.compile(r'\b(stichting|vereniging|genootschap|kring)\b'), True),
    ('heritage:institution_type',
     re.compile(r'\b(museum|archief|bibliotheek)\b'), True),
    ('gov:municipality',
     re.compile(r'\bgemeente\s+\w+'), True),
    # Location patterns
    ('loc:postal_code_nl',
     re.compile(r'\b\d{4}\s*[A-Z]{2}\b'), False),
    ('loc:street_address',
     re.compile(r'\b[A-Z][a-z]+weg\b|\b[A-Z][a-z]+straat\b|\b[A-Z][a-z]+laan\b'),
     False),
    # Contact patterns
    ('contact:email',
     re.compile(r'\b[\w.-]+@[\w.-]+\.\w+\b'), False),
    # Requires at least one digit after the label so that e.g.
    # "telefoon nummer" is not reported as a phone number.
    ('contact:phone',
     re.compile(r'\b(?:tel|telefoon|phone)[:.]?\s*\+?\(?\d[\d\s()-]*'), True),
    # Navigation patterns
    ('nav:menu_item',
     re.compile(r'\b(home|contact|over ons|collectie|bezoek|programma)\b'), True),
    # Opening hours
    ('info:opening_hours',
     re.compile(r'\b(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\b'),
     True),
    ('info:time_range',
     re.compile(r'\b\d{1,2}[:\.]\d{2}\s*[-–]\s*\d{1,2}[:\.]\d{2}\b'), False),
    # Person names (simplified pattern)
    ('person:dutch_name',
     re.compile(r'\b[A-Z][a-z]+\s+(?:van\s+(?:de\s+|het\s+)?)?[A-Z][a-z]+\b'),
     False),
)

# Static closing section of the report.
_RECOMMENDATION_LINES = (
    "## Recommendations for dutch_web_patterns.yaml",
    "",
    "Based on the analysis, the following layout-aware patterns could be added:",
    "",
    "### High-Value XPath Targets",
    "",
    "1. **`head/title`** - Almost always contains institution name",
    "2. **`body/*/h1`** - Primary heading, usually institution name",
    "3. **`body/*/nav`** - Navigation menu (discard patterns)",
    "4. **`body/*/footer`** - Contact info, address, social links",
    "",
    "### Suggested Pattern Additions",
    "",
    "```yaml",
    "# Add xpath_hint to patterns for better precision",
    "entity_patterns:",
    "  organizations:",
    "    heritage_institutions:",
    "      patterns:",
    "        - pattern: '^(het|de)\\s+(\\w+)\\s*(museum|archief|bibliotheek)$'",
    "          xpath_hints:",
    "            - 'head/title'",
    "            - 'body/*/h1'",
    "          confidence_boost: 0.2  # Higher confidence when found at expected location",
    "```",
    "",
)


@dataclass
class LayoutCategoryStats:
    """Aggregated stats for a layout category."""

    # Number of layout claims in this category.
    count: int = 0
    # Domains on which the category was seen.
    websites: set[str] = field(default_factory=set)
    # Simplified XPath -> number of occurrences.
    xpath_patterns: dict[str, int] = field(default_factory=dict)
    # String pattern name -> number of occurrences.
    string_patterns: dict[str, int] = field(default_factory=dict)
    # Example claims (capped by the aggregation code).
    samples: list[dict] = field(default_factory=list)


@dataclass
class EntityTypeStats:
    """Aggregated stats for an entity type."""

    # Number of entity claims of this type.
    count: int = 0
    # Domains on which the entity type was seen.
    websites: set[str] = field(default_factory=set)
    # Layout category -> number of occurrences.
    categories: dict[str, int] = field(default_factory=dict)
    # Simplified XPath -> number of occurrences.
    xpath_patterns: dict[str, int] = field(default_factory=dict)
    # Example claims (capped by the aggregation code).
    samples: list[dict] = field(default_factory=list)


def simplify_xpath(xpath: str) -> str:
    """
    Simplify an XPath to a generic pattern for comparison.

    The leading ``/html`` is removed, positional indices (``[4]``) are
    stripped, trailing ``@attribute`` and ``text()`` steps are dropped, and
    every run of generic container elements (div, span, section, article,
    main, aside) is collapsed into a single ``*``.

    /html/body/div[4]/section/div/div/div[1]/div/h1 -> body/*/h1
    /html/head/title -> head/title
    /html/body/div[2]/div/div[2]/div/nav -> body/*/nav
    /html/head/meta[5]/@content -> head/meta

    Returns an empty string for an empty or missing XPath.
    """
    if not xpath:
        return ""

    # Remove /html prefix
    path = re.sub(r'^/html/?', '', xpath)

    # Remove index notation but keep element names
    # e.g., div[4] -> div, section[1] -> section
    path = re.sub(r'\[\d+\]', '', path)

    simplified = []
    prev_generic = False
    for part in path.split('/'):
        if not part:
            continue
        # Attribute and text() steps select a value, not an element. They must
        # be dropped here, per step: after splitting on '/', a regex anchored
        # on '/@' or '/text()' can never match.
        if part.startswith('@') or part == 'text()':
            continue
        if part in _GENERIC_CONTAINERS:
            # Collapse repeated containers: body/div/div/div/h1 -> body/*/h1
            if not prev_generic:
                simplified.append('*')
            prev_generic = True
        else:
            simplified.append(part)
            prev_generic = False

    return '/'.join(simplified)


def _has_step(path: str, *names: str) -> bool:
    """Return True if *path* has a step whose element name is one of *names*.

    A predicate suffix (everything from the first ``[``) is ignored, so both
    ``p`` and ``p[3]`` count as the element ``p`` while ``pre`` does not.
    """
    wanted = set(names)
    return any(step.split('[', 1)[0] in wanted for step in path.split('/'))


def get_xpath_category(xpath: Optional[str]) -> str:
    """
    Categorize XPath into semantic regions.

    Checks run in a fixed order and the first match wins: document head,
    navigation, header, footer, headings, contact, main content, paragraph,
    list, sidebar.

    Element checks for head, p, ul/ol/li, main, article and aside compare whole
    step names, so ``thead`` is not mistaken for ``head`` and ``pre`` is not
    mistaken for ``p``. main/article/aside are looked up in the raw XPath
    because simplify_xpath() collapses them to ``*``.

    Returns one of: meta_title, meta_tag, meta_other, nav, header, footer,
    main_heading, sub_heading, contact, main_content, paragraph, list,
    sidebar, other, or unknown (empty/missing XPath).
    """
    if not xpath:
        return 'unknown'

    xpath_lower = xpath.lower()
    simplified = simplify_xpath(xpath)

    # Meta information
    if _has_step(simplified, 'head'):
        if 'title' in simplified:
            return 'meta_title'
        if 'meta' in simplified:
            return 'meta_tag'
        return 'meta_other'

    # Navigation
    if '/nav' in simplified or '/menu' in simplified:
        return 'nav'
    if 'header' in simplified:
        return 'header'

    # Footer
    if 'footer' in simplified:
        return 'footer'

    # Main headings
    if simplified.endswith('/h1'):
        return 'main_heading'
    if re.search(r'/h[2-6]$', simplified):
        return 'sub_heading'

    # Contact/address sections
    if 'contact' in xpath_lower or 'address' in xpath_lower:
        return 'contact'

    # Main content areas (checked on the raw path, see docstring)
    if _has_step(xpath_lower, 'main', 'article'):
        return 'main_content'
    if _has_step(simplified, 'p'):
        return 'paragraph'

    # Lists
    if _has_step(simplified, 'ul', 'ol', 'li'):
        return 'list'

    # Sidebar
    if _has_step(xpath_lower, 'aside') or 'sidebar' in xpath_lower:
        return 'sidebar'

    return 'other'


def detect_string_pattern(text: str) -> list[str]:
    """
    Detect which string patterns match the given text.

    Case-insensitive patterns are matched against the stripped, lower-cased
    text; case-sensitive ones (postal code, street, e-mail, time range, person
    name) against the text as given.

    Returns the names of all matching patterns in table order, or
    ``['unknown']`` when nothing matches or the text is empty.
    """
    if not text:
        return ['unknown']

    text_lower = text.strip().lower()
    matches = [
        name
        for name, regex, use_lower in _STRING_PATTERNS
        if regex.search(text_lower if use_lower else text)
    ]
    return matches or ['unknown']


def _as_dict(value: object) -> dict:
    """Return *value* if it is a dict, else an empty dict (tolerates YAML nulls)."""
    return value if isinstance(value, dict) else {}


def _as_list(value: object) -> list:
    """Return *value* if it is a list, else an empty list (tolerates YAML nulls)."""
    return value if isinstance(value, list) else []


def _clip_text(value: object, limit: int = 200) -> str:
    """Return *value* as a string truncated to *limit* characters ('' for None)."""
    return '' if value is None else str(value)[:limit]


def _claim_xpath(claim: dict, *, allow_direct: bool) -> str:
    """Return the XPath of a claim.

    Uses ``claim['xpath']`` when *allow_direct* is set and the key is
    non-empty, otherwise ``claim['provenance']['path']``; '' if neither exists.
    """
    direct = claim.get('xpath') if allow_direct else None
    raw = direct or _as_dict(claim.get('provenance')).get('path')
    return str(raw) if raw else ''


def analyze_annotation_file(annotation_path: Path) -> Optional[dict]:
    """
    Analyze a single annotation file and extract layout/entity patterns.

    Returns dict with:
    - website: domain name taken from ``source_url`` ('unknown' if absent)
    - source_url: the ``source_url`` value ('' if absent)
    - layouts: list of dicts (xpath, xpath_simplified, category, text_sample,
      patterns), one per ``session.claims.layout`` item
    - entities: list of dicts (entity_type, xpath, xpath_simplified, category,
      text, patterns), one per ``session.claims.entity`` item

    Returns None if the file cannot be read or parsed, or if it has no
    ``session`` mapping. Missing or null nested values are treated as empty.

    Raises:
        RuntimeError: if PyYAML is not installed.
    """
    if yaml is None:
        raise RuntimeError(
            "PyYAML is required to read annotation files (pip install pyyaml)"
        )

    try:
        with open(annotation_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # Best effort: unreadable or malformed files are skipped by the caller.
        return None

    if not isinstance(data, dict) or not isinstance(data.get('session'), dict):
        return None

    source_url = str(data.get('source_url') or '')
    domain = re.sub(r'^https?://(?:www\.)?', '', source_url).split('/')[0]

    result = {
        'website': domain or 'unknown',
        'source_url': source_url,
        'layouts': [],
        'entities': [],
    }

    claims = _as_dict(data['session'].get('claims'))

    # Process layout claims
    for layout in _as_list(claims.get('layout')):
        if not isinstance(layout, dict):
            continue
        xpath = _claim_xpath(layout, allow_direct=True)
        text = _clip_text(layout.get('text_content'))  # Truncate long text
        result['layouts'].append({
            'xpath': xpath,
            'xpath_simplified': simplify_xpath(xpath),
            'category': get_xpath_category(xpath),
            'text_sample': text,
            'patterns': detect_string_pattern(text),
        })

    # Process entity claims
    for entity in _as_list(claims.get('entity')):
        if not isinstance(entity, dict):
            continue
        xpath = _claim_xpath(entity, allow_direct=False)
        text = _clip_text(entity.get('text_content'))
        # Prefer the specific type; fall back to the general one.
        entity_type = str(
            entity.get('hyponym') or entity.get('hypernym') or 'UNKNOWN'
        )
        result['entities'].append({
            'entity_type': entity_type,
            'xpath': xpath,
            'xpath_simplified': simplify_xpath(xpath),
            'category': get_xpath_category(xpath),
            'text': text,
            'patterns': detect_string_pattern(text),
        })

    return result


def _bump(counts: dict, key: str) -> None:
    """Increment ``counts[key]``, starting from zero."""
    counts[key] = counts.get(key, 0) + 1


def analyze_all_archives(web_dir: Path, limit: Optional[int] = None) -> dict:
    """
    Analyze all annotation files in the web archive directory.

    Files matching ``*/*/annotations_v1.7.0.yaml`` under *web_dir* are
    processed in sorted path order, so a given *limit* always selects the same
    files. Progress messages are written to stderr.

    Args:
        web_dir: root directory of the web archive.
        limit: maximum number of files to process; None or 0 means no limit.

    Returns a dict with the keys ``summary``, ``layout_by_category``,
    ``entity_by_type``, ``xpath_to_entities`` and ``pattern_cooccurrence``.
    ``summary['annotation_files']`` counts the files selected and
    ``summary['files_skipped']`` those that yielded no result.

    Raises:
        ValueError: if *limit* is negative.
    """
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be non-negative, got {limit}")

    # glob order depends on the filesystem; sort for reproducible runs.
    annotation_files = sorted(web_dir.glob(ANNOTATION_GLOB))
    if limit:
        annotation_files = annotation_files[:limit]

    print(f"Found {len(annotation_files)} annotation files to analyze",
          file=sys.stderr)

    # Aggregation structures using typed dataclasses
    layout_by_category: dict[str, LayoutCategoryStats] = {}
    entity_by_type: dict[str, EntityTypeStats] = {}
    xpath_to_entities: dict[str, dict[str, int]] = {}
    pattern_cooccurrence: dict[str, dict[str, int]] = {}
    websites_analyzed: set[str] = set()
    total_layouts = 0
    total_entities = 0
    files_skipped = 0

    for i, ann_file in enumerate(annotation_files):
        if i % 100 == 0:
            print(f"  Processing {i}/{len(annotation_files)}...", file=sys.stderr)

        result = analyze_annotation_file(ann_file)
        if not result:
            files_skipped += 1
            continue

        website = result['website']
        websites_analyzed.add(website)

        # Aggregate layouts
        for layout in result['layouts']:
            cat = layout['category']
            if cat not in layout_by_category:
                layout_by_category[cat] = LayoutCategoryStats()
            layout_stats = layout_by_category[cat]

            layout_stats.count += 1
            layout_stats.websites.add(website)
            _bump(layout_stats.xpath_patterns, layout['xpath_simplified'])
            for pattern in layout['patterns']:
                _bump(layout_stats.string_patterns, pattern)

            if len(layout_stats.samples) < 20:
                layout_stats.samples.append({
                    'website': website,
                    'text': layout['text_sample'][:100],
                    'patterns': layout['patterns'],
                })
            total_layouts += 1

        # Aggregate entities
        for entity in result['entities']:
            etype = entity['entity_type']
            if etype not in entity_by_type:
                entity_by_type[etype] = EntityTypeStats()
            entity_stats = entity_by_type[etype]

            xpath_key = entity['xpath_simplified']
            entity_stats.count += 1
            entity_stats.websites.add(website)
            _bump(entity_stats.categories, entity['category'])
            _bump(entity_stats.xpath_patterns, xpath_key)

            # Track xpath -> entity type mapping
            _bump(xpath_to_entities.setdefault(xpath_key, {}), etype)

            # Track pattern co-occurrence
            etype_patterns = pattern_cooccurrence.setdefault(etype, {})
            for pattern in entity['patterns']:
                _bump(etype_patterns, pattern)

            if len(entity_stats.samples) < 10:
                entity_stats.samples.append({
                    'website': website,
                    'text': entity['text'][:100],
                    'xpath': xpath_key,
                })
            total_entities += 1

    # Convert dataclasses to dicts for report generation
    layout_dict = {
        cat: {
            'count': s.count,
            'websites': s.websites,
            'xpath_patterns': s.xpath_patterns,
            'string_patterns': s.string_patterns,
            'samples': s.samples,
        }
        for cat, s in layout_by_category.items()
    }
    entity_dict = {
        etype: {
            'count': s.count,
            'websites': s.websites,
            'categories': s.categories,
            'xpath_patterns': s.xpath_patterns,
            'samples': s.samples,
        }
        for etype, s in entity_by_type.items()
    }

    return {
        'summary': {
            'annotation_files': len(annotation_files),
            'files_skipped': files_skipped,
            'websites_analyzed': len(websites_analyzed),
            'total_layouts': total_layouts,
            'total_entities': total_entities,
        },
        'layout_by_category': layout_dict,
        'entity_by_type': entity_dict,
        'xpath_to_entities': xpath_to_entities,
        'pattern_cooccurrence': pattern_cooccurrence,
    }


def _one_line(text: object) -> str:
    """Collapse all whitespace runs so *text* fits on one Markdown line."""
    return ' '.join(str(text).split())


def _table_cell(value: object) -> str:
    """Render *value* for a Markdown table cell (single line, pipes escaped)."""
    return _one_line(value).replace('|', '\\|')


def _top_items(counts: dict, n: int) -> list:
    """Return the *n* (key, count) pairs with the highest counts, ties in insertion order."""
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)[:n]


def _count_lines(counts: dict, n: int = 5) -> list[str]:
    """Render the top *n* entries of *counts* as "- `key` (count)" bullets."""
    return [f"- `{key}` ({count})" for key, count in _top_items(counts, n)]


def _share_lines(counts: dict, total: int, n: int = 5) -> list[str]:
    """Render the top *n* entries of *counts* as "- key: count (pct%)" bullets."""
    rendered = []
    for key, count in _top_items(counts, n):
        pct = (count / total) * 100 if total else 0.0
        rendered.append(f"- {key}: {count} ({pct:.1f}%)")
    return rendered


def _summary_section(summary: dict) -> list[str]:
    """Render the "Summary" section."""
    lines = [
        "## Summary",
        "",
        f"- **Annotation files analyzed**: {summary['annotation_files']}",
    ]
    skipped = summary.get('files_skipped', 0)
    if skipped:
        lines.append(
            f"- **Annotation files skipped (unreadable or no session)**: {skipped}"
        )
    lines += [
        f"- **Unique websites**: {summary['websites_analyzed']}",
        f"- **Total layout claims**: {summary['total_layouts']}",
        f"- **Total entity claims**: {summary['total_entities']}",
        "",
    ]
    return lines


def _layout_section(layout_data: dict) -> list[str]:
    """Render the "Layout Categories" section, most frequent category first."""
    lines = [
        "## Layout Categories",
        "",
        "Distribution of content by DOM location category:",
        "",
    ]
    ordered = sorted(layout_data.items(), key=lambda x: x[1]['count'], reverse=True)
    for category, data in ordered:
        lines.append(f"### {category}")
        lines.append(f"- **Occurrences**: {data['count']}")
        lines.append(f"- **Websites**: {len(data['websites'])}")
        lines.append("")

        lines.append("**Top XPath patterns:**")
        lines.extend(_count_lines(data['xpath_patterns']))
        lines.append("")

        lines.append("**String patterns found:**")
        lines.extend(_count_lines(data['string_patterns']))
        lines.append("")

        if data['samples']:
            lines.append("**Samples:**")
            for sample in data['samples'][:3]:
                text = _one_line(sample['text'])
                # Only mark the sample as truncated when it really is.
                shown = text[:80] + ('...' if len(text) > 80 else '')
                lines.append(f"- \"{shown}\" ({sample['website']})")
            lines.append("")
    return lines


def _entity_section(entity_data: dict) -> list[str]:
    """Render the entity overview table (top 20) and per-type details (top 10)."""
    ordered = sorted(entity_data.items(), key=lambda x: x[1]['count'], reverse=True)

    lines = [
        "## Entity Type Distribution",
        "",
        "| Entity Type | Count | Websites | Primary Location |",
        "|-------------|-------|----------|------------------|",
    ]
    for etype, data in ordered[:20]:
        categories = data['categories']
        top_category = (
            max(categories.items(), key=lambda x: x[1])[0] if categories else '-'
        )
        lines.append(
            f"| {_table_cell(etype)} | {data['count']} "
            f"| {len(data['websites'])} | {_table_cell(top_category)} |"
        )
    lines.append("")

    lines.append("### Entity Type Details")
    lines.append("")
    for etype, data in ordered[:10]:
        lines.append(f"#### {etype}")
        lines.append(f"- **Total occurrences**: {data['count']}")
        lines.append(f"- **Unique websites**: {len(data['websites'])}")
        lines.append("")

        lines.append("**Found in DOM categories:**")
        lines.extend(_share_lines(data['categories'], data['count']))
        lines.append("")

        lines.append("**Common XPath patterns:**")
        lines.extend(_count_lines(data['xpath_patterns']))
        lines.append("")

        if data['samples']:
            lines.append("**Samples:**")
            for sample in data['samples'][:3]:
                # Newlines inside the text would break the Markdown list item.
                text = _one_line(sample['text'])
                lines.append(f"- \"{text}\" @ `{sample['xpath']}`")
            lines.append("")
    return lines


def _grouped_share_section(groups: dict, unit: str, limit: int) -> list[str]:
    """Render "### `key` (N unit)" blocks with entity-type shares, largest first."""
    lines = []
    ordered = sorted(
        groups.items(),
        key=lambda x: sum(x[1].values()),
        reverse=True,
    )[:limit]
    for key, entities in ordered:
        total = sum(entities.values())
        lines.append(f"### `{key}` ({total} {unit})")
        lines.extend(_share_lines(entities, total))
        lines.append("")
    return lines


def generate_report(analysis: dict, output_path: Optional[Path] = None) -> str:
    """
    Generate a markdown report from the analysis results.

    Args:
        analysis: the dict returned by analyze_all_archives().
        output_path: if given, the report is also written to this file
            (UTF-8); missing parent directories are created.

    Returns the report text.
    """
    lines = [
        "# Web Archive Layout Pattern Analysis",
        f"\nGenerated: {datetime.now().isoformat()}",
        "",
    ]

    lines += _summary_section(analysis['summary'])
    lines += _layout_section(analysis['layout_by_category'])
    lines += _entity_section(analysis['entity_by_type'])

    # XPath to Entity mapping
    lines += [
        "## XPath → Entity Type Mapping",
        "",
        "Which entity types are typically found at which DOM locations:",
        "",
    ]
    lines += _grouped_share_section(analysis['xpath_to_entities'], 'entities', 20)

    # Pattern co-occurrence
    lines += [
        "## String Pattern → Entity Type Correlation",
        "",
        "Which string patterns are most associated with which entity types:",
        "",
    ]
    # Build reverse mapping: pattern -> entity types
    pattern_to_entities = defaultdict(dict)
    for etype, patterns in analysis['pattern_cooccurrence'].items():
        for pattern, count in patterns.items():
            pattern_to_entities[pattern][etype] = count
    lines += _grouped_share_section(pattern_to_entities, 'occurrences', 15)

    # Recommendations for pattern file
    lines += _RECOMMENDATION_LINES

    report = '\n'.join(lines)

    if output_path:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"Report written to {output_path}", file=sys.stderr)

    return report


def main() -> int:
    """
    Command-line entry point.

    Parses --limit, --output and --web-dir, runs the analysis and either
    writes the report to --output or prints it to stdout. Status messages go
    to stderr so that redirected stdout contains only the report.

    Returns the process exit code: 0 on success, 1 if the web directory does
    not exist.
    """
    parser = argparse.ArgumentParser(description='Analyze layout patterns in web archives')
    parser.add_argument('--limit', type=int, help='Limit number of files to analyze')
    parser.add_argument('--output', '-o', type=str, help='Output file path for report')
    parser.add_argument('--web-dir', type=str,
                        default=DEFAULT_WEB_DIR,
                        help='Path to web archive directory')
    args = parser.parse_args()

    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be non-negative')

    web_dir = Path(args.web_dir)
    if not web_dir.is_dir():
        print(f"Error: Web directory not found: {web_dir}", file=sys.stderr)
        return 1

    print(f"Analyzing web archives in {web_dir}", file=sys.stderr)
    analysis = analyze_all_archives(web_dir, limit=args.limit)

    output_path = Path(args.output) if args.output else None
    report = generate_report(analysis, output_path)

    if not output_path:
        print(report)

    return 0


if __name__ == '__main__':
    sys.exit(main())