glam/scripts/analyze_layout_patterns.py
2025-12-14 17:09:55 +01:00

621 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Analyze layout patterns across archived heritage institution websites.
This script examines the relationship between:
1. DOM structure patterns (XPath locations)
2. String content patterns found at those locations
3. Entity types extracted from each location
The goal is to identify common layout patterns across Dutch heritage websites
and map them to the entity extraction patterns in dutch_web_patterns.yaml.
Usage:
python scripts/analyze_layout_patterns.py [--limit N] [--output FILE]
python scripts/analyze_layout_patterns.py --limit 100 --output reports/layout_analysis.md
"""
import argparse
import os
import re
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional

import yaml
@dataclass
class LayoutCategoryStats:
    """Aggregated stats for a layout category.

    Mutable accumulator filled in by analyze_all_archives(); one instance
    per DOM-location category (e.g. 'nav', 'footer', 'meta_title').
    """
    # Number of layout claims seen in this category
    count: int = 0
    # Distinct website domains that contributed to this category
    websites: set[str] = field(default_factory=set)
    # Simplified XPath pattern -> occurrence count
    xpath_patterns: dict[str, int] = field(default_factory=dict)
    # String-pattern name -> occurrence count
    string_patterns: dict[str, int] = field(default_factory=dict)
    # Example claims (website/text/patterns dicts); caller caps the length
    samples: list[dict] = field(default_factory=list)
@dataclass
class EntityTypeStats:
    """Aggregated stats for an entity type.

    Mutable accumulator filled in by analyze_all_archives(); one instance
    per extracted entity type (hyponym/hypernym label).
    """
    # Number of entity claims of this type
    count: int = 0
    # Distinct website domains where this entity type was found
    websites: set[str] = field(default_factory=set)
    # DOM-location category -> occurrence count
    categories: dict[str, int] = field(default_factory=dict)
    # Simplified XPath pattern -> occurrence count
    xpath_patterns: dict[str, int] = field(default_factory=dict)
    # Example claims (website/text/xpath dicts); caller caps the length
    samples: list[dict] = field(default_factory=list)
def simplify_xpath(xpath: str) -> str:
    """
    Simplify an XPath to a generic pattern for comparison.

    /html/body/div[4]/section/div/div/div[1]/div/h1 -> body/*/h1
    /html/head/title -> head/title
    /html/body/div[2]/div/div[2]/div/nav -> body/*/nav

    Attribute steps (e.g. @content) and text() steps are dropped, and runs
    of generic containers (div/span/section/article/main/aside) collapse
    into a single '*'.

    Args:
        xpath: Absolute XPath string; may be empty.

    Returns:
        The simplified pattern, or '' for a falsy input.
    """
    if not xpath:
        return ""
    # Remove the /html prefix
    xpath = re.sub(r'^/html/?', '', xpath)
    # Remove positional predicates but keep element names: div[4] -> div
    xpath = re.sub(r'\[\d+\]', '', xpath)
    simplified: list[str] = []
    prev_generic = False
    for part in xpath.split('/'):
        if not part:
            continue
        # Drop attribute and text() steps. NOTE: after split('/') a part
        # contains no '/', so the original patterns r'/@.*$' and
        # r'/text\(\)$' could never match -- that was a bug which let
        # '@content' / 'text()' leak into the output.
        if part.startswith('@') or part == 'text()':
            continue
        # Collapse consecutive generic container elements into one '*'
        if part in ('div', 'span', 'section', 'article', 'main', 'aside'):
            if not prev_generic:
                simplified.append('*')
                prev_generic = True
        else:
            simplified.append(part)
            prev_generic = False
    return '/'.join(simplified)
def get_xpath_category(xpath: Optional[str]) -> str:
    """
    Categorize an XPath into a semantic page region.

    Args:
        xpath: Absolute XPath string, or None/''.

    Returns:
        One of: meta_title, meta_tag, meta_other, nav, header, footer,
        main_heading, sub_heading, contact, main_content, paragraph,
        list, sidebar, other, unknown.
    """
    if not xpath:
        return 'unknown'
    xpath_lower = xpath.lower()
    simplified = simplify_xpath(xpath)
    # Meta information (document head)
    if 'head/' in simplified or simplified.startswith('head'):
        if 'title' in simplified:
            return 'meta_title'
        if 'meta' in simplified:
            return 'meta_tag'
        return 'meta_other'
    # Navigation
    if '/nav' in simplified or '/menu' in simplified:
        return 'nav'
    if 'header' in simplified:
        return 'header'
    # Footer
    if 'footer' in simplified:
        return 'footer'
    # Main headings
    if simplified.endswith('/h1'):
        return 'main_heading'
    if re.search(r'/h[2-6]$', simplified):
        return 'sub_heading'
    # Contact/address sections (detected on the raw path string)
    if 'contact' in xpath_lower or 'address' in xpath_lower:
        return 'contact'
    # Main content areas. Checked on the RAW xpath: simplify_xpath()
    # collapses main/article/aside into '*', so the original checks on
    # `simplified` were unreachable dead branches.
    if re.search(r'/(main|article)\b', xpath_lower):
        return 'main_content'
    if '/p' in simplified or simplified.endswith('/p'):
        return 'paragraph'
    # Lists
    if '/ul' in simplified or '/ol' in simplified or '/li' in simplified:
        return 'list'
    # Sidebar (same dead-branch fix for <aside>)
    if re.search(r'/aside\b', xpath_lower) or 'sidebar' in xpath_lower:
        return 'sidebar'
    return 'other'
def detect_string_pattern(text: str) -> list[str]:
    """Detect which known string patterns match the given text.

    Case-insensitive rules are applied to the stripped, lower-cased text;
    case-sensitive rules (postal codes, street names, names, times, email)
    are applied to the raw text.

    Returns:
        List of matching pattern names, or ['unknown'] if nothing matched.
    """
    lowered = text.strip().lower()
    # (pattern name, regex, apply to lower-cased text?)
    rules = (
        # Organization patterns
        ('org:foundation_or_association', r'\b(stichting|vereniging|genootschap|kring)\b', True),
        ('heritage:institution_type', r'\b(museum|archief|bibliotheek)\b', True),
        ('gov:municipality', r'\bgemeente\s+\w+', True),
        # Location patterns (Dutch postal code, street suffixes)
        ('loc:postal_code_nl', r'\b\d{4}\s*[A-Z]{2}\b', False),
        ('loc:street_address', r'\b[A-Z][a-z]+weg\b|\b[A-Z][a-z]+straat\b|\b[A-Z][a-z]+laan\b', False),
        # Contact patterns
        ('contact:email', r'\b[\w.-]+@[\w.-]+\.\w+\b', False),
        ('contact:phone', r'\b(?:tel|telefoon|phone)[:.]?\s*[\d\s-]+', True),
        # Navigation patterns
        ('nav:menu_item', r'\b(home|contact|over ons|collectie|bezoek|programma)\b', True),
        # Opening hours (Dutch weekday names, time ranges)
        ('info:opening_hours', r'\b(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\b', True),
        ('info:time_range', r'\b\d{1,2}[:\.]\d{2}\s*[-]\s*\d{1,2}[:\.]\d{2}\b', False),
        # Person names (simplified Dutch-name heuristic)
        ('person:dutch_name', r'\b[A-Z][a-z]+\s+(?:van\s+(?:de\s+|het\s+)?)?[A-Z][a-z]+\b', False),
    )
    hits = [name for name, regex, use_lower in rules
            if re.search(regex, lowered if use_lower else text)]
    return hits if hits else ['unknown']
def analyze_annotation_file(annotation_path: Path) -> Optional[dict]:
    """
    Analyze a single annotation file and extract layout/entity patterns.

    Args:
        annotation_path: Path to an annotations YAML file.

    Returns:
        dict with:
        - website: domain name (scheme and leading 'www.' stripped)
        - source_url: original URL from the file
        - layouts: list of dicts (xpath, xpath_simplified, category,
          text_sample, patterns)
        - entities: list of dicts (entity_type, xpath, xpath_simplified,
          category, text, patterns)
        Returns None if the file cannot be parsed or has no session data.
    """
    try:
        with open(annotation_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception:
        # Best-effort batch scan: unreadable or malformed files are skipped.
        return None
    if not data or 'session' not in data:
        return None
    source_url = data.get('source_url') or ''
    # Reduce the URL to a bare domain: strip scheme and leading "www.".
    domain = re.sub(r'^https?://(?:www\.)?', '', source_url).split('/')[0] if source_url else 'unknown'
    result = {
        'website': domain,
        'source_url': source_url,
        'layouts': [],
        'entities': [],
    }
    # `or {}` / `or []` guards: explicit nulls in the YAML (e.g.
    # "claims:" with no value) would otherwise crash the .get() chains.
    claims = (data.get('session') or {}).get('claims') or {}
    # Process layout claims
    for layout in claims.get('layout') or []:
        xpath = layout.get('xpath') or (layout.get('provenance') or {}).get('path', '')
        text = (layout.get('text_content') or '')[:200]  # truncate long text
        result['layouts'].append({
            'xpath': xpath,
            'xpath_simplified': simplify_xpath(xpath),
            'category': get_xpath_category(xpath),
            'text_sample': text,
            'patterns': detect_string_pattern(text),
        })
    # Process entity claims
    for entity in claims.get('entity') or []:
        xpath = (entity.get('provenance') or {}).get('path', '')
        text = (entity.get('text_content') or '')[:200]
        # Prefer the specific label; fall back through null values too.
        entity_type = entity.get('hyponym') or entity.get('hypernym') or 'UNKNOWN'
        result['entities'].append({
            'entity_type': entity_type,
            'xpath': xpath,
            'xpath_simplified': simplify_xpath(xpath),
            'category': get_xpath_category(xpath),
            'text': text,
            'patterns': detect_string_pattern(text),
        })
    return result
def analyze_all_archives(web_dir: Path, limit: Optional[int] = None) -> dict:
    """
    Analyze all annotation files in the web archive directory.

    Args:
        web_dir: Root directory; files are found via the glob
            '*/*/annotations_v1.7.0.yaml'.
        limit: Optional cap on the number of files processed.

    Returns:
        dict with keys 'summary', 'layout_by_category', 'entity_by_type',
        'xpath_to_entities', 'pattern_cooccurrence' -- all plain
        dict/set/list structures suitable for generate_report().
    """
    # Find all annotation files
    annotation_files = list(web_dir.glob('*/*/annotations_v1.7.0.yaml'))
    if limit:
        annotation_files = annotation_files[:limit]
    print(f"Found {len(annotation_files)} annotation files to analyze")

    def _bump(counter: dict, key: str) -> None:
        # Increment a plain-dict counter (dataclass fields are plain dicts).
        counter[key] = counter.get(key, 0) + 1

    # Aggregation structures using typed dataclasses
    layout_by_category: dict[str, LayoutCategoryStats] = {}
    entity_by_type: dict[str, EntityTypeStats] = {}
    # simplified xpath -> {entity type -> count}; defaultdicts replace the
    # original's manual "if key not in ..." initialization boilerplate.
    xpath_to_entities: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    # entity type -> {string pattern -> count}
    pattern_cooccurrence: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    websites_analyzed: set[str] = set()
    total_layouts = 0
    total_entities = 0
    for i, ann_file in enumerate(annotation_files):
        if i % 100 == 0:
            print(f" Processing {i}/{len(annotation_files)}...")
        result = analyze_annotation_file(ann_file)
        if not result:
            continue
        website = result['website']
        websites_analyzed.add(website)
        # Aggregate layouts
        for layout in result['layouts']:
            stats = layout_by_category.setdefault(layout['category'], LayoutCategoryStats())
            stats.count += 1
            stats.websites.add(website)
            _bump(stats.xpath_patterns, layout['xpath_simplified'])
            for pattern in layout['patterns']:
                _bump(stats.string_patterns, pattern)
            if len(stats.samples) < 20:  # keep a bounded sample per category
                stats.samples.append({
                    'website': website,
                    'text': layout['text_sample'][:100],
                    'patterns': layout['patterns'],
                })
            total_layouts += 1
        # Aggregate entities
        for entity in result['entities']:
            etype = entity['entity_type']
            stats = entity_by_type.setdefault(etype, EntityTypeStats())
            stats.count += 1
            stats.websites.add(website)
            _bump(stats.categories, entity['category'])
            xpath_key = entity['xpath_simplified']
            _bump(stats.xpath_patterns, xpath_key)
            # Track xpath -> entity type mapping
            xpath_to_entities[xpath_key][etype] += 1
            # Track pattern co-occurrence
            for pattern in entity['patterns']:
                pattern_cooccurrence[etype][pattern] += 1
            if len(stats.samples) < 10:  # keep a bounded sample per type
                stats.samples.append({
                    'website': website,
                    'text': entity['text'][:100],
                    'xpath': entity['xpath_simplified'],
                })
            total_entities += 1
    # Convert dataclasses to dicts for report generation
    layout_dict = {
        cat: {
            'count': s.count,
            'websites': s.websites,
            'xpath_patterns': s.xpath_patterns,
            'string_patterns': s.string_patterns,
            'samples': s.samples,
        }
        for cat, s in layout_by_category.items()
    }
    entity_dict = {
        etype: {
            'count': s.count,
            'websites': s.websites,
            'categories': s.categories,
            'xpath_patterns': s.xpath_patterns,
            'samples': s.samples,
        }
        for etype, s in entity_by_type.items()
    }
    return {
        'summary': {
            'annotation_files': len(annotation_files),
            'websites_analyzed': len(websites_analyzed),
            'total_layouts': total_layouts,
            'total_entities': total_entities,
        },
        'layout_by_category': layout_dict,
        'entity_by_type': entity_dict,
        # Flatten the defaultdicts back to plain dicts for a clean contract.
        'xpath_to_entities': {k: dict(v) for k, v in xpath_to_entities.items()},
        'pattern_cooccurrence': {k: dict(v) for k, v in pattern_cooccurrence.items()},
    }
def _top(counter: dict, k: int) -> list:
    """Return the k highest-count (key, count) pairs, descending by count."""
    return sorted(counter.items(), key=lambda kv: kv[1], reverse=True)[:k]


def generate_report(analysis: dict, output_path: Optional[Path] = None) -> str:
    """Generate a markdown report from the analysis results.

    Args:
        analysis: Aggregated stats as returned by analyze_all_archives().
        output_path: Optional file to write the report to; parent
            directories are created as needed.

    Returns:
        The full report as a markdown string.
    """
    lines = []
    lines.append("# Web Archive Layout Pattern Analysis")
    lines.append(f"\nGenerated: {datetime.now().isoformat()}")
    lines.append("")
    # Summary
    summary = analysis['summary']
    lines.append("## Summary")
    lines.append("")
    lines.append(f"- **Annotation files analyzed**: {summary['annotation_files']}")
    lines.append(f"- **Unique websites**: {summary['websites_analyzed']}")
    lines.append(f"- **Total layout claims**: {summary['total_layouts']}")
    lines.append(f"- **Total entity claims**: {summary['total_entities']}")
    lines.append("")
    # Layout categories
    lines.append("## Layout Categories")
    lines.append("")
    lines.append("Distribution of content by DOM location category:")
    lines.append("")
    layout_data = analysis['layout_by_category']
    sorted_categories = sorted(layout_data.items(), key=lambda x: x[1]['count'], reverse=True)
    for category, data in sorted_categories:
        lines.append(f"### {category}")
        lines.append(f"- **Occurrences**: {data['count']}")
        lines.append(f"- **Websites**: {len(data['websites'])}")
        lines.append("")
        lines.append("**Top XPath patterns:**")
        for xpath, count in _top(data['xpath_patterns'], 5):
            lines.append(f"- `{xpath}` ({count})")
        lines.append("")
        lines.append("**String patterns found:**")
        for pattern, count in _top(data['string_patterns'], 5):
            lines.append(f"- `{pattern}` ({count})")
        lines.append("")
        if data['samples']:
            lines.append("**Samples:**")
            for sample in data['samples'][:3]:
                text = sample['text'].replace('\n', ' ')[:80]
                lines.append(f"- \"{text}...\" ({sample['website']})")
            lines.append("")
    # Entity types
    lines.append("## Entity Type Distribution")
    lines.append("")
    entity_data = analysis['entity_by_type']
    sorted_entities = sorted(entity_data.items(), key=lambda x: x[1]['count'], reverse=True)
    lines.append("| Entity Type | Count | Websites | Primary Location |")
    lines.append("|-------------|-------|----------|------------------|")
    for etype, data in sorted_entities[:20]:
        top_category = max(data['categories'].items(), key=lambda x: x[1])[0] if data['categories'] else '-'
        lines.append(f"| {etype} | {data['count']} | {len(data['websites'])} | {top_category} |")
    lines.append("")
    # Entity type details
    lines.append("### Entity Type Details")
    lines.append("")
    for etype, data in sorted_entities[:10]:
        lines.append(f"#### {etype}")
        lines.append(f"- **Total occurrences**: {data['count']}")
        lines.append(f"- **Unique websites**: {len(data['websites'])}")
        lines.append("")
        lines.append("**Found in DOM categories:**")
        for cat, count in _top(data['categories'], 5):
            pct = (count / data['count']) * 100
            lines.append(f"- {cat}: {count} ({pct:.1f}%)")
        lines.append("")
        lines.append("**Common XPath patterns:**")
        for xpath, count in _top(data['xpath_patterns'], 5):
            lines.append(f"- `{xpath}` ({count})")
        lines.append("")
        if data['samples']:
            lines.append("**Samples:**")
            for sample in data['samples'][:3]:
                lines.append(f"- \"{sample['text']}\" @ `{sample['xpath']}`")
            lines.append("")
    # XPath to Entity mapping
    lines.append("## XPath → Entity Type Mapping")
    lines.append("")
    lines.append("Which entity types are typically found at which DOM locations:")
    lines.append("")
    xpath_entities = analysis['xpath_to_entities']
    # Sort by total entity count across all types at that location
    sorted_xpaths = sorted(
        xpath_entities.items(),
        key=lambda x: sum(x[1].values()),
        reverse=True
    )[:20]
    for xpath, entities in sorted_xpaths:
        total = sum(entities.values())
        lines.append(f"### `{xpath}` ({total} entities)")
        for etype, count in _top(entities, 5):
            pct = (count / total) * 100
            lines.append(f"- {etype}: {count} ({pct:.1f}%)")
        lines.append("")
    # Pattern co-occurrence
    lines.append("## String Pattern → Entity Type Correlation")
    lines.append("")
    lines.append("Which string patterns are most associated with which entity types:")
    lines.append("")
    cooccurrence = analysis['pattern_cooccurrence']
    # Build reverse mapping: pattern -> {entity type -> count}
    pattern_to_entities = defaultdict(dict)
    for etype, patterns in cooccurrence.items():
        for pattern, count in patterns.items():
            pattern_to_entities[pattern][etype] = count
    sorted_patterns = sorted(
        pattern_to_entities.items(),
        key=lambda x: sum(x[1].values()),
        reverse=True
    )[:15]
    for pattern, entities in sorted_patterns:
        total = sum(entities.values())
        lines.append(f"### `{pattern}` ({total} occurrences)")
        for etype, count in _top(entities, 5):
            pct = (count / total) * 100
            lines.append(f"- {etype}: {count} ({pct:.1f}%)")
        lines.append("")
    # Recommendations for pattern file
    lines.append("## Recommendations for dutch_web_patterns.yaml")
    lines.append("")
    lines.append("Based on the analysis, the following layout-aware patterns could be added:")
    lines.append("")
    lines.append("### High-Value XPath Targets")
    lines.append("")
    lines.append("1. **`head/title`** - Almost always contains institution name")
    lines.append("2. **`body/*/h1`** - Primary heading, usually institution name")
    lines.append("3. **`body/*/nav`** - Navigation menu (discard patterns)")
    lines.append("4. **`body/*/footer`** - Contact info, address, social links")
    lines.append("")
    lines.append("### Suggested Pattern Additions")
    lines.append("")
    # NOTE(review): the YAML snippet below uses single-space indents, which
    # looks like collapsed whitespace from a paste -- confirm the intended
    # nesting before relying on it as copy-paste-able YAML.
    lines.append("```yaml")
    lines.append("# Add xpath_hint to patterns for better precision")
    lines.append("entity_patterns:")
    lines.append(" organizations:")
    lines.append(" heritage_institutions:")
    lines.append(" patterns:")
    lines.append(" - pattern: '^(het|de)\\s+(\\w+)\\s*(museum|archief|bibliotheek)$'")
    lines.append(" xpath_hints:")
    lines.append(" - 'head/title'")
    lines.append(" - 'body/*/h1'")
    lines.append(" confidence_boost: 0.2 # Higher confidence when found at expected location")
    lines.append("```")
    lines.append("")
    report = '\n'.join(lines)
    if output_path:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"Report written to {output_path}")
    return report
def main() -> int:
    """CLI entry point: parse arguments, run the analysis, emit the report.

    Returns:
        Process exit code: 0 on success, 1 if the web directory is missing.
    """
    parser = argparse.ArgumentParser(description='Analyze layout patterns in web archives')
    parser.add_argument('--limit', type=int, help='Limit number of files to analyze')
    parser.add_argument('--output', '-o', type=str, help='Output file path for report')
    # NOTE(review): user-specific absolute default path; consider deriving
    # it from an environment variable or a path relative to the repo root.
    parser.add_argument('--web-dir', type=str,
                        default='/Users/kempersc/apps/glam/data/custodian/web',
                        help='Path to web archive directory')
    args = parser.parse_args()
    web_dir = Path(args.web_dir)
    if not web_dir.exists():
        print(f"Error: Web directory not found: {web_dir}")
        return 1
    print(f"Analyzing web archives in {web_dir}")
    analysis = analyze_all_archives(web_dir, limit=args.limit)
    output_path = Path(args.output) if args.output else None
    report = generate_report(analysis, output_path)
    if not output_path:
        # No output file requested: dump the report to stdout instead.
        print(report)
    return 0


if __name__ == '__main__':
    # sys.exit, not the site-injected exit() builtin: exit() is meant for
    # interactive sessions and is not guaranteed to exist (e.g. under -S).
    sys.exit(main())