# glam/scripts/extract_website_headers.py
# Last modified: 2025-12-02 14:36:01 +01:00
# 315 lines, 10 KiB, Python
#!/usr/bin/env python3
"""
Extract and analyze headers (h1-h6, nav items) from all archived websites.
This script:
1. Extracts all headers from archived HTML files
2. Identifies common patterns across heritage institution websites
3. Generates comparison sets for quality control
4. Helps identify which headers are meaningful vs. generic UI
Usage:
python scripts/extract_website_headers.py [--limit N] [--output FILE]
"""
import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Optional, Any
try:
from lxml import etree
HAS_LXML = True
except ImportError:
HAS_LXML = False
print("Error: lxml is required. Install with: pip install lxml")
import yaml
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'
def extract_headers_from_html(html_content: str) -> Dict[str, List[str]]:
    """Extract headers, title, meta tags and nav items from an HTML page.

    Returns a dict with lists for h1-h6 and nav_items, plus scalar slots
    (initially None) for title, meta_description and og:site_name.
    Parsing is best-effort: on any lxml failure a warning is printed and
    whatever was collected so far is returned.
    """
    result = {
        'h1': [],
        'h2': [],
        'h3': [],
        'h4': [],
        'h5': [],
        'h6': [],
        'nav_items': [],
        'title': None,
        'meta_description': None,
        'og_site_name': None,
    }
    try:
        tree = etree.HTML(html_content)
        # h1-h6: flatten each element's text nodes into one string and keep
        # only plausible header lengths (2-499 chars).
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            for node in tree.xpath(f'//{tag}'):
                value = ''.join(node.itertext()).strip()
                if 1 < len(value) < 500:
                    result[tag].append(value)
        # Page <title>, first occurrence only.
        title_texts = tree.xpath('//title/text()')
        if title_texts:
            result['title'] = title_texts[0].strip()
        # <meta name="description">.
        descriptions = tree.xpath('//meta[@name="description"]/@content')
        if descriptions:
            result['meta_description'] = descriptions[0].strip()
        # Open Graph site name.
        og_names = tree.xpath('//meta[@property="og:site_name"]/@content')
        if og_names:
            result['og_site_name'] = og_names[0].strip()
        # Navigation labels: links/spans inside <nav>, plus links under any
        # element whose class mentions "nav".
        for raw in tree.xpath('//nav//a/text() | //nav//span/text() | //*[@class[contains(., "nav")]]//a/text()'):
            label = raw.strip()
            if 1 < len(label) < 100:
                result['nav_items'].append(label)
    except Exception as e:
        print(f" Warning: Failed to parse HTML: {e}")
    return result
def find_html_file(archive_path: Path) -> Optional[Path]:
    """Locate the primary HTML file of an archived website.

    Preference order: rendered.html at the archive root, then
    pages/index.html, then the first *.html file found under pages/.
    Returns None when no HTML file exists.
    """
    for preferred in (archive_path / 'rendered.html',
                      archive_path / 'pages' / 'index.html'):
        if preferred.exists():
            return preferred
    # Fallback: any HTML page in pages/ (first match in glob order).
    pages_dir = archive_path / 'pages'
    if pages_dir.exists():
        for html_file in pages_dir.glob('*.html'):
            return html_file
    return None
def process_entry(entry_path: Path) -> Optional[Dict]:
    """Process a single entry YAML file and extract headers from its website.

    Returns a summary dict (entry identifiers, reference names, extracted
    headers), or None when the entry is empty, has no usable archive, or a
    file cannot be read/parsed. Failures are reported as warnings so one
    bad entry never aborts the whole run.
    """
    try:
        with open(entry_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Fix: a single malformed YAML file previously crashed the run.
        print(f" Warning: Failed to read {entry_path}: {e}")
        return None
    if not data:
        return None
    # Entry number is the filename prefix (e.g. "0042_museum.yaml" -> "0042").
    entry_num = entry_path.name.split('_')[0]
    # Find web archive directory recorded by the enrichment pipeline.
    # `or {}` / `or []` guard against keys present but explicitly null in YAML.
    web_enrichment = data.get('web_enrichment') or {}
    web_archives = web_enrichment.get('web_archives') or []
    archive_path = None
    if web_archives:
        directory = web_archives[0].get('directory')
        if directory:
            archive_path = ENTRIES_DIR / directory
    if not archive_path:
        # Fallback: check web/{entry_num}/ for any archived subdirectory.
        entry_web_dir = WEB_DIR / entry_num
        if entry_web_dir.exists():
            subdirs = [d for d in entry_web_dir.iterdir() if d.is_dir()]
            if subdirs:
                archive_path = subdirs[0]
    if not archive_path or not archive_path.exists():
        return None
    # Find and read HTML file.
    html_file = find_html_file(archive_path)
    if not html_file:
        return None
    try:
        with open(html_file, 'r', encoding='utf-8', errors='replace') as f:
            html_content = f.read()
    except Exception as e:
        print(f" Warning: Failed to read {html_file}: {e}")
        return None
    headers = extract_headers_from_html(html_content)
    # Reference names from other enrichment sources, used downstream to
    # check whether extracted headers match the known institution name.
    custodian_name = (data.get('custodian_name') or {}).get('claim_value', '')
    google_name = (data.get('google_maps_enrichment') or {}).get('name', '')
    wikidata_name = (data.get('wikidata_enrichment') or {}).get('wikidata_label_nl', '')
    return {
        'entry_file': entry_path.name,
        'entry_num': entry_num,
        'custodian_name': custodian_name,
        'google_name': google_name,
        'wikidata_name': wikidata_name,
        'website_url': (data.get('wikidata_enrichment') or {}).get('wikidata_official_website', ''),
        'headers': headers,
    }
def analyze_patterns(results: List[Dict]) -> Dict:
    """Analyze common patterns across all websites.

    Computes how often the first H1, the <title> and og:site_name agree
    with a known institution name (substring match either way, case
    insensitive), plus frequency tables of the most common header values.
    """
    h1_counts = defaultdict(int)
    h2_counts = defaultdict(int)
    nav_counts = defaultdict(int)
    title_counts = defaultdict(int)

    h1_matches = 0
    title_matches = 0
    og_matches = 0
    entries_with_h1 = 0

    for entry in results:
        headers = entry.get('headers', {})
        # Lower-cased reference names; empty strings removed so they never match.
        names = {
            entry.get('custodian_name', '').lower(),
            entry.get('google_name', '').lower(),
            entry.get('wikidata_name', '').lower(),
        } - {''}

        def matches(text):
            # Substring containment in either direction, case-insensitive.
            lowered = text.lower()
            return any(name in lowered or lowered in name for name in names)

        # Frequency tables.
        for value in headers.get('h1', []):
            h1_counts[value] += 1
        for value in headers.get('h2', []):
            h2_counts[value] += 1
        for value in headers.get('nav_items', []):
            nav_counts[value] += 1
        title = headers.get('title')
        if title:
            title_counts[title] += 1

        # Name-match statistics.
        if headers.get('h1'):
            entries_with_h1 += 1
            if matches(headers['h1'][0]):
                h1_matches += 1
        if title and matches(title):
            title_matches += 1
        og_name = headers.get('og_site_name', '')
        if og_name and matches(og_name):
            og_matches += 1

    def top(counts, k):
        # Most frequent first; ties keep insertion order (stable sort).
        return sorted(counts.items(), key=lambda item: -item[1])[:k]

    return {
        'total_entries': len(results),
        'entries_with_headers': entries_with_h1,
        'h1_name_match_rate': h1_matches / entries_with_h1 if entries_with_h1 else 0,
        'title_name_match_rate': title_matches / len(results) if results else 0,
        'og_name_match_rate': og_matches / len(results) if results else 0,
        'common_h1_values': top(h1_counts, 50),
        'common_h2_values': top(h2_counts, 50),
        'common_nav_items': top(nav_counts, 50),
        'common_titles': top(title_counts, 30),
    }
def main():
    """CLI entry point: extract headers from all entries, print a pattern
    summary, and optionally dump the full results as JSON."""
    parser = argparse.ArgumentParser(description='Extract and analyze website headers')
    parser.add_argument('--limit', type=int, default=None, help='Limit entries to process')
    parser.add_argument('--output', type=str, default=None, help='Output JSON file')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output')
    args = parser.parse_args()

    # lxml is mandatory; the import-time message already explained how to install it.
    if not HAS_LXML:
        return 1

    # Collect entry YAML files, skipping hidden files; optionally truncate.
    files = sorted(f for f in ENTRIES_DIR.glob('*.yaml')
                   if f.is_file() and not f.name.startswith('.'))
    if args.limit:
        files = files[:args.limit]
    print(f"Processing {len(files)} entries...")

    results = []
    for filepath in files:
        entry = process_entry(filepath)
        if entry is None:
            continue
        results.append(entry)
        if args.verbose and len(results) % 100 == 0:
            print(f" Processed {len(results)} entries with headers")
    print(f"\nExtracted headers from {len(results)} entries")

    analysis = analyze_patterns(results)

    # Summary report.
    banner = "=" * 60
    print("\n" + banner)
    print("HEADER ANALYSIS SUMMARY")
    print(banner)
    print(f"Total entries processed: {analysis['total_entries']}")
    print(f"Entries with H1 headers: {analysis['entries_with_headers']}")
    print(f"H1 matches known name: {analysis['h1_name_match_rate']:.1%}")
    print(f"Title matches known name: {analysis['title_name_match_rate']:.1%}")
    print(f"OG:site_name matches known name: {analysis['og_name_match_rate']:.1%}")

    divider = "-" * 40
    print("\n" + divider)
    print("MOST COMMON H1 VALUES (likely generic):")
    for value, count in analysis['common_h1_values'][:20]:
        if count > 1:
            print(f" {count:4d}x {value[:60]}")
    print("\n" + divider)
    print("MOST COMMON NAV ITEMS:")
    for value, count in analysis['common_nav_items'][:20]:
        if count > 5:
            print(f" {count:4d}x {value[:40]}")

    # Optional JSON dump of the analysis plus per-entry detail.
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump({'analysis': analysis, 'entries': results},
                      f, indent=2, ensure_ascii=False)
        print(f"\nSaved detailed output to {args.output}")
    return 0
# Script entry point: process exit status mirrors main()'s return value.
if __name__ == '__main__':
    sys.exit(main())