#!/usr/bin/env python3
"""
Extract and analyze headers (h1-h6, nav items) from all archived websites.

This script:
1. Extracts all headers from archived HTML files
2. Identifies common patterns across heritage institution websites
3. Generates comparison sets for quality control
4. Helps identify which headers are meaningful vs. generic UI

Usage:
    python scripts/extract_website_headers.py [--limit N] [--output FILE]
"""

import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Optional, Any

try:
    from lxml import etree
    HAS_LXML = True
except ImportError:
    HAS_LXML = False
    print("Error: lxml is required. Install with: pip install lxml")

import yaml

# Root of the enriched-entry YAML files and their archived-website subtree.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'


def extract_headers_from_html(html_content: str) -> Dict[str, Any]:
    """Extract headers, title, meta tags and nav items from HTML content.

    Returns a dict with keys 'h1'..'h6' and 'nav_items' (lists of str) plus
    'title', 'meta_description' and 'og_site_name' (str or None).  Parse
    failures print a warning and return the (possibly partial) structure.
    """
    headers: Dict[str, Any] = {
        'h1': [], 'h2': [], 'h3': [], 'h4': [], 'h5': [], 'h6': [],
        'nav_items': [],
        'title': None,
        'meta_description': None,
        'og_site_name': None,
    }
    try:
        tree = etree.HTML(html_content)
        if tree is None:
            # lxml can return None (rather than raise) for degenerate
            # input such as comment-only documents — TODO confirm exact
            # behavior across lxml versions.  Without this guard the
            # .xpath calls below raise AttributeError, which was being
            # swallowed as a generic parse warning.
            return headers

        # Extract h1-h6 headers
        for level in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            for el in tree.xpath(f'//{level}'):
                text = ''.join(el.itertext()).strip()
                # Skip empty/single-char fragments and absurdly long blobs.
                if text and 1 < len(text) < 500:
                    headers[level].append(text)

        # Extract title
        titles = tree.xpath('//title/text()')
        if titles:
            headers['title'] = titles[0].strip()

        # Extract meta description
        meta_desc = tree.xpath('//meta[@name="description"]/@content')
        if meta_desc:
            headers['meta_description'] = meta_desc[0].strip()

        # Extract og:site_name
        og_name = tree.xpath('//meta[@property="og:site_name"]/@content')
        if og_name:
            headers['og_site_name'] = og_name[0].strip()

        # Extract navigation items (links inside <nav>, plus links under
        # any element whose class mentions "nav").
        nav_links = tree.xpath(
            '//nav//a/text() | //nav//span/text() | '
            '//*[@class[contains(., "nav")]]//a/text()'
        )
        for text in nav_links:
            text = text.strip()
            if text and 1 < len(text) < 100:
                headers['nav_items'].append(text)
    except Exception as e:
        print(f" Warning: Failed to parse HTML: {e}")

    return headers


def find_html_file(archive_path: Path) -> Optional[Path]:
    """Find the main HTML file in an archive directory.

    Prefers the rendered snapshot, then pages/index.html, then any HTML
    file under pages/.  Returns None when nothing is found.
    """
    # Priority order
    candidates = [
        archive_path / 'rendered.html',
        archive_path / 'pages' / 'index.html',
    ]
    for candidate in candidates:
        if candidate.exists():
            return candidate

    # Fallback: find first HTML in pages/
    pages_dir = archive_path / 'pages'
    if pages_dir.exists():
        html_files = list(pages_dir.glob('*.html'))
        if html_files:
            return html_files[0]

    return None


def process_entry(entry_path: Path) -> Optional[Dict]:
    """Process a single entry and extract headers from its website.

    Returns None when the entry has no data, no archive directory, or no
    readable HTML file.
    """
    with open(entry_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    if not data:
        return None

    # Get entry number from filename
    entry_num = entry_path.name.split('_')[0]

    # Find web archive directory.  `or {}` / `or []` guard against keys
    # that are present but explicitly null in the YAML, which would make
    # the chained .get() calls raise AttributeError on None.
    web_enrichment = data.get('web_enrichment') or {}
    web_archives = web_enrichment.get('web_archives') or []
    archive_path = None
    if web_archives:
        directory = web_archives[0].get('directory')
        if directory:
            archive_path = ENTRIES_DIR / directory

    if not archive_path:
        # Fallback: check web/{entry_num}/
        entry_web_dir = WEB_DIR / entry_num
        if entry_web_dir.exists():
            subdirs = [d for d in entry_web_dir.iterdir() if d.is_dir()]
            if subdirs:
                archive_path = subdirs[0]

    if not archive_path or not archive_path.exists():
        return None

    # Find and read HTML file
    html_file = find_html_file(archive_path)
    if not html_file:
        return None
    try:
        with open(html_file, 'r', encoding='utf-8', errors='replace') as f:
            html_content = f.read()
    except Exception as e:
        print(f" Warning: Failed to read {html_file}: {e}")
        return None

    # Extract headers
    headers = extract_headers_from_html(html_content)

    # Names from the other enrichment sources; used downstream to judge
    # whether scraped headers actually identify the institution.
    custodian_name = (data.get('custodian_name') or {}).get('claim_value', '')
    google_name = (data.get('google_maps_enrichment') or {}).get('name', '')
    wikidata = data.get('wikidata_enrichment') or {}
    wikidata_name = wikidata.get('wikidata_label_nl', '')

    return {
        'entry_file': entry_path.name,
        'entry_num': entry_num,
        'custodian_name': custodian_name,
        'google_name': google_name,
        'wikidata_name': wikidata_name,
        'website_url': wikidata.get('wikidata_official_website', ''),
        'headers': headers,
    }


def analyze_patterns(results: List[Dict]) -> Dict:
    """Analyze common patterns across all websites.

    Computes frequency tables for h1/h2/nav/title values (frequent values
    are likely generic UI chrome) and match rates between scraped headers
    and the institution names known from other enrichment sources.
    """
    # Collect all header values
    all_h1 = defaultdict(int)
    all_h2 = defaultdict(int)
    all_nav = defaultdict(int)
    all_titles = defaultdict(int)

    # Compare headers with known names
    h1_matches_name = 0
    title_matches_name = 0
    og_matches_name = 0
    total_with_headers = 0

    for result in results:
        headers = result.get('headers', {})
        known_names = {
            result.get('custodian_name', '').lower(),
            result.get('google_name', '').lower(),
            result.get('wikidata_name', '').lower(),
        }
        known_names.discard('')

        # Count header occurrences
        for h1 in headers.get('h1', []):
            all_h1[h1] += 1
        for h2 in headers.get('h2', []):
            all_h2[h2] += 1
        for nav in headers.get('nav_items', []):
            all_nav[nav] += 1
        title = headers.get('title')
        if title:
            all_titles[title] += 1

        # Check if headers match known names.  A "match" is substring
        # containment in either direction, case-insensitively.
        if headers.get('h1'):
            total_with_headers += 1
            h1_text = headers['h1'][0].lower() if headers['h1'] else ''
            if any(name in h1_text or h1_text in name for name in known_names if name):
                h1_matches_name += 1

        if title:
            title_lower = title.lower()
            if any(name in title_lower or title_lower in name for name in known_names if name):
                title_matches_name += 1

        og_name = headers.get('og_site_name', '')
        if og_name:
            og_lower = og_name.lower()
            if any(name in og_lower or og_lower in name for name in known_names if name):
                og_matches_name += 1

    return {
        'total_entries': len(results),
        'entries_with_headers': total_with_headers,
        # h1 rate is over entries that HAVE an h1; title/og rates are over
        # all entries.
        'h1_name_match_rate': h1_matches_name / total_with_headers if total_with_headers else 0,
        'title_name_match_rate': title_matches_name / len(results) if results else 0,
        'og_name_match_rate': og_matches_name / len(results) if results else 0,
        'common_h1_values': sorted(all_h1.items(), key=lambda x: -x[1])[:50],
        'common_h2_values': sorted(all_h2.items(), key=lambda x: -x[1])[:50],
        'common_nav_items': sorted(all_nav.items(), key=lambda x: -x[1])[:50],
        'common_titles': sorted(all_titles.items(), key=lambda x: -x[1])[:30],
    }


def main() -> int:
    """Entry point: extract headers from all entries and print a summary."""
    parser = argparse.ArgumentParser(description='Extract and analyze website headers')
    parser.add_argument('--limit', type=int, default=None, help='Limit entries to process')
    parser.add_argument('--output', type=str, default=None, help='Output JSON file')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output')
    args = parser.parse_args()

    if not HAS_LXML:
        return 1

    # Find entry files
    files = sorted(f for f in ENTRIES_DIR.glob('*.yaml')
                   if f.is_file() and not f.name.startswith('.'))
    # `is not None` (not truthiness) so `--limit 0` is honoured rather
    # than silently ignored.
    if args.limit is not None:
        files = files[:args.limit]

    print(f"Processing {len(files)} entries...")

    results = []
    processed = 0
    for filepath in files:
        result = process_entry(filepath)
        if result:
            results.append(result)
            processed += 1
            if args.verbose and processed % 100 == 0:
                print(f" Processed {processed} entries with headers")

    print(f"\nExtracted headers from {len(results)} entries")

    # Analyze patterns
    analysis = analyze_patterns(results)

    # Print summary
    print("\n" + "=" * 60)
    print("HEADER ANALYSIS SUMMARY")
    print("=" * 60)
    print(f"Total entries processed: {analysis['total_entries']}")
    print(f"Entries with H1 headers: {analysis['entries_with_headers']}")
    print(f"H1 matches known name: {analysis['h1_name_match_rate']:.1%}")
    print(f"Title matches known name: {analysis['title_name_match_rate']:.1%}")
    print(f"OG:site_name matches known name: {analysis['og_name_match_rate']:.1%}")

    print("\n" + "-" * 40)
    print("MOST COMMON H1 VALUES (likely generic):")
    for value, count in analysis['common_h1_values'][:20]:
        if count > 1:
            print(f" {count:4d}x {value[:60]}")

    print("\n" + "-" * 40)
    print("MOST COMMON NAV ITEMS:")
    for value, count in analysis['common_nav_items'][:20]:
        if count > 5:
            print(f" {count:4d}x {value[:40]}")

    # Save output
    if args.output:
        output_data = {
            'analysis': analysis,
            'entries': results,
        }
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)
        print(f"\nSaved detailed output to {args.output}")

    return 0


if __name__ == '__main__':
    sys.exit(main())