315 lines
10 KiB
Python
315 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Extract and analyze headers (h1-h6, nav items) from all archived websites.
|
|
|
|
This script:
|
|
1. Extracts all headers from archived HTML files
|
|
2. Identifies common patterns across heritage institution websites
|
|
3. Generates comparison sets for quality control
|
|
4. Helps identify which headers are meaningful vs. generic UI
|
|
|
|
Usage:
|
|
python scripts/extract_website_headers.py [--limit N] [--output FILE]
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from typing import Dict, List, Set, Optional, Any
|
|
|
|
try:
|
|
from lxml import etree
|
|
HAS_LXML = True
|
|
except ImportError:
|
|
HAS_LXML = False
|
|
print("Error: lxml is required. Install with: pip install lxml")
|
|
|
|
import yaml
|
|
|
|
|
|
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
|
|
WEB_DIR = ENTRIES_DIR / 'web'
|
|
|
|
|
|
def extract_headers_from_html(html_content: str) -> Dict[str, List[str]]:
|
|
"""Extract all headers from HTML content."""
|
|
headers = {
|
|
'h1': [],
|
|
'h2': [],
|
|
'h3': [],
|
|
'h4': [],
|
|
'h5': [],
|
|
'h6': [],
|
|
'nav_items': [],
|
|
'title': None,
|
|
'meta_description': None,
|
|
'og_site_name': None,
|
|
}
|
|
|
|
try:
|
|
tree = etree.HTML(html_content)
|
|
|
|
# Extract h1-h6 headers
|
|
for level in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
|
elements = tree.xpath(f'//{level}')
|
|
for el in elements:
|
|
text = ''.join(el.itertext()).strip()
|
|
if text and len(text) > 1 and len(text) < 500:
|
|
headers[level].append(text)
|
|
|
|
# Extract title
|
|
titles = tree.xpath('//title/text()')
|
|
if titles:
|
|
headers['title'] = titles[0].strip()
|
|
|
|
# Extract meta description
|
|
meta_desc = tree.xpath('//meta[@name="description"]/@content')
|
|
if meta_desc:
|
|
headers['meta_description'] = meta_desc[0].strip()
|
|
|
|
# Extract og:site_name
|
|
og_name = tree.xpath('//meta[@property="og:site_name"]/@content')
|
|
if og_name:
|
|
headers['og_site_name'] = og_name[0].strip()
|
|
|
|
# Extract navigation items
|
|
nav_links = tree.xpath('//nav//a/text() | //nav//span/text() | //*[@class[contains(., "nav")]]//a/text()')
|
|
for text in nav_links:
|
|
text = text.strip()
|
|
if text and len(text) > 1 and len(text) < 100:
|
|
headers['nav_items'].append(text)
|
|
|
|
except Exception as e:
|
|
print(f" Warning: Failed to parse HTML: {e}")
|
|
|
|
return headers
|
|
|
|
|
|
def find_html_file(archive_path: Path) -> Optional[Path]:
|
|
"""Find the main HTML file in an archive directory."""
|
|
# Priority order
|
|
candidates = [
|
|
archive_path / 'rendered.html',
|
|
archive_path / 'pages' / 'index.html',
|
|
]
|
|
|
|
for candidate in candidates:
|
|
if candidate.exists():
|
|
return candidate
|
|
|
|
# Fallback: find first HTML in pages/
|
|
pages_dir = archive_path / 'pages'
|
|
if pages_dir.exists():
|
|
html_files = list(pages_dir.glob('*.html'))
|
|
if html_files:
|
|
return html_files[0]
|
|
|
|
return None
|
|
|
|
|
|
def process_entry(entry_path: Path) -> Optional[Dict]:
|
|
"""Process a single entry and extract headers from its website."""
|
|
with open(entry_path, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
if not data:
|
|
return None
|
|
|
|
# Get entry number from filename
|
|
entry_num = entry_path.name.split('_')[0]
|
|
|
|
# Find web archive directory
|
|
web_enrichment = data.get('web_enrichment', {})
|
|
web_archives = web_enrichment.get('web_archives', [])
|
|
|
|
archive_path = None
|
|
if web_archives:
|
|
archive = web_archives[0]
|
|
directory = archive.get('directory')
|
|
if directory:
|
|
archive_path = ENTRIES_DIR / directory
|
|
|
|
if not archive_path:
|
|
# Fallback: check web/{entry_num}/
|
|
entry_web_dir = WEB_DIR / entry_num
|
|
if entry_web_dir.exists():
|
|
subdirs = [d for d in entry_web_dir.iterdir() if d.is_dir()]
|
|
if subdirs:
|
|
archive_path = subdirs[0]
|
|
|
|
if not archive_path or not archive_path.exists():
|
|
return None
|
|
|
|
# Find and read HTML file
|
|
html_file = find_html_file(archive_path)
|
|
if not html_file:
|
|
return None
|
|
|
|
try:
|
|
with open(html_file, 'r', encoding='utf-8', errors='replace') as f:
|
|
html_content = f.read()
|
|
except Exception as e:
|
|
print(f" Warning: Failed to read {html_file}: {e}")
|
|
return None
|
|
|
|
# Extract headers
|
|
headers = extract_headers_from_html(html_content)
|
|
|
|
# Get custodian name for comparison
|
|
custodian_name = data.get('custodian_name', {}).get('claim_value', '')
|
|
google_name = data.get('google_maps_enrichment', {}).get('name', '')
|
|
wikidata_name = data.get('wikidata_enrichment', {}).get('wikidata_label_nl', '')
|
|
|
|
return {
|
|
'entry_file': entry_path.name,
|
|
'entry_num': entry_num,
|
|
'custodian_name': custodian_name,
|
|
'google_name': google_name,
|
|
'wikidata_name': wikidata_name,
|
|
'website_url': data.get('wikidata_enrichment', {}).get('wikidata_official_website', ''),
|
|
'headers': headers,
|
|
}
|
|
|
|
|
|
def analyze_patterns(results: List[Dict]) -> Dict:
|
|
"""Analyze common patterns across all websites."""
|
|
|
|
# Collect all header values
|
|
all_h1 = defaultdict(int)
|
|
all_h2 = defaultdict(int)
|
|
all_nav = defaultdict(int)
|
|
all_titles = defaultdict(int)
|
|
|
|
# Compare headers with known names
|
|
h1_matches_name = 0
|
|
title_matches_name = 0
|
|
og_matches_name = 0
|
|
total_with_headers = 0
|
|
|
|
for result in results:
|
|
headers = result.get('headers', {})
|
|
known_names = {
|
|
result.get('custodian_name', '').lower(),
|
|
result.get('google_name', '').lower(),
|
|
result.get('wikidata_name', '').lower(),
|
|
}
|
|
known_names.discard('')
|
|
|
|
# Count header occurrences
|
|
for h1 in headers.get('h1', []):
|
|
all_h1[h1] += 1
|
|
|
|
for h2 in headers.get('h2', []):
|
|
all_h2[h2] += 1
|
|
|
|
for nav in headers.get('nav_items', []):
|
|
all_nav[nav] += 1
|
|
|
|
title = headers.get('title')
|
|
if title:
|
|
all_titles[title] += 1
|
|
|
|
# Check if headers match known names
|
|
if headers.get('h1'):
|
|
total_with_headers += 1
|
|
h1_text = headers['h1'][0].lower() if headers['h1'] else ''
|
|
if any(name in h1_text or h1_text in name for name in known_names if name):
|
|
h1_matches_name += 1
|
|
|
|
if title:
|
|
title_lower = title.lower()
|
|
if any(name in title_lower or title_lower in name for name in known_names if name):
|
|
title_matches_name += 1
|
|
|
|
og_name = headers.get('og_site_name', '')
|
|
if og_name:
|
|
og_lower = og_name.lower()
|
|
if any(name in og_lower or og_lower in name for name in known_names if name):
|
|
og_matches_name += 1
|
|
|
|
return {
|
|
'total_entries': len(results),
|
|
'entries_with_headers': total_with_headers,
|
|
'h1_name_match_rate': h1_matches_name / total_with_headers if total_with_headers else 0,
|
|
'title_name_match_rate': title_matches_name / len(results) if results else 0,
|
|
'og_name_match_rate': og_matches_name / len(results) if results else 0,
|
|
'common_h1_values': sorted(all_h1.items(), key=lambda x: -x[1])[:50],
|
|
'common_h2_values': sorted(all_h2.items(), key=lambda x: -x[1])[:50],
|
|
'common_nav_items': sorted(all_nav.items(), key=lambda x: -x[1])[:50],
|
|
'common_titles': sorted(all_titles.items(), key=lambda x: -x[1])[:30],
|
|
}
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description='Extract and analyze website headers')
|
|
parser.add_argument('--limit', type=int, default=None, help='Limit entries to process')
|
|
parser.add_argument('--output', type=str, default=None, help='Output JSON file')
|
|
parser.add_argument('--verbose', action='store_true', help='Show detailed output')
|
|
args = parser.parse_args()
|
|
|
|
if not HAS_LXML:
|
|
return 1
|
|
|
|
# Find entry files
|
|
files = sorted([f for f in ENTRIES_DIR.glob('*.yaml') if f.is_file() and not f.name.startswith('.')])
|
|
|
|
if args.limit:
|
|
files = files[:args.limit]
|
|
|
|
print(f"Processing {len(files)} entries...")
|
|
|
|
results = []
|
|
processed = 0
|
|
|
|
for filepath in files:
|
|
result = process_entry(filepath)
|
|
if result:
|
|
results.append(result)
|
|
processed += 1
|
|
if args.verbose and processed % 100 == 0:
|
|
print(f" Processed {processed} entries with headers")
|
|
|
|
print(f"\nExtracted headers from {len(results)} entries")
|
|
|
|
# Analyze patterns
|
|
analysis = analyze_patterns(results)
|
|
|
|
# Print summary
|
|
print("\n" + "=" * 60)
|
|
print("HEADER ANALYSIS SUMMARY")
|
|
print("=" * 60)
|
|
print(f"Total entries processed: {analysis['total_entries']}")
|
|
print(f"Entries with H1 headers: {analysis['entries_with_headers']}")
|
|
print(f"H1 matches known name: {analysis['h1_name_match_rate']:.1%}")
|
|
print(f"Title matches known name: {analysis['title_name_match_rate']:.1%}")
|
|
print(f"OG:site_name matches known name: {analysis['og_name_match_rate']:.1%}")
|
|
|
|
print("\n" + "-" * 40)
|
|
print("MOST COMMON H1 VALUES (likely generic):")
|
|
for value, count in analysis['common_h1_values'][:20]:
|
|
if count > 1:
|
|
print(f" {count:4d}x {value[:60]}")
|
|
|
|
print("\n" + "-" * 40)
|
|
print("MOST COMMON NAV ITEMS:")
|
|
for value, count in analysis['common_nav_items'][:20]:
|
|
if count > 5:
|
|
print(f" {count:4d}x {value[:40]}")
|
|
|
|
# Save output
|
|
if args.output:
|
|
output_data = {
|
|
'analysis': analysis,
|
|
'entries': results,
|
|
}
|
|
with open(args.output, 'w', encoding='utf-8') as f:
|
|
json.dump(output_data, f, indent=2, ensure_ascii=False)
|
|
print(f"\nSaved detailed output to {args.output}")
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == '__main__':
|
|
sys.exit(main())
|