#!/usr/bin/env python3
"""
Archive websites for entries that don't have web_claims yet.

This script:
1. Finds all entries without web_claims
2. Extracts URLs from original_entry, google_maps, or wikidata
3. Fetches and archives using Playwright
4. Updates entries with web archive references

Usage:
    python scripts/archive_missing_websites.py [--limit N] [--dry-run]
"""

import argparse
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import yaml

try:
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md
    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    print(f"Warning: Missing dependency: {e}")

# Directories
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'


def sanitize_dirname(url: str) -> str:
    """Create a filesystem-safe directory name from a URL's host.

    Drops a leading 'www.' and replaces any character outside
    [A-Za-z0-9_\\-.] with '_'.
    """
    parsed = urlparse(url)
    name = parsed.netloc.replace('www.', '')
    name = re.sub(r'[^\w\-.]', '_', name)
    return name


def get_url_from_entry(data: dict) -> str | None:
    """Extract a website URL from an entry, trying multiple sources.

    Priority order: original_entry.url, website_url,
    google_maps_enrichment.website, wikidata_enrichment.official_website.
    A bare domain is prefixed with 'https://'. Returns None if no URL
    is found.
    """
    # `or {}` (rather than .get(key, {})) guards against keys that are
    # present in the YAML but hold null — .get() would return None then.
    url = (data.get('original_entry') or {}).get('url')

    # Check website_url
    if not url:
        url = data.get('website_url')

    # Check google_maps_enrichment.website
    if not url:
        url = (data.get('google_maps_enrichment') or {}).get('website')

    # Check wikidata_enrichment.official_website
    if not url:
        url = (data.get('wikidata_enrichment') or {}).get('official_website')

    # Normalize: strip whitespace and ensure a scheme.
    if url:
        url = url.strip()
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

    return url


def clean_html_for_markdown(html: str) -> str:
    """Strip scripts, styles and page chrome from HTML before markdown conversion."""
    soup = BeautifulSoup(html, 'html.parser')

    # Remove elements that carry no article content (scripts, nav, forms, ...).
    for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                  'aside', 'form', 'iframe', 'noscript', 'svg',
                                  'button', 'input', 'select', 'textarea',
                                  'meta', 'link']):
        element.decompose()

    return str(soup)


def fetch_with_playwright(url: str, timeout: int = 30000) -> dict:
    """Fetch a URL with headless Chromium and convert the result to markdown.

    Args:
        url: Fully-qualified URL to fetch.
        timeout: Navigation timeout in milliseconds.

    Returns:
        A dict with keys url, fetch_timestamp, raw_html, rendered_html,
        markdown, and error (None on success, a message string otherwise).
    """
    result = {
        'url': url,
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'raw_html': None,
        'rendered_html': None,
        'markdown': None,
        'error': None
    }

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    viewport={'width': 1920, 'height': 1080}
                )
                page = context.new_page()

                response = page.goto(url, wait_until='networkidle', timeout=timeout)
                if not response or response.status >= 400:
                    result['error'] = f"HTTP {response.status if response else 'No response'}"
                    return result

                result['raw_html'] = page.content()

                # Give late-running JS a moment, then capture the settled DOM.
                page.wait_for_timeout(2000)
                result['rendered_html'] = page.content()

                # Convert to markdown
                cleaned = clean_html_for_markdown(result['rendered_html'])
                markdown = md(cleaned, heading_style='atx', bullets='-')
                markdown = re.sub(r'\n{3,}', '\n\n', markdown)
                result['markdown'] = markdown.strip()
            finally:
                # Close even when goto/content raises (e.g. timeout) so a
                # Chromium process is never left running.
                browser.close()

    except PlaywrightTimeout:
        result['error'] = f"Timeout loading {url}"
    except Exception as e:
        result['error'] = f"Error: {str(e)}"

    return result


def archive_entry(filepath: Path, dry_run: bool = False) -> tuple[bool, str]:
    """Archive the website for a single entry YAML file.

    Fetches the entry's URL, writes index.html / rendered.html /
    content.md / metadata.yaml under WEB_DIR, and records a
    web_enrichment reference back into the entry file.

    Args:
        filepath: Path to the entry YAML file.
        dry_run: If True, report what would be fetched without fetching.

    Returns:
        (success, message) — success is True when the entry was (or would
        be) archived; the message explains skips and failures.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return False, "Empty file"

    # Skip if already has web_claims (`or {}` tolerates a null web_claims key).
    if (data.get('web_claims') or {}).get('claims'):
        return False, "Already has web_claims"

    # Get URL
    url = get_url_from_entry(data)
    if not url:
        return False, "No URL found"

    # Archive layout: WEB_DIR/<entry number>/<sanitized host>/
    entry_num = filepath.name.split('_')[0]
    dirname = sanitize_dirname(url)
    url_dir = WEB_DIR / entry_num / dirname

    # Skip if already archived
    if (url_dir / 'rendered.html').exists():
        return False, "Already archived"

    if dry_run:
        return True, f"Would fetch: {url}"

    # Fetch the website
    result = fetch_with_playwright(url)

    if result['error']:
        return False, result['error']

    if not result['rendered_html']:
        return False, "No content"

    # Create directory and save files
    url_dir.mkdir(parents=True, exist_ok=True)

    # Save raw HTML
    with open(url_dir / 'index.html', 'w', encoding='utf-8') as f:
        f.write(result['raw_html'])

    # Save rendered HTML
    with open(url_dir / 'rendered.html', 'w', encoding='utf-8') as f:
        f.write(result['rendered_html'])

    # Save markdown with a YAML front-matter header
    md_header = f"""---
source_url: {url}
fetch_timestamp: {result['fetch_timestamp']}
entry_file: {filepath.name}
---

"""
    with open(url_dir / 'content.md', 'w', encoding='utf-8') as f:
        f.write(md_header + (result['markdown'] or ''))

    # Save metadata
    metadata = {
        'url': url,
        'fetch_timestamp': result['fetch_timestamp'],
        'entry_file': filepath.name,
        'files': {
            'raw_html': 'index.html',
            'rendered_html': 'rendered.html',
            'markdown': 'content.md'
        }
    }
    with open(url_dir / 'metadata.yaml', 'w', encoding='utf-8') as f:
        yaml.dump(metadata, f, default_flow_style=False, allow_unicode=True)

    # Update entry with web archive reference
    if 'web_enrichment' not in data:
        data['web_enrichment'] = {}

    data['web_enrichment']['web_archives'] = [{
        'url': url,
        'directory': f'web/{entry_num}/{dirname}'
    }]
    data['web_enrichment']['web_archive_timestamp'] = result['fetch_timestamp']

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                  sort_keys=False)

    return True, f"Archived: {url}"


def main():
    """CLI entry point: scan entries, archive missing websites, print a summary."""
    parser = argparse.ArgumentParser(description='Archive missing websites')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of entries')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done')
    parser.add_argument('--start', type=int, default=0,
                        help='Start from entry index')
    args = parser.parse_args()

    if not HAS_DEPS:
        print("Error: Required dependencies not installed.")
        # List everything the script actually imports, not just playwright.
        print("Run: pip install playwright beautifulsoup4 markdownify pyyaml && playwright install chromium")
        return 1

    # Find entries without web_claims
    print("Finding entries without web_claims...")
    entries_to_process = []

    for filepath in sorted(ENTRIES_DIR.glob('*.yaml')):
        if filepath.is_dir():
            continue
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        if not data:
            continue
        # Check for web_claims (null-safe: web_claims may be present but null)
        if not (data.get('web_claims') or {}).get('claims'):
            url = get_url_from_entry(data)
            if url:
                entries_to_process.append(filepath)

    print(f"Found {len(entries_to_process)} entries without web_claims that have URLs")

    # Apply start and limit
    if args.start:
        entries_to_process = entries_to_process[args.start:]
    if args.limit:
        entries_to_process = entries_to_process[:args.limit]

    print(f"Processing {len(entries_to_process)} entries...")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    archived = 0
    failed = 0
    skipped = 0

    for i, filepath in enumerate(entries_to_process):
        print(f"[{i+1}/{len(entries_to_process)}] {filepath.name}")

        success, message = archive_entry(filepath, dry_run=args.dry_run)

        if success:
            archived += 1
            print(f"  ✓ {message}")
        elif 'Already' in message:
            skipped += 1
            print(f"  - {message}")
        else:
            failed += 1
            print(f"  ✗ {message}")

        # Rate limiting (2 seconds between requests)
        if success and not args.dry_run:
            time.sleep(2)

    print()
    print(f"{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f"  Archived: {archived}")
    print(f"  Failed: {failed}")
    print(f"  Skipped: {skipped}")

    return 0


if __name__ == '__main__':
    sys.exit(main())