#!/usr/bin/env python3
"""
Archive websites for entries that don't have web_claims yet.

This script:
1. Finds all entries without web_claims
2. Extracts URLs from original_entry, google_maps, or wikidata
3. Fetches and archives using Playwright
4. Updates entries with web archive references

Usage:
    python scripts/archive_missing_websites.py [--limit N] [--dry-run]
"""
|
|
|
|
import argparse
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import yaml

# Optional scraping dependencies.  HAS_DEPS gates main() so the script can
# still start up and print an install hint when they are missing.
try:
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md
    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    print(f"Warning: Missing dependency: {e}")

# Directories
# NOTE(review): absolute, machine-specific path — assumes this exact checkout
# location; consider deriving it from __file__ or an environment variable.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'
|
|
|
|
|
|
def sanitize_dirname(url: str) -> str:
    """Create a filesystem-safe directory name from a URL.

    Takes the hostname (netloc) with a leading ``www.`` stripped, then
    replaces every character that is not a word character, hyphen, or dot
    with an underscore (so a port renders as e.g. ``example.com_8080``).
    """
    parsed = urlparse(url)
    # removeprefix only drops a *leading* "www." — the previous
    # str.replace('www.', '') also mangled hosts containing "www."
    # mid-string (e.g. "awww.example.com" became "aexample.com").
    name = parsed.netloc.removeprefix('www.')
    return re.sub(r'[^\w\-.]', '_', name)
|
|
|
|
|
|
def get_url_from_entry(data: dict) -> str | None:
|
|
"""Extract URL from entry, trying multiple sources."""
|
|
# Check original_entry.url
|
|
url = data.get('original_entry', {}).get('url')
|
|
|
|
# Check website_url
|
|
if not url:
|
|
url = data.get('website_url')
|
|
|
|
# Check google_maps_enrichment.website
|
|
if not url:
|
|
gm = data.get('google_maps_enrichment', {})
|
|
url = gm.get('website')
|
|
|
|
# Check wikidata_enrichment.official_website
|
|
if not url:
|
|
wiki = data.get('wikidata_enrichment', {})
|
|
url = wiki.get('official_website')
|
|
|
|
# Validate URL
|
|
if url:
|
|
url = url.strip()
|
|
if not url.startswith(('http://', 'https://')):
|
|
url = 'https://' + url
|
|
|
|
return url
|
|
|
|
|
|
def clean_html_for_markdown(html: str) -> str:
    """Strip non-content elements from HTML prior to markdown conversion.

    Removes scripts, styles, chrome (nav/header/footer/aside), form
    controls, embeds, and head-only tags so only readable page content
    survives the conversion.
    """
    strip_tags = ['script', 'style', 'nav', 'footer', 'header',
                  'aside', 'form', 'iframe', 'noscript', 'svg',
                  'button', 'input', 'select', 'textarea', 'meta',
                  'link']

    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(strip_tags):
        tag.decompose()

    return str(soup)
|
|
|
|
|
|
def fetch_with_playwright(url: str, timeout: int = 30000) -> dict:
    """Fetch *url* in headless Chromium and return content plus status.

    Args:
        url: Fully-qualified URL to load.
        timeout: Navigation timeout in milliseconds (default 30s).

    Returns:
        Dict with keys: ``url``, ``fetch_timestamp`` (UTC ISO-8601),
        ``raw_html`` (page content right after load), ``rendered_html``
        (content after an extra 2s settle), ``markdown`` (cleaned and
        converted rendered_html), and ``error`` (None on success,
        otherwise a short description).
    """
    result = {
        'url': url,
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'raw_html': None,
        'rendered_html': None,
        'markdown': None,
        'error': None
    }

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    viewport={'width': 1920, 'height': 1080}
                )
                page = context.new_page()

                response = page.goto(url, wait_until='networkidle', timeout=timeout)

                if not response or response.status >= 400:
                    result['error'] = f"HTTP {response.status if response else 'No response'}"
                    return result

                result['raw_html'] = page.content()
                # Give late-running JS (lazy images, widgets) a moment to settle.
                page.wait_for_timeout(2000)
                result['rendered_html'] = page.content()

                # Convert the rendered DOM to markdown, collapsing runs of
                # blank lines left behind by decomposed elements.
                cleaned = clean_html_for_markdown(result['rendered_html'])
                markdown = md(cleaned, heading_style='atx', bullets='-')
                markdown = re.sub(r'\n{3,}', '\n\n', markdown)
                result['markdown'] = markdown.strip()
            finally:
                # Close on every path — the original duplicated close() calls
                # and leaked the browser on timeouts/exceptions until
                # sync_playwright itself exited.
                browser.close()

    except PlaywrightTimeout:
        result['error'] = f"Timeout loading {url}"
    except Exception as e:
        result['error'] = f"Error: {str(e)}"

    return result
|
|
|
|
|
|
def archive_entry(filepath: Path, dry_run: bool = False) -> tuple[bool, str]:
    """Archive the website for a single entry YAML file.

    Reads the entry, resolves its URL, fetches the site with Playwright,
    writes raw/rendered HTML + markdown + metadata under
    ``WEB_DIR/<entry-number>/<sanitized-host>/``, and records the archive
    location back on the entry.

    Args:
        filepath: Path to the entry's YAML file.
        dry_run: When True, report what would be fetched without fetching.

    Returns:
        ``(success, message)`` — success is True when the site was archived
        (or would have been, in dry-run mode); the message says what
        happened or why the entry was skipped.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return False, "Empty file"

    # Skip if already has web_claims — "or {}" tolerates an explicit null
    # web_claims value in the YAML (plain .get(..., {}) would crash).
    if (data.get('web_claims') or {}).get('claims'):
        return False, "Already has web_claims"

    url = get_url_from_entry(data)
    if not url:
        return False, "No URL found"

    # Archive layout: WEB_DIR/<entry-number>/<sanitized-host>/
    entry_num = filepath.name.split('_')[0]
    dirname = sanitize_dirname(url)
    url_dir = WEB_DIR / entry_num / dirname

    # Skip if already archived (rendered.html is written last of the HTML
    # pair, so its presence marks a completed fetch).
    if (url_dir / 'rendered.html').exists():
        return False, "Already archived"

    if dry_run:
        return True, f"Would fetch: {url}"

    result = fetch_with_playwright(url)

    if result['error']:
        return False, result['error']

    if not result['rendered_html']:
        return False, "No content"

    url_dir.mkdir(parents=True, exist_ok=True)

    # Save raw HTML (pre-settle snapshot).
    with open(url_dir / 'index.html', 'w', encoding='utf-8') as f:
        f.write(result['raw_html'] or '')

    # Save rendered HTML (post-settle snapshot).
    with open(url_dir / 'rendered.html', 'w', encoding='utf-8') as f:
        f.write(result['rendered_html'])

    # Save markdown with a small YAML front-matter header.
    md_header = f"""---
source_url: {url}
fetch_timestamp: {result['fetch_timestamp']}
entry_file: {filepath.name}
---

"""
    with open(url_dir / 'content.md', 'w', encoding='utf-8') as f:
        f.write(md_header + (result['markdown'] or ''))

    # Save metadata describing the archived files.
    metadata = {
        'url': url,
        'fetch_timestamp': result['fetch_timestamp'],
        'entry_file': filepath.name,
        'files': {
            'raw_html': 'index.html',
            'rendered_html': 'rendered.html',
            'markdown': 'content.md'
        }
    }
    with open(url_dir / 'metadata.yaml', 'w', encoding='utf-8') as f:
        yaml.dump(metadata, f, default_flow_style=False, allow_unicode=True)

    # Record the archive location back on the entry itself.  The isinstance
    # check also repairs a web_enrichment key holding an explicit null.
    if not isinstance(data.get('web_enrichment'), dict):
        data['web_enrichment'] = {}

    data['web_enrichment']['web_archives'] = [{
        'url': url,
        'directory': f'web/{entry_num}/{dirname}'
    }]
    data['web_enrichment']['web_archive_timestamp'] = result['fetch_timestamp']

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return True, f"Archived: {url}"
|
|
|
|
|
|
def main():
    """CLI entry point: find entries lacking web_claims and archive their sites.

    Returns a process exit code (0 on completion, 1 when scraping
    dependencies are missing).
    """
    parser = argparse.ArgumentParser(description='Archive missing websites')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--start', type=int, default=0, help='Start from entry index')
    args = parser.parse_args()

    if not HAS_DEPS:
        print("Error: Required dependencies not installed.")
        # beautifulsoup4 was missing from the original hint even though
        # bs4 is imported in the dependency check above.
        print("Run: pip install playwright beautifulsoup4 markdownify && playwright install chromium")
        return 1

    # Pass 1: collect entries that have a URL but no web_claims yet.
    print("Finding entries without web_claims...")
    entries_to_process = []

    for filepath in sorted(ENTRIES_DIR.glob('*.yaml')):
        if filepath.is_dir():
            continue

        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            continue

        # "or {}" tolerates an explicit null web_claims in the YAML.
        if not (data.get('web_claims') or {}).get('claims'):
            if get_url_from_entry(data):
                entries_to_process.append(filepath)

    print(f"Found {len(entries_to_process)} entries without web_claims that have URLs")

    # Apply start offset and limit for resumable / batched runs.
    if args.start:
        entries_to_process = entries_to_process[args.start:]
    if args.limit:
        entries_to_process = entries_to_process[:args.limit]

    print(f"Processing {len(entries_to_process)} entries...")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    archived = 0
    failed = 0
    skipped = 0

    for i, filepath in enumerate(entries_to_process):
        print(f"[{i+1}/{len(entries_to_process)}] {filepath.name}")

        success, message = archive_entry(filepath, dry_run=args.dry_run)

        if success:
            archived += 1
            print(f" ✓ {message}")
        elif 'Already' in message:
            # archive_entry signals skips with messages starting "Already ...".
            skipped += 1
            print(f" - {message}")
        else:
            failed += 1
            print(f" ✗ {message}")

        # Rate limiting: be polite, 2 seconds between live fetches.
        if success and not args.dry_run:
            time.sleep(2)

    print()
    print(f"{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f" Archived: {archived}")
    print(f" Failed: {failed}")
    print(f" Skipped: {skipped}")

    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # Equivalent to sys.exit(main()): propagate main()'s return value as
    # the process exit status.
    raise SystemExit(main())
|