#!/usr/bin/env python3
|
|
"""
|
|
Retry archiving for entries that previously failed.
|
|
|
|
Uses longer timeouts and different strategies for different failure types.
|
|
|
|
Usage:
|
|
python scripts/retry_archive_failures.py [--dry-run]
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse, unquote
|
|
|
|
import yaml
|
|
|
|
# Heavy third-party dependencies are imported lazily-guarded so the script
# can report a readable error instead of crashing at import time; main()
# checks HAS_DEPS and prints MISSING_DEP before doing any work.
try:
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md
    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    # Keep the import error text so main() can show which package is missing.
    MISSING_DEP = str(e)
|
|
|
|
# NOTE(review): hard-coded, machine-specific absolute path — assumes this
# exact checkout location; consider an env var or CLI flag.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
# Archived snapshots are written under web/<entry_num>/<sanitized-host>/.
WEB_DIR = ENTRIES_DIR / 'web'

# URLs that are definitely not archivable (skip these)
SKIP_URLS = {
    "http://www.aflegverenigingrachel@gmail.com",  # Email, not URL
    "http://Schuttersbroederschap%20St.%20Sebastianus",  # Invalid URL
    "http://marcdenelzen.nl/assets/ontwijken-of-samenleven-(column-74).pdf",  # PDF on dead domain
}
|
|
|
|
|
|
def sanitize_dirname(url: str) -> str:
    """Derive a filesystem-safe directory name from *url*.

    Keeps the URL's network location with any 'www.' removed, replaces
    every character outside [A-Za-z0-9_.-] with an underscore, and caps
    the result at 50 characters.
    """
    host = urlparse(url).netloc.replace('www.', '')
    safe = re.sub(r'[^\w\-.]', '_', host)
    return safe[:50]
|
|
|
|
|
|
def clean_html_for_markdown(html: str) -> str:
    """Strip non-content elements from *html* before markdown conversion.

    Scripts, styles, page chrome (nav/header/footer/aside), embedded
    frames, SVG graphics, and form controls are removed in place; the
    remaining document is returned as an HTML string.
    """
    noise_tags = ['script', 'style', 'nav', 'footer', 'header',
                  'aside', 'form', 'iframe', 'noscript', 'svg',
                  'button', 'input', 'select', 'textarea']
    soup = BeautifulSoup(html, 'html.parser')
    for node in soup.find_all(noise_tags):
        node.decompose()
    return str(soup)
|
|
|
|
|
|
def fetch_with_playwright(url: str, browser, timeout: int = 60000) -> dict:
    """Fetch *url* in a fresh browser context and convert it to markdown.

    Args:
        url: Address to load.
        browser: A running Playwright browser instance.
        timeout: Navigation timeout in milliseconds (default 60s — longer
            than a first-pass archiver, since these URLs already failed once).

    Returns:
        dict with keys 'url', 'fetch_timestamp', 'raw_html',
        'rendered_html', 'markdown', and 'error'. On failure 'error'
        holds a short description and the content keys stay None.
    """
    result = {
        'url': url,
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'raw_html': None,
        'rendered_html': None,
        'markdown': None,
        'error': None
    }

    context = None
    try:
        context = browser.new_context(
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            viewport={'width': 1920, 'height': 1080},
            ignore_https_errors=True  # Ignore SSL errors
        )
        page = context.new_page()

        # Try with longer timeout; only wait for the DOM, not full load.
        response = page.goto(url, wait_until='domcontentloaded', timeout=timeout)

        if not response:
            result['error'] = "No response"
            return result

        if response.status >= 400:
            result['error'] = f"HTTP {response.status}"
            return result

        # Wait for page to settle (JS-driven pages need a moment)
        page.wait_for_timeout(3000)

        # BUG FIX: the original called page.content() twice; both fields
        # receive the same post-render DOM serialization, so capture once.
        html = page.content()
        result['raw_html'] = html
        result['rendered_html'] = html

        # Convert to markdown, then collapse runs of blank lines.
        cleaned = clean_html_for_markdown(html)
        markdown = md(cleaned, heading_style='atx', bullets='-')
        markdown = re.sub(r'\n{3,}', '\n\n', markdown)
        result['markdown'] = markdown.strip()

    except PlaywrightTimeout:
        result['error'] = "Timeout"
    except Exception as e:
        result['error'] = str(e)[:100]
    finally:
        # BUG FIX: the original only closed the context on success and on
        # the explicit no-response/HTTP-error returns — a timeout or any
        # other exception leaked the browser context. Always close it.
        if context is not None:
            context.close()

    return result
|
|
|
|
|
|
def archive_url(url: str, entry_num: str, browser) -> tuple[bool, str, dict | None]:
    """Archive one URL under WEB_DIR/<entry_num>/<sanitized-host>/.

    Returns a (success, status message, archive record or None) tuple.
    """
    # Known-bad URLs (see SKIP_URLS) are never retried.
    if url in SKIP_URLS or unquote(url) in SKIP_URLS:
        return False, "Permanently invalid URL", None

    dirname = sanitize_dirname(url)
    url_dir = WEB_DIR / entry_num / dirname

    # A rendered.html already on disk means a previous run succeeded.
    if (url_dir / 'rendered.html').exists():
        return False, "Already archived", None

    result = fetch_with_playwright(url, browser)

    if result['error']:
        return False, result['error'], None
    if not result['rendered_html']:
        return False, "No content", None

    # Persist the raw HTML, rendered HTML, and markdown extraction.
    url_dir.mkdir(parents=True, exist_ok=True)
    (url_dir / 'index.html').write_text(result['raw_html'], encoding='utf-8')
    (url_dir / 'rendered.html').write_text(result['rendered_html'], encoding='utf-8')

    frontmatter = f"---\nsource_url: {url}\nfetch_timestamp: {result['fetch_timestamp']}\n---\n\n"
    (url_dir / 'content.md').write_text(frontmatter + (result['markdown'] or ''), encoding='utf-8')

    return True, "Success", {
        'url': url,
        'directory': f'web/{entry_num}/{dirname}',
        'archive_timestamp': result['fetch_timestamp'],
        'status': 'retry_success'
    }
|
|
|
|
|
|
def process_entry(filepath: Path, browser) -> tuple[int, int, list]:
    """Retry every recorded archive failure for one entry YAML file.

    Successful retries are appended to web_enrichment.web_archives
    (deduplicated by normalized URL); persisting failures stay in
    web_enrichment.archive_failures annotated with retry metadata. The
    file is rewritten in place when anything changed.

    Returns (archived count, still-failed count, new archive records).
    """
    with open(filepath, 'r', encoding='utf-8') as fh:
        data = yaml.safe_load(fh)

    if not data:
        return 0, 0, []

    failures = data.get('web_enrichment', {}).get('archive_failures', [])
    if not failures:
        return 0, 0, []

    # Entry number is the prefix of the filename, e.g. "0042_name.yaml".
    entry_num = filepath.name.split('_')[0]

    archived = 0
    still_failed = 0
    new_archives = []
    remaining_failures = []

    for record in failures:
        url = record.get('url', '')
        if not url:
            continue

        print(f" Retrying: {url[:60]}...")
        ok, status, info = archive_url(url, entry_num, browser)

        if ok:
            archived += 1
            print(f" ✓ {status}")
            if info:
                new_archives.append(info)
        else:
            still_failed += 1
            print(f" ✗ {status}")
            # Annotate the record so later runs can see the last attempt.
            record['last_retry'] = datetime.now(timezone.utc).isoformat()
            record['retry_result'] = status
            remaining_failures.append(record)

    if new_archives or remaining_failures:
        enrichment = data.setdefault('web_enrichment', {})

        # Merge successful archives, skipping URLs already recorded
        # (comparison is case-insensitive and ignores a trailing slash).
        if new_archives:
            existing = enrichment.get('web_archives', [])
            seen = {a.get('url', '').lower().rstrip('/') for a in existing}
            for archive in new_archives:
                if archive['url'].lower().rstrip('/') not in seen:
                    existing.append(archive)
            enrichment['web_archives'] = existing

        # Only still-failing URLs remain in the failure list.
        enrichment['archive_failures'] = remaining_failures
        enrichment['retry_timestamp'] = datetime.now(timezone.utc).isoformat()

        with open(filepath, 'w', encoding='utf-8') as fh:
            yaml.dump(data, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return archived, still_failed, new_archives
|
|
|
|
|
|
def main():
    """Locate entries with recorded archive failures and retry each one."""
    parser = argparse.ArgumentParser(description='Retry failed archives')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    args = parser.parse_args()

    # Bail out early with the captured import error if deps are missing.
    if not HAS_DEPS:
        print(f"Error: {MISSING_DEP}")
        return 1

    # Scan every entry file for recorded archive failures.
    print("Finding entries with archive failures...")
    candidates = []
    for path in sorted(ENTRIES_DIR.glob('*.yaml')):
        if not path.is_file():
            continue
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                entry = yaml.safe_load(fh)
            if entry and entry.get('web_enrichment', {}).get('archive_failures'):
                candidates.append(path)
        except Exception:
            # Unreadable/malformed entries are simply skipped.
            continue

    print(f"Found {len(candidates)} entries with failures to retry")
    print()

    if args.dry_run:
        for path in candidates:
            print(f"Would retry: {path.name}")
        return 0

    total_archived = 0
    total_failed = 0

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)

        for index, path in enumerate(candidates, start=1):
            print(f"[{index}/{len(candidates)}] {path.name}")

            archived, failed, _ = process_entry(path, browser)
            total_archived += archived
            total_failed += failed

            # Be polite: pause between entries that actually hit the network.
            if archived > 0:
                time.sleep(2)

        browser.close()

    print()
    print("=" * 60)
    print("RETRY SUMMARY")
    print("=" * 60)
    print(f" Successfully archived: {total_archived}")
    print(f" Still failing: {total_failed}")

    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate main()'s return value (0 or 1) as the process exit code.
    sys.exit(main())
|