# glam/scripts/archive_digital_platforms.py
# Snapshot: 2025-12-05 15:30:23 +01:00 — 365 lines, 12 KiB, Python
#!/usr/bin/env python3
"""
Archive websites for entries that have digital_platforms but no web_archives.

This script:
1. Finds entries with digital_platforms but missing web_archives
2. Extracts platform_url from digital_platforms
3. Fetches and archives using Playwright
4. Updates entries with web archive references

Usage:
    python scripts/archive_digital_platforms.py [--limit N] [--dry-run] [--start N]
"""
import argparse
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse
import yaml
# Third-party dependencies are imported lazily-tolerant: if any is missing,
# the script still starts so main() can print an actionable install message
# instead of dying with a raw ImportError traceback.
try:
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md
    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    MISSING_DEP = str(e)  # surfaced to the user in main()

# Directories
# NOTE(review): hard-coded absolute path — only works on this machine; consider
# deriving from an env var or a --entries-dir flag.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'  # per-entry web archives live under web/<entry_num>/<host>/
def sanitize_dirname(url: str) -> str:
    """Create a safe directory name from a URL.

    Uses the URL's host (minus a leading 'www.' prefix) with every character
    other than word characters, dots and dashes replaced by '_', so ports and
    unusual characters cannot produce an unsafe path component.

    Args:
        url: Absolute URL (must include a scheme for urlparse to find the host).

    Returns:
        A filesystem-safe directory name derived from the host.
    """
    host = urlparse(url).netloc
    # Strip only a *leading* 'www.' — the previous str.replace('www.', '')
    # also mangled hosts like 'sub.www.example.com'.
    host = host.removeprefix('www.')
    return re.sub(r'[^\w\-.]', '_', host)
def get_platform_urls(data: dict) -> list[str]:
    """Extract all platform URLs from digital_platforms.

    Skips malformed platform records (non-dict entries, non-string or empty
    URLs) instead of crashing on hand-edited YAML. Bare hostnames are
    normalized to https:// URLs.

    Args:
        data: Parsed entry YAML mapping.

    Returns:
        List of absolute URLs (possibly empty), in platform order.
    """
    urls = []
    # 'digital_platforms: null' in YAML yields None — treat it like missing.
    for platform in data.get('digital_platforms') or []:
        # A bare string or other scalar in the list would crash .get(); skip it.
        if not isinstance(platform, dict):
            continue
        url = platform.get('platform_url')
        if not isinstance(url, str) or not url:
            continue
        url = url.strip()
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        urls.append(url)
    return urls
def get_archived_urls(data: dict) -> set[str]:
    """Get URLs already archived for an entry.

    URLs are normalized (lowercased, trailing '/' stripped) to match the
    comparison done by the callers.

    Args:
        data: Parsed entry YAML mapping.

    Returns:
        Set of normalized URLs found under web_enrichment.web_archives.
    """
    archived = set()
    # Either key may be present-but-null in the YAML; 'or {}'/'or []' keeps
    # the lookups from raising AttributeError on None.
    web_enrichment = data.get('web_enrichment') or {}
    web_archives = web_enrichment.get('web_archives') or []
    for archive in web_archives:
        url = archive.get('url', '')
        if url:
            archived.add(url.lower().rstrip('/'))
    return archived
def clean_html_for_markdown(html: str) -> str:
    """Strip non-content elements from HTML before markdown conversion."""
    # Tags that carry no article content: scripts/styles, page chrome,
    # form controls and head-only metadata.
    noise_tags = ['script', 'style', 'nav', 'footer', 'header',
                  'aside', 'form', 'iframe', 'noscript', 'svg',
                  'button', 'input', 'select', 'textarea', 'meta',
                  'link']
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(noise_tags):
        tag.decompose()
    return str(soup)
def fetch_with_playwright(url: str, timeout: int = 30000) -> dict:
    """Fetch URL using Playwright.

    Renders the page in headless Chromium and captures both an initial and a
    post-settle HTML snapshot, plus a markdown conversion of the latter.

    Args:
        url: Fully-qualified URL to fetch.
        timeout: Navigation timeout in milliseconds (Playwright convention).

    Returns:
        dict with keys url, fetch_timestamp, raw_html, rendered_html,
        markdown and error. On failure 'error' is a message and the content
        fields remain None; no exception escapes this function.
    """
    result = {
        'url': url,
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'raw_html': None,
        'rendered_html': None,
        'markdown': None,
        'error': None
    }
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            # Desktop UA + viewport so sites serve their full (non-mobile) layout.
            context = browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                viewport={'width': 1920, 'height': 1080}
            )
            page = context.new_page()
            # 'networkidle' waits until network activity has settled.
            response = page.goto(url, wait_until='networkidle', timeout=timeout)
            if not response or response.status >= 400:
                result['error'] = f"HTTP {response.status if response else 'No response'}"
                browser.close()
                return result
            result['raw_html'] = page.content()
            # Give late-running JS an extra 2s, then snapshot again; both
            # snapshots are post-render, 'raw' is just the earlier one.
            page.wait_for_timeout(2000)
            result['rendered_html'] = page.content()
            # Convert to markdown
            cleaned = clean_html_for_markdown(result['rendered_html'])
            markdown = md(cleaned, heading_style='atx', bullets='-')
            # Collapse runs of 3+ newlines left behind by removed elements.
            markdown = re.sub(r'\n{3,}', '\n\n', markdown)
            result['markdown'] = markdown.strip()
            browser.close()
            # NOTE(review): on an exception between launch() and close() the
            # browser is not closed explicitly; the sync_playwright context
            # exit tears the driver down, but a try/finally would be tidier.
    except PlaywrightTimeout:
        result['error'] = f"Timeout loading {url}"
    except Exception as e:
        # Broad catch is deliberate: any browser/network failure becomes a
        # reported error string rather than aborting the whole batch run.
        result['error'] = f"Error: {str(e)}"
    return result
def archive_url(url: str, entry_num: str, dry_run: bool = False) -> tuple[bool, str, dict | None]:
    """Fetch one URL and persist its raw/rendered HTML, markdown and metadata.

    Args:
        url: Absolute URL to archive.
        entry_num: Numeric prefix of the entry file; names the archive subdir.
        dry_run: When True, report what would happen without fetching.

    Returns:
        (success, message, archive_info) — archive_info describes the on-disk
        archive for the entry YAML, or is None when nothing new was written.
    """
    folder = sanitize_dirname(url)
    target = WEB_DIR / entry_num / folder

    # A rendered.html already on disk marks a completed earlier run.
    if (target / 'rendered.html').exists():
        return False, "Already archived on disk", None
    if dry_run:
        return True, f"Would fetch: {url}", None

    fetched = fetch_with_playwright(url)
    if fetched['error']:
        return False, fetched['error'], None
    if not fetched['rendered_html']:
        return False, "No content", None

    # Persist the three content artifacts plus a metadata manifest.
    target.mkdir(parents=True, exist_ok=True)
    (target / 'index.html').write_text(fetched['raw_html'], encoding='utf-8')
    (target / 'rendered.html').write_text(fetched['rendered_html'], encoding='utf-8')

    md_header = f"""---
source_url: {url}
fetch_timestamp: {fetched['fetch_timestamp']}
---
"""
    (target / 'content.md').write_text(md_header + (fetched['markdown'] or ''), encoding='utf-8')

    metadata = {
        'url': url,
        'fetch_timestamp': fetched['fetch_timestamp'],
        'files': {
            'raw_html': 'index.html',
            'rendered_html': 'rendered.html',
            'markdown': 'content.md'
        }
    }
    with open(target / 'metadata.yaml', 'w', encoding='utf-8') as f:
        yaml.dump(metadata, f, default_flow_style=False, allow_unicode=True)

    archive_info = {
        'url': url,
        'directory': f'web/{entry_num}/{folder}',
        'archive_timestamp': fetched['fetch_timestamp']
    }
    return True, f"Archived: {url}", archive_info
def process_entry(filepath: Path, dry_run: bool = False) -> tuple[int, int, int]:
    """Archive any un-archived platform URLs for a single entry file.

    Reads the entry YAML, archives each platform URL not yet recorded under
    web_enrichment.web_archives, and (unless dry_run) writes the updated
    entry back to disk.

    Args:
        filepath: Path to the entry YAML file ('<number>_<slug>.yaml').
        dry_run: When True, nothing is fetched or written.

    Returns:
        (archived, failed, skipped) counts for this entry.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # Guard against empty files and YAML documents that are not mappings.
    if not isinstance(data, dict):
        return 0, 0, 0

    platform_urls = get_platform_urls(data)
    if not platform_urls:
        return 0, 0, 0

    archived_urls = get_archived_urls(data)
    # The numeric filename prefix keys the on-disk archive directory.
    entry_num = filepath.name.split('_')[0]

    archived = 0
    failed = 0
    skipped = 0
    new_archives = []
    for url in platform_urls:
        url_normalized = url.lower().rstrip('/')
        # Skip if already recorded in web_archives
        if url_normalized in archived_urls:
            skipped += 1
            continue
        success, message, archive_info = archive_url(url, entry_num, dry_run)
        if success:
            archived += 1
            print(f"{message}")
            if archive_info:
                new_archives.append(archive_info)
        elif 'Already archived' in message:
            skipped += 1
            # Files exist on disk but the entry lacks the reference — record
            # it so the YAML catches up with the filesystem.
            dirname = sanitize_dirname(url)
            new_archives.append({
                'url': url,
                'directory': f'web/{entry_num}/{dirname}',
                'archive_timestamp': datetime.now(timezone.utc).isoformat()
            })
        else:
            failed += 1
            print(f"{message}")

    # Persist new archive references back into the entry.
    if new_archives and not dry_run:
        # 'web_enrichment: null' in the YAML would pass a key-presence check
        # and then crash on .get(); normalize anything non-dict to {}.
        if not isinstance(data.get('web_enrichment'), dict):
            data['web_enrichment'] = {}
        existing = data['web_enrichment'].get('web_archives') or []
        data['web_enrichment']['web_archives'] = existing + new_archives
        data['web_enrichment']['platform_archive_timestamp'] = datetime.now(timezone.utc).isoformat()
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    return archived, failed, skipped
def main():
    """CLI entry point: scan entries, archive missing platform URLs.

    Returns a process exit code: 0 on completion, 1 when the third-party
    dependencies (Playwright/bs4/markdownify) are not installed.
    """
    parser = argparse.ArgumentParser(description='Archive digital platform websites')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--start', type=int, default=0, help='Start from entry index')
    args = parser.parse_args()
    if not HAS_DEPS:
        print(f"Error: Required dependencies not installed: {MISSING_DEP}")
        print("Run: pip install playwright beautifulsoup4 markdownify && playwright install chromium")
        return 1
    # Find entries with digital_platforms but missing/incomplete web_archives
    print("Finding entries with digital_platforms needing archiving...")
    entries_to_process = []
    for filepath in sorted(ENTRIES_DIR.glob('*.yaml')):
        if filepath.name == 'web':  # Skip 'web' if it's somehow a file
            continue
        if not filepath.is_file():
            continue
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception:
            # Unreadable/unparseable entries are skipped silently in the scan;
            # they will surface again on the next run.
            continue
        if not data:
            continue
        # Check for digital_platforms with URLs
        platform_urls = get_platform_urls(data)
        if not platform_urls:
            continue
        # Check if all URLs are already archived
        archived_urls = get_archived_urls(data)
        needs_archiving = False
        for url in platform_urls:
            url_normalized = url.lower().rstrip('/')
            if url_normalized not in archived_urls:
                # Also check if on-disk archive exists
                entry_num = filepath.name.split('_')[0]
                dirname = sanitize_dirname(url)
                if not (WEB_DIR / entry_num / dirname / 'rendered.html').exists():
                    needs_archiving = True
                    break
        if needs_archiving:
            entries_to_process.append(filepath)
    print(f"Found {len(entries_to_process)} entries needing archiving")
    # Apply start and limit
    # NOTE(review): --start slices the list found *this* run; indexes shift as
    # entries get archived between runs.
    if args.start:
        entries_to_process = entries_to_process[args.start:]
    if args.limit:
        entries_to_process = entries_to_process[:args.limit]
    print(f"Processing {len(entries_to_process)} entries...")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()
    total_archived = 0
    total_failed = 0
    total_skipped = 0
    for i, filepath in enumerate(entries_to_process):
        print(f"[{i+1}/{len(entries_to_process)}] {filepath.name}")
        archived, failed, skipped = process_entry(filepath, dry_run=args.dry_run)
        total_archived += archived
        total_failed += failed
        total_skipped += skipped
        # Rate limiting (2 seconds between successful fetches)
        if archived > 0 and not args.dry_run:
            time.sleep(2)
    print()
    print("=" * 60)
    print(f"{'DRY RUN - ' if args.dry_run else ''}SUMMARY")
    print("=" * 60)
    print(f" Entries processed: {len(entries_to_process)}")
    print(f" URLs archived: {total_archived}")
    print(f" URLs failed: {total_failed}")
    print(f" URLs skipped: {total_skipped}")
    return 0


if __name__ == '__main__':
    sys.exit(main())