#!/usr/bin/env python3
"""
Verify website URLs extracted from LinkedIn About pages.

Checks:
- HTTP status codes
- Redirects (follows and records final URL)
- SSL/TLS errors
- Connection timeouts
- DNS failures

Usage:
    python scripts/verify_website_links.py [--input FILE] [--output-dir DIR] [--timeout SECONDS]
"""

import argparse
import asyncio
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

import httpx

# Default per-request timeout in seconds (overridable via --timeout)
DEFAULT_TIMEOUT = 15
MAX_CONCURRENT = 20  # Limit concurrent requests


async def check_url(
    client: httpx.AsyncClient,
    url: str,
    custodian_name: str,
    timeout: float = DEFAULT_TIMEOUT
) -> dict[str, Any]:
    """Check if a URL is accessible and return verification result.

    Args:
        client: Shared httpx async client (headers/SSL config set by caller).
        url: URL to probe.
        custodian_name: Owner of the URL; passed through into the result
            for reporting.
        timeout: Per-request timeout in seconds.

    Returns:
        Dict with original/final URL, status code, liveness and redirect
        flags, an error category (or None), response time in ms, and a UTC
        timestamp. Never raises: every failure is recorded in 'error'.
    """
    result = {
        'custodian_name': custodian_name,
        'original_url': url,
        'final_url': None,
        'status_code': None,
        'is_alive': False,
        'is_redirect': False,
        'error': None,
        'response_time_ms': None,
        'checked_at': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
    }

    # get_running_loop() is the supported way to reach the loop from inside
    # a coroutine (get_event_loop() is deprecated in this context).
    loop = asyncio.get_running_loop()
    start_time = loop.time()

    try:
        # Use GET with follow_redirects to get final URL
        response = await client.get(url, follow_redirects=True, timeout=timeout)

        result['response_time_ms'] = int((loop.time() - start_time) * 1000)
        result['status_code'] = response.status_code
        result['final_url'] = str(response.url)
        # response.history lists the intermediate redirect responses. This is
        # more reliable than comparing URL strings: httpx normalizes URLs
        # (e.g. adds a trailing slash) even when no redirect occurred, which
        # previously produced false is_redirect positives.
        result['is_redirect'] = len(response.history) > 0

        # Consider 2xx and 3xx as alive
        result['is_alive'] = 200 <= response.status_code < 400

    except httpx.TimeoutException:
        result['error'] = 'timeout'
    except httpx.ConnectError as e:
        # Classify common connection failures by message text; httpx does not
        # expose a structured error code for these.
        if 'Name or service not known' in str(e) or 'getaddrinfo' in str(e):
            result['error'] = 'dns_failure'
        elif 'Connection refused' in str(e):
            result['error'] = 'connection_refused'
        else:
            result['error'] = f'connection_error: {str(e)[:100]}'
    except httpx.HTTPStatusError as e:
        # Defensive: only raised when raise_for_status() is used, but kept so
        # a future change to the request path cannot abort the whole batch.
        result['status_code'] = e.response.status_code
        result['error'] = f'http_error: {e.response.status_code}'
    except Exception as e:
        # Catch-all so one bad URL never kills the run; truncate long
        # messages to keep the output file readable.
        error_type = type(e).__name__
        result['error'] = f'{error_type}: {str(e)[:100]}'

    return result


async def verify_urls(
    urls: list[dict[str, str]],
    timeout: float = DEFAULT_TIMEOUT,
    max_concurrent: int = MAX_CONCURRENT
) -> list[dict[str, Any]]:
    """Verify a list of URLs concurrently.

    Args:
        urls: Items with 'url' and 'custodian_name' keys.
        timeout: Per-request timeout in seconds, forwarded to check_url.
        max_concurrent: Upper bound on simultaneous in-flight requests.

    Returns:
        One result dict per input item (same order), as built by check_url.
    """
    # Bound concurrency so we don't open hundreds of sockets at once.
    semaphore = asyncio.Semaphore(max_concurrent)

    async with httpx.AsyncClient(
        headers={
            # Browser-like headers: some sites reject obvious bot clients.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        },
        verify=False,  # Skip SSL verification for checking dead links
    ) as client:

        async def check_with_semaphore(item: dict[str, str]) -> dict[str, Any]:
            # Acquire a slot before issuing the request.
            async with semaphore:
                return await check_url(
                    client,
                    item['url'],
                    item['custodian_name'],
                    timeout
                )

        # gather preserves input order, so results line up with urls.
        tasks = [check_with_semaphore(item) for item in urls]
        results = await asyncio.gather(*tasks)

    return results


def main():
    """CLI entry point: load About-page data, verify URLs, write reports.

    Returns:
        0 on success. Exits with status 1 (via sys.exit) when no input
        file can be found.
    """
    parser = argparse.ArgumentParser(
        description='Verify website URLs extracted from LinkedIn About pages'
    )
    parser.add_argument(
        '--input', '-i',
        type=Path,
        help='Input JSON file with About page data (defaults to most recent)'
    )
    parser.add_argument(
        '--output-dir', '-o',
        type=Path,
        default=Path('data/custodian/person/affiliated/verified_links'),
        help='Directory to save verification results'
    )
    parser.add_argument(
        '--timeout', '-t',
        type=float,
        default=DEFAULT_TIMEOUT,
        help=f'Request timeout in seconds (default: {DEFAULT_TIMEOUT})'
    )
    parser.add_argument(
        '--limit', '-l',
        type=int,
        default=None,
        help='Limit number of URLs to verify'
    )

    args = parser.parse_args()

    # Find input file: explicit --input wins; otherwise the newest
    # about_data_*.json (timestamped names sort lexicographically).
    about_data_dir = Path('data/custodian/person/affiliated/about_data')
    if args.input:
        input_file = args.input
    else:
        about_files = sorted(about_data_dir.glob('about_data_*.json'), reverse=True)
        if not about_files:
            print("Error: No about_data files found", file=sys.stderr)
            sys.exit(1)
        input_file = about_files[0]

    print(f"Loading data from: {input_file}")

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract URLs to verify; .get() tolerates records that lack the
    # website_url key entirely (previously a KeyError on malformed rows).
    urls_to_verify = []
    for item in data['data']:
        if item.get('website_url'):
            urls_to_verify.append({
                'custodian_name': item['custodian_name'],
                'url': item['website_url'],
            })

    print(f"Found {len(urls_to_verify)} URLs to verify")

    if args.limit:
        urls_to_verify = urls_to_verify[:args.limit]
        print(f"Limited to {len(urls_to_verify)} URLs")

    # Verify URLs
    print(f"Verifying URLs with {args.timeout}s timeout...")
    results = asyncio.run(verify_urls(urls_to_verify, args.timeout))

    # Analyze results
    alive = [r for r in results if r['is_alive']]
    dead = [r for r in results if not r['is_alive']]
    redirects = [r for r in results if r['is_redirect']]

    # Categorize errors by their prefix (the part before the first ':').
    error_types = {}
    for r in dead:
        error = r['error'] or 'unknown'
        error_type = error.split(':')[0] if ':' in error else error
        error_types[error_type] = error_types.get(error_type, 0) + 1

    # Guard the percentage math: an empty result set previously raised
    # ZeroDivisionError.
    total = len(results) or 1
    print(f"\n{'='*60}")
    print("VERIFICATION RESULTS")
    print(f" Total checked: {len(results)}")
    print(f" Alive: {len(alive)} ({100*len(alive)/total:.1f}%)")
    print(f" Dead: {len(dead)} ({100*len(dead)/total:.1f}%)")
    print(f" Redirected: {len(redirects)}")
    print("\nDead link errors:")
    for error_type, count in sorted(error_types.items(), key=lambda x: -x[1]):
        print(f" {error_type}: {count}")

    # Save results
    args.output_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')

    # Save full results
    full_output = args.output_dir / f'verification_results_{timestamp}.json'
    with open(full_output, 'w', encoding='utf-8') as f:
        json.dump({
            'timestamp': timestamp,
            'source_file': str(input_file),
            'total_checked': len(results),
            'alive_count': len(alive),
            'dead_count': len(dead),
            'redirect_count': len(redirects),
            'error_summary': error_types,
            'results': results,
        }, f, indent=2, ensure_ascii=False)

    # Save dead links separately for follow-up
    dead_output = args.output_dir / f'dead_links_{timestamp}.json'
    with open(dead_output, 'w', encoding='utf-8') as f:
        json.dump({
            'timestamp': timestamp,
            'count': len(dead),
            'links': dead,
        }, f, indent=2, ensure_ascii=False)

    print("\nOutput files:")
    print(f" Full results: {full_output}")
    print(f" Dead links: {dead_output}")

    # Print some dead links for reference
    if dead:
        print("\nSample dead links (need website discovery):")
        for item in dead[:10]:
            print(f" - {item['custodian_name']}: {item['original_url']} ({item['error']})")

    return 0


# Script entry point: propagate main()'s return code as the process exit status.
if __name__ == '__main__':
    sys.exit(main())