glam/scripts/validate_social_media_links.py
2025-12-02 14:36:01 +01:00

496 lines
19 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Validate social media links in NDE enriched entries.
This script checks if social media profiles (Twitter/X, Facebook, Instagram,
LinkedIn, YouTube) actually exist by making HTTP HEAD requests.
Usage:
python scripts/validate_social_media_links.py [--dry-run] [--remove-dead] [--limit N]
Options:
--dry-run Only report dead links, don't modify files
--remove-dead Remove dead links from entry files
--limit N Process only first N entries (for testing)
--entry ENTRY Process specific entry (e.g., 0615)
"""
import argparse
import logging
import time
import re
from pathlib import Path
from typing import Optional
import httpx
import yaml
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
# Module-wide logging: timestamped INFO-level messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Constants
ENTRIES_DIR = Path(__file__).parent.parent / "data/nde/enriched/entries"  # enriched NDE entry YAML files
REQUEST_TIMEOUT = 10.0  # per-HTTP-request timeout, seconds
RATE_LIMIT_DELAY = 0.5 # seconds between requests to same domain
MAX_WORKERS = 5 # concurrent validation threads  # NOTE(review): appears unused — main() validates sequentially; confirm before removing
@dataclass
class SocialMediaLink:
    """Represents a social media link to validate."""
    entry_id: str  # entry ID taken from the filename prefix (e.g. "0615")
    platform: str  # one of: 'twitter', 'facebook', 'instagram', 'linkedin', 'youtube'
    url: str  # full profile URL to check
    source: str # 'wikidata' or 'web_claims'
    claim_index: Optional[int] = None # index in web_claims list if applicable
@dataclass
class ValidationResult:
    """Result of validating a social media link.

    When ``error`` is set the check was indeterminate (bot protection,
    timeout, transport failure); callers treat that separately from a
    confirmed-dead link (``is_valid`` False with no error).
    """
    link: SocialMediaLink
    is_valid: bool  # False only on explicit 404 / dead-content confirmation
    status_code: Optional[int] = None  # HTTP status of the HEAD request, if any
    error: Optional[str] = None  # reason the check could not be completed
    redirect_url: Optional[str] = None  # final URL when the request was redirected
def normalize_twitter_url(url_or_username: str) -> str:
    """Return a full x.com URL for a Twitter/X username or URL.

    Bare handles (with or without a leading '@') become
    ``https://x.com/<handle>``; values that are already URLs are kept
    as-is except that any 'twitter.com' text is rewritten to 'x.com'.
    """
    is_already_url = url_or_username.startswith(('http://', 'https://'))
    if not is_already_url:
        # Bare username: strip any leading '@' and build the profile URL.
        handle = url_or_username.lstrip('@')
        return f"https://x.com/{handle}"
    # Normalize the legacy domain to the current one.
    return url_or_username.replace('twitter.com', 'x.com')
def extract_social_links(entry_path: Path) -> list[SocialMediaLink]:
    """Extract all social media links from an entry YAML file.

    Links come from two sections of the entry:
      * ``wikidata_enrichment`` — Wikidata properties P2002 (Twitter/X),
        P2013 (Facebook), P2003 (Instagram), P4264 (LinkedIn company),
        P2397 (YouTube channel).
      * ``web_claims`` — either a dict with a ``claims`` list or a bare
        list of claim dicts whose ``claim_type`` is ``social_<platform>``.

    Args:
        entry_path: path to the entry YAML file; the entry ID is the
            filename prefix before the first underscore.

    Returns:
        A (possibly empty) list of SocialMediaLink records.
    """
    # Wikidata property key -> (platform, URL template). Twitter has no
    # template because its value may already be a URL and is normalized.
    wikidata_props: dict[str, tuple[str, Optional[str]]] = {
        'P2002_x__twitter__username': ('twitter', None),
        'P2013_facebook_id': ('facebook', 'https://www.facebook.com/{}'),
        'P2003_instagram_username': ('instagram', 'https://www.instagram.com/{}/'),
        'P4264_linkedin_company_id': ('linkedin', 'https://www.linkedin.com/company/{}'),
        'P2397_youtube_channel_id': ('youtube', 'https://www.youtube.com/channel/{}'),
    }
    # web_claims claim_type -> platform name.
    claim_platforms = {
        'social_twitter': 'twitter',
        'social_facebook': 'facebook',
        'social_instagram': 'instagram',
        'social_linkedin': 'linkedin',
        'social_youtube': 'youtube',
    }

    links: list[SocialMediaLink] = []
    entry_id = entry_path.stem.split('_')[0]
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)
    if not entry:
        return links

    # Check Wikidata enrichment for social media identifiers.
    wikidata = entry.get('wikidata_enrichment', {}) or {}
    for prop_key, (platform, template) in wikidata_props.items():
        prop_data = wikidata.get(prop_key) or {}
        value = prop_data.get('value')
        if not value:
            continue
        if platform == 'twitter':
            url = normalize_twitter_url(value)
        else:
            url = template.format(value)
        links.append(SocialMediaLink(
            entry_id=entry_id,
            platform=platform,
            url=url,
            source='wikidata'
        ))

    # Check web_claims for social media links.
    # web_claims is usually a dict with a 'claims' list inside, but may
    # also be a bare list of claims.
    web_claims_data = entry.get('web_claims', {})
    if isinstance(web_claims_data, dict):
        web_claims = web_claims_data.get('claims', [])
    else:
        web_claims = web_claims_data if isinstance(web_claims_data, list) else []
    for idx, claim in enumerate(web_claims):
        platform = claim_platforms.get(claim.get('claim_type', ''))
        claim_value = claim.get('claim_value', '')
        if not platform or not claim_value:
            continue
        # Twitter claims may be a username or a URL; other platforms
        # store the URL directly.
        url = normalize_twitter_url(claim_value) if platform == 'twitter' else claim_value
        links.append(SocialMediaLink(
            entry_id=entry_id,
            platform=platform,
            url=url,
            source='web_claims',
            claim_index=idx
        ))
    return links
def validate_link(link: SocialMediaLink, client: httpx.Client) -> ValidationResult:
    """Validate a single social media link by checking if profile exists.
    IMPORTANT: Social media platforms have aggressive bot protection.
    We only mark links as DEAD if we get explicit 404 or content confirmation.
    Other error codes (400, 403) are treated as "indeterminate" (assume valid).

    Args:
        link: the social media link to check.
        client: shared httpx client carrying browser-like headers.

    Returns:
        ValidationResult. When ``error`` is set (bot protection, timeout,
        transport error) the caller counts the link as an error rather
        than dead, even though ``is_valid`` may be False in those results.
    """
    try:
        # Use HEAD request first, fall back to GET if needed
        response = client.head(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
        # Default: assume valid unless proven otherwise
        is_valid = True
        # Twitter/X specific checks
        if link.platform == 'twitter':
            # X returns 403 for bot protection - can't determine status
            if response.status_code == 403:
                return ValidationResult(
                    link=link,
                    is_valid=True, # Assume valid - need browser to verify
                    status_code=403,
                    error="Twitter/X bot protection - cannot validate via HTTP"
                )
            elif response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                # Need to do a GET to check content
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                # Check if redirected to login or account suspended
                final_url = str(get_response.url)
                if '/login' in final_url or 'account/suspended' in final_url:
                    is_valid = False
                # Check for "This account doesn't exist" in response
                # (apostrophe variant tolerated by matching the prefix only)
                if 'This account doesn' in get_response.text or "doesn't exist" in get_response.text.lower():
                    is_valid = False
        # Facebook specific checks
        elif link.platform == 'facebook':
            # Facebook returns 400 for bot protection - can't determine status
            if response.status_code == 400:
                return ValidationResult(
                    link=link,
                    is_valid=True, # Assume valid - Facebook blocks HEAD requests
                    status_code=400,
                    error="Facebook bot protection - cannot validate via HTTP"
                )
            elif response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                # Check for "Page Not Found" or redirect to login
                if 'page not found' in get_response.text.lower():
                    is_valid = False
                if 'this content isn\'t available' in get_response.text.lower():
                    is_valid = False
        # Instagram specific checks
        elif link.platform == 'instagram':
            if response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                if "Sorry, this page isn't available" in get_response.text:
                    is_valid = False
        # LinkedIn specific checks
        elif link.platform == 'linkedin':
            if response.status_code == 404:
                is_valid = False
            elif response.status_code == 999:
                # LinkedIn returns 999 for bot detection, can't validate
                return ValidationResult(
                    link=link,
                    is_valid=True, # Assume valid since we can't check
                    status_code=999,
                    error="LinkedIn bot detection - cannot validate"
                )
        # YouTube specific checks
        elif link.platform == 'youtube':
            if response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                if 'This channel does not exist' in get_response.text:
                    is_valid = False
        # Only mark as dead on explicit 404 - other errors are indeterminate
        # Don't treat 400/403/5xx as dead - could be bot protection
        # (also covers platforms not matched by any branch above)
        if response.status_code == 404:
            is_valid = False
        return ValidationResult(
            link=link,
            is_valid=is_valid,
            status_code=response.status_code,
            # Record the final URL only when a redirect actually moved us.
            redirect_url=str(response.url) if str(response.url) != link.url else None
        )
    except httpx.TimeoutException:
        # `error` is set, so the caller logs this as ERROR, not DEAD.
        return ValidationResult(
            link=link,
            is_valid=False,
            error="Timeout"
        )
    except httpx.HTTPError as e:
        # Transport-level failure (DNS, connection reset, etc.).
        return ValidationResult(
            link=link,
            is_valid=False,
            error=str(e)
        )
    except Exception as e:
        # Catch-all so one bad link never aborts the whole run.
        return ValidationResult(
            link=link,
            is_valid=False,
            error=f"Unexpected error: {e}"
        )
def remove_dead_links(entry_path: Path, dead_links: list[ValidationResult]) -> int:
    """Remove dead links from an entry file. Returns count of removed links.

    Handles both shapes of ``web_claims`` (a dict wrapping a ``claims``
    list, or a bare list) so that the ``claim_index`` values recorded by
    ``extract_social_links`` — which indexes the inner ``claims`` list —
    are applied to the same list here. The previous version indexed
    ``entry['web_claims']`` directly and broke on the dict shape.

    The file is rewritten only if at least one link was removed.
    """
    # Map platform to the Wikidata property key stored in the entry.
    prop_keys = {
        'twitter': 'P2002_x__twitter__username',
        'facebook': 'P2013_facebook_id',
        'instagram': 'P2003_instagram_username',
        'linkedin': 'P4264_linkedin_company_id',
        'youtube': 'P2397_youtube_channel_id'
    }
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)
    if not entry:
        return 0
    removed_count = 0
    # Group dead links by source
    wikidata_dead = [r.link for r in dead_links if r.link.source == 'wikidata']
    web_claims_dead = [r.link for r in dead_links if r.link.source == 'web_claims']
    # Remove from wikidata_enrichment
    if wikidata_dead and 'wikidata_enrichment' in entry:
        wikidata = entry['wikidata_enrichment']
        for link in wikidata_dead:
            prop_key = prop_keys.get(link.platform)
            if prop_key and prop_key in wikidata:
                del wikidata[prop_key]
                removed_count += 1
                logger.info(f"  Removed {prop_key} from wikidata_enrichment")
    # Remove from web_claims (work backwards to preserve indices)
    if web_claims_dead and 'web_claims' in entry:
        container = entry['web_claims']
        # Mirror the shape handling in extract_social_links: claim_index
        # refers to the inner 'claims' list when web_claims is a dict.
        if isinstance(container, dict):
            claims = container.get('claims', [])
        else:
            claims = container if isinstance(container, list) else []
        # De-duplicate indices so two results for the same claim cannot
        # delete an unrelated neighboring claim.
        indices_to_remove = sorted(
            {l.claim_index for l in web_claims_dead if l.claim_index is not None},
            reverse=True
        )
        for idx in indices_to_remove:
            if idx < len(claims):
                claim = claims[idx]
                logger.info(f"  Removed web_claim[{idx}]: {claim.get('claim_type')} = {claim.get('claim_value')}")
                del claims[idx]
                removed_count += 1
    # Write back if changes were made
    if removed_count > 0:
        with open(entry_path, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    return removed_count
def main():
    """CLI entry point.

    Scans entry YAML files, extracts social media links, validates them
    over HTTP, prints a summary, and — with --remove-dead (and without
    --dry-run) — prunes confirmed-dead links from the entry files.
    """
    parser = argparse.ArgumentParser(description="Validate social media links in NDE entries")
    parser.add_argument('--dry-run', action='store_true', help="Only report dead links, don't modify files")
    parser.add_argument('--remove-dead', action='store_true', help="Remove dead links from entry files")
    parser.add_argument('--limit', type=int, help="Process only first N entries")
    parser.add_argument('--entry', type=str, help="Process specific entry (e.g., 0615)")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")
    args = parser.parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Find entry files: one entry by ID prefix, or every YAML entry.
    if args.entry:
        entry_files = list(ENTRIES_DIR.glob(f"{args.entry}_*.yaml"))
        if not entry_files:
            logger.error(f"No entry found matching {args.entry}")
            return
    else:
        entry_files = sorted(ENTRIES_DIR.glob("*.yaml"))
    if args.limit:
        entry_files = entry_files[:args.limit]
    logger.info(f"Processing {len(entry_files)} entry files...")

    # Collect all social media links
    all_links: list[SocialMediaLink] = []
    for entry_path in entry_files:
        all_links.extend(extract_social_links(entry_path))
    logger.info(f"Found {len(all_links)} social media links to validate")
    if not all_links:
        logger.info("No social media links found.")
        return

    # Count by platform
    platform_counts: dict[str, int] = {}
    for link in all_links:
        platform_counts[link.platform] = platform_counts.get(link.platform, 0) + 1
    logger.info(f"By platform: {platform_counts}")

    # Validate links sequentially through one shared client.
    dead_links_by_entry: dict[str, list[ValidationResult]] = {}
    valid_count = 0
    dead_count = 0
    error_count = 0
    # Use httpx client with browser-like headers to reduce bot blocking.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    with httpx.Client(headers=headers, follow_redirects=True) as client:
        for i, link in enumerate(all_links):
            if i > 0 and i % 10 == 0:
                logger.info(f"Progress: {i}/{len(all_links)} links validated...")
            result = validate_link(link, client)
            if result.error:
                # Indeterminate checks (bot protection, timeouts, transport
                # errors) are counted separately and never removed.
                logger.warning(f"[{link.entry_id}] {link.platform}: {link.url} - ERROR: {result.error}")
                error_count += 1
            elif result.is_valid:
                if args.verbose:
                    logger.debug(f"[{link.entry_id}] {link.platform}: {link.url} - OK ({result.status_code})")
                valid_count += 1
            else:
                logger.warning(f"[{link.entry_id}] {link.platform}: {link.url} - DEAD ({result.status_code})")
                dead_count += 1
                dead_links_by_entry.setdefault(link.entry_id, []).append(result)
            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)

    # Summary
    logger.info("=" * 60)
    logger.info("VALIDATION SUMMARY")
    logger.info("=" * 60)
    logger.info(f"Total links checked: {len(all_links)}")
    logger.info(f"Valid: {valid_count}")
    logger.info(f"Dead: {dead_count}")
    logger.info(f"Errors: {error_count}")
    logger.info(f"Entries with dead links: {len(dead_links_by_entry)}")
    if dead_links_by_entry:
        logger.info("\nDead links by entry:")
        for entry_id, results in sorted(dead_links_by_entry.items()):
            logger.info(f"\n  Entry {entry_id}:")
            for r in results:
                logger.info(f"    - {r.link.platform} ({r.link.source}): {r.link.url}")

    # Remove dead links if requested
    if args.remove_dead and not args.dry_run and dead_links_by_entry:
        logger.info("\nRemoving dead links from entry files...")
        total_removed = 0
        for entry_id, results in dead_links_by_entry.items():
            # Guard against the entry file disappearing between the scan
            # and removal — an unconditional [0] would raise IndexError.
            matches = list(ENTRIES_DIR.glob(f"{entry_id}_*.yaml"))
            if not matches:
                logger.error(f"  {entry_id}: entry file not found, skipping")
                continue
            removed = remove_dead_links(matches[0], results)
            total_removed += removed
            logger.info(f"  {entry_id}: removed {removed} links")
        logger.info(f"Total links removed: {total_removed}")
    elif args.dry_run and dead_links_by_entry:
        logger.info("\n[DRY RUN] Would remove links from the above entries. Run with --remove-dead to actually remove.")


if __name__ == '__main__':
    main()