#!/usr/bin/env python3 """ Validate social media links in NDE enriched entries. This script checks if social media profiles (Twitter/X, Facebook, Instagram, LinkedIn, YouTube) actually exist by making HTTP HEAD requests. Usage: python scripts/validate_social_media_links.py [--dry-run] [--remove-dead] [--limit N] Options: --dry-run Only report dead links, don't modify files --remove-dead Remove dead links from entry files --limit N Process only first N entries (for testing) --entry ENTRY Process specific entry (e.g., 0615) """ import argparse import logging import time import re from pathlib import Path from typing import Optional import httpx import yaml from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Constants ENTRIES_DIR = Path(__file__).parent.parent / "data/nde/enriched/entries" REQUEST_TIMEOUT = 10.0 RATE_LIMIT_DELAY = 0.5 # seconds between requests to same domain MAX_WORKERS = 5 # concurrent validation threads @dataclass class SocialMediaLink: """Represents a social media link to validate.""" entry_id: str platform: str url: str source: str # 'wikidata' or 'web_claims' claim_index: Optional[int] = None # index in web_claims list if applicable @dataclass class ValidationResult: """Result of validating a social media link.""" link: SocialMediaLink is_valid: bool status_code: Optional[int] = None error: Optional[str] = None redirect_url: Optional[str] = None def normalize_twitter_url(url_or_username: str) -> str: """Convert Twitter username or URL to full URL.""" if url_or_username.startswith(('http://', 'https://')): # Already a URL, normalize to x.com url = url_or_username.replace('twitter.com', 'x.com') return url else: # Username only username = url_or_username.lstrip('@') return f"https://x.com/{username}" def extract_social_links(entry_path: Path) -> list[SocialMediaLink]: """Extract all social media links from an entry file.""" links = [] entry_id = entry_path.stem.split('_')[0] with open(entry_path, 'r', encoding='utf-8') as f: entry = yaml.safe_load(f) if not entry: return links # Check Wikidata enrichment for Twitter username wikidata = entry.get('wikidata_enrichment', {}) if wikidata: # P2002 = Twitter/X username twitter_data = wikidata.get('P2002_x__twitter__username', {}) if twitter_data: value = twitter_data.get('value') if value: url = normalize_twitter_url(value) links.append(SocialMediaLink( entry_id=entry_id, platform='twitter', url=url, source='wikidata' )) # P2013 = Facebook ID facebook_data = wikidata.get('P2013_facebook_id', {}) if facebook_data: value = facebook_data.get('value') if value: url = f"https://www.facebook.com/{value}" links.append(SocialMediaLink( entry_id=entry_id, platform='facebook', url=url, source='wikidata' )) # P2003 = Instagram username instagram_data = wikidata.get('P2003_instagram_username', {}) if instagram_data: value = instagram_data.get('value') if value: url = f"https://www.instagram.com/{value}/" links.append(SocialMediaLink( entry_id=entry_id, platform='instagram', url=url, source='wikidata' )) # P4264 = LinkedIn company ID linkedin_data = wikidata.get('P4264_linkedin_company_id', {}) if linkedin_data: value = linkedin_data.get('value') if value: url = f"https://www.linkedin.com/company/{value}" links.append(SocialMediaLink( entry_id=entry_id, platform='linkedin', url=url, source='wikidata' )) # P2397 = YouTube channel ID youtube_data = wikidata.get('P2397_youtube_channel_id', {}) if youtube_data: value = youtube_data.get('value') if value: url = f"https://www.youtube.com/channel/{value}" links.append(SocialMediaLink( entry_id=entry_id, platform='youtube', url=url, source='wikidata' )) # Check web_claims for social media links # web_claims is a dict with a 'claims' list inside web_claims_data = entry.get('web_claims', {}) if isinstance(web_claims_data, dict): web_claims = web_claims_data.get('claims', []) else: web_claims = web_claims_data if isinstance(web_claims_data, list) else [] for idx, claim in enumerate(web_claims): claim_type = claim.get('claim_type', '') claim_value = claim.get('claim_value', '') if claim_type == 'social_twitter' and claim_value: url = normalize_twitter_url(claim_value) links.append(SocialMediaLink( entry_id=entry_id, platform='twitter', url=url, source='web_claims', claim_index=idx )) elif claim_type == 'social_facebook' and claim_value: links.append(SocialMediaLink( entry_id=entry_id, platform='facebook', url=claim_value, source='web_claims', claim_index=idx )) elif claim_type == 'social_instagram' and claim_value: links.append(SocialMediaLink( entry_id=entry_id, platform='instagram', url=claim_value, source='web_claims', claim_index=idx )) elif claim_type == 'social_linkedin' and claim_value: links.append(SocialMediaLink( entry_id=entry_id, platform='linkedin', url=claim_value, source='web_claims', claim_index=idx )) elif claim_type == 'social_youtube' and claim_value: links.append(SocialMediaLink( entry_id=entry_id, platform='youtube', url=claim_value, source='web_claims', claim_index=idx )) return links def validate_link(link: SocialMediaLink, client: httpx.Client) -> ValidationResult: """Validate a single social media link by checking if profile exists. IMPORTANT: Social media platforms have aggressive bot protection. We only mark links as DEAD if we get explicit 404 or content confirmation. Other error codes (400, 403) are treated as "indeterminate" (assume valid). """ try: # Use HEAD request first, fall back to GET if needed response = client.head(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT) # Default: assume valid unless proven otherwise is_valid = True # Twitter/X specific checks if link.platform == 'twitter': # X returns 403 for bot protection - can't determine status if response.status_code == 403: return ValidationResult( link=link, is_valid=True, # Assume valid - need browser to verify status_code=403, error="Twitter/X bot protection - cannot validate via HTTP" ) elif response.status_code == 404: is_valid = False elif response.status_code == 200: # Need to do a GET to check content get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT) # Check if redirected to login or account suspended final_url = str(get_response.url) if '/login' in final_url or 'account/suspended' in final_url: is_valid = False # Check for "This account doesn't exist" in response if 'This account doesn' in get_response.text or "doesn't exist" in get_response.text.lower(): is_valid = False # Facebook specific checks elif link.platform == 'facebook': # Facebook returns 400 for bot protection - can't determine status if response.status_code == 400: return ValidationResult( link=link, is_valid=True, # Assume valid - Facebook blocks HEAD requests status_code=400, error="Facebook bot protection - cannot validate via HTTP" ) elif response.status_code == 404: is_valid = False elif response.status_code == 200: get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT) # Check for "Page Not Found" or redirect to login if 'page not found' in get_response.text.lower(): is_valid = False if 'this content isn\'t available' in get_response.text.lower(): is_valid = False # Instagram specific checks elif link.platform == 'instagram': if response.status_code == 404: is_valid = False elif response.status_code == 200: get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT) if "Sorry, this page isn't available" in get_response.text: is_valid = False # LinkedIn specific checks elif link.platform == 'linkedin': if response.status_code == 404: is_valid = False elif response.status_code == 999: # LinkedIn returns 999 for bot detection, can't validate return ValidationResult( link=link, is_valid=True, # Assume valid since we can't check status_code=999, error="LinkedIn bot detection - cannot validate" ) # YouTube specific checks elif link.platform == 'youtube': if response.status_code == 404: is_valid = False elif response.status_code == 200: get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT) if 'This channel does not exist' in get_response.text: is_valid = False # Only mark as dead on explicit 404 - other errors are indeterminate # Don't treat 400/403/5xx as dead - could be bot protection if response.status_code == 404: is_valid = False return ValidationResult( link=link, is_valid=is_valid, status_code=response.status_code, redirect_url=str(response.url) if str(response.url) != link.url else None ) except httpx.TimeoutException: return ValidationResult( link=link, is_valid=False, error="Timeout" ) except httpx.HTTPError as e: return ValidationResult( link=link, is_valid=False, error=str(e) ) except Exception as e: return ValidationResult( link=link, is_valid=False, error=f"Unexpected error: {e}" ) def remove_dead_links(entry_path: Path, dead_links: list[ValidationResult]) -> int: """Remove dead links from an entry file. Returns count of removed links.""" with open(entry_path, 'r', encoding='utf-8') as f: entry = yaml.safe_load(f) if not entry: return 0 removed_count = 0 # Group dead links by source wikidata_dead = [r.link for r in dead_links if r.link.source == 'wikidata'] web_claims_dead = [r.link for r in dead_links if r.link.source == 'web_claims'] # Remove from wikidata_enrichment if wikidata_dead and 'wikidata_enrichment' in entry: wikidata = entry['wikidata_enrichment'] for link in wikidata_dead: # Map platform to Wikidata property key prop_keys = { 'twitter': 'P2002_x__twitter__username', 'facebook': 'P2013_facebook_id', 'instagram': 'P2003_instagram_username', 'linkedin': 'P4264_linkedin_company_id', 'youtube': 'P2397_youtube_channel_id' } prop_key = prop_keys.get(link.platform) if prop_key and prop_key in wikidata: del wikidata[prop_key] removed_count += 1 logger.info(f" Removed {prop_key} from wikidata_enrichment") # Remove from web_claims (work backwards to preserve indices) if web_claims_dead and 'web_claims' in entry: indices_to_remove = sorted([l.claim_index for l in web_claims_dead if l.claim_index is not None], reverse=True) for idx in indices_to_remove: if idx < len(entry['web_claims']): claim = entry['web_claims'][idx] logger.info(f" Removed web_claim[{idx}]: {claim.get('claim_type')} = {claim.get('claim_value')}") del entry['web_claims'][idx] removed_count += 1 # Write back if changes were made if removed_count > 0: with open(entry_path, 'w', encoding='utf-8') as f: yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False) return removed_count def main(): parser = argparse.ArgumentParser(description="Validate social media links in NDE entries") parser.add_argument('--dry-run', action='store_true', help="Only report dead links, don't modify files") parser.add_argument('--remove-dead', action='store_true', help="Remove dead links from entry files") parser.add_argument('--limit', type=int, help="Process only first N entries") parser.add_argument('--entry', type=str, help="Process specific entry (e.g., 0615)") parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output") args = parser.parse_args() if args.verbose: logging.getLogger().setLevel(logging.DEBUG) # Find entry files if args.entry: entry_files = list(ENTRIES_DIR.glob(f"{args.entry}_*.yaml")) if not entry_files: logger.error(f"No entry found matching {args.entry}") return else: entry_files = sorted(ENTRIES_DIR.glob("*.yaml")) if args.limit: entry_files = entry_files[:args.limit] logger.info(f"Processing {len(entry_files)} entry files...") # Collect all social media links all_links: list[SocialMediaLink] = [] for entry_path in entry_files: links = extract_social_links(entry_path) all_links.extend(links) logger.info(f"Found {len(all_links)} social media links to validate") if not all_links: logger.info("No social media links found.") return # Count by platform platform_counts = {} for link in all_links: platform_counts[link.platform] = platform_counts.get(link.platform, 0) + 1 logger.info(f"By platform: {platform_counts}") # Validate links dead_links_by_entry: dict[str, list[ValidationResult]] = {} valid_count = 0 dead_count = 0 error_count = 0 # Use httpx client with browser-like headers headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', } with httpx.Client(headers=headers, follow_redirects=True) as client: for i, link in enumerate(all_links): if i > 0 and i % 10 == 0: logger.info(f"Progress: {i}/{len(all_links)} links validated...") result = validate_link(link, client) if result.error: logger.warning(f"[{link.entry_id}] {link.platform}: {link.url} - ERROR: {result.error}") error_count += 1 elif result.is_valid: if args.verbose: logger.debug(f"[{link.entry_id}] {link.platform}: {link.url} - OK ({result.status_code})") valid_count += 1 else: logger.warning(f"[{link.entry_id}] {link.platform}: {link.url} - DEAD ({result.status_code})") dead_count += 1 if link.entry_id not in dead_links_by_entry: dead_links_by_entry[link.entry_id] = [] dead_links_by_entry[link.entry_id].append(result) # Rate limiting time.sleep(RATE_LIMIT_DELAY) # Summary logger.info("=" * 60) logger.info("VALIDATION SUMMARY") logger.info("=" * 60) logger.info(f"Total links checked: {len(all_links)}") logger.info(f"Valid: {valid_count}") logger.info(f"Dead: {dead_count}") logger.info(f"Errors: {error_count}") logger.info(f"Entries with dead links: {len(dead_links_by_entry)}") if dead_links_by_entry: logger.info("\nDead links by entry:") for entry_id, results in sorted(dead_links_by_entry.items()): logger.info(f"\n Entry {entry_id}:") for r in results: logger.info(f" - {r.link.platform} ({r.link.source}): {r.link.url}") # Remove dead links if requested if args.remove_dead and not args.dry_run and dead_links_by_entry: logger.info("\nRemoving dead links from entry files...") total_removed = 0 for entry_id, results in dead_links_by_entry.items(): entry_path = list(ENTRIES_DIR.glob(f"{entry_id}_*.yaml"))[0] removed = remove_dead_links(entry_path, results) total_removed += removed logger.info(f" {entry_id}: removed {removed} links") logger.info(f"Total links removed: {total_removed}") elif args.dry_run and dead_links_by_entry: logger.info("\n[DRY RUN] Would remove links from the above entries. Run with --remove-dead to actually remove.") if __name__ == '__main__': main()