glam/scripts/validate_social_media_links.py
2025-12-02 14:36:01 +01:00

496 lines
19 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Validate social media links in NDE enriched entries.
This script checks if social media profiles (Twitter/X, Facebook, Instagram,
LinkedIn, YouTube) actually exist by making HTTP HEAD requests.
Usage:
python scripts/validate_social_media_links.py [--dry-run] [--remove-dead] [--limit N]
Options:
--dry-run Only report dead links, don't modify files
--remove-dead Remove dead links from entry files
--limit N Process only first N entries (for testing)
--entry ENTRY Process specific entry (e.g., 0615)
"""
import argparse
import logging
import time
import re
from pathlib import Path
from typing import Optional
import httpx
import yaml
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
# Module-wide logging: timestamped INFO-level messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Constants
ENTRIES_DIR = Path(__file__).parent.parent / "data/nde/enriched/entries"  # enriched NDE entry YAML files
REQUEST_TIMEOUT = 10.0  # per-HTTP-request timeout, seconds
RATE_LIMIT_DELAY = 0.5 # seconds between requests to same domain
MAX_WORKERS = 5 # concurrent validation threads  # NOTE(review): appears unused — main() validates sequentially; confirm before removing
@dataclass
class SocialMediaLink:
    """Represents a social media link to validate."""
    entry_id: str  # entry ID taken from the filename prefix (e.g. "0615")
    platform: str  # one of: 'twitter', 'facebook', 'instagram', 'linkedin', 'youtube'
    url: str  # full profile URL to check
    source: str # 'wikidata' or 'web_claims'
    claim_index: Optional[int] = None # index in web_claims list if applicable
@dataclass
class ValidationResult:
    """Result of validating a social media link.

    When ``error`` is set the check was indeterminate (bot protection,
    timeout, transport failure); callers treat that separately from a
    confirmed-dead link (``is_valid`` False with no error).
    """
    link: SocialMediaLink
    is_valid: bool  # False only on explicit 404 / dead-content confirmation
    status_code: Optional[int] = None  # HTTP status of the HEAD request, if any
    error: Optional[str] = None  # reason the check could not be completed
    redirect_url: Optional[str] = None  # final URL when the request was redirected
def normalize_twitter_url(url_or_username: str) -> str:
    """Return a full x.com URL for a Twitter/X username or URL.

    Bare handles (with or without a leading '@') become
    ``https://x.com/<handle>``; values that are already URLs are kept
    as-is except that any 'twitter.com' text is rewritten to 'x.com'.
    """
    is_already_url = url_or_username.startswith(('http://', 'https://'))
    if not is_already_url:
        # Bare username: strip any leading '@' and build the profile URL.
        handle = url_or_username.lstrip('@')
        return f"https://x.com/{handle}"
    # Normalize the legacy domain to the current one.
    return url_or_username.replace('twitter.com', 'x.com')
def extract_social_links(entry_path: Path) -> list[SocialMediaLink]:
    """Extract all social media links from an entry YAML file.

    Links come from two sections of the entry:
      * ``wikidata_enrichment`` — Wikidata properties P2002 (Twitter/X),
        P2013 (Facebook), P2003 (Instagram), P4264 (LinkedIn company),
        P2397 (YouTube channel).
      * ``web_claims`` — either a dict with a ``claims`` list or a bare
        list of claim dicts whose ``claim_type`` is ``social_<platform>``.

    Args:
        entry_path: path to the entry YAML file; the entry ID is the
            filename prefix before the first underscore.

    Returns:
        A (possibly empty) list of SocialMediaLink records.
    """
    # Wikidata property key -> (platform, URL template). Twitter has no
    # template because its value may already be a URL and is normalized.
    wikidata_props: dict[str, tuple[str, Optional[str]]] = {
        'P2002_x__twitter__username': ('twitter', None),
        'P2013_facebook_id': ('facebook', 'https://www.facebook.com/{}'),
        'P2003_instagram_username': ('instagram', 'https://www.instagram.com/{}/'),
        'P4264_linkedin_company_id': ('linkedin', 'https://www.linkedin.com/company/{}'),
        'P2397_youtube_channel_id': ('youtube', 'https://www.youtube.com/channel/{}'),
    }
    # web_claims claim_type -> platform name.
    claim_platforms = {
        'social_twitter': 'twitter',
        'social_facebook': 'facebook',
        'social_instagram': 'instagram',
        'social_linkedin': 'linkedin',
        'social_youtube': 'youtube',
    }

    links: list[SocialMediaLink] = []
    entry_id = entry_path.stem.split('_')[0]
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)
    if not entry:
        return links

    # Check Wikidata enrichment for social media identifiers.
    wikidata = entry.get('wikidata_enrichment', {}) or {}
    for prop_key, (platform, template) in wikidata_props.items():
        prop_data = wikidata.get(prop_key) or {}
        value = prop_data.get('value')
        if not value:
            continue
        if platform == 'twitter':
            url = normalize_twitter_url(value)
        else:
            url = template.format(value)
        links.append(SocialMediaLink(
            entry_id=entry_id,
            platform=platform,
            url=url,
            source='wikidata'
        ))

    # Check web_claims for social media links.
    # web_claims is usually a dict with a 'claims' list inside, but may
    # also be a bare list of claims.
    web_claims_data = entry.get('web_claims', {})
    if isinstance(web_claims_data, dict):
        web_claims = web_claims_data.get('claims', [])
    else:
        web_claims = web_claims_data if isinstance(web_claims_data, list) else []
    for idx, claim in enumerate(web_claims):
        platform = claim_platforms.get(claim.get('claim_type', ''))
        claim_value = claim.get('claim_value', '')
        if not platform or not claim_value:
            continue
        # Twitter claims may be a username or a URL; other platforms
        # store the URL directly.
        url = normalize_twitter_url(claim_value) if platform == 'twitter' else claim_value
        links.append(SocialMediaLink(
            entry_id=entry_id,
            platform=platform,
            url=url,
            source='web_claims',
            claim_index=idx
        ))
    return links
def validate_link(link: SocialMediaLink, client: httpx.Client) -> ValidationResult:
    """Validate a single social media link by checking if profile exists.
    IMPORTANT: Social media platforms have aggressive bot protection.
    We only mark links as DEAD if we get explicit 404 or content confirmation.
    Other error codes (400, 403) are treated as "indeterminate" (assume valid).

    Args:
        link: the social media link to check.
        client: shared httpx client carrying browser-like headers.

    Returns:
        ValidationResult. When ``error`` is set (bot protection, timeout,
        transport error) the caller counts the link as an error rather
        than dead, even though ``is_valid`` may be False in those results.
    """
    try:
        # Use HEAD request first, fall back to GET if needed
        response = client.head(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
        # Default: assume valid unless proven otherwise
        is_valid = True
        # Twitter/X specific checks
        if link.platform == 'twitter':
            # X returns 403 for bot protection - can't determine status
            if response.status_code == 403:
                return ValidationResult(
                    link=link,
                    is_valid=True, # Assume valid - need browser to verify
                    status_code=403,
                    error="Twitter/X bot protection - cannot validate via HTTP"
                )
            elif response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                # Need to do a GET to check content
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                # Check if redirected to login or account suspended
                final_url = str(get_response.url)
                if '/login' in final_url or 'account/suspended' in final_url:
                    is_valid = False
                # Check for "This account doesn't exist" in response
                # (apostrophe variant tolerated by matching the prefix only)
                if 'This account doesn' in get_response.text or "doesn't exist" in get_response.text.lower():
                    is_valid = False
        # Facebook specific checks
        elif link.platform == 'facebook':
            # Facebook returns 400 for bot protection - can't determine status
            if response.status_code == 400:
                return ValidationResult(
                    link=link,
                    is_valid=True, # Assume valid - Facebook blocks HEAD requests
                    status_code=400,
                    error="Facebook bot protection - cannot validate via HTTP"
                )
            elif response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                # Check for "Page Not Found" or redirect to login
                if 'page not found' in get_response.text.lower():
                    is_valid = False
                if 'this content isn\'t available' in get_response.text.lower():
                    is_valid = False
        # Instagram specific checks
        elif link.platform == 'instagram':
            if response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                if "Sorry, this page isn't available" in get_response.text:
                    is_valid = False
        # LinkedIn specific checks
        elif link.platform == 'linkedin':
            if response.status_code == 404:
                is_valid = False
            elif response.status_code == 999:
                # LinkedIn returns 999 for bot detection, can't validate
                return ValidationResult(
                    link=link,
                    is_valid=True, # Assume valid since we can't check
                    status_code=999,
                    error="LinkedIn bot detection - cannot validate"
                )
        # YouTube specific checks
        elif link.platform == 'youtube':
            if response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                if 'This channel does not exist' in get_response.text:
                    is_valid = False
        # Only mark as dead on explicit 404 - other errors are indeterminate
        # Don't treat 400/403/5xx as dead - could be bot protection
        # (also covers platforms not matched by any branch above)
        if response.status_code == 404:
            is_valid = False
        return ValidationResult(
            link=link,
            is_valid=is_valid,
            status_code=response.status_code,
            # Record the final URL only when a redirect actually moved us.
            redirect_url=str(response.url) if str(response.url) != link.url else None
        )
    except httpx.TimeoutException:
        # `error` is set, so the caller logs this as ERROR, not DEAD.
        return ValidationResult(
            link=link,
            is_valid=False,
            error="Timeout"
        )
    except httpx.HTTPError as e:
        # Transport-level failure (DNS, connection reset, etc.).
        return ValidationResult(
            link=link,
            is_valid=False,
            error=str(e)
        )
    except Exception as e:
        # Catch-all so one bad link never aborts the whole run.
        return ValidationResult(
            link=link,
            is_valid=False,
            error=f"Unexpected error: {e}"
        )
def remove_dead_links(entry_path: Path, dead_links: list[ValidationResult]) -> int:
    """Remove dead links from an entry file. Returns count of removed links.

    Handles both shapes of ``web_claims`` (a dict wrapping a ``claims``
    list, or a bare list) so that the ``claim_index`` values recorded by
    ``extract_social_links`` — which indexes the inner ``claims`` list —
    are applied to the same list here. The previous version indexed
    ``entry['web_claims']`` directly and broke on the dict shape.

    The file is rewritten only if at least one link was removed.
    """
    # Map platform to the Wikidata property key stored in the entry.
    prop_keys = {
        'twitter': 'P2002_x__twitter__username',
        'facebook': 'P2013_facebook_id',
        'instagram': 'P2003_instagram_username',
        'linkedin': 'P4264_linkedin_company_id',
        'youtube': 'P2397_youtube_channel_id'
    }
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)
    if not entry:
        return 0
    removed_count = 0
    # Group dead links by source
    wikidata_dead = [r.link for r in dead_links if r.link.source == 'wikidata']
    web_claims_dead = [r.link for r in dead_links if r.link.source == 'web_claims']
    # Remove from wikidata_enrichment
    if wikidata_dead and 'wikidata_enrichment' in entry:
        wikidata = entry['wikidata_enrichment']
        for link in wikidata_dead:
            prop_key = prop_keys.get(link.platform)
            if prop_key and prop_key in wikidata:
                del wikidata[prop_key]
                removed_count += 1
                logger.info(f"  Removed {prop_key} from wikidata_enrichment")
    # Remove from web_claims (work backwards to preserve indices)
    if web_claims_dead and 'web_claims' in entry:
        container = entry['web_claims']
        # Mirror the shape handling in extract_social_links: claim_index
        # refers to the inner 'claims' list when web_claims is a dict.
        if isinstance(container, dict):
            claims = container.get('claims', [])
        else:
            claims = container if isinstance(container, list) else []
        # De-duplicate indices so two results for the same claim cannot
        # delete an unrelated neighboring claim.
        indices_to_remove = sorted(
            {l.claim_index for l in web_claims_dead if l.claim_index is not None},
            reverse=True
        )
        for idx in indices_to_remove:
            if idx < len(claims):
                claim = claims[idx]
                logger.info(f"  Removed web_claim[{idx}]: {claim.get('claim_type')} = {claim.get('claim_value')}")
                del claims[idx]
                removed_count += 1
    # Write back if changes were made
    if removed_count > 0:
        with open(entry_path, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    return removed_count
def main():
    """CLI entry point.

    Scans entry YAML files, extracts social media links, validates them
    over HTTP, prints a summary, and — with --remove-dead (and without
    --dry-run) — prunes confirmed-dead links from the entry files.
    """
    parser = argparse.ArgumentParser(description="Validate social media links in NDE entries")
    parser.add_argument('--dry-run', action='store_true', help="Only report dead links, don't modify files")
    parser.add_argument('--remove-dead', action='store_true', help="Remove dead links from entry files")
    parser.add_argument('--limit', type=int, help="Process only first N entries")
    parser.add_argument('--entry', type=str, help="Process specific entry (e.g., 0615)")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")
    args = parser.parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Find entry files: one entry by ID prefix, or every YAML entry.
    if args.entry:
        entry_files = list(ENTRIES_DIR.glob(f"{args.entry}_*.yaml"))
        if not entry_files:
            logger.error(f"No entry found matching {args.entry}")
            return
    else:
        entry_files = sorted(ENTRIES_DIR.glob("*.yaml"))
    if args.limit:
        entry_files = entry_files[:args.limit]
    logger.info(f"Processing {len(entry_files)} entry files...")

    # Collect all social media links
    all_links: list[SocialMediaLink] = []
    for entry_path in entry_files:
        all_links.extend(extract_social_links(entry_path))
    logger.info(f"Found {len(all_links)} social media links to validate")
    if not all_links:
        logger.info("No social media links found.")
        return

    # Count by platform
    platform_counts: dict[str, int] = {}
    for link in all_links:
        platform_counts[link.platform] = platform_counts.get(link.platform, 0) + 1
    logger.info(f"By platform: {platform_counts}")

    # Validate links sequentially through one shared client.
    dead_links_by_entry: dict[str, list[ValidationResult]] = {}
    valid_count = 0
    dead_count = 0
    error_count = 0
    # Use httpx client with browser-like headers to reduce bot blocking.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    with httpx.Client(headers=headers, follow_redirects=True) as client:
        for i, link in enumerate(all_links):
            if i > 0 and i % 10 == 0:
                logger.info(f"Progress: {i}/{len(all_links)} links validated...")
            result = validate_link(link, client)
            if result.error:
                # Indeterminate checks (bot protection, timeouts, transport
                # errors) are counted separately and never removed.
                logger.warning(f"[{link.entry_id}] {link.platform}: {link.url} - ERROR: {result.error}")
                error_count += 1
            elif result.is_valid:
                if args.verbose:
                    logger.debug(f"[{link.entry_id}] {link.platform}: {link.url} - OK ({result.status_code})")
                valid_count += 1
            else:
                logger.warning(f"[{link.entry_id}] {link.platform}: {link.url} - DEAD ({result.status_code})")
                dead_count += 1
                dead_links_by_entry.setdefault(link.entry_id, []).append(result)
            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)

    # Summary
    logger.info("=" * 60)
    logger.info("VALIDATION SUMMARY")
    logger.info("=" * 60)
    logger.info(f"Total links checked: {len(all_links)}")
    logger.info(f"Valid: {valid_count}")
    logger.info(f"Dead: {dead_count}")
    logger.info(f"Errors: {error_count}")
    logger.info(f"Entries with dead links: {len(dead_links_by_entry)}")
    if dead_links_by_entry:
        logger.info("\nDead links by entry:")
        for entry_id, results in sorted(dead_links_by_entry.items()):
            logger.info(f"\n  Entry {entry_id}:")
            for r in results:
                logger.info(f"    - {r.link.platform} ({r.link.source}): {r.link.url}")

    # Remove dead links if requested
    if args.remove_dead and not args.dry_run and dead_links_by_entry:
        logger.info("\nRemoving dead links from entry files...")
        total_removed = 0
        for entry_id, results in dead_links_by_entry.items():
            # Guard against the entry file disappearing between the scan
            # and removal — an unconditional [0] would raise IndexError.
            matches = list(ENTRIES_DIR.glob(f"{entry_id}_*.yaml"))
            if not matches:
                logger.error(f"  {entry_id}: entry file not found, skipping")
                continue
            removed = remove_dead_links(matches[0], results)
            total_removed += removed
            logger.info(f"  {entry_id}: removed {removed} links")
        logger.info(f"Total links removed: {total_removed}")
    elif args.dry_run and dead_links_by_entry:
        logger.info("\n[DRY RUN] Would remove links from the above entries. Run with --remove-dead to actually remove.")


if __name__ == '__main__':
    main()