496 lines
19 KiB
Python
Executable file
496 lines
19 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Validate social media links in NDE enriched entries.
|
|
|
|
This script checks if social media profiles (Twitter/X, Facebook, Instagram,
|
|
LinkedIn, YouTube) actually exist by making HTTP HEAD requests.
|
|
|
|
Usage:
|
|
python scripts/validate_social_media_links.py [--dry-run] [--remove-dead] [--limit N]
|
|
|
|
Options:
|
|
--dry-run Only report dead links, don't modify files
|
|
--remove-dead Remove dead links from entry files
|
|
--limit N Process only first N entries (for testing)
|
|
--entry ENTRY Process specific entry (e.g., 0615)
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import time
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import httpx
|
|
import yaml
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from dataclasses import dataclass, field
|
|
|
|
# Configure root logging once at import; main() raises the level to DEBUG for --verbose.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

logger = logging.getLogger(__name__)

# Constants
# Directory containing the enriched NDE entry YAML files, resolved relative to this script
# (scripts/ -> repo root -> data/nde/enriched/entries).
ENTRIES_DIR = Path(__file__).parent.parent / "data/nde/enriched/entries"

# Per-request timeout (seconds) applied to every HEAD/GET validation call.
REQUEST_TIMEOUT = 10.0

RATE_LIMIT_DELAY = 0.5  # seconds between requests to same domain

# NOTE(review): MAX_WORKERS is currently unused — validation runs sequentially in main();
# ThreadPoolExecutor is imported but never instantiated.
MAX_WORKERS = 5  # concurrent validation threads
|
|
|
|
@dataclass
class SocialMediaLink:
    """Represents a social media link to validate."""

    # Entry identifier, e.g. "0615" — the filename stem up to the first '_'.
    entry_id: str

    # One of: 'twitter', 'facebook', 'instagram', 'linkedin', 'youtube'.
    platform: str

    # Full profile URL to check via HTTP.
    url: str

    source: str  # 'wikidata' or 'web_claims'

    claim_index: Optional[int] = None  # index in web_claims list if applicable
|
|
|
|
|
@dataclass
class ValidationResult:
    """Result of validating a social media link."""

    # The link that was checked.
    link: SocialMediaLink

    # True when the profile is assumed to exist. NOTE: bot-protected responses
    # (403/400/999) are reported as valid with `error` set, per validate_link().
    is_valid: bool

    # HTTP status of the initial HEAD request; None when the request itself failed.
    status_code: Optional[int] = None

    # Human-readable failure/indeterminacy description, if any.
    error: Optional[str] = None

    # Final URL after redirects, only when it differs from the requested URL.
    redirect_url: Optional[str] = None
|
|
|
|
|
def normalize_twitter_url(url_or_username: str) -> str:
    """Convert a Twitter handle or URL into a canonical x.com profile URL.

    A bare handle (with or without a leading '@') becomes
    ``https://x.com/<handle>``; an existing URL has its legacy
    ``twitter.com`` host rewritten to ``x.com``.
    """
    if not url_or_username.startswith(('http://', 'https://')):
        # Bare username, possibly '@'-prefixed.
        handle = url_or_username.lstrip('@')
        return f"https://x.com/{handle}"
    # Already a URL — normalize the legacy domain.
    return url_or_username.replace('twitter.com', 'x.com')
|
|
|
|
|
# Wikidata social-media properties, in emission order:
# (platform, property key in wikidata_enrichment, URL builder for the raw value).
_WIKIDATA_SOCIAL_PROPS = [
    ('twitter', 'P2002_x__twitter__username', normalize_twitter_url),
    ('facebook', 'P2013_facebook_id', lambda v: f"https://www.facebook.com/{v}"),
    ('instagram', 'P2003_instagram_username', lambda v: f"https://www.instagram.com/{v}/"),
    ('linkedin', 'P4264_linkedin_company_id', lambda v: f"https://www.linkedin.com/company/{v}"),
    ('youtube', 'P2397_youtube_channel_id', lambda v: f"https://www.youtube.com/channel/{v}"),
]

# web_claims claim_type -> (platform, URL builder). Only Twitter values are
# normalized; the other claim types already carry full URLs.
_CLAIM_SOCIAL_TYPES = {
    'social_twitter': ('twitter', normalize_twitter_url),
    'social_facebook': ('facebook', lambda v: v),
    'social_instagram': ('instagram', lambda v: v),
    'social_linkedin': ('linkedin', lambda v: v),
    'social_youtube': ('youtube', lambda v: v),
}


def extract_social_links(entry_path: Path) -> list[SocialMediaLink]:
    """Extract all social media links from an entry file.

    Links come from two sources in the entry YAML:
    - ``wikidata_enrichment``: known social-media property IDs
      (P2002 Twitter, P2013 Facebook, P2003 Instagram, P4264 LinkedIn,
      P2397 YouTube), each stored as ``{prop_key: {'value': ...}}``.
    - ``web_claims``: claims whose ``claim_type`` is one of the
      ``social_*`` types; ``claim_index`` records their position so
      dead ones can be removed later.

    Args:
        entry_path: Path to the entry YAML file; the entry id is the
            filename stem up to the first underscore.

    Returns:
        List of SocialMediaLink, possibly empty (also when the file
        parses to a falsy document).
    """
    links: list[SocialMediaLink] = []
    entry_id = entry_path.stem.split('_')[0]

    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    if not entry:
        return links

    # Table-driven replacement for five near-identical per-platform stanzas.
    # `or {}` keeps the original skip-on-falsy semantics for missing/None data.
    wikidata = entry.get('wikidata_enrichment', {}) or {}
    for platform, prop_key, build_url in _WIKIDATA_SOCIAL_PROPS:
        value = (wikidata.get(prop_key) or {}).get('value')
        if value:
            links.append(SocialMediaLink(
                entry_id=entry_id,
                platform=platform,
                url=build_url(value),
                source='wikidata',
            ))

    # web_claims is either a dict with a 'claims' list inside, or a bare list.
    web_claims_data = entry.get('web_claims', {})
    if isinstance(web_claims_data, dict):
        web_claims = web_claims_data.get('claims', [])
    else:
        web_claims = web_claims_data if isinstance(web_claims_data, list) else []

    for idx, claim in enumerate(web_claims):
        claim_type = claim.get('claim_type', '')
        claim_value = claim.get('claim_value', '')
        if not claim_value or claim_type not in _CLAIM_SOCIAL_TYPES:
            continue
        platform, build_url = _CLAIM_SOCIAL_TYPES[claim_type]
        links.append(SocialMediaLink(
            entry_id=entry_id,
            platform=platform,
            url=build_url(claim_value),
            source='web_claims',
            claim_index=idx,
        ))

    return links
|
|
|
|
|
|
def validate_link(link: SocialMediaLink, client: httpx.Client) -> ValidationResult:
    """Validate a single social media link by checking if profile exists.

    IMPORTANT: Social media platforms have aggressive bot protection.
    We only mark links as DEAD if we get explicit 404 or content confirmation.
    Other error codes (400, 403) are treated as "indeterminate" (assume valid).

    Strategy: issue a HEAD request first; on a 200 for content-sensitive
    platforms, follow up with a GET and scan the body for the platform's
    "not found" sentinel text. Bot-protection status codes (X: 403,
    Facebook: 400, LinkedIn: 999) short-circuit as valid with `error` set.

    Args:
        link: The link to check.
        client: Shared httpx client (carries browser-like headers).

    Returns:
        ValidationResult. On transport failure (timeout/HTTP error),
        is_valid is False but `error` is set — main() counts those as
        errors, not dead links, so they are never removed.
    """

    try:
        # Use HEAD request first, fall back to GET if needed
        response = client.head(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)

        # Default: assume valid unless proven otherwise
        is_valid = True

        # Twitter/X specific checks
        if link.platform == 'twitter':
            # X returns 403 for bot protection - can't determine status
            if response.status_code == 403:
                return ValidationResult(
                    link=link,
                    is_valid=True,  # Assume valid - need browser to verify
                    status_code=403,
                    error="Twitter/X bot protection - cannot validate via HTTP"
                )
            elif response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                # Need to do a GET to check content
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                # Check if redirected to login or account suspended
                final_url = str(get_response.url)
                if '/login' in final_url or 'account/suspended' in final_url:
                    is_valid = False
                # Check for "This account doesn't exist" in response
                # (truncated sentinel avoids the apostrophe-encoding variant)
                if 'This account doesn' in get_response.text or "doesn't exist" in get_response.text.lower():
                    is_valid = False

        # Facebook specific checks
        elif link.platform == 'facebook':
            # Facebook returns 400 for bot protection - can't determine status
            if response.status_code == 400:
                return ValidationResult(
                    link=link,
                    is_valid=True,  # Assume valid - Facebook blocks HEAD requests
                    status_code=400,
                    error="Facebook bot protection - cannot validate via HTTP"
                )
            elif response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                # Check for "Page Not Found" or redirect to login
                if 'page not found' in get_response.text.lower():
                    is_valid = False
                if 'this content isn\'t available' in get_response.text.lower():
                    is_valid = False

        # Instagram specific checks
        elif link.platform == 'instagram':
            if response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                if "Sorry, this page isn't available" in get_response.text:
                    is_valid = False

        # LinkedIn specific checks
        elif link.platform == 'linkedin':
            if response.status_code == 404:
                is_valid = False
            elif response.status_code == 999:
                # LinkedIn returns 999 for bot detection, can't validate
                return ValidationResult(
                    link=link,
                    is_valid=True,  # Assume valid since we can't check
                    status_code=999,
                    error="LinkedIn bot detection - cannot validate"
                )

        # YouTube specific checks
        elif link.platform == 'youtube':
            if response.status_code == 404:
                is_valid = False
            elif response.status_code == 200:
                get_response = client.get(link.url, follow_redirects=True, timeout=REQUEST_TIMEOUT)
                if 'This channel does not exist' in get_response.text:
                    is_valid = False

        # Only mark as dead on explicit 404 - other errors are indeterminate
        # Don't treat 400/403/5xx as dead - could be bot protection
        # (Catch-all for any platform branch that didn't set it above.)
        if response.status_code == 404:
            is_valid = False

        return ValidationResult(
            link=link,
            is_valid=is_valid,
            status_code=response.status_code,
            # Record final URL only when a redirect actually moved us.
            redirect_url=str(response.url) if str(response.url) != link.url else None
        )

    except httpx.TimeoutException:
        return ValidationResult(
            link=link,
            is_valid=False,
            error="Timeout"
        )
    except httpx.HTTPError as e:
        # Transport-level failure (DNS, connection reset, etc.).
        return ValidationResult(
            link=link,
            is_valid=False,
            error=str(e)
        )
    except Exception as e:
        # Defensive catch-all so one bad link never aborts the whole run.
        return ValidationResult(
            link=link,
            is_valid=False,
            error=f"Unexpected error: {e}"
        )
|
|
|
|
|
|
def remove_dead_links(entry_path: Path, dead_links: list[ValidationResult]) -> int:
    """Remove dead links from an entry file. Returns count of removed links.

    Deletes the Wikidata property for each dead wikidata-sourced link and
    deletes dead web_claims entries by their recorded claim_index. The file
    is rewritten only if something was actually removed.

    Args:
        entry_path: Entry YAML file to edit in place.
        dead_links: Validation results whose links should be purged.

    Returns:
        Number of links removed from the file.
    """

    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    if not entry:
        return 0

    removed_count = 0

    # Group dead links by source
    wikidata_dead = [r.link for r in dead_links if r.link.source == 'wikidata']
    web_claims_dead = [r.link for r in dead_links if r.link.source == 'web_claims']

    # Map platform to Wikidata property key (loop-invariant, hoisted;
    # must stay in sync with extract_social_links).
    prop_keys = {
        'twitter': 'P2002_x__twitter__username',
        'facebook': 'P2013_facebook_id',
        'instagram': 'P2003_instagram_username',
        'linkedin': 'P4264_linkedin_company_id',
        'youtube': 'P2397_youtube_channel_id'
    }

    # Remove from wikidata_enrichment
    if wikidata_dead and 'wikidata_enrichment' in entry:
        wikidata = entry['wikidata_enrichment']
        for link in wikidata_dead:
            prop_key = prop_keys.get(link.platform)
            if prop_key and prop_key in wikidata:
                del wikidata[prop_key]
                removed_count += 1
                logger.info(f"  Removed {prop_key} from wikidata_enrichment")

    # Remove from web_claims (work backwards to preserve indices)
    if web_claims_dead and 'web_claims' in entry:
        # BUG FIX: web_claims may be a dict wrapping a 'claims' list — the form
        # extract_social_links computed claim_index against. Resolve the actual
        # claims list so deletion targets the same container the indices refer
        # to; previously the dict itself was indexed, so nothing was removed
        # (or a KeyError was raised) for dict-shaped entries.
        container = entry['web_claims']
        if isinstance(container, dict):
            claims = container.get('claims', [])
        else:
            claims = container if isinstance(container, list) else []

        indices_to_remove = sorted(
            [l.claim_index for l in web_claims_dead if l.claim_index is not None],
            reverse=True
        )
        for idx in indices_to_remove:
            if idx < len(claims):
                claim = claims[idx]
                logger.info(f"  Removed web_claim[{idx}]: {claim.get('claim_type')} = {claim.get('claim_value')}")
                del claims[idx]
                removed_count += 1

    # Write back if changes were made
    if removed_count > 0:
        with open(entry_path, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return removed_count
|
|
|
|
|
|
def main():
    """CLI entry point: collect, validate, and optionally purge social links."""
    parser = argparse.ArgumentParser(description="Validate social media links in NDE entries")
    parser.add_argument('--dry-run', action='store_true', help="Only report dead links, don't modify files")
    parser.add_argument('--remove-dead', action='store_true', help="Remove dead links from entry files")
    parser.add_argument('--limit', type=int, help="Process only first N entries")
    parser.add_argument('--entry', type=str, help="Process specific entry (e.g., 0615)")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Find entry files
    if args.entry:
        # Entry files are named "<id>_<slug>.yaml", so match on the id prefix.
        entry_files = list(ENTRIES_DIR.glob(f"{args.entry}_*.yaml"))
        if not entry_files:
            logger.error(f"No entry found matching {args.entry}")
            return
    else:
        entry_files = sorted(ENTRIES_DIR.glob("*.yaml"))
        if args.limit:
            entry_files = entry_files[:args.limit]

    logger.info(f"Processing {len(entry_files)} entry files...")

    # Collect all social media links
    all_links: list[SocialMediaLink] = []
    for entry_path in entry_files:
        links = extract_social_links(entry_path)
        all_links.extend(links)

    logger.info(f"Found {len(all_links)} social media links to validate")

    if not all_links:
        logger.info("No social media links found.")
        return

    # Count by platform
    platform_counts = {}
    for link in all_links:
        platform_counts[link.platform] = platform_counts.get(link.platform, 0) + 1
    logger.info(f"By platform: {platform_counts}")

    # Validate links
    dead_links_by_entry: dict[str, list[ValidationResult]] = {}
    valid_count = 0
    dead_count = 0
    error_count = 0

    # Use httpx client with browser-like headers
    # (reduces — but does not eliminate — bot-protection rejections)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    # NOTE(review): validation is sequential despite MAX_WORKERS / the
    # ThreadPoolExecutor import at the top of the file — confirm intent.
    with httpx.Client(headers=headers, follow_redirects=True) as client:
        for i, link in enumerate(all_links):
            if i > 0 and i % 10 == 0:
                logger.info(f"Progress: {i}/{len(all_links)} links validated...")

            result = validate_link(link, client)

            # Tri-state outcome: error (indeterminate — never removed),
            # valid, or dead (queued for removal).
            if result.error:
                logger.warning(f"[{link.entry_id}] {link.platform}: {link.url} - ERROR: {result.error}")
                error_count += 1
            elif result.is_valid:
                if args.verbose:
                    logger.debug(f"[{link.entry_id}] {link.platform}: {link.url} - OK ({result.status_code})")
                valid_count += 1
            else:
                logger.warning(f"[{link.entry_id}] {link.platform}: {link.url} - DEAD ({result.status_code})")
                dead_count += 1

                if link.entry_id not in dead_links_by_entry:
                    dead_links_by_entry[link.entry_id] = []
                dead_links_by_entry[link.entry_id].append(result)

            # Rate limiting
            # (applied uniformly between requests, regardless of target domain)
            time.sleep(RATE_LIMIT_DELAY)

    # Summary
    logger.info("=" * 60)
    logger.info("VALIDATION SUMMARY")
    logger.info("=" * 60)
    logger.info(f"Total links checked: {len(all_links)}")
    logger.info(f"Valid: {valid_count}")
    logger.info(f"Dead: {dead_count}")
    logger.info(f"Errors: {error_count}")
    logger.info(f"Entries with dead links: {len(dead_links_by_entry)}")

    if dead_links_by_entry:
        logger.info("\nDead links by entry:")
        for entry_id, results in sorted(dead_links_by_entry.items()):
            logger.info(f"\n  Entry {entry_id}:")
            for r in results:
                logger.info(f"    - {r.link.platform} ({r.link.source}): {r.link.url}")

    # Remove dead links if requested
    if args.remove_dead and not args.dry_run and dead_links_by_entry:
        logger.info("\nRemoving dead links from entry files...")
        total_removed = 0
        for entry_id, results in dead_links_by_entry.items():
            # NOTE(review): assumes the glob still matches exactly the file
            # seen earlier in this run — [0] raises IndexError if it vanished.
            entry_path = list(ENTRIES_DIR.glob(f"{entry_id}_*.yaml"))[0]
            removed = remove_dead_links(entry_path, results)
            total_removed += removed
            logger.info(f"  {entry_id}: removed {removed} links")
        logger.info(f"Total links removed: {total_removed}")
    elif args.dry_run and dead_links_by_entry:
        logger.info("\n[DRY RUN] Would remove links from the above entries. Run with --remove-dead to actually remove.")
|