#!/usr/bin/env python3 """ Audit web_claims for misleading or invalid values. This script identifies and flags problematic web claims such as: - Share buttons mistaken for social media profiles - Generic UI text captured as organization names - Empty or whitespace-only claims - Spam/SEO content Usage: python scripts/audit_web_claims.py [--fix] [--verbose] """ import argparse import re import sys from collections import defaultdict from pathlib import Path from typing import Dict, List, Set, Tuple import yaml ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries') # === INVALID PATTERNS === # Social share URLs (NOT actual profiles) SHARE_URL_PATTERNS = [ r'facebook\.com/sharer', r'facebook\.com/share', r'twitter\.com/share', r'twitter\.com/intent', r'linkedin\.com/shareArticle', r'linkedin\.com/share', r'wa\.me/', # WhatsApp share r'pinterest\.com/pin/create', r'reddit\.com/submit', r'tumblr\.com/share', r'getpocket\.com/', r'buffer\.com/', r'addthis\.com', r'sharethis\.com', ] # Generic UI text that should NOT be org names INVALID_ORG_NAMES = { # Navigation/buttons 'Home', 'home', 'HOME', 'Menu', 'menu', 'MENU', 'Contact', 'contact', 'CONTACT', 'Over ons', 'Over Ons', 'OVER ONS', 'Nieuws', 'nieuws', 'NIEUWS', 'Nieuw:', 'NIEUW:', 'Zoeken', 'zoeken', 'ZOEKEN', 'Archief', 'archief', 'ARCHIEF', 'Close', 'close', 'CLOSE', 'Sluiten', 'sluiten', 'Terug', 'terug', 'TERUG', 'Back', 'back', 'BACK', 'Next', 'next', 'NEXT', 'Previous', 'previous', 'PREVIOUS', 'Volgende', 'volgende', 'Vorige', 'vorige', 'Meer lezen', 'Meer Lezen', 'MEER LEZEN', 'Lees meer', 'Lees Meer', 'LEES MEER', 'Read more', 'Read More', 'READ MORE', 'Click here', 'Click Here', 'CLICK HERE', 'Klik hier', 'Klik Hier', 'KLIK HIER', 'Submit', 'submit', 'SUBMIT', 'Verstuur', 'verstuur', 'Verzenden', 'verzenden', 'Loading', 'loading', 'LOADING', 'Laden', 'laden', # Social media icons/labels 'Facebook', 'facebook', 'FACEBOOK', 'Instagram', 'instagram', 'INSTAGRAM', 'Twitter', 'twitter', 'TWITTER', 'LinkedIn', 'linkedin', 'LINKEDIN', 'YouTube', 'youtube', 'YOUTUBE', 'Pinterest', 'pinterest', 'PINTEREST', 'TikTok', 'tiktok', 'TIKTOK', # UI elements 'Chevron left', 'Chevron right', 'Chevron down', 'Chevron up', 'Arrow left', 'Arrow right', 'Arrow down', 'Arrow up', 'Eye', 'eye', 'Search', 'search', 'Share', 'share', 'SHARE', 'Delen', 'delen', 'DELEN', 'Print', 'print', 'PRINT', # Misc 'Opent in externe pagina', 'Opens in new window', 'Oproep', 'Moennik', # Common menu system } # Patterns that indicate spam/SEO content SPAM_PATTERNS = [ r'k9win', r'casino', r'เว็บ', # Thai characters (often spam) r'slot', r'betting', r'poker', r'รับ.*โบนัส', # Thai bonus text ] # Domain names that should not be org names DOMAIN_PATTERN = re.compile(r'^[a-z0-9\-]+\.[a-z]{2,}$', re.IGNORECASE) # Too short to be meaningful MIN_ORG_NAME_LENGTH = 3 # Too long (likely a sentence/description) MAX_ORG_NAME_LENGTH = 150 def is_share_url(url: str) -> bool: """Check if URL is a share button, not a profile.""" for pattern in SHARE_URL_PATTERNS: if re.search(pattern, url, re.IGNORECASE): return True return False def is_invalid_org_name(name: str) -> Tuple[bool, str]: """Check if org_name value is invalid. Returns (is_invalid, reason).""" if not name or not name.strip(): return True, "empty" name_stripped = name.strip() # Check exact matches if name_stripped in INVALID_ORG_NAMES: return True, f"generic_ui_text: {name_stripped}" # Check if it's just a domain name if DOMAIN_PATTERN.match(name_stripped): return True, f"domain_name: {name_stripped}" # Check length if len(name_stripped) < MIN_ORG_NAME_LENGTH: return True, f"too_short: {len(name_stripped)} chars" if len(name_stripped) > MAX_ORG_NAME_LENGTH: return True, f"too_long: {len(name_stripped)} chars" # Check for spam patterns for pattern in SPAM_PATTERNS: if re.search(pattern, name_stripped, re.IGNORECASE): return True, f"spam_pattern: {pattern}" # Check if it starts with common garbage garbage_prefixes = [ '//', 'http', 'www.', '