#!/usr/bin/env python3
"""
Audit web_claims for misleading or invalid values.

This script identifies and flags problematic web claims such as:
- Share buttons mistaken for social media profiles
- Generic UI text captured as organization names
- Empty or whitespace-only claims
- Spam/SEO content

Usage:
    python scripts/audit_web_claims.py [--fix] [--verbose]
"""
import argparse
import re
import sys
from collections import defaultdict
from datetime import date
from pathlib import Path
from typing import Dict, List, Set, Tuple

import yaml
# Directory of enriched entry YAML files to audit.
# NOTE(review): hard-coded, user-specific absolute path — consider making this
# configurable (CLI flag or environment variable).
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')

# === INVALID PATTERNS ===

# Social share URLs (NOT actual profiles). Matched case-insensitively as
# regex fragments anywhere in the claim URL.
SHARE_URL_PATTERNS = [
    r'facebook\.com/sharer',
    r'facebook\.com/share',
    r'twitter\.com/share',
    r'twitter\.com/intent',
    r'linkedin\.com/shareArticle',
    r'linkedin\.com/share',
    r'wa\.me/',  # WhatsApp share
    r'pinterest\.com/pin/create',
    r'reddit\.com/submit',
    r'tumblr\.com/share',
    r'getpocket\.com/',
    r'buffer\.com/',
    r'addthis\.com',
    r'sharethis\.com',
]

# Generic UI text that should NOT be org names.
# Matched by exact string comparison, hence the case variants
# (mixed Dutch/English site chrome).
INVALID_ORG_NAMES = {
    # Navigation/buttons
    'Home', 'home', 'HOME',
    'Menu', 'menu', 'MENU',
    'Contact', 'contact', 'CONTACT',
    'Over ons', 'Over Ons', 'OVER ONS',
    'Nieuws', 'nieuws', 'NIEUWS',
    'Nieuw:', 'NIEUW:',
    'Zoeken', 'zoeken', 'ZOEKEN',
    'Archief', 'archief', 'ARCHIEF',
    'Close', 'close', 'CLOSE',
    'Sluiten', 'sluiten',
    'Terug', 'terug', 'TERUG',
    'Back', 'back', 'BACK',
    'Next', 'next', 'NEXT',
    'Previous', 'previous', 'PREVIOUS',
    'Volgende', 'volgende',
    'Vorige', 'vorige',
    'Meer lezen', 'Meer Lezen', 'MEER LEZEN',
    'Lees meer', 'Lees Meer', 'LEES MEER',
    'Read more', 'Read More', 'READ MORE',
    'Click here', 'Click Here', 'CLICK HERE',
    'Klik hier', 'Klik Hier', 'KLIK HIER',
    'Submit', 'submit', 'SUBMIT',
    'Verstuur', 'verstuur',
    'Verzenden', 'verzenden',
    'Loading', 'loading', 'LOADING',
    'Laden', 'laden',

    # Social media icons/labels
    'Facebook', 'facebook', 'FACEBOOK',
    'Instagram', 'instagram', 'INSTAGRAM',
    'Twitter', 'twitter', 'TWITTER',
    'LinkedIn', 'linkedin', 'LINKEDIN',
    'YouTube', 'youtube', 'YOUTUBE',
    'Pinterest', 'pinterest', 'PINTEREST',
    'TikTok', 'tiktok', 'TIKTOK',

    # UI elements (icon alt-text captured as a "name")
    'Chevron left', 'Chevron right', 'Chevron down', 'Chevron up',
    'Arrow left', 'Arrow right', 'Arrow down', 'Arrow up',
    'Eye', 'eye',
    'Search', 'search',
    'Share', 'share', 'SHARE',
    'Delen', 'delen', 'DELEN',
    'Print', 'print', 'PRINT',

    # Misc
    'Opent in externe pagina',
    'Opens in new window',
    'Oproep',
    'Moennik',  # Common menu system
}

# Regex fragments that indicate spam/SEO content (matched case-insensitively).
SPAM_PATTERNS = [
    r'k9win',
    r'casino',
    r'เว็บ',  # Thai characters (often spam)
    r'slot',
    r'betting',
    r'poker',
    r'รับ.*โบนัส',  # Thai bonus text
]

# Bare domain names (e.g. "example.com") that should not be org names.
DOMAIN_PATTERN = re.compile(r'^[a-z0-9\-]+\.[a-z]{2,}$', re.IGNORECASE)

# Names shorter than this are too short to be meaningful.
MIN_ORG_NAME_LENGTH = 3

# Names longer than this are likely a sentence/description, not a name.
MAX_ORG_NAME_LENGTH = 150
def is_share_url(url: str) -> bool:
    """Return True if *url* is a social share button link, not a profile.

    A URL counts as a share link when any of the SHARE_URL_PATTERNS regex
    fragments matches it (case-insensitively).
    """
    return any(
        re.search(fragment, url, re.IGNORECASE)
        for fragment in SHARE_URL_PATTERNS
    )
def is_invalid_org_name(name: str) -> Tuple[bool, str]:
    """Check whether an org_name claim value is invalid.

    Checks, in order: empty/whitespace-only, known generic UI text, bare
    domain name, length bounds, spam patterns, garbage prefixes, quoted
    fragments, and date strings. The first failing check wins.

    Returns:
        (is_invalid, reason) — reason is a short machine-readable tag,
        empty string when the name is considered valid.
    """
    if not name or not name.strip():
        return True, "empty"

    name_stripped = name.strip()

    # Exact match against known UI chrome / button labels.
    if name_stripped in INVALID_ORG_NAMES:
        return True, f"generic_ui_text: {name_stripped}"

    # A bare domain name is not an organization name.
    if DOMAIN_PATTERN.match(name_stripped):
        return True, f"domain_name: {name_stripped}"

    # Length bounds: too short to mean anything, or too long to be a name.
    if len(name_stripped) < MIN_ORG_NAME_LENGTH:
        return True, f"too_short: {len(name_stripped)} chars"

    if len(name_stripped) > MAX_ORG_NAME_LENGTH:
        return True, f"too_long: {len(name_stripped)} chars"

    # Spam/SEO content.
    for pattern in SPAM_PATTERNS:
        if re.search(pattern, name_stripped, re.IGNORECASE):
            return True, f"spam_pattern: {pattern}"

    # Common garbage prefixes: URL fragments and code-comment markers.
    # BUG FIX: the original list contained '//' twice (once as a URL prefix,
    # once under "code comments"); the duplicate was dead and is removed.
    garbage_prefixes = (
        '//', 'http', 'www.',  # URL fragments
        '<!--', '/*',          # HTML/JS comment markers
    )
    for prefix in garbage_prefixes:
        if name_stripped.startswith(prefix):
            return True, f"garbage_prefix: {prefix}"

    # A fully quoted string is likely a captured text fragment, not a name.
    if name_stripped.startswith("'") and name_stripped.endswith("'"):
        return True, "quoted_fragment"

    # Date strings like "12 okt ... 2024" (Dutch month abbreviations).
    if re.match(r'^\d{1,2}\s+(jan|feb|mar|apr|mei|jun|jul|aug|sep|okt|nov|dec).*\d{4}$', name_stripped, re.IGNORECASE):
        return True, "date_string"

    return False, ""
def audit_entry(filepath: Path) -> Dict:
    """Audit a single entry YAML file for problematic web claims.

    Loads the file, inspects every claim under web_claims.claims, and
    returns a dict of issue lists keyed by category. Lists are empty when
    no issues of that category were found (also for empty/unparseable files).
    """
    issues = {
        'share_buttons_as_social': [],
        'invalid_org_names': [],
        'empty_claims': [],
        'spam_content': [],
    }

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return issues

    web_claims = data.get('web_claims', {})
    claims = web_claims.get('claims', [])

    for claim in claims:
        claim_type = claim.get('claim_type', '')
        claim_value = claim.get('claim_value', '')

        # Social media claims pointing at share buttons, not profiles.
        if claim_type.startswith('social_'):
            if is_share_url(claim_value):
                issues['share_buttons_as_social'].append({
                    'claim_type': claim_type,
                    'claim_value': claim_value,
                    'xpath': claim.get('xpath', ''),
                })

        # Invalid organization names (UI text, domains, spam, etc.).
        if claim_type == 'org_name':
            is_invalid, reason = is_invalid_org_name(claim_value)
            if is_invalid:
                issues['invalid_org_names'].append({
                    'claim_value': claim_value,
                    'reason': reason,
                    'xpath': claim.get('xpath', ''),
                })

        # BUG FIX: 'spam_content' was declared (and counted by main()) but
        # never populated. Flag spam in non-org_name claims here; org_name
        # spam is already reported via invalid_org_names ("spam_pattern:")
        # above, so it is excluded to avoid double counting.
        if claim_value and claim_type != 'org_name':
            for pattern in SPAM_PATTERNS:
                if re.search(pattern, claim_value, re.IGNORECASE):
                    issues['spam_content'].append({
                        'claim_type': claim_type,
                        'claim_value': claim_value,
                        'spam_pattern': pattern,
                        'xpath': claim.get('xpath', ''),
                    })
                    break

        # Empty or whitespace-only claim values.
        if not claim_value or not claim_value.strip():
            issues['empty_claims'].append({
                'claim_type': claim_type,
                'xpath': claim.get('xpath', ''),
            })

    return issues
def remove_invalid_claims(filepath: Path, issues: Dict, dry_run: bool = True) -> int:
    """Remove invalid claims from an entry file.

    Validity is re-evaluated here with the same predicates audit_entry()
    uses (share URLs, invalid org names, empty values), so removal does not
    depend on the exact contents of *issues*; the *issues* dict is only
    used to record the triggering categories in the audit trail.

    Args:
        filepath: Entry YAML file to rewrite.
        issues: Issue dict from audit_entry(); its keys are logged as reasons.
        dry_run: When True (default), count removals without writing.

    Returns:
        Number of claims removed (or that would be removed under dry_run).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data or 'web_claims' not in data:
        return 0

    claims = data['web_claims'].get('claims', [])

    # NOTE: the original built an `invalid_values` set from *issues* and an
    # `original_count` here; both were dead code and have been removed.

    valid_claims = []
    removed_count = 0

    for claim in claims:
        claim_type = claim.get('claim_type', '')
        claim_value = claim.get('claim_value', '')

        # Remove share button URLs masquerading as social profiles.
        if claim_type.startswith('social_') and is_share_url(claim_value):
            removed_count += 1
            continue

        # Remove invalid org names.
        if claim_type == 'org_name':
            is_invalid, _ = is_invalid_org_name(claim_value)
            if is_invalid:
                removed_count += 1
                continue

        # Remove empty claims.
        if not claim_value or not claim_value.strip():
            removed_count += 1
            continue

        valid_claims.append(claim)

    if removed_count > 0 and not dry_run:
        data['web_claims']['claims'] = valid_claims
        data['web_claims']['claims_count'] = len(valid_claims)

        # Track removed claims for audit.
        if 'removed_invalid_claims' not in data['web_claims']:
            data['web_claims']['removed_invalid_claims'] = []
        data['web_claims']['removed_invalid_claims'].append({
            'removed_count': removed_count,
            # BUG FIX: was the hard-coded string '2025-12-01', which would
            # silently mislabel every later run; use the actual run date.
            'audit_timestamp': date.today().isoformat(),
            'reasons': list(issues.keys()),
        })

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return removed_count
def main():
    """CLI entry point: audit all entry files, optionally fix, print a report.

    Returns 0 always (exit status for sys.exit in the __main__ guard).
    """
    parser = argparse.ArgumentParser(description='Audit web_claims for invalid values')
    parser.add_argument('--fix', action='store_true', help='Remove invalid claims')
    parser.add_argument('--verbose', action='store_true', help='Show all issues')
    parser.add_argument('--limit', type=int, default=None, help='Limit entries to process')
    parser.add_argument('--stats', action='store_true', help='Show summary statistics only')
    args = parser.parse_args()

    # All entry YAML files, skipping hidden files (e.g. .DS_Store artifacts).
    files = sorted([f for f in ENTRIES_DIR.glob('*.yaml') if f.is_file() and not f.name.startswith('.')])

    # NOTE(review): truthiness test means `--limit 0` is treated as "no limit".
    if args.limit:
        files = files[:args.limit]

    # Aggregate counters across all entries.
    total_share_buttons = 0
    total_invalid_org_names = 0
    total_empty_claims = 0
    total_spam = 0
    total_removed = 0
    entries_with_issues = 0

    # Frequency maps for the --stats breakdown.
    invalid_org_name_values = defaultdict(int)
    share_button_patterns = defaultdict(int)

    print(f"Auditing {len(files)} entries...")
    print()

    for filepath in files:
        issues = audit_entry(filepath)

        has_issues = any(len(v) > 0 for v in issues.values())
        if has_issues:
            entries_with_issues += 1

        # Count statistics per issue category.
        total_share_buttons += len(issues['share_buttons_as_social'])
        total_invalid_org_names += len(issues['invalid_org_names'])
        total_empty_claims += len(issues['empty_claims'])
        total_spam += len(issues['spam_content'])

        # Track specific invalid values for the --stats "top offenders" list.
        for issue in issues['invalid_org_names']:
            invalid_org_name_values[issue['claim_value']] += 1

        for issue in issues['share_buttons_as_social']:
            # Attribute the URL to the first matching share pattern only.
            url = issue['claim_value']
            for pattern in SHARE_URL_PATTERNS:
                if re.search(pattern, url, re.IGNORECASE):
                    share_button_patterns[pattern] += 1
                    break

        # Per-entry detail (first 3 issues of each category) when --verbose.
        if args.verbose and has_issues:
            print(f"\n{filepath.name}:")
            for issue_type, issue_list in issues.items():
                if issue_list:
                    print(f"  {issue_type}: {len(issue_list)}")
                    for issue in issue_list[:3]:  # Show first 3
                        print(f"    - {issue}")

        # Rewrite the entry in place when --fix is given.
        if args.fix and has_issues:
            removed = remove_invalid_claims(filepath, issues, dry_run=False)
            total_removed += removed
            if removed > 0:
                print(f"  Fixed {filepath.name}: removed {removed} claims")

    # Print summary.
    print("\n" + "=" * 60)
    print("AUDIT SUMMARY")
    print("=" * 60)
    print(f"Total entries scanned: {len(files)}")
    print(f"Entries with issues: {entries_with_issues}")
    print()
    print("Issues found:")
    print(f"  Share buttons as social profiles: {total_share_buttons}")
    print(f"  Invalid organization names: {total_invalid_org_names}")
    print(f"  Empty claims: {total_empty_claims}")
    print(f"  Spam content: {total_spam}")
    print()

    if args.fix:
        print(f"Total claims removed: {total_removed}")
    else:
        print("Run with --fix to remove invalid claims")

    # NOTE(review): despite the help text ("summary statistics only"),
    # --stats adds breakdowns on top of the normal output; it does not
    # suppress anything.
    if args.stats:
        print("\n" + "-" * 40)
        print("TOP INVALID ORG NAMES:")
        for name, count in sorted(invalid_org_name_values.items(), key=lambda x: -x[1])[:20]:
            print(f"  {count:4d}x {repr(name)}")

        print("\n" + "-" * 40)
        print("SHARE BUTTON PATTERNS:")
        for pattern, count in sorted(share_button_patterns.items(), key=lambda x: -x[1]):
            print(f"  {count:4d}x {pattern}")

    return 0
|
|
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == '__main__':
    sys.exit(main())