glam/scripts/audit_web_claims.py
2025-12-01 23:55:55 +01:00

394 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Audit web_claims for misleading or invalid values.
This script identifies and flags problematic web claims such as:
- Share buttons mistaken for social media profiles
- Generic UI text captured as organization names
- Empty or whitespace-only claims
- Spam/SEO content
Usage:
python scripts/audit_web_claims.py [--fix] [--verbose] [--limit N] [--stats]
"""
import argparse
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Tuple
import yaml
# Directory whose top-level *.yaml files are audited by main().
# NOTE(review): this is a hard-coded absolute path into one user's home
# directory; the script only finds entries on a machine with this layout.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
# === INVALID PATTERNS ===
# Social share URLs (NOT actual profiles).
# Each entry is a regex fragment applied with re.search(..., re.IGNORECASE),
# so it may match anywhere in the URL. Order matters only for the per-pattern
# statistics in main(), which credit the first entry that matches; the more
# specific 'sharer' / 'shareArticle' entries are therefore listed before the
# broader 'share' entries that would also match the same URLs.
SHARE_URL_PATTERNS = [
r'facebook\.com/sharer',
r'facebook\.com/share',
r'twitter\.com/share',
r'twitter\.com/intent',
r'linkedin\.com/shareArticle',
r'linkedin\.com/share',
# NOTE(review): wa.me links can presumably also be genuine contact links
# (wa.me/<phone number>) rather than share buttons - confirm this is intended.
r'wa\.me/', # WhatsApp share
r'pinterest\.com/pin/create',
r'reddit\.com/submit',
r'tumblr\.com/share',
r'getpocket\.com/',
r'buffer\.com/',
r'addthis\.com',
r'sharethis\.com',
]
# Generic UI text that should NOT be org names.
# is_invalid_org_name() tests the stripped value for exact, case-sensitive
# membership in this set, which is why common case variants are spelled out.
INVALID_ORG_NAMES = {
# Navigation/buttons
'Home', 'home', 'HOME',
'Menu', 'menu', 'MENU',
'Contact', 'contact', 'CONTACT',
'Over ons', 'Over Ons', 'OVER ONS',
'Nieuws', 'nieuws', 'NIEUWS',
'Nieuw:', 'NIEUW:',
'Zoeken', 'zoeken', 'ZOEKEN',
'Archief', 'archief', 'ARCHIEF',
'Close', 'close', 'CLOSE',
'Sluiten', 'sluiten',
'Terug', 'terug', 'TERUG',
'Back', 'back', 'BACK',
'Next', 'next', 'NEXT',
'Previous', 'previous', 'PREVIOUS',
'Volgende', 'volgende',
'Vorige', 'vorige',
'Meer lezen', 'Meer Lezen', 'MEER LEZEN',
'Lees meer', 'Lees Meer', 'LEES MEER',
'Read more', 'Read More', 'READ MORE',
'Click here', 'Click Here', 'CLICK HERE',
'Klik hier', 'Klik Hier', 'KLIK HIER',
'Submit', 'submit', 'SUBMIT',
'Verstuur', 'verstuur',
'Verzenden', 'verzenden',
'Loading', 'loading', 'LOADING',
'Laden', 'laden',
# Social media icons/labels
'Facebook', 'facebook', 'FACEBOOK',
'Instagram', 'instagram', 'INSTAGRAM',
'Twitter', 'twitter', 'TWITTER',
'LinkedIn', 'linkedin', 'LINKEDIN',
'YouTube', 'youtube', 'YOUTUBE',
'Pinterest', 'pinterest', 'PINTEREST',
'TikTok', 'tiktok', 'TIKTOK',
# UI elements
'Chevron left', 'Chevron right', 'Chevron down', 'Chevron up',
'Arrow left', 'Arrow right', 'Arrow down', 'Arrow up',
'Eye', 'eye',
'Search', 'search',
'Share', 'share', 'SHARE',
'Delen', 'delen', 'DELEN',
'Print', 'print', 'PRINT',
# Misc
'Opent in externe pagina',
'Opens in new window',
'Oproep',
'Moennik', # Common menu system
}
# Patterns that indicate spam/SEO content.
# Searched case-insensitively anywhere inside an org_name value; there are no
# word boundaries, so each entry also matches as a substring of a longer word.
# NOTE(review): 'slot' is also an ordinary Dutch word (castle / lock), so
# legitimate heritage names containing it would be flagged as spam - confirm
# whether this pattern should be narrowed.
SPAM_PATTERNS = [
r'k9win',
r'casino',
r'เว็บ', # Thai characters (often spam)
r'slot',
r'betting',
r'poker',
r'รับ.*โบนัส', # Thai bonus text
]
# Domain names that should not be org names.
# Matches only a bare "label.tld" value (one dot): the character class before
# the dot excludes '.', so multi-label hosts such as 'www.example.nl' do not
# match this pattern.
DOMAIN_PATTERN = re.compile(r'^[a-z0-9\-]+\.[a-z]{2,}$', re.IGNORECASE)
# Too short to be meaningful (names with fewer characters are rejected)
MIN_ORG_NAME_LENGTH = 3
# Too long (likely a sentence/description; names with more characters are rejected)
MAX_ORG_NAME_LENGTH = 150
def is_share_url(url: str) -> bool:
    """Return True when *url* is a share/intent link rather than a profile URL.

    The URL is searched case-insensitively for every regex fragment in
    SHARE_URL_PATTERNS; a single hit anywhere in the string is enough.

    Args:
        url: Claim value to test. Values come from YAML, so ``None`` or a
            non-string scalar is tolerated and treated as "not a share URL".

    Returns:
        True if any share pattern matches, otherwise False.
    """
    # re.search() raises TypeError on non-strings; an absent or non-text
    # value can never be a share link, so answer False instead of crashing.
    if not isinstance(url, str) or not url:
        return False
    return any(
        re.search(pattern, url, re.IGNORECASE)
        for pattern in SHARE_URL_PATTERNS
    )
# Day, month name/abbreviation, anything, then a 4-digit year at the end,
# e.g. "12 maart 2024" or "3 okt. 2019". Covers Dutch and English months:
# 'maa'/'mrt' catch Dutch "maart"/"mrt" (which 'mar' alone misses), and
# 'may'/'oct' complement the Dutch 'mei'/'okt'.
_DATE_STRING_RE = re.compile(
    r'^\d{1,2}\s+(jan|feb|mar|maa|mrt|apr|mei|may|jun|jul|aug|sep|okt|oct|nov|dec).*\d{4}$',
    re.IGNORECASE,
)

# Leading fragments that mark a value as a URL or a code comment, not a name.
_GARBAGE_PREFIXES = ('//', 'http', 'www.', '<!--', '/*')


def is_invalid_org_name(name: str) -> Tuple[bool, str]:
    """Decide whether an ``org_name`` claim value is unusable.

    Checks run in a fixed order and the first failing one determines the
    reason: empty, generic_ui_text, domain_name, too_short, too_long,
    spam_pattern, garbage_prefix, quoted_fragment, date_string.

    Args:
        name: Candidate organisation name. Falsy values (``None``, ``''``)
            count as empty; any other non-string scalar is validated via its
            ``str()`` form instead of raising.

    Returns:
        ``(True, reason)`` when the value should be rejected, otherwise
        ``(False, "")``.
    """
    if not name:
        return True, "empty"
    if not isinstance(name, str):
        # YAML may deliver ints, dates, ... for unquoted scalars.
        name = str(name)

    name_stripped = name.strip()
    if not name_stripped:
        return True, "empty"

    # Exact (case-sensitive) match against known UI labels.
    if name_stripped in INVALID_ORG_NAMES:
        return True, f"generic_ui_text: {name_stripped}"

    # A bare domain such as "example.nl" is not an organisation name.
    if DOMAIN_PATTERN.match(name_stripped):
        return True, f"domain_name: {name_stripped}"

    length = len(name_stripped)
    if length < MIN_ORG_NAME_LENGTH:
        return True, f"too_short: {length} chars"
    if length > MAX_ORG_NAME_LENGTH:
        return True, f"too_long: {length} chars"

    # Spam/SEO keywords anywhere in the value.
    for pattern in SPAM_PATTERNS:
        if re.search(pattern, name_stripped, re.IGNORECASE):
            return True, f"spam_pattern: {pattern}"

    # URLs and code comments captured as text.
    for prefix in _GARBAGE_PREFIXES:
        if name_stripped.startswith(prefix):
            return True, f"garbage_prefix: {prefix}"

    # A value wrapped in single quotes is treated as a quoted text fragment.
    if name_stripped.startswith("'") and name_stripped.endswith("'"):
        return True, "quoted_fragment"

    if _DATE_STRING_RE.match(name_stripped):
        return True, "date_string"

    return False, ""
def audit_entry(filepath: Path) -> Dict:
    """Audit one entry file and collect its problematic web claims.

    Each claim under ``web_claims.claims`` lands in at most one bucket, using
    the same precedence as remove_invalid_claims(), so the number of reported
    issues equals the number of claims a fix run would remove:

    * ``empty_claims``            - missing, falsy or whitespace-only value
    * ``share_buttons_as_social`` - ``social_*`` claim whose value is a share URL
    * ``spam_content``            - ``org_name`` rejected for a spam pattern
    * ``invalid_org_names``       - ``org_name`` rejected for any other reason

    Args:
        filepath: Path of the YAML entry file to read.

    Returns:
        Dict mapping each bucket name to a list of issue dicts. All four keys
        are always present; lists are empty when nothing was found.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    issues = {
        'share_buttons_as_social': [],
        'invalid_org_names': [],
        'empty_claims': [],
        'spam_content': [],
    }

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty document, or one whose top level is not a mapping, has no claims.
    if not isinstance(data, dict):
        return issues

    # `web_claims:` / `claims:` may be present but null in the YAML.
    web_claims = data.get('web_claims') or {}
    if not isinstance(web_claims, dict):
        return issues
    claims = web_claims.get('claims') or []

    for claim in claims:
        if not isinstance(claim, dict):
            continue

        claim_type = str(claim.get('claim_type') or '')
        claim_value = claim.get('claim_value')
        xpath = claim.get('xpath', '')

        # Text form used for the checks: falsy values count as empty and
        # non-string scalars are stringified so .strip()/regex cannot raise.
        if not claim_value:
            value_text = ''
        elif isinstance(claim_value, str):
            value_text = claim_value
        else:
            value_text = str(claim_value)

        # Empty values first, so an empty org_name is not counted twice.
        if not value_text.strip():
            issues['empty_claims'].append({
                'claim_type': claim_type,
                'xpath': xpath,
            })
            continue

        # Social media claims that are really share buttons.
        if claim_type.startswith('social_'):
            if is_share_url(value_text):
                issues['share_buttons_as_social'].append({
                    'claim_type': claim_type,
                    'claim_value': claim_value,
                    'xpath': xpath,
                })
            continue

        if claim_type == 'org_name':
            is_invalid, reason = is_invalid_org_name(value_text)
            if is_invalid:
                # Spam hits get their own bucket so the summary's
                # "Spam content" line reflects them.
                if reason.startswith('spam_pattern'):
                    bucket = 'spam_content'
                else:
                    bucket = 'invalid_org_names'
                issues[bucket].append({
                    'claim_value': claim_value,
                    'reason': reason,
                    'xpath': xpath,
                })

    return issues
def remove_invalid_claims(filepath: Path, issues: Dict, dry_run: bool = True) -> int:
    """Drop invalid claims from an entry file and record the clean-up.

    A claim is removed when it is a ``social_*`` claim holding a share URL, an
    ``org_name`` claim rejected by is_invalid_org_name(), or any claim with a
    missing/falsy/whitespace-only value. When something was removed and
    ``dry_run`` is False, the file is rewritten with the remaining claims, an
    updated ``claims_count`` and a new record appended to
    ``web_claims.removed_invalid_claims``.

    Note: the file is re-serialised with yaml.dump(), so YAML comments and
    original formatting are not preserved.

    Args:
        filepath: Path of the YAML entry file.
        issues: Result of audit_entry() for this file; only used to list the
            issue categories that actually fired in the audit record.
        dry_run: When True (the default) nothing is written to disk.

    Returns:
        Number of claims that were (or, in a dry run, would be) removed.

    Raises:
        OSError: If the file cannot be read or written.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Local import keeps this fix self-contained within the function.
    from datetime import date

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Nothing to do for empty documents or when web_claims is absent/null.
    if not isinstance(data, dict):
        return 0
    web_claims = data.get('web_claims')
    if not isinstance(web_claims, dict):
        return 0

    claims = web_claims.get('claims') or []

    valid_claims = []
    removed_count = 0
    for claim in claims:
        if not isinstance(claim, dict):
            # Not a claim mapping: leave untouched rather than guess.
            valid_claims.append(claim)
            continue

        claim_type = str(claim.get('claim_type') or '')
        claim_value = claim.get('claim_value')

        # Text form for the checks: falsy -> empty, other scalars stringified.
        if not claim_value:
            value_text = ''
        elif isinstance(claim_value, str):
            value_text = claim_value
        else:
            value_text = str(claim_value)

        if claim_type.startswith('social_') and is_share_url(value_text):
            removed_count += 1      # share button, not a profile
        elif claim_type == 'org_name' and is_invalid_org_name(value_text)[0]:
            removed_count += 1      # unusable organisation name
        elif not value_text.strip():
            removed_count += 1      # empty claim
        else:
            valid_claims.append(claim)

    if removed_count > 0 and not dry_run:
        web_claims['claims'] = valid_claims
        web_claims['claims_count'] = len(valid_claims)

        # Track removed claims for audit; tolerate a missing or null history.
        history = web_claims.get('removed_invalid_claims')
        if not isinstance(history, list):
            history = []
            web_claims['removed_invalid_claims'] = history
        history.append({
            'removed_count': removed_count,
            # Date of this run, not the date the script was written.
            'audit_timestamp': date.today().isoformat(),
            # Only the categories that actually contained issues.
            'reasons': [
                issue_type
                for issue_type, issue_list in issues.items()
                if issue_list
            ],
        })

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return removed_count
def main():
    """Run the audit over every entry file and print a summary.

    Command-line options:
        --fix      rewrite files with invalid claims removed
        --verbose  print up to three example issues per category and file
        --limit N  only process the first N files (in sorted order)
        --stats    additionally print the most frequent invalid org names
                   and the share-button pattern counts

    Files that cannot be read, parsed or rewritten are reported on stderr and
    skipped so one bad entry does not abort the whole batch.

    Returns:
        Process exit status: 0 when every file was processed, 1 when at least
        one file had to be skipped.
    """
    parser = argparse.ArgumentParser(description='Audit web_claims for invalid values')
    parser.add_argument('--fix', action='store_true', help='Remove invalid claims')
    parser.add_argument('--verbose', action='store_true', help='Show all issues')
    parser.add_argument('--limit', type=int, default=None, help='Limit entries to process')
    # NOTE(review): despite the help text, --stats adds output; it does not
    # suppress the regular summary.
    parser.add_argument('--stats', action='store_true', help='Show summary statistics only')
    args = parser.parse_args()

    files = sorted(
        f for f in ENTRIES_DIR.glob('*.yaml')
        if f.is_file() and not f.name.startswith('.')
    )
    # Compare against None so that "--limit 0" really means zero files.
    if args.limit is not None:
        files = files[:max(args.limit, 0)]

    # Errors that affect a single file only and should not stop the run.
    recoverable = (OSError, UnicodeDecodeError, yaml.YAMLError)

    # Collect statistics
    total_share_buttons = 0
    total_invalid_org_names = 0
    total_empty_claims = 0
    total_spam = 0
    total_removed = 0
    entries_with_issues = 0
    skipped_files = 0
    invalid_org_name_values = defaultdict(int)
    share_button_patterns = defaultdict(int)

    print(f"Auditing {len(files)} entries...")
    print()

    for filepath in files:
        try:
            issues = audit_entry(filepath)
        except recoverable as exc:
            skipped_files += 1
            print(f"Skipped {filepath.name}: {exc}", file=sys.stderr)
            continue

        has_issues = any(issues.values())
        if has_issues:
            entries_with_issues += 1

        # Count statistics
        total_share_buttons += len(issues['share_buttons_as_social'])
        total_invalid_org_names += len(issues['invalid_org_names'])
        total_empty_claims += len(issues['empty_claims'])
        total_spam += len(issues['spam_content'])

        # Track specific invalid values
        for issue in issues['invalid_org_names']:
            invalid_org_name_values[issue['claim_value']] += 1

        for issue in issues['share_buttons_as_social']:
            # Credit the first pattern (in list order) that matches the URL.
            url = issue['claim_value']
            first_match = next(
                (p for p in SHARE_URL_PATTERNS if re.search(p, url, re.IGNORECASE)),
                None,
            )
            if first_match is not None:
                share_button_patterns[first_match] += 1

        # Print verbose output
        if args.verbose and has_issues:
            print(f"\n{filepath.name}:")
            for issue_type, issue_list in issues.items():
                if issue_list:
                    print(f" {issue_type}: {len(issue_list)}")
                    for issue in issue_list[:3]:  # Show first 3
                        print(f" - {issue}")

        # Fix if requested
        if args.fix and has_issues:
            try:
                removed = remove_invalid_claims(filepath, issues, dry_run=False)
            except recoverable as exc:
                skipped_files += 1
                print(f"Could not fix {filepath.name}: {exc}", file=sys.stderr)
                continue
            total_removed += removed
            if removed > 0:
                print(f" Fixed {filepath.name}: removed {removed} claims")

    # Print summary
    print("\n" + "=" * 60)
    print("AUDIT SUMMARY")
    print("=" * 60)
    print(f"Total entries scanned: {len(files)}")
    print(f"Entries with issues: {entries_with_issues}")
    if skipped_files:
        print(f"Entries skipped due to errors: {skipped_files}")
    print()
    print("Issues found:")
    print(f" Share buttons as social profiles: {total_share_buttons}")
    print(f" Invalid organization names: {total_invalid_org_names}")
    print(f" Empty claims: {total_empty_claims}")
    print(f" Spam content: {total_spam}")
    print()
    if args.fix:
        print(f"Total claims removed: {total_removed}")
    else:
        print("Run with --fix to remove invalid claims")

    if args.stats:
        print("\n" + "-" * 40)
        print("TOP INVALID ORG NAMES:")
        for name, count in sorted(invalid_org_name_values.items(), key=lambda x: -x[1])[:20]:
            print(f" {count:4d}x {repr(name)}")
        print("\n" + "-" * 40)
        print("SHARE BUTTON PATTERNS:")
        for pattern, count in sorted(share_button_patterns.items(), key=lambda x: -x[1]):
            print(f" {count:4d}x {pattern}")

    return 1 if skipped_files else 0
# Script entry point: run the audit and hand main()'s return value to the
# interpreter as the process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)