glam/scripts/audit_web_claims.py

#!/usr/bin/env python3
"""
Audit web_claims for misleading or invalid values.

This script identifies and flags problematic web claims such as:
- Share buttons mistaken for social media profiles
- Generic UI text captured as organization names
- Empty or whitespace-only claims
- Spam/SEO content

Usage:
    python scripts/audit_web_claims.py [--fix] [--verbose] [--limit N] [--stats]
"""

import argparse
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Tuple

import yaml

# Directory whose top-level ``*.yaml`` files are audited by main().
# NOTE(review): absolute, machine-specific path — the script finds no entries
# on any machine where this directory does not exist; consider making it
# configurable.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')

# === INVALID PATTERNS ===

# Social share URLs (NOT actual profiles).
# Each entry is a regex fragment that is_share_url() searches for anywhere in
# a claim value, ignoring case. main() credits a URL to the FIRST entry that
# matches, so the more specific entries (`sharer`, `shareArticle`) are listed
# before the broader ones (`share`) that would match the same URLs.
SHARE_URL_PATTERNS = [
    r'facebook\.com/sharer',
    r'facebook\.com/share',
    r'twitter\.com/share',
    r'twitter\.com/intent',
    r'linkedin\.com/shareArticle',
    r'linkedin\.com/share',
    r'wa\.me/',  # WhatsApp share. NOTE(review): matches every wa.me/ link — confirm chat links should be flagged too
    r'pinterest\.com/pin/create',
    r'reddit\.com/submit',
    r'tumblr\.com/share',
    r'getpocket\.com/',
    r'buffer\.com/',
    r'addthis\.com',
    r'sharethis\.com',
]

# Generic UI text that should NOT be org names.
# is_invalid_org_name() tests the stripped claim value for exact,
# case-sensitive membership in this set, which is why the case variants are
# spelled out by hand. Some words only have a subset of variants listed
# (e.g. no upper-case 'SLUITEN'), so those spellings are not caught.
INVALID_ORG_NAMES = {
    # Navigation/buttons (Dutch and English)
    'Home', 'home', 'HOME',
    'Menu', 'menu', 'MENU',
    'Contact', 'contact', 'CONTACT',
    'Over ons', 'Over Ons', 'OVER ONS',
    'Nieuws', 'nieuws', 'NIEUWS',
    'Nieuw:', 'NIEUW:',
    'Zoeken', 'zoeken', 'ZOEKEN',
    'Archief', 'archief', 'ARCHIEF',
    'Close', 'close', 'CLOSE',
    'Sluiten', 'sluiten',
    'Terug', 'terug', 'TERUG',
    'Back', 'back', 'BACK',
    'Next', 'next', 'NEXT',
    'Previous', 'previous', 'PREVIOUS',
    'Volgende', 'volgende',
    'Vorige', 'vorige',
    'Meer lezen', 'Meer Lezen', 'MEER LEZEN',
    'Lees meer', 'Lees Meer', 'LEES MEER',
    'Read more', 'Read More', 'READ MORE',
    'Click here', 'Click Here', 'CLICK HERE',
    'Klik hier', 'Klik Hier', 'KLIK HIER',
    'Submit', 'submit', 'SUBMIT',
    'Verstuur', 'verstuur',
    'Verzenden', 'verzenden',
    'Loading', 'loading', 'LOADING',
    'Laden', 'laden',

    # Social media icons/labels
    'Facebook', 'facebook', 'FACEBOOK',
    'Instagram', 'instagram', 'INSTAGRAM',
    'Twitter', 'twitter', 'TWITTER',
    'LinkedIn', 'linkedin', 'LINKEDIN',
    'YouTube', 'youtube', 'YOUTUBE',
    'Pinterest', 'pinterest', 'PINTEREST',
    'TikTok', 'tiktok', 'TIKTOK',

    # UI elements (icon names and action labels)
    'Chevron left', 'Chevron right', 'Chevron down', 'Chevron up',
    'Arrow left', 'Arrow right', 'Arrow down', 'Arrow up',
    'Eye', 'eye',
    'Search', 'search',
    'Share', 'share', 'SHARE',
    'Delen', 'delen', 'DELEN',
    'Print', 'print', 'PRINT',

    # Misc
    'Opent in externe pagina',
    'Opens in new window',
    'Oproep',
    'Moennik',  # Common menu system
}

# Patterns that indicate spam/SEO content.
# is_invalid_org_name() searches for each regex anywhere in the stripped org
# name, ignoring case, so every entry behaves as a substring match.
# NOTE(review): because of that, 'slot' also matches names containing the
# Dutch word "Slot" (castle) and any word that merely contains "slot" —
# confirm this does not flag legitimate organisations.
SPAM_PATTERNS = [
    r'k9win',
    r'casino',
    r'เว็บ',  # Thai text fragment (often spam)
    r'slot',
    r'betting',
    r'poker',
    r'รับ.*โบนัส',  # Thai bonus text
]

# Domain names that should not be org names.
# Matches the whole string only when it is a single label followed by a dot
# and an alphabetic suffix of two or more letters (e.g. "example.nl");
# values with sub-domains or paths do not match this pattern.
DOMAIN_PATTERN = re.compile(r'^[a-z0-9\-]+\.[a-z]{2,}$', re.IGNORECASE)

# Too short to be meaningful: stripped names with fewer characters are flagged.
MIN_ORG_NAME_LENGTH = 3

# Too long (likely a sentence/description): stripped names with more
# characters are flagged.
MAX_ORG_NAME_LENGTH = 150


def is_share_url(url: str) -> bool:
    """Return True when *url* is a share/intent link rather than a profile.

    Every entry of ``SHARE_URL_PATTERNS`` is searched for anywhere in *url*,
    ignoring case.

    Args:
        url: Claim value to inspect. Values loaded from YAML can be ``None``
            or a non-string scalar; such values are never share URLs.

    Returns:
        True if at least one share pattern occurs in *url*, otherwise False.
    """
    # A YAML ``claim_value:`` without a value loads as None, and re.search()
    # raises TypeError on anything that is not a string, so rule those out
    # before touching the patterns.
    if not isinstance(url, str):
        return False
    return any(
        re.search(pattern, url, re.IGNORECASE)
        for pattern in SHARE_URL_PATTERNS
    )


def is_invalid_org_name(name: str) -> Tuple[bool, str]:
    """Decide whether an ``org_name`` claim value is unusable.

    The checks run in a fixed order and the first one that fires determines
    the reason: empty, generic UI text, bare domain name, too short, too
    long, spam pattern, garbage prefix, quoted fragment, date string.

    Args:
        name: The claim value. Falsy values (``None``, ``''``, ``0``) count
            as empty; any other non-string is validated via its ``str()``
            form instead of raising.

    Returns:
        ``(True, reason)`` when the value should not be treated as an
        organisation name, ``(False, "")`` otherwise.
    """
    if not name:
        return True, "empty"

    # YAML can hand back ints, floats, lists, ... for unquoted values; they
    # have no .strip(), so validate their textual form.
    if not isinstance(name, str):
        name = str(name)

    name_stripped = name.strip()
    if not name_stripped:
        return True, "empty"

    # Exact (case-sensitive) match against known UI labels
    if name_stripped in INVALID_ORG_NAMES:
        return True, f"generic_ui_text: {name_stripped}"

    # The whole value is just a domain name
    if DOMAIN_PATTERN.match(name_stripped):
        return True, f"domain_name: {name_stripped}"

    # Length bounds
    if len(name_stripped) < MIN_ORG_NAME_LENGTH:
        return True, f"too_short: {len(name_stripped)} chars"

    if len(name_stripped) > MAX_ORG_NAME_LENGTH:
        return True, f"too_long: {len(name_stripped)} chars"

    # Spam patterns are searched anywhere in the name, ignoring case
    for pattern in SPAM_PATTERNS:
        if re.search(pattern, name_stripped, re.IGNORECASE):
            return True, f"spam_pattern: {pattern}"

    # URLs, protocol-relative links and code-comment openers are scraping
    # residue, not names. Order matters only for the reported reason.
    garbage_prefixes = ('//', 'http', 'www.', '<!--', '/*')
    for prefix in garbage_prefixes:
        if name_stripped.startswith(prefix):
            return True, f"garbage_prefix: {prefix}"

    # A value wrapped in single quotes is a quoted text fragment
    if name_stripped.startswith("'") and name_stripped.endswith("'"):
        return True, "quoted_fragment"

    # "<day> <month...> <year>" with Dutch or English month names. Dutch
    # "maart"/"mrt" and English "may"/"oct" do not start with any of the
    # other abbreviations, so they are listed explicitly.
    if re.match(
        r'^\d{1,2}\s+(jan|feb|mar|maa|mrt|apr|may|mei|jun|jul|aug|sep|oct|okt|nov|dec).*\d{4}$',
        name_stripped,
        re.IGNORECASE,
    ):
        return True, "date_string"

    return False, ""


def audit_entry(filepath: Path) -> Dict:
    """Audit a single entry file for problematic web claims.

    The file is parsed with ``yaml.safe_load`` and every item under
    ``web_claims.claims`` is checked. Nothing is written.

    Args:
        filepath: Path of the entry YAML file.

    Returns:
        A dict with four lists of issue records:

        - ``share_buttons_as_social``: ``social_*`` claims whose value is a
          share URL.
        - ``invalid_org_names``: ``org_name`` claims rejected by
          ``is_invalid_org_name`` (each record carries the ``reason``).
        - ``empty_claims``: claims of any type with an empty or
          whitespace-only value.
        - ``spam_content``: the subset of rejected ``org_name`` claims whose
          reason is a spam pattern. These also stay in
          ``invalid_org_names``, so the two lists overlap.

        Recorded ``claim_type`` / ``claim_value`` are the normalised text
        forms used for the checks (``''`` for missing or null values).

    Raises:
        OSError: if the file cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    issues = {
        'share_buttons_as_social': [],
        'invalid_org_names': [],
        'empty_claims': [],
        'spam_content': [],
    }

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty file loads as None; a document that is not a mapping, or whose
    # web_claims / claims keys are null, has nothing to audit.
    if not isinstance(data, dict):
        return issues
    web_claims = data.get('web_claims')
    if not isinstance(web_claims, dict):
        return issues
    claims = web_claims.get('claims')
    if not isinstance(claims, list):
        return issues

    for claim in claims:
        if not isinstance(claim, dict):
            continue

        # Keys that are present but null come back as None, and unquoted
        # scalars can be ints/bools. Normalise both fields to text so the
        # string checks below cannot raise; falsy values count as empty.
        raw_type = claim.get('claim_type')
        claim_type = raw_type if isinstance(raw_type, str) else ''

        raw_value = claim.get('claim_value')
        if isinstance(raw_value, str):
            claim_value = raw_value
        elif raw_value:
            claim_value = str(raw_value)
        else:
            claim_value = ''

        xpath = claim.get('xpath', '')

        # Social media claims that are really share buttons
        if claim_type.startswith('social_') and is_share_url(claim_value):
            issues['share_buttons_as_social'].append({
                'claim_type': claim_type,
                'claim_value': claim_value,
                'xpath': xpath,
            })

        # Organisation names that are UI text, domains, spam, ...
        if claim_type == 'org_name':
            is_invalid, reason = is_invalid_org_name(claim_value)
            if is_invalid:
                issues['invalid_org_names'].append({
                    'claim_value': claim_value,
                    'reason': reason,
                    'xpath': xpath,
                })
                # Spam is only detected through the org-name check; without
                # this the spam_content category would always stay empty.
                if reason.startswith('spam_pattern'):
                    issues['spam_content'].append({
                        'claim_type': claim_type,
                        'claim_value': claim_value,
                        'reason': reason,
                        'xpath': xpath,
                    })

        # Empty or whitespace-only values, whatever the claim type
        if not claim_value.strip():
            issues['empty_claims'].append({
                'claim_type': claim_type,
                'xpath': xpath,
            })

    return issues


def remove_invalid_claims(filepath: Path, issues: Dict, dry_run: bool = True) -> int:
    """Remove invalid claims from an entry file.

    The file is re-read and every claim is re-checked here: share URLs on
    ``social_*`` claims, invalid ``org_name`` values and empty values are
    dropped. *issues* is not used to decide what to drop; it only supplies
    the category names recorded in the audit trail.

    Args:
        filepath: Path of the entry YAML file.
        issues: Result of ``audit_entry`` for the same file.
        dry_run: When True (the default) only count; do not modify the file.

    Returns:
        The number of claims that were (or, in a dry run, would be) removed.

    Raises:
        OSError: if the file cannot be read or written.
        yaml.YAMLError: if the file is not valid YAML.
    """
    # Function-scope import: the module header does not import datetime.
    from datetime import datetime, timezone

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Nothing to do for empty files, non-mapping documents, or entries whose
    # web_claims / claims keys are missing or null.
    if not isinstance(data, dict):
        return 0
    web_claims = data.get('web_claims')
    if not isinstance(web_claims, dict):
        return 0
    claims = web_claims.get('claims')
    if not isinstance(claims, list):
        return 0

    valid_claims = []
    removed_count = 0

    for claim in claims:
        # Items that are not mappings cannot be judged; keep them untouched.
        if not isinstance(claim, dict):
            valid_claims.append(claim)
            continue

        # Null / non-string fields are normalised to text so the checks
        # cannot raise; falsy values count as empty.
        raw_type = claim.get('claim_type')
        claim_type = raw_type if isinstance(raw_type, str) else ''

        raw_value = claim.get('claim_value')
        if isinstance(raw_value, str):
            claim_value = raw_value
        elif raw_value:
            claim_value = str(raw_value)
        else:
            claim_value = ''

        is_share_button = claim_type.startswith('social_') and is_share_url(claim_value)
        is_bad_org_name = claim_type == 'org_name' and is_invalid_org_name(claim_value)[0]
        is_empty = not claim_value.strip()

        if is_share_button or is_bad_org_name or is_empty:
            removed_count += 1
        else:
            valid_claims.append(claim)

    if removed_count > 0 and not dry_run:
        web_claims['claims'] = valid_claims
        web_claims['claims_count'] = len(valid_claims)

        # Track removed claims for audit: the actual run date (UTC) and only
        # the categories in which the audit really found something.
        audit_log = web_claims.setdefault('removed_invalid_claims', [])
        audit_log.append({
            'removed_count': removed_count,
            'audit_timestamp': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
            'reasons': [category for category, found in issues.items() if found],
        })

        # Serialise first: if dumping fails, the file on disk has not been
        # truncated yet.
        serialized = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)

    return removed_count


def main():
    """Audit every entry file under ``ENTRIES_DIR`` and print a summary.

    Command-line flags:
        --fix      rewrite entry files with the invalid claims removed
        --verbose  print, per entry, up to three issues of each category
        --limit N  only process the first N entry files (sorted by path)
        --stats    additionally print the most frequent invalid org names
                   and a match count per share-URL pattern

    Entry files that cannot be read or parsed are reported on stderr and
    skipped, so one bad file does not abort the whole batch.

    Returns:
        0, which the caller uses as the process exit status.
    """
    parser = argparse.ArgumentParser(description='Audit web_claims for invalid values')
    parser.add_argument('--fix', action='store_true', help='Remove invalid claims')
    parser.add_argument('--verbose', action='store_true',
                        help='Show issues per entry (first 3 of each category)')
    parser.add_argument('--limit', type=int, default=None, help='Limit entries to process')
    parser.add_argument('--stats', action='store_true',
                        help='Also show top invalid org names and share button pattern counts')
    args = parser.parse_args()

    # A negative slice bound would silently drop files from the END.
    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be zero or a positive integer')

    files = sorted(
        f for f in ENTRIES_DIR.glob('*.yaml')
        if f.is_file() and not f.name.startswith('.')
    )

    # Compare against None so that an explicit `--limit 0` is honoured.
    if args.limit is not None:
        files = files[:args.limit]

    # Collect statistics
    total_share_buttons = 0
    total_invalid_org_names = 0
    total_empty_claims = 0
    total_spam = 0
    total_removed = 0
    entries_with_issues = 0
    entries_skipped = 0

    invalid_org_name_values = defaultdict(int)
    share_button_patterns = defaultdict(int)

    print(f"Auditing {len(files)} entries...")
    print()

    for filepath in files:
        try:
            issues = audit_entry(filepath)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            # Unreadable or malformed entry: report it and keep going.
            print(f"  Skipped {filepath.name}: {exc}", file=sys.stderr)
            entries_skipped += 1
            continue

        has_issues = any(issues.values())
        if has_issues:
            entries_with_issues += 1

        # Count statistics
        total_share_buttons += len(issues['share_buttons_as_social'])
        total_invalid_org_names += len(issues['invalid_org_names'])
        total_empty_claims += len(issues['empty_claims'])
        total_spam += len(issues['spam_content'])

        # Track specific invalid values
        for issue in issues['invalid_org_names']:
            invalid_org_name_values[issue['claim_value']] += 1

        for issue in issues['share_buttons_as_social']:
            # Credit the URL to the first pattern that matches it
            url = issue['claim_value']
            for pattern in SHARE_URL_PATTERNS:
                if re.search(pattern, url, re.IGNORECASE):
                    share_button_patterns[pattern] += 1
                    break

        # Print verbose output
        if args.verbose and has_issues:
            print(f"\n{filepath.name}:")
            for issue_type, issue_list in issues.items():
                if issue_list:
                    print(f"  {issue_type}: {len(issue_list)}")
                    for issue in issue_list[:3]:  # Show first 3
                        print(f"    - {issue}")

        # Fix if requested
        if args.fix and has_issues:
            removed = remove_invalid_claims(filepath, issues, dry_run=False)
            total_removed += removed
            if removed > 0:
                print(f"  Fixed {filepath.name}: removed {removed} claims")

    # Print summary
    print("\n" + "=" * 60)
    print("AUDIT SUMMARY")
    print("=" * 60)
    print(f"Total entries scanned: {len(files)}")
    print(f"Entries with issues: {entries_with_issues}")
    if entries_skipped:
        print(f"Entries skipped (unreadable): {entries_skipped}")
    print()
    print("Issues found:")
    print(f"  Share buttons as social profiles: {total_share_buttons}")
    print(f"  Invalid organization names: {total_invalid_org_names}")
    print(f"  Empty claims: {total_empty_claims}")
    print(f"  Spam content: {total_spam}")
    print()

    if args.fix:
        print(f"Total claims removed: {total_removed}")
    else:
        print("Run with --fix to remove invalid claims")

    if args.stats:
        print("\n" + "-" * 40)
        print("TOP INVALID ORG NAMES:")
        for name, count in sorted(invalid_org_name_values.items(), key=lambda x: -x[1])[:20]:
            print(f"  {count:4d}x  {repr(name)}")

        print("\n" + "-" * 40)
        print("SHARE BUTTON PATTERNS:")
        for pattern, count in sorted(share_button_patterns.items(), key=lambda x: -x[1]):
            print(f"  {count:4d}x  {pattern}")

    return 0


# Script entry point: run the audit and hand main()'s return value to the
# interpreter as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())