# glam/scripts/cleanup_linkedin_mismatches.py
# 2026-01-11 12:15:27 +01:00
# 197 lines, 6.9 KiB, Python

#!/usr/bin/env python3
"""
Remove LinkedIn claims that point to DIFFERENT profiles than the person was extracted from.
This is an entity resolution failure - the enrichment script found LinkedIn profiles
for people with similar names, but they are NOT the same person.
DATA QUALITY IS OF UTMOST IMPORTANCE - Wrong data is worse than no data.
"""
import json
from pathlib import Path
from datetime import datetime, timezone
from urllib.parse import unquote
import argparse
def normalize_slug(slug: str) -> str:
    """Normalize a LinkedIn profile slug for comparison.

    Handles percent-encoding, query parameters (e.g. ``?_l=en``) and
    language path suffixes (e.g. ``john-doe/nl``), returning the
    lowercased bare slug.

    Args:
        slug: Raw slug text, e.g. the part of a URL after ``/in/``.

    Returns:
        The canonical lowercase slug with decorations stripped.
    """
    # Decode percent-encoded characters first so separators hidden in
    # encodings are handled uniformly.
    slug = unquote(slug)
    # Drop query params such as ?_l=en.
    slug = slug.split('?')[0]
    # Keep the FIRST path segment: "john-doe/nl" -> "john-doe".
    # (The previous version took the last segment, which returned the
    # language suffix instead of the slug.)
    slug = slug.strip('/').split('/')[0]
    return slug.lower()
def process_profile(file_path: Path, dry_run: bool = True) -> dict:
    """Scan one person profile and drop LinkedIn-sourced claims whose
    source profile slug does not match the person's own LinkedIn slug.

    Args:
        file_path: Path to the person's JSON profile file.
        dry_run: When True, never write changes back to disk.

    Returns:
        A stats dict: counts before/after, the mismatches removed (or
        that would be removed), plus error/flag keys on early exit.
    """
    stats = {
        'file': file_path.name,
        'claims_before': 0,
        'claims_after': 0,
        'removed_mismatches': [],
        'kept_claims': 0,
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            profile = json.load(fh)
    except (json.JSONDecodeError, FileNotFoundError) as exc:
        stats['error'] = str(exc)
        return stats

    # Without a canonical slug there is nothing to compare against.
    own_slug = profile.get('linkedin_slug', '')
    if not own_slug:
        stats['no_linkedin_slug'] = True
        return stats
    own_norm = normalize_slug(own_slug)

    claims = profile.get('web_claims', [])
    stats['claims_before'] = len(claims)
    if not claims:
        return stats

    survivors = []
    for claim in claims:
        url = claim.get('provenance', {}).get('source_url', '')
        # Non-LinkedIn provenance is out of scope for this cleanup.
        if 'linkedin.com/in/' not in url:
            survivors.append(claim)
            continue

        raw_slug = url.split('/in/')[-1]
        src_norm = normalize_slug(raw_slug)

        exact = (own_norm == src_norm)
        near = (src_norm.startswith(own_norm)
                or own_norm.startswith(src_norm))
        if exact or near:
            # Exact match, or close match (language suffix etc.) - keep.
            survivors.append(claim)
        else:
            # Different person's LinkedIn profile - record for removal.
            stats['removed_mismatches'].append({
                'claim_type': claim.get('claim_type'),
                'claim_value': str(claim.get('claim_value', ''))[:100],
                'profile_slug': own_slug,
                'claim_source_slug': raw_slug,
                'reason': f"LinkedIn profile mismatch: profile is '{own_slug}' but claim from '{raw_slug}'"
            })

    stats['claims_after'] = len(survivors)
    stats['kept_claims'] = len(survivors)

    # Only touch the file when we actually removed something.
    if not dry_run and stats['removed_mismatches']:
        profile['web_claims'] = survivors
        # Record an audit trail entry under enrichment_metadata.
        meta = profile.setdefault('enrichment_metadata', {})
        history = meta.setdefault('cleanup_history', [])
        history.append({
            'cleanup_date': datetime.now(timezone.utc).isoformat(),
            'cleanup_script': 'cleanup_linkedin_mismatches.py',
            'claims_removed': len(stats['removed_mismatches']),
            'removal_reason': 'LinkedIn profile slug mismatch - claims from different person',
        })
        with open(file_path, 'w', encoding='utf-8') as fh:
            json.dump(profile, fh, indent=2, ensure_ascii=False)

    return stats
def main():
    """CLI entry point: scan person profiles, report LinkedIn claims
    attributed to the wrong person, and optionally remove them.

    Defaults to a dry run; pass --execute to modify files. Always
    writes a JSON audit log into the person directory.
    """
    parser = argparse.ArgumentParser(description='Remove LinkedIn claims from wrong profiles')
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Do not modify files, just report (default: True)')
    parser.add_argument('--execute', action='store_true',
                        help='Actually modify files')
    parser.add_argument('--limit', type=int, default=None,
                        help='Process only N files')
    # Generalized: the data directory was previously a hard-coded
    # user-specific absolute path; it is now overridable, with the
    # original location kept as the default for backward compatibility.
    parser.add_argument('--person-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/person'),
                        help='Directory containing ID_*.json person profiles')
    args = parser.parse_args()
    # --execute overrides the default dry-run behaviour.
    dry_run = not args.execute

    person_dir = args.person_dir
    files = sorted(person_dir.glob('ID_*.json'))
    if args.limit:
        files = files[:args.limit]

    print(f"{'DRY RUN - ' if dry_run else ''}Processing {len(files):,} files...")
    print("=" * 70)

    total_stats = {
        'files_processed': 0,
        'files_with_mismatches': 0,
        'claims_removed': 0,
        'claims_kept': 0,
    }
    removal_log = []

    for file_path in files:
        stats = process_profile(file_path, dry_run=dry_run)
        total_stats['files_processed'] += 1
        if stats.get('error'):
            # Unreadable file: the error is in its stats; skip totals.
            continue
        if stats['removed_mismatches']:
            total_stats['files_with_mismatches'] += 1
            total_stats['claims_removed'] += len(stats['removed_mismatches'])
            removal_log.append(stats)
            print(f"\n{file_path.name}:")
            for mismatch in stats['removed_mismatches']:
                print(f"  REMOVED: {mismatch['claim_type']}")
                print(f"    Profile:    linkedin.com/in/{mismatch['profile_slug']}")
                print(f"    Claim from: linkedin.com/in/{mismatch['claim_source_slug']}")
        total_stats['claims_kept'] += stats.get('kept_claims', 0)

    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {total_stats['files_processed']:,}")
    print(f"Files with mismatches: {total_stats['files_with_mismatches']}")
    print(f"Claims REMOVED (wrong person): {total_stats['claims_removed']}")
    print(f"Claims kept: {total_stats['claims_kept']:,}")
    if dry_run:
        print("\n*** DRY RUN - No files were modified ***")
        print("Run with --execute to apply changes")

    # Persist a machine-readable audit log next to the data.
    log_path = person_dir / '_linkedin_mismatch_cleanup_log.json'
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump({
            'cleanup_date': datetime.now(timezone.utc).isoformat(),
            'dry_run': dry_run,
            'total_stats': total_stats,
            'removal_details': removal_log,
        }, f, indent=2, ensure_ascii=False)
    print(f"\nLog saved to: {log_path}")


if __name__ == '__main__':
    main()