#!/usr/bin/env python3
"""
Remove LinkedIn claims that point to DIFFERENT profiles than the person was
extracted from.

This is an entity resolution failure - the enrichment script found LinkedIn
profiles for people with similar names, but they are NOT the same person.
DATA QUALITY IS OF UTMOST IMPORTANCE - Wrong data is worse than no data.
"""

import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import unquote

# Default location of the per-person profile JSON files; overridable via
# --person-dir so the script is not tied to one machine's layout.
DEFAULT_PERSON_DIR = '/Users/kempersc/apps/glam/data/person'


def normalize_slug(slug: str) -> str:
    """Normalize a LinkedIn profile slug for comparison.

    Handles percent-encoding (accented names), query strings such as
    ``?_l=en``, surrounding slashes, and language suffixes, e.g.
    ``john-doe/nl`` -> ``john-doe``.
    """
    # URL-decode percent escapes first (e.g. 'j%C3%B6rg' -> 'jörg').
    slug = unquote(slug)
    # Drop query params like ?_l=en BEFORE splitting on '/', so a query
    # string containing a slash cannot corrupt the result.
    slug = slug.split('?')[0]
    # Remove language suffixes like /nl, /en by keeping the FIRST path
    # segment — that segment is the actual profile slug.
    slug = slug.strip('/').split('/')[0]
    return slug.lower()


def process_profile(file_path: Path, dry_run: bool = True) -> dict:
    """Process a single profile JSON file and remove mismatched LinkedIn claims.

    A claim is a mismatch when its provenance source_url points at a
    linkedin.com/in/ slug that neither equals nor prefix-matches the
    profile's own ``linkedin_slug``.

    Returns a stats dict with before/after claim counts and details of
    every removed claim. Files are only rewritten when ``dry_run`` is
    False AND at least one mismatch was found.
    """
    stats = {
        'file': file_path.name,
        'claims_before': 0,
        'claims_after': 0,
        'removed_mismatches': [],
        'kept_claims': 0,
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            profile = json.load(f)
    except (json.JSONDecodeError, FileNotFoundError) as e:
        # Record the failure but keep the batch run going.
        stats['error'] = str(e)
        return stats

    # Without the profile's own slug there is nothing to compare against.
    linkedin_slug = profile.get('linkedin_slug', '')
    if not linkedin_slug:
        stats['no_linkedin_slug'] = True
        return stats

    profile_slug_normalized = normalize_slug(linkedin_slug)

    web_claims = profile.get('web_claims', [])
    stats['claims_before'] = len(web_claims)

    if not web_claims:
        return stats

    # Partition claims: anything not sourced from a LinkedIn profile URL is
    # kept untouched; LinkedIn-sourced claims must match the profile slug.
    kept_claims = []
    for claim in web_claims:
        source_url = claim.get('provenance', {}).get('source_url', '')

        # Only check LinkedIn sources.
        if 'linkedin.com/in/' not in source_url:
            kept_claims.append(claim)
            continue

        # Extract and normalize the source slug from the URL.
        source_slug = source_url.split('/in/')[-1]
        source_slug_normalized = normalize_slug(source_slug)

        if profile_slug_normalized == source_slug_normalized:
            # Exact match - keep.
            kept_claims.append(claim)
        elif (source_slug_normalized.startswith(profile_slug_normalized)
              or profile_slug_normalized.startswith(source_slug_normalized)):
            # Close match (language suffix, etc) - keep but flag.
            kept_claims.append(claim)
        else:
            # TRUE MISMATCH - this is from a different person!
            stats['removed_mismatches'].append({
                'claim_type': claim.get('claim_type'),
                'claim_value': str(claim.get('claim_value', ''))[:100],
                'profile_slug': linkedin_slug,
                'claim_source_slug': source_slug,
                'reason': f"LinkedIn profile mismatch: profile is '{linkedin_slug}' but claim from '{source_slug}'"
            })

    stats['claims_after'] = len(kept_claims)
    stats['kept_claims'] = len(kept_claims)

    # Rewrite the file only when not a dry run and something was removed.
    if not dry_run and stats['removed_mismatches']:
        profile['web_claims'] = kept_claims

        # Append an audit trail entry so the removal is traceable later.
        if 'enrichment_metadata' not in profile:
            profile['enrichment_metadata'] = {}
        cleanup_entry = {
            'cleanup_date': datetime.now(timezone.utc).isoformat(),
            'cleanup_script': 'cleanup_linkedin_mismatches.py',
            'claims_removed': len(stats['removed_mismatches']),
            'removal_reason': 'LinkedIn profile slug mismatch - claims from different person',
        }
        if 'cleanup_history' not in profile['enrichment_metadata']:
            profile['enrichment_metadata']['cleanup_history'] = []
        profile['enrichment_metadata']['cleanup_history'].append(cleanup_entry)

        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(profile, f, indent=2, ensure_ascii=False)

    return stats


def main():
    """CLI entry point: scan person files, report/remove mismatched claims."""
    parser = argparse.ArgumentParser(description='Remove LinkedIn claims from wrong profiles')
    # NOTE: --dry-run is accepted for symmetry, but dry-run is already the
    # default; only --execute switches the script into write mode.
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Do not modify files, just report (default: True)')
    parser.add_argument('--execute', action='store_true',
                        help='Actually modify files')
    parser.add_argument('--limit', type=int, default=None,
                        help='Process only N files')
    parser.add_argument('--person-dir', default=DEFAULT_PERSON_DIR,
                        help='Directory containing ID_*.json person profiles')
    args = parser.parse_args()

    dry_run = not args.execute

    person_dir = Path(args.person_dir)
    files = sorted(person_dir.glob('ID_*.json'))
    if args.limit:
        files = files[:args.limit]

    print(f"{'DRY RUN - ' if dry_run else ''}Processing {len(files):,} files...")
    print("=" * 70)

    total_stats = {
        'files_processed': 0,
        'files_with_mismatches': 0,
        'claims_removed': 0,
        'claims_kept': 0,
    }
    removal_log = []

    for file_path in files:
        stats = process_profile(file_path, dry_run=dry_run)
        total_stats['files_processed'] += 1

        if stats.get('error'):
            continue

        if stats['removed_mismatches']:
            total_stats['files_with_mismatches'] += 1
            total_stats['claims_removed'] += len(stats['removed_mismatches'])
            removal_log.append(stats)

            print(f"\n{file_path.name}:")
            for mismatch in stats['removed_mismatches']:
                print(f"  REMOVED: {mismatch['claim_type']}")
                print(f"    Profile:    linkedin.com/in/{mismatch['profile_slug']}")
                print(f"    Claim from: linkedin.com/in/{mismatch['claim_source_slug']}")

        total_stats['claims_kept'] += stats.get('kept_claims', 0)

    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {total_stats['files_processed']:,}")
    print(f"Files with mismatches: {total_stats['files_with_mismatches']}")
    print(f"Claims REMOVED (wrong person): {total_stats['claims_removed']}")
    print(f"Claims kept: {total_stats['claims_kept']:,}")

    if dry_run:
        print("\n*** DRY RUN - No files were modified ***")
        print("Run with --execute to apply changes")

    # Persist a machine-readable log alongside the data for later audit.
    log_path = person_dir / '_linkedin_mismatch_cleanup_log.json'
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump({
            'cleanup_date': datetime.now(timezone.utc).isoformat(),
            'dry_run': dry_run,
            'total_stats': total_stats,
            'removal_details': removal_log,
        }, f, indent=2, ensure_ascii=False)
    print(f"\nLog saved to: {log_path}")


if __name__ == '__main__':
    main()