# glam/scripts/cleanup_linkedin_mismatches.py
# 2026-01-11 12:15:27 +01:00
# 197 lines, 6.9 KiB, Python

#!/usr/bin/env python3
"""
Remove LinkedIn claims that point to DIFFERENT profiles than the person was extracted from.
This is an entity resolution failure - the enrichment script found LinkedIn profiles
for people with similar names, but they are NOT the same person.
DATA QUALITY IS OF UTMOST IMPORTANCE - Wrong data is worse than no data.
"""
import json
from pathlib import Path
from datetime import datetime, timezone
from urllib.parse import unquote
import argparse
def normalize_slug(slug: str) -> str:
    """Normalize a LinkedIn profile slug for comparison.

    Handles percent-encoding, query parameters (e.g. ``?_l=en``) and
    language path suffixes (e.g. ``john-doe/nl``), returning the
    lowercased bare slug.

    Args:
        slug: Raw slug text, e.g. the part of a URL after ``/in/``.

    Returns:
        The canonical lowercase slug with decorations stripped.
    """
    # Decode percent-encoded characters first so separators hidden in
    # encodings are handled uniformly.
    slug = unquote(slug)
    # Drop query params such as ?_l=en.
    slug = slug.split('?')[0]
    # Keep the FIRST path segment: "john-doe/nl" -> "john-doe".
    # (The previous version took the last segment, which returned the
    # language suffix instead of the slug.)
    slug = slug.strip('/').split('/')[0]
    return slug.lower()
def process_profile(file_path: Path, dry_run: bool = True) -> dict:
    """Scan one person profile and drop LinkedIn-sourced claims whose
    source profile slug does not match the person's own LinkedIn slug.

    Args:
        file_path: Path to the person's JSON profile file.
        dry_run: When True, never write changes back to disk.

    Returns:
        A stats dict: counts before/after, the mismatches removed (or
        that would be removed), plus error/flag keys on early exit.
    """
    stats = {
        'file': file_path.name,
        'claims_before': 0,
        'claims_after': 0,
        'removed_mismatches': [],
        'kept_claims': 0,
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            profile = json.load(fh)
    except (json.JSONDecodeError, FileNotFoundError) as exc:
        stats['error'] = str(exc)
        return stats

    # Without a canonical slug there is nothing to compare against.
    own_slug = profile.get('linkedin_slug', '')
    if not own_slug:
        stats['no_linkedin_slug'] = True
        return stats
    own_norm = normalize_slug(own_slug)

    claims = profile.get('web_claims', [])
    stats['claims_before'] = len(claims)
    if not claims:
        return stats

    survivors = []
    for claim in claims:
        url = claim.get('provenance', {}).get('source_url', '')
        # Non-LinkedIn provenance is out of scope for this cleanup.
        if 'linkedin.com/in/' not in url:
            survivors.append(claim)
            continue

        raw_slug = url.split('/in/')[-1]
        src_norm = normalize_slug(raw_slug)

        exact = (own_norm == src_norm)
        near = (src_norm.startswith(own_norm)
                or own_norm.startswith(src_norm))
        if exact or near:
            # Exact match, or close match (language suffix etc.) - keep.
            survivors.append(claim)
        else:
            # Different person's LinkedIn profile - record for removal.
            stats['removed_mismatches'].append({
                'claim_type': claim.get('claim_type'),
                'claim_value': str(claim.get('claim_value', ''))[:100],
                'profile_slug': own_slug,
                'claim_source_slug': raw_slug,
                'reason': f"LinkedIn profile mismatch: profile is '{own_slug}' but claim from '{raw_slug}'"
            })

    stats['claims_after'] = len(survivors)
    stats['kept_claims'] = len(survivors)

    # Only touch the file when we actually removed something.
    if not dry_run and stats['removed_mismatches']:
        profile['web_claims'] = survivors
        # Record an audit trail entry under enrichment_metadata.
        meta = profile.setdefault('enrichment_metadata', {})
        history = meta.setdefault('cleanup_history', [])
        history.append({
            'cleanup_date': datetime.now(timezone.utc).isoformat(),
            'cleanup_script': 'cleanup_linkedin_mismatches.py',
            'claims_removed': len(stats['removed_mismatches']),
            'removal_reason': 'LinkedIn profile slug mismatch - claims from different person',
        })
        with open(file_path, 'w', encoding='utf-8') as fh:
            json.dump(profile, fh, indent=2, ensure_ascii=False)

    return stats
def main():
    """CLI entry point: scan person profiles, report LinkedIn claims
    attributed to the wrong person, and optionally remove them.

    Defaults to a dry run; pass --execute to modify files. Always
    writes a JSON audit log into the person directory.
    """
    parser = argparse.ArgumentParser(description='Remove LinkedIn claims from wrong profiles')
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Do not modify files, just report (default: True)')
    parser.add_argument('--execute', action='store_true',
                        help='Actually modify files')
    parser.add_argument('--limit', type=int, default=None,
                        help='Process only N files')
    # Generalized: the data directory was previously a hard-coded
    # user-specific absolute path; it is now overridable, with the
    # original location kept as the default for backward compatibility.
    parser.add_argument('--person-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/person'),
                        help='Directory containing ID_*.json person profiles')
    args = parser.parse_args()
    # --execute overrides the default dry-run behaviour.
    dry_run = not args.execute

    person_dir = args.person_dir
    files = sorted(person_dir.glob('ID_*.json'))
    if args.limit:
        files = files[:args.limit]

    print(f"{'DRY RUN - ' if dry_run else ''}Processing {len(files):,} files...")
    print("=" * 70)

    total_stats = {
        'files_processed': 0,
        'files_with_mismatches': 0,
        'claims_removed': 0,
        'claims_kept': 0,
    }
    removal_log = []

    for file_path in files:
        stats = process_profile(file_path, dry_run=dry_run)
        total_stats['files_processed'] += 1
        if stats.get('error'):
            # Unreadable file: the error is in its stats; skip totals.
            continue
        if stats['removed_mismatches']:
            total_stats['files_with_mismatches'] += 1
            total_stats['claims_removed'] += len(stats['removed_mismatches'])
            removal_log.append(stats)
            print(f"\n{file_path.name}:")
            for mismatch in stats['removed_mismatches']:
                print(f"  REMOVED: {mismatch['claim_type']}")
                print(f"    Profile:    linkedin.com/in/{mismatch['profile_slug']}")
                print(f"    Claim from: linkedin.com/in/{mismatch['claim_source_slug']}")
        total_stats['claims_kept'] += stats.get('kept_claims', 0)

    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {total_stats['files_processed']:,}")
    print(f"Files with mismatches: {total_stats['files_with_mismatches']}")
    print(f"Claims REMOVED (wrong person): {total_stats['claims_removed']}")
    print(f"Claims kept: {total_stats['claims_kept']:,}")
    if dry_run:
        print("\n*** DRY RUN - No files were modified ***")
        print("Run with --execute to apply changes")

    # Persist a machine-readable audit log next to the data.
    log_path = person_dir / '_linkedin_mismatch_cleanup_log.json'
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump({
            'cleanup_date': datetime.now(timezone.utc).isoformat(),
            'dry_run': dry_run,
            'total_stats': total_stats,
            'removal_details': removal_log,
        }, f, indent=2, ensure_ascii=False)
    print(f"\nLog saved to: {log_path}")


if __name__ == '__main__':
    main()