#!/usr/bin/env python3
"""
Remove LinkedIn claims that point to a DIFFERENT profile than the one the
person was extracted from.

This is an entity resolution failure - the enrichment script found LinkedIn profiles
for people with similar names, but they are NOT the same person.

DATA QUALITY IS OF UTMOST IMPORTANCE - Wrong data is worse than no data.
"""

import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import unquote
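
# How this script is typically invoked (dry run is the default):
#
#   python cleanup_linkedin_mismatches.py                # report only
#   python cleanup_linkedin_mismatches.py --limit 100    # report on first 100 files
#   python cleanup_linkedin_mismatches.py --execute      # actually rewrite files
#
# The profile files are assumed to look roughly like this (a sketch inferred
# from the fields read below, not an authoritative schema):
#
#   {
#     "linkedin_slug": "jane-doe",
#     "web_claims": [
#       {"claim_type": "...",
#        "claim_value": "...",
#        "provenance": {"source_url": "https://www.linkedin.com/in/jane-doe"}}
#     ],
#     "enrichment_metadata": {...}
#   }
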
def normalize_slug(slug: str) -> str:
    """Normalize a LinkedIn slug (or full profile URL) for comparison."""
    # URL decode
    slug = unquote(slug)
    # Remove query params like ?_l=en
    slug = slug.split('?')[0]
    # Accept full URLs: keep only the part after /in/
    if '/in/' in slug:
        slug = slug.split('/in/')[-1]
    # Remove language suffix like /nl, /en by keeping only the first path segment
    slug = slug.strip('/').split('/')[0]
    return slug.lower()
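

# Illustrative behavior of normalize_slug (hypothetical inputs):
#
#   normalize_slug('Jane-Doe?_l=en')                        -> 'jane-doe'
#   normalize_slug('jane-doe/nl')                           -> 'jane-doe'
#   normalize_slug('https://www.linkedin.com/in/jane-doe/') -> 'jane-doe'

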
def process_profile(file_path: Path, dry_run: bool = True) -> dict:
    """Process a single profile and remove mismatched LinkedIn claims."""
    stats = {
        'file': file_path.name,
        'claims_before': 0,
        'claims_after': 0,
        'removed_mismatches': [],
        'kept_claims': 0,
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            profile = json.load(f)
    except (json.JSONDecodeError, FileNotFoundError) as e:
        stats['error'] = str(e)
        return stats

    # Get the profile's own LinkedIn slug
    linkedin_slug = profile.get('linkedin_slug', '')
    if not linkedin_slug:
        stats['no_linkedin_slug'] = True
        return stats

    profile_slug_normalized = normalize_slug(linkedin_slug)

    web_claims = profile.get('web_claims', [])
    stats['claims_before'] = len(web_claims)

    if not web_claims:
        return stats

    # Process claims, keeping only those whose LinkedIn source matches this profile
    kept_claims = []

    for claim in web_claims:
        source_url = claim.get('provenance', {}).get('source_url', '')

        # Only check LinkedIn sources
        if 'linkedin.com/in/' not in source_url:
            kept_claims.append(claim)
            continue

        # Extract and normalize source slug
        source_slug = source_url.split('/in/')[-1]
        source_slug_normalized = normalize_slug(source_slug)

        # Check if it matches
        if profile_slug_normalized == source_slug_normalized:
            # Exact match - keep
            kept_claims.append(claim)
        elif (source_slug_normalized.startswith(profile_slug_normalized) or
              profile_slug_normalized.startswith(source_slug_normalized)):
            # Close match (one normalized slug is a prefix of the other,
            # e.g. a numeric-suffix variant) - keep but flag for review
            stats.setdefault('close_matches', []).append(source_slug)
            kept_claims.append(claim)
        else:
            # TRUE MISMATCH - this is from a different person!
            stats['removed_mismatches'].append({
                'claim_type': claim.get('claim_type'),
                'claim_value': str(claim.get('claim_value', ''))[:100],
                'profile_slug': linkedin_slug,
                'claim_source_slug': source_slug,
                'reason': f"LinkedIn profile mismatch: profile is '{linkedin_slug}' "
                          f"but claim is from '{source_slug}'"
            })

    stats['claims_after'] = len(kept_claims)
    stats['kept_claims'] = len(kept_claims)

    # Update profile if not dry run and we removed something
    if not dry_run and stats['removed_mismatches']:
        profile['web_claims'] = kept_claims

        # Add cleanup metadata
        if 'enrichment_metadata' not in profile:
            profile['enrichment_metadata'] = {}

        cleanup_entry = {
            'cleanup_date': datetime.now(timezone.utc).isoformat(),
            'cleanup_script': 'cleanup_linkedin_mismatches.py',
            'claims_removed': len(stats['removed_mismatches']),
            'removal_reason': 'LinkedIn profile slug mismatch - claims from different person',
        }

        if 'cleanup_history' not in profile['enrichment_metadata']:
            profile['enrichment_metadata']['cleanup_history'] = []
        profile['enrichment_metadata']['cleanup_history'].append(cleanup_entry)

        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(profile, f, indent=2, ensure_ascii=False)

    return stats
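

# A typical return value from process_profile (values are illustrative):
#
#   {'file': 'ID_000123.json', 'claims_before': 5, 'claims_after': 4,
#    'kept_claims': 4,
#    'removed_mismatches': [{'claim_type': '...', 'claim_value': '...',
#                            'profile_slug': 'jane-doe',
#                            'claim_source_slug': 'jane-doe-dev',
#                            'reason': 'LinkedIn profile mismatch: ...'}]}

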
def main():
    parser = argparse.ArgumentParser(description='Remove LinkedIn claims from wrong profiles')
    parser.add_argument('--dry-run', action='store_true',
                        help='Do not modify files, just report (this is the default)')
    parser.add_argument('--execute', action='store_true',
                        help='Actually modify files')
    parser.add_argument('--limit', type=int, default=None,
                        help='Process only N files')

    args = parser.parse_args()
    # Dry run unless --execute is given; --dry-run exists only to make intent explicit
    dry_run = not args.execute

    person_dir = Path('/Users/kempersc/apps/glam/data/person')
    files = sorted(person_dir.glob('ID_*.json'))

    if args.limit:
        files = files[:args.limit]

    print(f"{'DRY RUN - ' if dry_run else ''}Processing {len(files):,} files...")
    print("=" * 70)

    total_stats = {
        'files_processed': 0,
        'files_with_mismatches': 0,
        'claims_removed': 0,
        'claims_kept': 0,
    }

    removal_log = []

    for file_path in files:
        stats = process_profile(file_path, dry_run=dry_run)
        total_stats['files_processed'] += 1

        if stats.get('error'):
            continue

        if stats['removed_mismatches']:
            total_stats['files_with_mismatches'] += 1
            total_stats['claims_removed'] += len(stats['removed_mismatches'])
            removal_log.append(stats)

            print(f"\n{file_path.name}:")
            for mismatch in stats['removed_mismatches']:
                print(f"  REMOVED: {mismatch['claim_type']}")
                print(f"    Profile: linkedin.com/in/{mismatch['profile_slug']}")
                print(f"    Claim from: linkedin.com/in/{mismatch['claim_source_slug']}")

        total_stats['claims_kept'] += stats.get('kept_claims', 0)

    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {total_stats['files_processed']:,}")
    print(f"Files with mismatches: {total_stats['files_with_mismatches']:,}")
    print(f"Claims REMOVED (wrong person): {total_stats['claims_removed']:,}")
    print(f"Claims kept: {total_stats['claims_kept']:,}")

    if dry_run:
        print("\n*** DRY RUN - No files were modified ***")
        print("Run with --execute to apply changes")

    # Save log
    log_path = person_dir / '_linkedin_mismatch_cleanup_log.json'
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump({
            'cleanup_date': datetime.now(timezone.utc).isoformat(),
            'dry_run': dry_run,
            'total_stats': total_stats,
            'removal_details': removal_log,
        }, f, indent=2, ensure_ascii=False)

    print(f"\nLog saved to: {log_path}")


if __name__ == '__main__':
    main()