#!/usr/bin/env python3
"""Detect name mismatches in LinkedIn entity profiles.

Compares the LinkedIn URL slug with the assigned name to find:
1. Profiles where the name doesn't match the slug at all
2. Patterns of repeated wrong names (like "Simon Kemper")
3. Other potential filler/hallucinated names
"""

import csv
import json
import os
import re
import sys
import unicodedata
from collections import Counter, defaultdict
from pathlib import Path
from urllib.parse import unquote

# Default location of the entity JSON files; can be overridden via argv[1].
DEFAULT_ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")


def normalize_name(name: str) -> str:
    """Normalize a name for comparison.

    URL-decodes, strips diacritics, lowercases, removes trailing
    numeric/hyphen suffixes (slug IDs), and collapses hyphens,
    underscores, and runs of whitespace into single spaces.
    Returns "" for empty/None input.
    """
    if not name:
        return ""
    # Decode URL encoding (e.g. %C3%A9 -> é)
    name = unquote(name)
    # Decompose unicode so diacritics become separate combining marks...
    name = unicodedata.normalize('NFD', name)
    # ...then drop the combining marks (category 'Mn') to remove diacritics
    name = ''.join(c for c in name if unicodedata.category(c) != 'Mn')
    # Lowercase
    name = name.lower()
    # Remove common suffixes like numbers, hyphens
    name = re.sub(r'[-_\d]+$', '', name)
    # Replace hyphens/underscores with spaces
    name = re.sub(r'[-_]+', ' ', name)
    # Remove extra whitespace
    return ' '.join(name.split())


def extract_name_from_slug(slug: str) -> str:
    """Extract a normalized human-readable name from a LinkedIn slug/filename.

    Strips the archive timestamp suffix (e.g. ``_20251214T115050Z.json``),
    trailing hex IDs, and trailing numeric IDs before normalizing.
    """
    # Decode URL encoding
    slug = unquote(slug)
    # Remove timestamp suffix like _20251214T115050Z.json
    slug = re.sub(r'_\d{8}T\d{6}Z\.json$', '', slug)
    # Remove trailing hex ID (6+ hex chars) then trailing numeric ID
    slug = re.sub(r'[-_][\da-f]{6,}$', '', slug)
    slug = re.sub(r'[-_]\d+$', '', slug)
    return normalize_name(slug)


def names_match(slug_name: str, profile_name: str) -> bool:
    """Check if the slug name and profile name are reasonably similar.

    Matches when (in order): both normalize to the same string; at least
    half of a multi-word slug's words appear in the profile name; or the
    first words (first names) agree. Empty inputs never match.
    """
    if not slug_name or not profile_name:
        return False

    slug_normalized = normalize_name(slug_name)
    profile_normalized = normalize_name(profile_name)

    # Direct match
    if slug_normalized == profile_normalized:
        return True

    # Word-overlap check: at least half the slug words should be in the
    # profile name (only meaningful for multi-word slugs).
    slug_words = set(slug_normalized.split())
    profile_words = set(profile_normalized.split())
    if slug_words and len(slug_words) >= 2:
        overlap = slug_words & profile_words
        if len(overlap) >= len(slug_words) * 0.5:
            return True

    # Fall back to first-name match
    slug_parts = slug_normalized.split()
    profile_parts = profile_normalized.split()
    if slug_parts and profile_parts and slug_parts[0] == profile_parts[0]:
        return True

    return False


def analyze_entity_files(entity_dir: Path) -> dict:
    """Analyze all entity JSON files in *entity_dir* for name mismatches.

    Returns a dict with: total_files, fallback_files, mismatches (list of
    per-file detail dicts), name_counter (Counter of profile names), and
    files_by_name (name -> list of filenames).
    """
    mismatches = []
    name_counter = Counter()
    files_by_name = defaultdict(list)
    total_files = 0
    fallback_files = 0

    for filepath in entity_dir.glob("*.json"):
        total_files += 1
        filename = filepath.name

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, IOError) as e:
            # BUG FIX: previously printed the literal "(unknown)" instead of
            # identifying which file failed to parse.
            print(f"Error reading {filename}: {e}")
            continue

        # Get the profile name: prefer profile_data, fall back to
        # source_staff_info.
        profile_name = None
        if 'profile_data' in data and 'name' in data['profile_data']:
            profile_name = data['profile_data']['name']
        elif 'source_staff_info' in data and 'name' in data['source_staff_info']:
            profile_name = data['source_staff_info']['name']
        if not profile_name:
            continue

        # Track all names for frequency analysis
        name_counter[profile_name] += 1
        files_by_name[profile_name].append(filename)

        # Check if this is a fallback file (counted only for named profiles,
        # matching the original behavior).
        extraction_method = data.get('extraction_metadata', {}).get('extraction_method', '')
        if extraction_method == 'fallback_basic':
            fallback_files += 1

        # Extract name from slug and compare with the assigned name
        slug_name = extract_name_from_slug(filename)
        if not names_match(slug_name, profile_name):
            mismatches.append({
                'filename': filename,
                'slug_name': slug_name,
                'profile_name': profile_name,
                'extraction_method': extraction_method,
                'linkedin_url': data.get('extraction_metadata', {}).get('linkedin_url', ''),
            })

    return {
        'total_files': total_files,
        'fallback_files': fallback_files,
        'mismatches': mismatches,
        'name_counter': name_counter,
        'files_by_name': files_by_name,
    }


def main() -> None:
    """Run the analysis and emit console, CSV, and JSON reports."""
    # Backward-compatible generalization: optional argv[1] overrides the
    # default entity directory.
    entity_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_ENTITY_DIR

    print("=" * 80)
    print("LINKEDIN ENTITY NAME MISMATCH ANALYSIS")
    print("=" * 80)
    print()

    results = analyze_entity_files(entity_dir)

    print(f"Total entity files analyzed: {results['total_files']}")
    print(f"Fallback (basic) files: {results['fallback_files']}")
    print(f"Total mismatches detected: {len(results['mismatches'])}")
    print()

    # Find names that appear suspiciously often (potential filler names)
    print("=" * 80)
    print("NAMES APPEARING MORE THAN 5 TIMES (Potential Filler Names)")
    print("=" * 80)
    frequent_names = [(name, count)
                      for name, count in results['name_counter'].most_common(50)
                      if count > 5]
    for name, count in frequent_names:
        # Check if this name appears in mismatches
        mismatch_count = sum(1 for m in results['mismatches']
                             if m['profile_name'] == name)
        print(f"  '{name}': {count} occurrences ({mismatch_count} are mismatches)")
    print()

    print("=" * 80)
    print("ALL MISMATCHED FILES (slug name != profile name)")
    print("=" * 80)

    # Group mismatches by profile_name to see patterns
    mismatch_by_name = defaultdict(list)
    for m in results['mismatches']:
        mismatch_by_name[m['profile_name']].append(m)

    # Sort by frequency of the mismatched name (most frequent first)
    sorted_names = sorted(mismatch_by_name.items(), key=lambda x: -len(x[1]))
    for profile_name, items in sorted_names[:30]:  # Top 30 most frequent mismatch names
        print(f"\n--- '{profile_name}' assigned to {len(items)} different slugs ---")
        for item in items[:10]:  # Show first 10 examples
            print(f"  Slug: {item['slug_name']}")
            print(f"    File: {item['filename']}")
            print(f"    Method: {item['extraction_method']}")
    print()

    # Output detailed CSV for further analysis.
    # ROBUSTNESS FIX: use the csv module so fields containing quotes or
    # commas are escaped correctly (the old f-string writer produced
    # corrupt rows for such values).
    csv_path = entity_dir.parent / "name_mismatch_report.csv"
    fieldnames = ['filename', 'slug_name', 'profile_name',
                  'extraction_method', 'linkedin_url']
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(results['mismatches'])
    print(f"\nDetailed report saved to: {csv_path}")

    # Also output JSON for programmatic use
    json_path = entity_dir.parent / "name_mismatch_report.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'total_files': results['total_files'],
            'fallback_files': results['fallback_files'],
            'total_mismatches': len(results['mismatches']),
            'mismatches_by_name': {name: len(items)
                                   for name, items in mismatch_by_name.items()},
            'frequent_names': [(name, count)
                               for name, count in results['name_counter'].most_common(100)],
            'mismatches': results['mismatches'],
        }, f, indent=2, ensure_ascii=False)
    print(f"JSON report saved to: {json_path}")


if __name__ == "__main__":
    main()