- batch_crawl4ai_recrawl.py: Retry failed URL crawls - batch_firecrawl_recrawl.py: FireCrawl batch processing - batch_httpx_scrape.py: HTTPX-based scraping - detect_name_mismatch.py: Find name mismatches in data - enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment - fix_collision_victims.py: GHCID collision resolution - fix_generic_platform_names*.py: Platform name cleanup - fix_ghcid_type.py: GHCID type corrections - fix_simon_kemper_contamination.py: Data cleanup - scan_dutch_data_quality.py: Data quality scanning - transform_crawl4ai_to_digital_platform.py: Data transformation
213 lines
7.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Detect name mismatches in LinkedIn entity profiles.
|
|
|
|
Compares the LinkedIn URL slug with the assigned name to find:
|
|
1. Profiles where the name doesn't match the slug at all
|
|
2. Patterns of repeated wrong names (like "Simon Kemper")
|
|
3. Other potential filler/hallucinated names
|
|
"""
|
|
|
|
import csv
import json
import os
import re
import unicodedata
from collections import Counter, defaultdict
from pathlib import Path
from urllib.parse import unquote
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize a personal name for fuzzy comparison.

    URL-decodes the string, strips diacritics, lowercases, removes a
    trailing run of digits/hyphens/underscores (id suffixes), treats the
    remaining hyphens/underscores as word separators, and collapses
    whitespace. Returns "" for falsy input.
    """
    if not name:
        return ""

    decoded = unquote(name)
    # Decompose accented characters (NFD), then drop the combining marks
    # so e.g. 'José' compares equal to 'Jose'.
    decomposed = unicodedata.normalize('NFD', decoded)
    without_marks = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    lowered = without_marks.lower()
    # Strip id-like suffixes: trailing digits, hyphens, underscores.
    lowered = re.sub(r'[-_\d]+$', '', lowered)
    # Hyphens/underscores act as word separators in slugs.
    spaced = re.sub(r'[-_]+', ' ', lowered)
    # Collapse any run of whitespace to single spaces.
    return ' '.join(spaced.split())
|
|
|
|
def extract_name_from_slug(slug: str) -> str:
    """Extract a normalized human-readable name from a LinkedIn slug/filename.

    Strips a ``.json`` extension, a timestamp suffix such as
    ``_20251214T115050Z``, and trailing hex/numeric id fragments, then
    normalizes the remainder with ``normalize_name``.
    """
    # Decode URL encoding
    slug = unquote(slug)
    # Strip the file extension on its own first. The previous combined
    # pattern (r'_\d{8}T\d{6}Z\.json$') only removed '.json' when a
    # timestamp was present, so filenames without one kept '.json' in the
    # extracted name and never matched their profile name.
    slug = re.sub(r'\.json$', '', slug)
    # Remove timestamp suffix like _20251214T115050Z
    slug = re.sub(r'_\d{8}T\d{6}Z$', '', slug)
    # Remove trailing hex ids (6+ hex chars) and plain numeric ids.
    slug = re.sub(r'[-_][\da-f]{6,}$', '', slug)
    slug = re.sub(r'[-_]\d+$', '', slug)
    return normalize_name(slug)
|
|
|
|
def names_match(slug_name: str, profile_name: str) -> bool:
    """Return True when the slug-derived name plausibly matches the profile name.

    A match is accepted when the normalized strings are equal, when at
    least half of a multi-word slug's distinct words occur in the profile
    name, or when the first words (first names) coincide. Empty input on
    either side is never a match.
    """
    if not slug_name or not profile_name:
        return False

    norm_slug = normalize_name(slug_name)
    norm_profile = normalize_name(profile_name)

    # Exact match after normalization.
    if norm_slug == norm_profile:
        return True

    slug_parts = norm_slug.split()
    profile_parts = norm_profile.split()
    slug_word_set = set(slug_parts)
    profile_word_set = set(profile_parts)

    # Multi-word slug: accept when at least half of its distinct words
    # also appear in the profile name.
    if len(slug_word_set) >= 2:
        shared = slug_word_set & profile_word_set
        if len(shared) >= len(slug_word_set) * 0.5:
            return True

    # Last resort: matching first names is considered good enough.
    if slug_parts and profile_parts and slug_parts[0] == profile_parts[0]:
        return True

    return False
|
|
|
|
def analyze_entity_files(entity_dir: Path):
    """Scan every ``*.json`` entity file in *entity_dir* for name mismatches.

    Returns a dict with:
        total_files     -- number of JSON files examined
        fallback_files  -- files whose extraction_method was 'fallback_basic'
        mismatches      -- list of dicts describing slug/profile mismatches
        name_counter    -- Counter of profile-name frequencies
        files_by_name   -- mapping of profile name -> list of filenames
    """
    mismatches = []
    name_counter = Counter()
    files_by_name = defaultdict(list)
    total_files = 0
    fallback_files = 0

    for filepath in entity_dir.glob("*.json"):
        total_files += 1
        filename = filepath.name

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, OSError) as e:
            # Name the offending file — the previous message printed a
            # fixed "(unknown)" placeholder, making failures untraceable.
            print(f"Error reading {filepath}: {e}")
            continue

        # Prefer the scraped profile name; fall back to the staff listing.
        profile_name = None
        if 'profile_data' in data and 'name' in data['profile_data']:
            profile_name = data['profile_data']['name']
        elif 'source_staff_info' in data and 'name' in data['source_staff_info']:
            profile_name = data['source_staff_info']['name']

        if not profile_name:
            continue

        # Track all names for frequency analysis
        name_counter[profile_name] += 1
        files_by_name[profile_name].append(filename)

        # Fallback extractions are lower quality; count them separately.
        extraction_method = data.get('extraction_metadata', {}).get('extraction_method', '')
        if extraction_method == 'fallback_basic':
            fallback_files += 1

        # Extract name from slug
        slug_name = extract_name_from_slug(filename)

        # Record any file whose slug and profile name don't plausibly match.
        if not names_match(slug_name, profile_name):
            mismatches.append({
                'filename': filename,
                'slug_name': slug_name,
                'profile_name': profile_name,
                'extraction_method': extraction_method,
                'linkedin_url': data.get('extraction_metadata', {}).get('linkedin_url', ''),
            })

    return {
        'total_files': total_files,
        'fallback_files': fallback_files,
        'mismatches': mismatches,
        'name_counter': name_counter,
        'files_by_name': files_by_name,
    }
|
|
|
|
def main():
    """Run the mismatch analysis and emit console, CSV, and JSON reports.

    Reads entity files from a hard-coded directory, prints summary
    statistics and the most frequent (potentially filler) names, then
    writes `name_mismatch_report.csv` and `name_mismatch_report.json`
    next to the entity directory.
    """
    entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

    print("=" * 80)
    print("LINKEDIN ENTITY NAME MISMATCH ANALYSIS")
    print("=" * 80)
    print()

    results = analyze_entity_files(entity_dir)

    print(f"Total entity files analyzed: {results['total_files']}")
    print(f"Fallback (basic) files: {results['fallback_files']}")
    print(f"Total mismatches detected: {len(results['mismatches'])}")
    print()

    # Find names that appear suspiciously often (potential filler names)
    print("=" * 80)
    print("NAMES APPEARING MORE THAN 5 TIMES (Potential Filler Names)")
    print("=" * 80)
    frequent_names = [(name, count) for name, count in results['name_counter'].most_common(50) if count > 5]

    for name, count in frequent_names:
        # Check if this name appears in mismatches
        mismatch_count = sum(1 for m in results['mismatches'] if m['profile_name'] == name)
        print(f" '{name}': {count} occurrences ({mismatch_count} are mismatches)")

    print()
    print("=" * 80)
    print("ALL MISMATCHED FILES (slug name != profile name)")
    print("=" * 80)

    # Group mismatches by profile_name to see patterns
    mismatch_by_name = defaultdict(list)
    for m in results['mismatches']:
        mismatch_by_name[m['profile_name']].append(m)

    # Sort by frequency of the mismatched name
    sorted_names = sorted(mismatch_by_name.items(), key=lambda x: -len(x[1]))

    for profile_name, items in sorted_names[:30]:  # Top 30 most frequent mismatch names
        print(f"\n--- '{profile_name}' assigned to {len(items)} different slugs ---")
        for item in items[:10]:  # Show first 10 examples
            print(f" Slug: {item['slug_name']}")
            print(f" File: {item['filename']}")
            print(f" Method: {item['extraction_method']}")
            print()

    # Output detailed CSV for further analysis. Use the csv module so
    # embedded double quotes / commas / newlines in names are escaped
    # correctly (the previous hand-rolled writer produced broken rows for
    # any field containing a '"').
    csv_path = entity_dir.parent / "name_mismatch_report.csv"
    fieldnames = ['filename', 'slug_name', 'profile_name', 'extraction_method', 'linkedin_url']
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(results['mismatches'])

    print(f"\nDetailed report saved to: {csv_path}")

    # Also output JSON for programmatic use
    json_path = entity_dir.parent / "name_mismatch_report.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'total_files': results['total_files'],
            'fallback_files': results['fallback_files'],
            'total_mismatches': len(results['mismatches']),
            'mismatches_by_name': {name: len(items) for name, items in mismatch_by_name.items()},
            'frequent_names': [(name, count) for name, count in results['name_counter'].most_common(100)],
            'mismatches': results['mismatches'],
        }, f, indent=2, ensure_ascii=False)

    print(f"JSON report saved to: {json_path}")
|
|
|
|
# Entry point: run the analysis only when executed as a script.
if __name__ == "__main__":
    main()
|