glam/scripts/detect_name_mismatch.py
kempersc 0c36429257 feat(scripts): Add batch crawling and data quality scripts
- batch_crawl4ai_recrawl.py: Retry failed URL crawls
- batch_firecrawl_recrawl.py: FireCrawl batch processing
- batch_httpx_scrape.py: HTTPX-based scraping
- detect_name_mismatch.py: Find name mismatches in data
- enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment
- fix_collision_victims.py: GHCID collision resolution
- fix_generic_platform_names*.py: Platform name cleanup
- fix_ghcid_type.py: GHCID type corrections
- fix_simon_kemper_contamination.py: Data cleanup
- scan_dutch_data_quality.py: Data quality scanning
- transform_crawl4ai_to_digital_platform.py: Data transformation
2025-12-15 01:47:46 +01:00

213 lines
7.6 KiB
Python

#!/usr/bin/env python3
"""
Detect name mismatches in LinkedIn entity profiles.
Compares the LinkedIn URL slug with the assigned name to find:
1. Profiles where the name doesn't match the slug at all
2. Patterns of repeated wrong names (like "Simon Kemper")
3. Other potential filler/hallucinated names
"""
import csv
import json
import os
import re
import unicodedata
from collections import Counter, defaultdict
from pathlib import Path
from urllib.parse import unquote
def normalize_name(name: str) -> str:
    """Normalize a name for comparison.

    Decodes URL escapes, strips diacritics, lowercases, trims a trailing
    run of digits/hyphens/underscores (ID suffixes), and collapses the
    remaining separators into single spaces.
    """
    if not name:
        return ""
    decoded = unquote(name)
    # Decompose accented characters, then drop the combining marks (diacritics).
    decomposed = unicodedata.normalize('NFD', decoded)
    stripped = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    lowered = stripped.lower()
    # Trailing ID-like runs (numbers, hyphens, underscores) are noise.
    trimmed = re.sub(r'[-_\d]+$', '', lowered)
    # Separators become spaces; repeated whitespace collapses to one space.
    spaced = re.sub(r'[-_]+', ' ', trimmed)
    return ' '.join(spaced.split())
def extract_name_from_slug(slug: str) -> str:
    """Extract a human-readable name from a LinkedIn slug or entity filename.

    Strips the timestamped ``_YYYYMMDDTHHMMSSZ.json`` suffix, a bare
    ``.json`` extension, and trailing numeric/hex ID fragments, then
    delegates to normalize_name().

    Bug fix: filenames without a timestamp suffix previously kept their
    ``.json`` extension, which leaked into the derived name and caused
    spurious mismatches.
    """
    # Decode URL encoding first so the suffix regexes see plain text.
    slug = unquote(slug)
    # Remove timestamp suffix like _20251214T115050Z.json
    slug = re.sub(r'_\d{8}T\d{6}Z\.json$', '', slug)
    # Remove a plain .json extension for files without a timestamp suffix.
    slug = re.sub(r'\.json$', '', slug)
    # Remove trailing hex IDs (6+ chars) and trailing numeric IDs.
    slug = re.sub(r'[-_][\da-f]{6,}$', '', slug)
    slug = re.sub(r'[-_]\d+$', '', slug)
    return normalize_name(slug)
def names_match(slug_name: str, profile_name: str) -> bool:
    """Check if the slug name and profile name are reasonably similar.

    Matching is deliberately permissive: exact normalized equality, at
    least 50% of the slug's distinct words appearing in the profile name
    (when the slug has two or more distinct words), or an identical first
    word. Empty input on either side never matches.
    """
    if not slug_name or not profile_name:
        return False
    norm_slug = normalize_name(slug_name)
    norm_profile = normalize_name(profile_name)
    # Direct match after normalization.
    if norm_slug == norm_profile:
        return True
    slug_tokens = norm_slug.split()
    profile_tokens = norm_profile.split()
    slug_vocab = set(slug_tokens)
    # Multi-word slugs: require at least half the distinct slug words
    # to appear among the profile's words.
    if len(slug_vocab) >= 2:
        shared = slug_vocab & set(profile_tokens)
        if len(shared) >= len(slug_vocab) * 0.5:
            return True
    # Fall back to a first-name-only match.
    if slug_tokens and profile_tokens and slug_tokens[0] == profile_tokens[0]:
        return True
    return False
def analyze_entity_files(entity_dir: Path) -> dict:
    """Analyze all entity JSON files in *entity_dir* for name mismatches.

    Args:
        entity_dir: Directory containing ``*.json`` entity profile files.

    Returns:
        Dict with keys:
            total_files: number of JSON files scanned
            fallback_files: count extracted via the 'fallback_basic' method
            mismatches: list of dicts describing slug/profile name mismatches
            name_counter: Counter of profile-name frequencies
            files_by_name: mapping of profile name -> list of filenames
    """
    mismatches = []
    name_counter = Counter()
    files_by_name = defaultdict(list)
    total_files = 0
    fallback_files = 0
    for filepath in entity_dir.glob("*.json"):
        total_files += 1
        filename = filepath.name
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, IOError) as e:
            # Bug fix: report which file failed instead of the literal
            # placeholder "(unknown)".
            print(f"Error reading {filepath}: {e}")
            continue
        # Prefer the scraped profile name; fall back to the source staff record.
        profile_name = None
        if 'profile_data' in data and 'name' in data['profile_data']:
            profile_name = data['profile_data']['name']
        elif 'source_staff_info' in data and 'name' in data['source_staff_info']:
            profile_name = data['source_staff_info']['name']
        if not profile_name:
            continue
        # Track all names for frequency analysis (filler-name detection).
        name_counter[profile_name] += 1
        files_by_name[profile_name].append(filename)
        # Fallback-extracted files are counted separately; they are more
        # likely to carry wrong/filler names.
        extraction_method = data.get('extraction_metadata', {}).get('extraction_method', '')
        if extraction_method == 'fallback_basic':
            fallback_files += 1
        # Derive the expected name from the file's slug and compare.
        slug_name = extract_name_from_slug(filename)
        if not names_match(slug_name, profile_name):
            mismatches.append({
                'filename': filename,
                'slug_name': slug_name,
                'profile_name': profile_name,
                'extraction_method': extraction_method,
                'linkedin_url': data.get('extraction_metadata', {}).get('linkedin_url', '')
            })
    return {
        'total_files': total_files,
        'fallback_files': fallback_files,
        'mismatches': mismatches,
        'name_counter': name_counter,
        'files_by_name': files_by_name
    }
def main(entity_dir=None):
    """Run the mismatch analysis and write console, CSV, and JSON reports.

    Args:
        entity_dir: Optional Path to the entity JSON directory; defaults to
            the project's person-entity data directory (backward compatible
            with the previous hard-coded path).
    """
    if entity_dir is None:
        entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
    print("=" * 80)
    print("LINKEDIN ENTITY NAME MISMATCH ANALYSIS")
    print("=" * 80)
    print()
    results = analyze_entity_files(entity_dir)
    print(f"Total entity files analyzed: {results['total_files']}")
    print(f"Fallback (basic) files: {results['fallback_files']}")
    print(f"Total mismatches detected: {len(results['mismatches'])}")
    print()
    # Find names that appear suspiciously often (potential filler names).
    print("=" * 80)
    print("NAMES APPEARING MORE THAN 5 TIMES (Potential Filler Names)")
    print("=" * 80)
    frequent_names = [(name, count) for name, count in results['name_counter'].most_common(50) if count > 5]
    for name, count in frequent_names:
        # Cross-reference frequency with the mismatch list.
        mismatch_count = sum(1 for m in results['mismatches'] if m['profile_name'] == name)
        print(f"  '{name}': {count} occurrences ({mismatch_count} are mismatches)")
    print()
    print("=" * 80)
    print("ALL MISMATCHED FILES (slug name != profile name)")
    print("=" * 80)
    # Group mismatches by profile_name to surface repeated wrong names.
    mismatch_by_name = defaultdict(list)
    for m in results['mismatches']:
        mismatch_by_name[m['profile_name']].append(m)
    # Sort by how many distinct slugs each name was (wrongly) assigned to.
    sorted_names = sorted(mismatch_by_name.items(), key=lambda x: -len(x[1]))
    for profile_name, items in sorted_names[:30]:  # Top 30 most frequent mismatch names
        print(f"\n--- '{profile_name}' assigned to {len(items)} different slugs ---")
        for item in items[:10]:  # Show first 10 examples
            print(f"  Slug: {item['slug_name']}")
            print(f"    File: {item['filename']}")
            print(f"    Method: {item['extraction_method']}")
    print()
    # Output detailed CSV for further analysis.
    # Bug fix: use csv.writer instead of hand-rolled quoting, which produced
    # malformed rows whenever a field contained a double quote.
    csv_path = entity_dir.parent / "name_mismatch_report.csv"
    fieldnames = ['filename', 'slug_name', 'profile_name', 'extraction_method', 'linkedin_url']
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames)
        for m in results['mismatches']:
            writer.writerow([m[key] for key in fieldnames])
    print(f"\nDetailed report saved to: {csv_path}")
    # Also output JSON for programmatic use.
    json_path = entity_dir.parent / "name_mismatch_report.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'total_files': results['total_files'],
            'fallback_files': results['fallback_files'],
            'total_mismatches': len(results['mismatches']),
            'mismatches_by_name': {name: len(items) for name, items in mismatch_by_name.items()},
            'frequent_names': [(name, count) for name, count in results['name_counter'].most_common(100)],
            'mismatches': results['mismatches']
        }, f, indent=2, ensure_ascii=False)
    print(f"JSON report saved to: {json_path}")