glam/scripts/detect_name_mismatch.py
kempersc 0c36429257 feat(scripts): Add batch crawling and data quality scripts
- batch_crawl4ai_recrawl.py: Retry failed URL crawls
- batch_firecrawl_recrawl.py: FireCrawl batch processing
- batch_httpx_scrape.py: HTTPX-based scraping
- detect_name_mismatch.py: Find name mismatches in data
- enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment
- fix_collision_victims.py: GHCID collision resolution
- fix_generic_platform_names*.py: Platform name cleanup
- fix_ghcid_type.py: GHCID type corrections
- fix_simon_kemper_contamination.py: Data cleanup
- scan_dutch_data_quality.py: Data quality scanning
- transform_crawl4ai_to_digital_platform.py: Data transformation
2025-12-15 01:47:46 +01:00

213 lines
7.6 KiB
Python

#!/usr/bin/env python3
"""
Detect name mismatches in LinkedIn entity profiles.
Compares the LinkedIn URL slug with the assigned name to find:
1. Profiles where the name doesn't match the slug at all
2. Patterns of repeated wrong names (like "Simon Kemper")
3. Other potential filler/hallucinated names
"""
import csv
import json
import os
import re
import unicodedata
from collections import Counter, defaultdict
from pathlib import Path
from urllib.parse import unquote
def normalize_name(name: str) -> str:
    """Normalize a name for comparison.

    Decodes URL escapes, strips diacritics, lowercases, trims a trailing
    run of digits/hyphens/underscores (ID suffixes), and collapses the
    remaining separators into single spaces.
    """
    if not name:
        return ""
    decoded = unquote(name)
    # Decompose accented characters, then drop the combining marks (diacritics).
    decomposed = unicodedata.normalize('NFD', decoded)
    stripped = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    lowered = stripped.lower()
    # Trailing ID-like runs (numbers, hyphens, underscores) are noise.
    trimmed = re.sub(r'[-_\d]+$', '', lowered)
    # Separators become spaces; repeated whitespace collapses to one space.
    spaced = re.sub(r'[-_]+', ' ', trimmed)
    return ' '.join(spaced.split())
def extract_name_from_slug(slug: str) -> str:
    """Extract a human-readable name from a LinkedIn slug or entity filename.

    Strips the timestamped ``_YYYYMMDDTHHMMSSZ.json`` suffix, a bare
    ``.json`` extension, and trailing numeric/hex ID fragments, then
    delegates to normalize_name().

    Bug fix: filenames without a timestamp suffix previously kept their
    ``.json`` extension, which leaked into the derived name and caused
    spurious mismatches.
    """
    # Decode URL encoding first so the suffix regexes see plain text.
    slug = unquote(slug)
    # Remove timestamp suffix like _20251214T115050Z.json
    slug = re.sub(r'_\d{8}T\d{6}Z\.json$', '', slug)
    # Remove a plain .json extension for files without a timestamp suffix.
    slug = re.sub(r'\.json$', '', slug)
    # Remove trailing hex IDs (6+ chars) and trailing numeric IDs.
    slug = re.sub(r'[-_][\da-f]{6,}$', '', slug)
    slug = re.sub(r'[-_]\d+$', '', slug)
    return normalize_name(slug)
def names_match(slug_name: str, profile_name: str) -> bool:
    """Check if the slug name and profile name are reasonably similar.

    Matching is deliberately permissive: exact normalized equality, at
    least 50% of the slug's distinct words appearing in the profile name
    (when the slug has two or more distinct words), or an identical first
    word. Empty input on either side never matches.
    """
    if not slug_name or not profile_name:
        return False
    norm_slug = normalize_name(slug_name)
    norm_profile = normalize_name(profile_name)
    # Direct match after normalization.
    if norm_slug == norm_profile:
        return True
    slug_tokens = norm_slug.split()
    profile_tokens = norm_profile.split()
    slug_vocab = set(slug_tokens)
    # Multi-word slugs: require at least half the distinct slug words
    # to appear among the profile's words.
    if len(slug_vocab) >= 2:
        shared = slug_vocab & set(profile_tokens)
        if len(shared) >= len(slug_vocab) * 0.5:
            return True
    # Fall back to a first-name-only match.
    if slug_tokens and profile_tokens and slug_tokens[0] == profile_tokens[0]:
        return True
    return False
def analyze_entity_files(entity_dir: Path) -> dict:
    """Analyze all entity JSON files in *entity_dir* for name mismatches.

    Args:
        entity_dir: Directory containing ``*.json`` entity profile files.

    Returns:
        Dict with keys:
            total_files: number of JSON files scanned
            fallback_files: count extracted via the 'fallback_basic' method
            mismatches: list of dicts describing slug/profile name mismatches
            name_counter: Counter of profile-name frequencies
            files_by_name: mapping of profile name -> list of filenames
    """
    mismatches = []
    name_counter = Counter()
    files_by_name = defaultdict(list)
    total_files = 0
    fallback_files = 0
    for filepath in entity_dir.glob("*.json"):
        total_files += 1
        filename = filepath.name
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, IOError) as e:
            # Bug fix: report which file failed instead of the literal
            # placeholder "(unknown)".
            print(f"Error reading {filepath}: {e}")
            continue
        # Prefer the scraped profile name; fall back to the source staff record.
        profile_name = None
        if 'profile_data' in data and 'name' in data['profile_data']:
            profile_name = data['profile_data']['name']
        elif 'source_staff_info' in data and 'name' in data['source_staff_info']:
            profile_name = data['source_staff_info']['name']
        if not profile_name:
            continue
        # Track all names for frequency analysis (filler-name detection).
        name_counter[profile_name] += 1
        files_by_name[profile_name].append(filename)
        # Fallback-extracted files are counted separately; they are more
        # likely to carry wrong/filler names.
        extraction_method = data.get('extraction_metadata', {}).get('extraction_method', '')
        if extraction_method == 'fallback_basic':
            fallback_files += 1
        # Derive the expected name from the file's slug and compare.
        slug_name = extract_name_from_slug(filename)
        if not names_match(slug_name, profile_name):
            mismatches.append({
                'filename': filename,
                'slug_name': slug_name,
                'profile_name': profile_name,
                'extraction_method': extraction_method,
                'linkedin_url': data.get('extraction_metadata', {}).get('linkedin_url', '')
            })
    return {
        'total_files': total_files,
        'fallback_files': fallback_files,
        'mismatches': mismatches,
        'name_counter': name_counter,
        'files_by_name': files_by_name
    }
def main(entity_dir=None):
    """Run the mismatch analysis and write console, CSV, and JSON reports.

    Args:
        entity_dir: Optional Path to the entity JSON directory; defaults to
            the project's person-entity data directory (backward compatible
            with the previous hard-coded path).
    """
    if entity_dir is None:
        entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
    print("=" * 80)
    print("LINKEDIN ENTITY NAME MISMATCH ANALYSIS")
    print("=" * 80)
    print()
    results = analyze_entity_files(entity_dir)
    print(f"Total entity files analyzed: {results['total_files']}")
    print(f"Fallback (basic) files: {results['fallback_files']}")
    print(f"Total mismatches detected: {len(results['mismatches'])}")
    print()
    # Find names that appear suspiciously often (potential filler names).
    print("=" * 80)
    print("NAMES APPEARING MORE THAN 5 TIMES (Potential Filler Names)")
    print("=" * 80)
    frequent_names = [(name, count) for name, count in results['name_counter'].most_common(50) if count > 5]
    for name, count in frequent_names:
        # Cross-reference frequency with the mismatch list.
        mismatch_count = sum(1 for m in results['mismatches'] if m['profile_name'] == name)
        print(f"  '{name}': {count} occurrences ({mismatch_count} are mismatches)")
    print()
    print("=" * 80)
    print("ALL MISMATCHED FILES (slug name != profile name)")
    print("=" * 80)
    # Group mismatches by profile_name to surface repeated wrong names.
    mismatch_by_name = defaultdict(list)
    for m in results['mismatches']:
        mismatch_by_name[m['profile_name']].append(m)
    # Sort by how many distinct slugs each name was (wrongly) assigned to.
    sorted_names = sorted(mismatch_by_name.items(), key=lambda x: -len(x[1]))
    for profile_name, items in sorted_names[:30]:  # Top 30 most frequent mismatch names
        print(f"\n--- '{profile_name}' assigned to {len(items)} different slugs ---")
        for item in items[:10]:  # Show first 10 examples
            print(f"  Slug: {item['slug_name']}")
            print(f"    File: {item['filename']}")
            print(f"    Method: {item['extraction_method']}")
    print()
    # Output detailed CSV for further analysis.
    # Bug fix: use csv.writer instead of hand-rolled quoting, which produced
    # malformed rows whenever a field contained a double quote.
    csv_path = entity_dir.parent / "name_mismatch_report.csv"
    fieldnames = ['filename', 'slug_name', 'profile_name', 'extraction_method', 'linkedin_url']
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames)
        for m in results['mismatches']:
            writer.writerow([m[key] for key in fieldnames])
    print(f"\nDetailed report saved to: {csv_path}")
    # Also output JSON for programmatic use.
    json_path = entity_dir.parent / "name_mismatch_report.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'total_files': results['total_files'],
            'fallback_files': results['fallback_files'],
            'total_mismatches': len(results['mismatches']),
            'mismatches_by_name': {name: len(items) for name, items in mismatch_by_name.items()},
            'frequent_names': [(name, count) for name, count in results['name_counter'].most_common(100)],
            'mismatches': results['mismatches']
        }, f, indent=2, ensure_ascii=False)
    print(f"JSON report saved to: {json_path}")