glam/scripts/fix_missing_entity_profiles.py

#!/usr/bin/env python3
"""
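Fix "Simon Kemper" name contamination in missing_entity_profiles.json.

Profiles whose name is recorded as "Simon Kemper" but whose LinkedIn slug
belongs to someone else get their name re-derived from the slug. The script
performs a dry run by default; pass --fix to write the changes to the file.

Uses the same slug-to-name logic as fix_simon_kemper_contamination.py.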
"""

import json
import re
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime, timezone


# Known compound slugs with their correct name interpretations.
# A "compound" slug has no hyphens, so there is nothing in the slug itself
# that shows where one name part ends and the next begins; these entries are
# hand-curated overrides. slug_to_name() looks the URL-decoded slug up here
# (exact, case-sensitive match) before attempting any automatic parsing.
KNOWN_COMPOUND_SLUGS = {
    'jponjee': 'J. Ponjee',
    'sharellyemanuelson': 'Sharelly Emanuelson',
    'addieroelofsen': 'Addie Roelofsen',
    'adheliap': 'Adhelia P.',
    'anejanboomsma': 'Anejan Boomsma',
    'fredericlogghe': 'Frederic Logghe',
    'dirkjanheinen': 'Dirkjan Heinen',
}


def is_compound_slug(slug: str) -> bool:
    """Return True when the URL-decoded slug contains no hyphen at all.

    Such slugs (e.g. ``jponjee``) run the name parts together, so they cannot
    be split into separate name parts by looking at the slug alone.
    """
    # Decode first so that an encoded hyphen (%2D) still counts as a hyphen.
    return '-' not in unquote(slug)


def slug_to_name(slug: str) -> tuple[str, bool]:
    """Derive a display name from a LinkedIn profile slug.

    The slug is URL-decoded, checked against the manual override table, and
    otherwise split on hyphens after removing a trailing profile ID.

    Returns:
        tuple: ``(name, is_reliable)`` where:
            - name: the derived name, or "Unknown" when none can be derived
            - is_reliable: True when the derivation can be trusted
    """
    decoded = unquote(slug)

    # Hand-curated overrides win over any automatic parsing.
    try:
        return (KNOWN_COMPOUND_SLUGS[decoded], True)
    except KeyError:
        pass

    # A hyphen-less slug that is not in the override table cannot be split
    # into name parts with any confidence.
    if is_compound_slug(slug):
        return ("Unknown", False)

    # Drop a trailing ID: first a hex-looking suffix (6+ chars), then a purely
    # numeric one (5+ digits). The order matters and matches the original.
    stem = decoded
    for id_pattern in (r'[-_][\da-f]{6,}$', r'[-_]\d{5,}$'):
        stem = re.sub(id_pattern, '', stem)

    # Split on hyphens, discarding empty fragments (e.g. from "--").
    tokens = [token for token in stem.split('-') if token]
    if not tokens:
        return ("Unknown", False)

    # Dutch particles stay lowercase unless they are the very first word.
    dutch_particles = {'van', 'de', 'den', 'der', 'het', 't', "'t"}

    # str.capitalize() upper-cases the first character and lower-cases the
    # remainder of each non-particle token.
    words = [
        token.lower()
        if index > 0 and token.lower() in dutch_particles
        else token.capitalize()
        for index, token in enumerate(tokens)
    ]
    name = ' '.join(words)

    # A single-character result is not a usable name.
    if len(name) < 2:
        return ("Unknown", False)

    return (name, True)


def fix_missing_entity_profiles(
    filepath: Path, dry_run: bool = True
) -> tuple[list[dict], int, int]:
    """Fix Simon Kemper contamination in missing_entity_profiles.json.

    Every entry under ``missing_heritage_profiles`` whose ``name`` is
    'Simon Kemper' but whose ``slug`` does not contain 'simonkemper' or
    'simon-kemper' is treated as contaminated, and its name is re-derived
    from the slug with ``slug_to_name``. Entries without a slug are skipped.

    Args:
        filepath: JSON file to inspect and, when ``dry_run`` is False,
            rewrite in place.
        dry_run: When True (the default) nothing is modified or written;
            the fixes that would be made are only reported.

    Returns:
        ``(fixes, fixed_count, unknown_count)`` where ``fixes`` is a list of
        dicts (slug, old_name, new_name, is_reliable, headline, custodian),
        ``fixed_count`` is the number of reliably derived names and
        ``unknown_count`` the number of entries whose name became "Unknown".
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    fixed_count = 0
    unknown_count = 0
    fixes = []

    for profile in data.get('missing_heritage_profiles', []):
        if profile.get('name') != 'Simon Kemper':
            continue

        slug = profile.get('slug', '')
        if not slug:
            continue

        # Skip if this is the real Simon Kemper
        slug_lower = slug.lower()
        if 'simonkemper' in slug_lower or 'simon-kemper' in slug_lower:
            continue

        # Derive correct name
        correct_name, is_reliable = slug_to_name(slug)

        fixes.append({
            'slug': slug,
            'old_name': 'Simon Kemper',
            'new_name': correct_name,
            'is_reliable': is_reliable,
            'headline': profile.get('headline', ''),
            'custodian': profile.get('custodian', ''),
        })

        if not dry_run:
            profile['name'] = correct_name

        if is_reliable:
            fixed_count += 1
        else:
            unknown_count += 1

    # Only rewrite the file when something actually changed; re-serialising
    # an untouched file would needlessly alter its formatting on disk.
    if fixes and not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    return fixes, fixed_count, unknown_count


def main():
    """Command-line entry point: report or apply the Simon Kemper name fixes.

    Runs as a dry run unless ``--fix`` is given. The data file defaults to
    the original hard-coded location and can be overridden with ``--file``.
    Prints each affected slug with its derived name, then a summary.
    """
    import argparse

    default_path = Path("/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/missing_entity_profiles.json")

    parser = argparse.ArgumentParser(description='Fix Simon Kemper contamination in missing_entity_profiles.json')
    parser.add_argument('--fix', action='store_true', help='Actually fix the file (default: dry run)')
    parser.add_argument('--file', type=Path, default=default_path,
                        help='Path to missing_entity_profiles.json (default: %(default)s)')
    args = parser.parse_args()

    filepath = args.file
    if not filepath.is_file():
        # Fail with a usage-style message instead of a raw traceback.
        parser.error(f"File not found: {filepath}")

    dry_run = not args.fix
    mode = "DRY RUN" if dry_run else "FIXING"

    print("=" * 80)
    print(f"MISSING ENTITY PROFILES - SIMON KEMPER CONTAMINATION FIX - {mode}")
    print("=" * 80)

    fixes, fixed_count, unknown_count = fix_missing_entity_profiles(filepath, dry_run=dry_run)

    print(f"\nFound {len(fixes)} Simon Kemper contaminations:\n")

    for fix in fixes:
        status = "✅" if fix['is_reliable'] else "⚠️ "
        print(f"  {status} {fix['slug']}")
        print(f"      → '{fix['new_name']}'")
        # A JSON null headline arrives as None; treat it like an empty one.
        headline = fix['headline'] or ''
        if len(headline) > 50:
            # Truncate long headlines to keep the report readable.
            headline = f"{headline[:50]}..."
        print(f"      Headline: {headline}")
        print()

    print(f"\n{'='*40}")
    print("SUMMARY")
    print(f"{'='*40}")
    print(f"  Reliably fixed:    {fixed_count}")
    print(f"  Set to 'Unknown':  {unknown_count}")
    print(f"  Total:             {len(fixes)}")

    if not dry_run:
        print(f"\n✅ Fixed {len(fixes)} entries in {filepath.name}")
    else:
        print("\n⚠️  DRY RUN - No changes made. Run with --fix to apply changes.")

# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()