#!/usr/bin/env python3
"""
Fix Simon Kemper contamination in missing_entity_profiles.json.

Uses the same slug-to-name logic as fix_simon_kemper_contamination.py.
"""

import json
import re
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime, timezone

# Known compound slugs with their correct name interpretations.
# Keys are lowercase; lookups lowercase the decoded slug before matching.
KNOWN_COMPOUND_SLUGS = {
    'jponjee': 'J. Ponjee',
    'sharellyemanuelson': 'Sharelly Emanuelson',
    'addieroelofsen': 'Addie Roelofsen',
    'adheliap': 'Adhelia P.',
    'anejanboomsma': 'Anejan Boomsma',
    'fredericlogghe': 'Frederic Logghe',
    'dirkjanheinen': 'Dirkjan Heinen',
}

# Default location of the profiles file; can be overridden with --file.
DEFAULT_PROFILES_PATH = Path("/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/missing_entity_profiles.json")

# Dutch particles that stay lowercase unless they are the first name part.
_DUTCH_PARTICLES = frozenset({'van', 'de', 'den', 'der', 'het', 't', "'t"})

# Trailing ID suffixes: a hex run of 6+ characters, or a digit run of 5+.
_HEX_ID_SUFFIX = re.compile(r'[-_][\da-f]{6,}$')
_NUMERIC_ID_SUFFIX = re.compile(r'[-_]\d{5,}$')


def is_compound_slug(slug: str) -> bool:
    """Check if slug is a compound name without ANY hyphens.

    The slug is URL-decoded first, so an encoded hyphen (``%2D``) counts
    as a hyphen.
    """
    return '-' not in unquote(slug)


def slug_to_name(slug: str) -> tuple[str, bool]:
    """Convert a LinkedIn slug to a human-readable name.

    Args:
        slug: The profile slug, possibly URL-encoded.

    Returns:
        tuple: (name, is_reliable) where:
            - name: The derived name or "Unknown"
            - is_reliable: True if we're confident in the derivation
    """
    # Decode URL encoding
    decoded_slug = unquote(slug)

    # Check if this is a KNOWN compound slug with a manual mapping.
    # The lookup is case-insensitive because the mapping keys are lowercase.
    known_name = KNOWN_COMPOUND_SLUGS.get(decoded_slug.lower())
    if known_name is not None:
        return (known_name, True)

    # An UNKNOWN hyphen-less slug has no word boundaries to split on,
    # so it cannot be parsed reliably.
    if '-' not in decoded_slug:
        return ("Unknown", False)

    # Remove trailing ID (hex or numeric)
    clean_slug = _HEX_ID_SUFFIX.sub('', decoded_slug)
    clean_slug = _NUMERIC_ID_SUFFIX.sub('', clean_slug)

    # Split by hyphens, dropping empty parts (doubled or edge hyphens)
    parts = [p for p in clean_slug.split('-') if p]
    if not parts:
        return ("Unknown", False)

    name_parts = []
    for i, part in enumerate(parts):
        if i > 0 and part.lower() in _DUTCH_PARTICLES:
            name_parts.append(part.lower())
        else:
            # str.capitalize() uppercases the first character and
            # lowercases the rest.
            name_parts.append(part.capitalize())

    name = ' '.join(name_parts)

    # Additional validation - name should have at least 2 characters
    if len(name) < 2:
        return ("Unknown", False)

    return (name, True)


def fix_missing_entity_profiles(filepath: Path, dry_run: bool = True):
    """Fix Simon Kemper contamination in missing_entity_profiles.json.

    Every entry under ``missing_heritage_profiles`` whose ``name`` is
    'Simon Kemper' but whose slug does not contain 'simonkemper' or
    'simon-kemper' gets its name re-derived from the slug.

    Args:
        filepath: Path to the JSON file to inspect (and rewrite when
            ``dry_run`` is False).
        dry_run: When True (default), only report; the file is not touched.

    Returns:
        tuple: (fixes, fixed_count, unknown_count) where ``fixes`` is a list
        of dicts with keys 'slug', 'old_name', 'new_name', 'is_reliable',
        'headline' and 'custodian'; ``fixed_count`` counts reliable
        derivations and ``unknown_count`` counts names set to "Unknown".
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    fixed_count = 0
    unknown_count = 0
    fixes = []

    profiles = data.get('missing_heritage_profiles', [])

    for profile in profiles:
        if profile.get('name') != 'Simon Kemper':
            continue

        slug = profile.get('slug', '')
        if not slug:
            continue

        # Skip if this is the real Simon Kemper
        slug_lower = slug.lower()
        if 'simonkemper' in slug_lower or 'simon-kemper' in slug_lower:
            continue

        # Derive correct name
        correct_name, is_reliable = slug_to_name(slug)

        fixes.append({
            'slug': slug,
            'old_name': 'Simon Kemper',
            'new_name': correct_name,
            'is_reliable': is_reliable,
            # `or ''` also covers an explicit JSON null, which would
            # otherwise break len()/slicing on the headline downstream.
            'headline': profile.get('headline') or '',
            'custodian': profile.get('custodian') or '',
        })

        if not dry_run:
            profile['name'] = correct_name

        if is_reliable:
            fixed_count += 1
        else:
            unknown_count += 1

    # Only rewrite the file when something actually changed.
    if not dry_run and fixes:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    return fixes, fixed_count, unknown_count


def main(argv=None):
    """Command-line entry point: report (and with --fix, apply) the fixes.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]`` when None.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix Simon Kemper contamination in missing_entity_profiles.json')
    parser.add_argument('--fix', action='store_true', help='Actually fix the file (default: dry run)')
    parser.add_argument('--file', type=Path, default=DEFAULT_PROFILES_PATH,
                        help='Path to missing_entity_profiles.json (default: %(default)s)')
    args = parser.parse_args(argv)

    filepath = args.file
    if not filepath.is_file():
        # Exit with a usage error instead of an unhandled traceback.
        parser.error(f"File not found: {filepath}")

    dry_run = not args.fix
    mode = "DRY RUN" if dry_run else "FIXING"

    print("=" * 80)
    print(f"MISSING ENTITY PROFILES - SIMON KEMPER CONTAMINATION FIX - {mode}")
    print("=" * 80)

    fixes, fixed_count, unknown_count = fix_missing_entity_profiles(filepath, dry_run=dry_run)

    print(f"\nFound {len(fixes)} Simon Kemper contaminations:\n")

    for fix in fixes:
        status = "✅" if fix['is_reliable'] else "⚠️ "
        print(f" {status} {fix['slug']}")
        print(f" → '{fix['new_name']}'")
        headline = fix['headline']
        print(f" Headline: {headline[:50]}..." if len(headline) > 50 else f" Headline: {headline}")
        print()

    print(f"\n{'='*40}")
    print("SUMMARY")
    print(f"{'='*40}")
    print(f" Reliably fixed: {fixed_count}")
    print(f" Set to 'Unknown': {unknown_count}")
    print(f" Total: {len(fixes)}")

    if not dry_run:
        print(f"\n✅ Fixed {len(fixes)} entries in {filepath.name}")
    else:
        print("\n⚠️ DRY RUN - No changes made. Run with --fix to apply changes.")


if __name__ == "__main__":
    main()