#!/usr/bin/env python3
"""
Fix Simon Kemper contamination in entity profiles.

For entries where:
1. Name is "Simon Kemper"
2. But the LinkedIn slug clearly indicates a different person

We derive the correct name from the slug and update the profile.

IMPORTANT: Per Rule 21 (Data Fabrication Prohibition) - if we cannot reliably
derive the name from the slug, we mark it as "Unknown" rather than guessing.
Compound slugs without hyphens (like "jponjee") cannot be reliably parsed.
"""

import json
import os
import re
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime, timezone


def is_compound_slug(slug: str) -> bool:
    """Check if slug is a compound name without ANY hyphens.

    Returns True for slugs like:
    - 'jponjee' (no hyphens at all)
    - 'sharellyemanuelson' (no hyphens)
    - 'addieroelofsen' (no hyphens)
    - 'adheliap' (no hyphens)

    Returns False for slugs like:
    - 'willem-blok' (has hyphens between name parts)
    - 'jan-van-den-borre' (has hyphens)
    - 'miriam-h-38b500b2' (has hyphens, even if name part is short)
    - 'olivi%C3%AB-7153658' (has hyphen before ID - name is parseable)
    - 'daniel-tuik' (has hyphen between name parts)

    The key distinction:
    - Slugs WITH hyphens can be parsed (split on hyphen, remove ID suffix)
    - Slugs WITHOUT any hyphens cannot be parsed (word boundaries unknown)

    Note: Known compound slugs are handled separately in KNOWN_COMPOUND_SLUGS.
    """
    # First decode URL encoding (e.g., %C3%AB -> ë) so an encoded hyphen
    # would be seen as a real hyphen.
    decoded_slug = unquote(slug)

    # If there are NO hyphens at all in the decoded slug, it's unparseable.
    # Examples: 'jponjee', 'sharellyemanuelson'
    return '-' not in decoded_slug


# Known compound slugs with their correct name interpretations.
# These were manually reviewed and determined to be the most likely names.
KNOWN_COMPOUND_SLUGS = {
    'jponjee': 'J. Ponjee',
    'sharellyemanuelson': 'Sharelly Emanuelson',
    'addieroelofsen': 'Addie Roelofsen',
    'adheliap': 'Adhelia P.',
    'anejanboomsma': 'Anejan Boomsma',
    'fredericlogghe': 'Frederic Logghe',
    'dirkjanheinen': 'Dirkjan Heinen',
}


def slug_to_name(slug: str) -> tuple[str, bool]:
    """Convert a LinkedIn slug to a human-readable name.

    Returns:
        tuple: (name, is_reliable) where:
        - name: The derived name or "Unknown"
        - is_reliable: True if we're confident in the derivation

    Examples:
        'willem-blok-b6a46648' -> ('Willem Blok', True)
        'dave-van-den-nieuwenhof-4446b3146' -> ('Dave van den Nieuwenhof', True)
        'olivi%C3%AB-7153658' -> ('Olivië', True)
        'jponjee' -> ('J. Ponjee', True)  # Known compound slug with manual mapping
        'sharellyemanuelson' -> ('Sharelly Emanuelson', True)  # Known compound slug
    """
    # Decode URL encoding (e.g., %C3%AB -> ë).
    decoded_slug = unquote(slug)

    # Check if this is a KNOWN compound slug with manual mapping.
    if decoded_slug in KNOWN_COMPOUND_SLUGS:
        return (KNOWN_COMPOUND_SLUGS[decoded_slug], True)

    # Check if this is an UNKNOWN compound slug we can't reliably parse.
    # Per Rule 21: never guess word boundaries.
    if is_compound_slug(slug):
        return ("Unknown", False)

    # Remove trailing LinkedIn ID suffix (hex or numeric).
    # NOTE(review): the hex pattern can also match real name words made only
    # of the letters a-f (e.g. '-decade') — heuristic, acceptable for slugs.
    clean_slug = re.sub(r'[-_][\da-f]{6,}$', '', decoded_slug)
    clean_slug = re.sub(r'[-_]\d{5,}$', '', clean_slug)

    # Split by hyphens and drop empty parts (e.g. from double hyphens).
    parts = [p for p in clean_slug.split('-') if p]
    if not parts:
        return ("Unknown", False)

    # Capitalize appropriately.
    # Dutch particles that should stay lowercase: van, de, den, der, het, 't
    # NOTE(review): the "'t" entry can never match — splitting on '-' cannot
    # produce a part containing an apostrophe; kept for documentation intent.
    dutch_particles = {'van', 'de', 'den', 'der', 'het', 't', "'t"}

    name_parts = []
    for i, part in enumerate(parts):
        if part.lower() in dutch_particles and i > 0:
            # Particles stay lowercase except at the start of the name.
            name_parts.append(part.lower())
        else:
            # Capitalize first letter, preserve rest.
            name_parts.append(part.capitalize())

    name = ' '.join(name_parts)

    # Additional validation - name should have at least 2 characters.
    if len(name) < 2:
        return ("Unknown", False)

    return (name, True)


def fix_contaminated_files(entity_dir: Path, dry_run: bool = True):
    """Find and fix Simon Kemper contaminated files.

    Only processes files where name is ACTUALLY "Simon Kemper" (contaminated).
    Skips files where name was already corrected or was never contaminated.

    Args:
        entity_dir: Directory containing per-person entity JSON files.
        dry_run: When True (default), only report; when False, rewrite files.

    Returns:
        tuple: (contaminated_list, fixed_list, unreliable_list)
    """
    contaminated = []
    fixed = []
    unreliable = []  # Files where we couldn't reliably derive the name

    for filepath in entity_dir.glob("*.json"):
        # Best-effort read: skip unreadable/corrupt JSON files silently.
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, IOError):
            continue

        # Check if this is a Simon Kemper contamination.
        profile_name = data.get('profile_data', {}).get('name', '')
        source_name = data.get('source_staff_info', {}).get('name', '')

        # ONLY process files where the name is ACTUALLY "Simon Kemper".
        if profile_name != 'Simon Kemper' and source_name != 'Simon Kemper':
            continue

        # Get the slug from the recorded LinkedIn URL.
        linkedin_url = data.get('extraction_metadata', {}).get('linkedin_url', '')

        # Extract slug from URL (last path segment after /in/).
        slug_match = re.search(r'/in/([^/]+)/?$', linkedin_url)
        if not slug_match:
            continue
        slug = slug_match.group(1)

        # Check if this is truly contamination (slug doesn't match simon kemper).
        slug_lower = slug.lower().replace('%', '')
        if 'simonkemper' in slug_lower or 'simon-kemper' in slug_lower:
            # This is the real Simon Kemper, skip.
            continue

        # Derive correct name from slug.
        correct_name, is_reliable = slug_to_name(slug)

        entry = {
            'file': filepath.name,
            'slug': slug,
            'profile_name': profile_name,
            'source_name': source_name,
            'contaminated_field': 'profile_data.name' if profile_name == 'Simon Kemper' else 'source_staff_info.name',
            'correct_name': correct_name,
            'is_reliable': is_reliable,
            'headline': data.get('profile_data', {}).get('headline', ''),
            'custodian': data.get('affiliations', [{}])[0].get('custodian_name', '') if data.get('affiliations') else ''
        }

        if is_reliable:
            contaminated.append(entry)
        else:
            unreliable.append(entry)

        if not dry_run:
            # Fix the data (both fields, whichever are present).
            if 'profile_data' in data:
                data['profile_data']['name'] = correct_name
            if 'source_staff_info' in data:
                data['source_staff_info']['name'] = correct_name

            # Add fix metadata.
            if 'extraction_metadata' not in data:
                data['extraction_metadata'] = {}

            if is_reliable:
                fix_note = f"Name corrected from 'Simon Kemper' (contamination) to '{correct_name}' (derived from slug) on {datetime.now(timezone.utc).isoformat()}"
            else:
                fix_note = f"Name set to 'Unknown' (was 'Simon Kemper' contamination). Original slug: {slug}. Compound slug cannot be reliably parsed. Fixed on {datetime.now(timezone.utc).isoformat()}"

            # Also preserve slug in a dedicated field for future reference.
            data['extraction_metadata']['original_slug'] = slug

            existing_notes = data['extraction_metadata'].get('notes', '')
            if existing_notes:
                data['extraction_metadata']['notes'] = f"{existing_notes} | {fix_note}"
            else:
                data['extraction_metadata']['notes'] = fix_note

            # Write back.
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            fixed.append(filepath.name)

    return contaminated, fixed, unreliable


def main():
    import argparse
    parser = argparse.ArgumentParser(description='Fix Simon Kemper contamination')
    parser.add_argument('--fix', action='store_true', help='Actually fix files (default: dry run)')
    args = parser.parse_args()

    entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
    dry_run = not args.fix
    mode = "DRY RUN" if dry_run else "FIXING"

    print("=" * 80)
    print(f"SIMON KEMPER CONTAMINATION FIX - {mode}")
    print("=" * 80)

    contaminated, fixed, unreliable = fix_contaminated_files(entity_dir, dry_run=dry_run)

    print(f"\n{'='*40}")
    print(f"RELIABLY PARSEABLE ({len(contaminated)} files)")
    print(f"{'='*40}")
    print("These slugs have hyphens and can be reliably converted to names:\n")
    for c in contaminated:
        print(f" File: {c['file']}")
        print(f" Slug: {c['slug']}")
        print(f" Contaminated: {c['contaminated_field']} = 'Simon Kemper'")
        print(f" Correct name: '{c['correct_name']}'")
        headline = c['headline']
        print(f" Headline: {headline[:60]}..." if len(headline) > 60 else f" Headline: {headline}")
        print(f" Custodian: {c['custodian']}")
        print()

    if unreliable:
        print(f"\n{'='*40}")
        print(f"COMPOUND SLUGS - SET TO 'Unknown' ({len(unreliable)} files)")
        print(f"{'='*40}")
        print("These slugs have no hyphens and cannot be reliably parsed.")
        print("Per Rule 21: Names will be set to 'Unknown' (no hallucination).\n")
        for u in unreliable:
            print(f" File: {u['file']}")
            print(f" Slug: {u['slug']}")
            print(f" Contaminated: {u['contaminated_field']} = 'Simon Kemper'")
            print(f" Will be set to: 'Unknown' (slug preserved in metadata)")
            headline = u['headline']
            print(f" Headline: {headline[:60]}..." if len(headline) > 60 else f" Headline: {headline}")
            print(f" Custodian: {u['custodian']}")
            print()

    print(f"\n{'='*40}")
    print("SUMMARY")
    print(f"{'='*40}")
    print(f" Reliably fixable: {len(contaminated)}")
    print(f" Set to 'Unknown': {len(unreliable)}")
    print(f" Total: {len(contaminated) + len(unreliable)}")

    if not dry_run:
        print(f"\n✅ Fixed {len(fixed)} files")
    else:
        print(f"\n⚠️ DRY RUN - No files modified. Run with --fix to apply changes.")


if __name__ == "__main__":
    main()