182 lines
5.5 KiB
Python
Executable file
182 lines
5.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Fix Simon Kemper contamination in missing_entity_profiles.json.
|
|
|
|
Uses the same slug-to-name logic as fix_simon_kemper_contamination.py.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
from urllib.parse import unquote
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
# Manual overrides for hyphen-less ("compound") slugs whose word boundaries
# cannot be recovered automatically. Keys are slugs, values are the
# display names to use for them.
KNOWN_COMPOUND_SLUGS = {
    "jponjee": "J. Ponjee",
    "sharellyemanuelson": "Sharelly Emanuelson",
    "addieroelofsen": "Addie Roelofsen",
    "adheliap": "Adhelia P.",
    "anejanboomsma": "Anejan Boomsma",
    "fredericlogghe": "Frederic Logghe",
    "dirkjanheinen": "Dirkjan Heinen",
}
|
|
|
|
|
|
def is_compound_slug(slug: str) -> bool:
    """Report whether a slug contains no hyphen at all once URL-decoded.

    Percent-escapes are decoded first, so an encoded hyphen (``%2D``) counts
    as a hyphen. Underscores and other separators are not considered.
    """
    return '-' not in unquote(slug)
|
|
|
|
|
|
def slug_to_name(slug: str) -> tuple[str, bool]:
    """Derive a display name from a LinkedIn profile slug.

    Resolution order:
      1. URL-decode the slug.
      2. If the decoded slug is a key of KNOWN_COMPOUND_SLUGS, use that entry.
      3. If the slug has no hyphen at all, give up (word boundaries unknown).
      4. Strip a trailing ``-``/``_`` separated ID, split on hyphens and
         capitalize each word, keeping Dutch particles lowercase unless they
         are the first word.

    Returns:
        tuple: (name, is_reliable) where name is the derived name or
        "Unknown", and is_reliable is True only when a name was derived.
    """
    unresolved = ("Unknown", False)
    decoded = unquote(slug)

    # A manual mapping always wins over any automatic parsing.
    try:
        return (KNOWN_COMPOUND_SLUGS[decoded], True)
    except KeyError:
        pass

    # Hyphen-less slugs that are not in the manual table cannot be split.
    if is_compound_slug(slug):
        return unresolved

    # Drop a trailing ID: first a run of 6+ hex characters, then a run of
    # 5+ digits. Order matters because the second pattern sees the result
    # of the first.
    trimmed = decoded
    for id_pattern in (r'[-_][\da-f]{6,}$', r'[-_]\d{5,}$'):
        trimmed = re.sub(id_pattern, '', trimmed)

    # Consecutive or leading/trailing hyphens yield empty tokens; skip them.
    tokens = [token for token in trimmed.split('-') if token]
    if not tokens:
        return unresolved

    # Dutch particles that should stay lowercase: van, de, den, der, het, 't
    dutch_particles = {'van', 'de', 'den', 'der', 'het', 't', "'t"}

    # str.capitalize() upper-cases the first character and lower-cases the
    # remainder of each word.
    words = [
        token.lower()
        if position > 0 and token.lower() in dutch_particles
        else token.capitalize()
        for position, token in enumerate(tokens)
    ]
    name = ' '.join(words)

    # A single-character result is not accepted as a name.
    return (name, True) if len(name) >= 2 else unresolved
|
|
|
|
|
|
def _write_json_atomically(filepath: Path, data) -> None:
    """Serialize data to a sibling temp file, then swap it over filepath."""
    tmp_path = filepath.with_name(filepath.name + '.tmp')
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        # Only replace the real file once the dump has fully succeeded, so a
        # failure mid-write can never leave a truncated data file behind.
        tmp_path.replace(filepath)
    finally:
        # After a successful replace the temp file is gone and this is a
        # no-op; after a failure it removes the partial temp file.
        tmp_path.unlink(missing_ok=True)


def fix_missing_entity_profiles(filepath: Path, dry_run: bool = True):
    """Fix Simon Kemper contamination in missing_entity_profiles.json.

    Scans the ``missing_heritage_profiles`` list of the JSON file for entries
    named 'Simon Kemper' whose slug does not contain 'simonkemper' or
    'simon-kemper', and derives a replacement name from the slug via
    slug_to_name(). Entries without a slug are left untouched.

    Args:
        filepath: Path to the JSON file to inspect.
        dry_run: When True (default) nothing is modified; the fixes are only
            reported. When False the names are replaced and the file is
            rewritten, but only if at least one entry was changed.

    Returns:
        tuple: (fixes, fixed_count, unknown_count) where fixes is a list of
        dicts describing each change (slug, old_name, new_name, is_reliable,
        headline, custodian), fixed_count is the number of reliably derived
        names and unknown_count the number of entries set to "Unknown".
    """
    filepath = Path(filepath)

    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    fixed_count = 0
    unknown_count = 0
    fixes = []

    for profile in data.get('missing_heritage_profiles', []):
        if profile.get('name') != 'Simon Kemper':
            continue

        slug = profile.get('slug', '')
        if not slug:
            continue

        # Skip if this is the real Simon Kemper
        slug_lower = slug.lower()
        if 'simonkemper' in slug_lower or 'simon-kemper' in slug_lower:
            continue

        # Derive correct name
        correct_name, is_reliable = slug_to_name(slug)

        fixes.append({
            'slug': slug,
            'old_name': 'Simon Kemper',
            'new_name': correct_name,
            'is_reliable': is_reliable,
            'headline': profile.get('headline', ''),
            'custodian': profile.get('custodian', ''),
        })

        if not dry_run:
            profile['name'] = correct_name

        if is_reliable:
            fixed_count += 1
        else:
            unknown_count += 1

    # Leave the file byte-for-byte untouched when there is nothing to change;
    # rewriting would only reformat it and risk data loss for no benefit.
    if fixes and not dry_run:
        _write_json_atomically(filepath, data)

    return fixes, fixed_count, unknown_count
|
|
|
|
|
|
def main():
    """Command-line entry point: report (and optionally apply) the fixes.

    Without ``--fix`` the script performs a dry run and only prints what
    would change. ``--file`` selects the JSON file to process and defaults
    to the original hard-coded location.
    """
    import argparse

    default_path = Path("/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/missing_entity_profiles.json")

    parser = argparse.ArgumentParser(description='Fix Simon Kemper contamination in missing_entity_profiles.json')
    parser.add_argument('--fix', action='store_true', help='Actually fix the file (default: dry run)')
    parser.add_argument(
        '--file',
        type=Path,
        default=default_path,
        help='Path to missing_entity_profiles.json (default: %(default)s)',
    )
    args = parser.parse_args()

    filepath = args.file
    if not filepath.is_file():
        # Exit with a usage-style message instead of a raw traceback.
        parser.error(f"Input file not found: {filepath}")

    dry_run = not args.fix
    mode = "DRY RUN" if dry_run else "FIXING"

    print("=" * 80)
    print(f"MISSING ENTITY PROFILES - SIMON KEMPER CONTAMINATION FIX - {mode}")
    print("=" * 80)

    fixes, fixed_count, unknown_count = fix_missing_entity_profiles(filepath, dry_run=dry_run)

    print(f"\nFound {len(fixes)} Simon Kemper contaminations:\n")

    for fix in fixes:
        status = "✅" if fix['is_reliable'] else "⚠️ "
        print(f" {status} {fix['slug']}")
        print(f" → '{fix['new_name']}'")
        # A JSON null headline arrives as None; treat it like a missing one
        # so the length check below cannot fail.
        headline = fix['headline'] or ''
        print(f" Headline: {headline[:50]}..." if len(headline) > 50 else f" Headline: {headline}")
        print()

    print(f"\n{'='*40}")
    print("SUMMARY")
    print(f"{'='*40}")
    print(f" Reliably fixed: {fixed_count}")
    print(f" Set to 'Unknown': {unknown_count}")
    print(f" Total: {len(fixes)}")

    if not dry_run:
        print(f"\n✅ Fixed {len(fixes)} entries in {filepath.name}")
    else:
        print("\n⚠️ DRY RUN - No changes made. Run with --fix to apply changes.")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|