glam/scripts/fix_missing_entity_profiles.py
2025-12-15 22:31:41 +01:00

182 lines
5.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fix Simon Kemper contamination in missing_entity_profiles.json.
Uses the same slug-to-name logic as fix_simon_kemper_contamination.py.
"""
import json
import re
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime, timezone
# Manual overrides for hyphen-free ("compound") slugs: each key is a decoded
# slug and each value is the display name that slug should resolve to.
KNOWN_COMPOUND_SLUGS: dict[str, str] = {
    "jponjee": "J. Ponjee",
    "sharellyemanuelson": "Sharelly Emanuelson",
    "addieroelofsen": "Addie Roelofsen",
    "adheliap": "Adhelia P.",
    "anejanboomsma": "Anejan Boomsma",
    "fredericlogghe": "Frederic Logghe",
    "dirkjanheinen": "Dirkjan Heinen",
}
def is_compound_slug(slug: str) -> bool:
    """Return True when *slug* contains no hyphen after URL-decoding.

    A hyphen-free slug such as ``'jponjee'`` carries no marker of where one
    name part ends and the next begins. The check runs on the decoded text,
    so a percent-encoded hyphen (``%2D``) counts as a hyphen.

    Args:
        slug: Slug to inspect; may be percent-encoded.

    Returns:
        True if the decoded slug has no ``'-'`` (this includes the empty
        string), False otherwise.
    """
    return '-' not in unquote(slug)
def slug_to_name(slug: str) -> tuple[str, bool]:
    """Derive a human-readable name from a LinkedIn slug.

    Resolution order:
      1. The URL-decoded slug is looked up in ``KNOWN_COMPOUND_SLUGS``.
      2. Any other slug that ``is_compound_slug`` reports as hyphen-free is
         rejected, because its name parts cannot be separated.
      3. Otherwise a trailing ID is stripped, the remainder is split on
         hyphens, and each part is capitalized.

    Returns:
        tuple: ``(name, is_reliable)``. ``name`` is the derived name, or
        ``"Unknown"`` when nothing usable could be derived; ``is_reliable``
        is False exactly when ``name`` is ``"Unknown"``.
    """
    unknown = ("Unknown", False)
    decoded_slug = unquote(slug)

    # Manual mappings win over every heuristic below.
    if decoded_slug in KNOWN_COMPOUND_SLUGS:
        return (KNOWN_COMPOUND_SLUGS[decoded_slug], True)

    # A hyphen-free slug without a manual mapping cannot be split into parts.
    if is_compound_slug(slug):
        return unknown

    # Strip a trailing ID: first a run of 6+ hex characters, then a run of
    # 5+ digits, each preceded by '-' or '_'. The order of the two passes is
    # kept as-is because the second runs on the output of the first.
    without_id = decoded_slug
    for id_pattern in (r'[-_][\da-f]{6,}$', r'[-_]\d{5,}$'):
        without_id = re.sub(id_pattern, '', without_id)

    # Drop empty fragments produced by leading, trailing or doubled hyphens.
    tokens = [token for token in without_id.split('-') if token]
    if not tokens:
        return unknown

    # Dutch particles stay lowercase unless they are the first word. Every
    # other word goes through str.capitalize(), which upper-cases the first
    # character and lower-cases the remainder.
    lowercase_particles = {'van', 'de', 'den', 'der', 'het', 't', "'t"}
    words = [
        token.lower()
        if position > 0 and token.lower() in lowercase_particles
        else token.capitalize()
        for position, token in enumerate(tokens)
    ]
    name = ' '.join(words)

    # A single-character result is too short to be treated as a name.
    if len(name) < 2:
        return unknown
    return (name, True)
def fix_missing_entity_profiles(filepath: Path, dry_run: bool = True) -> tuple[list[dict], int, int]:
    """Fix Simon Kemper contamination in missing_entity_profiles.json.

    Every entry under the ``'missing_heritage_profiles'`` key whose ``'name'``
    is ``'Simon Kemper'`` but whose ``'slug'`` does not contain
    ``'simonkemper'`` or ``'simon-kemper'`` (case-insensitive) is treated as
    contaminated. Its name is re-derived from the slug with ``slug_to_name``.

    Args:
        filepath: Path of the JSON file to inspect.
        dry_run: When True (the default) the file is only read. When False,
            contaminated entries get their ``'name'`` replaced and the file
            is written back, but only if at least one entry was changed.

    Returns:
        ``(fixes, fixed_count, unknown_count)`` where ``fixes`` holds one
        dict per contaminated entry (keys ``slug``, ``old_name``,
        ``new_name``, ``is_reliable``, ``headline``, ``custodian``),
        ``fixed_count`` is the number of reliable derivations and
        ``unknown_count`` the number of unreliable ones. The counts are
        computed in dry-run mode as well.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    fixed_count = 0
    unknown_count = 0
    fixes = []

    for profile in data.get('missing_heritage_profiles', []):
        if profile.get('name') != 'Simon Kemper':
            continue

        slug = profile.get('slug', '')
        if not slug:
            # Without a slug there is nothing to derive a name from.
            continue

        # Skip if this is the real Simon Kemper
        slug_lower = slug.lower()
        if 'simonkemper' in slug_lower or 'simon-kemper' in slug_lower:
            continue

        # Derive correct name
        correct_name, is_reliable = slug_to_name(slug)
        fixes.append({
            'slug': slug,
            'old_name': 'Simon Kemper',
            'new_name': correct_name,
            'is_reliable': is_reliable,
            'headline': profile.get('headline', ''),
            'custodian': profile.get('custodian', ''),
        })

        if not dry_run:
            # Unreliable derivations are stored too; their name is "Unknown".
            profile['name'] = correct_name

        if is_reliable:
            fixed_count += 1
        else:
            unknown_count += 1

    # Only touch the file when something actually changed, so an already
    # clean file is not re-serialised for nothing.
    if fixes and not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    return fixes, fixed_count, unknown_count
def main():
    """Command-line entry point: report, and optionally apply, the name fixes.

    Options:
        --fix   Write the corrected names back to the file. Without it the
                run is a dry run and the file is only read.
        --file  Path of the JSON file to process. Defaults to the location
                this script was originally written for.
    """
    import argparse

    default_path = Path("/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/missing_entity_profiles.json")

    parser = argparse.ArgumentParser(description='Fix Simon Kemper contamination in missing_entity_profiles.json')
    parser.add_argument('--fix', action='store_true', help='Actually fix the file (default: dry run)')
    parser.add_argument('--file', type=Path, default=default_path,
                        help='Path to missing_entity_profiles.json (default: %(default)s)')
    args = parser.parse_args()

    filepath = args.file
    dry_run = not args.fix
    mode = "DRY RUN" if dry_run else "FIXING"

    print("=" * 80)
    print(f"MISSING ENTITY PROFILES - SIMON KEMPER CONTAMINATION FIX - {mode}")
    print("=" * 80)

    fixes, fixed_count, unknown_count = fix_missing_entity_profiles(filepath, dry_run=dry_run)

    print(f"\nFound {len(fixes)} Simon Kemper contaminations:\n")
    for fix in fixes:
        # Unreliable derivations are flagged with a warning marker.
        status = "" if fix['is_reliable'] else "⚠️ "
        print(f" {status} {fix['slug']}")
        print(f"'{fix['new_name']}'")
        # The headline comes straight from the JSON and may be null there.
        headline = fix['headline'] or ''
        print(f" Headline: {headline[:50]}..." if len(headline) > 50 else f" Headline: {headline}")
        print()

    print(f"\n{'='*40}")
    print("SUMMARY")
    print(f"{'='*40}")
    print(f" Reliably fixed: {fixed_count}")
    print(f" Set to 'Unknown': {unknown_count}")
    print(f" Total: {len(fixes)}")

    if not dry_run:
        print(f"\n✅ Fixed {len(fixes)} entries in {filepath.name}")
    else:
        print("\n⚠️ DRY RUN - No changes made. Run with --fix to apply changes.")


if __name__ == "__main__":
    main()