#!/usr/bin/env python3
|
|
"""
|
|
Fix Simon Kemper contamination in entity profiles.
|
|
|
|
For entries where:
|
|
1. Name is "Simon Kemper"
|
|
2. But the LinkedIn slug clearly indicates a different person
|
|
|
|
We derive the correct name from the slug and update the profile.
|
|
|
|
IMPORTANT: Per Rule 21 (Data Fabrication Prohibition) - if we cannot reliably
|
|
derive the name from the slug, we mark it as "Unknown" rather than guessing.
|
|
Compound slugs without hyphens (like "jponjee") cannot be reliably parsed.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from urllib.parse import unquote
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
def is_compound_slug(slug: str) -> bool:
    """Check if slug is a compound name without ANY hyphens.

    Returns True for slugs like:
    - 'jponjee' (no hyphens at all)
    - 'sharellyemanuelson' (no hyphens)
    - 'addieroelofsen' (no hyphens)
    - 'adheliap' (no hyphens)

    Returns False for slugs like:
    - 'willem-blok' (has hyphens between name parts)
    - 'jan-van-den-borre' (has hyphens)
    - 'miriam-h-38b500b2' (has hyphens, even if name part is short)
    - 'olivi%C3%AB-7153658' (has hyphen before ID - name is parseable)
    - 'daniel-tuik' (has hyphen between name parts)

    The key distinction:
    - Slugs WITH hyphens can be parsed (split on hyphen, remove ID suffix)
    - Slugs WITHOUT any hyphens cannot be parsed (word boundaries unknown)

    Note: Known compound slugs are handled separately in KNOWN_COMPOUND_SLUGS.
    """
    # First decode URL encoding (e.g., %C3%AB -> ë); unquote is a no-op on
    # already-decoded input, so passing a decoded slug here is safe too.
    decoded_slug = unquote(slug)

    # If there are NO hyphens at all in the decoded slug, the word boundaries
    # are unknown and the slug is unparseable (e.g. 'jponjee').
    return '-' not in decoded_slug


# Known compound slugs with their correct name interpretations.
# These were manually reviewed and determined to be the most likely names.
KNOWN_COMPOUND_SLUGS = {
    'jponjee': 'J. Ponjee',
    'sharellyemanuelson': 'Sharelly Emanuelson',
    'addieroelofsen': 'Addie Roelofsen',
    'adheliap': 'Adhelia P.',
    'anejanboomsma': 'Anejan Boomsma',
    'fredericlogghe': 'Frederic Logghe',
    'dirkjanheinen': 'Dirkjan Heinen',
}


def slug_to_name(slug: str) -> tuple[str, bool]:
    """Convert a LinkedIn slug to a human-readable name.

    Args:
        slug: Raw (possibly URL-encoded) LinkedIn profile slug.

    Returns:
        tuple: (name, is_reliable) where:
            - name: The derived name or "Unknown"
            - is_reliable: True if we're confident in the derivation

    Examples:
        'willem-blok-b6a46648' -> ('Willem Blok', True)
        'dave-van-den-nieuwenhof-4446b3146' -> ('Dave van den Nieuwenhof', True)
        'olivi%C3%AB-7153658' -> ('Olivië', True)
        'jponjee' -> ('J. Ponjee', True)  # Known compound slug with manual mapping
        'sharellyemanuelson' -> ('Sharelly Emanuelson', True)  # Known compound slug
    """
    # Decode URL encoding
    decoded_slug = unquote(slug)

    # Check if this is a KNOWN compound slug with manual mapping
    if decoded_slug in KNOWN_COMPOUND_SLUGS:
        return (KNOWN_COMPOUND_SLUGS[decoded_slug], True)

    # Check if this is an UNKNOWN compound slug we can't reliably parse.
    # (Pass the decoded form so both checks see the same string.)
    if is_compound_slug(decoded_slug):
        return ("Unknown", False)

    # Remove trailing ID (hex or numeric). The lookahead requires at least
    # one digit in the hex run so real name words that happen to be all hex
    # letters (e.g. 'decade', 'fedde') are not mistaken for an ID suffix;
    # LinkedIn's numeric profile IDs always contain digits.
    clean_slug = re.sub(r'[-_](?=[\da-f]*\d)[\da-f]{6,}$', '', decoded_slug)
    clean_slug = re.sub(r'[-_]\d{5,}$', '', clean_slug)

    # Split by hyphens and drop empty parts (e.g. from doubled hyphens)
    parts = [p for p in clean_slug.split('-') if p]

    if not parts:
        return ("Unknown", False)

    # Capitalize appropriately.
    # Dutch particles that should stay lowercase: van, de, den, der, het, 't
    dutch_particles = {'van', 'de', 'den', 'der', 'het', 't', "'t"}

    name_parts = []
    for i, part in enumerate(parts):
        if part.lower() in dutch_particles and i > 0:
            name_parts.append(part.lower())
        else:
            # Uppercase the first letter only; unlike str.capitalize(), this
            # preserves the case of the remaining characters.
            name_parts.append(part[:1].upper() + part[1:])

    name = ' '.join(name_parts)

    # Additional validation - name should have at least 2 characters
    if len(name) < 2:
        return ("Unknown", False)

    return (name, True)
|
|
|
|
def fix_contaminated_files(entity_dir: Path, dry_run: bool = True):
    """Find and fix Simon Kemper contaminated files.

    Only processes files where name is ACTUALLY "Simon Kemper" (contaminated).
    Skips files where name was already corrected or was never contaminated.

    Args:
        entity_dir: Directory containing per-person entity JSON files.
        dry_run: When True (default), only report; no files are modified.

    Returns:
        tuple: (contaminated_list, fixed_list, unreliable_list) where
            contaminated_list holds entries whose name was reliably derived,
            fixed_list holds filenames actually rewritten (empty on dry run),
            and unreliable_list holds entries that get "Unknown".
    """

    contaminated = []
    fixed = []
    unreliable = []  # Files where we couldn't reliably derive the name

    for filepath in entity_dir.glob("*.json"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, IOError):
            # Unreadable or corrupt files are skipped rather than aborting
            # the whole run.
            continue

        # Check if this is a Simon Kemper contamination
        profile_name = data.get('profile_data', {}).get('name', '')
        source_name = data.get('source_staff_info', {}).get('name', '')

        # ONLY process files where the name is ACTUALLY "Simon Kemper"
        if profile_name != 'Simon Kemper' and source_name != 'Simon Kemper':
            continue

        # Extract the slug from the LinkedIn URL (last path segment after /in/)
        linkedin_url = data.get('extraction_metadata', {}).get('linkedin_url', '')
        slug_match = re.search(r'/in/([^/]+)/?$', linkedin_url)
        if not slug_match:
            # Without a parseable URL we cannot derive a name; skip.
            continue

        slug = slug_match.group(1)

        # Check if this is truly contamination (slug doesn't match simon kemper)
        slug_lower = slug.lower().replace('%', '')
        if 'simonkemper' in slug_lower or 'simon-kemper' in slug_lower:
            # This is the real Simon Kemper, skip
            continue

        # Derive correct name from slug
        correct_name, is_reliable = slug_to_name(slug)

        entry = {
            'file': filepath.name,
            'slug': slug,
            'profile_name': profile_name,
            'source_name': source_name,
            'contaminated_field': 'profile_data.name' if profile_name == 'Simon Kemper' else 'source_staff_info.name',
            'correct_name': correct_name,
            'is_reliable': is_reliable,
            'headline': data.get('profile_data', {}).get('headline', ''),
            'custodian': data.get('affiliations', [{}])[0].get('custodian_name', '') if data.get('affiliations') else ''
        }

        if is_reliable:
            contaminated.append(entry)
        else:
            unreliable.append(entry)

        if not dry_run:
            # Fix the data (both fields, so they stay consistent)
            if 'profile_data' in data:
                data['profile_data']['name'] = correct_name
            if 'source_staff_info' in data:
                data['source_staff_info']['name'] = correct_name

            # Add fix metadata
            if 'extraction_metadata' not in data:
                data['extraction_metadata'] = {}

            if is_reliable:
                fix_note = f"Name corrected from 'Simon Kemper' (contamination) to '{correct_name}' (derived from slug) on {datetime.now(timezone.utc).isoformat()}"
            else:
                fix_note = f"Name set to 'Unknown' (was 'Simon Kemper' contamination). Original slug: {slug}. Compound slug cannot be reliably parsed. Fixed on {datetime.now(timezone.utc).isoformat()}"
                # Also preserve slug in a dedicated field for future reference
                data['extraction_metadata']['original_slug'] = slug

            # Append to any existing notes instead of overwriting them
            existing_notes = data['extraction_metadata'].get('notes', '')
            if existing_notes:
                data['extraction_metadata']['notes'] = f"{existing_notes} | {fix_note}"
            else:
                data['extraction_metadata']['notes'] = fix_note

            # Write back
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

            fixed.append(filepath.name)

    return contaminated, fixed, unreliable
|
|
|
|
def main():
    """CLI entry point: report (and with --fix, repair) contaminated profiles."""
    import argparse
    parser = argparse.ArgumentParser(description='Fix Simon Kemper contamination')
    parser.add_argument('--fix', action='store_true', help='Actually fix files (default: dry run)')
    # Generalized: the previously hard-coded entity directory is now an
    # option; the default preserves the original behavior exactly.
    parser.add_argument(
        '--entity-dir', type=Path,
        default=Path("/Users/kempersc/apps/glam/data/custodian/person/entity"),
        help='Directory containing entity JSON files (default: %(default)s)')
    args = parser.parse_args()

    entity_dir = args.entity_dir

    dry_run = not args.fix
    mode = "DRY RUN" if dry_run else "FIXING"

    print("=" * 80)
    print(f"SIMON KEMPER CONTAMINATION FIX - {mode}")
    print("=" * 80)

    contaminated, fixed, unreliable = fix_contaminated_files(entity_dir, dry_run=dry_run)

    print(f"\n{'='*40}")
    print(f"RELIABLY PARSEABLE ({len(contaminated)} files)")
    print(f"{'='*40}")
    print("These slugs have hyphens and can be reliably converted to names:\n")

    for c in contaminated:
        print(f"  File: {c['file']}")
        print(f"  Slug: {c['slug']}")
        print(f"  Contaminated: {c['contaminated_field']} = 'Simon Kemper'")
        print(f"  Correct name: '{c['correct_name']}'")
        headline = c['headline']
        # Truncate long headlines so the report stays readable
        print(f"  Headline: {headline[:60]}..." if len(headline) > 60 else f"  Headline: {headline}")
        print(f"  Custodian: {c['custodian']}")
        print()

    if unreliable:
        print(f"\n{'='*40}")
        print(f"COMPOUND SLUGS - SET TO 'Unknown' ({len(unreliable)} files)")
        print(f"{'='*40}")
        print("These slugs have no hyphens and cannot be reliably parsed.")
        print("Per Rule 21: Names will be set to 'Unknown' (no hallucination).\n")

        for u in unreliable:
            print(f"  File: {u['file']}")
            print(f"  Slug: {u['slug']}")
            print(f"  Contaminated: {u['contaminated_field']} = 'Simon Kemper'")
            print(f"  Will be set to: 'Unknown' (slug preserved in metadata)")
            headline = u['headline']
            print(f"  Headline: {headline[:60]}..." if len(headline) > 60 else f"  Headline: {headline}")
            print(f"  Custodian: {u['custodian']}")
            print()

    print(f"\n{'='*40}")
    print("SUMMARY")
    print(f"{'='*40}")
    print(f"  Reliably fixable: {len(contaminated)}")
    print(f"  Set to 'Unknown': {len(unreliable)}")
    print(f"  Total: {len(contaminated) + len(unreliable)}")

    if not dry_run:
        print(f"\n✅ Fixed {len(fixed)} files")
    else:
        print(f"\n⚠️  DRY RUN - No files modified. Run with --fix to apply changes.")


if __name__ == "__main__":
    main()
|