glam/scripts/fix_simon_kemper_contamination.py
kempersc 0c36429257 feat(scripts): Add batch crawling and data quality scripts
- batch_crawl4ai_recrawl.py: Retry failed URL crawls
- batch_firecrawl_recrawl.py: FireCrawl batch processing
- batch_httpx_scrape.py: HTTPX-based scraping
- detect_name_mismatch.py: Find name mismatches in data
- enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment
- fix_collision_victims.py: GHCID collision resolution
- fix_generic_platform_names*.py: Platform name cleanup
- fix_ghcid_type.py: GHCID type corrections
- fix_simon_kemper_contamination.py: Data cleanup
- scan_dutch_data_quality.py: Data quality scanning
- transform_crawl4ai_to_digital_platform.py: Data transformation
2025-12-15 01:47:46 +01:00

269 lines
9.8 KiB
Python

#!/usr/bin/env python3
"""
Fix Simon Kemper contamination in entity profiles.
For entries where:
1. Name is "Simon Kemper"
2. But the LinkedIn slug clearly indicates a different person
We derive the correct name from the slug and update the profile.
IMPORTANT: Per Rule 21 (Data Fabrication Prohibition) - if we cannot reliably
derive the name from the slug, we mark it as "Unknown" rather than guessing.
Compound slugs without hyphens (like "jponjee") cannot be reliably parsed.
"""
import json
import os
import re
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime, timezone
def is_compound_slug(slug: str) -> bool:
    """Check if slug is a compound name without separators.
    Returns True for slugs like:
    - 'jponjee' (no hyphens, all lowercase)
    - 'sharellyemanuelson'
    - 'addieroelofsen'
    - 'adheliap'
    Returns False for slugs like:
    - 'willem-blok' (has hyphens)
    - 'jan-van-den-borre' (has hyphens)
    - 'miriam-h' (has hyphens, even if short)
    - 'olivi%C3%AB-7153658' (has hyphens after URL decoding)

    Args:
        slug: Raw LinkedIn slug, possibly URL-encoded.

    Returns:
        True when the slug cannot be split into name tokens.
    """
    # First decode URL encoding (e.g., %C3%AB -> ë)
    slug = unquote(slug)
    # BUG FIX: the previous implementation stripped the trailing LinkedIn ID
    # first and then checked for hyphens, which misclassified single-name
    # slugs such as 'olivië-7153658' (-> 'olivië', no hyphen left) as
    # compound — contradicting the documented examples above. A compound
    # slug is one with no hyphen in the decoded slug itself: the name/ID
    # hyphen proves the remaining token is a complete name, whereas a slug
    # like 'jponjee' could be 'J. Ponjee' or 'J Ponjee' and cannot be
    # reliably parsed.
    return '-' not in slug
def slug_to_name(slug: str) -> tuple[str, bool]:
    """Convert a LinkedIn slug to a human-readable name.
    Returns:
        tuple: (name, is_reliable) where:
        - name: The derived name or "Unknown"
        - is_reliable: True if we're confident in the derivation
    Examples:
        'willem-blok-b6a46648' -> ('Willem Blok', True)
        'dave-van-den-nieuwenhof-4446b3146' -> ('Dave van den Nieuwenhof', True)
        'olivi%C3%AB-7153658' -> ('Olivië', True)
        'jponjee' -> ('Unknown', False)  # Compound slug, cannot parse reliably
        'sharellyemanuelson' -> ('Unknown', False)  # Compound slug
    """
    # Decode URL encoding (e.g., %C3%AB -> ë)
    slug = unquote(slug)
    # BUG FIX: the compound check must run on the decoded slug BEFORE the
    # trailing ID is stripped. The previous flow stripped the ID first, so a
    # single-name slug like 'olivië-7153658' lost its only hyphen and was
    # wrongly reported as ('Unknown', False), contradicting the documented
    # example above. Per Rule 21, slugs with no hyphen at all (e.g.
    # 'jponjee') still yield "Unknown" rather than a guessed split.
    if '-' not in slug:
        return ("Unknown", False)
    # Remove trailing LinkedIn ID: hex run of >= 6 chars, then numeric run
    # of >= 5 digits (applied in sequence to catch combined suffixes).
    clean_slug = re.sub(r'[-_][\da-f]{6,}$', '', slug)
    clean_slug = re.sub(r'[-_]\d{5,}$', '', clean_slug)
    # Split by hyphens and drop empty tokens (e.g. from double hyphens)
    parts = [p for p in clean_slug.split('-') if p]
    if not parts:
        return ("Unknown", False)
    # Capitalize appropriately. Dutch particles stay lowercase when they are
    # not the first token: van, de, den, der, het, 't
    dutch_particles = {'van', 'de', 'den', 'der', 'het', 't', "'t"}
    name_parts = []
    for i, part in enumerate(parts):
        if part.lower() in dutch_particles and i > 0:
            name_parts.append(part.lower())
        else:
            # str.capitalize(): first letter upper-cased, rest lower-cased
            name_parts.append(part.capitalize())
    name = ' '.join(name_parts)
    # Additional validation - name should have at least 2 characters
    if len(name) < 2:
        return ("Unknown", False)
    return (name, True)
def fix_contaminated_files(entity_dir: Path, dry_run: bool = True) -> tuple[list[dict], list[str], list[dict]]:
    """Find and fix Simon Kemper contaminated files.
    Only processes files where name is ACTUALLY "Simon Kemper" (contaminated).
    Skips files where name was already corrected or was never contaminated.

    Args:
        entity_dir: Directory scanned (non-recursively) for ``*.json`` entity
            profile files.
        dry_run: When True (default), only reports; no files are written.

    Returns:
        tuple: (contaminated_list, fixed_list, unreliable_list)
        - contaminated_list: entries whose name was reliably derived from the slug
        - fixed_list: filenames actually rewritten (empty on dry runs)
        - unreliable_list: entries set to "Unknown" (compound slug, per Rule 21)
    """
    contaminated = []
    fixed = []
    unreliable = []  # Files where we couldn't reliably derive the name
    for filepath in entity_dir.glob("*.json"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, IOError):
            # Unreadable or malformed JSON: skip silently; this script only
            # targets well-formed entity profiles.
            continue
        # Check if this is a Simon Kemper contamination. Missing keys fall
        # back to '' so absent sections never raise.
        profile_name = data.get('profile_data', {}).get('name', '')
        source_name = data.get('source_staff_info', {}).get('name', '')
        # ONLY process files where the name is ACTUALLY "Simon Kemper"
        if profile_name != 'Simon Kemper' and source_name != 'Simon Kemper':
            continue
        # Get the slug from filename or URL
        filename = filepath.name  # NOTE(review): unused local; the slug is taken from the URL below
        linkedin_url = data.get('extraction_metadata', {}).get('linkedin_url', '')
        # Extract slug from URL: last path segment after /in/, optional
        # trailing slash. Files without a parseable URL are skipped.
        slug_match = re.search(r'/in/([^/]+)/?$', linkedin_url)
        if not slug_match:
            continue
        slug = slug_match.group(1)
        # Check if this is truly contamination (slug doesn't match simon kemper).
        # '%' is stripped so URL-encoded variants still match the substring test.
        slug_lower = slug.lower().replace('%', '')
        if 'simonkemper' in slug_lower or 'simon-kemper' in slug_lower:
            # This is the real Simon Kemper, skip
            continue
        # Derive correct name from slug ("Unknown"/False for compound slugs)
        correct_name, is_reliable = slug_to_name(slug)
        # Report entry built for both the reliable and unreliable sections
        entry = {
            'file': filepath.name,
            'slug': slug,
            'profile_name': profile_name,
            'source_name': source_name,
            'contaminated_field': 'profile_data.name' if profile_name == 'Simon Kemper' else 'source_staff_info.name',
            'correct_name': correct_name,
            'is_reliable': is_reliable,
            'headline': data.get('profile_data', {}).get('headline', ''),
            'custodian': data.get('affiliations', [{}])[0].get('custodian_name', '') if data.get('affiliations') else ''
        }
        if is_reliable:
            contaminated.append(entry)
        else:
            unreliable.append(entry)
        if not dry_run:
            # Fix the data. For unreliable slugs correct_name is "Unknown",
            # so both sections are overwritten with the safe value.
            if 'profile_data' in data:
                data['profile_data']['name'] = correct_name
            if 'source_staff_info' in data:
                data['source_staff_info']['name'] = correct_name
            # Add fix metadata (create the section if absent)
            if 'extraction_metadata' not in data:
                data['extraction_metadata'] = {}
            if is_reliable:
                fix_note = f"Name corrected from 'Simon Kemper' (contamination) to '{correct_name}' (derived from slug) on {datetime.now(timezone.utc).isoformat()}"
            else:
                fix_note = f"Name set to 'Unknown' (was 'Simon Kemper' contamination). Original slug: {slug}. Compound slug cannot be reliably parsed. Fixed on {datetime.now(timezone.utc).isoformat()}"
            # Also preserve slug in a dedicated field for future reference
            data['extraction_metadata']['original_slug'] = slug
            # Append to any existing notes rather than clobbering them
            existing_notes = data['extraction_metadata'].get('notes', '')
            if existing_notes:
                data['extraction_metadata']['notes'] = f"{existing_notes} | {fix_note}"
            else:
                data['extraction_metadata']['notes'] = fix_note
            # Write back (ensure_ascii=False keeps accented names readable)
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            fixed.append(filepath.name)
    return contaminated, fixed, unreliable
def _print_entry(entry: dict, resolution_line: str) -> None:
    """Print one contaminated-file report entry.

    Shared by the reliable and unreliable report sections; only the fourth
    line (the resolution) differs, so it is passed in pre-formatted.
    """
    print(f" File: {entry['file']}")
    print(f" Slug: {entry['slug']}")
    print(f" Contaminated: {entry['contaminated_field']} = 'Simon Kemper'")
    print(resolution_line)
    headline = entry['headline']
    # Truncate long headlines so the report stays readable
    if len(headline) > 60:
        print(f" Headline: {headline[:60]}...")
    else:
        print(f" Headline: {headline}")
    print(f" Custodian: {entry['custodian']}")
    print()


def main():
    """CLI entry point: report (dry run, default) or fix contamination.

    Use --fix to actually rewrite files; --entity-dir to point at a
    different entity directory (default preserves the original location).
    """
    import argparse
    parser = argparse.ArgumentParser(description='Fix Simon Kemper contamination')
    parser.add_argument('--fix', action='store_true', help='Actually fix files (default: dry run)')
    # Generalized: the previously hard-coded absolute path is now the
    # default of an overridable option, so the script works outside the
    # original author's machine while remaining backward-compatible.
    parser.add_argument('--entity-dir',
                        default="/Users/kempersc/apps/glam/data/custodian/person/entity",
                        help='Directory containing entity profile JSON files')
    args = parser.parse_args()
    entity_dir = Path(args.entity_dir)
    dry_run = not args.fix
    mode = "DRY RUN" if dry_run else "FIXING"
    print("=" * 80)
    print(f"SIMON KEMPER CONTAMINATION FIX - {mode}")
    print("=" * 80)
    contaminated, fixed, unreliable = fix_contaminated_files(entity_dir, dry_run=dry_run)
    # Section 1: slugs whose name could be reliably derived
    print(f"\n{'='*40}")
    print(f"RELIABLY PARSEABLE ({len(contaminated)} files)")
    print(f"{'='*40}")
    print("These slugs have hyphens and can be reliably converted to names:\n")
    for c in contaminated:
        _print_entry(c, f" Correct name: '{c['correct_name']}'")
    # Section 2: compound slugs that will be set to "Unknown" (Rule 21)
    if unreliable:
        print(f"\n{'='*40}")
        print(f"COMPOUND SLUGS - SET TO 'Unknown' ({len(unreliable)} files)")
        print(f"{'='*40}")
        print("These slugs have no hyphens and cannot be reliably parsed.")
        print("Per Rule 21: Names will be set to 'Unknown' (no hallucination).\n")
        for u in unreliable:
            _print_entry(u, " Will be set to: 'Unknown' (slug preserved in metadata)")
    print(f"\n{'='*40}")
    print("SUMMARY")
    print(f"{'='*40}")
    print(f" Reliably fixable: {len(contaminated)}")
    print(f" Set to 'Unknown': {len(unreliable)}")
    print(f" Total: {len(contaminated) + len(unreliable)}")
    if not dry_run:
        print(f"\n✅ Fixed {len(fixed)} files")
    else:
        print(f"\n⚠️ DRY RUN - No files modified. Run with --fix to apply changes.")


if __name__ == "__main__":
    main()