#!/usr/bin/env python3
"""
Build entity resolution candidates between WCMS and LinkedIn profiles.

This script:
1. Indexes all profiles by normalized name
2. Finds potential matches based on multiple signals
3. Scores matches but NEVER auto-merges
4. Outputs candidates for manual review

CRITICAL: No auto-merging! Entity resolution requires human verification.

Matching signals:
- Name similarity (primary)
- Email domain matches employer
- Overlapping affiliations
- Location overlap

Usage:
    python scripts/build_entity_resolution.py --limit 10000
    python scripts/build_entity_resolution.py --output candidates.json
"""

import json
import argparse
import re
from pathlib import Path
from datetime import datetime, timezone
import unicodedata
from typing import Dict, List, Optional, Set, Tuple
from collections import defaultdict
from dataclasses import dataclass, asdict

PERSON_DIR = Path('/Users/kempersc/apps/glam/data/person')
OUTPUT_DIR = Path('/Users/kempersc/apps/glam/data/entity_resolution')

# Pre-compiled patterns: these run once per profile field across hundreds of
# thousands of profiles, so hoist compilation out of the hot path.
_TITLE_RE = re.compile(
    r'\b(Dr|Prof|Mr|Mrs|Ms|Drs|Ir|Ing|PhD|MA|MSc|MBA|BSc|Jr|Sr)\b\.?',
    re.IGNORECASE,
)
_NON_ALPHA_RE = re.compile(r'[^a-z\s]')
_URL_PREFIX_RE = re.compile(r'^https?://(www\.)?')


def normalize_name(name) -> str:
    """Normalize a name for comparison.

    Strips honorific titles, folds accented characters to ASCII, lowercases,
    removes punctuation/digits, and collapses whitespace.

    Args:
        name: Usually a string; dicts are tolerated (``full_name``/``name``
            keys are tried), anything else is coerced via ``str()``.

    Returns:
        The normalized name, or ``""`` for falsy input.
    """
    if not name:
        return ""
    # Handle dict or other types defensively - upstream data is heterogeneous.
    if isinstance(name, dict):
        name = name.get('full_name', name.get('name', str(name)))
    if not isinstance(name, str):
        name = str(name)
    # Remove titles (Dr., Prof., PhD, ...)
    name = _TITLE_RE.sub('', name)
    # Fold accents: NFKD decomposition, then drop combining marks.
    nfkd = unicodedata.normalize('NFKD', name)
    ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))
    # Lowercase and remove everything but ASCII letters and whitespace.
    clean = _NON_ALPHA_RE.sub('', ascii_name.lower())
    # Collapse runs of whitespace to single spaces.
    return ' '.join(clean.split())


def extract_name_parts(name: str) -> Tuple[str, str]:
    """Split a normalized name into (first, last) tokens.

    A single-token name is treated as a first name with an empty last name;
    middle tokens are ignored.
    """
    parts = name.split()
    if len(parts) >= 2:
        return parts[0], parts[-1]
    if len(parts) == 1:
        return parts[0], ""
    return "", ""


def _domain_from_url(url: str) -> str:
    """Return the bare domain of *url*: scheme and ``www.`` prefix stripped,
    path discarded, lowercased. May be ``""`` for degenerate input."""
    domain = _URL_PREFIX_RE.sub('', url.lower())
    return domain.split('/')[0]


def extract_employer_domains(profile: dict) -> Set[str]:
    """Extract employer web domains from a profile.

    Sources: ``affiliations[].organization.website`` and
    ``profile_data.experience[].company_url``. Non-dict entries are skipped
    silently because the underlying JSON is not uniformly structured.
    """
    domains = set()

    # From affiliations
    for aff in profile.get('affiliations', []) or []:
        if not isinstance(aff, dict):
            continue
        org = aff.get('organization', {})
        if not isinstance(org, dict):
            continue  # org may be a plain string name; no website to mine
        website = org.get('website', '') or ''
        if website and isinstance(website, str):
            domain = _domain_from_url(website)
            if domain:
                domains.add(domain)

    # From profile_data experience entries
    profile_data = profile.get('profile_data', {}) or {}
    if not isinstance(profile_data, dict):
        return domains
    for exp in profile_data.get('experience', []) or []:
        if not isinstance(exp, dict):
            continue
        company_url = exp.get('company_url', '') or ''
        if company_url and isinstance(company_url, str):
            domain = _domain_from_url(company_url)
            if domain:
                domains.add(domain)

    return domains


def extract_employer_names(profile: dict) -> Set[str]:
    """Extract normalized employer names from a profile.

    Sources: ``affiliations[].organization`` (dict with ``name`` or a bare
    string) and ``profile_data.experience[].company``.
    """
    names = set()

    # From affiliations
    for aff in profile.get('affiliations', []) or []:
        if not isinstance(aff, dict):
            continue
        org = aff.get('organization', {})
        if isinstance(org, str):
            # org is just a string name
            normalized = normalize_name(org)
            if normalized:
                names.add(normalized)
            continue
        if isinstance(org, dict):
            name = org.get('name', '')
            if name and isinstance(name, str):
                normalized = normalize_name(name)
                if normalized:
                    names.add(normalized)

    # From profile_data experience
    profile_data = profile.get('profile_data', {}) or {}
    if not isinstance(profile_data, dict):
        return names
    for exp in profile_data.get('experience', []) or []:
        if not isinstance(exp, dict):
            continue
        company = exp.get('company', '')
        if company and isinstance(company, str):
            normalized = normalize_name(company)
            if normalized:
                names.add(normalized)

    return names


@dataclass
class MatchCandidate:
    """A potential entity resolution match (never auto-merged)."""

    # Identity of the two profiles being compared
    wcms_ppid: str
    wcms_name: str
    wcms_email: Optional[str]
    wcms_email_domain: Optional[str]
    linkedin_ppid: str
    linkedin_name: str
    linkedin_slug: Optional[str]

    # Scoring signals
    name_match_score: float               # 0-1, how similar are names
    email_domain_matches_employer: bool
    employer_name_overlap: List[str]

    # Overall score
    confidence_score: float
    match_signals: List[str]

    # Review status
    requires_review: bool = True
    reviewed: bool = False
    review_decision: Optional[str] = None  # "match", "not_match", "uncertain"


def calculate_name_similarity(name1: str, name2: str) -> float:
    """Calculate a name similarity score in [0, 1].

    Heuristic tiers: exact match (1.0), first+last match with differing
    middle tokens (0.95), same last name with same first initial (0.85),
    same first name with substring-related last names (0.8), otherwise a
    Jaccard token-overlap score mapped into [0.5, 0.9], or 0.0.
    """
    n1 = normalize_name(name1)
    n2 = normalize_name(name2)

    if not n1 or not n2:
        return 0.0

    # Exact match
    if n1 == n2:
        return 1.0

    first1, last1 = extract_name_parts(n1)
    first2, last2 = extract_name_parts(n2)

    # Both first and last match (middle tokens differ)
    if first1 == first2 and last1 == last2:
        return 0.95

    # Last name matches and first initial agrees. The last name must be
    # non-empty: two single-token names both have last == "", and treating
    # that as a match scored unrelated names ("Anna" vs "Alex") at 0.85.
    if last1 and last1 == last2:
        if first1 and first2 and first1[0] == first2[0]:
            return 0.85

    # First name matches, last name is a substring of the other
    # (e.g. married/compound surnames).
    if first1 == first2 and last1 and last2:
        if last1 in last2 or last2 in last1:
            return 0.8

    # Fall back to token overlap (Jaccard), mapped into [0.5, 0.9].
    tokens1 = set(n1.split())
    tokens2 = set(n2.split())
    overlap = tokens1 & tokens2
    if overlap:
        jaccard = len(overlap) / len(tokens1 | tokens2)
        return 0.5 + (jaccard * 0.4)

    return 0.0


def build_name_index(profiles: List[dict]) -> Dict[str, List[dict]]:
    """Build an index of profiles keyed by normalized name components.

    Each profile is indexed under its full normalized name and under a
    ``_last_<lastname>`` key (the underscore prefix avoids colliding with a
    real single-token name). Profiles without a ``ppid`` are skipped.
    """
    index = defaultdict(list)

    for profile in profiles:
        if not profile.get('ppid'):
            continue
        name = profile.get('name', '')
        normalized = normalize_name(name)
        if normalized:
            # Index by full normalized name
            index[normalized].append(profile)
            # Also index by last name
            _, last = extract_name_parts(normalized)
            if last:
                index[f"_last_{last}"].append(profile)

    return index


def find_candidates(wcms_profile: dict,
                    linkedin_index: Dict[str, List[dict]]) -> List[MatchCandidate]:
    """Find potential LinkedIn matches for a WCMS profile.

    Looks up exact-name and last-name buckets in *linkedin_index*, loads each
    candidate's full profile from disk, scores it, and returns the top 5
    candidates sorted by confidence. Never merges anything.
    """
    candidates = []

    wcms_name = wcms_profile.get('name', '')
    wcms_normalized = normalize_name(wcms_name)
    # contact_details may be present but null in the source JSON.
    contact = wcms_profile.get('contact_details') or {}
    wcms_email = contact.get('email')
    wcms_domain = contact.get('email_domain')

    if not wcms_normalized:
        return []

    # Gather candidate LinkedIn ppids (set dedupes profiles that appear in
    # both the exact-name and last-name buckets).
    potential_matches = set()

    # Exact name match
    for p in linkedin_index.get(wcms_normalized, []):
        ppid = p.get('ppid')
        if ppid:
            potential_matches.add(ppid)

    # Last name match
    _, last = extract_name_parts(wcms_normalized)
    if last:
        for p in linkedin_index.get(f"_last_{last}", []):
            ppid = p.get('ppid')
            if ppid:
                potential_matches.add(ppid)

    # Score each potential match
    for linkedin_ppid in potential_matches:
        # Load the full profile from disk; best-effort, skip on failure.
        profile_path = PERSON_DIR / f"{linkedin_ppid}.json"
        if not profile_path.exists():
            continue
        try:
            with open(profile_path) as f:
                linkedin_profile = json.load(f)
        except (json.JSONDecodeError, OSError):
            continue
        if not isinstance(linkedin_profile, dict):
            continue

        # Skip if this is actually a WCMS profile, not a LinkedIn one.
        if 'wcms' in linkedin_profile.get('data_sources', []):
            continue

        linkedin_name = linkedin_profile.get('name', '')

        # Calculate signals
        name_score = calculate_name_similarity(wcms_name, linkedin_name)
        if name_score < 0.5:
            continue  # Too low to consider

        # Check WCMS email domain against LinkedIn employer domains.
        # bool() keeps the field a real bool (not None/"") as declared.
        employer_domains = extract_employer_domains(linkedin_profile)
        domain_matches = bool(wcms_domain) and wcms_domain in employer_domains

        # Employer name overlap
        # Could enhance with WCMS organization data if available
        employer_names = extract_employer_names(linkedin_profile)

        # Build match signals
        signals = []
        if name_score >= 0.95:
            signals.append("exact_name_match")
        elif name_score >= 0.85:
            signals.append("strong_name_match")
        elif name_score >= 0.7:
            signals.append("partial_name_match")
        if domain_matches:
            signals.append("email_domain_matches_employer")

        # Calculate overall confidence
        confidence = name_score * 0.6  # Name is 60% of score
        if domain_matches:
            confidence += 0.3  # Domain match is strong signal

        candidate = MatchCandidate(
            wcms_ppid=wcms_profile['ppid'],
            wcms_name=wcms_name,
            wcms_email=wcms_email,
            wcms_email_domain=wcms_domain,
            linkedin_ppid=linkedin_ppid,
            linkedin_name=linkedin_name,
            linkedin_slug=linkedin_profile.get('linkedin_slug'),
            name_match_score=name_score,
            email_domain_matches_employer=domain_matches,
            employer_name_overlap=list(employer_names)[:5],
            confidence_score=confidence,
            match_signals=signals,
            requires_review=True,
        )
        candidates.append(candidate)

    # Sort by confidence, keep the best few per WCMS profile.
    candidates.sort(key=lambda c: c.confidence_score, reverse=True)
    return candidates[:5]


def main():
    parser = argparse.ArgumentParser(description='Build entity resolution candidates')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit profiles to process')
    parser.add_argument('--output', type=str,
                        default='entity_resolution_candidates.json',
                        help='Output file name')
    parser.add_argument('--min-confidence', type=float, default=0.5,
                        help='Minimum confidence threshold')
    args = parser.parse_args()

    print("=" * 70)
    print("ENTITY RESOLUTION CANDIDATE BUILDER")
    print("=" * 70)
    print(" CRITICAL: No auto-merging! All candidates require manual review.")

    # parents=True so a missing .../data directory doesn't abort the run.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Phase 1: Load all profiles
    print("\nPhase 1: Loading profiles...")
    wcms_profiles = []
    linkedin_profiles = []
    count = 0
    for f in PERSON_DIR.glob('ID_*.json'):
        count += 1
        if count % 20000 == 0:
            print(f"  Loaded {count:,} profiles...")
        try:
            with open(f) as fp:
                data = json.load(fp)
        except (json.JSONDecodeError, OSError):
            continue  # best-effort: skip unreadable/corrupt files
        if not isinstance(data, dict):
            continue
        sources = data.get('data_sources') or []
        if 'wcms' in sources:
            wcms_profiles.append(data)
        elif 'linkedin' in sources or data.get('linkedin_slug'):
            linkedin_profiles.append(data)

    print(f"  Loaded {len(wcms_profiles):,} WCMS profiles")
    print(f"  Loaded {len(linkedin_profiles):,} LinkedIn profiles")

    if args.limit:
        wcms_profiles = wcms_profiles[:args.limit]
        print(f"  Limited WCMS profiles to {args.limit}")

    # Phase 2: Build LinkedIn index
    print("\nPhase 2: Building LinkedIn name index...")
    linkedin_index = build_name_index(linkedin_profiles)
    print(f"  Index size: {len(linkedin_index):,} name keys")

    # Phase 3: Find candidates
    print("\nPhase 3: Finding match candidates...")
    all_candidates = []
    profiles_with_matches = 0

    for i, wcms in enumerate(wcms_profiles):
        candidates = find_candidates(wcms, linkedin_index)
        # Filter by confidence
        candidates = [c for c in candidates
                      if c.confidence_score >= args.min_confidence]
        if candidates:
            profiles_with_matches += 1
            all_candidates.extend(candidates)
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i+1:,}/{len(wcms_profiles):,} - "
                  f"Found {len(all_candidates):,} candidates from {profiles_with_matches:,} profiles")

    # Phase 4: Output results
    print("\nPhase 4: Saving results...")
    output_path = OUTPUT_DIR / args.output

    # Convert to dict for JSON serialization
    results = {
        "metadata": {
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "wcms_profiles_processed": len(wcms_profiles),
            "linkedin_profiles_indexed": len(linkedin_profiles),
            "profiles_with_matches": profiles_with_matches,
            "total_candidates": len(all_candidates),
            "min_confidence_threshold": args.min_confidence,
            "requires_manual_review": True
        },
        "candidates": [asdict(c) for c in all_candidates]
    }

    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # Summary
    print("\n" + "=" * 70)
    print("ENTITY RESOLUTION SUMMARY")
    print("=" * 70)
    print(f"  WCMS profiles processed: {len(wcms_profiles):,}")
    print(f"  LinkedIn profiles indexed: {len(linkedin_profiles):,}")
    print(f"  Profiles with potential matches: {profiles_with_matches:,}")
    print(f"  Total match candidates: {len(all_candidates):,}")
    print(f"  Output saved to: {output_path}")

    # Show confidence distribution
    if all_candidates:
        high = sum(1 for c in all_candidates if c.confidence_score >= 0.8)
        medium = sum(1 for c in all_candidates if 0.6 <= c.confidence_score < 0.8)
        low = sum(1 for c in all_candidates if c.confidence_score < 0.6)
        print(f"\n  Confidence distribution:")
        print(f"    High (>=0.8): {high:,}")
        print(f"    Medium (0.6-0.8): {medium:,}")
        print(f"    Low (<0.6): {low:,}")

    # Show sample candidates
    if all_candidates:
        print(f"\n  Sample high-confidence candidates:")
        for c in sorted(all_candidates, key=lambda x: x.confidence_score, reverse=True)[:5]:
            print(f"    {c.wcms_name} <-> {c.linkedin_name}")
            print(f"      Score: {c.confidence_score:.2f}, Signals: {', '.join(c.match_signals)}")


if __name__ == '__main__':
    main()