#!/usr/bin/env python3
"""
Annotate profiles with entity resolution match candidates.

This script:
1. Reads the entity resolution candidates file
2. Adds match_candidates annotations to WCMS profiles
3. DOES NOT MERGE - only adds annotations for human review

CRITICAL: NO AUTO-MERGING! All matches require human verification.
This script only ANNOTATES profiles - it does NOT modify LinkedIn profiles
and does NOT merge any data between profiles.

Usage:
    python scripts/annotate_match_candidates.py --dry-run
    python scripts/annotate_match_candidates.py
"""

import json
import argparse
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict

PERSON_DIR = Path('/Users/kempersc/apps/glam/data/person')
CANDIDATES_FILE = Path('/Users/kempersc/apps/glam/data/entity_resolution/entity_resolution_candidates.json')

# Only the first few per-profile failures are printed so a systematic problem
# does not flood the console; the summary still reports the full count.
_MAX_REPORTED_ERRORS = 5


def _build_match_annotations(matches):
    """Return review annotations for *matches*, highest confidence first.

    Each annotation is a summary of one candidate (not the full candidate
    record) plus review-tracking fields initialised to "not yet reviewed".

    Raises:
        KeyError: if a candidate lacks one of the copied fields.
    """
    ordered = sorted(matches, key=lambda m: m['confidence_score'], reverse=True)
    return [
        {
            "linkedin_ppid": m['linkedin_ppid'],
            "linkedin_name": m['linkedin_name'],
            "linkedin_slug": m['linkedin_slug'],
            "confidence_score": m['confidence_score'],
            "match_signals": m['match_signals'],
            "requires_review": True,  # ALWAYS requires review
            "reviewed": False,
            "review_decision": None,  # "match", "not_match", "uncertain"
            "reviewed_by": None,
            "reviewed_at": None,
        }
        for m in ordered
    ]


def _apply_annotations(profile, match_annotations):
    """Write *match_annotations* into ``profile['entity_resolution']`` in place.

    The section is created when absent; other keys already present in it are
    left untouched. Nothing is merged into the profile itself.
    """
    section = profile.setdefault('entity_resolution', {})
    section['potential_linkedin_matches'] = len(match_annotations)
    section['match_candidates'] = match_annotations
    section['requires_manual_review'] = True
    section['auto_merged'] = False  # NEVER auto-merge
    section['annotation_date'] = datetime.now(timezone.utc).isoformat()
    section['annotation_script'] = 'annotate_match_candidates.py'


def main(argv=None):
    """Annotate WCMS profile files with their LinkedIn match candidates.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv``.

    Candidates below ``--min-confidence`` are dropped, the rest are grouped by
    ``wcms_ppid`` and written into ``<person-dir>/<wcms_ppid>.json`` unless
    ``--dry-run`` is given. Profiles that are missing or fail to process are
    counted and skipped; they never abort the run.
    """
    parser = argparse.ArgumentParser(description='Annotate profiles with match candidates')
    parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
    parser.add_argument('--min-confidence', type=float, default=0.5, help='Minimum confidence to annotate')
    parser.add_argument('--person-dir', type=Path, default=PERSON_DIR,
                        help='Directory containing the <wcms_ppid>.json profile files')
    parser.add_argument('--candidates-file', type=Path, default=CANDIDATES_FILE,
                        help='Entity resolution candidates JSON file')
    args = parser.parse_args(argv)

    print("=" * 70)
    print("PROFILE MATCH ANNOTATION")
    print("=" * 70)
    print(" CRITICAL: This script ONLY adds annotations.")
    print(" NO MERGING occurs - all matches require human review.")
    print(f" Dry run: {args.dry_run}")

    # Load candidates
    print("\nPhase 1: Loading entity resolution candidates...")
    # Explicit UTF-8: names may be non-ASCII and the locale default may differ.
    with open(args.candidates_file, encoding='utf-8') as f:
        data = json.load(f)
    candidates = data['candidates']
    print(f" Loaded {len(candidates):,} candidates")

    # Filter by confidence
    candidates = [c for c in candidates if c['confidence_score'] >= args.min_confidence]
    print(f" After filtering (>={args.min_confidence}): {len(candidates):,}")

    # Group by WCMS profile
    by_wcms = defaultdict(list)
    for c in candidates:
        by_wcms[c['wcms_ppid']].append(c)
    print(f" WCMS profiles with candidates: {len(by_wcms):,}")

    # Annotate WCMS profiles
    print("\nPhase 2: Annotating WCMS profiles...")
    annotated = 0
    candidates_added = 0
    missing = 0
    failed = 0

    for wcms_ppid, matches in by_wcms.items():
        profile_path = args.person_dir / f"{wcms_ppid}.json"
        try:
            with open(profile_path, encoding='utf-8') as f:
                profile = json.load(f)

            match_annotations = _build_match_annotations(matches)
            _apply_annotations(profile, match_annotations)

            if not args.dry_run:
                # Serialize before opening for write so a serialization
                # failure cannot leave a truncated profile on disk.
                serialized = json.dumps(profile, indent=2, ensure_ascii=False)
                with open(profile_path, 'w', encoding='utf-8') as f:
                    f.write(serialized)
        except FileNotFoundError:
            # No profile file for this ppid: skip it, but report the count.
            missing += 1
            continue
        except Exception as e:
            # Per-profile boundary: one bad profile must not stop the batch.
            failed += 1
            if failed <= _MAX_REPORTED_ERRORS:
                print(f" ERROR: {wcms_ppid}: {e}")
            continue

        annotated += 1
        candidates_added += len(match_annotations)
        if annotated % 5000 == 0:
            print(f" Annotated {annotated:,}/{len(by_wcms):,} profiles...")

    if failed > _MAX_REPORTED_ERRORS:
        print(f" ... {failed - _MAX_REPORTED_ERRORS:,} further errors not shown")

    errors = missing + failed

    # Summary
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}ANNOTATION SUMMARY")
    print("=" * 70)
    print(f" WCMS profiles annotated: {annotated:,}")
    print(f" Errors: {errors}")
    print(f" Missing profile files (included in errors): {missing:,}")
    # Count only candidates that reached a successfully processed profile.
    print(f" Total match candidates added: {candidates_added:,}")

    # Distribution
    single_match = sum(1 for matches in by_wcms.values() if len(matches) == 1)
    multi_match = len(by_wcms) - single_match
    print("\n Match distribution:")
    print(f" Single LinkedIn match: {single_match:,} (easier to review)")
    print(f" Multiple LinkedIn matches: {multi_match:,} (need disambiguation)")

    print("\n REMINDER: All matches require human verification!")
    print(" Use the review interface to approve/reject matches.")

    if args.dry_run:
        print("\n To apply annotations, run without --dry-run")


if __name__ == '__main__':
    main()