# glam/scripts/annotate_match_candidates.py

#!/usr/bin/env python3
"""
Annotate profiles with entity resolution match candidates.

This script:
1. Reads the entity resolution candidates file
2. Adds match_candidates annotations to WCMS profiles
3. DOES NOT MERGE - only adds annotations for human review

CRITICAL: NO AUTO-MERGING! All matches require human verification.
This script only ANNOTATES profiles - it does NOT modify LinkedIn profiles
and does NOT merge any data between profiles.

Usage:
  python scripts/annotate_match_candidates.py --dry-run
  python scripts/annotate_match_candidates.py
"""

import json
import argparse
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict

# Absolute locations of the input data; both live under one shared data root.
_DATA_ROOT = Path('/Users/kempersc/apps/glam/data')

# Directory holding one ``<ppid>.json`` profile file per person.
PERSON_DIR = _DATA_ROOT / 'person'

# JSON file with the entity resolution candidates to annotate from.
CANDIDATES_FILE = _DATA_ROOT / 'entity_resolution' / 'entity_resolution_candidates.json'


# Fields recording a human review decision; they are carried over when a
# profile is re-annotated so a re-run never discards review work.
_REVIEW_FIELDS = ('reviewed', 'review_decision', 'reviewed_by', 'reviewed_at')


def _build_match_annotations(matches, previous=None):
    """Build the summary annotations for one WCMS profile.

    Args:
        matches: Candidate dicts for a single WCMS profile. Each must carry
            ``linkedin_ppid``, ``linkedin_name``, ``linkedin_slug``,
            ``confidence_score`` and ``match_signals``.
        previous: The profile's existing ``match_candidates`` list, if any.
            Entries already marked ``reviewed`` keep their review fields for
            the same ``linkedin_ppid``.

    Returns:
        A list of annotation dicts ordered by descending confidence score.

    Raises:
        KeyError: If a candidate lacks one of the required keys.
    """
    # Index earlier, already-reviewed annotations by LinkedIn profile id.
    reviewed_before = {}
    if isinstance(previous, list):
        for entry in previous:
            if isinstance(entry, dict) and entry.get('reviewed'):
                reviewed_before[entry.get('linkedin_ppid')] = entry

    annotations = []
    for m in sorted(matches, key=lambda c: c['confidence_score'], reverse=True):
        annotation = {
            "linkedin_ppid": m['linkedin_ppid'],
            "linkedin_name": m['linkedin_name'],
            "linkedin_slug": m['linkedin_slug'],
            "confidence_score": m['confidence_score'],
            "match_signals": m['match_signals'],
            "requires_review": True,  # ALWAYS requires review
            "reviewed": False,
            "review_decision": None,  # "match", "not_match", "uncertain"
            "reviewed_by": None,
            "reviewed_at": None,
        }

        # Keep a decision that was already recorded for this candidate.
        earlier = reviewed_before.get(m['linkedin_ppid'])
        if earlier is not None:
            for field in _REVIEW_FIELDS:
                if field in earlier:
                    annotation[field] = earlier[field]

        annotations.append(annotation)

    return annotations


def _write_json_atomic(path, payload):
    """Write *payload* as UTF-8 JSON to *path* via a temp file and rename.

    The rename only happens after the dump succeeded, so an interrupted run
    cannot leave a truncated profile file behind.
    """
    tmp_path = path.with_name(path.name + '.tmp')
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        tmp_path.replace(path)
    finally:
        # Only still present when the dump or the rename failed.
        if tmp_path.exists():
            tmp_path.unlink()


def main():
    """Annotate WCMS profile files with their LinkedIn match candidates.

    Reads ``CANDIDATES_FILE``, keeps candidates whose ``confidence_score`` is
    at least ``--min-confidence``, groups them by ``wcms_ppid`` and stores a
    summary under ``entity_resolution`` in ``PERSON_DIR/<wcms_ppid>.json``.
    With ``--dry-run`` everything is computed but no file is written.

    Nothing is merged: every annotation is flagged as requiring review and
    ``auto_merged`` is always set to ``False``.
    """
    parser = argparse.ArgumentParser(description='Annotate profiles with match candidates')
    parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
    parser.add_argument('--min-confidence', type=float, default=0.5, help='Minimum confidence to annotate')
    args = parser.parse_args()

    print("=" * 70)
    print("PROFILE MATCH ANNOTATION")
    print("=" * 70)
    print("  CRITICAL: This script ONLY adds annotations.")
    print("  NO MERGING occurs - all matches require human review.")
    print(f"  Dry run: {args.dry_run}")

    # Load candidates
    print("\nPhase 1: Loading entity resolution candidates...")
    # Explicit UTF-8: the platform default encoding may not be UTF-8.
    with open(CANDIDATES_FILE, encoding='utf-8') as f:
        data = json.load(f)

    candidates = data['candidates']
    print(f"  Loaded {len(candidates):,} candidates")

    # Filter by confidence
    candidates = [c for c in candidates if c['confidence_score'] >= args.min_confidence]
    print(f"  After filtering (>={args.min_confidence}): {len(candidates):,}")

    # Group by WCMS profile
    by_wcms = defaultdict(list)
    for c in candidates:
        by_wcms[c['wcms_ppid']].append(c)

    print(f"  WCMS profiles with candidates: {len(by_wcms):,}")

    # Annotate WCMS profiles
    print("\nPhase 2: Annotating WCMS profiles...")

    annotated = 0
    errors = 0
    missing = 0           # subset of errors: profile file not found
    candidates_added = 0  # candidates actually attached to a profile

    for wcms_ppid, matches in by_wcms.items():
        profile_path = PERSON_DIR / f"{wcms_ppid}.json"

        if not profile_path.exists():
            errors += 1
            missing += 1
            continue

        # Per-profile boundary: one bad file must not abort the whole batch.
        try:
            with open(profile_path, encoding='utf-8') as f:
                profile = json.load(f)

            # Update entity_resolution section
            if 'entity_resolution' not in profile:
                profile['entity_resolution'] = {}
            resolution = profile['entity_resolution']

            # Build match annotation (summary only - not full candidate data)
            match_annotations = _build_match_annotations(
                matches, resolution.get('match_candidates')
            )

            resolution['potential_linkedin_matches'] = len(match_annotations)
            resolution['match_candidates'] = match_annotations
            resolution['requires_manual_review'] = True
            resolution['auto_merged'] = False  # NEVER auto-merge
            resolution['annotation_date'] = datetime.now(timezone.utc).isoformat()
            resolution['annotation_script'] = 'annotate_match_candidates.py'

            if not args.dry_run:
                _write_json_atomic(profile_path, profile)

            annotated += 1
            candidates_added += len(match_annotations)

            if annotated % 5000 == 0:
                print(f"  Annotated {annotated:,}/{len(by_wcms):,} profiles...")

        except Exception as e:
            errors += 1
            # Only the first few failures are printed to keep output readable.
            if errors <= 5:
                print(f"  ERROR: {wcms_ppid}: {e}")

    # Summary
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}ANNOTATION SUMMARY")
    print("=" * 70)
    print(f"  WCMS profiles annotated: {annotated:,}")
    print(f"  Errors: {errors}")
    if missing:
        print(f"    (including {missing:,} missing profile files)")
    print(f"  Total match candidates added: {candidates_added:,}")

    # Distribution
    single_match = sum(1 for matches in by_wcms.values() if len(matches) == 1)
    multi_match = len(by_wcms) - single_match
    print("\n  Match distribution:")
    print(f"    Single LinkedIn match: {single_match:,} (easier to review)")
    print(f"    Multiple LinkedIn matches: {multi_match:,} (need disambiguation)")

    print("\n  REMINDER: All matches require human verification!")
    print("  Use the review interface to approve/reject matches.")

    if args.dry_run:
        print("\n  To apply annotations, run without --dry-run")

# Run the annotation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()