glam/scripts/annotate_match_candidates.py
2026-01-12 14:33:56 +01:00

142 lines
5.4 KiB
Python

#!/usr/bin/env python3
"""
Annotate profiles with entity resolution match candidates.

This script:
1. Reads the entity resolution candidates file
2. Adds match_candidates annotations to WCMS profiles
3. DOES NOT MERGE - only adds annotations for human review

CRITICAL: NO AUTO-MERGING! All matches require human verification.
This script only ANNOTATES profiles - it does NOT modify LinkedIn profiles
and does NOT merge any data between profiles.

Usage:
    python scripts/annotate_match_candidates.py --dry-run
    python scripts/annotate_match_candidates.py
"""
import json
import argparse
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
# Default locations; both can be overridden on the command line
# (--person-dir / --candidates-file) so the script is usable outside the
# original author's machine.
PERSON_DIR = Path('/Users/kempersc/apps/glam/data/person')
CANDIDATES_FILE = Path('/Users/kempersc/apps/glam/data/entity_resolution/entity_resolution_candidates.json')


def _build_match_annotations(matches):
    """Return review-ready summaries of *matches*, highest confidence first.

    Each summary copies only the identifying fields of a candidate and adds
    blank review bookkeeping fields. Raises KeyError if a candidate lacks one
    of the copied fields.
    """
    ranked = sorted(matches, key=lambda m: m['confidence_score'], reverse=True)
    return [
        {
            "linkedin_ppid": m['linkedin_ppid'],
            "linkedin_name": m['linkedin_name'],
            "linkedin_slug": m['linkedin_slug'],
            "confidence_score": m['confidence_score'],
            "match_signals": m['match_signals'],
            "requires_review": True,  # ALWAYS requires review
            "reviewed": False,
            "review_decision": None,  # "match", "not_match", "uncertain"
            "reviewed_by": None,
            "reviewed_at": None,
        }
        for m in ranked
    ]


def _write_json_atomic(path, payload):
    """Write *payload* as UTF-8 JSON to *path* without ever truncating it.

    The data goes to a sibling temp file first and is then moved over the
    target, so an interrupted run cannot leave a half-written profile.
    """
    tmp_path = path.with_name(path.name + '.tmp')
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        tmp_path.replace(path)
    finally:
        # After a successful replace the temp file is gone; this only
        # cleans up when the write or the move failed.
        if tmp_path.exists():
            tmp_path.unlink()


def main(argv=None):
    """Annotate WCMS profiles with their LinkedIn match candidates.

    Loads the candidates file, keeps candidates at or above
    ``--min-confidence``, groups them per WCMS profile and stores a summary
    under the profile's ``entity_resolution`` key. Nothing is merged; every
    annotation is flagged for manual review. With ``--dry-run`` profiles are
    read and annotations built, but no file is written.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv`` as before.

    Raises:
        OSError, ValueError, KeyError: if the candidates file itself cannot
            be read or lacks the expected fields. Problems with individual
            profiles are counted and reported instead of aborting the run.
    """
    parser = argparse.ArgumentParser(description='Annotate profiles with match candidates')
    parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
    parser.add_argument('--min-confidence', type=float, default=0.5, help='Minimum confidence to annotate')
    parser.add_argument('--person-dir', type=Path, default=PERSON_DIR,
                        help='Directory containing the profile JSON files')
    parser.add_argument('--candidates-file', type=Path, default=CANDIDATES_FILE,
                        help='Entity resolution candidates JSON file')
    args = parser.parse_args(argv)

    print("=" * 70)
    print("PROFILE MATCH ANNOTATION")
    print("=" * 70)
    print(" CRITICAL: This script ONLY adds annotations.")
    print(" NO MERGING occurs - all matches require human review.")
    print(f" Dry run: {args.dry_run}")

    # Load candidates
    print("\nPhase 1: Loading entity resolution candidates...")
    # Explicit UTF-8: the locale default is not reliable for names with
    # non-ASCII characters.
    with open(args.candidates_file, encoding='utf-8') as f:
        data = json.load(f)
    candidates = data['candidates']
    print(f" Loaded {len(candidates):,} candidates")

    # Filter by confidence
    candidates = [c for c in candidates if c['confidence_score'] >= args.min_confidence]
    print(f" After filtering (>={args.min_confidence}): {len(candidates):,}")

    # Group by WCMS profile
    by_wcms = defaultdict(list)
    for c in candidates:
        by_wcms[c['wcms_ppid']].append(c)
    print(f" WCMS profiles with candidates: {len(by_wcms):,}")

    # Annotate WCMS profiles
    print("\nPhase 2: Annotating WCMS profiles...")
    annotated = 0
    errors = 0          # all failures, including missing profile files
    missing = 0         # subset of errors: profile file not found
    candidates_added = 0
    for wcms_ppid, matches in by_wcms.items():
        profile_path = args.person_dir / f"{wcms_ppid}.json"
        if not profile_path.exists():
            missing += 1
            errors += 1
            if errors <= 5:
                print(f" ERROR: {wcms_ppid}: profile file not found")
            continue
        try:
            with open(profile_path, encoding='utf-8') as f:
                profile = json.load(f)
            if not isinstance(profile, dict):
                raise TypeError("profile JSON is not an object")

            # Build match annotation (summary only - not full candidate data)
            match_annotations = _build_match_annotations(matches)

            # Update entity_resolution section
            if 'entity_resolution' not in profile:
                profile['entity_resolution'] = {}
            resolution = profile['entity_resolution']
            resolution['potential_linkedin_matches'] = len(match_annotations)
            resolution['match_candidates'] = match_annotations
            resolution['requires_manual_review'] = True
            resolution['auto_merged'] = False  # NEVER auto-merge
            resolution['annotation_date'] = datetime.now(timezone.utc).isoformat()
            resolution['annotation_script'] = 'annotate_match_candidates.py'

            if not args.dry_run:
                _write_json_atomic(profile_path, profile)
        except (OSError, ValueError, KeyError, TypeError) as e:
            # Unreadable/invalid JSON, missing candidate fields or an
            # unexpected profile shape: skip this profile, keep going.
            errors += 1
            if errors <= 5:
                print(f" ERROR: {wcms_ppid}: {e}")
        else:
            # Counted in dry runs too, so the preview shows what would happen.
            annotated += 1
            candidates_added += len(match_annotations)
            if annotated % 5000 == 0:
                print(f" Annotated {annotated:,}/{len(by_wcms):,} profiles...")

    # Summary
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}ANNOTATION SUMMARY")
    print("=" * 70)
    print(f" WCMS profiles annotated: {annotated:,}")
    print(f" Errors: {errors}")
    if missing:
        print(f" (of which missing profile files: {missing:,})")
    # Only candidates attached to successfully processed profiles are counted.
    print(f" Total match candidates added: {candidates_added:,}")

    # Distribution
    single_match = sum(1 for matches in by_wcms.values() if len(matches) == 1)
    multi_match = len(by_wcms) - single_match
    print("\n Match distribution:")
    print(f" Single LinkedIn match: {single_match:,} (easier to review)")
    print(f" Multiple LinkedIn matches: {multi_match:,} (need disambiguation)")
    print("\n REMINDER: All matches require human verification!")
    print(" Use the review interface to approve/reject matches.")
    if args.dry_run:
        print("\n To apply annotations, run without --dry-run")
# Script entry point: run the annotation pass only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()