142 lines
5.4 KiB
Python
142 lines
5.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Annotate profiles with entity resolution match candidates.
|
|
|
|
This script:
|
|
1. Reads the entity resolution candidates file
|
|
2. Adds match_candidates annotations to WCMS profiles
|
|
3. DOES NOT MERGE - only adds annotations for human review
|
|
|
|
CRITICAL: NO AUTO-MERGING! All matches require human verification.
|
|
This script only ANNOTATES profiles - it does NOT modify LinkedIn profiles
|
|
and does NOT merge any data between profiles.
|
|
|
|
Usage:
|
|
python scripts/annotate_match_candidates.py --dry-run
|
|
python scripts/annotate_match_candidates.py
|
|
"""
|
|
|
|
import json
|
|
import argparse
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from collections import defaultdict
|
|
|
|
PERSON_DIR = Path('/Users/kempersc/apps/glam/data/person')
|
|
CANDIDATES_FILE = Path('/Users/kempersc/apps/glam/data/entity_resolution/entity_resolution_candidates.json')
|
|
|
|
|
|
def _build_match_annotations(matches):
    """Return review-ready summaries of *matches*, highest confidence first.

    Only a summary of each candidate is copied (not the full candidate
    record). Every entry starts out unreviewed and flagged as requiring
    human review; nothing here merges data between profiles.
    """
    ranked = sorted(matches, key=lambda m: m['confidence_score'], reverse=True)
    return [
        {
            "linkedin_ppid": m['linkedin_ppid'],
            "linkedin_name": m['linkedin_name'],
            "linkedin_slug": m['linkedin_slug'],
            "confidence_score": m['confidence_score'],
            "match_signals": m['match_signals'],
            "requires_review": True,  # ALWAYS requires review
            "reviewed": False,
            "review_decision": None,  # "match", "not_match", "uncertain"
            "reviewed_by": None,
            "reviewed_at": None,
        }
        for m in ranked
    ]


def _write_json_atomic(path, payload):
    """Serialise *payload* to *path* without ever leaving a half-written file.

    The JSON is written to a sibling ``<name>.tmp`` file first and then moved
    over the target, so an interruption or serialisation error cannot
    truncate the existing profile.
    """
    tmp_path = path.with_name(path.name + '.tmp')
    try:
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # must be pinned instead of relying on the locale default.
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        tmp_path.replace(path)
    finally:
        # After a successful replace the temp name is gone; this only fires
        # when writing or replacing failed part-way.
        if tmp_path.exists():
            tmp_path.unlink()


def main(argv=None):
    """Annotate WCMS profiles with LinkedIn match candidates for human review.

    Loads the entity resolution candidates file, keeps candidates whose
    ``confidence_score`` is at least ``--min-confidence``, groups them by
    ``wcms_ppid`` and stores a summary of each group under the profile's
    ``entity_resolution`` section. Nothing is merged: every annotation is
    flagged as requiring manual review.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is what the script
            entry point relies on.

    Command-line options:
        --dry-run          Preview only; no profile file is written.
        --min-confidence   Minimum confidence to annotate (default 0.5).
        --person-dir       Profile directory; defaults to ``PERSON_DIR``.
        --candidates-file  Candidates JSON; defaults to ``CANDIDATES_FILE``.

    Raises:
        OSError, json.JSONDecodeError, KeyError: if the candidates file cannot
            be read or lacks the expected keys. Failures on individual
            profiles are counted and reported instead of aborting the run.
    """
    parser = argparse.ArgumentParser(description='Annotate profiles with match candidates')
    parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
    parser.add_argument('--min-confidence', type=float, default=0.5, help='Minimum confidence to annotate')
    parser.add_argument('--person-dir', type=Path, default=None,
                        help='Directory containing <ppid>.json profiles (default: built-in PERSON_DIR)')
    parser.add_argument('--candidates-file', type=Path, default=None,
                        help='Entity resolution candidates JSON (default: built-in CANDIDATES_FILE)')
    args = parser.parse_args(argv)

    # Fall back to the module-level constants only when no override was given.
    person_dir = PERSON_DIR if args.person_dir is None else args.person_dir
    candidates_file = CANDIDATES_FILE if args.candidates_file is None else args.candidates_file

    print("=" * 70)
    print("PROFILE MATCH ANNOTATION")
    print("=" * 70)
    print(" CRITICAL: This script ONLY adds annotations.")
    print(" NO MERGING occurs - all matches require human review.")
    print(f" Dry run: {args.dry_run}")

    # Load candidates
    print("\nPhase 1: Loading entity resolution candidates...")
    with open(candidates_file, encoding='utf-8') as f:
        data = json.load(f)

    candidates = data['candidates']
    print(f" Loaded {len(candidates):,} candidates")

    # Filter by confidence
    candidates = [c for c in candidates if c['confidence_score'] >= args.min_confidence]
    print(f" After filtering (>={args.min_confidence}): {len(candidates):,}")

    # Group by WCMS profile
    by_wcms = defaultdict(list)
    for c in candidates:
        by_wcms[c['wcms_ppid']].append(c)

    print(f" WCMS profiles with candidates: {len(by_wcms):,}")

    # Annotate WCMS profiles
    print("\nPhase 2: Annotating WCMS profiles...")

    annotated = 0
    candidates_added = 0
    missing = 0
    errors = 0

    for wcms_ppid, matches in by_wcms.items():
        profile_path = person_dir / f"{wcms_ppid}.json"

        if not profile_path.exists():
            # Still counted as an error, but tracked separately so the
            # summary can tell "no such profile" apart from real failures.
            missing += 1
            errors += 1
            continue

        try:
            with open(profile_path, encoding='utf-8') as f:
                profile = json.load(f)

            match_annotations = _build_match_annotations(matches)

            # Update entity_resolution section, keeping any keys already there
            resolution = profile.setdefault('entity_resolution', {})
            resolution['potential_linkedin_matches'] = len(match_annotations)
            resolution['match_candidates'] = match_annotations
            resolution['requires_manual_review'] = True
            resolution['auto_merged'] = False  # NEVER auto-merge
            resolution['annotation_date'] = datetime.now(timezone.utc).isoformat()
            resolution['annotation_script'] = 'annotate_match_candidates.py'

            if not args.dry_run:
                _write_json_atomic(profile_path, profile)

            annotated += 1
            candidates_added += len(match_annotations)

            if annotated % 5000 == 0:
                print(f" Annotated {annotated:,}/{len(by_wcms):,} profiles...")

        except Exception as e:
            # Deliberate per-profile boundary: one unreadable or malformed
            # profile must not abort the whole batch. Only the first few
            # failures are printed to keep the output readable.
            errors += 1
            if errors <= 5:
                print(f" ERROR: {wcms_ppid}: {e}")

    # Summary
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}ANNOTATION SUMMARY")
    print("=" * 70)
    print(f" WCMS profiles annotated: {annotated:,}")
    print(f" Errors: {errors}")
    print(f" (of which missing profile files: {missing:,})")
    # Count only candidates that actually landed on a profile, not every
    # candidate that passed the confidence filter.
    print(f" Total match candidates added: {candidates_added:,}")

    # Distribution
    single_match = sum(1 for group in by_wcms.values() if len(group) == 1)
    multi_match = len(by_wcms) - single_match
    print("\n Match distribution:")
    print(f" Single LinkedIn match: {single_match:,} (easier to review)")
    print(f" Multiple LinkedIn matches: {multi_match:,} (need disambiguation)")

    print("\n REMINDER: All matches require human verification!")
    print(" Use the review interface to approve/reject matches.")

    if args.dry_run:
        print("\n To apply annotations, run without --dry-run")
|
|
|
|
|
|
# Script entry point: run the annotation pass only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|