#!/usr/bin/env python3
"""
Recalculate confidence scores for entity resolution candidates.

This script applies improved confidence scoring that incorporates:
1. Birth year cross-validation (email year vs LinkedIn decade)
2. Institution match boosting
3. Wrong-person detection for birth year mismatches

It also RE-EXTRACTS email semantics using the latest parsing logic,
ensuring any fixes to birth year extraction are applied.

Usage:
    python scripts/recalculate_confidence.py \
        --input data/entity_resolution/entity_resolution_candidates.json \
        --output data/entity_resolution/entity_resolution_candidates.json
"""

import argparse
import json
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

# Add src to path for local imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from glam_extractor.entity_resolution import (
    recalculate_candidate_confidence,
    extract_birth_decade_from_ppid,
    parse_email_semantics,
)


def re_extract_email_semantics(candidate: dict) -> int:
    """
    Re-extract email semantics using current parsing logic.

    Mutates ``candidate`` in place: refreshes the birth-year,
    institutional-domain, and consumer-domain fields from a fresh
    ``parse_email_semantics`` pass over ``wcms_email``.

    Returns the number of field groups that changed (0-3).
    """
    changes = 0
    wcms_email = candidate.get('wcms_email')
    if not wcms_email:
        return 0

    # Parse email with current logic; bail out if parsing yields nothing
    # (existing fields are deliberately left untouched in that case).
    semantics = parse_email_semantics(wcms_email)
    if not semantics:
        return 0

    # Update birth year if changed (confidence/position travel with it).
    old_birth_year = candidate.get('email_probable_birth_year')
    new_birth_year = semantics.probable_birth_year
    if old_birth_year != new_birth_year:
        candidate['email_probable_birth_year'] = new_birth_year
        candidate['email_birth_year_confidence'] = semantics.birth_year_confidence
        candidate['email_birth_year_position'] = semantics.birth_year_position
        changes += 1

    # Update institutional status if changed (name/type travel with it).
    old_institutional = candidate.get('email_is_institutional')
    new_institutional = semantics.is_institutional_domain
    if old_institutional != new_institutional:
        candidate['email_is_institutional'] = new_institutional
        candidate['email_institution_name'] = semantics.institution_name
        candidate['email_institution_type'] = semantics.institution_type
        changes += 1

    # Update consumer status if changed.
    old_consumer = candidate.get('email_is_consumer')
    new_consumer = semantics.is_consumer_domain
    if old_consumer != new_consumer:
        candidate['email_is_consumer'] = new_consumer
        changes += 1

    return changes


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line interface for this script."""
    parser = argparse.ArgumentParser(
        description='Recalculate confidence scores for entity resolution candidates'
    )
    parser.add_argument(
        '--input', '-i',
        type=str,
        default='data/entity_resolution/entity_resolution_candidates.json',
        help='Input candidates JSON file'
    )
    parser.add_argument(
        '--output', '-o',
        type=str,
        default=None,
        help='Output file (default: overwrite input)'
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help='Show what would be changed without writing'
    )
    parser.add_argument(
        '--show-wrong-person', '-w',
        action='store_true',
        help='Print candidates flagged as likely wrong person'
    )
    return parser


def _score_distribution(scores) -> tuple:
    """Bucket confidence scores into (high, medium, low, very-low) counts."""
    high = sum(1 for s in scores if s >= 0.8)
    med = sum(1 for s in scores if 0.6 <= s < 0.8)
    low = sum(1 for s in scores if 0.4 <= s < 0.6)
    vlow = sum(1 for s in scores if s < 0.4)
    return high, med, low, vlow


def _print_statistics(stats: dict) -> None:
    """Print the headline recalculation counters."""
    print("\n" + "=" * 60)
    print("RECALCULATION STATISTICS")
    print("=" * 60)
    print(f"Total candidates: {stats['total']:,}")
    print(f"Email re-extracted: {stats['email_reextracted']:,}")
    print(f" - Birth years removed: {stats['birth_years_removed']:,}")
    print(f" - Birth years changed: {stats['birth_years_changed']:,}")
    print(f"Scores adjusted: {stats['adjusted']:,}")
    print(f" - Boosted: {stats['boosted']:,}")
    print(f" - Penalized: {stats['penalized']:,}")
    print(f"Likely wrong person: {stats['likely_wrong_person']:,}")
    print(f"Reviews preserved: {stats['reviews_preserved']:,}")


def _print_distribution(scores_before, scores_after) -> None:
    """Print a before/after table of the confidence score distribution."""
    print("\n" + "-" * 60)
    print("CONFIDENCE DISTRIBUTION")
    print("-" * 60)
    before = _score_distribution(scores_before)
    after = _score_distribution(scores_after)
    print(f"{'Range':<20} {'Before':>10} {'After':>10} {'Change':>10}")
    print("-" * 50)
    print(f"{'High (>=0.8)':<20} {before[0]:>10,} {after[0]:>10,} {after[0]-before[0]:>+10,}")
    print(f"{'Medium (0.6-0.8)':<20} {before[1]:>10,} {after[1]:>10,} {after[1]-before[1]:>+10,}")
    print(f"{'Low (0.4-0.6)':<20} {before[2]:>10,} {after[2]:>10,} {after[2]-before[2]:>+10,}")
    print(f"{'Very Low (<0.4)':<20} {before[3]:>10,} {after[3]:>10,} {after[3]-before[3]:>+10,}")


def _print_signal_counts(candidates) -> None:
    """Print the frequency of each match signal across all candidates."""
    print("\n" + "-" * 60)
    print("NEW SIGNAL COUNTS")
    print("-" * 60)
    signal_counts = Counter()
    for c in candidates:
        signal_counts.update(c.get('match_signals', []))
    for signal, count in signal_counts.most_common():
        print(f" {signal}: {count:,}")


def _print_wrong_person_sample(wrong_person_candidates) -> None:
    """Print up to 10 candidates flagged as likely wrong-person matches."""
    print("\n" + "=" * 60)
    print("LIKELY WRONG PERSON MATCHES (sample)")
    print("=" * 60)
    for wp in wrong_person_candidates[:10]:
        linkedin_name = wp['linkedin_name']
        # LinkedIn name may be stored as a dict; fall back to the raw value
        # when no 'full_name' key is present.
        if isinstance(linkedin_name, dict):
            linkedin_name = linkedin_name.get('full_name', linkedin_name)
        print(f"\n WCMS: {wp['wcms_name']}")
        print(f" Email: {wp['wcms_email']} (birth year: {wp['email_birth_year']})")
        print(f" LinkedIn: {linkedin_name} (decade: {wp['linkedin_decade']})")
        print(f" Reason: {wp['reason']}")
        print(f" Score: {wp['score_before']:.2f} -> {wp['score_after']:.2f}")


def main():
    """Load candidates, refresh email semantics, rescore, report, and save."""
    args = _build_arg_parser().parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output) if args.output else input_path

    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        sys.exit(1)

    print(f"Loading candidates from: {input_path}")
    # Explicit UTF-8: the file is written with ensure_ascii=False, so relying
    # on the locale's default encoding would mis-decode on some platforms.
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    candidates = data.get('candidates', [])
    print(f"Total candidates: {len(candidates):,}")

    # Track statistics
    stats = {
        'total': len(candidates),
        'adjusted': 0,
        'boosted': 0,
        'penalized': 0,
        'likely_wrong_person': 0,
        'reviews_preserved': 0,
        'email_reextracted': 0,
        'birth_years_removed': 0,
        'birth_years_changed': 0,
    }

    # Score distribution before
    scores_before = [c.get('confidence_score', 0) for c in candidates]

    # Wrong person candidates for reporting
    wrong_person_candidates = []

    # Recalculate each candidate (mutated in place)
    for i, candidate in enumerate(candidates):
        original_score = candidate.get('confidence_score', 0)
        old_birth_year = candidate.get('email_probable_birth_year')

        # Preserve review status (counted, never cleared)
        if candidate.get('reviewed'):
            stats['reviews_preserved'] += 1

        # Re-extract email semantics with updated parsing logic
        email_changes = re_extract_email_semantics(candidate)
        if email_changes:
            stats['email_reextracted'] += 1
            new_birth_year = candidate.get('email_probable_birth_year')
            if old_birth_year and not new_birth_year:
                stats['birth_years_removed'] += 1
            elif old_birth_year != new_birth_year:
                stats['birth_years_changed'] += 1

        # Apply new scoring
        recalculate_candidate_confidence(candidate)
        new_score = candidate.get('confidence_score', 0)

        if new_score != original_score:
            stats['adjusted'] += 1
            if new_score > original_score:
                stats['boosted'] += 1
            else:
                stats['penalized'] += 1

        if candidate.get('is_likely_wrong_person'):
            stats['likely_wrong_person'] += 1
            wrong_person_candidates.append({
                'wcms_name': candidate.get('wcms_name'),
                'wcms_email': candidate.get('wcms_email'),
                'linkedin_name': candidate.get('linkedin_name'),
                'email_birth_year': candidate.get('email_probable_birth_year'),
                'linkedin_decade': extract_birth_decade_from_ppid(
                    candidate.get('linkedin_ppid', '')
                ),
                'reason': candidate.get('wrong_person_reason'),
                'score_before': original_score,
                'score_after': new_score,
            })

        if (i + 1) % 10000 == 0:
            print(f" Processed {i + 1:,} / {len(candidates):,}")

    # Score distribution after
    scores_after = [c.get('confidence_score', 0) for c in candidates]

    # Update metadata (setdefault guards against input files that were
    # produced without a top-level 'metadata' object)
    metadata = data.setdefault('metadata', {})
    metadata['confidence_recalculated_at'] = datetime.now(timezone.utc).isoformat()
    metadata['confidence_scoring_version'] = '2.0'
    metadata['confidence_recalculation_stats'] = stats

    _print_statistics(stats)
    _print_distribution(scores_before, scores_after)
    _print_signal_counts(candidates)

    if args.show_wrong_person and wrong_person_candidates:
        _print_wrong_person_sample(wrong_person_candidates)

    # Write output
    if args.dry_run:
        print("\n[DRY RUN] Would write to:", output_path)
    else:
        print(f"\nWriting to: {output_path}")
        # UTF-8 is required: ensure_ascii=False emits raw non-ASCII characters,
        # which would raise UnicodeEncodeError under a non-UTF-8 locale.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print("Done!")


if __name__ == '__main__':
    main()