- Skip YYYYMMDD and YYMMDD date patterns at end of email
- Skip digit sequences longer than 4 characters
- Require non-digit before 4-digit years at end
- Add knid.nl/kabelnoord.nl to consumer domains (Friesland ISP)
- Add 11 missing regional archive domains to HERITAGE_DOMAIN_MAP
- Update recalculation script to re-extract email semantics

Results:
- 3,151 false birth years removed
- 'Likely wrong person' reduced from 533 to 325 (-39%)
- 2,944 candidates' scores boosted
277 lines
9.6 KiB
Python
277 lines
9.6 KiB
Python
#!/usr/bin/env python3
"""
Recalculate confidence scores for entity resolution candidates.

This script applies improved confidence scoring that incorporates:

1. Birth year cross-validation (email year vs LinkedIn decade)
2. Institution match boosting
3. Wrong-person detection for birth year mismatches

It also RE-EXTRACTS email semantics using the latest parsing logic,
ensuring any fixes to birth year extraction are applied.

Usage:
    python scripts/recalculate_confidence.py \
        --input data/entity_resolution/entity_resolution_candidates.json \
        --output data/entity_resolution/entity_resolution_candidates.json
"""

import argparse
import json
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

# Add src to path for local imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from glam_extractor.entity_resolution import (
    recalculate_candidate_confidence,
    extract_birth_decade_from_ppid,
    parse_email_semantics,
)
|
def re_extract_email_semantics(candidate: dict) -> int:
    """
    Re-extract email semantics for a candidate using current parsing logic.

    Mutates *candidate* in place: refreshes the birth-year fields, the
    institutional-domain fields, and the consumer-domain flag whenever the
    freshly parsed value differs from the stored one.

    Args:
        candidate: Candidate record dict; only 'wcms_email' is read, and the
            'email_*' keys may be rewritten.

    Returns:
        Number of field groups changed (0-3). Returns 0 when the candidate
        has no email or the email cannot be parsed.
    """
    changes = 0
    wcms_email = candidate.get('wcms_email')

    if not wcms_email:
        return 0

    # Parse email with current logic
    semantics = parse_email_semantics(wcms_email)

    if not semantics:
        return 0

    # Update birth year if changed (confidence/position travel with the year)
    old_birth_year = candidate.get('email_probable_birth_year')
    new_birth_year = semantics.probable_birth_year

    if old_birth_year != new_birth_year:
        candidate['email_probable_birth_year'] = new_birth_year
        candidate['email_birth_year_confidence'] = semantics.birth_year_confidence
        candidate['email_birth_year_position'] = semantics.birth_year_position
        changes += 1

    # Update institutional status if changed (name/type travel with the flag)
    old_institutional = candidate.get('email_is_institutional')
    new_institutional = semantics.is_institutional_domain

    if old_institutional != new_institutional:
        candidate['email_is_institutional'] = new_institutional
        candidate['email_institution_name'] = semantics.institution_name
        candidate['email_institution_type'] = semantics.institution_type
        changes += 1

    # Update consumer status if changed
    old_consumer = candidate.get('email_is_consumer')
    new_consumer = semantics.is_consumer_domain

    if old_consumer != new_consumer:
        candidate['email_is_consumer'] = new_consumer
        changes += 1

    return changes
|
|
|
|
|
def main():
    """CLI entry point: load candidates, re-score them, report, and write.

    Reads a candidates JSON file, re-extracts email semantics for every
    candidate, recalculates confidence scores, prints before/after
    statistics, and writes the updated file (unless --dry-run).
    """
    parser = argparse.ArgumentParser(
        description='Recalculate confidence scores for entity resolution candidates'
    )
    parser.add_argument(
        '--input', '-i',
        type=str,
        default='data/entity_resolution/entity_resolution_candidates.json',
        help='Input candidates JSON file'
    )
    parser.add_argument(
        '--output', '-o',
        type=str,
        default=None,
        help='Output file (default: overwrite input)'
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help='Show what would be changed without writing'
    )
    parser.add_argument(
        '--show-wrong-person', '-w',
        action='store_true',
        help='Print candidates flagged as likely wrong person'
    )
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output) if args.output else input_path

    if not input_path.exists():
        # Error goes to stderr so stdout stays clean for piping.
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading candidates from: {input_path}")
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    candidates = data.get('candidates', [])
    print(f"Total candidates: {len(candidates):,}")

    # Track statistics
    stats = {
        'total': len(candidates),
        'adjusted': 0,
        'boosted': 0,
        'penalized': 0,
        'likely_wrong_person': 0,
        'reviews_preserved': 0,
        'email_reextracted': 0,
        'birth_years_removed': 0,
        'birth_years_changed': 0,
    }

    # Score distribution before
    scores_before = [c.get('confidence_score', 0) for c in candidates]

    # Wrong person candidates for reporting
    wrong_person_candidates = []

    # Recalculate each candidate
    for i, candidate in enumerate(candidates):
        original_score = candidate.get('confidence_score', 0)
        old_birth_year = candidate.get('email_probable_birth_year')

        # Preserve review status (counted only; recalculation does not clear it)
        if candidate.get('reviewed'):
            stats['reviews_preserved'] += 1

        # Re-extract email semantics with updated parsing logic
        email_changes = re_extract_email_semantics(candidate)
        if email_changes:
            stats['email_reextracted'] += 1
            new_birth_year = candidate.get('email_probable_birth_year')
            if old_birth_year and not new_birth_year:
                stats['birth_years_removed'] += 1
            elif old_birth_year != new_birth_year:
                stats['birth_years_changed'] += 1

        # Apply new scoring (mutates the candidate in place)
        recalculate_candidate_confidence(candidate)

        new_score = candidate.get('confidence_score', 0)

        if new_score != original_score:
            stats['adjusted'] += 1
            if new_score > original_score:
                stats['boosted'] += 1
            else:
                stats['penalized'] += 1

        if candidate.get('is_likely_wrong_person'):
            stats['likely_wrong_person'] += 1
            wrong_person_candidates.append({
                'wcms_name': candidate.get('wcms_name'),
                'wcms_email': candidate.get('wcms_email'),
                'linkedin_name': candidate.get('linkedin_name'),
                'email_birth_year': candidate.get('email_probable_birth_year'),
                'linkedin_decade': extract_birth_decade_from_ppid(candidate.get('linkedin_ppid', '')),
                'reason': candidate.get('wrong_person_reason'),
                'score_before': original_score,
                'score_after': new_score,
            })

        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1:,} / {len(candidates):,}")

    # Score distribution after
    scores_after = [c.get('confidence_score', 0) for c in candidates]

    # Update metadata (create the section if the input file lacked one)
    metadata = data.setdefault('metadata', {})
    metadata['confidence_recalculated_at'] = datetime.now(timezone.utc).isoformat()
    metadata['confidence_scoring_version'] = '2.0'
    metadata['confidence_recalculation_stats'] = stats

    # Print statistics
    print("\n" + "=" * 60)
    print("RECALCULATION STATISTICS")
    print("=" * 60)
    print(f"Total candidates: {stats['total']:,}")
    print(f"Email re-extracted: {stats['email_reextracted']:,}")
    print(f"  - Birth years removed: {stats['birth_years_removed']:,}")
    print(f"  - Birth years changed: {stats['birth_years_changed']:,}")
    print(f"Scores adjusted: {stats['adjusted']:,}")
    print(f"  - Boosted: {stats['boosted']:,}")
    print(f"  - Penalized: {stats['penalized']:,}")
    print(f"Likely wrong person: {stats['likely_wrong_person']:,}")
    print(f"Reviews preserved: {stats['reviews_preserved']:,}")

    # Distribution comparison
    print("\n" + "-" * 60)
    print("CONFIDENCE DISTRIBUTION")
    print("-" * 60)

    def dist(scores):
        # Bucket counts: high / medium / low / very-low confidence bands.
        high = sum(1 for s in scores if s >= 0.8)
        med = sum(1 for s in scores if 0.6 <= s < 0.8)
        low = sum(1 for s in scores if 0.4 <= s < 0.6)
        vlow = sum(1 for s in scores if s < 0.4)
        return high, med, low, vlow

    before = dist(scores_before)
    after = dist(scores_after)

    print(f"{'Range':<20} {'Before':>10} {'After':>10} {'Change':>10}")
    print("-" * 50)
    print(f"{'High (>=0.8)':<20} {before[0]:>10,} {after[0]:>10,} {after[0]-before[0]:>+10,}")
    print(f"{'Medium (0.6-0.8)':<20} {before[1]:>10,} {after[1]:>10,} {after[1]-before[1]:>+10,}")
    print(f"{'Low (0.4-0.6)':<20} {before[2]:>10,} {after[2]:>10,} {after[2]-before[2]:>+10,}")
    print(f"{'Very Low (<0.4)':<20} {before[3]:>10,} {after[3]:>10,} {after[3]-before[3]:>+10,}")

    # Signal counts
    print("\n" + "-" * 60)
    print("NEW SIGNAL COUNTS")
    print("-" * 60)

    signal_counts = Counter()
    for c in candidates:
        signal_counts.update(c.get('match_signals', []))

    for signal, count in signal_counts.most_common():
        print(f"  {signal}: {count:,}")

    # Show wrong person examples
    if args.show_wrong_person and wrong_person_candidates:
        print("\n" + "=" * 60)
        print("LIKELY WRONG PERSON MATCHES (sample)")
        print("=" * 60)

        for wp in wrong_person_candidates[:10]:
            linkedin_name = wp['linkedin_name']
            # linkedin_name may be a plain string or a dict with 'full_name'.
            if isinstance(linkedin_name, dict):
                linkedin_name = linkedin_name.get('full_name', linkedin_name)

            print(f"\n  WCMS: {wp['wcms_name']}")
            print(f"  Email: {wp['wcms_email']} (birth year: {wp['email_birth_year']})")
            print(f"  LinkedIn: {linkedin_name} (decade: {wp['linkedin_decade']})")
            print(f"  Reason: {wp['reason']}")
            print(f"  Score: {wp['score_before']:.2f} -> {wp['score_after']:.2f}")

    # Write output
    if args.dry_run:
        print("\n[DRY RUN] Would write to:", output_path)
    else:
        print(f"\nWriting to: {output_path}")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print("Done!")


if __name__ == '__main__':
    main()