glam/scripts/recalculate_confidence.py
2026-01-13 20:35:11 +01:00

207 lines
7.1 KiB
Python

#!/usr/bin/env python3
"""
Recalculate confidence scores for entity resolution candidates.
This script applies improved confidence scoring that incorporates:
1. Birth year cross-validation (email year vs LinkedIn decade)
2. Institution match boosting
3. Wrong-person detection for birth year mismatches
Usage:
python scripts/recalculate_confidence.py \
--input data/entity_resolution/entity_resolution_candidates.json \
--output data/entity_resolution/entity_resolution_candidates.json
"""
import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from collections import Counter
# Add src to path for local imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
from glam_extractor.entity_resolution import (
recalculate_candidate_confidence,
extract_birth_decade_from_ppid,
)
def main():
    """CLI entry point: recalculate confidence scores for entity resolution candidates.

    Loads the candidates JSON, applies the improved scoring from
    glam_extractor.entity_resolution to every candidate in place, prints
    before/after statistics, and (unless --dry-run) writes the updated JSON
    back out.
    """
    parser = argparse.ArgumentParser(
        description='Recalculate confidence scores for entity resolution candidates'
    )
    parser.add_argument(
        '--input', '-i',
        type=str,
        default='data/entity_resolution/entity_resolution_candidates.json',
        help='Input candidates JSON file'
    )
    parser.add_argument(
        '--output', '-o',
        type=str,
        default=None,
        help='Output file (default: overwrite input)'
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help='Show what would be changed without writing'
    )
    parser.add_argument(
        '--show-wrong-person', '-w',
        action='store_true',
        help='Print candidates flagged as likely wrong person'
    )
    args = parser.parse_args()

    input_path = Path(args.input)
    # Default behavior is an in-place rewrite of the input file.
    output_path = Path(args.output) if args.output else input_path
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        sys.exit(1)

    print(f"Loading candidates from: {input_path}")
    # Explicit UTF-8: JSON files are UTF-8 by convention, and relying on the
    # locale encoding breaks on platforms where it is not UTF-8.
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    candidates = data.get('candidates', [])
    print(f"Total candidates: {len(candidates):,}")

    # Track statistics
    stats = {
        'total': len(candidates),
        'adjusted': 0,
        'boosted': 0,
        'penalized': 0,
        'likely_wrong_person': 0,
        'reviews_preserved': 0,
    }

    # Score distribution before recalculation (snapshot for comparison).
    scores_before = [c.get('confidence_score', 0) for c in candidates]

    # Wrong-person candidates collected for the optional report.
    wrong_person_candidates = []

    # Recalculate each candidate in place.
    for i, candidate in enumerate(candidates):
        original_score = candidate.get('confidence_score', 0)
        # Count (but do not alter) candidates a human has already reviewed.
        if candidate.get('reviewed'):
            stats['reviews_preserved'] += 1
        # Apply new scoring; mutates the candidate dict.
        recalculate_candidate_confidence(candidate)
        new_score = candidate.get('confidence_score', 0)
        if new_score != original_score:
            stats['adjusted'] += 1
            if new_score > original_score:
                stats['boosted'] += 1
            else:
                stats['penalized'] += 1
        if candidate.get('is_likely_wrong_person'):
            stats['likely_wrong_person'] += 1
            wrong_person_candidates.append({
                'wcms_name': candidate.get('wcms_name'),
                'wcms_email': candidate.get('wcms_email'),
                'linkedin_name': candidate.get('linkedin_name'),
                'email_birth_year': candidate.get('email_probable_birth_year'),
                'linkedin_decade': extract_birth_decade_from_ppid(candidate.get('linkedin_ppid', '')),
                'reason': candidate.get('wrong_person_reason'),
                'score_before': original_score,
                'score_after': new_score,
            })
        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1:,} / {len(candidates):,}")

    # Score distribution after recalculation.
    scores_after = [c.get('confidence_score', 0) for c in candidates]

    # Update metadata. setdefault guards against input files that were
    # produced without a top-level 'metadata' object (plain dict-access
    # would raise KeyError).
    metadata = data.setdefault('metadata', {})
    metadata['confidence_recalculated_at'] = datetime.now(timezone.utc).isoformat()
    metadata['confidence_scoring_version'] = '2.0'
    metadata['confidence_recalculation_stats'] = stats

    # Print statistics
    print("\n" + "=" * 60)
    print("RECALCULATION STATISTICS")
    print("=" * 60)
    print(f"Total candidates: {stats['total']:,}")
    print(f"Scores adjusted: {stats['adjusted']:,}")
    print(f"  - Boosted: {stats['boosted']:,}")
    print(f"  - Penalized: {stats['penalized']:,}")
    print(f"Likely wrong person: {stats['likely_wrong_person']:,}")
    print(f"Reviews preserved: {stats['reviews_preserved']:,}")

    # Distribution comparison
    print("\n" + "-" * 60)
    print("CONFIDENCE DISTRIBUTION")
    print("-" * 60)

    def dist(scores):
        """Bucket scores into (high, medium, low, very-low) counts."""
        high = sum(1 for s in scores if s >= 0.8)
        med = sum(1 for s in scores if 0.6 <= s < 0.8)
        low = sum(1 for s in scores if 0.4 <= s < 0.6)
        vlow = sum(1 for s in scores if s < 0.4)
        return high, med, low, vlow

    before = dist(scores_before)
    after = dist(scores_after)
    print(f"{'Range':<20} {'Before':>10} {'After':>10} {'Change':>10}")
    print("-" * 50)
    print(f"{'High (>=0.8)':<20} {before[0]:>10,} {after[0]:>10,} {after[0]-before[0]:>+10,}")
    print(f"{'Medium (0.6-0.8)':<20} {before[1]:>10,} {after[1]:>10,} {after[1]-before[1]:>+10,}")
    print(f"{'Low (0.4-0.6)':<20} {before[2]:>10,} {after[2]:>10,} {after[2]-before[2]:>+10,}")
    print(f"{'Very Low (<0.4)':<20} {before[3]:>10,} {after[3]:>10,} {after[3]-before[3]:>+10,}")

    # Signal counts across all candidates (post-recalculation).
    print("\n" + "-" * 60)
    print("NEW SIGNAL COUNTS")
    print("-" * 60)
    signal_counts = Counter()
    for c in candidates:
        signal_counts.update(c.get('match_signals', []))
    for signal, count in signal_counts.most_common():
        print(f"  {signal}: {count:,}")

    # Show a sample of wrong-person matches when requested.
    if args.show_wrong_person and wrong_person_candidates:
        print("\n" + "=" * 60)
        print("LIKELY WRONG PERSON MATCHES (sample)")
        print("=" * 60)
        for wp in wrong_person_candidates[:10]:
            linkedin_name = wp['linkedin_name']
            # linkedin_name may be a nested dict; prefer its full_name field.
            if isinstance(linkedin_name, dict):
                linkedin_name = linkedin_name.get('full_name', linkedin_name)
            print(f"\n  WCMS: {wp['wcms_name']}")
            print(f"  Email: {wp['wcms_email']} (birth year: {wp['email_birth_year']})")
            print(f"  LinkedIn: {linkedin_name} (decade: {wp['linkedin_decade']})")
            print(f"  Reason: {wp['reason']}")
            print(f"  Score: {wp['score_before']:.2f} -> {wp['score_after']:.2f}")

    # Write output
    if args.dry_run:
        print("\n[DRY RUN] Would write to:", output_path)
    else:
        print(f"\nWriting to: {output_path}")
        # encoding='utf-8' is required: ensure_ascii=False emits raw
        # non-ASCII characters, which fails under non-UTF-8 locale encodings.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print("Done!")
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()