glam/scripts/recalculate_confidence.py
kempersc 9a395f3dbe fix: improve birth year extraction to avoid date suffix false positives
- Skip YYYYMMDD and YYMMDD date patterns at end of email
- Skip digit sequences longer than 4 characters
- Require non-digit before 4-digit years at end
- Add knid.nl/kabelnoord.nl to consumer domains (Friesland ISP)
- Add 11 missing regional archive domains to HERITAGE_DOMAIN_MAP
- Update recalculation script to re-extract email semantics

Results:
- 3,151 false birth years removed
- 'Likely wrong person' reduced from 533 to 325 (-39%)
- 2,944 candidates' scores boosted
2026-01-13 22:37:10 +01:00

277 lines
9.6 KiB
Python

#!/usr/bin/env python3
"""
Recalculate confidence scores for entity resolution candidates.
This script applies improved confidence scoring that incorporates:
1. Birth year cross-validation (email year vs LinkedIn decade)
2. Institution match boosting
3. Wrong-person detection for birth year mismatches
It also RE-EXTRACTS email semantics using the latest parsing logic,
ensuring any fixes to birth year extraction are applied.
Usage:
python scripts/recalculate_confidence.py \
--input data/entity_resolution/entity_resolution_candidates.json \
--output data/entity_resolution/entity_resolution_candidates.json
"""
import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from collections import Counter
# Add src to path for local imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
from glam_extractor.entity_resolution import (
recalculate_candidate_confidence,
extract_birth_decade_from_ppid,
parse_email_semantics,
)
def re_extract_email_semantics(candidate: dict) -> int:
    """
    Re-run email parsing for one candidate and sync the derived fields.

    Freshly parses the candidate's WCMS email with the current
    ``parse_email_semantics`` logic, then overwrites the stored
    birth-year, institutional, and consumer fields whenever the parsed
    values differ from what is on record.

    Returns the number of field groups that were updated (0-3).
    """
    wcms_email = candidate.get('wcms_email')
    if not wcms_email:
        # No email at all -> nothing to re-extract.
        return 0

    semantics = parse_email_semantics(wcms_email)
    if not semantics:
        return 0

    updated = 0

    # Birth-year fields travel together: year, confidence, and position.
    if candidate.get('email_probable_birth_year') != semantics.probable_birth_year:
        candidate['email_probable_birth_year'] = semantics.probable_birth_year
        candidate['email_birth_year_confidence'] = semantics.birth_year_confidence
        candidate['email_birth_year_position'] = semantics.birth_year_position
        updated += 1

    # Institutional status drags its name/type metadata along with it.
    if candidate.get('email_is_institutional') != semantics.is_institutional_domain:
        candidate['email_is_institutional'] = semantics.is_institutional_domain
        candidate['email_institution_name'] = semantics.institution_name
        candidate['email_institution_type'] = semantics.institution_type
        updated += 1

    # Consumer-domain status is a single boolean flag.
    if candidate.get('email_is_consumer') != semantics.is_consumer_domain:
        candidate['email_is_consumer'] = semantics.is_consumer_domain
        updated += 1

    return updated
def _score_band_counts(scores):
    """Bucket confidence scores into (high, medium, low, very-low) counts."""
    high = sum(1 for s in scores if s >= 0.8)
    med = sum(1 for s in scores if 0.6 <= s < 0.8)
    low = sum(1 for s in scores if 0.4 <= s < 0.6)
    vlow = sum(1 for s in scores if s < 0.4)
    return high, med, low, vlow


def main():
    """
    Recalculate confidence scores for every candidate in the input file.

    Loads the candidates JSON, re-extracts email semantics with the
    current parsing logic, re-scores each candidate, prints summary
    statistics and score distributions, and (unless --dry-run) writes
    the updated data back out.
    """
    parser = argparse.ArgumentParser(
        description='Recalculate confidence scores for entity resolution candidates'
    )
    parser.add_argument(
        '--input', '-i',
        type=str,
        default='data/entity_resolution/entity_resolution_candidates.json',
        help='Input candidates JSON file'
    )
    parser.add_argument(
        '--output', '-o',
        type=str,
        default=None,
        help='Output file (default: overwrite input)'
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help='Show what would be changed without writing'
    )
    parser.add_argument(
        '--show-wrong-person', '-w',
        action='store_true',
        help='Print candidates flagged as likely wrong person'
    )
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output) if args.output else input_path
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        sys.exit(1)

    print(f"Loading candidates from: {input_path}")
    with open(input_path, 'r') as f:
        data = json.load(f)
    candidates = data.get('candidates', [])
    print(f"Total candidates: {len(candidates):,}")

    # Track statistics for the summary report and output metadata.
    stats = {
        'total': len(candidates),
        'adjusted': 0,
        'boosted': 0,
        'penalized': 0,
        'likely_wrong_person': 0,
        'reviews_preserved': 0,
        'email_reextracted': 0,
        'birth_years_removed': 0,
        'birth_years_changed': 0,
    }

    # Score distribution before recalculation (for before/after comparison).
    scores_before = [c.get('confidence_score', 0) for c in candidates]

    # Wrong-person candidates collected for optional reporting.
    wrong_person_candidates = []

    # Recalculate each candidate in place.
    for i, candidate in enumerate(candidates):
        original_score = candidate.get('confidence_score', 0)
        old_birth_year = candidate.get('email_probable_birth_year')

        # Manual review status survives recalculation; just count it.
        if candidate.get('reviewed'):
            stats['reviews_preserved'] += 1

        # Re-extract email semantics with the updated parsing logic.
        email_changes = re_extract_email_semantics(candidate)
        if email_changes:
            stats['email_reextracted'] += 1
            new_birth_year = candidate.get('email_probable_birth_year')
            if old_birth_year and not new_birth_year:
                stats['birth_years_removed'] += 1
            elif old_birth_year != new_birth_year:
                stats['birth_years_changed'] += 1

        # Apply the new scoring (mutates the candidate dict).
        recalculate_candidate_confidence(candidate)
        new_score = candidate.get('confidence_score', 0)
        if new_score != original_score:
            stats['adjusted'] += 1
            if new_score > original_score:
                stats['boosted'] += 1
            else:
                stats['penalized'] += 1

        if candidate.get('is_likely_wrong_person'):
            stats['likely_wrong_person'] += 1
            wrong_person_candidates.append({
                'wcms_name': candidate.get('wcms_name'),
                'wcms_email': candidate.get('wcms_email'),
                'linkedin_name': candidate.get('linkedin_name'),
                'email_birth_year': candidate.get('email_probable_birth_year'),
                'linkedin_decade': extract_birth_decade_from_ppid(candidate.get('linkedin_ppid', '')),
                'reason': candidate.get('wrong_person_reason'),
                'score_before': original_score,
                'score_after': new_score,
            })

        if (i + 1) % 10000 == 0:
            print(f" Processed {i + 1:,} / {len(candidates):,}")

    # Score distribution after recalculation.
    scores_after = [c.get('confidence_score', 0) for c in candidates]

    # Update metadata. setdefault tolerates input files that have no
    # 'metadata' key (consistent with data.get('candidates', []) above,
    # which would otherwise have let us get this far only to crash here).
    metadata = data.setdefault('metadata', {})
    metadata['confidence_recalculated_at'] = datetime.now(timezone.utc).isoformat()
    metadata['confidence_scoring_version'] = '2.0'
    metadata['confidence_recalculation_stats'] = stats

    # Print statistics.
    print("\n" + "=" * 60)
    print("RECALCULATION STATISTICS")
    print("=" * 60)
    print(f"Total candidates: {stats['total']:,}")
    print(f"Email re-extracted: {stats['email_reextracted']:,}")
    print(f" - Birth years removed: {stats['birth_years_removed']:,}")
    print(f" - Birth years changed: {stats['birth_years_changed']:,}")
    print(f"Scores adjusted: {stats['adjusted']:,}")
    print(f" - Boosted: {stats['boosted']:,}")
    print(f" - Penalized: {stats['penalized']:,}")
    print(f"Likely wrong person: {stats['likely_wrong_person']:,}")
    print(f"Reviews preserved: {stats['reviews_preserved']:,}")

    # Before/after confidence-band comparison.
    print("\n" + "-" * 60)
    print("CONFIDENCE DISTRIBUTION")
    print("-" * 60)
    before = _score_band_counts(scores_before)
    after = _score_band_counts(scores_after)
    print(f"{'Range':<20} {'Before':>10} {'After':>10} {'Change':>10}")
    print("-" * 50)
    print(f"{'High (>=0.8)':<20} {before[0]:>10,} {after[0]:>10,} {after[0]-before[0]:>+10,}")
    print(f"{'Medium (0.6-0.8)':<20} {before[1]:>10,} {after[1]:>10,} {after[1]-before[1]:>+10,}")
    print(f"{'Low (0.4-0.6)':<20} {before[2]:>10,} {after[2]:>10,} {after[2]-before[2]:>+10,}")
    print(f"{'Very Low (<0.4)':<20} {before[3]:>10,} {after[3]:>10,} {after[3]-before[3]:>+10,}")

    # Frequency of each match signal across all candidates.
    print("\n" + "-" * 60)
    print("NEW SIGNAL COUNTS")
    print("-" * 60)
    signal_counts = Counter()
    for c in candidates:
        signal_counts.update(c.get('match_signals', []))
    for signal, count in signal_counts.most_common():
        print(f" {signal}: {count:,}")

    # Show a sample of wrong-person matches if requested.
    if args.show_wrong_person and wrong_person_candidates:
        print("\n" + "=" * 60)
        print("LIKELY WRONG PERSON MATCHES (sample)")
        print("=" * 60)
        for wp in wrong_person_candidates[:10]:
            linkedin_name = wp['linkedin_name']
            # LinkedIn names may be stored as dicts with a 'full_name' key.
            if isinstance(linkedin_name, dict):
                linkedin_name = linkedin_name.get('full_name', linkedin_name)
            print(f"\n WCMS: {wp['wcms_name']}")
            print(f" Email: {wp['wcms_email']} (birth year: {wp['email_birth_year']})")
            print(f" LinkedIn: {linkedin_name} (decade: {wp['linkedin_decade']})")
            print(f" Reason: {wp['reason']}")
            print(f" Score: {wp['score_before']:.2f} -> {wp['score_after']:.2f}")

    # Write output (skipped in dry-run mode).
    if args.dry_run:
        print("\n[DRY RUN] Would write to:", output_path)
    else:
        print(f"\nWriting to: {output_path}")
        with open(output_path, 'w') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print("Done!")


if __name__ == '__main__':
    main()