- Skip YYYYMMDD and YYMMDD date patterns at end of email
- Skip digit sequences longer than 4 characters
- Require non-digit before 4-digit years at end
- Add knid.nl/kabelnoord.nl to consumer domains (Friesland ISP)
- Add 11 missing regional archive domains to HERITAGE_DOMAIN_MAP
- Update recalculation script to re-extract email semantics

Results:
- 3,151 false birth years removed
- 'Likely wrong person' reduced from 533 to 325 (-39%)
- 2,944 candidates' scores boosted
277 lines
9.6 KiB
Python
277 lines
9.6 KiB
Python
#!/usr/bin/env python3
"""
Recalculate confidence scores for entity resolution candidates.

This script applies improved confidence scoring that incorporates:

1. Birth year cross-validation (email year vs LinkedIn decade)
2. Institution match boosting
3. Wrong-person detection for birth year mismatches

It also RE-EXTRACTS email semantics using the latest parsing logic,
ensuring any fixes to birth year extraction are applied.

Usage:
    python scripts/recalculate_confidence.py \
        --input data/entity_resolution/entity_resolution_candidates.json \
        --output data/entity_resolution/entity_resolution_candidates.json
"""

import argparse
import json
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

# Add src to path for local imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from glam_extractor.entity_resolution import (
    recalculate_candidate_confidence,
    extract_birth_decade_from_ppid,
    parse_email_semantics,
)
|
def re_extract_email_semantics(candidate: dict) -> int:
    """
    Re-extract email semantics for a candidate using current parsing logic.

    Mutates *candidate* in place: refreshes the birth-year fields, the
    institutional-domain fields, and the consumer-domain flag whenever the
    freshly parsed value differs from the stored one.

    Args:
        candidate: Candidate record dict; only 'wcms_email' is read, and the
            'email_*' keys may be rewritten.

    Returns:
        Number of field groups changed (0-3). Returns 0 when the candidate
        has no email or the email cannot be parsed.
    """
    changes = 0
    wcms_email = candidate.get('wcms_email')

    if not wcms_email:
        return 0

    # Parse email with current logic
    semantics = parse_email_semantics(wcms_email)

    if not semantics:
        return 0

    # Update birth year if changed (confidence/position travel with the year)
    old_birth_year = candidate.get('email_probable_birth_year')
    new_birth_year = semantics.probable_birth_year

    if old_birth_year != new_birth_year:
        candidate['email_probable_birth_year'] = new_birth_year
        candidate['email_birth_year_confidence'] = semantics.birth_year_confidence
        candidate['email_birth_year_position'] = semantics.birth_year_position
        changes += 1

    # Update institutional status if changed (name/type travel with the flag)
    old_institutional = candidate.get('email_is_institutional')
    new_institutional = semantics.is_institutional_domain

    if old_institutional != new_institutional:
        candidate['email_is_institutional'] = new_institutional
        candidate['email_institution_name'] = semantics.institution_name
        candidate['email_institution_type'] = semantics.institution_type
        changes += 1

    # Update consumer status if changed
    old_consumer = candidate.get('email_is_consumer')
    new_consumer = semantics.is_consumer_domain

    if old_consumer != new_consumer:
        candidate['email_is_consumer'] = new_consumer
        changes += 1

    return changes
|
|
|
|
|
def main():
    """CLI entry point: load candidates, re-score them, report, and write.

    Reads a candidates JSON file, re-extracts email semantics for every
    candidate, recalculates confidence scores, prints before/after
    statistics, and writes the updated file (unless --dry-run).
    """
    parser = argparse.ArgumentParser(
        description='Recalculate confidence scores for entity resolution candidates'
    )
    parser.add_argument(
        '--input', '-i',
        type=str,
        default='data/entity_resolution/entity_resolution_candidates.json',
        help='Input candidates JSON file'
    )
    parser.add_argument(
        '--output', '-o',
        type=str,
        default=None,
        help='Output file (default: overwrite input)'
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help='Show what would be changed without writing'
    )
    parser.add_argument(
        '--show-wrong-person', '-w',
        action='store_true',
        help='Print candidates flagged as likely wrong person'
    )
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output) if args.output else input_path

    if not input_path.exists():
        # Error goes to stderr so stdout stays clean for piping.
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading candidates from: {input_path}")
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    candidates = data.get('candidates', [])
    print(f"Total candidates: {len(candidates):,}")

    # Track statistics
    stats = {
        'total': len(candidates),
        'adjusted': 0,
        'boosted': 0,
        'penalized': 0,
        'likely_wrong_person': 0,
        'reviews_preserved': 0,
        'email_reextracted': 0,
        'birth_years_removed': 0,
        'birth_years_changed': 0,
    }

    # Score distribution before
    scores_before = [c.get('confidence_score', 0) for c in candidates]

    # Wrong person candidates for reporting
    wrong_person_candidates = []

    # Recalculate each candidate
    for i, candidate in enumerate(candidates):
        original_score = candidate.get('confidence_score', 0)
        old_birth_year = candidate.get('email_probable_birth_year')

        # Preserve review status (counted only; recalculation does not clear it)
        if candidate.get('reviewed'):
            stats['reviews_preserved'] += 1

        # Re-extract email semantics with updated parsing logic
        email_changes = re_extract_email_semantics(candidate)
        if email_changes:
            stats['email_reextracted'] += 1
            new_birth_year = candidate.get('email_probable_birth_year')
            if old_birth_year and not new_birth_year:
                stats['birth_years_removed'] += 1
            elif old_birth_year != new_birth_year:
                stats['birth_years_changed'] += 1

        # Apply new scoring (mutates the candidate in place)
        recalculate_candidate_confidence(candidate)

        new_score = candidate.get('confidence_score', 0)

        if new_score != original_score:
            stats['adjusted'] += 1
            if new_score > original_score:
                stats['boosted'] += 1
            else:
                stats['penalized'] += 1

        if candidate.get('is_likely_wrong_person'):
            stats['likely_wrong_person'] += 1
            wrong_person_candidates.append({
                'wcms_name': candidate.get('wcms_name'),
                'wcms_email': candidate.get('wcms_email'),
                'linkedin_name': candidate.get('linkedin_name'),
                'email_birth_year': candidate.get('email_probable_birth_year'),
                'linkedin_decade': extract_birth_decade_from_ppid(candidate.get('linkedin_ppid', '')),
                'reason': candidate.get('wrong_person_reason'),
                'score_before': original_score,
                'score_after': new_score,
            })

        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1:,} / {len(candidates):,}")

    # Score distribution after
    scores_after = [c.get('confidence_score', 0) for c in candidates]

    # Update metadata (create the section if the input file lacked one)
    metadata = data.setdefault('metadata', {})
    metadata['confidence_recalculated_at'] = datetime.now(timezone.utc).isoformat()
    metadata['confidence_scoring_version'] = '2.0'
    metadata['confidence_recalculation_stats'] = stats

    # Print statistics
    print("\n" + "=" * 60)
    print("RECALCULATION STATISTICS")
    print("=" * 60)
    print(f"Total candidates: {stats['total']:,}")
    print(f"Email re-extracted: {stats['email_reextracted']:,}")
    print(f"  - Birth years removed: {stats['birth_years_removed']:,}")
    print(f"  - Birth years changed: {stats['birth_years_changed']:,}")
    print(f"Scores adjusted: {stats['adjusted']:,}")
    print(f"  - Boosted: {stats['boosted']:,}")
    print(f"  - Penalized: {stats['penalized']:,}")
    print(f"Likely wrong person: {stats['likely_wrong_person']:,}")
    print(f"Reviews preserved: {stats['reviews_preserved']:,}")

    # Distribution comparison
    print("\n" + "-" * 60)
    print("CONFIDENCE DISTRIBUTION")
    print("-" * 60)

    def dist(scores):
        # Bucket counts: high / medium / low / very-low confidence bands.
        high = sum(1 for s in scores if s >= 0.8)
        med = sum(1 for s in scores if 0.6 <= s < 0.8)
        low = sum(1 for s in scores if 0.4 <= s < 0.6)
        vlow = sum(1 for s in scores if s < 0.4)
        return high, med, low, vlow

    before = dist(scores_before)
    after = dist(scores_after)

    print(f"{'Range':<20} {'Before':>10} {'After':>10} {'Change':>10}")
    print("-" * 50)
    print(f"{'High (>=0.8)':<20} {before[0]:>10,} {after[0]:>10,} {after[0]-before[0]:>+10,}")
    print(f"{'Medium (0.6-0.8)':<20} {before[1]:>10,} {after[1]:>10,} {after[1]-before[1]:>+10,}")
    print(f"{'Low (0.4-0.6)':<20} {before[2]:>10,} {after[2]:>10,} {after[2]-before[2]:>+10,}")
    print(f"{'Very Low (<0.4)':<20} {before[3]:>10,} {after[3]:>10,} {after[3]-before[3]:>+10,}")

    # Signal counts
    print("\n" + "-" * 60)
    print("NEW SIGNAL COUNTS")
    print("-" * 60)

    signal_counts = Counter()
    for c in candidates:
        signal_counts.update(c.get('match_signals', []))

    for signal, count in signal_counts.most_common():
        print(f"  {signal}: {count:,}")

    # Show wrong person examples
    if args.show_wrong_person and wrong_person_candidates:
        print("\n" + "=" * 60)
        print("LIKELY WRONG PERSON MATCHES (sample)")
        print("=" * 60)

        for wp in wrong_person_candidates[:10]:
            linkedin_name = wp['linkedin_name']
            # linkedin_name may be a plain string or a dict with 'full_name'.
            if isinstance(linkedin_name, dict):
                linkedin_name = linkedin_name.get('full_name', linkedin_name)

            print(f"\n  WCMS: {wp['wcms_name']}")
            print(f"  Email: {wp['wcms_email']} (birth year: {wp['email_birth_year']})")
            print(f"  LinkedIn: {linkedin_name} (decade: {wp['linkedin_decade']})")
            print(f"  Reason: {wp['reason']}")
            print(f"  Score: {wp['score_before']:.2f} -> {wp['score_after']:.2f}")

    # Write output
    if args.dry_run:
        print("\n[DRY RUN] Would write to:", output_path)
    else:
        print(f"\nWriting to: {output_path}")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print("Done!")


if __name__ == '__main__':
    main()