#!/usr/bin/env python3
"""
Enrich entity resolution candidates with email semantic signals.

This script adds email semantic analysis to existing candidates without
regenerating the entire file, preserving all review decisions.

Usage:
    python scripts/enrich_email_semantics.py \
        --input data/entity_resolution/backups/production_20260113_141819.json \
        --output data/entity_resolution/entity_resolution_candidates.json
"""
import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from glam_extractor.entity_resolution.email_semantics import parse_email_semantics

# Default (null) values written when a candidate has no email to analyze.
# Kept as a module-level constant so the "no email" branch and any future
# reset logic stay in sync.
_EMPTY_EMAIL_FIELDS = {
    'email_probable_birth_year': None,
    'email_birth_year_confidence': 0.0,
    'email_institution_name': None,
    'email_institution_type': None,
    'email_institution_ghcid': None,
    'email_extracted_names': [],
    'email_extracted_first_name': None,
    'email_extracted_last_name': None,
    'email_has_dutch_prefix': False,
    'email_is_institutional': False,
}


def load_json(filepath: Path) -> dict:
    """Load and return the parsed contents of a JSON file."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_json(filepath: Path, data: dict) -> None:
    """Save JSON file with proper formatting (2-space indent, raw unicode)."""
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def enrich_candidate_with_email_semantics(candidate: dict) -> dict:
    """Add email semantic fields to a candidate (mutates in place).

    Candidates without a ``wcms_email`` get null/default email_* fields.
    Otherwise the email is parsed and the extracted signals are written
    onto the candidate; ``match_signals`` is extended (without duplicates)
    for each signal actually found.

    Returns the same candidate dict for convenience.
    """
    email = candidate.get('wcms_email')
    if not email:
        # No email to analyze: write explicit nulls so downstream consumers
        # can rely on the fields existing. Copy the list value so candidates
        # never share the same mutable default.
        for key, value in _EMPTY_EMAIL_FIELDS.items():
            candidate[key] = list(value) if isinstance(value, list) else value
        return candidate

    # Parse email semantics
    result = parse_email_semantics(email)

    # Add fields to candidate
    candidate['email_probable_birth_year'] = result.probable_birth_year
    candidate['email_birth_year_confidence'] = result.birth_year_confidence
    candidate['email_institution_name'] = result.institution_name
    candidate['email_institution_type'] = result.institution_type
    candidate['email_institution_ghcid'] = result.institution_ghcid
    candidate['email_extracted_names'] = result.extracted_names
    candidate['email_extracted_first_name'] = result.extracted_first_name
    candidate['email_extracted_last_name'] = result.extracted_last_name
    candidate['email_has_dutch_prefix'] = result.has_dutch_prefix
    candidate['email_is_institutional'] = result.is_institutional_domain

    # Update match signals if we found useful info (no duplicate entries).
    signals = candidate.get('match_signals', [])
    for found, signal_name in (
        (result.probable_birth_year, 'email_birth_year'),
        (result.institution_name, 'email_institution'),
        (result.extracted_names, 'email_name_parts'),
    ):
        if found and signal_name not in signals:
            signals.append(signal_name)
    candidate['match_signals'] = signals

    return candidate


def main():
    """Parse CLI args, enrich every candidate, verify reviews, write output.

    Returns 0 on success; exits with status 1 on missing input or if the
    number of reviewed candidates changes during enrichment (which would
    indicate data loss).
    """
    parser = argparse.ArgumentParser(
        description='Enrich entity resolution candidates with email semantics'
    )
    parser.add_argument(
        '--input', '-i',
        type=Path,
        required=True,
        help='Path to input candidates file'
    )
    parser.add_argument(
        '--output', '-o',
        type=Path,
        required=True,
        help='Path for output file with email semantics'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show stats without writing'
    )
    args = parser.parse_args()

    # Validate input file exists
    if not args.input.exists():
        print(f"Error: Input file not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading candidates from: {args.input}")
    data = load_json(args.input)
    candidates = data.get('candidates', [])
    print(f"  - {len(candidates):,} candidates")

    # Count existing reviews so we can verify none are lost by enrichment.
    reviewed = sum(1 for c in candidates if c.get('reviewed'))
    print(f"  - {reviewed} reviewed (will be preserved)")

    # Enrich each candidate
    print("\nEnriching with email semantics...")
    stats = {
        'total': len(candidates),
        'with_email': 0,
        'birth_year_found': 0,
        'institution_found': 0,
        'names_found': 0,
    }

    for i, candidate in enumerate(candidates):
        if i % 10000 == 0 and i > 0:
            print(f"  - Processed {i:,} candidates...")

        enrich_candidate_with_email_semantics(candidate)

        if candidate.get('wcms_email'):
            stats['with_email'] += 1
        if candidate.get('email_probable_birth_year'):
            stats['birth_year_found'] += 1
        if candidate.get('email_institution_name'):
            stats['institution_found'] += 1
        if candidate.get('email_extracted_names'):
            stats['names_found'] += 1

    print("\nEmail semantic enrichment stats:")
    print(f"  - Candidates with email: {stats['with_email']:,}")
    print(f"  - Birth year extracted: {stats['birth_year_found']:,}")
    print(f"  - Institution identified: {stats['institution_found']:,}")
    print(f"  - Name parts extracted: {stats['names_found']:,}")

    # Update metadata. datetime.utcnow() is deprecated (3.12+); use an
    # aware UTC timestamp and strip the offset to keep the exact same
    # "<iso>Z" string format as before.
    data['metadata'] = data.get('metadata', {})
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z'
    data['metadata']['email_semantics_enriched_at'] = timestamp
    data['metadata']['email_semantics_stats'] = stats

    # Verify reviews preserved
    reviewed_after = sum(1 for c in candidates if c.get('reviewed'))
    print(f"\nReviews preserved: {reviewed_after} (was {reviewed})")
    if reviewed_after != reviewed:
        print("ERROR: Review count mismatch!", file=sys.stderr)
        sys.exit(1)

    if args.dry_run:
        print(f"\n[DRY RUN] Would write to: {args.output}")
    else:
        # Ensure output directory exists
        args.output.parent.mkdir(parents=True, exist_ok=True)

        print(f"\nWriting enriched file to: {args.output}")
        save_json(args.output, data)

        # Get file size
        size_mb = args.output.stat().st_size / (1024 * 1024)
        print(f"  - File size: {size_mb:.1f} MB")

    print("\nDone!")
    return 0


if __name__ == '__main__':
    sys.exit(main())