- Updated `entity_review.py` to map email semantic fields from JSON.
- Expanded `email_semantics.py` with additional museum mappings.
- Introduced a new rule in `.opencode/rules/no-duplicate-ontology-mappings.md` to prevent duplicate ontology mappings.
- Added a backup JSON file for entity resolution candidates.
- Created `enrich_email_semantics.py` to enrich candidates with email semantic signals.
- Developed `merge_entity_reviews.py` to merge reviewed decisions from a backup into new candidates.
191 lines
6.4 KiB
Python
191 lines
6.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich entity resolution candidates with email semantic signals.
|
|
|
|
This script adds email semantic analysis to existing candidates without
|
|
regenerating the entire file, preserving all review decisions.
|
|
|
|
Usage:
|
|
python scripts/enrich_email_semantics.py \
|
|
--input data/entity_resolution/backups/production_20260113_141819.json \
|
|
--output data/entity_resolution/entity_resolution_candidates.json
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Add src to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
|
|
|
|
from glam_extractor.entity_resolution.email_semantics import parse_email_semantics
|
|
|
|
|
|
def load_json(filepath: Path) -> dict:
    """Read *filepath* as UTF-8 text and deserialize its JSON content."""
    raw = filepath.read_text(encoding='utf-8')
    return json.loads(raw)
|
|
|
|
|
|
def save_json(filepath: Path, data: dict) -> None:
    """Serialize *data* to *filepath* as pretty-printed, non-ASCII-safe JSON."""
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write(serialized)
|
|
|
|
|
|
def enrich_candidate_with_email_semantics(candidate: dict) -> dict:
    """Add email semantic fields to a candidate.

    Mutates *candidate* in place and returns it. When the candidate has no
    ``wcms_email``, all ``email_*`` fields are set to neutral defaults and
    ``match_signals`` is left untouched. Otherwise the email is parsed with
    ``parse_email_semantics`` and the result is copied onto the candidate;
    any useful findings are appended (once) to ``match_signals``.

    Args:
        candidate: Entity resolution candidate dict (modified in place).

    Returns:
        The same candidate dict, for chaining convenience.
    """
    # Neutral values used when there is no email to analyze. Built fresh on
    # every call so the empty list is never shared between candidates.
    defaults = {
        'email_probable_birth_year': None,
        'email_birth_year_confidence': 0.0,
        'email_institution_name': None,
        'email_institution_type': None,
        'email_institution_ghcid': None,
        'email_extracted_names': [],
        'email_extracted_first_name': None,
        'email_extracted_last_name': None,
        'email_has_dutch_prefix': False,
        'email_is_institutional': False,
    }

    email = candidate.get('wcms_email')
    if not email:
        # Nothing to analyze — stamp defaults so downstream code can rely on
        # the fields being present.
        candidate.update(defaults)
        return candidate

    # Parse email semantics and copy the result onto the candidate.
    result = parse_email_semantics(email)
    candidate.update({
        'email_probable_birth_year': result.probable_birth_year,
        'email_birth_year_confidence': result.birth_year_confidence,
        'email_institution_name': result.institution_name,
        'email_institution_type': result.institution_type,
        'email_institution_ghcid': result.institution_ghcid,
        'email_extracted_names': result.extracted_names,
        'email_extracted_first_name': result.extracted_first_name,
        'email_extracted_last_name': result.extracted_last_name,
        'email_has_dutch_prefix': result.has_dutch_prefix,
        'email_is_institutional': result.is_institutional_domain,
    })

    # Record which kinds of useful info were found, without duplicating
    # signals that are already present.
    signals = candidate.get('match_signals', [])
    triggers = [
        (result.probable_birth_year, 'email_birth_year'),
        (result.institution_name, 'email_institution'),
        (result.extracted_names, 'email_name_parts'),
    ]
    for found, signal in triggers:
        if found and signal not in signals:
            signals.append(signal)
    candidate['match_signals'] = signals

    return candidate
|
|
|
|
|
|
def main():
    """CLI entry point.

    Loads candidates from ``--input``, enriches each one with email semantic
    fields, prints summary statistics, verifies that existing review
    decisions were preserved, and writes the result to ``--output`` (unless
    ``--dry-run`` is given).

    Returns:
        0 on success.

    Exits:
        Status 1 when the input file is missing or review decisions were
        lost during enrichment.
    """
    # Local import: the file-level import only brings in `datetime` itself.
    from datetime import timezone

    parser = argparse.ArgumentParser(
        description='Enrich entity resolution candidates with email semantics'
    )
    parser.add_argument(
        '--input', '-i',
        type=Path,
        required=True,
        help='Path to input candidates file'
    )
    parser.add_argument(
        '--output', '-o',
        type=Path,
        required=True,
        help='Path for output file with email semantics'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show stats without writing'
    )

    args = parser.parse_args()

    # Fail fast before doing any work.
    if not args.input.exists():
        print(f"Error: Input file not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading candidates from: {args.input}")
    data = load_json(args.input)
    candidates = data.get('candidates', [])
    print(f" - {len(candidates):,} candidates")

    # Count existing reviews so we can verify they survive enrichment.
    reviewed = sum(1 for c in candidates if c.get('reviewed'))
    print(f" - {reviewed} reviewed (will be preserved)")

    # Enrich each candidate in place, tallying what was found.
    print("\nEnriching with email semantics...")
    stats = {
        'total': len(candidates),
        'with_email': 0,
        'birth_year_found': 0,
        'institution_found': 0,
        'names_found': 0,
    }

    for i, candidate in enumerate(candidates):
        if i % 10000 == 0 and i > 0:
            print(f" - Processed {i:,} candidates...")

        enrich_candidate_with_email_semantics(candidate)

        if candidate.get('wcms_email'):
            stats['with_email'] += 1
        if candidate.get('email_probable_birth_year'):
            stats['birth_year_found'] += 1
        if candidate.get('email_institution_name'):
            stats['institution_found'] += 1
        if candidate.get('email_extracted_names'):
            stats['names_found'] += 1

    print("\nEmail semantic enrichment stats:")
    print(f" - Candidates with email: {stats['with_email']:,}")
    print(f" - Birth year extracted: {stats['birth_year_found']:,}")
    print(f" - Institution identified: {stats['institution_found']:,}")
    print(f" - Name parts extracted: {stats['names_found']:,}")

    # Update metadata. Use timezone-aware now() instead of the deprecated
    # datetime.utcnow() (removed-in-spirit since Python 3.12); normalize the
    # "+00:00" offset to the same trailing 'Z' the old code produced.
    timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    data['metadata'] = data.get('metadata', {})
    data['metadata']['email_semantics_enriched_at'] = timestamp
    data['metadata']['email_semantics_stats'] = stats

    # Enrichment must never drop review decisions.
    reviewed_after = sum(1 for c in candidates if c.get('reviewed'))
    print(f"\nReviews preserved: {reviewed_after} (was {reviewed})")

    if reviewed_after != reviewed:
        print("ERROR: Review count mismatch!", file=sys.stderr)
        sys.exit(1)

    if args.dry_run:
        print(f"\n[DRY RUN] Would write to: {args.output}")
    else:
        # Ensure output directory exists
        args.output.parent.mkdir(parents=True, exist_ok=True)

        print(f"\nWriting enriched file to: {args.output}")
        save_json(args.output, data)

        # Report the size of what we just wrote.
        size_mb = args.output.stat().st_size / (1024 * 1024)
        print(f" - File size: {size_mb:.1f} MB")

    print("\nDone!")
    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s return code as the process exit status.
if __name__ == '__main__':
    sys.exit(main())
|