glam/scripts/merge_wcms_to_persons.py

#!/usr/bin/env python3
"""
Merge WCMS data into person entity files.

This script:
1. Builds an email → WCMS user data index from WCMS user files
2. Loads entity_resolution_candidates and keeps the best WCMS↔LinkedIn candidate
   per email (confirmed matches first, then highest confidence score)
3. Updates person entity files with WCMS identifiers and activity data

Usage:
    python scripts/merge_wcms_to_persons.py --wcms-dir /Volumes/KINGSTON/data/wcms/data/person_profiles/users
    python scripts/merge_wcms_to_persons.py --dry-run  # Preview without writing
"""

import argparse
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional, Any
from collections import defaultdict


def load_wcms_index(wcms_dir: Path, verbose: bool = False) -> Dict[str, Dict]:
    """Build an email → WCMS user data index from ``user_*.json`` files.

    Every immediate subdirectory of ``wcms_dir`` (000, 001, 002, etc.) is
    scanned for files matching ``user_*.json``. Each record that carries a
    non-empty string ``email`` is stored under the lower-cased, stripped
    address.

    Args:
        wcms_dir: Directory whose subdirectories hold the WCMS user files.
        verbose: When true, print one line per file that could not be read.

    Returns:
        Mapping of normalized email to a dict with the selected WCMS fields
        plus ``_source_file`` (the path the record was read from).

    Raises:
        SystemExit: With status 1 when ``wcms_dir`` is not an existing
            directory.
    """
    index: Dict[str, Dict] = {}
    errors = 0

    # is_dir() also rejects a path that exists but is a regular file, which
    # would otherwise blow up in iterdir() below.
    if not wcms_dir.is_dir():
        print(f"ERROR: WCMS directory not found: {wcms_dir}")
        sys.exit(1)

    # Walk through all subdirectories (000, 001, 002, etc.)
    subdirs = sorted(d for d in wcms_dir.iterdir() if d.is_dir())
    print(f"Found {len(subdirs)} WCMS subdirectories")

    for subdir in subdirs:
        # Sorted so that, when several files share an email, the record that
        # wins (the last one seen) does not depend on filesystem order.
        for user_file in sorted(subdir.glob("user_*.json")):
            try:
                with open(user_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                if not isinstance(data, dict):
                    raise ValueError(
                        f"expected a JSON object, got {type(data).__name__}"
                    )
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                errors += 1
                if verbose:
                    print(f"  Error reading {user_file}: {e}")
                continue

            # A missing, null or non-string email is "no email", not a read
            # error, so it must not inflate the error counter.
            raw_email = data.get('email')
            email = raw_email.lower().strip() if isinstance(raw_email, str) else ''
            if not email:
                continue

            # Store the selected WCMS fields indexed by email
            index[email] = {
                'user_id': data.get('user_id'),
                'username': data.get('username'),
                'username_url': data.get('username_url'),
                'full_name': data.get('full_name'),
                'status': data.get('status'),
                'roles': data.get('roles', []),
                'registered_since': data.get('registered_since'),
                'last_access': data.get('last_access'),
                'abs_id': data.get('abs_id'),
                'crm_id': data.get('crm_id'),
                'email': email,
                '_source_file': str(user_file),
            }

    print(f"Built WCMS index: {len(index)} emails indexed, {errors} errors")
    return index


def _candidate_rank(candidate: Dict) -> tuple:
    """Return a sort key: confirmed matches first, then higher confidence."""
    # A null/missing confidence_score ranks as 0 instead of raising TypeError
    # when compared against a number.
    return (
        candidate.get('review_decision') == 'match',
        candidate.get('confidence_score') or 0,
    )


def load_entity_candidates(candidates_file: Path) -> Dict[str, Dict]:
    """Load entity resolution candidates and build an email → candidate mapping.

    The file is expected to be a JSON object with a ``candidates`` list. For
    every normalized (lower-cased, stripped) ``wcms_email`` only the best
    candidate is kept: a candidate whose ``review_decision`` is ``'match'``
    beats an unconfirmed one, and within the same confirmation status the
    higher ``confidence_score`` wins. On a full tie the first candidate seen
    is kept.

    Args:
        candidates_file: Path to the entity resolution candidates JSON file.

    Returns:
        Mapping of normalized WCMS email to the chosen candidate dict.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(candidates_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # `or []` also covers an explicit `"candidates": null`.
    candidates = data.get('candidates') or []

    email_to_candidate: Dict[str, Dict] = {}

    for c in candidates:
        # A null or non-string wcms_email means "no email"; skip the candidate
        # rather than crashing on .lower().
        raw_email = c.get('wcms_email')
        email = raw_email.lower().strip() if isinstance(raw_email, str) else ''
        if not email:
            continue

        existing = email_to_candidate.get(email)
        # Strict '>' keeps the earlier candidate when both rank the same.
        if existing is None or _candidate_rank(c) > _candidate_rank(existing):
            email_to_candidate[email] = c

    confirmed = sum(
        1 for c in email_to_candidate.values()
        if c.get('review_decision') == 'match'
    )
    print(f"Loaded {len(email_to_candidate)} unique email→candidate mappings ({confirmed} confirmed matches)")

    return email_to_candidate


def find_person_file_by_slug(person_dir: Path, linkedin_slug: str) -> Optional[Path]:
    """Locate the newest person entity file for a LinkedIn slug.

    Entity files follow the naming scheme ``{linkedin-slug}_{timestamp}.json``
    and several versions may exist for one slug. The path that sorts last is
    returned, which is the most recent one as long as the timestamps in the
    filenames sort lexicographically. ``None`` is returned when nothing in
    ``person_dir`` matches.
    """
    versions = person_dir.glob(f"{linkedin_slug}_*.json")
    # The greatest path is the same element a full sort would place last.
    return max(versions, default=None)


def create_wcms_fields(wcms_data: Dict) -> tuple[Dict, Dict, Dict]:
    """Create wcms_identifiers, wcms_activity, and contact_details from WCMS data.

    Args:
        wcms_data: A WCMS user record (as produced by the WCMS email index).
            Missing keys are tolerated.

    Returns:
        A ``(wcms_identifiers, wcms_activity, contact_details)`` tuple of
        dicts. Falsy ``abs_id`` / ``crm_id`` values are normalized to
        ``None``. ``contact_details['email']`` is ``''`` when the record has
        no email, and ``email_domain`` is ``None`` when no non-empty domain
        can be extracted.
    """
    wcms_identifiers = {
        'user_id': wcms_data.get('user_id'),
        'abs_id': wcms_data.get('abs_id') or None,
        'crm_id': wcms_data.get('crm_id') or None,
        'username': wcms_data.get('username'),
        'username_url': wcms_data.get('username_url'),
    }

    wcms_activity = {
        'status': wcms_data.get('status'),
        'roles': wcms_data.get('roles', []),
        'registered_since': wcms_data.get('registered_since'),
        'last_access': wcms_data.get('last_access'),
    }

    # `or ''` guards against an explicit None, which would otherwise raise
    # TypeError on the '@' membership test.
    email = wcms_data.get('email') or ''
    # The domain is whatever follows the LAST '@' (a quoted local part may
    # itself contain '@'), so split from the right.
    _, at_sign, domain = email.rpartition('@')
    contact_details = {
        'email': email,
        'email_domain': domain if at_sign and domain else None,
    }

    return wcms_identifiers, wcms_activity, contact_details


def update_person_file(
    person_file: Path,
    wcms_identifiers: Dict,
    wcms_activity: Dict,
    contact_details: Dict,
    dry_run: bool = False
) -> bool:
    """Update a person entity file with WCMS data.

    Adds ``wcms_identifiers`` and ``wcms_activity``, fills in missing
    ``contact_details`` keys (existing non-empty values are never
    overwritten) and, when the file has an ``extraction_metadata`` object,
    appends a timestamped provenance note to its ``notes``.

    Args:
        person_file: Path of the person entity JSON file to update in place.
        wcms_identifiers: Value stored under ``wcms_identifiers``.
        wcms_activity: Value stored under ``wcms_activity``.
        contact_details: Contact fields to add or merge.
        dry_run: When true, do everything except write the file.

    Returns:
        True if the file was updated (or would have been, in dry-run mode).
        False if it already has non-empty ``wcms_identifiers``, or if it
        could not be read or written (an error line is printed in that case).
    """
    target = Path(person_file)

    try:
        with open(target, 'r', encoding='utf-8') as f:
            person_data = json.load(f)
        if not isinstance(person_data, dict):
            raise ValueError("top-level JSON value is not an object")
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"  Error reading {person_file}: {e}")
        return False

    # Check if already has WCMS data
    if person_data.get('wcms_identifiers'):
        return False  # Already has WCMS data

    # Add WCMS fields
    person_data['wcms_identifiers'] = wcms_identifiers
    person_data['wcms_activity'] = wcms_activity

    # Add/update contact details. A key that is present but null is treated
    # like a missing key instead of crashing on None.get().
    existing_contact = person_data.get('contact_details')
    if existing_contact is None:
        person_data['contact_details'] = dict(contact_details)
    elif isinstance(existing_contact, dict):
        # Merge: only fill keys that are absent or empty in the file.
        for k, v in contact_details.items():
            if v and not existing_contact.get(k):
                existing_contact[k] = v
    # Any other type is unexpected; leave it untouched rather than clobber it.

    # Add provenance note
    metadata = person_data.get('extraction_metadata')
    if isinstance(metadata, dict):
        # `or ''` keeps a null note from being rendered as the text "None".
        notes = metadata.get('notes') or ''
        wcms_note = f"WCMS data merged on {datetime.now(timezone.utc).isoformat()}"
        metadata['notes'] = f"{notes} {wcms_note}".strip()

    if dry_run:
        return True

    # Write to a sibling temp file and swap it in, so a failure part-way
    # through never leaves the entity file truncated. The '.tmp' suffix keeps
    # a leftover file from matching '*.json' globs.
    tmp_file = target.with_name(target.name + '.tmp')
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(person_data, f, indent=2, ensure_ascii=False)
        os.replace(tmp_file, target)
        return True
    except (OSError, TypeError, ValueError) as e:
        print(f"  Error writing {person_file}: {e}")
        try:
            tmp_file.unlink()
        except OSError:
            pass  # Best-effort cleanup; the temp file may never have been created.
        return False


def main():
    """Command-line entry point: merge WCMS data into person entity files.

    Parses the command-line options, builds the WCMS email index, loads the
    best entity resolution candidate per email, updates the matching person
    entity file for each candidate and prints a summary of the counters.

    Raises:
        SystemExit: On invalid arguments (status 2, via argparse) or when the
            candidates file or WCMS directory is missing (status 1).
    """
    # Local import: only needed here, for applying --limit lazily.
    from itertools import islice

    parser = argparse.ArgumentParser(description='Merge WCMS data into person entity files')
    parser.add_argument('--wcms-dir', type=Path,
                        default=Path('/Volumes/KINGSTON/data/wcms/data/person_profiles/users'),
                        help='Path to WCMS users directory')
    parser.add_argument('--candidates-file', type=Path,
                        default=Path('data/entity_resolution/entity_resolution_candidates.json'),
                        help='Path to entity resolution candidates file')
    parser.add_argument('--person-dir', type=Path,
                        default=Path('data/custodian/person/entity'),
                        help='Path to person entity files directory')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without writing files')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of files to process (for testing)')

    args = parser.parse_args()

    # A negative limit has no sensible meaning (a slice would silently drop
    # the tail instead), so reject it up front.
    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be a non-negative integer')

    print("=" * 60)
    print("WCMS → Person Entity Merge")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN MODE - no files will be modified")

    # Fail fast with a readable message, mirroring the missing-WCMS-directory
    # handling, before spending time on building the index.
    if not args.candidates_file.is_file():
        print(f"ERROR: Candidates file not found: {args.candidates_file}")
        sys.exit(1)

    # Step 1: Build WCMS email index
    print("\n[1/4] Building WCMS email index...")
    wcms_index = load_wcms_index(args.wcms_dir, verbose=args.verbose)

    # Step 2: Load entity resolution candidates
    print("\n[2/4] Loading entity resolution candidates...")
    candidates = load_entity_candidates(args.candidates_file)

    # Step 3: Find matches and update person files
    print("\n[3/4] Matching and updating person files...")

    stats = {
        'processed': 0,
        'matched': 0,
        'updated': 0,
        'already_has_wcms': 0,
        'no_person_file': 0,
        'no_wcms_data': 0,
        'no_linkedin_slug': 0,
    }

    # `is not None` so that `--limit 0` really processes nothing instead of
    # being treated as "no limit"; islice avoids copying the whole mapping.
    work_items = candidates.items()
    if args.limit is not None:
        work_items = islice(work_items, args.limit)

    # NOTE(review): every best-per-email candidate is merged here, whether or
    # not its review_decision is 'match'. Confirm that merging unconfirmed
    # candidates is intended.
    for email, candidate in work_items:
        stats['processed'] += 1

        linkedin_slug = candidate.get('linkedin_slug')
        if not linkedin_slug:
            stats['no_linkedin_slug'] += 1
            continue

        # Look up WCMS data by email
        wcms_data = wcms_index.get(email)
        if not wcms_data:
            stats['no_wcms_data'] += 1
            if args.verbose:
                print(f"  No WCMS data for email: {email}")
            continue

        # Find person entity file
        person_file = find_person_file_by_slug(args.person_dir, linkedin_slug)
        if not person_file:
            stats['no_person_file'] += 1
            if args.verbose:
                print(f"  No person file for slug: {linkedin_slug}")
            continue

        stats['matched'] += 1

        # Create WCMS fields
        wcms_identifiers, wcms_activity, contact_details = create_wcms_fields(wcms_data)

        # Update person file
        if update_person_file(person_file, wcms_identifiers, wcms_activity, contact_details, args.dry_run):
            stats['updated'] += 1
            if args.verbose:
                print(f"  Updated: {person_file.name}")
        else:
            # NOTE(review): update_person_file also returns False on read or
            # write errors, so this counter includes those failures as well.
            stats['already_has_wcms'] += 1

    # Step 4: Report results
    print("\n[4/4] Results")
    print("-" * 40)
    print(f"  Candidates processed: {stats['processed']}")
    print(f"  WCMS↔Person matches:  {stats['matched']}")
    print(f"  Files updated:        {stats['updated']}")
    print(f"  Already had WCMS:     {stats['already_has_wcms']}")
    print(f"  No person file found: {stats['no_person_file']}")
    print(f"  No WCMS data found:   {stats['no_wcms_data']}")
    print(f"  No LinkedIn slug:     {stats['no_linkedin_slug']}")

    if args.dry_run:
        print("\nDRY RUN - no files were modified. Run without --dry-run to apply changes.")


# Run the merge only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()