#!/usr/bin/env python3
"""
Merge WCMS data into person entity files.

This script:
1. Builds an email → WCMS user data index from WCMS user files
2. Loads entity_resolution_candidates and keeps the best WCMS↔LinkedIn
   candidate per email (confirmed matches preferred, then highest confidence)
3. Updates person entity files with WCMS identifiers and activity data

Usage:
    python scripts/merge_wcms_to_persons.py --wcms-dir /Volumes/KINGSTON/data/wcms/data/person_profiles/users
    python scripts/merge_wcms_to_persons.py --dry-run  # Preview without writing
"""

import argparse
import glob
import json
import os
import sys
from collections import defaultdict
from datetime import datetime, timezone
from itertools import islice
from pathlib import Path
from typing import Dict, Optional, Any

# Outcome of one person-file merge attempt. The values double as keys of the
# stats dict in main(), so each outcome is counted under its own heading.
_STATUS_UPDATED = 'updated'
_STATUS_ALREADY_HAS_WCMS = 'already_has_wcms'
_STATUS_ERROR = 'errors'


def load_wcms_index(wcms_dir: Path, verbose: bool = False) -> Dict[str, Dict]:
    """Build email → WCMS user data index.

    Reads every ``user_*.json`` file found in the immediate subdirectories of
    ``wcms_dir`` and indexes the record by its lower-cased, stripped email.

    Args:
        wcms_dir: Directory whose subdirectories hold the WCMS user files.
        verbose: When true, print one line per unreadable file and per
            duplicate email.

    Returns:
        Mapping of normalised email to a dict of selected WCMS fields plus
        ``_source_file`` (the path the record was read from).

    Exits the process with status 1 when ``wcms_dir`` is not a directory.
    """
    index: Dict[str, Dict] = {}
    errors = 0
    duplicates = 0

    # is_dir() rather than exists(): a plain file would make iterdir() raise.
    if not wcms_dir.is_dir():
        print(f"ERROR: WCMS directory not found: {wcms_dir}", file=sys.stderr)
        sys.exit(1)

    # Walk through all subdirectories (000, 001, 002, etc.)
    subdirs = sorted(d for d in wcms_dir.iterdir() if d.is_dir())
    print(f"Found {len(subdirs)} WCMS subdirectories")

    for subdir in subdirs:
        # glob() order is unspecified; sorting makes the "last record wins"
        # rule for duplicate emails reproducible between runs.
        for user_file in sorted(subdir.glob("user_*.json")):
            try:
                with open(user_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers JSONDecodeError and UnicodeDecodeError.
                errors += 1
                if verbose:
                    print(f" Error reading {user_file}: {e}")
                continue

            if not isinstance(data, dict):
                errors += 1
                if verbose:
                    print(f" Error reading {user_file}: top-level JSON value is not an object")
                continue

            # A missing, null or non-string email means "nothing to index",
            # not a read error.
            raw_email = data.get('email')
            email = raw_email.lower().strip() if isinstance(raw_email, str) else ''
            if not email:
                continue

            if email in index:
                duplicates += 1
                if verbose:
                    print(f" Duplicate email {email}: {user_file} replaces {index[email]['_source_file']}")

            # Store the full WCMS record indexed by email
            index[email] = {
                'user_id': data.get('user_id'),
                'username': data.get('username'),
                'username_url': data.get('username_url'),
                'full_name': data.get('full_name'),
                'status': data.get('status'),
                'roles': data.get('roles', []),
                'registered_since': data.get('registered_since'),
                'last_access': data.get('last_access'),
                'abs_id': data.get('abs_id'),
                'crm_id': data.get('crm_id'),
                'email': email,
                '_source_file': str(user_file),
            }

    print(f"Built WCMS index: {len(index)} emails indexed, {errors} errors")
    if duplicates:
        print(f"WARNING: {duplicates} duplicate emails (last file in sorted order wins)")
    return index


def load_entity_candidates(candidates_file: Path) -> Dict[str, Dict]:
    """Load entity resolution candidates and build email → candidate mapping.

    For each normalised ``wcms_email`` the best candidate is kept: a candidate
    whose ``review_decision`` is ``'match'`` beats one without, and among
    candidates of equal confirmation status the higher ``confidence_score``
    wins (the first one seen is kept on a tie).

    Args:
        candidates_file: JSON file with a top-level ``candidates`` list.

    Returns:
        Mapping of normalised email to the chosen candidate dict.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(candidates_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    candidates = data.get('candidates') or []

    # Build email → best candidate mapping
    # Prioritize confirmed matches, then high-confidence candidates
    email_to_candidate: Dict[str, Dict] = {}

    for c in candidates:
        # `or ''` / `or 0`: the key may be present with a JSON null value,
        # which .get()'s default does not cover.
        email = (c.get('wcms_email') or '').lower().strip()
        if not email:
            continue

        existing = email_to_candidate.get(email)
        if existing is None:
            email_to_candidate[email] = c
            continue

        is_confirmed = c.get('review_decision') == 'match'
        existing_confirmed = existing.get('review_decision') == 'match'

        if is_confirmed and not existing_confirmed:
            # Prefer confirmed match
            email_to_candidate[email] = c
        elif is_confirmed == existing_confirmed:
            # Same confirmation status - prefer higher confidence
            if (c.get('confidence_score') or 0) > (existing.get('confidence_score') or 0):
                email_to_candidate[email] = c

    confirmed = sum(1 for c in email_to_candidate.values() if c.get('review_decision') == 'match')
    print(f"Loaded {len(email_to_candidate)} unique email→candidate mappings ({confirmed} confirmed matches)")

    return email_to_candidate


def find_person_file_by_slug(person_dir: Path, linkedin_slug: str) -> Optional[Path]:
    """Find person entity file by LinkedIn slug.

    Person files are named ``{linkedin-slug}_{timestamp}.json`` and several
    versions may exist for one slug.

    Args:
        person_dir: Directory containing the person entity files.
        linkedin_slug: Slug that prefixes the file name.

    Returns:
        The matching path that sorts last (i.e. the most recent one, given the
        timestamp in the file name), or ``None`` when nothing matches.
    """
    # Escape the slug so characters such as '[', '*' or '?' are matched
    # literally instead of being interpreted as glob wildcards.
    pattern = f"{glob.escape(linkedin_slug)}_*.json"
    return max(person_dir.glob(pattern), default=None)


def create_wcms_fields(wcms_data: Dict) -> tuple[Dict, Dict, Dict]:
    """Create wcms_identifiers, wcms_activity, and contact_details from WCMS data.

    Args:
        wcms_data: One WCMS record, as stored by ``load_wcms_index``.

    Returns:
        ``(wcms_identifiers, wcms_activity, contact_details)``. Empty
        ``abs_id``/``crm_id`` values are normalised to ``None``;
        ``email_domain`` is ``None`` when the email has no domain part.
    """
    wcms_identifiers = {
        'user_id': wcms_data.get('user_id'),
        'abs_id': wcms_data.get('abs_id') or None,
        'crm_id': wcms_data.get('crm_id') or None,
        'username': wcms_data.get('username'),
        'username_url': wcms_data.get('username_url'),
    }

    wcms_activity = {
        'status': wcms_data.get('status'),
        'roles': wcms_data.get('roles', []),
        'registered_since': wcms_data.get('registered_since'),
        'last_access': wcms_data.get('last_access'),
    }

    email = wcms_data.get('email') or ''
    # The domain is whatever follows the LAST '@'; rpartition yields an empty
    # separator when there is no '@' at all.
    _, separator, domain = email.rpartition('@')
    contact_details = {
        'email': email,
        'email_domain': (domain or None) if separator else None,
    }

    return wcms_identifiers, wcms_activity, contact_details


def _merge_wcms_into_person(
    person_file: Path,
    wcms_identifiers: Dict,
    wcms_activity: Dict,
    contact_details: Dict,
    dry_run: bool = False
) -> str:
    """Merge WCMS data into one person file and report what happened.

    Returns one of ``_STATUS_UPDATED``, ``_STATUS_ALREADY_HAS_WCMS`` or
    ``_STATUS_ERROR`` so callers can tell a skipped file from a failed one.
    """
    try:
        with open(person_file, 'r', encoding='utf-8') as f:
            person_data = json.load(f)
    except (OSError, ValueError) as e:
        print(f" Error reading {person_file}: {e}")
        return _STATUS_ERROR

    if not isinstance(person_data, dict):
        print(f" Error reading {person_file}: top-level JSON value is not an object")
        return _STATUS_ERROR

    # Check if already has WCMS data
    if person_data.get('wcms_identifiers'):
        return _STATUS_ALREADY_HAS_WCMS

    # Add WCMS fields
    person_data['wcms_identifiers'] = wcms_identifiers
    person_data['wcms_activity'] = wcms_activity

    # Add/update contact details
    existing_contact = person_data.get('contact_details')
    if existing_contact is None:
        person_data['contact_details'] = contact_details
    elif isinstance(existing_contact, dict):
        # Merge with existing contact details; values already present win.
        for k, v in contact_details.items():
            if v and not existing_contact.get(k):
                existing_contact[k] = v
    else:
        # Unknown shape: refuse to overwrite data we do not understand.
        print(f" Error updating {person_file}: contact_details is not an object")
        return _STATUS_ERROR

    # Add provenance note
    metadata = person_data.get('extraction_metadata')
    if isinstance(metadata, dict):
        notes = metadata.get('notes') or ''
        wcms_note = f"WCMS data merged on {datetime.now(timezone.utc).isoformat()}"
        metadata['notes'] = f"{notes} {wcms_note}".strip()

    if dry_run:
        return _STATUS_UPDATED

    # Serialise BEFORE opening the file for writing: opening with 'w'
    # truncates it, so a serialisation failure must not happen afterwards.
    try:
        payload = json.dumps(person_data, indent=2, ensure_ascii=False)
        with open(person_file, 'w', encoding='utf-8') as f:
            f.write(payload)
    except (OSError, TypeError, ValueError) as e:
        print(f" Error writing {person_file}: {e}")
        return _STATUS_ERROR

    return _STATUS_UPDATED


def update_person_file(
    person_file: Path,
    wcms_identifiers: Dict,
    wcms_activity: Dict,
    contact_details: Dict,
    dry_run: bool = False
) -> bool:
    """Update person entity file with WCMS data.

    Args:
        person_file: Person entity JSON file to update in place.
        wcms_identifiers: Value stored under ``wcms_identifiers``.
        wcms_activity: Value stored under ``wcms_activity``.
        contact_details: Contact fields; merged into existing
            ``contact_details`` without overwriting non-empty values.
        dry_run: When true, perform every step except writing the file.

    Returns:
        ``True`` when the file was (or, in a dry run, would be) updated.
        ``False`` when the file already has WCMS identifiers or when reading,
        merging or writing failed (failures are printed).
    """
    status = _merge_wcms_into_person(
        person_file, wcms_identifiers, wcms_activity, contact_details, dry_run
    )
    return status == _STATUS_UPDATED


def main():
    """Parse command-line arguments, run the merge and print a summary."""
    parser = argparse.ArgumentParser(description='Merge WCMS data into person entity files')
    parser.add_argument('--wcms-dir', type=Path,
                        default=Path('/Volumes/KINGSTON/data/wcms/data/person_profiles/users'),
                        help='Path to WCMS users directory')
    parser.add_argument('--candidates-file', type=Path,
                        default=Path('data/entity_resolution/entity_resolution_candidates.json'),
                        help='Path to entity resolution candidates file')
    parser.add_argument('--person-dir', type=Path,
                        default=Path('data/custodian/person/entity'),
                        help='Path to person entity files directory')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without writing files')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of files to process (for testing)')

    args = parser.parse_args()

    # A negative limit used to slice candidates off the END of the list,
    # which is never what "limit" means; reject it up front.
    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be a non-negative integer')

    print("=" * 60)
    print("WCMS → Person Entity Merge")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN MODE - no files will be modified")

    # Step 1: Build WCMS email index
    print("\n[1/4] Building WCMS email index...")
    wcms_index = load_wcms_index(args.wcms_dir, verbose=args.verbose)

    # Step 2: Load entity resolution candidates
    print("\n[2/4] Loading entity resolution candidates...")
    try:
        candidates = load_entity_candidates(args.candidates_file)
    except (OSError, ValueError) as e:
        # Same treatment as a missing WCMS directory: message + exit code 1.
        print(f"ERROR: Could not load candidates file {args.candidates_file}: {e}", file=sys.stderr)
        sys.exit(1)

    # Step 3: Find matches and update person files
    print("\n[3/4] Matching and updating person files...")

    stats = {
        'processed': 0,
        'matched': 0,
        'updated': 0,
        'already_has_wcms': 0,
        'no_linkedin_slug': 0,
        'no_person_file': 0,
        'no_wcms_data': 0,
        'errors': 0,
    }

    # NOTE(review): the best candidate per email is merged whether or not its
    # review_decision is 'match' - confirm that unconfirmed candidates are
    # really meant to be written into person files.
    # A limit of 0 (or no limit) processes every candidate, as before.
    selected = islice(candidates.items(), args.limit) if args.limit else candidates.items()

    # Process each candidate that has both WCMS email and LinkedIn slug
    for email, candidate in selected:
        stats['processed'] += 1

        linkedin_slug = candidate.get('linkedin_slug')
        if not linkedin_slug:
            stats['no_linkedin_slug'] += 1
            continue

        # Look up WCMS data by email
        wcms_data = wcms_index.get(email)
        if not wcms_data:
            stats['no_wcms_data'] += 1
            if args.verbose:
                print(f" No WCMS data for email: {email}")
            continue

        # Find person entity file
        person_file = find_person_file_by_slug(args.person_dir, linkedin_slug)
        if not person_file:
            stats['no_person_file'] += 1
            if args.verbose:
                print(f" No person file for slug: {linkedin_slug}")
            continue

        stats['matched'] += 1

        # Create WCMS fields
        wcms_identifiers, wcms_activity, contact_details = create_wcms_fields(wcms_data)

        # Update person file; the status string is also the stats key, so
        # failures are no longer reported as "already had WCMS".
        status = _merge_wcms_into_person(
            person_file, wcms_identifiers, wcms_activity, contact_details, args.dry_run
        )
        stats[status] += 1
        if status == _STATUS_UPDATED and args.verbose:
            print(f" Updated: {person_file.name}")

    # Step 4: Report results
    print("\n[4/4] Results")
    print("-" * 40)
    print(f" Candidates processed: {stats['processed']}")
    print(f" WCMS↔Person matches: {stats['matched']}")
    print(f" Files updated: {stats['updated']}")
    print(f" Already had WCMS: {stats['already_has_wcms']}")
    print(f" No LinkedIn slug: {stats['no_linkedin_slug']}")
    print(f" No person file found: {stats['no_person_file']}")
    print(f" No WCMS data found: {stats['no_wcms_data']}")
    print(f" Errors: {stats['errors']}")

    if args.dry_run:
        print("\nDRY RUN - no files were modified. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()