#!/usr/bin/env python3
"""
Migrate WCMS user profiles to data/person/ with entity resolution signals.

This script:
1. Imports ALL WCMS profiles as separate records (NO auto-merging)
2. Extracts entity resolution signals (name, email domain, CRM ID)
3. Flags potential matches with existing LinkedIn profiles
4. Stores match candidates for manual review

CRITICAL: We do NOT auto-merge on name alone. Entity resolution requires:
- Multiple matching signals (employer, location, career, age)
- Human verification for ambiguous cases
- Full provenance tracking

Usage:
    python scripts/migrate_wcms_users.py --dry-run --limit 100
    python scripts/migrate_wcms_users.py --dry-run
    python scripts/migrate_wcms_users.py
"""

import json
import argparse
import re
import uuid
from pathlib import Path
from datetime import datetime, timezone
import unicodedata
from typing import Dict, List, Tuple, Any, Optional, Set
from collections import defaultdict

# WCMS source paths
WCMS_USERS_DIR = Path('/Volumes/KINGSTON/data/wcms/data/person_profiles/users')
WCMS_USERS_NEW_DIR = Path('/Volumes/KINGSTON/data/wcms/data/person_profiles/users_new')
PERSON_DIR = Path('/Users/kempersc/apps/glam/data/person')

# Errors that mean "this profile file is unreadable or not shaped like a
# profile": I/O failures, invalid JSON / bad encoding (both ValueError
# subclasses), and JSON whose top level or nested values are not dicts.
# Deliberately excludes KeyboardInterrupt/SystemExit so long index builds
# can still be interrupted.
_UNREADABLE_PROFILE_ERRORS = (OSError, ValueError, AttributeError, TypeError)


def normalize_name(name: str) -> str:
    """Normalize name for comparison (lowercase, no diacritics, no titles).

    Returns an empty string for empty/None input. Everything except ASCII
    letters and whitespace is dropped after diacritics are stripped.
    """
    if not name:
        return ""

    # Remove titles
    name = re.sub(
        r'\b(Dr|Prof|Mr|Mrs|Ms|Drs|Ir|Ing|PhD|MA|MSc|MBA|BSc)\b\.?',
        '',
        name,
        flags=re.IGNORECASE,
    )

    # Normalize unicode: decompose, then drop the combining marks
    nfkd = unicodedata.normalize('NFKD', name)
    ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))

    # Lowercase and clean
    return re.sub(r'[^a-z\s]', '', ascii_name.lower()).strip()


def normalize_name_for_ppid(name: str) -> str:
    """Convert name to PPID format: FIRST-LAST

    Titles are removed, each whitespace-separated part is stripped of
    diacritics and non-alphanumerics and upper-cased, and the surviving
    parts are joined with '-'. Returns "UNKNOWN" when nothing survives.
    """
    if not name:
        return "UNKNOWN"

    name = re.sub(
        r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr)\b\.?',
        '',
        name,
        flags=re.IGNORECASE,
    )
    parts = [p.strip() for p in name.split() if p.strip()]
    if not parts:
        return "UNKNOWN"

    def normalize_part(p):
        nfkd = unicodedata.normalize('NFKD', p)
        ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))
        return re.sub(r'[^A-Za-z0-9]', '', ascii_name).upper()

    # Normalize each part once, then drop the ones that became empty.
    normalized = [token for token in map(normalize_part, parts) if token]
    return '-'.join(normalized) if normalized else "UNKNOWN"


def extract_email_domain(email: str) -> Optional[str]:
    """Extract domain from email for entity resolution.

    Returns the lower-cased text after the last '@' with surrounding
    whitespace removed, or None when the input is empty, has no '@',
    or has nothing after the '@'.
    """
    if not email or '@' not in email:
        return None
    domain = email.split('@')[-1].strip().lower()
    # "user@" has no domain at all; report that as None, not "".
    return domain or None


def generate_wcms_ppid(name: str) -> str:
    """Generate PPID for WCMS user (same ID_ prefix as LinkedIn profiles)."""
    name_token = normalize_name_for_ppid(name)
    # Use same ID_ prefix as LinkedIn profiles - source tracked in metadata
    return f"ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_{name_token}"


def build_linkedin_name_index(person_dir: Path) -> Dict[str, List[Dict]]:
    """Build index of existing LinkedIn profiles by normalized name.

    Reads every ``ID_*.json`` file in ``person_dir``. Files that cannot be
    read, are not valid JSON, or do not have the expected dict structure are
    skipped (best effort) and counted in the printed summary.

    Returns:
        {normalized_name: [list of profile summaries]}
    """
    print(" Building LinkedIn name index for entity resolution...")
    name_index = defaultdict(list)
    count = 0
    skipped = 0

    for f in person_dir.glob('ID_*.json'):
        try:
            with open(f, encoding='utf-8') as fp:
                data = json.load(fp)

            name = data.get('name', '')
            if not name:
                continue

            normalized = normalize_name(name)
            if not normalized:
                continue

            # Extract signals for entity resolution
            profile_summary = {
                'file': f.name,
                'ppid': data.get('ppid'),
                'name': name,
                'linkedin_slug': data.get('linkedin_slug'),
                'headline': data.get('profile_data', {}).get('headline', ''),
                'location': data.get('profile_data', {}).get('location', ''),
                'affiliations': [
                    a.get('custodian_name', '')
                    for a in data.get('affiliations', [])
                ],
            }
            name_index[normalized].append(profile_summary)
            count += 1
        except _UNREADABLE_PROFILE_ERRORS:
            # Best effort: one bad file must not abort the whole index build.
            skipped += 1

    print(f" Indexed {count:,} LinkedIn profiles by name")
    print(f" Unique normalized names: {len(name_index):,}")
    if skipped:
        print(f" Skipped {skipped:,} unreadable or malformed profile files")
    return name_index


def find_potential_matches(wcms_profile: Dict, name_index: Dict[str, List[Dict]]) -> List[Dict]:
    """Find potential LinkedIn matches for a WCMS user.

    Candidates are the index entries whose normalized name equals the
    normalized ``full_name`` of the WCMS profile. For each candidate the
    match signals record the name match and, when the WCMS email has a
    domain, whether the first label of that domain occurs in one of the
    candidate's affiliation names.

    Returns list of potential matches with match signals.
    DOES NOT AUTO-MERGE - just flags for manual review.
    """
    full_name = wcms_profile.get('full_name', '')
    normalized = normalize_name(full_name)

    if not normalized or normalized not in name_index:
        return []

    # The email signal depends only on the WCMS profile, so compute it once
    # instead of once per candidate.
    email_domain = extract_email_domain(wcms_profile.get('email', ''))
    # An empty first label (e.g. "x@.nl") would be a substring of every
    # affiliation, so it must not be used as a signal.
    domain_label = email_domain.split('.')[0] if email_domain else ''

    matches = []
    for candidate in name_index[normalized]:
        match_signals = {
            'name_match': True,
            'normalized_name': normalized,
        }

        # Email domain could match employer domain
        if email_domain:
            match_signals['wcms_email_domain'] = email_domain
            # Check if email domain appears in affiliations; affiliation
            # entries may be None or non-strings in the stored profiles.
            affiliations = candidate.get('affiliations') or []
            if domain_label and any(
                isinstance(aff, str) and domain_label in aff.lower()
                for aff in affiliations
            ):
                match_signals['email_domain_in_affiliation'] = True

        matches.append({
            'linkedin_profile': candidate,
            'match_signals': match_signals,
            'requires_manual_review': True,
            'auto_merge_allowed': False,  # NEVER auto-merge
        })

    return matches


def transform_wcms_to_ppid(wcms_data: Dict, source_file: str, potential_matches: List[Dict]) -> Tuple[str, Dict]:
    """Transform WCMS profile to PPID format.

    Args:
        wcms_data: Parsed WCMS user record.
        source_file: Label of the source file, stored in the provenance
            metadata.
        potential_matches: Output of ``find_potential_matches``; only the
            first five are stored, the total count is recorded separately.

    Returns:
        ``(ppid, ppid_profile)`` - the generated PPID and the full profile
        dict ready to be serialized as JSON.
    """
    full_name = wcms_data.get('full_name', 'Unknown')
    ppid = generate_wcms_ppid(full_name)

    # Extract email domain for entity resolution
    email = wcms_data.get('email', '')
    email_domain = extract_email_domain(email)

    # One timestamp for the whole record so extraction and migration
    # metadata agree exactly.
    now_iso = datetime.now(timezone.utc).isoformat()

    ppid_profile = {
        "ppid": ppid,
        "ppid_type": "ID",  # Same type as LinkedIn - source tracked in metadata
        "ppid_components": {
            "type": "ID",
            "first_location": "XX-XX-XXX",
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX",
            "last_date": "XXXX",
            "name_tokens": normalize_name_for_ppid(full_name).split('-')
        },
        "name": full_name,
        "birth_date": {
            "edtf": "XXXX",
            "precision": "unknown",
            "note": "Not available from WCMS"
        },
        "is_living": True,
        "is_anonymous": False,

        # WCMS-specific identifiers - FULL contact details preserved!
        "wcms_identifiers": {
            "user_id": wcms_data.get('user_id'),
            "username": wcms_data.get('username'),
            "username_url": wcms_data.get('username_url'),
            "abs_id": wcms_data.get('abs_id'),
            "crm_id": wcms_data.get('crm_id'),
        },

        # CONTACT DETAILS - CRUCIAL! Store full email, not just domain
        "contact_details": {
            "email": wcms_data.get('email'),  # FULL email preserved
            "email_domain": email_domain,
        },

        # Activity data
        "wcms_activity": {
            "status": wcms_data.get('status'),
            "roles": wcms_data.get('roles', []),
            "registered_since": wcms_data.get('registered_since'),
            "last_access": wcms_data.get('last_access'),
            "operations": wcms_data.get('operations', []),
        },

        # Entity resolution - potential matches with LinkedIn profiles
        "entity_resolution": {
            "potential_linkedin_matches": len(potential_matches),
            "match_candidates": potential_matches[:5],  # Store top 5 candidates
            "requires_manual_review": len(potential_matches) > 0,
            "auto_merged": False,
            "reviewed": False,
            "review_notes": None,
        },

        # Classification - indicate this is from WCMS source
        "profile_classification": {
            "primary_classification": "human",  # WCMS users are people
            "confidence": 0.95,
            "indicators": [{"type": "wcms_user", "reason": "Registered user in heritage CMS system"}],
            "reasoning": "WCMS user profile - registered heritage sector CMS user"
        },

        # Data source tracking
        "data_sources": ["wcms"],  # Track which systems this person appears in

        # Source provenance
        "extraction_metadata": {
            "extraction_agent": "migrate_wcms_users.py",
            "extraction_date": now_iso,
            "source_file": source_file,
            "source_system": "WCMS",
            "schema_version": "1.0.0"
        },
        "migration_metadata": {
            "original_wcms_file": source_file,
            "original_user_id": wcms_data.get('user_id'),
            "migrated_at": now_iso,
            "migration_script": "migrate_wcms_users.py",
            "migration_version": "1.0"
        }
    }

    return ppid, ppid_profile


def main():
    """Run the migration: index, collect, de-duplicate, transform, write.

    With ``--dry-run`` nothing is written to disk, but the in-memory PPID
    and WCMS user_id sets are still updated so the summary previews the
    same collision and duplicate handling a real run would perform.
    """
    parser = argparse.ArgumentParser(description='Migrate WCMS users with entity resolution')
    parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of profiles to process')
    parser.add_argument('--skip-matching', action='store_true', help='Skip LinkedIn matching (faster)')
    args = parser.parse_args()

    print("=" * 70)
    print("WCMS USER MIGRATION with Entity Resolution")
    print("=" * 70)
    print(" CRITICAL: No auto-merging on name alone!")
    print(" Potential matches flagged for manual review")

    # Check KINGSTON mount
    if not WCMS_USERS_DIR.exists():
        print(f"\n ERROR: KINGSTON not mounted or path not found: {WCMS_USERS_DIR}")
        return

    # Phase 1: Build LinkedIn name index for entity resolution
    if not args.skip_matching:
        print("\nPhase 1: Building LinkedIn profile index...")
        name_index = build_linkedin_name_index(PERSON_DIR)
    else:
        print("\nPhase 1: Skipping LinkedIn matching (--skip-matching)")
        name_index = {}

    # Phase 2: Collect WCMS files
    print("\nPhase 2: Collecting WCMS user files...")
    wcms_files = []

    # Main users directory
    for f in WCMS_USERS_DIR.glob('user_*.json'):
        wcms_files.append(('users', f))

    # users_new directory
    if WCMS_USERS_NEW_DIR.exists():
        for f in WCMS_USERS_NEW_DIR.glob('*.json'):
            # Skip macOS AppleDouble sidecar files
            if not f.name.startswith('._'):
                wcms_files.append(('users_new', f))

    print(f" Found {len(wcms_files):,} WCMS user files")

    if args.limit:
        wcms_files = wcms_files[:args.limit]
        print(f" Limited to {args.limit} files")

    # Phase 3: Build existing PPID index
    print("\nPhase 3: Indexing existing PPID files...")
    existing_ppids = {f.stem for f in PERSON_DIR.glob('*.json')}
    print(f" Found {len(existing_ppids):,} existing PPID files")

    # Also index by WCMS user_id to prevent duplicates
    existing_wcms_ids = set()
    for f in PERSON_DIR.glob('ID_*.json'):
        try:
            with open(f, encoding='utf-8') as fp:
                data = json.load(fp)
            uid = data.get('wcms_identifiers', {}).get('user_id')
            if uid:
                existing_wcms_ids.add(uid)
        except _UNREADABLE_PROFILE_ERRORS:
            # Best effort: an unreadable existing profile simply contributes
            # no user_id to the duplicate check.
            pass
    print(f" Found {len(existing_wcms_ids):,} existing WCMS user IDs")

    # Phase 4: Process WCMS files
    print(f"\nPhase 4: Processing WCMS files (dry_run={args.dry_run})...")

    results = {'migrated': 0, 'duplicate': 0, 'error': 0}
    match_stats = {'with_matches': 0, 'no_matches': 0}
    samples = []

    for i, (source_type, wcms_file) in enumerate(wcms_files):
        try:
            with open(wcms_file, encoding='utf-8') as f:
                wcms_data = json.load(f)

            user_id = wcms_data.get('user_id')

            # Check for duplicate by WCMS user_id
            if user_id and user_id in existing_wcms_ids:
                results['duplicate'] += 1
                continue

            # Find potential LinkedIn matches
            if name_index:
                potential_matches = find_potential_matches(wcms_data, name_index)
            else:
                potential_matches = []

            if potential_matches:
                match_stats['with_matches'] += 1
            else:
                match_stats['no_matches'] += 1

            # Transform to PPID
            ppid, ppid_profile = transform_wcms_to_ppid(
                wcms_data,
                f"{source_type}/{wcms_file.name}",
                potential_matches
            )

            # Handle PPID filename collision
            output_ppid = ppid
            if ppid in existing_ppids:
                short_uuid = str(uuid.uuid4())[:8]
                output_ppid = f"{ppid}-{short_uuid}"
                ppid_profile['ppid'] = output_ppid
                ppid_profile['ppid_components']['collision_uuid'] = short_uuid

            output_file = PERSON_DIR / f"{output_ppid}.json"

            # Double-check file doesn't exist
            while output_file.exists():
                short_uuid = str(uuid.uuid4())[:8]
                output_ppid = f"{ppid}-{short_uuid}"
                ppid_profile['ppid'] = output_ppid
                ppid_profile['ppid_components']['collision_uuid'] = short_uuid
                output_file = PERSON_DIR / f"{output_ppid}.json"

            if not args.dry_run:
                # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII
                # characters, which must not depend on the locale encoding.
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(ppid_profile, f, indent=2, ensure_ascii=False)

            # Track in memory in both modes so later files in this run see
            # the same collisions/duplicates whether or not we write.
            existing_ppids.add(output_ppid)
            if user_id:
                existing_wcms_ids.add(user_id)

            results['migrated'] += 1

            if len(samples) < 5:
                samples.append((output_ppid, wcms_data.get('full_name', ''), len(potential_matches)))

        except Exception as e:
            # Per-file boundary: record the failure and keep migrating.
            results['error'] += 1
            if results['error'] <= 5:
                print(f" ERROR: {wcms_file.name}: {e}")

        # Progress
        if (i + 1) % 10000 == 0:
            pct = ((i + 1) / len(wcms_files)) * 100
            print(f" Progress: {i+1:,}/{len(wcms_files):,} ({pct:.1f}%) - "
                  f"Migrated:{results['migrated']:,} Dup:{results['duplicate']:,} "
                  f"Matches:{match_stats['with_matches']:,} Err:{results['error']}")

    # Summary
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}MIGRATION SUMMARY")
    print("=" * 70)
    print(f" Total processed: {sum(results.values()):,}")
    print(f" Successfully migrated: {results['migrated']:,}")
    print(f" Duplicates skipped: {results['duplicate']:,}")
    print(f" Errors: {results['error']}")

    print(f"\n Entity Resolution:")
    print(f" With potential LinkedIn matches: {match_stats['with_matches']:,}")
    print(f" No matches found: {match_stats['no_matches']:,}")
    print(f" Requires manual review: {match_stats['with_matches']:,}")

    if samples:
        print(f"\n Sample migrated profiles:")
        for ppid, name, match_count in samples:
            print(f" {ppid[:50]}... | {name} | {match_count} potential matches")

    if args.dry_run:
        print(f"\n To execute migration, run without --dry-run flag")
    else:
        final_count = len(list(PERSON_DIR.glob('*.json')))
        print(f"\n Migration complete!")
        print(f" Final profile count: {final_count:,}")


if __name__ == '__main__':
    main()