#!/usr/bin/env python3
"""
Fast WCMS migration with state file checkpointing.

This is an optimized version that:
1. Uses a state file to track processed user IDs (no scanning 190K+ files)
2. Processes in batches with checkpoints
3. Resumes from where it left off

Usage:
    python scripts/migrate_wcms_resume.py --batch-size 10000
    python scripts/migrate_wcms_resume.py --dry-run --limit 100
"""

import json
import argparse
import re
import uuid
from pathlib import Path
from datetime import datetime, timezone
import unicodedata
from typing import Dict, List, Optional, Set, Tuple

# Paths
WCMS_USERS_DIR = Path('/Volumes/KINGSTON/data/wcms/data/person_profiles/users')
WCMS_USERS_NEW_DIR = Path('/Volumes/KINGSTON/data/wcms/data/person_profiles/users_new')
PERSON_DIR = Path('/Users/kempersc/apps/glam/data/person')
STATE_FILE = Path('/Users/kempersc/apps/glam/data/wcms_migration_state.json')

# Whole-word titles/degrees removed from a name before tokenizing.
# NOTE(review): matching is case-insensitive, so a real name part that equals
# one of these words (e.g. "Ma") is removed as well - confirm this is intended.
_TITLE_RE = re.compile(
    r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr)\b\.?',
    flags=re.IGNORECASE,
)


def _normalize_name_part(part: str) -> str:
    """Strip diacritics and non-alphanumerics from one name part and upper-case it."""
    nfkd = unicodedata.normalize('NFKD', part)
    ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))
    return re.sub(r'[^A-Za-z0-9]', '', ascii_name).upper()


def normalize_name_for_ppid(name: str) -> str:
    """Convert name to PPID format: FIRST-LAST

    Titles listed in ``_TITLE_RE`` are removed, each whitespace-separated part
    is reduced to upper-case ASCII alphanumerics, and the non-empty parts are
    joined with ``-``.

    Returns:
        The joined token string, or ``"UNKNOWN"`` when ``name`` is empty or no
        part survives normalization.
    """
    if not name:
        return "UNKNOWN"

    without_titles = _TITLE_RE.sub('', name)
    # Normalize each part exactly once and drop the ones that end up empty.
    tokens = [t for t in map(_normalize_name_part, without_titles.split()) if t]
    return '-'.join(tokens) if tokens else "UNKNOWN"


def extract_email_domain(email: str) -> Optional[str]:
    """Extract domain from email.

    Returns:
        The lower-cased text after the last ``@``, or ``None`` when ``email``
        is empty/``None`` or contains no ``@``.
    """
    if not email or '@' not in email:
        return None
    return email.split('@')[-1].lower()


def generate_wcms_ppid(name: str) -> str:
    """Generate PPID for WCMS user.

    The location/date components are always the ``XX``/``XXXX`` placeholders;
    only the trailing name token varies.
    """
    name_token = normalize_name_for_ppid(name)
    return f"ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_{name_token}"


def _empty_state() -> dict:
    """Return a fresh migration state with no processed users."""
    return {
        'processed_user_ids': [],
        'existing_ppids': [],
        'last_checkpoint': None,
        'stats': {'migrated': 0, 'duplicate': 0, 'error': 0},
    }


def load_state() -> dict:
    """Load migration state from file.

    Returns:
        The state stored in ``STATE_FILE`` merged over the defaults (so a
        state file missing a key still yields a complete dict), or the empty
        default state when the file does not exist.

    Raises:
        OSError / json.JSONDecodeError: if the state file exists but cannot be
            read or parsed.
    """
    state = _empty_state()
    if STATE_FILE.exists():
        with open(STATE_FILE, encoding='utf-8') as f:
            loaded = json.load(f)
        merged_stats = {**state['stats'], **(loaded.get('stats') or {})}
        state.update(loaded)
        state['stats'] = merged_stats
    return state


def save_state(state: dict) -> None:
    """Save migration state to file.

    Stamps ``state['last_checkpoint']`` with the current UTC time, then writes
    the JSON to a sibling temp file and renames it over ``STATE_FILE`` so an
    interrupted write cannot leave a truncated state file behind.
    """
    state['last_checkpoint'] = datetime.now(timezone.utc).isoformat()
    tmp_file = STATE_FILE.with_name(STATE_FILE.name + '.tmp')
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(state, f)
    tmp_file.replace(STATE_FILE)


def transform_wcms_to_ppid(wcms_data: dict, source_file: str) -> Tuple[str, dict]:
    """Transform WCMS user data to PPID profile format.

    Args:
        wcms_data: Parsed WCMS user record.
        source_file: Label of the source file, stored in the metadata sections.

    Returns:
        ``(ppid, ppid_profile)`` where ``ppid`` is the generated identifier and
        ``ppid_profile`` is the JSON-serializable profile dict.
    """
    name = wcms_data.get('full_name') or wcms_data.get('username') or 'Unknown'
    ppid = generate_wcms_ppid(name)
    email_domain = extract_email_domain(wcms_data.get('email', ''))

    # Find Wikipedia URL from entity_resolution if present.
    # `or {}` also covers a record where the key exists but holds null.
    wikipedia_url = (wcms_data.get('entity_resolution') or {}).get('wikipedia_url')

    # One timestamp for both metadata sections of this profile.
    now_iso = datetime.now(timezone.utc).isoformat()

    ppid_profile = {
        "ppid": ppid,
        "ppid_type": "ID",
        "ppid_components": {
            "type": "ID",
            "first_location": "XX-XX-XXX",
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX",
            "last_date": "XXXX",
            "name_tokens": normalize_name_for_ppid(name).split('-'),
        },
        "name": name,
        "birth_date": {
            "edtf": "XXXX",
            "precision": "unknown",
            "note": "Not available from WCMS",
        },
        "is_living": True,
        "is_anonymous": False,

        # WCMS-specific identifiers
        "wcms_identifiers": {
            "user_id": wcms_data.get('user_id'),
            "username": wcms_data.get('username'),
            "username_url": wcms_data.get('username_url'),
            "abs_id": wcms_data.get('abs_id'),
            "crm_id": wcms_data.get('crm_id'),
        },

        # CONTACT DETAILS - FULL email preserved
        "contact_details": {
            "email": wcms_data.get('email'),
            "email_domain": email_domain,
        },

        # Activity data
        "wcms_activity": {
            "status": wcms_data.get('status'),
            "roles": wcms_data.get('roles', []),
            "registered_since": wcms_data.get('registered_since'),
            "last_access": wcms_data.get('last_access'),
            "operations": wcms_data.get('operations', []),
        },

        # Entity resolution - no auto-matching in this fast version
        "entity_resolution": {
            "potential_linkedin_matches": 0,
            "wikipedia_url": wikipedia_url,
            "match_candidates": [],
            "requires_manual_review": False,
            "auto_merged": False,
            "reviewed": False,
            "review_notes": None,
        },

        "profile_classification": {
            "primary_classification": "human",
            "confidence": 0.95,
            "indicators": [{"type": "wcms_user", "reason": "Registered user in heritage CMS system"}],
            "reasoning": "WCMS user profile - registered heritage sector CMS user",
        },

        "data_sources": ["wcms"],

        "extraction_metadata": {
            "extraction_agent": "migrate_wcms_resume.py",
            "extraction_date": now_iso,
            "source_file": source_file,
            "source_system": "WCMS",
            "schema_version": "1.0.0",
        },

        "migration_metadata": {
            "original_wcms_file": source_file,
            "original_user_id": wcms_data.get('user_id'),
            "migrated_at": now_iso,
            "migration_script": "migrate_wcms_resume.py",
            "migration_version": "2.0",
        },
    }

    return ppid, ppid_profile


def _build_state_from_disk() -> dict:
    """Rebuild the migration state by indexing the profiles in PERSON_DIR.

    Every ``ID_*.json`` file name is recorded as an existing PPID; files that
    cannot be read or parsed are counted and skipped (best effort) instead of
    aborting the scan.
    """
    existing_ppids: Set[str] = set()
    processed_user_ids: set = set()
    unreadable = 0

    for count, person_file in enumerate(PERSON_DIR.glob('ID_*.json'), start=1):
        if count % 10000 == 0:
            print(f" Indexed {count:,} files...")
        # The file name is tracked even when the content turns out unreadable,
        # so a later migration never overwrites it.
        existing_ppids.add(person_file.stem)
        try:
            with open(person_file, encoding='utf-8') as fp:
                data = json.load(fp)
            uid = data.get('wcms_identifiers', {}).get('user_id')
            if uid:
                processed_user_ids.add(uid)
        except (OSError, ValueError, AttributeError, TypeError):
            # Unreadable file, invalid JSON/encoding, or unexpected structure.
            unreadable += 1

    if unreadable:
        print(f" Skipped {unreadable:,} unreadable profile files")

    return {
        'processed_user_ids': list(processed_user_ids),
        'existing_ppids': list(existing_ppids),
        'last_checkpoint': datetime.now(timezone.utc).isoformat(),
        'stats': {'migrated': len(processed_user_ids), 'duplicate': 0, 'error': 0},
    }


def _collect_wcms_files() -> List[Tuple[str, Path]]:
    """Return ``(source_type, path)`` pairs for all WCMS source files."""
    # Use recursive glob to find files in subdirectories AND at top level
    wcms_files = [
        ('users', f)
        for f in WCMS_USERS_DIR.glob('**/user_*.json')
        if not f.name.startswith('._')  # Skip macOS hidden files
    ]
    if WCMS_USERS_NEW_DIR.exists():
        wcms_files.extend(
            ('users_new', f)
            for f in WCMS_USERS_NEW_DIR.glob('*.json')
            if not f.name.startswith('._')
        )
    return wcms_files


def _reserve_output_path(ppid: str, ppid_profile: dict, existing_ppids: Set[str]) -> Path:
    """Pick a collision-free output path for a profile.

    When ``ppid`` is already tracked, or a file with the chosen name exists on
    disk, an 8-character UUID suffix is appended and recorded in the profile
    (``ppid`` and ``ppid_components.collision_uuid``).
    """
    output_ppid = ppid
    needs_suffix = ppid in existing_ppids
    while True:
        if needs_suffix:
            short_uuid = str(uuid.uuid4())[:8]
            output_ppid = f"{ppid}-{short_uuid}"
            ppid_profile['ppid'] = output_ppid
            ppid_profile['ppid_components']['collision_uuid'] = short_uuid
        output_file = PERSON_DIR / f"{output_ppid}.json"
        # Double-check file doesn't exist
        if not output_file.exists():
            return output_file
        needs_suffix = True


def _checkpoint(state: dict, processed_user_ids: set, existing_ppids: Set[str],
                stats: Dict[str, int]) -> None:
    """Copy the in-memory progress into ``state`` and persist it."""
    state['processed_user_ids'] = list(processed_user_ids)
    state['existing_ppids'] = list(existing_ppids)
    state['stats'] = stats
    save_state(state)


def main():
    """Run the resumable WCMS -> PPID migration from the command line."""
    parser = argparse.ArgumentParser(description='Fast WCMS migration with checkpointing')
    parser.add_argument('--dry-run', action='store_true', help='Preview only')
    parser.add_argument('--limit', type=int, default=None, help='Limit files to process')
    parser.add_argument('--batch-size', type=int, default=5000, help='Save checkpoint every N files')
    parser.add_argument('--rebuild-state', action='store_true', help='Rebuild state from existing files')
    args = parser.parse_args()

    # A batch size of 0 would raise ZeroDivisionError in the checkpoint test.
    if args.batch_size < 1:
        parser.error('--batch-size must be a positive integer')

    print("=" * 70)
    print("WCMS MIGRATION (RESUME MODE)")
    print("=" * 70)

    # Check KINGSTON mount
    if not WCMS_USERS_DIR.exists():
        print(f"\nERROR: KINGSTON not mounted: {WCMS_USERS_DIR}")
        return

    # Load or rebuild state
    if args.rebuild_state or not STATE_FILE.exists():
        print("\nPhase 1: Building state from existing files...")
        print(" This may take a while for 190K+ files...")
        state = _build_state_from_disk()
        save_state(state)
        print(f" State built: {len(state['processed_user_ids']):,} WCMS user IDs, "
              f"{len(state['existing_ppids']):,} PPIDs")
    else:
        print("\nPhase 1: Loading state from file...")
        state = load_state()
        print(f" Loaded: {len(state['processed_user_ids']):,} processed WCMS user IDs")
        print(f" Last checkpoint: {state['last_checkpoint']}")

    processed_user_ids = set(state['processed_user_ids'])
    existing_ppids = set(state['existing_ppids'])
    stats = state['stats']

    # Collect WCMS files
    print("\nPhase 2: Collecting WCMS files...")
    wcms_files = _collect_wcms_files()

    print(f" Found {len(wcms_files):,} WCMS source files")
    print(f" Already processed: {len(processed_user_ids):,}")
    # Clamp: the processed set can be larger than the current source listing.
    print(f" Remaining: ~{max(0, len(wcms_files) - len(processed_user_ids)):,}")

    if args.limit:
        wcms_files = wcms_files[:args.limit]
        print(f" Limited to {args.limit} files")

    # Process WCMS files
    print(f"\nPhase 3: Processing (dry_run={args.dry_run}, batch_size={args.batch_size})...")

    batch_migrated = 0
    batch_skipped = 0
    batch_errors = 0

    for i, (source_type, wcms_file) in enumerate(wcms_files):
        try:
            with open(wcms_file, encoding='utf-8') as f:
                wcms_data = json.load(f)

            user_id = wcms_data.get('user_id')

            if user_id and user_id in processed_user_ids:
                # Already processed. No `continue` here: the progress and
                # checkpoint step below must run for skipped files too.
                batch_skipped += 1
            else:
                # Transform to PPID
                ppid, ppid_profile = transform_wcms_to_ppid(
                    wcms_data, f"{source_type}/{wcms_file.name}"
                )

                # Handle PPID filename collision
                output_file = _reserve_output_path(ppid, ppid_profile, existing_ppids)

                if not args.dry_run:
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(ppid_profile, f, indent=2, ensure_ascii=False)

                # Tracked in dry-run too, so collision handling and the
                # summary counts reflect what a real run would do.
                existing_ppids.add(ppid_profile['ppid'])
                if user_id:
                    processed_user_ids.add(user_id)
                batch_migrated += 1
                stats['migrated'] += 1

        except Exception as e:
            # Per-file boundary: one bad record must not stop the migration.
            batch_errors += 1
            stats['error'] += 1
            if batch_errors <= 5:
                print(f" ERROR: {wcms_file.name}: {e}")

        # Progress and checkpoint
        if (i + 1) % args.batch_size == 0:
            pct = ((i + 1) / len(wcms_files)) * 100
            print(f" Progress: {i+1:,}/{len(wcms_files):,} ({pct:.1f}%) - "
                  f"Batch: +{batch_migrated:,} new, {batch_skipped:,} skip, {batch_errors} err")

            # Save checkpoint
            if not args.dry_run:
                _checkpoint(state, processed_user_ids, existing_ppids, stats)
                print(f" Checkpoint saved at {i+1:,}")

            batch_migrated = 0
            batch_skipped = 0
            batch_errors = 0

    # Final checkpoint
    if not args.dry_run:
        _checkpoint(state, processed_user_ids, existing_ppids, stats)

    # Summary
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}MIGRATION SUMMARY")
    print("=" * 70)
    print(f" Total WCMS source files: {len(wcms_files):,}")
    print(f" Total migrated (cumulative): {stats['migrated']:,}")
    print(f" Errors: {stats['error']}")
    print(f" WCMS user IDs in state: {len(processed_user_ids):,}")
    print(f" PPID files tracked: {len(existing_ppids):,}")

    if not args.dry_run:
        print(f"\n State saved to: {STATE_FILE}")


if __name__ == '__main__':
    main()