357 lines
12 KiB
Python
357 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fast WCMS migration with state file checkpointing.
|
|
|
|
This is an optimized version that:
|
|
1. Uses a state file to track processed user IDs (no scanning 190K+ files)
|
|
2. Processes in batches with checkpoints
|
|
3. Resumes from where it left off
|
|
|
|
Usage:
|
|
python scripts/migrate_wcms_resume.py --batch-size 10000
|
|
python scripts/migrate_wcms_resume.py --dry-run --limit 100
|
|
"""
|
|
|
|
import json
|
|
import argparse
|
|
import re
|
|
import uuid
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
import unicodedata
|
|
from typing import Dict, Optional, Set
|
|
|
|
# Paths
|
|
WCMS_USERS_DIR = Path('/Volumes/KINGSTON/data/wcms/data/person_profiles/users')
|
|
WCMS_USERS_NEW_DIR = Path('/Volumes/KINGSTON/data/wcms/data/person_profiles/users_new')
|
|
PERSON_DIR = Path('/Users/kempersc/apps/glam/data/person')
|
|
STATE_FILE = Path('/Users/kempersc/apps/glam/data/wcms_migration_state.json')
|
|
|
|
|
|
def normalize_name_for_ppid(name: str) -> str:
    """Convert a personal name to the PPID name-token format: FIRST-LAST.

    Strips common honorifics/degree suffixes (Dr, Prof, PhD, ...),
    transliterates accined characters to plain ASCII via NFKD decomposition,
    removes all non-alphanumerics, upper-cases each remaining part, and joins
    the parts with hyphens. Returns "UNKNOWN" when no usable tokens remain.
    """
    if not name:
        return "UNKNOWN"

    # Remove titles/suffixes (with optional trailing dot) before tokenizing.
    name = re.sub(r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr)\b\.?', '', name, flags=re.IGNORECASE)
    parts = [p.strip() for p in name.split() if p.strip()]
    if not parts:
        return "UNKNOWN"

    def normalize_part(p: str) -> str:
        # NFKD splits accented characters into base + combining marks;
        # dropping the combining marks transliterates e.g. 'é' -> 'e'.
        nfkd = unicodedata.normalize('NFKD', p)
        ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))
        return re.sub(r'[^A-Za-z0-9]', '', ascii_name).upper()

    # Evaluate normalize_part once per token (the original comprehension
    # called it twice: once for the filter, once for the value).
    normalized = [token for token in map(normalize_part, parts) if token]
    return '-'.join(normalized) if normalized else "UNKNOWN"
|
|
|
|
|
|
def extract_email_domain(email: str) -> Optional[str]:
    """Return the lower-cased domain part of *email*, or None when absent."""
    if not email or '@' not in email:
        return None
    # rpartition keeps everything after the LAST '@', matching split('@')[-1].
    _, _, domain = email.rpartition('@')
    return domain.lower()
|
|
|
|
|
|
def generate_wcms_ppid(name: str) -> str:
    """Build a placeholder PPID for a WCMS user.

    Locations and dates are unknown for WCMS users, so the fixed XX/XXXX
    placeholders are used; only the name token varies.
    """
    token = normalize_name_for_ppid(name)
    return "ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_" + token
|
|
|
|
|
|
def load_state() -> dict:
    """Load migration state from STATE_FILE, or return a fresh default state."""
    if not STATE_FILE.exists():
        # First run (or state deleted): start with empty tracking lists.
        return {
            'processed_user_ids': [],
            'existing_ppids': [],
            'last_checkpoint': None,
            'stats': {'migrated': 0, 'duplicate': 0, 'error': 0},
        }
    with open(STATE_FILE) as f:
        return json.load(f)
|
|
|
|
|
|
def save_state(state: dict):
    """Persist *state* to STATE_FILE, stamping the checkpoint time (UTC)."""
    state['last_checkpoint'] = datetime.now(timezone.utc).isoformat()
    STATE_FILE.write_text(json.dumps(state))
|
|
|
|
|
|
def transform_wcms_to_ppid(wcms_data: dict, source_file: str) -> tuple:
    """Transform a WCMS user record into a PPID profile document.

    Args:
        wcms_data: Parsed JSON dict of one WCMS user file.
        source_file: Relative source path recorded in the profile metadata.

    Returns:
        (ppid, ppid_profile): the generated placeholder PPID and the full
        profile dict ready to be serialized to disk.
    """
    name = wcms_data.get('full_name') or wcms_data.get('username') or 'Unknown'
    ppid = generate_wcms_ppid(name)
    email_domain = extract_email_domain(wcms_data.get('email', ''))

    # Single timestamp so extraction_metadata and migration_metadata agree
    # exactly (the original called datetime.now() once per section, which
    # could yield two slightly different values).
    now_iso = datetime.now(timezone.utc).isoformat()

    # Find Wikipedia URL from entity_resolution if present. Guard against the
    # key existing with a None value: dict.get's default only applies when
    # the key is missing, so .get('entity_resolution', {}) could still hand
    # back None and crash on the chained .get().
    wikipedia_url = (wcms_data.get('entity_resolution') or {}).get('wikipedia_url')

    ppid_profile = {
        "ppid": ppid,
        "ppid_type": "ID",
        "ppid_components": {
            "type": "ID",
            # Location/date components are unknown for WCMS users.
            "first_location": "XX-XX-XXX",
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX",
            "last_date": "XXXX",
            "name_tokens": normalize_name_for_ppid(name).split('-')
        },
        "name": name,
        "birth_date": {
            "edtf": "XXXX",
            "precision": "unknown",
            "note": "Not available from WCMS"
        },
        "is_living": True,
        "is_anonymous": False,

        # WCMS-specific identifiers
        "wcms_identifiers": {
            "user_id": wcms_data.get('user_id'),
            "username": wcms_data.get('username'),
            "username_url": wcms_data.get('username_url'),
            "abs_id": wcms_data.get('abs_id'),
            "crm_id": wcms_data.get('crm_id'),
        },

        # CONTACT DETAILS - FULL email preserved
        "contact_details": {
            "email": wcms_data.get('email'),
            "email_domain": email_domain,
        },

        # Activity data
        "wcms_activity": {
            "status": wcms_data.get('status'),
            "roles": wcms_data.get('roles', []),
            "registered_since": wcms_data.get('registered_since'),
            "last_access": wcms_data.get('last_access'),
            "operations": wcms_data.get('operations', []),
        },

        # Entity resolution - no auto-matching in this fast version
        "entity_resolution": {
            "potential_linkedin_matches": 0,
            "wikipedia_url": wikipedia_url,
            "match_candidates": [],
            "requires_manual_review": False,
            "auto_merged": False,
            "reviewed": False,
            "review_notes": None,
        },

        "profile_classification": {
            "primary_classification": "human",
            "confidence": 0.95,
            "indicators": [{"type": "wcms_user", "reason": "Registered user in heritage CMS system"}],
            "reasoning": "WCMS user profile - registered heritage sector CMS user"
        },

        "data_sources": ["wcms"],

        "extraction_metadata": {
            "extraction_agent": "migrate_wcms_resume.py",
            "extraction_date": now_iso,
            "source_file": source_file,
            "source_system": "WCMS",
            "schema_version": "1.0.0"
        },

        "migration_metadata": {
            "original_wcms_file": source_file,
            "original_user_id": wcms_data.get('user_id'),
            "migrated_at": now_iso,
            "migration_script": "migrate_wcms_resume.py",
            "migration_version": "2.0"
        }
    }

    return ppid, ppid_profile
|
|
|
|
|
|
def main():
    """Run the checkpointed WCMS-to-PPID migration.

    Phases:
      1. Load (or rebuild) the state file tracking processed user IDs/PPIDs.
      2. Collect WCMS source files from the users and users_new directories.
      3. Transform each unprocessed record to a PPID profile, writing output
         and saving a checkpoint every --batch-size files.
    """
    parser = argparse.ArgumentParser(description='Fast WCMS migration with checkpointing')
    parser.add_argument('--dry-run', action='store_true', help='Preview only')
    parser.add_argument('--limit', type=int, default=None, help='Limit files to process')
    parser.add_argument('--batch-size', type=int, default=5000, help='Save checkpoint every N files')
    parser.add_argument('--rebuild-state', action='store_true', help='Rebuild state from existing files')
    args = parser.parse_args()

    print("=" * 70)
    print("WCMS MIGRATION (RESUME MODE)")
    print("=" * 70)

    # Check KINGSTON mount — bail out early if the source volume is absent.
    if not WCMS_USERS_DIR.exists():
        print(f"\nERROR: KINGSTON not mounted: {WCMS_USERS_DIR}")
        return

    # Load or rebuild state
    if args.rebuild_state or not STATE_FILE.exists():
        print("\nPhase 1: Building state from existing files...")
        print(" This may take a while for 190K+ files...")

        existing_ppids = set()
        processed_user_ids = set()
        count = 0

        for f in PERSON_DIR.glob('ID_*.json'):
            count += 1
            if count % 10000 == 0:
                print(f" Indexed {count:,} files...")

            try:
                existing_ppids.add(f.stem)
                with open(f) as fp:
                    data = json.load(fp)
                uid = data.get('wcms_identifiers', {}).get('user_id')
                if uid:
                    processed_user_ids.add(uid)
            except (OSError, ValueError):
                # Best-effort: skip unreadable/corrupt profiles but keep
                # indexing. (Narrowed from a bare `except:` that also
                # swallowed KeyboardInterrupt/SystemExit.)
                pass

        state = {
            'processed_user_ids': list(processed_user_ids),
            'existing_ppids': list(existing_ppids),
            'last_checkpoint': datetime.now(timezone.utc).isoformat(),
            'stats': {'migrated': len(processed_user_ids), 'duplicate': 0, 'error': 0}
        }
        save_state(state)
        print(f" State built: {len(processed_user_ids):,} WCMS user IDs, {len(existing_ppids):,} PPIDs")
    else:
        print("\nPhase 1: Loading state from file...")
        state = load_state()
        print(f" Loaded: {len(state['processed_user_ids']):,} processed WCMS user IDs")
        print(f" Last checkpoint: {state['last_checkpoint']}")

    processed_user_ids = set(state['processed_user_ids'])
    existing_ppids = set(state['existing_ppids'])
    stats = state['stats']

    def _persist_state():
        # Sync the in-memory tracking sets back into the state dict and save.
        # (Shared by the per-batch checkpoint and the final save.)
        state['processed_user_ids'] = list(processed_user_ids)
        state['existing_ppids'] = list(existing_ppids)
        state['stats'] = stats
        save_state(state)

    # Collect WCMS files
    print("\nPhase 2: Collecting WCMS files...")
    wcms_files = []

    # Use recursive glob to find files in subdirectories AND at top level
    for f in WCMS_USERS_DIR.glob('**/user_*.json'):
        if not f.name.startswith('._'):  # Skip macOS hidden files
            wcms_files.append(('users', f))

    if WCMS_USERS_NEW_DIR.exists():
        for f in WCMS_USERS_NEW_DIR.glob('*.json'):
            if not f.name.startswith('._'):
                wcms_files.append(('users_new', f))

    print(f" Found {len(wcms_files):,} WCMS source files")
    print(f" Already processed: {len(processed_user_ids):,}")
    print(f" Remaining: ~{len(wcms_files) - len(processed_user_ids):,}")

    if args.limit:
        wcms_files = wcms_files[:args.limit]
        print(f" Limited to {args.limit} files")

    # Process WCMS files
    print(f"\nPhase 3: Processing (dry_run={args.dry_run}, batch_size={args.batch_size})...")

    batch_migrated = 0
    batch_skipped = 0
    batch_errors = 0

    for i, (source_type, wcms_file) in enumerate(wcms_files):
        try:
            with open(wcms_file) as f:
                wcms_data = json.load(f)

            user_id = wcms_data.get('user_id')

            # Skip if already processed
            if user_id and user_id in processed_user_ids:
                batch_skipped += 1
                continue

            # Transform to PPID
            ppid, ppid_profile = transform_wcms_to_ppid(
                wcms_data,
                f"{source_type}/{wcms_file.name}"
            )

            # Handle PPID filename collision by appending a short UUID.
            output_ppid = ppid
            if ppid in existing_ppids:
                short_uuid = str(uuid.uuid4())[:8]
                output_ppid = f"{ppid}-{short_uuid}"
                ppid_profile['ppid'] = output_ppid
                ppid_profile['ppid_components']['collision_uuid'] = short_uuid

            output_file = PERSON_DIR / f"{output_ppid}.json"

            # Double-check on disk: the in-memory set may lag reality.
            while output_file.exists():
                short_uuid = str(uuid.uuid4())[:8]
                output_ppid = f"{ppid}-{short_uuid}"
                ppid_profile['ppid'] = output_ppid
                ppid_profile['ppid_components']['collision_uuid'] = short_uuid
                output_file = PERSON_DIR / f"{output_ppid}.json"

            if not args.dry_run:
                with open(output_file, 'w') as f:
                    json.dump(ppid_profile, f, indent=2, ensure_ascii=False)

            existing_ppids.add(output_ppid)
            if user_id:
                processed_user_ids.add(user_id)

            batch_migrated += 1
            stats['migrated'] += 1

        except Exception as e:
            batch_errors += 1
            stats['error'] += 1
            if batch_errors <= 5:  # Cap error spam per batch
                print(f" ERROR: {wcms_file.name}: {e}")

        # Progress and checkpoint
        if (i + 1) % args.batch_size == 0:
            pct = ((i + 1) / len(wcms_files)) * 100
            print(f" Progress: {i+1:,}/{len(wcms_files):,} ({pct:.1f}%) - "
                  f"Batch: +{batch_migrated:,} new, {batch_skipped:,} skip, {batch_errors} err")

            # Save checkpoint (skipped in dry-run so state stays untouched)
            if not args.dry_run:
                _persist_state()
                print(f" Checkpoint saved at {i+1:,}")

            batch_migrated = 0
            batch_skipped = 0
            batch_errors = 0

    # Final checkpoint
    if not args.dry_run:
        _persist_state()

    # Summary
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}MIGRATION SUMMARY")
    print("=" * 70)
    print(f" Total WCMS source files: {len(wcms_files):,}")
    print(f" Total migrated (cumulative): {stats['migrated']:,}")
    print(f" Errors: {stats['error']}")
    print(f" WCMS user IDs in state: {len(processed_user_ids):,}")
    print(f" PPID files tracked: {len(existing_ppids):,}")

    if not args.dry_run:
        print(f"\n State saved to: {STATE_FILE}")
|
|
|
|
|
|
# Script entry point: run the migration when executed directly.
if __name__ == '__main__':
    main()
|