#!/usr/bin/env python3
"""
Migrate entity profiles from data/custodian/person/entity/ to data/person/

This script (v3) optimizes for large-scale migration:
1. Pre-builds an index of existing PPID filenames (fast)
2. Processes entity files in batches with progress reporting
3. Uses multiprocessing for parallel file operations
4. Never overwrites an existing PPID file: a profile whose PPID is already
   taken (on disk, or claimed by another worker in the same batch) is
   reported as "exists"

Usage:
    python scripts/migrate_entity_to_ppid_v3.py --dry-run --limit 100  # Preview 100 profiles
    python scripts/migrate_entity_to_ppid_v3.py --dry-run              # Preview all
    python scripts/migrate_entity_to_ppid_v3.py                        # Execute migration
"""

import json
import argparse
import re
from pathlib import Path
from datetime import datetime, timezone
import unicodedata
from multiprocessing import Pool, cpu_count
import os

# Patterns for detecting non-human profiles (institutions, not people)
# NOTE: LinkedIn Member is INCLUDED - they are real people with privacy settings
NON_HUMAN_PATTERNS = [
    r'^TheMuseumsLab$',
    r'^Piet Blom Museum$',          # Specific institution profile
    r'^Limburgs Museum$',           # Specific institution profile
    r'^Miniature Museum$',          # Specific institution profile
    r'^Stichting\s',                # Dutch foundation names
    r'^ICOM\s',                     # ICOM organization
    r'^Fondazione\s',               # Italian foundation
    r'^Google\s',                   # Company profiles (Google DeepMind etc)
    r'^Sound\s+Heritage$',          # Specific organization
    r'^Company\s+name\s',           # Parsing artifact "Company name X"
    r'^Computational\s+Research$',  # Specific organization
]

# Patterns for organization profiles that should be excluded
# These end with institutional suffixes and have NO personal LinkedIn URL
INSTITUTION_SUFFIX_PATTERNS = [
    r'Museum$',
    r'Foundation$',
    r'Institute$',
    r'Organisation$',
    r'Organization$',
    r'University$',
]

# Titles/degrees removed from a name before it is tokenised.
# NOTE(review): matching is case-insensitive, so a real name part such as
# "Ma" is stripped as well. Left unchanged on purpose: PPIDs are filenames and
# changing their derivation would break matching against earlier runs.
_TITLE_RE = re.compile(
    r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr|PSM|GIA|GG)\b\.?',
    re.IGNORECASE,
)
_NON_LETTER_RE = re.compile(r'[^A-Za-z]')


def is_human_profile(name, profile_data):
    """Determine if profile represents a human being (not an institution).

    LinkedIn Member profiles ARE included - they are real people with
    privacy settings. They have job titles and affiliations, just no
    visible name.

    Args:
        name: Display name of the profile; empty/None is never human.
        profile_data: The profile's ``profile_data`` mapping (may be None);
            only its ``linkedin_url`` key is read.

    Returns:
        True if the profile should be migrated as a person, else False.
    """
    if not name:
        return False

    # Explicit non-human names (specific organizations).
    if any(re.search(p, name, re.IGNORECASE) for p in NON_HUMAN_PATTERNS):
        return False

    # Institution suffixes only exclude a profile that has NO personal
    # LinkedIn URL (real people named e.g. "Jan Museum" have an /in/ URL).
    linkedin_url = (profile_data or {}).get('linkedin_url') or ''
    if '/in/' not in linkedin_url:
        if any(re.search(p, name, re.IGNORECASE)
               for p in INSTITUTION_SUFFIX_PATTERNS):
            return False

    return True


def normalize_name_for_ppid(name):
    """Convert name to PPID format: FIRST-LAST.

    Titles are removed, diacritics are dropped via NFKD decomposition, all
    remaining non-ASCII-letter characters are discarded and the parts are
    upper-cased and joined with hyphens.

    Returns:
        The hyphen-joined token string, or "UNKNOWN" when nothing is left.
    """
    if not name:
        return "UNKNOWN"

    stripped = _TITLE_RE.sub('', name)

    tokens = []
    for part in stripped.split():
        decomposed = unicodedata.normalize('NFKD', part)
        without_marks = ''.join(
            ch for ch in decomposed if not unicodedata.combining(ch)
        )
        token = _NON_LETTER_RE.sub('', without_marks).upper()
        if token:
            tokens.append(token)

    return '-'.join(tokens) if tokens else "UNKNOWN"


def generate_ppid(name, entity_data=None):
    """Generate PPID from name (locations/dates use XX placeholders).

    For LinkedIn Member profiles, the first affiliation's custodian name and
    the first words of the headline are used to build an ``ANON-...`` token.

    Args:
        name: Profile display name.
        entity_data: Full entity record; only consulted for the literal name
            'LinkedIn Member'.

    Returns:
        The PPID string ``ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_<NAME-TOKEN>``.
    """
    if name == 'LinkedIn Member' and entity_data:
        affiliations = entity_data.get('affiliations') or []
        headline = (entity_data.get('profile_data') or {}).get('headline', '')

        if affiliations:
            org = affiliations[0].get('custodian_name', 'UNKNOWN-ORG')
            org_token = normalize_name_for_ppid(org)

            role_token = 'STAFF'
            if headline:
                # Keep up to two usable tokens out of the first three words.
                role_words = []
                for word in headline.split()[:3]:
                    normalized = normalize_name_for_ppid(word)
                    if normalized and len(normalized) > 2:
                        role_words.append(normalized)
                if role_words:
                    role_token = '-'.join(role_words[:2])

            name_token = f"ANON-{org_token[:20]}-{role_token[:15]}"
        else:
            name_token = "LINKEDIN-MEMBER"
    else:
        name_token = normalize_name_for_ppid(name)

    return f"ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_{name_token}"


def transform_entity_to_ppid(entity_data, entity_file_name):
    """Transform entity profile to PPID format, preserving ALL data.

    Args:
        entity_data: Parsed entity JSON object.
        entity_file_name: Base name of the source file, recorded in
            ``migration_metadata``.

    Returns:
        Tuple ``(ppid, ppid_profile)``.
    """
    name = ((entity_data.get('profile_data') or {}).get('name')
            or entity_data.get('name', 'Unknown'))
    ppid = generate_ppid(name, entity_data)

    is_anonymous = (name == 'LinkedIn Member')

    if is_anonymous:
        # For anonymous profiles the tokens come from the PPID's last field.
        name_tokens = ppid.split('_')[-1].split('-')
    else:
        name_tokens = normalize_name_for_ppid(name).split('-')

    ppid_profile = {
        "ppid": ppid,
        "ppid_type": "ID",
        "ppid_components": {
            "type": "ID",
            "first_location": "XX-XX-XXX",
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX",
            "last_date": "XXXX",
            "name_tokens": name_tokens,
            "is_anonymous": is_anonymous
        },
        "name": name,
        "birth_date": {
            "edtf": "XXXX",
            "precision": "unknown",
            "note": "Not yet enriched - requires manual research"
        },
        "is_living": True,
        "is_anonymous": is_anonymous,  # Top-level flag for easy filtering
        "heritage_relevance": entity_data.get('heritage_relevance', {
            "is_heritage_relevant": True,
            "heritage_types": [],
            "rationale": "Extracted from heritage custodian LinkedIn page"
        }),
        "affiliations": entity_data.get('affiliations', []),
        "profile_data": entity_data.get('profile_data', {}),
        "web_claims": entity_data.get('web_claims', []),
        "source_observations": entity_data.get('source_observations', []),
        "extraction_metadata": entity_data.get('extraction_metadata', {}),
        "migration_metadata": {
            "original_entity_file": entity_file_name,
            "original_person_id": entity_data.get('person_id'),
            "original_linkedin_slug": entity_data.get('linkedin_slug'),
            "migrated_at": datetime.now(timezone.utc).isoformat(),
            "migration_script": "migrate_entity_to_ppid_v3.py",
            "migration_version": "3.0"
        }
    }

    return ppid, ppid_profile


def process_entity_file(args):
    """Process a single entity file.

    Args:
        args: Tuple ``(entity_file_path, existing_ppids_set, person_dir,
            dry_run)`` (a single tuple so it can be used with ``Pool.map``).

    Returns:
        ``(status, ppid, file_path)`` or ``(status, reason, file_path)`` where
        status is one of 'migrated', 'exists', 'skip', 'error'.
    """
    entity_file_path, existing_ppids_set, person_dir, dry_run = args
    source = str(entity_file_path)

    try:
        # JSON is UTF-8; do not depend on the platform's locale encoding.
        with open(entity_file_path, encoding='utf-8') as f:
            data = json.load(f)

        profile_data = data.get('profile_data') or {}
        name = profile_data.get('name') or data.get('name', '')

        if not is_human_profile(name, profile_data):
            return ('skip', 'non-human', source)

        ppid, ppid_profile = transform_entity_to_ppid(
            data, Path(entity_file_path).name
        )

        if ppid in existing_ppids_set:
            return ('exists', ppid, source)

        if not dry_run:
            # Serialise first so a failure cannot leave a truncated file.
            payload = json.dumps(ppid_profile, indent=2, ensure_ascii=False)
            output_file = Path(person_dir) / f"{ppid}.json"
            try:
                # 'x' = exclusive creation: workers in the same batch share a
                # stale snapshot of existing PPIDs, so the filesystem is the
                # only reliable arbiter. Never overwrite another profile.
                with open(output_file, 'x', encoding='utf-8') as f:
                    f.write(payload)
            except FileExistsError:
                return ('exists', ppid, source)

        return ('migrated', ppid, source)

    except Exception as e:  # worker boundary: report, never crash the pool
        return ('error', f"{type(e).__name__}: {e}", source)


def main():
    """Command-line entry point: index, list, migrate in batches, summarise."""
    parser = argparse.ArgumentParser(
        description='Migrate entity profiles to PPID format (v3 - optimized)'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview only, no file changes')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of profiles to process')
    parser.add_argument('--workers', type=int, default=4,
                        help='Number of parallel workers')
    parser.add_argument('--verbose', action='store_true',
                        help='Show each migrated file')
    parser.add_argument(
        '--entity-dir', type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian/person/entity'),
        help='Directory containing the source entity JSON files')
    parser.add_argument(
        '--person-dir', type=Path,
        default=Path('/Users/kempersc/apps/glam/data/person'),
        help='Directory that receives the PPID JSON files')
    args = parser.parse_args()

    entity_dir = args.entity_dir
    person_dir = args.person_dir

    print("=" * 70)
    print("PPID MIGRATION SCRIPT v3.0 (Optimized)")
    print("=" * 70)

    # Phase 1: Build index of existing PPID filenames
    print("\nPhase 1: Indexing existing PPID files...")
    existing_ppids = {f.stem for f in person_dir.glob('ID_*.json')}
    print(f" Found {len(existing_ppids):,} existing PPID files")

    # Phase 2: List entity files (sorted so --limit is reproducible)
    print("\nPhase 2: Listing entity files...")
    entity_files = sorted(entity_dir.glob('*.json'))
    total_entity = len(entity_files)
    print(f" Found {total_entity:,} entity files")

    if args.limit is not None:
        entity_files = entity_files[:args.limit]
        print(f" Limited to {args.limit} files for this run")

    # Phase 3: Process files
    print(f"\nPhase 3: Processing files (workers={args.workers}, dry_run={args.dry_run})...")

    # The same set object is shared by every task tuple; it is pickled when a
    # batch is submitted, so workers see PPIDs claimed by earlier batches.
    process_args = [
        (str(f), existing_ppids, str(person_dir), args.dry_run)
        for f in entity_files
    ]

    results = {'migrated': 0, 'exists': 0, 'skip': 0, 'error': 0}
    migrated_samples = []

    batch_size = 1000
    total = len(process_args)

    if process_args:
        # One pool for the whole run instead of one per batch.
        with Pool(args.workers) as pool:
            for batch_start in range(0, total, batch_size):
                batch_end = min(batch_start + batch_size, total)
                batch = process_args[batch_start:batch_end]

                batch_results = pool.map(process_entity_file, batch)

                for status, detail, file_path in batch_results:
                    if status == 'migrated' and detail in existing_ppids:
                        # Same PPID already claimed earlier in this batch
                        # (only reachable in dry-run, where nothing is
                        # written): count it the way a real run would.
                        status = 'exists'
                    results[status] += 1
                    if status == 'migrated':
                        existing_ppids.add(detail)
                        if args.verbose or len(migrated_samples) < 5:
                            migrated_samples.append(
                                (detail, Path(file_path).name)
                            )
                    if status == 'error':
                        print(f" ERROR: {file_path}: {detail}")

                # Progress report
                processed = batch_end
                pct = (processed / total) * 100
                print(f" Progress: {processed:,}/{total:,} ({pct:.1f}%) - "
                      f"Migrated: {results['migrated']:,}, Exists: {results['exists']:,}, "
                      f"Skip: {results['skip']:,}, Errors: {results['error']}")

    # Summary
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}MIGRATION SUMMARY")
    print("=" * 70)
    print(f" Total processed: {sum(results.values()):,}")
    print(f" Migrated (new): {results['migrated']:,}")
    print(f" Already exists: {results['exists']:,}")
    print(f" Skipped (non-human): {results['skip']:,}")
    print(f" Errors: {results['error']}")

    if migrated_samples:
        print("\n Sample migrated profiles:")
        # --verbose lists every migrated file; otherwise only the first five.
        shown = migrated_samples if args.verbose else migrated_samples[:5]
        for ppid, source in shown:
            print(f" {ppid} <- {source}")

    if args.dry_run:
        print("\n To execute migration, run without --dry-run flag")
    else:
        final_count = sum(1 for _ in person_dir.glob('ID_*.json'))
        print("\n Migration complete!")
        print(f" Final PPID count: {final_count:,}")


if __name__ == '__main__':
    main()