#!/usr/bin/env python3
"""Migrate entity profiles from data/custodian/person/entity/ to data/person/

This script (v4) processes ALL entries:
1. NO filtering - every profile is migrated
2. Adds classification tags indicating human vs institution likelihood
3. Handles collisions with UUID suffix (not counter)
4. Preserves ALL data with full provenance

Usage:
    python scripts/migrate_entity_to_ppid_v4.py --dry-run --limit 100  # Preview 100 profiles
    python scripts/migrate_entity_to_ppid_v4.py --dry-run              # Preview all
    python scripts/migrate_entity_to_ppid_v4.py                        # Execute migration
"""

import json
import argparse
import re
import uuid
from pathlib import Path
from datetime import datetime, timezone
import unicodedata
from multiprocessing import Pool
from typing import Dict, List, Tuple, Any

# Default directories (overridable via --entity-dir / --person-dir).
DEFAULT_ENTITY_DIR = '/Users/kempersc/apps/glam/data/custodian/person/entity'
DEFAULT_PERSON_DIR = '/Users/kempersc/apps/glam/data/person'

# Patterns that suggest this might be an INSTITUTION (not a person).
# Each entry: (regex, indicator_type, human-readable reason).
# NOTE(review): these are matched with re.IGNORECASE below, so the
# capitalization in the patterns is documentation only.
INSTITUTION_INDICATORS = [
    (r'^Company\s+name\s', 'parsing_artifact', 'Profile name starts with "Company name" - likely parsing artifact'),
    (r'^Stichting\s', 'dutch_foundation', 'Dutch foundation (Stichting)'),
    (r'^Fondazione\s', 'italian_foundation', 'Italian foundation (Fondazione)'),
    (r'^ICOM\s', 'icom_organization', 'ICOM organization'),
    (r'^Google\s', 'company_profile', 'Google company profile'),
    (r'^TheMuseumsLab$', 'organization', 'TheMuseumsLab organization'),
    (r'^Sound\s+Heritage$', 'organization', 'Sound Heritage organization'),
    (r'^Computational\s+Research$', 'organization', 'Computational Research organization'),
    (r'Museum$', 'museum_suffix', 'Name ends with "Museum" - likely institution'),
    (r'Foundation$', 'foundation_suffix', 'Name ends with "Foundation" - likely institution'),
    (r'Institute$', 'institute_suffix', 'Name ends with "Institute" - likely institution'),
    (r'Organisation$', 'org_suffix', 'Name ends with "Organisation" - likely institution'),
    (r'Organization$', 'org_suffix', 'Name ends with "Organization" - likely institution'),
    (r'University$', 'university_suffix', 'Name ends with "University" - likely institution'),
    (r'Library$', 'library_suffix', 'Name ends with "Library" - likely institution'),
    (r'Archive$', 'archive_suffix', 'Name ends with "Archive" - likely institution'),
    (r'Archief$', 'archive_suffix', 'Name ends with "Archief" (Dutch archive) - likely institution'),
    (r'Bibliotheek$', 'library_suffix', 'Name ends with "Bibliotheek" (Dutch library) - likely institution'),
]

# Patterns that suggest this is a PERSON.
PERSON_INDICATORS = [
    (r'^(Dr|Prof|Mr|Mrs|Ms|Drs|Ir|Ing)\.\s', 'title_prefix', 'Has personal title prefix'),
    (r'\s(PhD|MA|MSc|MBA|BSc|Jr|Sr)$', 'degree_suffix', 'Has degree/suffix'),
    (r'^[A-Z][a-z]+\s+[A-Z][a-z]+$', 'two_word_name', 'Simple two-word personal name pattern'),
    (r'^[A-Z][a-z]+\s+(van|de|den|der|von|van der|van den|van de)\s+[A-Z]', 'dutch_name', 'Dutch personal name with particle'),
]


def classify_profile(name: str, profile_data: Dict) -> Dict[str, Any]:
    """Classify profile as human, institution, anonymous, or unknown.

    Args:
        name: The profile's display name (may be empty or the LinkedIn
            privacy placeholder 'LinkedIn Member').
        profile_data: Profile dict; only 'headline' and 'linkedin_url'
            are consulted here.

    Returns classification dict with:
    - primary_classification: 'human', 'institution', 'anonymous', 'unknown'
    - confidence: 0.0-1.0
    - indicators: list of matched patterns
    - reasoning: human-readable explanation
    """
    if not name:
        return {
            'primary_classification': 'unknown',
            'confidence': 0.0,
            'indicators': [{'type': 'empty_name', 'reason': 'Name field is empty'}],
            'reasoning': 'Cannot classify - name is empty'
        }

    if name == 'LinkedIn Member':
        # LinkedIn substitutes this exact string when privacy settings
        # hide the real name - treat as anonymous, not unknown.
        headline = profile_data.get('headline', '')
        return {
            'primary_classification': 'anonymous',
            'confidence': 0.9,
            'indicators': [
                {'type': 'linkedin_member', 'reason': 'LinkedIn privacy settings hide real name'},
                {'type': 'has_headline', 'value': headline[:50] if headline else None},
            ],
            'reasoning': f'Anonymous LinkedIn profile with privacy settings. Has headline: {bool(headline)}'
        }

    institution_matches = []
    person_matches = []

    # Check institution indicators
    for pattern, indicator_type, reason in INSTITUTION_INDICATORS:
        if re.search(pattern, name, re.IGNORECASE):
            institution_matches.append({
                'type': indicator_type,
                'pattern': pattern,
                'reason': reason
            })

    # Check person indicators
    for pattern, indicator_type, reason in PERSON_INDICATORS:
        if re.search(pattern, name, re.IGNORECASE):
            person_matches.append({
                'type': indicator_type,
                'pattern': pattern,
                'reason': reason
            })

    # Check for personal LinkedIn URL (strong person indicator)
    linkedin_url = profile_data.get('linkedin_url', '')
    if linkedin_url and '/in/' in linkedin_url:
        person_matches.append({
            'type': 'personal_linkedin_url',
            'reason': 'Has personal LinkedIn /in/ URL'
        })

    # Determine classification
    if institution_matches and not person_matches:
        return {
            'primary_classification': 'institution',
            'confidence': min(0.5 + 0.1 * len(institution_matches), 0.9),
            'indicators': institution_matches,
            'reasoning': f'Matched {len(institution_matches)} institution pattern(s), no person patterns'
        }
    elif person_matches and not institution_matches:
        return {
            'primary_classification': 'human',
            'confidence': min(0.5 + 0.15 * len(person_matches), 0.95),
            'indicators': person_matches,
            'reasoning': f'Matched {len(person_matches)} person pattern(s), no institution patterns'
        }
    elif person_matches and institution_matches:
        # Conflicting signals - personal LinkedIn URL wins
        if any(i['type'] == 'personal_linkedin_url' for i in person_matches):
            return {
                'primary_classification': 'human',
                'confidence': 0.7,
                'indicators': person_matches + institution_matches,
                'reasoning': f'Conflicting patterns but has personal LinkedIn URL - likely human'
            }
        return {
            'primary_classification': 'unknown',
            'confidence': 0.3,
            'indicators': person_matches + institution_matches,
            'reasoning': f'Conflicting patterns: {len(person_matches)} person, {len(institution_matches)} institution'
        }
    else:
        # No patterns matched - assume human (most profiles are people)
        return {
            'primary_classification': 'human',
            'confidence': 0.6,
            'indicators': [{'type': 'default', 'reason': 'No specific patterns matched, defaulting to human'}],
            'reasoning': 'No specific patterns matched - assuming human (default)'
        }


def normalize_name_for_ppid(name: str) -> str:
    """Convert name to PPID format: FIRST-LAST.

    Strips common titles/degrees, folds accented characters to ASCII
    (NFKD + combining-mark removal), removes non-alphanumerics, and
    joins the uppercased parts with '-'. Returns "UNKNOWN" when nothing
    usable remains.
    """
    if not name:
        return "UNKNOWN"
    # Drop titles and degree suffixes anywhere in the name.
    name = re.sub(r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr|PSM|GIA|GG)\b\.?', '', name, flags=re.IGNORECASE)
    parts = [p.strip() for p in name.split() if p.strip()]
    if not parts:
        return "UNKNOWN"

    def normalize_part(p):
        # Decompose accents, keep only base characters, then strip
        # anything non-alphanumeric.
        nfkd = unicodedata.normalize('NFKD', p)
        ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))
        return re.sub(r'[^A-Za-z0-9]', '', ascii_name).upper()

    # Normalize each part once and drop parts that normalize to ''.
    normalized = [np for np in (normalize_part(p) for p in parts) if np]
    return '-'.join(normalized) if normalized else "UNKNOWN"


def generate_ppid(name: str, entity_data: Dict = None) -> str:
    """Generate PPID from name (locations/dates use XX placeholders).

    For LinkedIn Member profiles, use affiliation context (first
    affiliation's custodian + up to two headline words) to create a
    unique ANON-* name token instead of the meaningless placeholder.
    """
    if name == 'LinkedIn Member' and entity_data:
        affiliations = entity_data.get('affiliations', [])
        headline = (entity_data.get('profile_data') or {}).get('headline', '')
        if affiliations and isinstance(affiliations, list) and len(affiliations) > 0:
            org = affiliations[0].get('custodian_name', 'UNKNOWN-ORG')
            org_token = normalize_name_for_ppid(org)[:20]
            if headline:
                # Build a short role token from the first few headline words.
                role_words = []
                for word in headline.split()[:3]:
                    normalized = normalize_name_for_ppid(word)
                    if normalized and len(normalized) > 2:
                        role_words.append(normalized)
                role_token = '-'.join(role_words[:2]) if role_words else 'STAFF'
            else:
                role_token = 'STAFF'
            name_token = f"ANON-{org_token}-{role_token[:15]}"
        else:
            name_token = "LINKEDIN-MEMBER"
    else:
        name_token = normalize_name_for_ppid(name)
    return f"ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_{name_token}"


def transform_entity_to_ppid(entity_data: Dict, entity_file_name: str) -> Tuple[str, Dict]:
    """Transform entity profile to PPID format, preserving ALL data.

    Returns (ppid, ppid_profile) where ppid_profile carries the full
    original payload plus classification and migration metadata.
    """
    name = entity_data.get('profile_data', {}).get('name') or entity_data.get('name', 'Unknown')
    ppid = generate_ppid(name, entity_data)

    # Classify the profile
    profile_data = entity_data.get('profile_data', {})
    # Merge top-level affiliations into profile_data for classification
    profile_data_for_classification = {**profile_data}
    if 'affiliations' not in profile_data_for_classification:
        profile_data_for_classification['affiliations'] = entity_data.get('affiliations', [])
    classification = classify_profile(name, profile_data_for_classification)

    is_anonymous = (name == 'LinkedIn Member')
    if is_anonymous:
        # Anonymous PPIDs already encode org/role; reuse the token.
        name_tokens = ppid.split('_')[-1].split('-')
    else:
        name_tokens = normalize_name_for_ppid(name).split('-')

    ppid_profile = {
        "ppid": ppid,
        "ppid_type": "ID",
        "ppid_components": {
            "type": "ID",
            "first_location": "XX-XX-XXX",
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX",
            "last_date": "XXXX",
            "name_tokens": name_tokens
        },
        "name": name,
        "birth_date": {
            "edtf": "XXXX",
            "precision": "unknown",
            "note": "Not yet enriched - requires manual research"
        },
        "is_living": True,
        "is_anonymous": is_anonymous,
        # Classification tags (the key feature of v4)
        "profile_classification": classification,
        "heritage_relevance": entity_data.get('heritage_relevance', {
            "is_heritage_relevant": True,
            "heritage_types": [],
            "rationale": "Extracted from heritage custodian LinkedIn page"
        }),
        "affiliations": entity_data.get('affiliations', []),
        "profile_data": entity_data.get('profile_data', {}),
        "web_claims": entity_data.get('web_claims', []),
        "source_observations": entity_data.get('source_observations', []),
        "extraction_metadata": entity_data.get('extraction_metadata', {}),
        "migration_metadata": {
            "original_entity_file": entity_file_name,
            "original_person_id": entity_data.get('person_id'),
            "original_linkedin_slug": entity_data.get('linkedin_slug'),
            "migrated_at": datetime.now(timezone.utc).isoformat(),
            "migration_script": "migrate_entity_to_ppid_v4.py",
            "migration_version": "4.0"
        }
    }
    return ppid, ppid_profile


def process_entity_file(args):
    """Process a single entity file (worker function for Pool.map).

    Args is a tuple (entity_file_path, existing_ppids_set, person_dir,
    dry_run).

    Returns (status, ppid_or_error, classification, file_path, collided):
    - status: 'migrated' or 'error'
    - ppid_or_error: the output PPID, or the error message on failure
    - collided: True when a UUID suffix was appended to resolve a
      filename collision (explicit flag; main() must not guess this
      from the PPID string)
    """
    entity_file_path, existing_ppids_set, person_dir, dry_run = args
    try:
        with open(entity_file_path) as f:
            data = json.load(f)

        # NO FILTERING - process everything
        ppid, ppid_profile = transform_entity_to_ppid(data, Path(entity_file_path).name)
        classification = ppid_profile['profile_classification']['primary_classification']

        # Check if already exists - add UUID suffix for collision.
        # The exists() check narrows (but cannot fully eliminate) the
        # race where two workers in the same batch produce the same PPID.
        output_ppid = ppid
        collided = ppid in existing_ppids_set or (Path(person_dir) / f"{ppid}.json").exists()
        if collided:
            short_uuid = str(uuid.uuid4())[:8]
            output_ppid = f"{ppid}-{short_uuid}"
            ppid_profile['ppid'] = output_ppid
            ppid_profile['ppid_components']['collision_uuid'] = short_uuid

        output_file = Path(person_dir) / f"{output_ppid}.json"
        if not dry_run:
            with open(output_file, 'w') as f:
                json.dump(ppid_profile, f, indent=2, ensure_ascii=False)

        return ('migrated', output_ppid, classification, str(entity_file_path), collided)
    except Exception as e:
        return ('error', str(e), 'error', str(entity_file_path), False)


def main():
    """CLI entry point: index existing PPIDs, then migrate all entity files."""
    parser = argparse.ArgumentParser(description='Migrate entity profiles to PPID format (v4 - tag everything)')
    parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of profiles to process')
    parser.add_argument('--workers', type=int, default=4, help='Number of parallel workers')
    parser.add_argument('--verbose', action='store_true', help='Show each migrated file')
    parser.add_argument('--entity-dir', default=DEFAULT_ENTITY_DIR, help='Source entity profile directory')
    parser.add_argument('--person-dir', default=DEFAULT_PERSON_DIR, help='Destination PPID directory')
    args = parser.parse_args()

    entity_dir = Path(args.entity_dir)
    person_dir = Path(args.person_dir)

    print("=" * 70)
    print("PPID MIGRATION SCRIPT v4.0 (Tag Everything, No Filtering)")
    print("=" * 70)

    # Phase 1: Build index of existing PPID filenames
    print("\nPhase 1: Indexing existing PPID files...")
    existing_ppids = set()
    for f in person_dir.glob('ID_*.json'):
        existing_ppids.add(f.stem)
    print(f"  Found {len(existing_ppids):,} existing PPID files")

    # Phase 2: List entity files
    print("\nPhase 2: Listing entity files...")
    entity_files = list(entity_dir.glob('*.json'))
    total_entity = len(entity_files)
    print(f"  Found {total_entity:,} entity files")
    if args.limit:
        entity_files = entity_files[:args.limit]
        print(f"  Limited to {args.limit} files for this run")

    # Phase 3: Process files
    print(f"\nPhase 3: Processing ALL files (workers={args.workers}, dry_run={args.dry_run})...")
    print("  Note: NO filtering - all profiles are migrated with classification tags")

    process_args = [
        (str(f), existing_ppids, str(person_dir), args.dry_run)
        for f in entity_files
    ]

    results = {'migrated': 0, 'error': 0}
    classifications = {'human': 0, 'institution': 0, 'anonymous': 0, 'unknown': 0}
    collisions = 0
    samples = []
    batch_size = 1000

    # One pool for the whole run; args are pickled per map() call, so
    # each batch's workers see existing_ppids as updated by prior batches.
    with Pool(args.workers) as pool:
        for batch_start in range(0, len(process_args), batch_size):
            batch_end = min(batch_start + batch_size, len(process_args))
            batch = process_args[batch_start:batch_end]
            batch_results = pool.map(process_entity_file, batch)

            for status, ppid_or_error, classification, file_path, collided in batch_results:
                if status == 'migrated':
                    results['migrated'] += 1
                    classifications[classification] = classifications.get(classification, 0) + 1
                    if collided:
                        collisions += 1
                    existing_ppids.add(ppid_or_error)
                    if len(samples) < 5:
                        samples.append((ppid_or_error, classification, Path(file_path).name))
                else:
                    results['error'] += 1
                    print(f"  ERROR: {file_path}: {ppid_or_error}")

            processed = batch_end
            pct = (processed / len(process_args)) * 100
            print(f"  Progress: {processed:,}/{len(process_args):,} ({pct:.1f}%) - "
                  f"H:{classifications['human']:,} I:{classifications['institution']:,} "
                  f"A:{classifications['anonymous']:,} U:{classifications['unknown']:,} "
                  f"Collisions:{collisions}")

    # Summary
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}MIGRATION SUMMARY")
    print("=" * 70)
    print(f"  Total processed: {results['migrated'] + results['error']:,}")
    print(f"  Successfully migrated: {results['migrated']:,}")
    print(f"  Errors: {results['error']}")
    print(f"  Collisions (UUID suffix added): {collisions}")
    print(f"\n  Classification breakdown:")
    print(f"    Human: {classifications['human']:,}")
    print(f"    Institution: {classifications['institution']:,}")
    print(f"    Anonymous: {classifications['anonymous']:,}")
    print(f"    Unknown: {classifications['unknown']:,}")

    if samples:
        print(f"\n  Sample migrated profiles:")
        for ppid, classification, source in samples:
            print(f"    [{classification:11}] {ppid[:60]}... <- {source[:40]}...")

    if args.dry_run:
        print(f"\n  To execute migration, run without --dry-run flag")
    else:
        final_count = len(list(person_dir.glob('ID_*.json')))
        print(f"\n  Migration complete!")
        print(f"  Final PPID count: {final_count:,}")


if __name__ == '__main__':
    main()