#!/usr/bin/env python3
"""
Migrate entity profiles from data/custodian/person/entity/ to data/person/

This script:
1. Reads entity profiles that are NOT already in data/person/
2. Filters out non-human profiles (institutions, anonymous LinkedIn members)
3. Generates PPID based on profile data
4. Preserves ALL data including web_claims with XPath provenance
5. Creates proper PPID file in data/person/

Usage:
    python scripts/migrate_entity_to_ppid_v2.py --dry-run --limit 5   # Preview 5 profiles
    python scripts/migrate_entity_to_ppid_v2.py --dry-run             # Preview all
    python scripts/migrate_entity_to_ppid_v2.py                       # Execute migration
"""

import json
import argparse
import re
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime, timezone
from collections import defaultdict
import unicodedata

# Default data locations; overridable via --entity-dir / --person-dir.
DEFAULT_ENTITY_DIR = Path('/Users/kempersc/apps/glam/data/custodian/person/entity')
DEFAULT_PERSON_DIR = Path('/Users/kempersc/apps/glam/data/person')

# Patterns for detecting non-human profiles
NON_HUMAN_PATTERNS = [
    r'^LinkedIn\s+Member$',
    r'^TheMuseumsLab$',
    r'Museum$',
    r'Foundation$',
    r'Stichting\s',
    r'^ICOM\s',
    r'^Fondazione\s',
    r'Institute$',
    r'Organisation$',
    r'Organization$',
    r'University$',
    r'^Google\s',
    r'^Sound\s+Heritage$',
    r'^Company\s',
    r'^Computational\s+Research$',
]

# Compile once at import time instead of on every is_human_profile() call.
_NON_HUMAN_REGEXES = [re.compile(p, re.IGNORECASE) for p in NON_HUMAN_PATTERNS]

# Titles/suffixes stripped from names before building the PPID name token.
_TITLE_SUFFIX_RE = re.compile(
    r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr|PSM|GIA|GG)\b\.?',
    re.IGNORECASE,
)


def _load_json(path):
    """Parse a JSON file, closing the handle promptly (UTF-8).

    Raises OSError on read failure and json.JSONDecodeError on bad JSON.
    """
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


def extract_linkedin_slug(url):
    """Extract LinkedIn slug from URL.

    Returns the lowercased, percent-decoded path segment after
    'linkedin.com/in/', or None if the URL is absent or not a profile URL.
    """
    if not url or 'linkedin.com/in/' not in url:
        return None
    slug = url.split('linkedin.com/in/')[-1].rstrip('/').split('?')[0]
    slug = unquote(slug)
    return slug.lower()


def is_human_profile(name, profile_data):
    """Determine if profile represents a human being (not an institution).

    Args:
        name: Display name from the profile (may be empty/None).
        profile_data: The profile's 'profile_data' dict (used for the
            anonymous-LinkedIn-member check).

    Returns:
        False for empty names, names matching any institutional pattern,
        or anonymous "LinkedIn Member" entries without a URL; True otherwise.
    """
    if not name:
        return False

    # Check against non-human patterns
    for regex in _NON_HUMAN_REGEXES:
        if regex.search(name):
            return False

    # LinkedIn Member with no URL is anonymous.
    # NOTE(review): this branch is currently unreachable because
    # '^LinkedIn\s+Member$' above already rejects the name; kept to
    # preserve the original intent if the pattern list ever changes.
    if name == 'LinkedIn Member' and not profile_data.get('linkedin_url'):
        return False

    return True


def normalize_name_for_ppid(name):
    """Convert name to PPID format: FIRST-LAST (ASCII uppercase, hyphen-joined).

    Strips common titles/suffixes, decomposes accents (NFKD) and drops
    combining marks, removes non-letters, and returns "UNKNOWN" when
    nothing usable remains.
    """
    if not name:
        return "UNKNOWN"

    # Remove titles/suffixes
    name = _TITLE_SUFFIX_RE.sub('', name)

    # Split and clean
    parts = [p.strip() for p in name.split() if p.strip()]
    if not parts:
        return "UNKNOWN"

    def normalize_part(p):
        # Decompose accented characters, drop the combining marks, then
        # keep only ASCII letters, uppercased (e.g. 'José' -> 'JOSE').
        nfkd = unicodedata.normalize('NFKD', p)
        ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))
        return re.sub(r'[^A-Za-z]', '', ascii_name).upper()

    # Normalize each part exactly once (the original called normalize_part
    # twice per part: once to filter, once to collect).
    normalized = [token for token in (normalize_part(p) for p in parts) if token]
    if not normalized:
        return "UNKNOWN"

    return '-'.join(normalized)


def generate_ppid(name):
    """Generate PPID from name (locations/dates use XX placeholders)."""
    birth_loc = "XX-XX-XXX"
    birth_date = "XXXX"
    current_loc = "XX-XX-XXX"
    death_date = "XXXX"
    name_token = normalize_name_for_ppid(name)
    return f"ID_{birth_loc}_{birth_date}_{current_loc}_{death_date}_{name_token}"


def transform_entity_to_ppid(entity_data, entity_file):
    """Transform entity profile to PPID format, preserving ALL data.

    Args:
        entity_data: Parsed entity-profile dict.
        entity_file: Path of the source file (only its .name is recorded).

    Returns:
        (ppid, ppid_profile) where ppid_profile carries every source field
        plus migration metadata.
    """
    name = entity_data.get('profile_data', {}).get('name') or entity_data.get('name', 'Unknown')
    ppid = generate_ppid(name)

    # Build comprehensive PPID profile preserving ALL source data
    ppid_profile = {
        # PPID identification
        "ppid": ppid,
        "ppid_type": "ID",
        "ppid_components": {
            "type": "ID",
            "first_location": "XX-XX-XXX",
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX",
            "last_date": "XXXX",
            "name_tokens": normalize_name_for_ppid(name).split('-')
        },

        # Basic identity
        "name": name,
        "birth_date": {
            "edtf": "XXXX",
            "precision": "unknown",
            "note": "Not yet enriched - requires manual research"
        },
        "is_living": True,

        # Heritage relevance (preserve from source)
        "heritage_relevance": entity_data.get('heritage_relevance', {
            "is_heritage_relevant": True,  # Default to true since from custodian context
            "heritage_types": [],
            "rationale": "Extracted from heritage custodian LinkedIn page"
        }),

        # Affiliations (preserve ALL)
        "affiliations": entity_data.get('affiliations', []),

        # Profile data (preserve ALL)
        "profile_data": entity_data.get('profile_data', {}),

        # Web claims with full provenance (preserve ALL)
        "web_claims": entity_data.get('web_claims', []),

        # Source observations (preserve ALL)
        "source_observations": entity_data.get('source_observations', []),

        # Original extraction metadata
        "extraction_metadata": entity_data.get('extraction_metadata', {}),

        # Migration metadata
        "migration_metadata": {
            "original_entity_file": entity_file.name,
            "original_person_id": entity_data.get('person_id'),
            "original_linkedin_slug": entity_data.get('linkedin_slug'),
            "migrated_at": datetime.now(timezone.utc).isoformat(),
            "migration_script": "migrate_entity_to_ppid_v2.py",
            "migration_version": "2.0"
        }
    }

    return ppid, ppid_profile


def _load_existing_slugs(person_dir):
    """Phase 1: collect LinkedIn slugs already present in data/person/."""
    existing_slugs = set()
    for path in person_dir.glob('ID_*.json'):
        try:
            data = _load_json(path)
            if 'profile_data' in data:
                url = data['profile_data'].get('linkedin_url')
                if url:
                    slug = extract_linkedin_slug(url)
                    if slug:
                        existing_slugs.add(slug)
        # Narrow, deliberate best-effort: skip unreadable/corrupt/odd-shaped
        # profiles (the original used a bare `except: pass`, which also
        # swallowed KeyboardInterrupt/SystemExit).
        except (OSError, ValueError, AttributeError, TypeError):
            continue
    return existing_slugs


def _scan_entities(entity_dir, existing_slugs):
    """Phase 2: find entity profiles eligible for migration.

    Returns:
        (to_migrate, skipped_existing, skipped_non_human, skipped_no_linkedin)
        where to_migrate is a list of (path, data, slug) tuples.
    """
    to_migrate = []
    skipped_existing = 0
    skipped_no_linkedin = 0
    skipped_non_human = 0

    entity_files = list(entity_dir.glob('*.json'))
    print(f"  Found {len(entity_files):,} entity files to scan")

    for path in entity_files:
        try:
            data = _load_json(path)
            name = data.get('profile_data', {}).get('name') or data.get('name', '')

            # Skip non-human profiles
            if not is_human_profile(name, data.get('profile_data', {})):
                skipped_non_human += 1
                continue

            # Check for LinkedIn URL
            linkedin_url = data.get('profile_data', {}).get('linkedin_url')
            if not linkedin_url:
                skipped_no_linkedin += 1
                continue

            slug = extract_linkedin_slug(linkedin_url)
            if slug and slug not in existing_slugs:
                to_migrate.append((path, data, slug))
            elif slug:
                skipped_existing += 1
        except Exception as e:
            # Report instead of silently dropping (original: `pass`).
            print(f"  WARNING: could not read {path.name}: {e}")

    return to_migrate, skipped_existing, skipped_non_human, skipped_no_linkedin


def _migrate_profiles(to_migrate, person_dir, args):
    """Phase 3: write PPID files (or preview them under --dry-run).

    Returns:
        (migrated, errors, collision_count)
    """
    migrated = 0
    errors = 0
    collision_count = 0

    for entity_file, data, slug in to_migrate:
        try:
            ppid, ppid_profile = transform_entity_to_ppid(data, entity_file)
            output_file = person_dir / f"{ppid}.json"

            # Handle collisions with counter suffix (different people can
            # normalize to the same name token).
            original_ppid = ppid
            counter = 1
            while output_file.exists():
                collision_count += 1
                ppid = f"{original_ppid}-{counter}"
                ppid_profile['ppid'] = ppid
                output_file = person_dir / f"{ppid}.json"
                counter += 1

            name = ppid_profile['name']
            web_claims_count = len(ppid_profile.get('web_claims', []))
            affiliations_count = len(ppid_profile.get('affiliations', []))

            if args.verbose or args.dry_run:
                print(f"\n  {'[DRY-RUN] ' if args.dry_run else ''}Creating: {output_file.name}")
                print(f"    Name: {name}")
                print(f"    LinkedIn slug: {slug}")
                print(f"    Web claims: {web_claims_count}")
                print(f"    Affiliations: {affiliations_count}")
                if ppid_profile.get('source_observations'):
                    print(f"    Source observations: {len(ppid_profile['source_observations'])}")

            if not args.dry_run:
                # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII
                # characters, which breaks on non-UTF-8 default locales.
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(ppid_profile, f, indent=2, ensure_ascii=False)

            migrated += 1
        except Exception as e:
            print(f"  ERROR processing {entity_file.name}: {e}")
            errors += 1

    return migrated, errors, collision_count


def main():
    parser = argparse.ArgumentParser(description='Migrate entity profiles to PPID format (v2)')
    parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of profiles to process')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output for each profile')
    # New, backward-compatible options: defaults match the previously
    # hard-coded absolute paths.
    parser.add_argument('--entity-dir', type=Path, default=DEFAULT_ENTITY_DIR,
                        help='Directory containing source entity profiles')
    parser.add_argument('--person-dir', type=Path, default=DEFAULT_PERSON_DIR,
                        help='Destination directory for PPID profiles')
    args = parser.parse_args()

    entity_dir = args.entity_dir
    person_dir = args.person_dir

    # 1. Get existing LinkedIn slugs in data/person/
    print("=" * 60)
    print("PPID MIGRATION SCRIPT v2.0")
    print("=" * 60)
    print("\nPhase 1: Loading existing PPID profiles...")

    existing_slugs = _load_existing_slugs(person_dir)
    print(f"  Found {len(existing_slugs):,} existing LinkedIn slugs in data/person/")

    # 2. Find entity profiles NOT in data/person/
    print("\nPhase 2: Scanning entity profiles...")
    (to_migrate, skipped_existing,
     skipped_non_human, skipped_no_linkedin) = _scan_entities(entity_dir, existing_slugs)

    print(f"\n  Scan Results:")
    print(f"    Already in PPID: {skipped_existing:,}")
    print(f"    Skipped (non-human): {skipped_non_human:,}")
    print(f"    Skipped (no LinkedIn): {skipped_no_linkedin:,}")
    print(f"    TO MIGRATE: {len(to_migrate):,}")

    if args.limit:
        to_migrate = to_migrate[:args.limit]
        print(f"\n  Limited to {args.limit} profiles for this run")

    # 3. Migrate profiles
    print("\nPhase 3: Migrating profiles...")
    migrated, errors, collision_count = _migrate_profiles(to_migrate, person_dir, args)

    # Summary
    print("\n" + "=" * 60)
    print(f"{'DRY RUN ' if args.dry_run else ''}MIGRATION SUMMARY")
    print("=" * 60)
    print(f"  Profiles migrated: {migrated:,}")
    print(f"  Name collisions resolved: {collision_count}")
    print(f"  Errors: {errors}")

    if args.dry_run:
        print(f"\n  To execute migration, run without --dry-run flag")
    else:
        print(f"\n  Migration complete!")
        print(f"  New profile count: {len(list(person_dir.glob('ID_*.json'))):,}")


if __name__ == '__main__':
    main()