# glam/scripts/migrate_entity_to_ppid_v3.py
# (file-viewer metadata "333 lines / 13 KiB / Python" removed; it was not Python code)
#!/usr/bin/env python3
"""
Migrate entity profiles from data/custodian/person/entity/ to data/person/
This script (v3) optimizes for large-scale migration:
1. Pre-builds an index of existing PPID filenames (fast)
2. Processes entity files in batches with progress reporting
3. Uses multiprocessing for parallel file operations
4. Handles name collisions with counter suffixes
Usage:
python scripts/migrate_entity_to_ppid_v3.py --dry-run --limit 100 # Preview 100 profiles
python scripts/migrate_entity_to_ppid_v3.py --dry-run # Preview all
python scripts/migrate_entity_to_ppid_v3.py # Execute migration
"""
import json
import argparse
import re
from pathlib import Path
from datetime import datetime, timezone
import unicodedata
from multiprocessing import Pool, cpu_count
import os
# Patterns for detecting non-human profiles (institutions, not people).
# Matched case-insensitively with re.search() by is_human_profile().
# NOTE: "LinkedIn Member" is deliberately NOT listed - those are real people
# whose privacy settings hide the name.
NON_HUMAN_PATTERNS = [
    r'^TheMuseumsLab$',
    r'^Piet Blom Museum$',  # Specific institution profile
    r'^Limburgs Museum$',  # Specific institution profile
    r'^Miniature Museum$',  # Specific institution profile
    r'^Stichting\s',  # Dutch foundation names
    r'^ICOM\s',  # ICOM organization
    r'^Fondazione\s',  # Italian foundation
    r'^Google\s',  # Company profiles (Google DeepMind etc)
    r'^Sound\s+Heritage$',  # Specific organization
    r'^Company\s+name\s',  # Parsing artifact "Company name X"
    r'^Computational\s+Research$',  # Specific organization
]
# Patterns for organization profiles that should be excluded.
# These end with institutional suffixes; a name matching one of them is only
# excluded when the profile has NO personal LinkedIn (/in/) URL - see
# is_human_profile().
INSTITUTION_SUFFIX_PATTERNS = [
    r'Museum$',
    r'Foundation$',
    r'Institute$',
    r'Organisation$',
    r'Organization$',
    r'University$',
]
def is_human_profile(name, profile_data):
    """Return True when the profile appears to describe a person, not an institution.

    LinkedIn Member profiles ARE included - they are real people with privacy
    settings. They have job titles and affiliations, just no visible name.
    """
    if not name:
        return False
    # Explicit deny-list of known organization profile names.
    if any(re.search(pat, name, re.IGNORECASE) for pat in NON_HUMAN_PATTERNS):
        return False
    # Institutional name suffixes only disqualify profiles lacking a personal
    # LinkedIn URL (a real person named e.g. "Jan Museum" would have an /in/ URL).
    url = profile_data.get('linkedin_url', '')
    has_personal_url = bool(url) and '/in/' in url
    if not has_personal_url and any(
        re.search(pat, name, re.IGNORECASE) for pat in INSTITUTION_SUFFIX_PATTERNS
    ):
        return False
    # Anonymous "LinkedIn Member" profiles fall through to here and count as
    # human; their PPID is later derived from affiliation context.
    return True
def normalize_name_for_ppid(name):
    """Convert a display name to PPID token form, e.g. "FIRST-LAST".

    Strips common titles/credentials ("Dr.", "PhD", ...), transliterates
    accented characters to plain ASCII via NFKD decomposition, uppercases,
    and joins the surviving name parts with hyphens.

    Returns "UNKNOWN" for empty/None input or when nothing usable remains.
    """
    if not name:
        return "UNKNOWN"
    # Remove titles and credentials anywhere in the name (case-insensitive).
    name = re.sub(r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr|PSM|GIA|GG)\b\.?',
                  '', name, flags=re.IGNORECASE)
    parts = [p.strip() for p in name.split() if p.strip()]
    if not parts:
        return "UNKNOWN"

    def normalize_part(p):
        # Decompose accents (NFKD), drop combining marks, keep ASCII letters only.
        nfkd = unicodedata.normalize('NFKD', p)
        ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))
        return re.sub(r'[^A-Za-z]', '', ascii_name).upper()

    # Normalize each part exactly once (the previous version called
    # normalize_part twice per part: once in the filter, once in the value).
    normalized = [token for token in (normalize_part(p) for p in parts) if token]
    return '-'.join(normalized) if normalized else "UNKNOWN"
def generate_ppid(name, entity_data=None):
    """Generate a PPID from a name (location/date fields use XX placeholders).

    For anonymous "LinkedIn Member" profiles the name token is instead derived
    from the first affiliation's organization plus headline keywords, so the
    resulting ID carries some distinguishing context.
    """
    if name == 'LinkedIn Member' and entity_data:
        affiliations = entity_data.get('affiliations', [])
        headline = entity_data.get('profile_data', {}).get('headline', '')
        if affiliations:
            org_token = normalize_name_for_ppid(
                affiliations[0].get('custodian_name', 'UNKNOWN-ORG'))
            role_token = 'STAFF'
            if headline:
                # Keep up to two meaningful tokens drawn from the first three
                # headline words (short tokens of <=2 letters are noise).
                keywords = []
                for word in headline.split()[:3]:
                    token = normalize_name_for_ppid(word)
                    if token and len(token) > 2:
                        keywords.append(token)
                if keywords:
                    role_token = '-'.join(keywords[:2])
            name_token = f"ANON-{org_token[:20]}-{role_token[:15]}"
        else:
            name_token = "LINKEDIN-MEMBER"
    else:
        name_token = normalize_name_for_ppid(name)
    return f"ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_{name_token}"
def transform_entity_to_ppid(entity_data, entity_file_name):
    """Transform an entity profile into PPID format, preserving ALL data.

    Returns a (ppid, ppid_profile) tuple; ppid_profile is the complete output
    record, carrying every section of the source entity plus migration metadata.
    """
    profile = entity_data.get('profile_data', {})
    name = profile.get('name') or entity_data.get('name', 'Unknown')
    # entity_data is passed through so anonymous profiles get affiliation context.
    ppid = generate_ppid(name, entity_data)
    is_anonymous = name == 'LinkedIn Member'
    if is_anonymous:
        # Anonymous IDs: recover the name tokens from the generated PPID itself.
        name_tokens = ppid.split('_')[-1].split('-')
    else:
        name_tokens = normalize_name_for_ppid(name).split('-')
    components = {
        "type": "ID",
        "first_location": "XX-XX-XXX",
        "first_date": "XXXX",
        "last_location": "XX-XX-XXX",
        "last_date": "XXXX",
        "name_tokens": name_tokens,
        "is_anonymous": is_anonymous,
    }
    birth_date = {
        "edtf": "XXXX",
        "precision": "unknown",
        "note": "Not yet enriched - requires manual research",
    }
    # Fallback used only when the entity carries no heritage_relevance section.
    default_relevance = {
        "is_heritage_relevant": True,
        "heritage_types": [],
        "rationale": "Extracted from heritage custodian LinkedIn page",
    }
    migration_metadata = {
        "original_entity_file": entity_file_name,
        "original_person_id": entity_data.get('person_id'),
        "original_linkedin_slug": entity_data.get('linkedin_slug'),
        "migrated_at": datetime.now(timezone.utc).isoformat(),
        "migration_script": "migrate_entity_to_ppid_v3.py",
        "migration_version": "3.0",
    }
    # Key order matters for the serialized JSON layout; kept stable.
    ppid_profile = {
        "ppid": ppid,
        "ppid_type": "ID",
        "ppid_components": components,
        "name": name,
        "birth_date": birth_date,
        "is_living": True,
        "is_anonymous": is_anonymous,  # Top-level flag for easy filtering
        "heritage_relevance": entity_data.get('heritage_relevance', default_relevance),
        "affiliations": entity_data.get('affiliations', []),
        "profile_data": profile,
        "web_claims": entity_data.get('web_claims', []),
        "source_observations": entity_data.get('source_observations', []),
        "extraction_metadata": entity_data.get('extraction_metadata', {}),
        "migration_metadata": migration_metadata,
    }
    return ppid, ppid_profile
def process_entity_file(args):
    """Process a single entity file (worker function for multiprocessing.Pool).

    Args:
        args: Tuple of (entity_file_path, existing_ppids_set, person_dir,
              dry_run), packed as one tuple so it can be dispatched via
              Pool.map.

    Returns:
        A (status, detail, file_path) tuple where status is one of:
        'migrated' (detail = new PPID), 'exists' (detail = PPID),
        'skip' (detail = reason), or 'error' (detail = error message).
    """
    entity_file_path, existing_ppids_set, person_dir, dry_run = args
    try:
        # JSON is UTF-8 by spec; be explicit so behavior matches on all platforms.
        with open(entity_file_path, encoding='utf-8') as f:
            data = json.load(f)
        name = data.get('profile_data', {}).get('name') or data.get('name', '')
        # Skip institutional (non-human) profiles entirely.
        if not is_human_profile(name, data.get('profile_data', {})):
            return ('skip', 'non-human', str(entity_file_path))
        ppid, ppid_profile = transform_entity_to_ppid(data, Path(entity_file_path).name)
        # A PPID already in the index is treated as already migrated, keeping
        # re-runs idempotent. NOTE(review): this also means two *different*
        # people whose names normalize identically collapse into one record.
        # The counter-suffix collision loop that followed this check in the
        # previous version was unreachable dead code (this early return made
        # its condition always False) and has been removed - confirm the
        # "exists" semantics are intended.
        if ppid in existing_ppids_set:
            return ('exists', ppid, str(entity_file_path))
        output_file = Path(person_dir) / f"{ppid}.json"
        if not dry_run:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(ppid_profile, f, indent=2, ensure_ascii=False)
        return ('migrated', ppid, str(entity_file_path))
    except Exception as e:
        # Workers must never raise; report failures as a result tuple instead.
        return ('error', str(e), str(entity_file_path))
def main():
    """CLI entry point: index existing PPIDs, then migrate entity files in batches.

    Phases:
      1. Build a set of existing PPID filenames for O(1) duplicate checks.
      2. Enumerate entity JSON files (optionally limited by --limit).
      3. Fan the files out to a worker pool in batches of 1000, printing
         progress per batch, then print a summary.
    """
    parser = argparse.ArgumentParser(description='Migrate entity profiles to PPID format (v3 - optimized)')
    parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of profiles to process')
    parser.add_argument('--workers', type=int, default=4, help='Number of parallel workers')
    parser.add_argument('--verbose', action='store_true', help='Show each migrated file')
    # Backward-compatible generalization: the previously hard-coded paths are
    # now CLI options whose defaults preserve the original layout.
    parser.add_argument('--entity-dir', default='/Users/kempersc/apps/glam/data/custodian/person/entity',
                        help='Source directory of entity JSON profiles')
    parser.add_argument('--person-dir', default='/Users/kempersc/apps/glam/data/person',
                        help='Destination directory for PPID JSON files')
    args = parser.parse_args()
    entity_dir = Path(args.entity_dir)
    person_dir = Path(args.person_dir)
    print("=" * 70)
    print("PPID MIGRATION SCRIPT v3.0 (Optimized)")
    print("=" * 70)
    # Phase 1: Build index of existing PPID filenames (filename stem == PPID).
    print("\nPhase 1: Indexing existing PPID files...")
    existing_ppids = {f.stem for f in person_dir.glob('ID_*.json')}
    print(f" Found {len(existing_ppids):,} existing PPID files")
    # Phase 2: List entity files.
    print("\nPhase 2: Listing entity files...")
    entity_files = list(entity_dir.glob('*.json'))
    total_entity = len(entity_files)
    print(f" Found {total_entity:,} entity files")
    if args.limit:
        entity_files = entity_files[:args.limit]
        print(f" Limited to {args.limit} files for this run")
    # Phase 3: Process files in parallel batches.
    print(f"\nPhase 3: Processing files (workers={args.workers}, dry_run={args.dry_run})...")
    if not args.dry_run:
        # Make sure the destination exists before workers start writing.
        person_dir.mkdir(parents=True, exist_ok=True)
    # Every task tuple references the SAME existing_ppids set. Pool.map pickles
    # tasks at submission time, so additions made between batches are visible
    # to later batches.
    process_args = [
        (str(f), existing_ppids, str(person_dir), args.dry_run)
        for f in entity_files
    ]
    results = {'migrated': 0, 'exists': 0, 'skip': 0, 'error': 0}
    migrated_samples = []
    batch_size = 1000
    if process_args:
        # One pool reused for every batch (previously a new pool was created
        # and torn down per batch, paying worker-startup cost each time).
        with Pool(args.workers) as pool:
            for batch_start in range(0, len(process_args), batch_size):
                batch_end = min(batch_start + batch_size, len(process_args))
                batch_results = pool.map(process_entity_file, process_args[batch_start:batch_end])
                for status, detail, file_path in batch_results:
                    results[status] += 1
                    if status == 'migrated':
                        # Record new PPIDs so later batches treat them as existing.
                        # NOTE(review): workers inside the SAME batch cannot see
                        # each other's additions, so duplicate names within one
                        # batch can still write to the same output file.
                        existing_ppids.add(detail)
                        if args.verbose or len(migrated_samples) < 5:
                            migrated_samples.append((detail, Path(file_path).name))
                    if status == 'error':
                        print(f" ERROR: {file_path}: {detail}")
                # Progress report after each batch.
                processed = batch_end
                pct = (processed / len(process_args)) * 100
                print(f" Progress: {processed:,}/{len(process_args):,} ({pct:.1f}%) - "
                      f"Migrated: {results['migrated']:,}, Exists: {results['exists']:,}, "
                      f"Skip: {results['skip']:,}, Errors: {results['error']}")
    # Summary.
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}MIGRATION SUMMARY")
    print("=" * 70)
    print(f" Total processed: {sum(results.values()):,}")
    print(f" Migrated (new): {results['migrated']:,}")
    print(f" Already exists: {results['exists']:,}")
    print(f" Skipped (non-human): {results['skip']:,}")
    print(f" Errors: {results['error']}")
    if migrated_samples:
        print("\n Sample migrated profiles:")
        for ppid, source in migrated_samples[:5]:
            print(f" {ppid} <- {source}")
    if args.dry_run:
        print("\n To execute migration, run without --dry-run flag")
    else:
        final_count = len(list(person_dir.glob('ID_*.json')))
        print("\n Migration complete!")
        print(f" Final PPID count: {final_count:,}")


if __name__ == '__main__':
    main()