#!/usr/bin/env python3
"""
Quick test migration script - processes a small sample without pre-scanning all files.

Use for validation before running full migration. Scans at most SCAN_LIMIT
entity JSON files and prints the transformed PPID record for the first
SAMPLE_SIZE human profiles; nothing is written to disk.
"""
import json
import re
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import unquote

# How many human profiles to preview, and how many entity files to scan
# before giving up (keeps the quick test fast on large directories).
SAMPLE_SIZE = 5
SCAN_LIMIT = 50

# Profiles whose display name matches any of these patterns are treated as
# institutions/organizations (not people) and skipped by the migration.
NON_HUMAN_PATTERNS = [
    r'^LinkedIn\s+Member$',
    r'^TheMuseumsLab$',
    r'Museum$',
    r'Foundation$',
    r'Stichting\s',
    r'^ICOM\s',
    r'^Fondazione\s',
    r'Institute$',
    r'Organisation$',
    r'Organization$',
    r'University$',
    r'^Google\s',
    r'^Sound\s+Heritage$',
    r'^Company\s',
    r'^Computational\s+Research$',
]


def extract_linkedin_slug(url):
    """Extract the LinkedIn profile slug from a URL.

    Returns the lower-cased, percent-decoded slug (e.g. ``'jane-doe'``),
    or ``None`` when *url* is falsy or is not a ``linkedin.com/in/``
    profile URL.
    """
    if not url or 'linkedin.com/in/' not in url:
        return None
    # Take everything after the last 'linkedin.com/in/', drop a trailing
    # slash and any query string, then decode %-escapes.
    slug = url.split('linkedin.com/in/')[-1].rstrip('/').split('?')[0]
    return unquote(slug).lower()


def is_human_profile(name, profile_data):
    """Determine if profile represents a human being (not an institution).

    A profile is non-human when *name* is empty or matches any pattern in
    NON_HUMAN_PATTERNS (case-insensitive). ``profile_data`` is consulted
    only for the anonymized-member edge case below.
    """
    if not name:
        return False
    for pattern in NON_HUMAN_PATTERNS:
        if re.search(pattern, name, re.IGNORECASE):
            return False
    # Defensive: anonymized members without a profile URL are not usable.
    # (Currently also caught by the ^LinkedIn\s+Member$ pattern above.)
    if name == 'LinkedIn Member' and not profile_data.get('linkedin_url'):
        return False
    return True


def normalize_name_for_ppid(name):
    """Convert name to PPID format: FIRST-LAST.

    Strips honorifics/credentials, folds accents to ASCII (NFKD), removes
    non-letters, upper-cases each token and joins with '-'. Returns
    ``"UNKNOWN"`` when nothing usable remains.
    """
    if not name:
        return "UNKNOWN"
    # NOTE(review): the \b(MA|GG|...)\b alternatives also match legitimate
    # short surnames such as "Ma" — confirm this is acceptable.
    name = re.sub(r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr|PSM|GIA|GG)\b\.?',
                  '', name, flags=re.IGNORECASE)
    parts = [p.strip() for p in name.split() if p.strip()]
    if not parts:
        return "UNKNOWN"

    def _normalize_part(p):
        # Decompose accented characters, drop the combining marks, then
        # keep ASCII letters only.
        nfkd = unicodedata.normalize('NFKD', p)
        ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))
        return re.sub(r'[^A-Za-z]', '', ascii_name).upper()

    # Normalize each part once (the original called the helper twice per part).
    normalized = [token for token in (_normalize_part(p) for p in parts) if token]
    return '-'.join(normalized) if normalized else "UNKNOWN"


def generate_ppid(name):
    """Generate PPID from name (locations/dates use XX placeholders).

    NOTE(review): the PPID is derived from the name alone, so two people
    with the same normalized name collide — confirm downstream handling.
    """
    name_token = normalize_name_for_ppid(name)
    return f"ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_{name_token}"


def transform_entity_to_ppid(entity_data, entity_file):
    """Transform entity profile to PPID format, preserving ALL data.

    Returns ``(ppid, ppid_profile)`` where *ppid_profile* carries every
    original field plus placeholder PPID components and migration metadata.
    *entity_file* only needs a ``.name`` attribute (e.g. a ``Path``).
    """
    # Prefer the scraped profile name; fall back to the top-level name.
    name = entity_data.get('profile_data', {}).get('name') or entity_data.get('name', 'Unknown')
    ppid = generate_ppid(name)

    ppid_profile = {
        "ppid": ppid,
        "ppid_type": "ID",
        "ppid_components": {
            "type": "ID",
            "first_location": "XX-XX-XXX",
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX",
            "last_date": "XXXX",
            "name_tokens": normalize_name_for_ppid(name).split('-')
        },
        "name": name,
        "birth_date": {
            "edtf": "XXXX",
            "precision": "unknown",
            "note": "Not yet enriched - requires manual research"
        },
        "is_living": True,
        "heritage_relevance": entity_data.get('heritage_relevance', {
            "is_heritage_relevant": True,
            "heritage_types": [],
            "rationale": "Extracted from heritage custodian LinkedIn page"
        }),
        "affiliations": entity_data.get('affiliations', []),
        "profile_data": entity_data.get('profile_data', {}),
        "web_claims": entity_data.get('web_claims', []),
        "source_observations": entity_data.get('source_observations', []),
        "extraction_metadata": entity_data.get('extraction_metadata', {}),
        "migration_metadata": {
            "original_entity_file": entity_file.name,
            "original_person_id": entity_data.get('person_id'),
            "original_linkedin_slug": entity_data.get('linkedin_slug'),
            "migrated_at": datetime.now(timezone.utc).isoformat(),
            "migration_script": "migrate_entity_to_ppid_v2.py",
            "migration_version": "2.0"
        }
    }
    return ppid, ppid_profile


def _print_profile_preview(index, source_file, name, slug, ppid, ppid_profile, output_file):
    """Print one migrated profile plus a summary of the data it preserves."""
    print(f"\n{'=' * 70}")
    print(f"Profile {index}/{SAMPLE_SIZE}")
    print(f"{'=' * 70}")
    print(f"Source file: {source_file.name}")
    print(f"Name: {name}")
    print(f"LinkedIn slug: {slug}")
    print(f"Generated PPID: {ppid}")
    print(f"Output file: {output_file.name}")
    print(f"Already exists: {output_file.exists()}")

    # Show preserved data
    print(f"\nData preserved:")
    print(f" - Web claims: {len(ppid_profile.get('web_claims', []))}")
    print(f" - Affiliations: {len(ppid_profile.get('affiliations', []))}")
    print(f" - Source observations: {len(ppid_profile.get('source_observations', []))}")
    print(f" - Profile data fields: {list(ppid_profile.get('profile_data', {}).keys())}")

    if ppid_profile.get('web_claims'):
        print(f"\n Sample web claim:")
        claim = ppid_profile['web_claims'][0]
        print(f" claim_type: {claim.get('claim_type')}")
        print(f" claim_value: {str(claim.get('claim_value'))[:60]}...")
        print(f" xpath: {claim.get('xpath', 'N/A')[:80]}..."
              if claim.get('xpath') else " xpath: N/A")
        print(f" xpath_match_score: {claim.get('xpath_match_score', 'N/A')}")

    if ppid_profile.get('affiliations'):
        print(f"\n Sample affiliation:")
        aff = ppid_profile['affiliations'][0]
        print(f" org: {aff.get('organization_name', 'N/A')}")
        print(f" title: {aff.get('role_title', 'N/A')}")


def main():
    """Preview the entity→PPID migration on a small sample of files.

    Skips non-human and LinkedIn-less profiles; nothing is written. Paths
    are hard-coded for the local environment (quick-test script only).
    """
    entity_dir = Path('/Users/kempersc/apps/glam/data/custodian/person/entity')
    person_dir = Path('/Users/kempersc/apps/glam/data/person')

    print("=" * 70)
    print(f"QUICK TEST - Migration Preview ({SAMPLE_SIZE} profiles)")
    print("=" * 70)

    # Scan only the first SCAN_LIMIT entity files (the original comment
    # said "first 10" while slicing 50 — constant makes the intent clear).
    entity_files = list(entity_dir.glob('*.json'))[:SCAN_LIMIT]

    processed = 0
    skipped_non_human = 0
    skipped_no_linkedin = 0

    for f in entity_files:
        if processed >= SAMPLE_SIZE:
            break
        try:
            # Context manager so the handle is closed; the original
            # json.load(open(f)) leaked one descriptor per file.
            with f.open(encoding='utf-8') as fh:
                data = json.load(fh)
            name = data.get('profile_data', {}).get('name') or data.get('name', '')

            # Skip non-human profiles
            if not is_human_profile(name, data.get('profile_data', {})):
                skipped_non_human += 1
                continue

            # Check for LinkedIn URL
            linkedin_url = data.get('profile_data', {}).get('linkedin_url')
            if not linkedin_url:
                skipped_no_linkedin += 1
                continue
            slug = extract_linkedin_slug(linkedin_url)

            # Transform to PPID format
            ppid, ppid_profile = transform_entity_to_ppid(data, f)
            output_file = person_dir / f"{ppid}.json"

            processed += 1
            _print_profile_preview(processed, f, name, slug, ppid,
                                   ppid_profile, output_file)
        except Exception as e:
            # Best-effort preview: report the failure and keep going.
            print(f"ERROR processing {f.name}: {e}")

    print(f"\n{'=' * 70}")
    print("QUICK TEST SUMMARY")
    print(f"{'=' * 70}")
    print(f"Profiles shown: {processed}")
    print(f"Skipped (non-human): {skipped_non_human}")
    print(f"Skipped (no LinkedIn): {skipped_no_linkedin}")
    print(f"\nTo run full migration:")
    print(f" python scripts/migrate_entity_to_ppid_v2.py --dry-run")


if __name__ == '__main__':
    main()