# glam/scripts/migrate_entity_to_ppid_v2.py

#!/usr/bin/env python3
"""
Migrate entity profiles from data/custodian/person/entity/ to data/person/

This script:
1. Reads entity profiles that are NOT already in data/person/
2. Filters out non-human profiles (institutions, anonymous LinkedIn members)
3. Generates PPID based on profile data
4. Preserves ALL data including web_claims with XPath provenance
5. Creates proper PPID file in data/person/

Usage:
  python scripts/migrate_entity_to_ppid_v2.py --dry-run --limit 5  # Preview 5 profiles
  python scripts/migrate_entity_to_ppid_v2.py --dry-run            # Preview all
  python scripts/migrate_entity_to_ppid_v2.py                      # Execute migration
"""

import json
import argparse
import re
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime, timezone
from collections import defaultdict
import unicodedata

# Regular expressions that flag a profile *name* as non-human (institutions,
# organisations, anonymous LinkedIn placeholders).
#
# is_human_profile() tests every pattern with re.search(..., re.IGNORECASE), so:
#   * matching is case-insensitive;
#   * a pattern without anchors matches anywhere in the name;
#   * `^` pins the match to the start of the name, `$` to its end.
NON_HUMAN_PATTERNS = [
    r'^LinkedIn\s+Member$',  # anonymous LinkedIn placeholder name (whole-name match)
    r'^TheMuseumsLab$',  # whole-name match
    r'Museum$',  # name ends with "Museum"
    r'Foundation$',  # name ends with "Foundation"
    r'Stichting\s',  # "Stichting" followed by whitespace, anywhere in the name
    r'^ICOM\s',  # name starts with "ICOM "
    r'^Fondazione\s',  # name starts with "Fondazione "
    r'Institute$',  # name ends with "Institute"
    r'Organisation$',  # British spelling, end of name
    r'Organization$',  # American spelling, end of name
    r'University$',  # name ends with "University"
    r'^Google\s',  # name starts with "Google "
    r'^Sound\s+Heritage$',  # whole-name match
    r'^Company\s',  # name starts with "Company "
    r'^Computational\s+Research$',  # whole-name match
]

def extract_linkedin_slug(url):
    """Extract the lower-cased LinkedIn profile slug from a profile URL.

    Args:
        url: A URL expected to contain ``linkedin.com/in/<slug>``. May be
            ``None`` or empty.

    Returns:
        The percent-decoded, lower-cased slug (the first path segment after
        ``linkedin.com/in/``), or ``None`` when ``url`` is empty, is not a
        ``linkedin.com/in/`` URL, or carries no slug after the marker.
    """
    marker = 'linkedin.com/in/'
    if not url or marker not in url:
        return None

    tail = url.split(marker)[-1].strip()

    # Drop the query string and fragment *before* trimming slashes; otherwise
    # ".../in/jane-doe/?trk=x" keeps its trailing slash and yields "jane-doe/",
    # which never matches the "jane-doe" slug of the same person.
    for separator in ('?', '#'):
        tail = tail.split(separator, 1)[0]

    # The slug is the first path segment; anything after it is a sub-page
    # (e.g. "/details/experience") and must not become part of the identity.
    slug = unquote(tail.strip('/').split('/', 1)[0])

    # An empty slug (URL ends right after the marker) is reported as "no slug".
    return slug.lower() or None

def is_human_profile(name, profile_data):
    """Return True when ``name`` looks like a person rather than an organisation.

    Args:
        name: Display name of the profile; empty or ``None`` counts as non-human.
        profile_data: Mapping that may hold a ``linkedin_url`` entry. It is only
            consulted for a name that is exactly ``'LinkedIn Member'``.

    Returns:
        False for a missing name, for a name matching any entry of
        NON_HUMAN_PATTERNS (case-insensitive search), or for an anonymous
        "LinkedIn Member" without a LinkedIn URL; True otherwise.
    """
    if not name:
        return False

    looks_institutional = any(
        re.search(pattern, name, re.IGNORECASE) for pattern in NON_HUMAN_PATTERNS
    )
    if looks_institutional:
        return False

    # Anonymous placeholder without a URL cannot be tied to a real person.
    # NOTE: with the current pattern list, '^LinkedIn\s+Member$' already rejects
    # this name above, so this check only matters if that pattern is removed.
    is_anonymous = name == 'LinkedIn Member' and not profile_data.get('linkedin_url')
    return not is_anonymous

def normalize_name_for_ppid(name):
    """Turn a display name into the PPID name token, e.g. ``FIRST-LAST``.

    Steps: remove known titles/post-nominals, split on whitespace, strip
    diacritics via NFKD decomposition, keep only ASCII letters, upper-case,
    and join the surviving words with ``-``.

    Args:
        name: Display name; may be empty or ``None``.

    Returns:
        The hyphen-joined upper-case token string, or ``"UNKNOWN"`` when the
        name is empty or no ASCII letters survive normalisation.

    Note:
        Title removal is case-insensitive and word-based, so a name word that
        equals one of the abbreviations (for example "Ma") is removed as well.
    """
    if not name:
        return "UNKNOWN"

    # Drop honorifics and degree abbreviations, with an optional trailing dot.
    without_titles = re.sub(
        r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr|PSM|GIA|GG)\b\.?',
        '',
        name,
        flags=re.IGNORECASE,
    )

    tokens = []
    for word in without_titles.split():
        # NFKD splits accented letters into base letter + combining mark;
        # discarding the marks leaves the plain base letter.
        decomposed = unicodedata.normalize('NFKD', word)
        base_letters = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
        token = re.sub(r'[^A-Za-z]', '', base_letters).upper()
        if token:
            tokens.append(token)

    return '-'.join(tokens) if tokens else "UNKNOWN"

def generate_ppid(name):
    """Build a PPID string for ``name`` with placeholder locations and dates.

    The identifier has six underscore-separated segments: the literal ``ID``,
    birth location, birth date, current location, death date, and the name
    token from normalize_name_for_ppid(). All location/date segments are
    fixed ``X`` placeholders here.

    Args:
        name: Display name of the person.

    Returns:
        The PPID string, e.g. ``ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_JANE-DOE``.
    """
    unknown_location = "XX-XX-XXX"
    unknown_date = "XXXX"

    segments = (
        "ID",
        unknown_location,  # birth location
        unknown_date,      # birth date
        unknown_location,  # current location
        unknown_date,      # death date
        normalize_name_for_ppid(name),
    )
    return "_".join(segments)

def transform_entity_to_ppid(entity_data, entity_file):
    """Convert one entity profile into a PPID profile, keeping all source data.

    Args:
        entity_data: Parsed entity JSON (a dict).
        entity_file: Path of the source entity file; only its ``name`` is
            recorded in the migration metadata.

    Returns:
        A ``(ppid, ppid_profile)`` tuple: the generated identifier and the
        profile dict to be serialised. Source sections (affiliations,
        profile_data, web_claims, source_observations, extraction_metadata)
        are passed through as-is, defaulting to an empty list/dict when the
        key is absent.
    """
    # Prefer the name inside profile_data, fall back to the top-level name.
    name = entity_data.get('profile_data', {}).get('name') or entity_data.get('name', 'Unknown')
    ppid = generate_ppid(name)

    # PPID identification components (locations/dates are still placeholders).
    components = {
        "type": "ID",
        "first_location": "XX-XX-XXX",
        "first_date": "XXXX",
        "last_location": "XX-XX-XXX",
        "last_date": "XXXX",
        "name_tokens": normalize_name_for_ppid(name).split('-'),
    }

    placeholder_birth_date = {
        "edtf": "XXXX",
        "precision": "unknown",
        "note": "Not yet enriched - requires manual research",
    }

    # Used only when the source has no 'heritage_relevance' key at all;
    # defaults to relevant because the profile comes from a custodian context.
    fallback_relevance = {
        "is_heritage_relevant": True,
        "heritage_types": [],
        "rationale": "Extracted from heritage custodian LinkedIn page",
    }

    migration_info = {
        "original_entity_file": entity_file.name,
        "original_person_id": entity_data.get('person_id'),
        "original_linkedin_slug": entity_data.get('linkedin_slug'),
        "migrated_at": datetime.now(timezone.utc).isoformat(),
        "migration_script": "migrate_entity_to_ppid_v2.py",
        "migration_version": "2.0",
    }

    # Keys are inserted in the order they should appear in the output JSON.
    ppid_profile = {
        "ppid": ppid,
        "ppid_type": "ID",
        "ppid_components": components,
        "name": name,
        "birth_date": placeholder_birth_date,
        "is_living": True,
        "heritage_relevance": entity_data.get('heritage_relevance', fallback_relevance),
    }

    # Source sections carried over untouched (web_claims keep their provenance).
    passthrough_sections = (
        ('affiliations', list),
        ('profile_data', dict),
        ('web_claims', list),
        ('source_observations', list),
        ('extraction_metadata', dict),
    )
    for key, make_empty in passthrough_sections:
        ppid_profile[key] = entity_data.get(key, make_empty())

    ppid_profile["migration_metadata"] = migration_info

    return ppid, ppid_profile

def _read_json(path):
    """Load and return the JSON document stored at ``path`` (read as UTF-8)."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


def _pick_free_output(person_dir, ppid, reserved_names):
    """Return ``(ppid, path, collided)`` for the first unused ``<ppid>[-N].json``.

    A name counts as used when the file exists on disk or is listed in
    ``reserved_names`` (names already assigned earlier in the same run).
    """
    candidate = ppid
    path = person_dir / f"{candidate}.json"
    suffix = 0
    while path.exists() or path.name in reserved_names:
        suffix += 1
        candidate = f"{ppid}-{suffix}"
        path = person_dir / f"{candidate}.json"
    return candidate, path, suffix > 0


def main():
    """Run the entity-to-PPID migration from the command line.

    Phases:
      1. Collect the LinkedIn slugs of profiles already present in the
         person directory.
      2. Scan the entity directory and select human profiles with a LinkedIn
         slug that is not present yet.
      3. Transform each selected profile and write it to the person
         directory (skipped with ``--dry-run``).

    Options: ``--dry-run``, ``--limit N``, ``--verbose``, and the optional
    ``--entity-dir`` / ``--person-dir`` overrides (defaults are the original
    hard-coded locations). Files that cannot be read or parsed are counted
    and reported instead of being silently ignored.
    """
    parser = argparse.ArgumentParser(description='Migrate entity profiles to PPID format (v2)')
    parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of profiles to process')
    parser.add_argument('--verbose', action='store_true', help='Show detailed output for each profile')
    parser.add_argument(
        '--entity-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian/person/entity'),
        help='Directory containing the source entity profiles',
    )
    parser.add_argument(
        '--person-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/person'),
        help='Directory containing (and receiving) PPID profiles',
    )
    args = parser.parse_args()

    # A negative limit would silently slice from the end of the list.
    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be zero or a positive integer')

    entity_dir = args.entity_dir
    person_dir = args.person_dir

    # 1. Get existing LinkedIn slugs in data/person/
    print("=" * 60)
    print("PPID MIGRATION SCRIPT v2.0")
    print("=" * 60)
    print("\nPhase 1: Loading existing PPID profiles...")
    existing_slugs = set()
    unreadable_existing = 0
    for person_file in person_dir.glob('ID_*.json'):
        try:
            data = _read_json(person_file)
            url = data['profile_data'].get('linkedin_url') if 'profile_data' in data else None
            slug = extract_linkedin_slug(url)
        except Exception as exc:  # best-effort per file: report and keep going
            unreadable_existing += 1
            if args.verbose:
                print(f"  WARNING: could not read {person_file.name}: {exc}")
            continue
        if slug:
            existing_slugs.add(slug)

    print(f"  Found {len(existing_slugs):,} existing LinkedIn slugs in data/person/")
    if unreadable_existing:
        # Unreadable files contribute no slug, so their person may be migrated again.
        print(f"  WARNING: {unreadable_existing:,} existing profile(s) could not be read")

    # 2. Find entity profiles NOT in data/person/
    print("\nPhase 2: Scanning entity profiles...")
    to_migrate = []
    skipped_existing = 0
    skipped_no_linkedin = 0
    skipped_non_human = 0
    scan_errors = 0

    entity_files = list(entity_dir.glob('*.json'))
    print(f"  Found {len(entity_files):,} entity files to scan")

    for entity_file in entity_files:
        try:
            data = _read_json(entity_file)
            profile = data.get('profile_data', {})
            name = profile.get('name') or data.get('name', '')

            # Skip non-human profiles
            if not is_human_profile(name, profile):
                skipped_non_human += 1
                continue

            slug = extract_linkedin_slug(profile.get('linkedin_url'))
        except Exception as exc:  # best-effort per file: report and keep going
            scan_errors += 1
            if args.verbose:
                print(f"  WARNING: could not scan {entity_file.name}: {exc}")
            continue

        if not slug:
            # Missing URL, or a URL that is not a linkedin.com/in/ profile link.
            skipped_no_linkedin += 1
        elif slug in existing_slugs:
            skipped_existing += 1
        else:
            to_migrate.append((entity_file, data, slug))

    print("\n  Scan Results:")
    print(f"    Already in PPID: {skipped_existing:,}")
    print(f"    Skipped (non-human): {skipped_non_human:,}")
    print(f"    Skipped (no LinkedIn): {skipped_no_linkedin:,}")
    print(f"    Skipped (unreadable/malformed): {scan_errors:,}")
    print(f"    TO MIGRATE: {len(to_migrate):,}")

    # `is not None` so that an explicit --limit 0 processes nothing instead of
    # being mistaken for "no limit".
    if args.limit is not None:
        to_migrate = to_migrate[:args.limit]
        print(f"\n  Limited to {args.limit} profiles for this run")

    # 3. Migrate profiles
    print("\nPhase 3: Migrating profiles...")
    migrated = 0
    errors = 0
    collision_count = 0
    # File names handed out during this run. Needed for --dry-run, where
    # nothing is written and an on-disk check alone would give every
    # same-named profile the same file name.
    reserved_names = set()

    for entity_file, data, slug in to_migrate:
        try:
            ppid, ppid_profile = transform_entity_to_ppid(data, entity_file)

            # Handle collisions with counter suffix (counted once per profile)
            ppid, output_file, collided = _pick_free_output(person_dir, ppid, reserved_names)
            if collided:
                collision_count += 1
                ppid_profile['ppid'] = ppid

            name = ppid_profile['name']
            web_claims_count = len(ppid_profile.get('web_claims', []))
            affiliations_count = len(ppid_profile.get('affiliations', []))

            if args.verbose or args.dry_run:
                print(f"\n  {'[DRY-RUN] ' if args.dry_run else ''}Creating: {output_file.name}")
                print(f"    Name: {name}")
                print(f"    LinkedIn slug: {slug}")
                print(f"    Web claims: {web_claims_count}")
                print(f"    Affiliations: {affiliations_count}")
                if ppid_profile.get('source_observations'):
                    print(f"    Source observations: {len(ppid_profile['source_observations'])}")

            if not args.dry_run:
                # ensure_ascii=False emits raw non-ASCII characters, so the
                # file must be written as UTF-8 regardless of the locale.
                with open(output_file, 'w', encoding='utf-8') as out:
                    json.dump(ppid_profile, out, indent=2, ensure_ascii=False)

            reserved_names.add(output_file.name)
            migrated += 1

        except Exception as e:
            print(f"  ERROR processing {entity_file.name}: {e}")
            errors += 1

    # Summary
    print("\n" + "=" * 60)
    print(f"{'DRY RUN ' if args.dry_run else ''}MIGRATION SUMMARY")
    print("=" * 60)
    print(f"  Profiles migrated: {migrated:,}")
    print(f"  Name collisions resolved: {collision_count}")
    print(f"  Errors: {errors}")

    if args.dry_run:
        print("\n  To execute migration, run without --dry-run flag")
    else:
        print("\n  Migration complete!")
        print(f"  New profile count: {sum(1 for _ in person_dir.glob('ID_*.json')):,}")

if __name__ == '__main__':
    main()