# glam/scripts/migrate_entity_to_ppid_v4.py
# (418 lines, 18 KiB, Python)
#!/usr/bin/env python3
"""
Migrate entity profiles from data/custodian/person/entity/ to data/person/
This script (v4) processes ALL entries:
1. NO filtering - every profile is migrated
2. Adds classification tags indicating human vs institution likelihood
3. Handles collisions with UUID suffix (not counter)
4. Preserves ALL data with full provenance
Usage:
python scripts/migrate_entity_to_ppid_v4.py --dry-run --limit 100 # Preview 100 profiles
python scripts/migrate_entity_to_ppid_v4.py --dry-run # Preview all
python scripts/migrate_entity_to_ppid_v4.py # Execute migration
"""
import argparse
import json
import re
import unicodedata
import uuid
from datetime import datetime, timezone
from multiprocessing import Pool
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# Patterns that suggest this might be an INSTITUTION (not a person).
# Each entry: (regex, indicator_type, human-readable reason).
# Matched case-insensitively (suffixes like "museum"/"Museum" both count).
INSTITUTION_INDICATORS = [
    (r'^Company\s+name\s', 'parsing_artifact', 'Profile name starts with "Company name" - likely parsing artifact'),
    (r'^Stichting\s', 'dutch_foundation', 'Dutch foundation (Stichting)'),
    (r'^Fondazione\s', 'italian_foundation', 'Italian foundation (Fondazione)'),
    (r'^ICOM\s', 'icom_organization', 'ICOM organization'),
    (r'^Google\s', 'company_profile', 'Google company profile'),
    (r'^TheMuseumsLab$', 'organization', 'TheMuseumsLab organization'),
    (r'^Sound\s+Heritage$', 'organization', 'Sound Heritage organization'),
    (r'^Computational\s+Research$', 'organization', 'Computational Research organization'),
    (r'Museum$', 'museum_suffix', 'Name ends with "Museum" - likely institution'),
    (r'Foundation$', 'foundation_suffix', 'Name ends with "Foundation" - likely institution'),
    (r'Institute$', 'institute_suffix', 'Name ends with "Institute" - likely institution'),
    (r'Organisation$', 'org_suffix', 'Name ends with "Organisation" - likely institution'),
    (r'Organization$', 'org_suffix', 'Name ends with "Organization" - likely institution'),
    (r'University$', 'university_suffix', 'Name ends with "University" - likely institution'),
    (r'Library$', 'library_suffix', 'Name ends with "Library" - likely institution'),
    (r'Archive$', 'archive_suffix', 'Name ends with "Archive" - likely institution'),
    (r'Archief$', 'archive_suffix', 'Name ends with "Archief" (Dutch archive) - likely institution'),
    (r'Bibliotheek$', 'library_suffix', 'Name ends with "Bibliotheek" (Dutch library) - likely institution'),
]
# Patterns that suggest this is a PERSON.
# NOTE: these patterns encode capitalization explicitly (e.g. ^[A-Z][a-z]+),
# so they are matched case-SENSITIVELY (see classify_profile below).
PERSON_INDICATORS = [
    (r'^(Dr|Prof|Mr|Mrs|Ms|Drs|Ir|Ing)\.\s', 'title_prefix', 'Has personal title prefix'),
    (r'\s(PhD|MA|MSc|MBA|BSc|Jr|Sr)$', 'degree_suffix', 'Has degree/suffix'),
    (r'^[A-Z][a-z]+\s+[A-Z][a-z]+$', 'two_word_name', 'Simple two-word personal name pattern'),
    (r'^[A-Z][a-z]+\s+(van|de|den|der|von|van der|van den|van de)\s+[A-Z]', 'dutch_name', 'Dutch personal name with particle'),
]


def classify_profile(name: str, profile_data: Dict) -> Dict[str, Any]:
    """Classify profile as human, institution, anonymous, or unknown.

    Args:
        name: display name from the profile (may be empty or 'LinkedIn Member').
        profile_data: profile dict; 'headline' and 'linkedin_url' are consulted.

    Returns classification dict with:
    - primary_classification: 'human', 'institution', 'anonymous', 'unknown'
    - confidence: 0.0-1.0
    - indicators: list of matched patterns
    - reasoning: human-readable explanation
    """
    # No name at all: nothing to classify on.
    if not name:
        return {
            'primary_classification': 'unknown',
            'confidence': 0.0,
            'indicators': [{'type': 'empty_name', 'reason': 'Name field is empty'}],
            'reasoning': 'Cannot classify - name is empty'
        }
    # LinkedIn's privacy placeholder: a real person whose name is hidden.
    if name == 'LinkedIn Member':
        headline = profile_data.get('headline', '')
        return {
            'primary_classification': 'anonymous',
            'confidence': 0.9,
            'indicators': [
                {'type': 'linkedin_member', 'reason': 'LinkedIn privacy settings hide real name'},
                {'type': 'has_headline', 'value': headline[:50] if headline else None},
            ],
            'reasoning': f'Anonymous LinkedIn profile with privacy settings. Has headline: {bool(headline)}'
        }
    institution_matches = []
    person_matches = []
    # Institution indicators: case-insensitive so "museum"/"Museum" both hit.
    for pattern, indicator_type, reason in INSTITUTION_INDICATORS:
        if re.search(pattern, name, re.IGNORECASE):
            institution_matches.append({
                'type': indicator_type,
                'pattern': pattern,
                'reason': reason
            })
    # Person indicators: case-SENSITIVE. The name-shape patterns rely on
    # capitalization ([A-Z][a-z]+); with IGNORECASE any two alphabetic words
    # (e.g. "google photos") would falsely register as a personal name.
    for pattern, indicator_type, reason in PERSON_INDICATORS:
        if re.search(pattern, name):
            person_matches.append({
                'type': indicator_type,
                'pattern': pattern,
                'reason': reason
            })
    # A personal LinkedIn /in/ URL is a strong person signal.
    linkedin_url = profile_data.get('linkedin_url', '')
    if linkedin_url and '/in/' in linkedin_url:
        person_matches.append({
            'type': 'personal_linkedin_url',
            'reason': 'Has personal LinkedIn /in/ URL'
        })
    # Determine classification: confidence scales with the match count,
    # capped below 1.0 since these are heuristics.
    if institution_matches and not person_matches:
        return {
            'primary_classification': 'institution',
            'confidence': min(0.5 + 0.1 * len(institution_matches), 0.9),
            'indicators': institution_matches,
            'reasoning': f'Matched {len(institution_matches)} institution pattern(s), no person patterns'
        }
    elif person_matches and not institution_matches:
        return {
            'primary_classification': 'human',
            'confidence': min(0.5 + 0.15 * len(person_matches), 0.95),
            'indicators': person_matches,
            'reasoning': f'Matched {len(person_matches)} person pattern(s), no institution patterns'
        }
    elif person_matches and institution_matches:
        # Conflicting signals - personal LinkedIn URL wins
        if any(i['type'] == 'personal_linkedin_url' for i in person_matches):
            return {
                'primary_classification': 'human',
                'confidence': 0.7,
                'indicators': person_matches + institution_matches,
                'reasoning': f'Conflicting patterns but has personal LinkedIn URL - likely human'
            }
        return {
            'primary_classification': 'unknown',
            'confidence': 0.3,
            'indicators': person_matches + institution_matches,
            'reasoning': f'Conflicting patterns: {len(person_matches)} person, {len(institution_matches)} institution'
        }
    else:
        # No patterns matched - assume human (most profiles are people)
        return {
            'primary_classification': 'human',
            'confidence': 0.6,
            'indicators': [{'type': 'default', 'reason': 'No specific patterns matched, defaulting to human'}],
            'reasoning': 'No specific patterns matched - assuming human (default)'
        }
def normalize_name_for_ppid(name: str) -> str:
    """Convert a display name to PPID token format: FIRST-LAST.

    Strips common titles/degrees (Dr, Prof, PhD, ...), transliterates
    accented characters to ASCII via NFKD decomposition, uppercases, and
    joins the surviving tokens with '-'. Returns "UNKNOWN" when nothing
    usable remains (empty input, punctuation-only, titles-only).
    """
    if not name:
        return "UNKNOWN"
    # Drop titles and degree suffixes anywhere in the name (optional trailing dot).
    name = re.sub(r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr|PSM|GIA|GG)\b\.?', '', name, flags=re.IGNORECASE)
    parts = [p.strip() for p in name.split() if p.strip()]
    if not parts:
        return "UNKNOWN"

    def normalize_part(p: str) -> str:
        # NFKD splits accented chars into base + combining marks; dropping the
        # combining marks yields an ASCII approximation (e.g. 'é' -> 'e').
        nfkd = unicodedata.normalize('NFKD', p)
        ascii_name = ''.join(c for c in nfkd if not unicodedata.combining(c))
        return re.sub(r'[^A-Za-z0-9]', '', ascii_name).upper()

    # Normalize each part once (the old comprehension called normalize_part
    # twice per part) and drop parts that reduce to the empty string.
    normalized = [token for token in map(normalize_part, parts) if token]
    return '-'.join(normalized) if normalized else "UNKNOWN"
def generate_ppid(name: str, entity_data: Optional[Dict] = None) -> str:
    """Generate PPID from name (locations/dates use XX placeholders).

    For 'LinkedIn Member' profiles, derive a unique-ish ANON token from the
    first affiliation's organization plus up to two headline words, since the
    real name is hidden by LinkedIn privacy settings.

    Args:
        name: display name (possibly 'LinkedIn Member').
        entity_data: full entity record; only 'affiliations' and
            'profile_data.headline' are consulted, and only for anonymous
            profiles. Optional (was annotated plain Dict despite the None
            default).

    Returns:
        PPID string of the form ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_<NAME-TOKEN>.
    """
    if name == 'LinkedIn Member' and entity_data:
        affiliations = entity_data.get('affiliations', [])
        headline = entity_data.get('profile_data', {}).get('headline', '') if entity_data.get('profile_data') else ''
        if affiliations and isinstance(affiliations, list) and len(affiliations) > 0:
            first = affiliations[0]
            # Affiliation entries are expected to be dicts; fall back rather
            # than crash on malformed source data.
            org = first.get('custodian_name', 'UNKNOWN-ORG') if isinstance(first, dict) else 'UNKNOWN-ORG'
            org_token = normalize_name_for_ppid(org)[:20]
            if headline:
                # Take up to the first 3 headline words, keep those that
                # normalize to more than 2 chars, use at most 2 as the role.
                role_words = []
                for word in headline.split()[:3]:
                    normalized = normalize_name_for_ppid(word)
                    if normalized and len(normalized) > 2:
                        role_words.append(normalized)
                role_token = '-'.join(role_words[:2]) if role_words else 'STAFF'
            else:
                role_token = 'STAFF'
            name_token = f"ANON-{org_token}-{role_token[:15]}"
        else:
            # Anonymous with no affiliation context: nothing to distinguish on.
            name_token = "LINKEDIN-MEMBER"
    else:
        name_token = normalize_name_for_ppid(name)
    # Location/date slots are placeholders until enrichment fills them in.
    return f"ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_{name_token}"
def transform_entity_to_ppid(entity_data: Dict, entity_file_name: str) -> Tuple[str, Dict]:
    """Transform entity profile to PPID format, preserving ALL data.

    Args:
        entity_data: parsed entity JSON record.
        entity_file_name: source file name, recorded in migration_metadata.

    Returns:
        (ppid, ppid_profile) - the generated PPID and the full output document.
    """
    # Fetch profile_data once (the original re-fetched it three times) and
    # tolerate a null/malformed value in the source JSON instead of crashing
    # on .get() of None.
    raw_profile = entity_data.get('profile_data')
    profile_data = raw_profile if isinstance(raw_profile, dict) else {}
    name = profile_data.get('name') or entity_data.get('name', 'Unknown')
    ppid = generate_ppid(name, entity_data)
    # Merge top-level affiliations into a copy of profile_data so the
    # classifier can see them; the stored profile_data stays unmerged.
    profile_data_for_classification = {**profile_data}
    if 'affiliations' not in profile_data_for_classification:
        profile_data_for_classification['affiliations'] = entity_data.get('affiliations', [])
    classification = classify_profile(name, profile_data_for_classification)
    is_anonymous = (name == 'LinkedIn Member')
    # Anonymous PPIDs embed org/role context, so take tokens from the PPID
    # itself; otherwise re-derive them from the normalized name.
    if is_anonymous:
        name_tokens = ppid.split('_')[-1].split('-')
    else:
        name_tokens = normalize_name_for_ppid(name).split('-')
    ppid_profile = {
        "ppid": ppid,
        "ppid_type": "ID",
        "ppid_components": {
            "type": "ID",
            # XX placeholders: locations/dates await manual enrichment.
            "first_location": "XX-XX-XXX",
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX",
            "last_date": "XXXX",
            "name_tokens": name_tokens
        },
        "name": name,
        "birth_date": {
            "edtf": "XXXX",
            "precision": "unknown",
            "note": "Not yet enriched - requires manual research"
        },
        "is_living": True,
        "is_anonymous": is_anonymous,
        # Classification tags (the key feature of v4)
        "profile_classification": classification,
        "heritage_relevance": entity_data.get('heritage_relevance', {
            "is_heritage_relevant": True,
            "heritage_types": [],
            "rationale": "Extracted from heritage custodian LinkedIn page"
        }),
        "affiliations": entity_data.get('affiliations', []),
        "profile_data": profile_data,
        "web_claims": entity_data.get('web_claims', []),
        "source_observations": entity_data.get('source_observations', []),
        "extraction_metadata": entity_data.get('extraction_metadata', {}),
        # Full provenance back to the source entity file.
        "migration_metadata": {
            "original_entity_file": entity_file_name,
            "original_person_id": entity_data.get('person_id'),
            "original_linkedin_slug": entity_data.get('linkedin_slug'),
            "migrated_at": datetime.now(timezone.utc).isoformat(),
            "migration_script": "migrate_entity_to_ppid_v4.py",
            "migration_version": "4.0"
        }
    }
    return ppid, ppid_profile
def process_entity_file(args):
    """Process a single entity file (worker entry point for Pool.map).

    Args:
        args: (entity_file_path, existing_ppids_set, person_dir, dry_run)
            packed as one tuple because Pool.map passes a single argument.

    Returns:
        ('migrated', output_ppid, classification, file_path) on success, or
        ('error', error_message, 'error', file_path) on any failure.

    NOTE(review): existing_ppids_set is a snapshot taken when the task args
    were built, so two files that collide with EACH OTHER in the same run may
    both skip the UUID suffix; only collisions with pre-existing files are
    reliably detected - confirm whether that matters for this dataset.
    """
    entity_file_path, existing_ppids_set, person_dir, dry_run = args
    try:
        # Explicit UTF-8: source and output are JSON with non-ASCII names;
        # relying on the platform default encoding can corrupt or fail.
        with open(entity_file_path, encoding='utf-8') as f:
            data = json.load(f)
        # NO FILTERING - process everything
        ppid, ppid_profile = transform_entity_to_ppid(data, Path(entity_file_path).name)
        classification = ppid_profile['profile_classification']['primary_classification']
        # Check if already exists - add UUID suffix for collision
        output_ppid = ppid
        if ppid in existing_ppids_set:
            # Short (8-char) uuid4 prefix; lowercase hex, so the caller can
            # distinguish it from uppercase name tokens.
            short_uuid = str(uuid.uuid4())[:8]
            output_ppid = f"{ppid}-{short_uuid}"
            ppid_profile['ppid'] = output_ppid
            ppid_profile['ppid_components']['collision_uuid'] = short_uuid
        output_file = Path(person_dir) / f"{output_ppid}.json"
        if not dry_run:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(ppid_profile, f, indent=2, ensure_ascii=False)
        return ('migrated', output_ppid, classification, str(entity_file_path))
    except Exception as e:
        # Best-effort by design: one bad file must not abort the whole batch;
        # the caller prints the error string and counts it.
        return ('error', str(e), 'error', str(entity_file_path))
def main():
    """CLI entry point: migrate every entity profile to a PPID file.

    Phases:
      1. Index existing PPID filenames (baseline for collision detection).
      2. List source entity files (optionally truncated by --limit).
      3. Process in batches of 1000 via a worker pool, printing progress.
    Ends with a summary of counts, classifications, and collisions.
    """
    parser = argparse.ArgumentParser(description='Migrate entity profiles to PPID format (v4 - tag everything)')
    parser.add_argument('--dry-run', action='store_true', help='Preview only, no file changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of profiles to process')
    parser.add_argument('--workers', type=int, default=4, help='Number of parallel workers')
    parser.add_argument('--verbose', action='store_true', help='Show each migrated file')
    # Paths were hard-coded absolute paths; expose them as options with the
    # old values as defaults so existing invocations keep working.
    parser.add_argument('--entity-dir', default='/Users/kempersc/apps/glam/data/custodian/person/entity',
                        help='Source directory containing entity JSON files')
    parser.add_argument('--person-dir', default='/Users/kempersc/apps/glam/data/person',
                        help='Destination directory for PPID JSON files')
    args = parser.parse_args()
    entity_dir = Path(args.entity_dir)
    person_dir = Path(args.person_dir)
    print("=" * 70)
    print("PPID MIGRATION SCRIPT v4.0 (Tag Everything, No Filtering)")
    print("=" * 70)
    # Phase 1: existing PPID file stems seed the collision set.
    print("\nPhase 1: Indexing existing PPID files...")
    existing_ppids = {f.stem for f in person_dir.glob('ID_*.json')}
    print(f" Found {len(existing_ppids):,} existing PPID files")
    # Phase 2: List entity files
    print("\nPhase 2: Listing entity files...")
    entity_files = list(entity_dir.glob('*.json'))
    total_entity = len(entity_files)
    print(f" Found {total_entity:,} entity files")
    if args.limit:
        entity_files = entity_files[:args.limit]
        print(f" Limited to {args.limit} files for this run")
    # Phase 3: Process files
    print(f"\nPhase 3: Processing ALL files (workers={args.workers}, dry_run={args.dry_run})...")
    print(" Note: NO filtering - all profiles are migrated with classification tags")
    process_args = [
        (str(f), existing_ppids, str(person_dir), args.dry_run)
        for f in entity_files
    ]
    results = {'migrated': 0, 'error': 0}
    classifications = {'human': 0, 'institution': 0, 'anonymous': 0, 'unknown': 0}
    collisions = 0
    samples = []
    batch_size = 1000
    for batch_start in range(0, len(process_args), batch_size):
        batch_end = min(batch_start + batch_size, len(process_args))
        batch = process_args[batch_start:batch_end]
        with Pool(args.workers) as pool:
            batch_results = pool.map(process_entity_file, batch)
        for status, ppid_or_error, classification, file_path in batch_results:
            if status == 'migrated':
                results['migrated'] += 1
                classifications[classification] = classifications.get(classification, 0) + 1
                # A collision suffix is the first 8 chars of uuid4() - always
                # lowercase hex - while name tokens are uppercase, so an exact
                # lowercase-hex match on the last '-' token is unambiguous.
                # (The old length-8 heuristic miscounted names whose final
                # token happened to be 8 characters, e.g. ...-WILLIAMS.)
                if re.fullmatch(r'[0-9a-f]{8}', ppid_or_error.rsplit('-', 1)[-1]):
                    collisions += 1
                existing_ppids.add(ppid_or_error)
                if args.verbose:
                    # --verbose was parsed but never used; honor its help text.
                    print(f"   migrated: {ppid_or_error} <- {Path(file_path).name}")
                if len(samples) < 5:
                    samples.append((ppid_or_error, classification, Path(file_path).name))
            else:
                results['error'] += 1
                print(f" ERROR: {file_path}: {ppid_or_error}")
        processed = batch_end
        pct = (processed / len(process_args)) * 100
        print(f" Progress: {processed:,}/{len(process_args):,} ({pct:.1f}%) - "
              f"H:{classifications['human']:,} I:{classifications['institution']:,} "
              f"A:{classifications['anonymous']:,} U:{classifications['unknown']:,} "
              f"Collisions:{collisions}")
    # Summary
    print("\n" + "=" * 70)
    print(f"{'DRY RUN ' if args.dry_run else ''}MIGRATION SUMMARY")
    print("=" * 70)
    print(f" Total processed: {results['migrated'] + results['error']:,}")
    print(f" Successfully migrated: {results['migrated']:,}")
    print(f" Errors: {results['error']}")
    print(f" Collisions (UUID suffix added): {collisions}")
    print(f"\n Classification breakdown:")
    print(f" Human: {classifications['human']:,}")
    print(f" Institution: {classifications['institution']:,}")
    print(f" Anonymous: {classifications['anonymous']:,}")
    print(f" Unknown: {classifications['unknown']:,}")
    if samples:
        print(f"\n Sample migrated profiles:")
        for ppid, classification, source in samples:
            print(f" [{classification:11}] {ppid[:60]}... <- {source[:40]}...")
    if args.dry_run:
        print(f"\n To execute migration, run without --dry-run flag")
    else:
        final_count = len(list(person_dir.glob('ID_*.json')))
        print(f"\n Migration complete!")
        print(f" Final PPID count: {final_count:,}")
# Script entry point: run the migration CLI.
if __name__ == '__main__':
    main()