# glam/scripts/migrate_entity_quick_test.py

#!/usr/bin/env python3
"""
Quick test migration script - processes a small sample without pre-scanning all files.
Use for validation before running full migration.
"""

import json
import re
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime, timezone
import unicodedata

# Patterns for detecting non-human profiles.
# Each entry is a regular expression that is_human_profile() tries against the
# profile name with re.search(..., re.IGNORECASE). A leading '^' pins the match
# to the start of the name, a trailing '$' to its end; a pattern with neither
# anchor matches anywhere in the name.
NON_HUMAN_PATTERNS = [
    r'^LinkedIn\s+Member$',  # the whole name is "LinkedIn Member"
    r'^TheMuseumsLab$',
    r'Museum$',  # name ends with "Museum"
    r'Foundation$',
    r'Stichting\s',  # unanchored: "Stichting" followed by whitespace, anywhere
    r'^ICOM\s',
    r'^Fondazione\s',
    r'Institute$',
    r'Organisation$',
    r'Organization$',
    r'University$',
    r'^Google\s',
    r'^Sound\s+Heritage$',
    r'^Company\s',
    r'^Computational\s+Research$',
]

def extract_linkedin_slug(url):
    """Extract the lowercase LinkedIn profile slug from a profile URL.

    Args:
        url: A URL expected to contain ``linkedin.com/in/<slug>``. May be
            ``None`` or empty.

    Returns:
        The percent-decoded, lowercased slug (the first path segment after
        ``linkedin.com/in/``), or ``None`` when ``url`` is falsy or does not
        contain ``linkedin.com/in/``.
    """
    marker = 'linkedin.com/in/'
    if not url or marker not in url:
        return None

    tail = url.split(marker)[-1]

    # Remove the query string and fragment BEFORE trimming slashes; doing it
    # the other way round leaves ".../in/jane-doe/?trk=x" as "jane-doe/".
    for separator in ('?', '#'):
        tail = tail.split(separator, 1)[0]

    # Only the first path segment is the slug; anything after it is a
    # sub-page such as "/details/experience".
    slug = tail.strip('/').split('/', 1)[0]

    # Decode last so an encoded "/", "?" or "#" inside the slug is not
    # mistaken for a URL delimiter above.
    return unquote(slug).lower()

def is_human_profile(name, profile_data):
    """Decide whether a profile name looks like a person rather than an institution.

    Args:
        name: Display name of the profile. A falsy name is never human.
        profile_data: Profile dictionary of the entity. Currently not
            consulted; kept so existing callers keep working.

    Returns:
        ``False`` when ``name`` is empty or matches any expression in
        ``NON_HUMAN_PATTERNS`` (case-insensitive search), ``True`` otherwise.
    """
    if not name:
        return False

    # The name "LinkedIn Member" needs no special case here: the
    # r'^LinkedIn\s+Member$' entry in NON_HUMAN_PATTERNS already rejects it,
    # which made the former URL-dependent check for that name unreachable.
    return not any(
        re.search(pattern, name, re.IGNORECASE)
        for pattern in NON_HUMAN_PATTERNS
    )

def normalize_name_for_ppid(name):
    """Build the name token of a PPID: upper-case ASCII words joined by '-'.

    Title and degree abbreviations (Dr, Prof, PhD, MSc, ...) are removed
    case-insensitively as whole words, the remainder is split on whitespace,
    and every word is reduced to its A-Z letters after NFKD decomposition has
    separated diacritics from their base characters. Words that end up empty
    are skipped.

    Returns:
        The joined token, or "UNKNOWN" when ``name`` is falsy or no letters
        survive the clean-up.
    """
    if not name:
        return "UNKNOWN"

    title_pattern = r'\b(Dr|Prof|Mr|Mrs|Ms|PhD|MA|MSc|MBA|BSc|Jr|Sr|PSM|GIA|GG)\b\.?'
    without_titles = re.sub(title_pattern, '', name, flags=re.IGNORECASE)

    tokens = []
    for word in without_titles.split():
        decomposed = unicodedata.normalize('NFKD', word)
        # Drop combining marks so accented letters fall back to their base letter.
        base_chars = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
        token = re.sub(r'[^A-Za-z]', '', base_chars).upper()
        if token:
            tokens.append(token)

    if not tokens:
        return "UNKNOWN"
    return '-'.join(tokens)

def generate_ppid(name):
    """Return the PPID for ``name``.

    Only the trailing name token is derived from the input; the location and
    date components are fixed "XX" placeholders.
    """
    placeholder_prefix = "ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_"
    return placeholder_prefix + normalize_name_for_ppid(name)

def transform_entity_to_ppid(entity_data, entity_file):
    """Convert a loaded entity record into a PPID profile dictionary.

    The name is taken from ``profile_data['name']``, falling back to the
    top-level ``name`` and finally to 'Unknown'. The heritage relevance,
    affiliations, profile data, web claims, source observations and
    extraction metadata sections are carried over as found (with empty
    defaults when absent). Location, date and birth-date fields are filled
    with "XX"/"XXXX" placeholders.

    Args:
        entity_data: Dictionary parsed from an entity JSON file.
        entity_file: Object exposing a ``name`` attribute (the source file),
            recorded in the migration metadata.

    Returns:
        A ``(ppid, ppid_profile)`` tuple.
    """
    profile = entity_data.get('profile_data', {})
    name = profile.get('name') or entity_data.get('name', 'Unknown')
    ppid = generate_ppid(name)

    components = {
        "type": "ID",
        "first_location": "XX-XX-XXX",
        "first_date": "XXXX",
        "last_location": "XX-XX-XXX",
        "last_date": "XXXX",
        "name_tokens": normalize_name_for_ppid(name).split('-'),
    }

    unknown_birth_date = {
        "edtf": "XXXX",
        "precision": "unknown",
        "note": "Not yet enriched - requires manual research",
    }

    # Used only when the entity carries no heritage_relevance section itself.
    fallback_relevance = {
        "is_heritage_relevant": True,
        "heritage_types": [],
        "rationale": "Extracted from heritage custodian LinkedIn page",
    }

    migration_info = {
        "original_entity_file": entity_file.name,
        "original_person_id": entity_data.get('person_id'),
        "original_linkedin_slug": entity_data.get('linkedin_slug'),
        "migrated_at": datetime.now(timezone.utc).isoformat(),
        "migration_script": "migrate_entity_to_ppid_v2.py",
        "migration_version": "2.0",
    }

    ppid_profile = {
        "ppid": ppid,
        "ppid_type": "ID",
        "ppid_components": components,
        "name": name,
        "birth_date": unknown_birth_date,
        "is_living": True,
        "heritage_relevance": entity_data.get('heritage_relevance', fallback_relevance),
        "affiliations": entity_data.get('affiliations', []),
        "profile_data": profile,
        "web_claims": entity_data.get('web_claims', []),
        "source_observations": entity_data.get('source_observations', []),
        "extraction_metadata": entity_data.get('extraction_metadata', {}),
        "migration_metadata": migration_info,
    }

    return ppid, ppid_profile

def main(entity_dir=None, person_dir=None, max_profiles=5, scan_limit=50):
    """Print a migration preview for a handful of entity profiles.

    Looks at no more than ``scan_limit`` ``*.json`` files in ``entity_dir``,
    skips profiles that are not human or have no LinkedIn URL, and prints
    what the PPID migration would produce for the first ``max_profiles``
    remaining ones. Nothing is written; ``person_dir`` is only used to report
    whether the target file already exists.

    Args:
        entity_dir: Directory holding the entity JSON files. Defaults to the
            project's entity directory.
        person_dir: Directory the migrated profiles would be written to.
            Defaults to the project's person directory.
        max_profiles: Number of profiles to show before stopping.
        scan_limit: Maximum number of entity files to look at.
    """
    # Local import keeps this function self-contained.
    from itertools import islice

    if entity_dir is None:
        entity_dir = '/Users/kempersc/apps/glam/data/custodian/person/entity'
    if person_dir is None:
        person_dir = '/Users/kempersc/apps/glam/data/person'
    entity_dir = Path(entity_dir)
    person_dir = Path(person_dir)

    rule = "=" * 70
    print(rule)
    print(f"QUICK TEST - Migration Preview ({max_profiles} profiles)")
    print(rule)

    # Without this check a wrong path would just report zero profiles.
    if not entity_dir.is_dir():
        print(f"ERROR: entity directory not found: {entity_dir}")
        return

    processed = 0
    skipped_non_human = 0
    skipped_no_linkedin = 0

    # islice consumes the glob lazily, so no Path object is built for files
    # beyond the first scan_limit matches.
    for f in islice(entity_dir.glob('*.json'), scan_limit):
        if processed >= max_profiles:
            break

        try:
            # Context manager closes the handle; JSON is read as UTF-8
            # regardless of the platform's default encoding.
            with open(f, encoding='utf-8') as fh:
                data = json.load(fh)

            profile_data = data.get('profile_data', {})
            name = profile_data.get('name') or data.get('name', '')

            # Skip non-human profiles
            if not is_human_profile(name, profile_data):
                skipped_non_human += 1
                continue

            # Check for LinkedIn URL
            linkedin_url = profile_data.get('linkedin_url')
            if not linkedin_url:
                skipped_no_linkedin += 1
                continue

            slug = extract_linkedin_slug(linkedin_url)

            # Transform to PPID format
            ppid, ppid_profile = transform_entity_to_ppid(data, f)
            output_file = person_dir / f"{ppid}.json"

            processed += 1

            print(f"\n{rule}")
            print(f"Profile {processed}/{max_profiles}")
            print(rule)
            print(f"Source file: {f.name}")
            print(f"Name: {name}")
            print(f"LinkedIn slug: {slug}")
            print(f"Generated PPID: {ppid}")
            print(f"Output file: {output_file.name}")
            print(f"Already exists: {output_file.exists()}")

            # Show preserved data
            web_claims = ppid_profile.get('web_claims', [])
            affiliations = ppid_profile.get('affiliations', [])
            print("\nData preserved:")
            print(f"  - Web claims: {len(web_claims)}")
            print(f"  - Affiliations: {len(affiliations)}")
            print(f"  - Source observations: {len(ppid_profile.get('source_observations', []))}")
            print(f"  - Profile data fields: {list(ppid_profile.get('profile_data', {}).keys())}")

            if web_claims:
                print("\n  Sample web claim:")
                claim = web_claims[0]
                xpath = claim.get('xpath')
                print(f"    claim_type: {claim.get('claim_type')}")
                print(f"    claim_value: {str(claim.get('claim_value'))[:60]}...")
                print(f"    xpath: {xpath[:80]}..." if xpath else "    xpath: N/A")
                print(f"    xpath_match_score: {claim.get('xpath_match_score', 'N/A')}")

            if affiliations:
                print("\n  Sample affiliation:")
                aff = affiliations[0]
                print(f"    org: {aff.get('organization_name', 'N/A')}")
                print(f"    title: {aff.get('role_title', 'N/A')}")

        except Exception as e:
            # Deliberately broad: this is a best-effort preview, so one
            # unreadable or oddly shaped file must not abort the run.
            print(f"ERROR processing {f.name}: {e}")

    print(f"\n{rule}")
    print("QUICK TEST SUMMARY")
    print(rule)
    print(f"Profiles shown: {processed}")
    print(f"Skipped (non-human): {skipped_non_human}")
    print(f"Skipped (no LinkedIn): {skipped_no_linkedin}")
    print("\nTo run full migration:")
    print("  python scripts/migrate_entity_to_ppid_v2.py --dry-run")

# Run the preview only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()