glam/scripts/fetch_missing_confirmed_profiles.py
2026-01-16 12:50:50 +01:00

357 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Fetch LinkedIn profiles for confirmed entity resolution matches that are missing entity files.
This script:
1. Identifies confirmed matches (review_decision == 'match') without entity files
2. Uses Exa API to fetch LinkedIn profile data
3. Creates entity files with proper structure including WCMS identifiers
Usage:
python scripts/fetch_missing_confirmed_profiles.py [--dry-run] [--limit N]
"""
import json
import os
import re
import sys
import time
import argparse
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, List
from urllib.parse import unquote
import httpx
from tqdm import tqdm
# Configuration
# Exa "contents" endpoint used to crawl LinkedIn profile pages.
EXA_API_URL = "https://api.exa.ai/contents"
# Output directory for generated person entity JSON files.
ENTITY_DIR = Path("data/custodian/person/entity")
# Entity-resolution review output; each candidate carries a 'review_decision'.
CANDIDATES_FILE = Path("data/entity_resolution/entity_resolution_candidates.json")
# Directory of WCMS person records, one "<ppid>.json" file per person.
WCMS_DIR = Path("data/person")
RATE_LIMIT_DELAY = 1.0 # seconds between requests
def get_exa_api_key() -> str:
    """Return the Exa API key from the EXA_API_KEY environment variable.

    Raises:
        ValueError: if the variable is unset (or empty).
    """
    api_key = os.environ.get("EXA_API_KEY")
    if api_key:
        return api_key
    raise ValueError("EXA_API_KEY environment variable not set")
# Single-pass transliteration table for common accented Latin letters.
_ACCENT_TRANSLATION = str.maketrans({
    'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
    'á': 'a', 'à': 'a', 'â': 'a', 'ä': 'a',
    'í': 'i', 'ì': 'i', 'î': 'i', 'ï': 'i',
    'ó': 'o', 'ò': 'o', 'ô': 'o', 'ö': 'o',
    'ú': 'u', 'ù': 'u', 'û': 'u', 'ü': 'u',
    'ñ': 'n', 'ç': 'c',
})


def normalize_slug(slug: str) -> str:
    """Normalize a LinkedIn slug for safe use in a filename.

    Percent-decodes the slug, strips invisible characters (zero-width
    space/joiners, BOM), and transliterates common accented Latin letters
    to ASCII.

    Args:
        slug: Raw LinkedIn profile slug, possibly percent-encoded.

    Returns:
        The normalized, ASCII-friendly slug.
    """
    # URL decode percent-encoded characters (UTF-8).
    slug = unquote(slug)
    # Remove problematic invisible characters.
    # NOTE(review): the original had two no-op replace('', '') calls here —
    # almost certainly zero-width characters lost in transit; stripping the
    # usual suspects restores the apparent intent.
    for invisible in ('\u200b', '\u200c', '\u200d', '\ufeff'):
        slug = slug.replace(invisible, '')
    # Replace accented characters with ASCII equivalents in one pass.
    return slug.translate(_ACCENT_TRANSLATION)
def get_missing_profiles() -> List[Dict]:
    """List confirmed entity-resolution matches that lack an entity file.

    Loads the candidates file, keeps candidates whose review_decision is
    'match', and returns descriptors for every match whose LinkedIn slug
    (raw or normalized) has no timestamped JSON file in ENTITY_DIR.
    """
    with open(CANDIDATES_FILE, 'r') as handle:
        payload = json.load(handle)
    # slug -> candidate for every confirmed match that has a slug
    # (on duplicate slugs the last candidate wins, as before).
    slug_to_candidate = {
        cand['linkedin_slug']: cand
        for cand in payload['candidates']
        if cand.get('review_decision') == 'match' and cand.get('linkedin_slug')
    }
    # Entity filenames look like "<slug>_20260116T125050Z.json";
    # strip everything from the "_202..." timestamp onward.
    existing_slugs = {
        path.stem.rsplit('_202', 1)[0]
        for path in ENTITY_DIR.glob('*.json')
        if '_202' in path.stem
    }
    missing = []
    for slug, candidate in slug_to_candidate.items():
        normalized = normalize_slug(slug)
        if slug in existing_slugs or normalized in existing_slugs:
            continue
        missing.append({
            'slug': slug,
            'normalized_slug': normalized,
            'wcms_ppid': candidate.get('wcms_ppid'),
            'wcms_name': candidate.get('wcms_name'),
            'wcms_email': candidate.get('wcms_email'),
            'linkedin_url': f'https://www.linkedin.com/in/{slug}',
        })
    return missing
def fetch_profile_exa(url: str, api_key: str, client: httpx.Client) -> Optional[Dict]:
    """Fetch a LinkedIn profile via the Exa contents API.

    Retries once after a 429 rate limit. (Previously the code slept 30 s
    on a 429 and then returned None anyway, so the wait was wasted.)

    Args:
        url: Full LinkedIn profile URL.
        api_key: Exa API bearer token.
        client: Shared httpx client to reuse connections.

    Returns:
        The first Exa result dict, or None on any failure or empty result.
    """
    for attempt in range(2):
        try:
            response = client.post(
                EXA_API_URL,
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "ids": [url],
                    "text": True,
                    "livecrawl": "preferred"
                },
                timeout=60.0
            )
            if response.status_code == 200:
                data = response.json()
                if data.get('results') and len(data['results']) > 0:
                    return data['results'][0]
                return None  # 200 but empty result set
            print(f"\nError fetching {url}: HTTP {response.status_code}")
            if response.status_code == 429 and attempt == 0:
                print("Rate limited - waiting 30 seconds...")
                time.sleep(30)
                continue  # retry once after backing off
            return None
        except Exception as e:
            # Best-effort fetch: log and give up on network/JSON errors.
            print(f"\nException fetching {url}: {e}")
            return None
    return None
def parse_profile_text(text: str) -> Dict:
    """Heuristically extract structured fields from raw profile page text.

    Args:
        text: Crawled page text (may be empty or None-ish).

    Returns:
        Dict with 'name', 'headline', 'location', 'about' (str or None) and
        'experience', 'education', 'skills', 'languages' (lists; currently
        always left empty by these heuristics).
    """
    parsed = {
        'name': None,
        'headline': None,
        'location': None,
        'about': None,
        'experience': [],
        'education': [],
        'skills': [],
        'languages': [],
    }
    if not text:
        return parsed
    lines = text.strip().split('\n')
    # Name: first short, non-URL, non-boilerplate line among the first five.
    boilerplate = ('linkedin', 'sign in', 'join')
    for raw in lines[:5]:
        candidate = raw.strip()
        if not candidate or candidate.startswith('http') or len(candidate) >= 100:
            continue
        if any(token in candidate.lower() for token in boilerplate):
            continue
        parsed['name'] = candidate
        break
    # Headline: the line right after the first line that mentions the name.
    if parsed['name']:
        for idx, raw in enumerate(lines):
            if parsed['name'] in raw:
                if idx + 1 < len(lines):
                    follower = lines[idx + 1].strip()
                    if follower and len(follower) < 200:
                        parsed['headline'] = follower
                break
    # Location: first short line mentioning a known Dutch place name.
    places = ('Netherlands', 'Amsterdam', 'Rotterdam', 'Utrecht', 'Den Haag', 'The Hague')
    for raw in lines:
        if any(place in raw for place in places) and len(raw) < 100:
            parsed['location'] = raw.strip()
            break
    return parsed
def get_wcms_identifiers(wcms_ppid: str) -> Optional[Dict]:
    """Load WCMS identifiers for a person from their WCMS JSON record.

    Looks for the identifier fields at the top level of the record first,
    then falls back to a nested 'wcms_identifiers' mapping.

    Args:
        wcms_ppid: WCMS person id; resolved to WCMS_DIR/<ppid>.json.

    Returns:
        Dict of the identifiers found, or None when the file is missing,
        unreadable, or contains none of the expected keys.
    """
    record_path = WCMS_DIR / f"{wcms_ppid}.json"
    if not record_path.exists():
        return None
    keys = ('user_id', 'username', 'username_url', 'abs_id', 'crm_id')
    try:
        with open(record_path, 'r') as handle:
            record = json.load(handle)
        # Top-level fields take precedence; only truthy values are kept.
        identifiers = {k: record[k] for k in keys if record.get(k)}
        if not identifiers:
            # Fall back to the nested structure some records use.
            nested = record.get('wcms_identifiers', {})
            identifiers = {k: nested[k] for k in keys if nested.get(k)}
        return identifiers or None
    except Exception as exc:
        # Best-effort load: log and treat the record as absent.
        print(f"Error loading WCMS file {wcms_ppid}: {exc}")
        return None
def create_entity_file(profile_info: Dict, exa_result: Optional[Dict], wcms_ids: Optional[Dict]) -> Dict:
    """Assemble the entity-file dict for one confirmed match.

    Args:
        profile_info: Descriptor from get_missing_profiles (slug, URLs,
            WCMS name/ppid).
        exa_result: Raw Exa result for the profile, or None if the fetch
            failed (a WCMS-only record is still produced).
        wcms_ids: WCMS identifiers dict, or None if unavailable.

    Returns:
        The entity dict, ready to be serialized to JSON.
    """
    timestamp_iso = datetime.now(timezone.utc).isoformat()
    # Profile skeleton; WCMS name is the fallback when Exa yields nothing.
    profile_data = {
        'name': profile_info.get('wcms_name'),
        'linkedin_url': profile_info['linkedin_url'],
        'headline': None,
        'location': None,
        'connections': None,
        'about': None,
        'experience': [],
        'education': [],
        'skills': [],
        'languages': [],
        'profile_image_url': None,
    }
    # Overlay whatever the crawl heuristics managed to parse out.
    if exa_result:
        parsed = parse_profile_text(exa_result.get('text', ''))
        for field in ('name', 'headline', 'location'):
            if parsed.get(field):
                profile_data[field] = parsed[field]
    entity = {
        'person_id': profile_info['normalized_slug'],
        'extraction_metadata': {
            'extraction_agent': 'fetch_missing_confirmed_profiles.py',
            'extraction_date': timestamp_iso,
            'extraction_source': 'Exa API crawl + entity resolution',
            'schema_version': '1.0.0',
            'notes': f'Created for confirmed entity resolution match. WCMS: {profile_info.get("wcms_ppid")}'
        },
        'profile_data': profile_data,
        'heritage_relevance': {
            'is_heritage_relevant': True,
            'heritage_types': ['O'],  # Default to Official/unknown
            'rationale': 'Confirmed match via entity resolution review'
        },
        'affiliations': [],
        'web_claims': [
            {
                'claim_type': 'linkedin_url',
                'claim_value': profile_info['linkedin_url'],
                'source_url': 'entity_resolution_review',
                'retrieved_on': timestamp_iso,
                'statement_created_at': timestamp_iso,
                'source_archived_at': timestamp_iso,
                'retrieval_agent': 'fetch_missing_confirmed_profiles.py'
            }
        ],
    }
    # WCMS identifiers are attached only when we actually found some.
    if wcms_ids:
        entity['wcms_identifiers'] = wcms_ids
    return entity
def main() -> None:
    """CLI entry point.

    Finds confirmed matches lacking entity files, fetches each profile
    through Exa, and writes a timestamped entity JSON file per profile.
    Supports --dry-run (list only) and --limit N (cap the batch size).
    """
    parser = argparse.ArgumentParser(description='Fetch missing LinkedIn profiles for confirmed matches')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be fetched without fetching')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of profiles to fetch')
    args = parser.parse_args()
    # Get missing profiles (confirmed matches without entity files).
    missing = get_missing_profiles()
    print(f"Found {len(missing)} confirmed matches without entity files")
    # NOTE(review): --limit 0 is falsy and is treated as "no limit".
    if args.limit:
        missing = missing[:args.limit]
        print(f"Limited to {len(missing)} profiles")
    if args.dry_run:
        print("\nDry run - profiles that would be fetched:")
        for p in missing:
            print(f" {p['slug']} -> {p['wcms_ppid']}")
        return
    # Ensure output directory exists
    ENTITY_DIR.mkdir(parents=True, exist_ok=True)
    # Get API key; abort early if it is not configured.
    try:
        api_key = get_exa_api_key()
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)
    # Fetch profiles, reusing one HTTP client for the whole batch.
    success_count = 0
    error_count = 0
    with httpx.Client() as client:
        for profile in tqdm(missing, desc="Fetching profiles"):
            slug = profile['normalized_slug']
            url = profile['linkedin_url']
            # Fetch from Exa (may return None on failure).
            exa_result = fetch_profile_exa(url, api_key, client)
            # Get WCMS identifiers, if a ppid is known.
            wcms_ids = None
            if profile.get('wcms_ppid'):
                wcms_ids = get_wcms_identifiers(profile['wcms_ppid'])
            # Create entity file. An entity is written even when the Exa
            # fetch failed (WCMS-only record) — intentional, see summary.
            entity = create_entity_file(profile, exa_result, wcms_ids)
            # Save file with a UTC timestamp suffix (matches the
            # "_202..." convention get_missing_profiles parses).
            timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
            filename = f"{slug}_{timestamp}.json"
            filepath = ENTITY_DIR / filename
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(entity, f, indent=2, ensure_ascii=False)
            if exa_result:
                success_count += 1
            else:
                error_count += 1
            # Rate limiting between Exa requests.
            time.sleep(RATE_LIMIT_DELAY)
    print(f"\nDone! Created {success_count + error_count} entity files")
    print(f" With Exa data: {success_count}")
    print(f" Without Exa data (WCMS only): {error_count}")
# Script entry guard: run only when executed directly, not on import.
if __name__ == '__main__':
    main()