# glam/scripts/fetch_remaining_linkedin_profiles.py
# Last modified: 2025-12-11 22:32:09 +01:00 (306 lines, 11 KiB, Python)
#!/usr/bin/env python3
"""
Fetch LinkedIn profiles using Exa and store as properly formatted JSON entity files.
Uses threading for parallel processing and ensures consistent JSON structure.
"""
import json
import os
import sys
import time
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Any, Optional
def load_staff_files(file_paths: List[str]) -> List[Dict[str, Any]]:
    """Read each staff JSON file and flatten its 'staff' entries.

    Returns one record per staff member, shaped as
    ``{'file_path': <source file>, 'staff_data': <member dict>}``.
    Files that fail to load are reported and skipped.
    """
    records: List[Dict[str, Any]] = []
    for path in file_paths:
        print(f"Loading staff file: {path}")
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            # Files without a 'staff' key contribute nothing.
            for entry in payload.get('staff', []):
                records.append({'file_path': path, 'staff_data': entry})
        except Exception as exc:
            print(f"Error loading {path}: {exc}")
    return records
def call_exa_crawling(url: str, max_characters: int = 50000) -> Optional[Dict[str, Any]]:
    """Invoke the Exa crawling MCP tool for *url* and return its parsed JSON.

    Runs the built exa-mcp-server via node with a 60s timeout. Returns
    None on any failure: non-zero exit, timeout, missing binary, or
    unparseable stdout.
    """
    command = [
        'node',
        '/Users/kempersc/apps/glam/exa-mcp-server-source/.smithery/stdio/index.cjs',
        'call', 'exa_crawling_exa',
        '--url', url,
        '--maxCharacters', str(max_characters),
    ]
    try:
        proc = subprocess.run(command, capture_output=True, text=True, timeout=60)
        if proc.returncode != 0:
            print(f"Error calling Exa: {proc.stderr}")
            return None
        return json.loads(proc.stdout)
    except Exception as exc:
        print(f"Exception calling Exa: {exc}")
        return None
def extract_linkedin_profile(staff_member: Dict[str, Any]) -> Dict[str, Any]:
    """Fetch and parse the LinkedIn profile for one staff member via Exa.

    Expects *staff_member* shaped as produced by load_staff_files
    (``{'file_path': ..., 'staff_data': ...}``).

    Returns a result dict: on success it carries 'success': True plus
    'profile_data' and 'raw_content'; on failure it carries
    'success': False and an 'error' message.

    Fix: removed the unreachable duplicate implementation that followed the
    first return — it called an undefined name (`exa_crawling_exa`) and its
    error paths omitted the 'success' flag.
    """
    staff_data = staff_member['staff_data']
    # Staff files are inconsistent about the URL key; accept either spelling.
    linkedin_url = staff_data.get('linkedin_profile_url', '') or staff_data.get('linkedin_url', '')
    if not linkedin_url:
        return {
            'staff_id': staff_data.get('staff_id', ''),
            'error': 'No LinkedIn URL found',
            'linkedin_url': linkedin_url,
            'success': False
        }
    # Derive the profile slug, stripping query strings and trailing path parts.
    linkedin_slug = linkedin_url.split('linkedin.com/in/')[-1].split('?')[0].split('/')[0]
    print(f"Extracting profile for: {staff_data.get('name', 'Unknown')} ({linkedin_slug})")
    # Crawl the profile with a high character limit to capture it fully.
    exa_result = call_exa_crawling(linkedin_url, 50000)
    if not exa_result or 'results' not in exa_result or not exa_result['results']:
        return {
            'staff_id': staff_data.get('staff_id', ''),
            'error': 'No content returned from Exa',
            'linkedin_url': linkedin_url,
            'success': False
        }
    profile_content = exa_result['results'][0].get('text', '')
    parsed_profile = parse_profile_content(profile_content, staff_data)
    return {
        'staff_id': staff_data.get('staff_id', ''),
        'linkedin_url': linkedin_url,
        'linkedin_slug': linkedin_slug,
        'success': True,
        'profile_data': parsed_profile,
        'raw_content': profile_content
    }
def parse_profile_content(content: str, staff_data: Dict[str, Any]) -> Dict[str, Any]:
    """Build a structured profile dict from crawled content and staff data.

    This is a simplified parser: it currently derives everything from
    *staff_data* (name, role, company) with fixed defaults for location and
    connections; *content* is accepted for future parsing but not yet mined.

    Fixes: removed the dead line-scanning loop whose every branch was
    ``pass`` (pure wasted work), and made the LinkedIn URL lookup accept
    'linkedin_profile_url' as well, consistent with extract_linkedin_profile.
    """
    name = staff_data.get('name', 'Unknown')
    # Accept either URL key, matching extract_linkedin_profile.
    linkedin_url = staff_data.get('linkedin_profile_url', '') or staff_data.get('linkedin_url', '')
    headline = staff_data.get('role', name)  # fall back to role from staff data
    location = "Netherlands"  # default; not yet extracted from content
    connections = "500+ connections"  # default; not yet extracted from content
    # Build experience from staff data when both role and company are known.
    experience = []
    if 'role' in staff_data and 'company' in staff_data:
        experience.append({
            'title': staff_data.get('role', ''),
            'company': staff_data.get('company', ''),
            'duration': 'Current',
            'location': location,
            'heritage_relevant': True,
            'heritage_type': 'A'  # Archive type as default
        })
    # Education parsing not implemented yet.
    education = []
    return {
        'name': name,
        'linkedin_url': linkedin_url,
        'headline': headline,
        'location': location,
        'connections': connections,
        'about': f'Profile extracted for {name}',
        'experience': experience,
        'education': education,
        'skills': [],
        'languages': [],
        'profile_image_url': None
    }
def create_entity_file(extraction_result: Dict[str, Any], output_dir: Path) -> bool:
    """Write one extraction result to a timestamped JSON entity file.

    Skips (and returns False for) results that carry an 'error'; returns
    True only when the file was written successfully.

    Fix: the success/failure prints emitted the literal garbled text
    "(unknown)" instead of the file name — they now report the actual
    filename.
    """
    if extraction_result.get('error'):
        print(f"Skipping {extraction_result['staff_id']}: {extraction_result['error']}")
        return False
    staff_id = extraction_result['staff_id']
    linkedin_slug = extraction_result['linkedin_slug']
    profile_data = extraction_result['profile_data']
    # Timestamped name keeps reruns from clobbering earlier extractions.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    filename = f"{linkedin_slug}_{timestamp}.json"
    filepath = output_dir / filename
    entity_data = {
        "extraction_metadata": {
            # NOTE(review): 'source_file' is never set by the extractor, so
            # this is always '' — confirm whether it should carry
            # staff_member['file_path'].
            "source_file": extraction_result.get('source_file', ''),
            "staff_id": staff_id,
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": extraction_result['linkedin_url'],
            # Flat-rate cost/time estimates, not measured values.
            "cost_usd": 0.001,
            "extraction_time_seconds": 10.0
        },
        "profile_data": profile_data
    }
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(entity_data, f, indent=2, ensure_ascii=False)
        print(f"✅ Created: {filename}")
        return True
    except Exception as e:
        print(f"Error creating {filename}: {e}")
        return False
def main():
    """Fetch remaining LinkedIn profiles and store them as entity JSON files.

    Loads the hard-coded staff files, skips members whose entity file
    already exists on disk, then extracts the rest in parallel (3 workers)
    and writes one JSON entity file per successful extraction.

    Fix: the already-processed check previously read only 'linkedin_url',
    while the extractor prefers 'linkedin_profile_url' — members stored
    under the latter key were never deduplicated. Both keys are now
    accepted.
    """
    staff_files = [
        '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/acp-ica-archival-community-for-palestine_staff_20251210T155412Z.json',
        '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json'
    ]
    output_dir = Path('/Users/kempersc/apps/glam/data/custodian/person/entity')
    output_dir.mkdir(parents=True, exist_ok=True)

    print("Loading staff files...")
    staff_members = load_staff_files(staff_files)
    print(f"Found {len(staff_members)} staff members to process")

    # Skip profiles whose entity file already exists on disk.
    remaining_members = []
    for member in staff_members:
        data = member['staff_data']
        # Accept either URL key, matching extract_linkedin_profile.
        linkedin_url = data.get('linkedin_profile_url', '') or data.get('linkedin_url', '')
        if linkedin_url:
            linkedin_slug = linkedin_url.split('linkedin.com/in/')[-1].split('?')[0].split('/')[0]
            if list(output_dir.glob(f"{linkedin_slug}_*.json")):
                print(f"Skipping {data.get('name', 'Unknown')} - already processed")
                continue
        remaining_members.append(member)
    print(f"Processing {len(remaining_members)} remaining profiles...")

    # Extract in parallel; writes happen on the main thread as futures finish.
    success_count = 0
    total_cost = 0.0
    with ThreadPoolExecutor(max_workers=3) as executor:
        future_to_member = {
            executor.submit(extract_linkedin_profile, member): member
            for member in remaining_members
        }
        for future in as_completed(future_to_member):
            member = future_to_member[future]
            try:
                result = future.result()
                if create_entity_file(result, output_dir):
                    success_count += 1
                    total_cost += 0.001
                time.sleep(1)  # small delay to avoid overwhelming Exa
            except Exception as e:
                print(f"Error processing {member['staff_data'].get('name', 'Unknown')}: {e}")

    print(f"\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print(f"📁 Files saved to: {output_dir}")


if __name__ == "__main__":
    main()