glam/scripts/extract_comprehensive_profiles.py
2025-12-11 22:32:09 +01:00

230 lines
No EOL
8.5 KiB
Python

#!/usr/bin/env python3
"""
Extract comprehensive LinkedIn profiles using Exa LinkedIn search service.
"""
import json
import os
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any
def call_exa_linkedin_search(name: str, linkedin_url: str = "") -> Optional[dict]:
    """Search for a LinkedIn profile via the exa-mcp-server tool.

    Args:
        name: Person's full name, used as the default search query.
        linkedin_url: Optional profile URL; when provided, a site-restricted
            query built from the URL's trailing slug is used instead.

    Returns:
        The tool's result list on success, or None on any failure
        (non-zero exit, unparseable output, missing binary, timeout).
    """
    try:
        # Build search query - try name first, then URL if needed
        query = name if not linkedin_url else f"site:linkedin.com/in/ {linkedin_url.split('/')[-1]}"
        request = json.dumps({
            "jsonrpc": "2.0",
            "method": "tools/call",
            "params": {
                "name": "linkedin_search_exa",
                "arguments": {
                    "query": query,
                    "searchType": "profiles",
                    "numResults": 5
                }
            },
            "id": 1
        })
        # Bug fix: the original ran ['echo', payload, '|', 'exa-mcp-server']
        # without a shell, so the '|' was a literal argument and the server
        # was never invoked. Feed the JSON-RPC request to the server's stdin
        # directly instead (shell=False also avoids any quoting/injection risk).
        result = subprocess.run(
            ['exa-mcp-server'],
            input=request,
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode != 0:
            print(f"Error calling Exa LinkedIn search: {result.stderr}")
            return None
        # Parse JSON-RPC response
        response = json.loads(result.stdout)
        if 'result' in response and 'results' in response['result']:
            return response['result']['results']
        else:
            print(f"No results from LinkedIn search: {response}")
            return None
    except Exception as e:
        print(f"Exception calling Exa LinkedIn search: {e}")
        return None
def call_exa_crawling(linkedin_url: str) -> Optional[dict]:
    """Fetch full page content for a URL via the exa-mcp-server crawling tool.

    Args:
        linkedin_url: The profile URL to crawl.

    Returns:
        The first crawl result dict on success, or None on any failure.
        (Annotation corrected from `dict` to `Optional[dict]`: every error
        path returns None.)
    """
    try:
        request = json.dumps({
            "jsonrpc": "2.0",
            "method": "tools/call",
            "params": {
                "name": "crawling_exa",
                "arguments": {
                    "url": linkedin_url,
                    "maxCharacters": 50000
                }
            },
            "id": 1
        })
        # Bug fix: the original ran ['echo', payload, '|', 'exa-mcp-server']
        # without a shell, so the pipe was never interpreted and the server
        # never ran. Send the JSON-RPC request on the server's stdin instead.
        result = subprocess.run(
            ['exa-mcp-server'],
            input=request,
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode != 0:
            print(f"Error calling Exa crawling: {result.stderr}")
            return None
        # Parse JSON-RPC response
        response = json.loads(result.stdout)
        if 'result' in response and 'results' in response['result']:
            return response['result']['results'][0]
        else:
            print(f"No results from crawling: {response}")
            return None
    except Exception as e:
        print(f"Exception calling Exa crawling: {e}")
        return None
def _truncate(text: str, limit: int = 2000) -> str:
    """Cut *text* to *limit* characters with a trailing ellipsis, else return it unchanged."""
    return text[:limit] + "..." if len(text) > limit else text


def extract_comprehensive_profile(name: str, linkedin_url: str, output_file: str, source_file: str, staff_id: str) -> bool:
    """Extract a LinkedIn profile via Exa crawl (with search fallback) and save it as JSON.

    Args:
        name: Person's display name.
        linkedin_url: Profile URL to crawl / search for.
        output_file: Destination path for the structured JSON output.
        source_file: Provenance: the staff file this person came from.
        staff_id: Provenance: the person's id within the staff file.

    Returns:
        True if a profile was obtained and written to disk, False otherwise.
    """
    print(f"Extracting comprehensive LinkedIn profile for: {name}")
    print(f" URL: {linkedin_url}")
    # First try to get full profile content via crawling
    profile_data = None
    # Try crawling first (most comprehensive)
    crawl_result = call_exa_crawling(linkedin_url)
    if crawl_result:
        print(f" ✅ Successfully crawled profile content")
        profile_data = {
            "name": name,
            "linkedin_url": linkedin_url,
            "headline": crawl_result.get("title", ""),
            "location": "",
            "connections": "",
            "about": _truncate(crawl_result.get("text", "")),
            "experience": [],
            "education": [],
            "skills": [],
            "languages": [],
            "profile_image_url": None
        }
    else:
        print(f" ⚠️ Crawling failed, trying LinkedIn search...")
        # Fallback to LinkedIn search
        search_results = call_exa_linkedin_search(name, linkedin_url)
        if search_results and len(search_results) > 0:
            # Find the best match
            best_match = search_results[0]  # Take first result
            print(f" ✅ Found profile via search")
            profile_data = {
                "name": name,
                "linkedin_url": linkedin_url,
                "headline": best_match.get("title", ""),
                "location": best_match.get("metadata", {}).get("location", ""),
                "connections": "",
                "about": _truncate(best_match.get("text", "")),
                "experience": [],
                "education": [],
                "skills": [],
                "languages": [],
                "profile_image_url": None
            }
        else:
            print(f" ❌ Both crawling and search failed")
            return False
    # Create structured output
    structured_data = {
        "extraction_metadata": {
            "source_file": source_file,
            "staff_id": staff_id,
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_linkedin_search_and_crawl",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": linkedin_url,
            "cost_usd": 0.002,  # Two API calls
            # UTC, for consistency with extraction_date (was naive local time)
            "request_id": f"search_crawl_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}"
        },
        "profile_data": profile_data
    }
    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Save to file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(structured_data, f, indent=2, ensure_ascii=False)
    print(f" ✅ Profile saved to: {output_path}")
    print(f" 📊 Name: {profile_data.get('name', 'Unknown')}")
    print(f" 📝 Headline: {profile_data.get('headline', '')[:100]}...")
    return True
def main():
    """Extract the hard-coded list of Academiehuis Grote Kerk Zwolle staff profiles.

    Iterates the profile specs, runs the crawl/search extraction for each,
    and prints a summary of successes and estimated API cost.
    """
    # Define specific profiles to extract from Academiehuis Grote Kerk Zwolle
    profiles = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'name': 'Anja van Hoorn',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef',
            'name': 'Inez van Kleef',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'name': 'Marga Edens',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens'
        }
    ]
    success_count = 0
    total_cost = 0.0
    for profile in profiles:
        if extract_comprehensive_profile(**profile):
            success_count += 1
            total_cost += 0.002
        # Delay between requests to respect rate limits.
        # (`import time` hoisted to the module import block; the original
        # re-imported it on every loop iteration.)
        time.sleep(3)
    print(f"\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print(f"📁 Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")


if __name__ == "__main__":
    main()