glam/scripts/extract_linkedin_profile_exa.py
2025-12-11 22:32:09 +01:00

251 lines
No EOL
9.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Extract LinkedIn profile using Exa crawler and save in proper structured format.
"""
import os
import json
import sys
import re
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional
def call_exa_crawling(url: str, max_characters: int = 50000) -> Optional[Dict[str, Any]]:
    """Call the Exa ``crawling_exa`` tool via the MCP server CLI.

    Args:
        url: Page URL to crawl.
        max_characters: Maximum number of characters of page text to request.

    Returns:
        The parsed JSON payload from the tool, or ``None`` on any failure
        (missing binary, timeout, non-zero exit, or malformed JSON output).
        The annotation is ``Optional`` because every error path returns None.
    """
    try:
        # argv list (shell=False) keeps the URL from being shell-interpreted.
        result = subprocess.run(
            ['exa-mcp-server', 'call', 'crawling_exa',
             '--url', url,
             '--maxCharacters', str(max_characters)],
            capture_output=True,
            text=True,
            timeout=60
        )
    except (subprocess.TimeoutExpired, OSError) as e:
        # OSError covers a missing or non-executable exa-mcp-server binary.
        print(f"Exception calling Exa: {e}")
        return None
    if result.returncode != 0:
        print(f"Error calling Exa: {result.stderr}")
        return None
    try:
        # The tool prints its JSON response on stdout.
        return json.loads(result.stdout)
    except json.JSONDecodeError as e:
        print(f"Exception calling Exa: {e}")
        return None
def extract_linkedin_profile_with_exa(linkedin_url: str, output_file: str, source_file: str = "", staff_id: str = "") -> bool:
    """Crawl a LinkedIn profile with Exa and persist it as structured JSON.

    Args:
        linkedin_url: Profile URL to crawl.
        output_file: Destination path for the structured JSON document.
        source_file: Optional provenance file recorded in the metadata.
        staff_id: Optional staff identifier recorded in the metadata.

    Returns:
        True when the profile was extracted and written, False otherwise.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    # Guard clause: bail out unless the crawl produced at least one result.
    crawl = call_exa_crawling(linkedin_url, 50000)
    results = (crawl or {}).get('results') or []
    if not results:
        print(f"❌ Failed to extract profile from {linkedin_url}")
        return False

    top = results[0]
    resolved_url = top.get('url', linkedin_url)
    parsed = parse_linkedin_content(top.get('text', ''), top.get('title', ''), resolved_url)

    payload = {
        "extraction_metadata": {
            "source_file": source_file or "manual_extraction",
            "staff_id": staff_id or "manual",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": resolved_url,
            "cost_usd": 0,  # Exa contents endpoint is free
            "request_id": top.get('id', 'unknown')
        },
        "profile_data": parsed
    }

    # Create the parent directory if needed, then write the JSON document.
    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding='utf-8')

    print(f"✅ Profile saved to: {output_file}")
    print(f" Name: {parsed.get('name', 'Unknown')}")
    print(f" Headline: {parsed.get('headline', '')[:80]}...")
    return True
def parse_linkedin_content(content: str, title: str, url: str) -> Dict[str, Any]:
    """Parse LinkedIn profile fields out of crawled page text.

    Args:
        content: Raw page text returned by the crawler.
        title: Page title, typically "<Name> | LinkedIn".
        url: Profile URL, stored verbatim in the result.

    Returns:
        Dict with keys: name, linkedin_url, headline, location, connections,
        about, experience, education, skills, languages, profile_image_url.
        Fields that cannot be found stay empty ("" / [] / None).

    NOTE(review): the regex heuristics below assume the crawler emits section
    headers ("About", "Experience", ...) on their own lines — confirm against
    real Exa output if parsing regresses.
    """
    profile: Dict[str, Any] = {
        "name": "",
        "linkedin_url": url,
        "headline": "",
        "location": "",
        "connections": "",
        "about": "",
        "experience": [],
        "education": [],
        "skills": [],
        "languages": [],
        "profile_image_url": None
    }

    # Name: strip the " | LinkedIn" suffix from the page title; if any other
    # pipe remains, keep only the first segment.
    if title:
        name = title.replace(' | LinkedIn', '').strip()
        if name and '|' in name:
            name = name.split('|')[0].strip()
        profile["name"] = name

    # Headline: first non-empty, non-heading line after the first line.
    lines = content.split('\n')
    for i, line in enumerate(lines):
        if line.strip() and not line.startswith('#') and i > 0:
            profile["headline"] = line.strip()
            break

    # Location: first "City, Region, Country" or "City, Country" shaped text.
    location_patterns = [
        r'([A-Za-z\s]+,\s*[A-Za-z\s]+,\s*[A-Za-z\s]+)',  # City, Region, Country
        r'([A-Za-z\s]+,\s*[A-Za-z\s]+)',  # City, Country
    ]
    for pattern in location_patterns:
        match = re.search(pattern, content)
        if match:
            profile["location"] = match.group(1).strip()
            break

    # Connections / followers, e.g. "500 connections", "1,000 followers".
    conn_match = re.search(r'(\d+(?:,\d+)*)\s+connections', content, re.IGNORECASE)
    if conn_match:
        connections = conn_match.group(1)
        follower_match = re.search(r'(\d+(?:,\d+)*)\s+followers', content, re.IGNORECASE)
        if follower_match:
            # BUG FIX: without a separator the two counts ran together,
            # e.g. "500" + "1,000 followers" -> "5001,000 followers".
            connections += f", {follower_match.group(1)} followers"
        profile["connections"] = connections

    # About section: text after an "About" header up to a blank line or the
    # next capitalised line.
    about_match = re.search(r'About\s*\n+(.+?)(?=\n\n|\n[A-Z]|\Z)', content, re.DOTALL | re.IGNORECASE)
    if about_match:
        profile["about"] = about_match.group(1).strip()

    # Experience section: best-effort split into title/company/detail triples.
    exp_section = re.search(r'Experience\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if exp_section:
        exp_content = exp_section.group(1)
        exp_entries = re.findall(r'([A-Z][^-\n][^\n]*)\n\s*([^\n]+)\n([^•\n]*(?:\n[^•\n]*)*?)', exp_content)
        # Loop variable renamed (was `title`, shadowing the parameter).
        for role, company, details in exp_entries:
            exp = {
                "title": role.strip(),
                "company": company.strip(),
                "duration": "",
                "location": "",
                "description": ""
            }
            # Duration like "Jan 2020 - Present" or "Jan 2020 - Dec 2021".
            dur_match = re.search(r'(\w+\s+\d{4})\s*-\s*(Present|\w+\s+\d{4})', details)
            if dur_match:
                exp["duration"] = f"{dur_match.group(1)} - {dur_match.group(2)}"
            loc_match = re.search(r'([A-Za-z\s]+,\s*[A-Za-z\s]+)', details)
            if loc_match:
                exp["location"] = loc_match.group(1).strip()
            # Collapse all whitespace runs in the free-text description.
            exp["description"] = re.sub(r'\s+', ' ', details).strip()
            profile["experience"].append(exp)

    # Education section: school/degree/detail triples plus a year range.
    edu_section = re.search(r'Education\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if edu_section:
        edu_content = edu_section.group(1)
        edu_entries = re.findall(r'([^\n]+)\n\s*([^\n]+)\n([^•\n]*(?:\n[^•\n]*)*?)', edu_content)
        for school, degree, details in edu_entries:
            edu = {
                "school": school.strip(),
                "degree": degree.strip(),
                "years": "",
                "description": ""
            }
            years_match = re.search(r'(\d{4})\s*-\s*(\d{4}|Present)', details)
            if years_match:
                edu["years"] = f"{years_match.group(1)} - {years_match.group(2)}"
            edu["description"] = re.sub(r'\s+', ' ', details).strip()
            profile["education"].append(edu)

    # Skills: comma/bullet/newline separated tokens longer than one character.
    skills_section = re.search(r'Skills\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if skills_section:
        skills = re.split(r'[,•\n]\s*', skills_section.group(1))
        profile["skills"] = [s.strip() for s in skills if s.strip() and len(s.strip()) > 1]

    # Languages: same tokenisation as skills.
    lang_match = re.search(r'Languages\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if lang_match:
        languages = re.split(r'[,•\n]\s*', lang_match.group(1))
        # Loop variable renamed (was `l`, easily confused with `1`).
        profile["languages"] = [lang.strip() for lang in languages if lang.strip() and len(lang.strip()) > 1]

    # Profile image: first LinkedIn CDN image URL found in the text.
    img_match = re.search(r'https://media\.licdn\.com/dms/image/[^\s\)]+', content)
    if img_match:
        profile["profile_image_url"] = img_match.group(0)

    return profile
def main() -> None:
    """CLI entry point: validate argv, run the extraction, exit non-zero on failure."""
    args = sys.argv[1:]
    if len(args) < 2:
        print("Usage: python extract_linkedin_profile_exa.py <linkedin_url> <output_file> [source_file] [staff_id]")
        print("\nExample:")
        print("python extract_linkedin_profile_exa.py https://www.linkedin.com/in/annelien-vos-keen-657b66223 /Users/kempersc/apps/glam/data/custodian/person/entity/annelien-vos-keen-657b66223_20251210T160000Z.json")
        sys.exit(1)

    linkedin_url, output_file = args[0], args[1]
    # Optional provenance arguments default to empty strings.
    source_file = args[2] if len(args) > 2 else ""
    staff_id = args[3] if len(args) > 3 else ""

    if extract_linkedin_profile_with_exa(linkedin_url, output_file, source_file, staff_id):
        print("\n✅ Profile extraction completed successfully!")
    else:
        print("\n❌ Profile extraction failed!")
        sys.exit(1)


if __name__ == "__main__":
    main()