# glam/scripts/extract_profiles_simple.py
# Exported 2025-12-11 22:32:09 +01:00 — 193 lines, no EOL, 7.1 KiB, Python
#!/usr/bin/env python3
"""
Simple script to extract LinkedIn profiles using the working pattern.
"""
import json
import os
import sys
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any
def call_exa_crawling(
    url: str,
    max_characters: int = 50000,
    mcp_path: str = '/Users/kempersc/apps/glam/exa-mcp-server-source/.smithery/stdio/index.cjs',
):
    """Call the Exa crawling tool through the local MCP stdio server.

    Args:
        url: Page to crawl (a LinkedIn profile URL in this script).
        max_characters: Cap on the amount of page text Exa returns.
        mcp_path: Path to the MCP server entry script run via ``node``
            (parameterized so the script is not tied to one machine).

    Returns:
        The parsed JSON response as a dict, or ``None`` on any failure
        (tool error, timeout, or unparseable output). Errors are printed,
        never raised, so batch callers can continue.
    """
    try:
        result = subprocess.run(
            ['node', mcp_path, 'call', 'exa_crawling_exa',
             '--url', url,
             '--maxCharacters', str(max_characters)],
            capture_output=True,
            text=True,
            timeout=60,  # hard cap per page; Exa can stall on slow profiles
        )
    except subprocess.TimeoutExpired:
        print(f"Timeout calling Exa for {url}")
        return None
    except Exception as e:
        # Covers a missing `node` binary and other launch failures.
        print(f"Exception calling Exa: {e}")
        return None

    if result.returncode != 0:
        print(f"Error calling Exa: {result.stderr}")
        return None

    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as e:
        # Distinguish "tool ran but emitted garbage" from launch failures.
        print(f"Exa returned non-JSON output: {e}")
        return None
def parse_linkedin_content(content: str, title: str, url: str) -> Dict[str, Any]:
    """Parse LinkedIn profile fields out of raw crawled markdown-ish text.

    The crawler output uses ``## Section`` headings; this walks the text
    line by line, tracking the current section, and extracts About,
    Experience, Education, Skills and Languages content.

    Args:
        content: Raw page text returned by the Exa crawler.
        title: Page title, used as the person's name.
        url: Canonical profile URL.

    Returns:
        A profile dict. ``headline``/``location``/``connections`` are
        initialized but not extracted here (the raw text has no reliable
        markers for them — left for a smarter parser).
    """
    profile: Dict[str, Any] = {
        "name": title,
        "linkedin_url": url,
        "headline": "",
        "location": "",
        "connections": "",
        "about": "",
        "experience": [],
        "education": [],
        "skills": [],
        "languages": [],
        "profile_image_url": None
    }

    current_section = None
    for raw_line in content.split('\n'):
        line = raw_line.strip()
        # Section headers switch parsing mode.
        if line.startswith('## About'):
            current_section = 'about'
        elif line.startswith('## Experience'):
            current_section = 'experience'
        elif line.startswith('## Education'):
            current_section = 'education'
        elif line.startswith('## Skills'):
            current_section = 'skills'
        elif line.startswith('## Languages'):
            current_section = 'languages'
        elif line and not line.startswith('#') and current_section:
            if current_section == 'about':
                profile['about'] += line + ' '
            elif current_section == 'experience' and ' at ' in line:
                # "Title at Company" — naive split; multi-"at" lines keep
                # only the segment right after the first " at ".
                parts = line.split(' at ')
                profile['experience'].append({
                    'title': parts[0].strip(),
                    'company': parts[1].strip(),
                    'duration': 'Current'
                })
            elif current_section == 'education' and ' at ' in line:
                parts = line.split(' at ')
                profile['education'].append({
                    'degree': parts[0].strip(),
                    'institution': parts[1].strip(),
                    'duration': 'Unknown'
                })
            elif current_section == 'skills':
                # Previously recognized but dropped — one skill per line.
                profile['skills'].append(line)
            elif current_section == 'languages':
                profile['languages'].append(line)

    # About was accumulated with trailing spaces; trim once at the end.
    profile['about'] = profile['about'].strip()
    return profile
def extract_linkedin_profile(linkedin_url: str, output_file: str, source_file: str = "", staff_id: str = ""):
    """Crawl one LinkedIn profile via Exa and write it as structured JSON.

    Args:
        linkedin_url: Profile URL to crawl.
        output_file: Destination JSON path (parent dirs created as needed).
        source_file: Optional provenance file recorded in the metadata.
        staff_id: Optional staff identifier recorded in the metadata.

    Returns:
        True when the profile was crawled and written, False otherwise.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    # Guard clause: any empty/missing result set means we bail out early.
    exa_result = call_exa_crawling(linkedin_url, 50000)
    if not exa_result or not exa_result.get('results'):
        print(f"❌ Failed to extract profile from {linkedin_url}")
        return False

    first_hit = exa_result['results'][0]
    page_text = first_hit.get('text', '')
    page_title = first_hit.get('title', 'Unknown')
    resolved_url = first_hit.get('url', linkedin_url)

    profile_data = parse_linkedin_content(page_text, page_title, resolved_url)

    payload = {
        "extraction_metadata": {
            "source_file": source_file or "manual_extraction",
            "staff_id": staff_id or "manual",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": resolved_url,
            "cost_usd": 0.001,
            "request_id": first_hit.get('id', 'unknown')
        },
        "profile_data": profile_data
    }

    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )

    print(f"✅ Profile saved to: {output_file}")
    print(f" Name: {profile_data.get('name', 'Unknown')}")
    print(f" Headline: {profile_data.get('headline', '')[:80]}...")
    return True
def main():
    """Extract a fixed batch of LinkedIn profiles and print a summary.

    The profile list (URLs, output paths, provenance) is hard-coded for a
    one-off extraction run; counts and a nominal cost are reported at the end.
    """
    import time  # hoisted: previously re-imported on every loop iteration

    # Define specific profiles to extract
    profiles_to_extract = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens'
        }
    ]

    success_count = 0
    total_cost = 0.0
    for profile in profiles_to_extract:
        if extract_linkedin_profile(**profile):
            success_count += 1
            total_cost += 0.001  # nominal per-request Exa cost
        # Small delay to avoid overwhelming Exa
        # NOTE(review): flattened original is ambiguous on whether this ran
        # only after successes; delaying after every attempt is the safer
        # rate-limit behavior — confirm against run logs.
        time.sleep(2)

    print("\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print("📁 Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")
# Run the batch extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()