glam/scripts/extract_specific_profiles.py
2025-12-11 22:32:09 +01:00

192 lines
No EOL
7.1 KiB
Python

#!/usr/bin/env python3
"""
Simple script to extract LinkedIn profiles using the working pattern from extract_linkedin_profile_exa.py
"""
import json
import os
import sys
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional
def call_exa_crawling(url: str, max_characters: int = 50000) -> Optional[Dict[str, Any]]:
    """Invoke the ``exa_crawling_exa`` MCP tool for a single URL.

    Args:
        url: The page to crawl.
        max_characters: Cap on how much page text Exa returns.

    Returns:
        The tool's parsed JSON payload, or ``None`` on any failure
        (missing ``mcp`` binary, non-zero exit status, 60s timeout, or
        unparseable output). Failures are reported on stdout.
    """
    # argv list (shell=False) so the URL is never shell-interpreted.
    command = [
        'mcp', 'call', 'exa_crawling_exa',
        '--url', url,
        '--maxCharacters', str(max_characters),
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=60,
        )
    except subprocess.TimeoutExpired as e:
        print(f"Exception calling Exa: {e}")
        return None
    except OSError as e:
        # Typically: the `mcp` executable is not on PATH.
        print(f"Exception calling Exa: {e}")
        return None

    if result.returncode != 0:
        print(f"Error calling Exa: {result.stderr}")
        return None

    try:
        # The tool prints its response as JSON on stdout.
        return json.loads(result.stdout)
    except json.JSONDecodeError as e:
        print(f"Exception calling Exa: {e}")
        return None
def parse_linkedin_content(content: str, title: str, url: str) -> Dict[str, Any]:
    """Parse key sections out of raw crawled LinkedIn profile text.

    Scans the text line by line, tracking the current ``## Section``
    heading and collecting the content beneath it with simple
    heuristics ("<title> at <company>" lines for experience/education).

    Args:
        content: Raw profile text as returned by the Exa crawler.
        title: Page title, used verbatim as the profile name.
        url: Canonical profile URL, stored verbatim.

    Returns:
        A profile dict; fields the heuristics cannot find stay empty
        (headline, location, connections, skills and languages are
        never populated by this parser).
    """
    profile: Dict[str, Any] = {
        "name": title,
        "linkedin_url": url,
        "headline": "",
        "location": "",
        "connections": "",
        "about": "",
        "experience": [],
        "education": [],
        "skills": [],
        "languages": [],
        "profile_image_url": None,
    }

    # Heading prefix -> section key; first matching prefix wins.
    headings = {
        '## About': 'about',
        '## Experience': 'experience',
        '## Education': 'education',
        '## Skills': 'skills',
        '## Languages': 'languages',
    }

    about_parts: List[str] = []
    current_section: Optional[str] = None
    for raw_line in content.split('\n'):
        line = raw_line.strip()

        section = next(
            (key for prefix, key in headings.items() if line.startswith(prefix)),
            None,
        )
        if section is not None:
            current_section = section
            continue

        # Skip blanks, unrecognized headings, and anything before the
        # first recognized section.
        if not line or line.startswith('#') or current_section is None:
            continue

        if current_section == 'about':
            about_parts.append(line)
        elif current_section in ('experience', 'education') and ' at ' in line:
            # Split on the FIRST " at " only; partition keeps the whole
            # remainder, so companies/institutions that themselves
            # contain " at " (e.g. "Museum at the Gate") stay intact —
            # the old split(' at ')[1] silently dropped that tail.
            left, _, right = line.partition(' at ')
            if current_section == 'experience':
                profile['experience'].append({
                    'title': left.strip(),
                    'company': right.strip(),
                    'duration': 'Current',
                })
            else:
                profile['education'].append({
                    'degree': left.strip(),
                    'institution': right.strip(),
                    'duration': 'Unknown',
                })

    profile['about'] = ' '.join(about_parts)
    return profile
def extract_linkedin_profile(linkedin_url: str, output_file: str, source_file: str = "", staff_id: str = "") -> bool:
    """Crawl a single LinkedIn profile with Exa and save structured JSON.

    Wraps the raw crawl (``call_exa_crawling``) and the text parser
    (``parse_linkedin_content``), attaches extraction metadata, and
    writes the result to ``output_file`` (parent directories are
    created on demand).

    Returns:
        True when a profile was extracted and saved, False otherwise.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    response = call_exa_crawling(linkedin_url, 50000)
    if not (response and response.get('results')):
        print(f"❌ Failed to extract profile from {linkedin_url}")
        return False

    # A single-URL crawl yields exactly one hit.
    hit = response['results'][0]
    resolved_url = hit.get('url', linkedin_url)
    profile_data = parse_linkedin_content(
        hit.get('text', ''),
        hit.get('title', 'Unknown'),
        resolved_url,
    )

    structured_data = {
        "extraction_metadata": {
            "source_file": source_file or "manual_extraction",
            "staff_id": staff_id or "manual",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": resolved_url,
            "cost_usd": 0.001,
            "request_id": hit.get('id', 'unknown'),
        },
        "profile_data": profile_data,
    }

    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(
        json.dumps(structured_data, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )

    print(f"✅ Profile saved to: {output_file}")
    print(f" Name: {profile_data.get('name', 'Unknown')}")
    print(f" Headline: {profile_data.get('headline', '')[:80]}...")
    return True
def main() -> None:
    """Extract a fixed batch of LinkedIn profiles and print a summary.

    The profile list, output paths and source files are hard-coded for
    a one-off extraction run; edit ``profiles_to_extract`` to reuse.
    """
    # Local import: only this batch runner needs the inter-request
    # delay. (Previously this import sat INSIDE the loop and re-ran on
    # every iteration.)
    import time

    # Each entry maps 1:1 onto extract_linkedin_profile's keyword args.
    profiles_to_extract = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef-',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens'
        }
    ]

    success_count = 0
    total_cost = 0.0
    for profile in profiles_to_extract:
        if extract_linkedin_profile(**profile):
            success_count += 1
            total_cost += 0.001  # flat per-request Exa cost estimate
        # Small delay to avoid overwhelming Exa
        time.sleep(2)

    print("\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print(f"📁 Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")


if __name__ == "__main__":
    main()