glam/scripts/extract_profiles_working.py

#!/usr/bin/env python3
"""
Extract LinkedIn profiles using the working pattern from extract_linkedin_profile_exa.py
"""

import json
import os
import sys
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any

def extract_linkedin_profile_with_exa(linkedin_url: str, output_file: str, source_file: str = "", staff_id: str = "") -> bool:
    """Extract LinkedIn profile using Exa crawler and save in structured format."""

    print(f"Extracting LinkedIn profile: {linkedin_url}")

    # Use Exa crawler to get profile content
    cmd = [
        'mcp', 'call', 'exa_crawling_exa',
        '--url', linkedin_url,
        '--maxCharacters', '50000'
    ]

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=60
        )

        if result.returncode != 0:
            print(f"❌ Failed to extract profile from {linkedin_url}")
            print(f"Error: {result.stderr}")
            return False

        # Parse JSON output
        try:
            output = json.loads(result.stdout)
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON output: {e}")
            return False

        if not output or 'results' not in output or not output['results']:
            print(f"❌ No results returned from Exa")
            return False

        # Get first (and only) result
        result_data = output['results'][0]
        raw_content = result_data.get('text', '')
        title = result_data.get('title', '')
        url = result_data.get('url', linkedin_url)

        # Create minimal structured data
        profile_data = {
            "name": title,
            "linkedin_url": url,
            "headline": "",
            "location": "",
            "connections": "",
            "about": raw_content[:500] + "..." if len(raw_content) > 500 else raw_content,
            "experience": [],
            "education": [],
            "skills": [],
            "languages": [],
            "profile_image_url": None
        }

        # Create structured output
        structured_data = {
            "extraction_metadata": {
                "source_file": source_file or "manual_extraction",
                "staff_id": staff_id or "manual",
                "extraction_date": datetime.now(timezone.utc).isoformat(),
                "extraction_method": "exa_crawling_exa",
                "extraction_agent": "glm-4.6",
                "linkedin_url": url,
                "cost_usd": 0.001,
                "request_id": result_data.get('id', 'unknown')
            },
            "profile_data": profile_data
        }

        # Ensure output directory exists
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Save to file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(structured_data, f, indent=2, ensure_ascii=False)

        print(f"✅ Profile saved to: {output_file}")
        print(f"   Name: {profile_data.get('name', 'Unknown')}")
        return True

    except Exception as e:
        print(f"Exception during extraction: {e}")
        return False

def main():
    """Main function to extract specific LinkedIn profiles."""

    # Define specific profiles to extract
    profiles_to_extract = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens'
        }
    ]

    success_count = 0
    total_cost = 0.0

    for profile in profiles_to_extract:
        if extract_linkedin_profile_with_exa(**profile):
            success_count += 1
            total_cost += 0.001
        # Small delay to avoid overwhelming Exa
        import time
        time.sleep(2)

    print(f"\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print(f"📁 Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")

if __name__ == "__main__":
    main()