# glam/scripts/extract_direct.py

#!/usr/bin/env python3
"""
Direct extraction of LinkedIn profiles using subprocess pattern from working scripts.
"""

import json
import os
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence

# Command that invokes the Exa crawling tool through the local MCP server build.
_DEFAULT_EXA_COMMAND = (
    "node",
    "/Users/kempersc/apps/glam/exa-mcp-server-source/.smithery/stdio/index.cjs",
    "call",
    "exa_crawling_exa",
)

# Maximum number of characters of crawled text kept in the "about" field.
_ABOUT_MAX_CHARS = 1000


def _build_profile_record(
    first_result: Dict[str, Any],
    linkedin_url: str,
    source_file: str,
    staff_id: str,
) -> Dict[str, Any]:
    """Turn the first Exa crawl result into the structured record written to disk."""
    # Guard against an explicit null "text" value in the crawler output.
    profile_content = first_result.get("text") or ""
    if len(profile_content) > _ABOUT_MAX_CHARS:
        about = profile_content[:_ABOUT_MAX_CHARS] + "..."
    else:
        about = profile_content

    return {
        "extraction_metadata": {
            "source_file": source_file,
            "staff_id": staff_id,
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": linkedin_url,
            "cost_usd": 0.001,
            "request_id": first_result.get("id", "unknown"),
        },
        "profile_data": {
            "name": first_result.get("title", "Unknown"),
            "linkedin_url": linkedin_url,
            "headline": "",
            "location": "",
            "connections": "",
            "about": about,
            "experience": [],
            "education": [],
            "skills": [],
            "languages": [],
            "profile_image_url": None,
        },
    }


def extract_profile_directly(
    linkedin_url: str,
    output_file: str,
    source_file: str = "",
    staff_id: str = "",
    exa_command: Optional[Sequence[str]] = None,
) -> bool:
    """Crawl a LinkedIn profile with Exa and save it as structured JSON.

    The crawler is invoked directly as a subprocess and its JSON output is
    processed in this process. (Previously the logic was rendered into a
    ``python -c`` script by string interpolation; that script contained a
    module-level ``return`` and therefore never ran.)

    Args:
        linkedin_url: Profile URL passed to the crawler via ``--url``.
        output_file: Path of the JSON file to write; missing parent
            directories are created.
        source_file: Recorded verbatim in ``extraction_metadata.source_file``.
        staff_id: Recorded verbatim in ``extraction_metadata.staff_id``.
        exa_command: Base command (argv list) for the crawler. Defaults to
            the local ``node ... call exa_crawling_exa`` invocation.
            ``--url`` and ``--maxCharacters`` arguments are appended to it.

    Returns:
        True only if the crawler succeeded, returned at least one result and
        the output file was written; False otherwise. Failures are reported
        on stdout rather than raised.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    base_command = list(_DEFAULT_EXA_COMMAND if exa_command is None else exa_command)
    # Arguments are passed as a list (no shell, no code generation), so the
    # URL cannot break quoting or inject code.
    cmd = base_command + ["--url", linkedin_url, "--maxCharacters", "50000"]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # Missing executable, timeout, or undecodable output.
        print(f"Exception during extraction: {e}")
        return False

    if result.returncode != 0:
        print(f"❌ Failed to extract profile: {result.stderr}")
        return False

    try:
        output = json.loads(result.stdout)
    except json.JSONDecodeError as e:
        print(f"❌ Failed to extract profile: Error parsing output: {e}")
        return False

    results = output.get("results") if isinstance(output, dict) else None
    if not isinstance(results, list) or not results or not isinstance(results[0], dict):
        # A successful exit code with no usable result is still a failure.
        print("❌ Failed to extract profile: no results returned by Exa")
        return False

    structured_data = _build_profile_record(results[0], linkedin_url, source_file, staff_id)

    out_path = Path(output_file)
    try:
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(structured_data, f, indent=2, ensure_ascii=False)
    except OSError as e:
        print(f"❌ Failed to extract profile: could not write {output_file}: {e}")
        return False

    print(f"✅ Profile saved to: {output_file}")
    print(f"   Name: {structured_data['profile_data']['name']}")
    print(f"✅ Successfully extracted profile for {linkedin_url}")
    return True

def main():
    """Extract a fixed list of LinkedIn profiles and print a summary.

    Each entry supplies the keyword arguments for
    ``extract_profile_directly``. Extractions run sequentially with a short
    pause between consecutive requests; the number of successes and the
    estimated total cost are printed at the end.
    """
    # Flat per-profile cost used for the summary estimate.
    cost_per_profile = 0.001
    # Seconds to wait between consecutive requests.
    delay_seconds = 3

    # Define specific profiles to extract
    profiles_to_extract = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn',
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef',
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens',
        },
    ]

    success_count = 0

    for index, profile in enumerate(profiles_to_extract):
        # Pause only between requests; there is nothing to throttle after
        # the final profile.
        if index:
            time.sleep(delay_seconds)
        if extract_profile_directly(**profile):
            success_count += 1

    total_cost = success_count * cost_per_profile

    print("\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print("📁 Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")

# Run the extraction only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()