#!/usr/bin/env python3
"""
Extract comprehensive LinkedIn profiles using Exa LinkedIn search service.
"""

import json
import os
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any

# Executable that receives the JSON-RPC request on stdin.
_EXA_COMMAND = "exa-mcp-server"
# Upper bound, in seconds, for a single exa-mcp-server invocation.
_EXA_TIMEOUT_SECONDS = 60
# The "about" field is cut to this many characters (plus a "..." marker).
_ABOUT_MAX_CHARS = 2000
# Cost recorded per extracted profile (value taken over from the original script).
_COST_PER_PROFILE_USD = 0.002
# Pause between consecutive profiles to respect rate limits.
_DELAY_BETWEEN_PROFILES_SECONDS = 3


def _call_exa_tool(tool_name: str, arguments: Dict[str, Any], label: str) -> Optional[list]:
    """Send one JSON-RPC ``tools/call`` request to exa-mcp-server.

    The request is written to the server's stdin. Passing ``'|'`` as an
    argument to ``echo`` (as the script used to do) never starts the server,
    because ``subprocess.run`` with an argument list does not go through a
    shell.

    Args:
        tool_name: Value for ``params.name`` in the request.
        arguments: Value for ``params.arguments`` in the request.
        label: Human-readable tool name used in the printed diagnostics.

    Returns:
        The list found at ``response["result"]["results"]``, or ``None`` when
        the process could not be run, exited non-zero, produced output that
        is not JSON, or produced a response without such a list.
    """
    request = json.dumps({
        "jsonrpc": "2.0",
        "method": "tools/call",
        "params": {
            "name": tool_name,
            "arguments": arguments,
        },
        "id": 1,
    })

    # NOTE(review): this sends a single request and expects a single JSON
    # document on stdout with result.results - confirm that the installed
    # exa-mcp-server answers without a prior handshake and in that shape.
    try:
        completed = subprocess.run(
            [_EXA_COMMAND],
            input=request,
            capture_output=True,
            text=True,
            timeout=_EXA_TIMEOUT_SECONDS,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # Missing executable, permission problems, or timeout.
        print(f"Exception calling Exa {label}: {exc}")
        return None

    if completed.returncode != 0:
        print(f"Error calling Exa {label}: {completed.stderr}")
        return None

    try:
        response = json.loads(completed.stdout)
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        print(f"Exception calling Exa {label}: {exc}")
        return None

    result = response.get("result") if isinstance(response, dict) else None
    results = result.get("results") if isinstance(result, dict) else None
    if not isinstance(results, list):
        print(f"No results from {label}: {response}")
        return None
    return results


def _truncate_about(text: Optional[str]) -> str:
    """Return *text* limited to ``_ABOUT_MAX_CHARS`` characters.

    Longer text is cut and suffixed with ``"..."``; ``None`` becomes ``""``.
    """
    text = text or ""
    if len(text) > _ABOUT_MAX_CHARS:
        return text[:_ABOUT_MAX_CHARS] + "..."
    return text


def _build_profile_data(name: str, linkedin_url: str, record: Dict[str, Any],
                        location: str = "") -> Dict[str, Any]:
    """Build the ``profile_data`` section from one Exa result record.

    Only ``title`` and ``text`` are read from *record*; the list-valued
    fields are always written empty.
    """
    return {
        "name": name,
        "linkedin_url": linkedin_url,
        "headline": record.get("title") or "",
        "location": location,
        "connections": "",
        "about": _truncate_about(record.get("text")),
        "experience": [],
        "education": [],
        "skills": [],
        "languages": [],
        "profile_image_url": None,
    }


def call_exa_linkedin_search(name: str, linkedin_url: str = "") -> Optional[list]:
    """Call Exa LinkedIn search to find profile.

    Args:
        name: Person name; used as the query when no URL is given.
        linkedin_url: Optional profile URL. When given, its last path segment
            is searched for instead of the name.

    Returns:
        The list of search results (possibly empty), or ``None`` on failure.
    """
    if linkedin_url:
        # rstrip so a trailing "/" does not produce an empty slug.
        slug = linkedin_url.rstrip('/').split('/')[-1]
        query = f"site:linkedin.com/in/ {slug}"
    else:
        query = name

    return _call_exa_tool(
        "linkedin_search_exa",
        {
            "query": query,
            "searchType": "profiles",
            "numResults": 5,
        },
        "LinkedIn search",
    )


def call_exa_crawling(linkedin_url: str) -> Optional[dict]:
    """Call Exa crawling to get full profile content.

    Args:
        linkedin_url: URL of the page to crawl.

    Returns:
        The first crawl result, or ``None`` when the call failed or returned
        no results.
    """
    results = _call_exa_tool(
        "crawling_exa",
        {
            "url": linkedin_url,
            "maxCharacters": 50000,
        },
        "crawling",
    )
    if results is None:
        return None
    if not results:
        print(f"No results from crawling: {results}")
        return None
    return results[0]


def extract_comprehensive_profile(name: str, linkedin_url: str, output_file: str,
                                  source_file: str, staff_id: str) -> bool:
    """Extract comprehensive LinkedIn profile using Exa services.

    Crawls *linkedin_url* first; if that yields nothing, falls back to the
    LinkedIn search and takes its first result. The profile is written as
    JSON to *output_file* (parent directories are created).

    Args:
        name: Person name stored in the profile and used for the fallback search.
        linkedin_url: Profile URL to crawl.
        output_file: Path of the JSON file to write.
        source_file: Recorded verbatim in the extraction metadata.
        staff_id: Recorded verbatim in the extraction metadata.

    Returns:
        ``True`` when a profile was written, ``False`` when both crawling and
        search produced nothing.
    """
    print(f"Extracting comprehensive LinkedIn profile for: {name}")
    print(f" URL: {linkedin_url}")

    # Try crawling first (most comprehensive)
    crawl_result = call_exa_crawling(linkedin_url)
    if crawl_result:
        print(f" ✅ Successfully crawled profile content")
        profile_data = _build_profile_data(name, linkedin_url, crawl_result)
    else:
        print(f" ⚠️ Crawling failed, trying LinkedIn search...")
        search_results = call_exa_linkedin_search(name, linkedin_url)
        if not search_results:
            print(f" ❌ Both crawling and search failed")
            return False

        best_match = search_results[0]  # Take first result
        print(f" ✅ Found profile via search")
        # "metadata" may be absent or null in the result record.
        metadata = best_match.get("metadata") or {}
        profile_data = _build_profile_data(
            name, linkedin_url, best_match, metadata.get("location", "")
        )

    # One UTC timestamp for both fields so they cannot disagree.
    now = datetime.now(timezone.utc)
    structured_data = {
        "extraction_metadata": {
            "source_file": source_file,
            "staff_id": staff_id,
            "extraction_date": now.isoformat(),
            "extraction_method": "exa_linkedin_search_and_crawl",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": linkedin_url,
            "cost_usd": _COST_PER_PROFILE_USD,  # Two API calls
            "request_id": f"search_crawl_{now.strftime('%Y%m%d_%H%M%S')}",
        },
        "profile_data": profile_data,
    }

    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(structured_data, f, indent=2, ensure_ascii=False)

    print(f" ✅ Profile saved to: {output_path}")
    print(f" 📊 Name: {profile_data.get('name', 'Unknown')}")
    print(f" 📝 Headline: {profile_data.get('headline', '')[:100]}...")
    return True


def main():
    """Main function to extract comprehensive LinkedIn profiles."""
    # Define specific profiles to extract from Academiehuis Grote Kerk Zwolle
    profiles = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'name': 'Anja van Hoorn',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn',
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef',
            'name': 'Inez van Kleef',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef',
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'name': 'Marga Edens',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens',
        },
    ]

    success_count = 0
    total_cost = 0.0

    for index, profile in enumerate(profiles):
        if index:
            # Delay between requests to respect rate limits
            time.sleep(_DELAY_BETWEEN_PROFILES_SECONDS)
        if extract_comprehensive_profile(**profile):
            success_count += 1
            total_cost += _COST_PER_PROFILE_USD

    print(f"\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print(f"📁 Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")


if __name__ == "__main__":
    main()