#!/usr/bin/env python3
"""
Extract LinkedIn profile using Exa crawler and save in proper structured format.
"""
import os
import json
import sys
import re
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional


def call_exa_crawling(url: str, max_characters: int = 50000) -> Optional[Dict[str, Any]]:
    """Call Exa crawling tool via MCP.

    Args:
        url: Page URL to crawl.
        max_characters: Maximum amount of page text Exa should return.

    Returns:
        The parsed JSON response, or None on any failure (non-zero exit
        status, timeout, or unparseable output).
    """
    try:
        # Use the MCP tool directly
        result = subprocess.run(
            ['exa-mcp-server', 'call', 'crawling_exa',
             '--url', url,
             '--maxCharacters', str(max_characters)],
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode != 0:
            print(f"Error calling Exa: {result.stderr}")
            return None
        # Parse JSON output
        return json.loads(result.stdout)
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure
        # to the caller rather than crashing the whole run.
        print(f"Exception calling Exa: {e}")
        return None


def extract_linkedin_profile_with_exa(linkedin_url: str, output_file: str,
                                      source_file: str = "", staff_id: str = "") -> bool:
    """Extract LinkedIn profile using Exa crawler and save in structured format.

    Args:
        linkedin_url: Profile URL to extract.
        output_file: Path of the JSON file to write (parent dirs are created).
        source_file: Optional provenance path recorded in the metadata.
        staff_id: Optional staff identifier recorded in the metadata.

    Returns:
        True if the profile was extracted and written, False otherwise.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    # Use Exa crawler to get profile content
    exa_result = call_exa_crawling(linkedin_url, 50000)
    if not exa_result or 'results' not in exa_result or not exa_result['results']:
        print(f"❌ Failed to extract profile from {linkedin_url}")
        return False

    # Get first (and only) result
    result = exa_result['results'][0]
    raw_content = result.get('text', '')
    title = result.get('title', '')
    url = result.get('url', linkedin_url)

    # Parse profile content
    profile_data = parse_linkedin_content(raw_content, title, url)

    # Create structured output
    structured_data = {
        "extraction_metadata": {
            "source_file": source_file or "manual_extraction",
            "staff_id": staff_id or "manual",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": url,
            "cost_usd": 0,  # Exa contents endpoint is free
            "request_id": result.get('id', 'unknown')
        },
        "profile_data": profile_data
    }

    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Save to file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(structured_data, f, indent=2, ensure_ascii=False)

    print(f"✅ Profile saved to: {output_file}")
    print(f"   Name: {profile_data.get('name', 'Unknown')}")
    print(f"   Headline: {profile_data.get('headline', '')[:80]}...")
    return True


def parse_linkedin_content(content: str, title: str, url: str) -> Dict[str, Any]:
    """Parse LinkedIn profile content from raw text.

    Heuristic, regex-based parsing of the plain text Exa returns. Fields
    that cannot be found are left at their empty defaults.

    Args:
        content: Raw page text from the crawler.
        title: Page title (typically "Name | LinkedIn").
        url: Canonical profile URL, stored verbatim in the result.

    Returns:
        Dict with name, headline, location, connections, about,
        experience, education, skills, languages and profile_image_url.
    """
    # Initialize profile data
    profile = {
        "name": "",
        "linkedin_url": url,
        "headline": "",
        "location": "",
        "connections": "",
        "about": "",
        "experience": [],
        "education": [],
        "skills": [],
        "languages": [],
        "profile_image_url": None
    }

    # Extract name from title or content
    if title:
        # Remove " | LinkedIn" suffix
        name = title.replace(' | LinkedIn', '').strip()
        if name and '|' in name:
            # If there's a pipe, take the first part as name
            name = name.split('|')[0].strip()
        profile["name"] = name

    # Extract headline (usually right after name)
    lines = content.split('\n')
    for i, line in enumerate(lines):
        if line.strip() and not line.startswith('#') and i > 0:
            # This is likely the headline
            profile["headline"] = line.strip()
            break

    # Extract location
    location_patterns = [
        r'([A-Za-z\s]+,\s*[A-Za-z\s]+,\s*[A-Za-z\s]+)',  # City, Region, Country
        r'([A-Za-z\s]+,\s*[A-Za-z\s]+)',  # City, Country
    ]
    for pattern in location_patterns:
        match = re.search(pattern, content)
        if match:
            profile["location"] = match.group(1).strip()
            break

    # Extract connections
    conn_match = re.search(r'(\d+(?:,\d+)*)\s+connections', content, re.IGNORECASE)
    if conn_match:
        connections = conn_match.group(1)
        # Look for followers
        follower_match = re.search(r'(\d+(?:,\d+)*)\s+followers', content, re.IGNORECASE)
        if follower_match:
            connections += f" • {follower_match.group(1)} followers"
        profile["connections"] = connections

    # Extract About section
    about_match = re.search(r'About\s*\n+(.+?)(?=\n\n|\n[A-Z]|\Z)', content,
                            re.DOTALL | re.IGNORECASE)
    if about_match:
        profile["about"] = about_match.group(1).strip()

    # Extract Experience section
    exp_section = re.search(r'Experience\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)',
                            content, re.DOTALL | re.IGNORECASE)
    if exp_section:
        exp_content = exp_section.group(1)
        # Parse individual experiences
        exp_entries = re.findall(
            r'([A-Z][^-\n][^\n]*)\n–\s*([^\n]+)\n([^•\n]*(?:\n[^•\n]*)*?)',
            exp_content)
        # NOTE: loop variable renamed from `title` to avoid shadowing the
        # `title` parameter of this function.
        for exp_title, company, details in exp_entries:
            exp = {
                "title": exp_title.strip(),
                "company": company.strip(),
                "duration": "",
                "location": "",
                "description": ""
            }
            # Extract duration
            dur_match = re.search(r'(\w+\s+\d{4})\s*-\s*(Present|\w+\s+\d{4})', details)
            if dur_match:
                exp["duration"] = f"{dur_match.group(1)} - {dur_match.group(2)}"
            # Extract location from details
            loc_match = re.search(r'([A-Za-z\s]+,\s*[A-Za-z\s]+)', details)
            if loc_match:
                exp["location"] = loc_match.group(1).strip()
            # Clean up description
            exp["description"] = re.sub(r'\s+', ' ', details).strip()
            profile["experience"].append(exp)

    # Extract Education section
    edu_section = re.search(r'Education\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)',
                            content, re.DOTALL | re.IGNORECASE)
    if edu_section:
        edu_content = edu_section.group(1)
        # Parse individual education entries
        edu_entries = re.findall(
            r'([^\n]+)\n–\s*([^\n]+)\n([^•\n]*(?:\n[^•\n]*)*?)',
            edu_content)
        for school, degree, details in edu_entries:
            edu = {
                "school": school.strip(),
                "degree": degree.strip(),
                "years": "",
                "description": ""
            }
            # Extract years
            years_match = re.search(r'(\d{4})\s*-\s*(\d{4}|Present)', details)
            if years_match:
                edu["years"] = f"{years_match.group(1)} - {years_match.group(2)}"
            edu["description"] = re.sub(r'\s+', ' ', details).strip()
            profile["education"].append(edu)

    # Extract Skills section
    skills_section = re.search(r'Skills\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)',
                               content, re.DOTALL | re.IGNORECASE)
    if skills_section:
        skills_content = skills_section.group(1)
        # Extract skills (they might be separated by commas or bullets)
        skills = re.split(r'[,•\n]\s*', skills_content)
        profile["skills"] = [s.strip() for s in skills if s.strip() and len(s.strip()) > 1]

    # Extract Languages
    lang_match = re.search(r'Languages\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)',
                           content, re.DOTALL | re.IGNORECASE)
    if lang_match:
        lang_content = lang_match.group(1)
        languages = re.split(r'[,•\n]\s*', lang_content)
        profile["languages"] = [lang.strip() for lang in languages
                                if lang.strip() and len(lang.strip()) > 1]

    # Try to extract profile image URL
    img_match = re.search(r'https://media\.licdn\.com/dms/image/[^\s\)]+', content)
    if img_match:
        profile["profile_image_url"] = img_match.group(0)

    return profile


def main():
    """Main function to extract a specific LinkedIn profile."""
    if len(sys.argv) < 3:
        # Usage string fixed: the two positional arguments enforced by the
        # argv-length check above were previously missing from the message.
        print("Usage: python extract_linkedin_profile_exa.py <linkedin_url> <output_file> [source_file] [staff_id]")
        print("\nExample:")
        print("python extract_linkedin_profile_exa.py https://www.linkedin.com/in/annelien-vos-keen-657b66223 /Users/kempersc/apps/glam/data/custodian/person/entity/annelien-vos-keen-657b66223_20251210T160000Z.json")
        sys.exit(1)

    linkedin_url = sys.argv[1]
    output_file = sys.argv[2]
    source_file = sys.argv[3] if len(sys.argv) > 3 else ""
    staff_id = sys.argv[4] if len(sys.argv) > 4 else ""

    # Extract profile
    success = extract_linkedin_profile_with_exa(linkedin_url, output_file,
                                                source_file, staff_id)
    if success:
        print("\n✅ Profile extraction completed successfully!")
    else:
        print("\n❌ Profile extraction failed!")
        sys.exit(1)


if __name__ == "__main__":
    main()