251 lines
No EOL
9.1 KiB
Python
251 lines
No EOL
9.1 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Extract LinkedIn profile using Exa crawler and save in proper structured format.
|
||
"""
|
||
|
||
import os
|
||
import json
|
||
import sys
|
||
import re
|
||
import subprocess
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Dict, List, Any, Optional
|
||
|
||
|
||
def call_exa_crawling(url: str, max_characters: int = 50000) -> Optional[Dict[str, Any]]:
    """Call the Exa ``crawling_exa`` tool via the MCP server CLI.

    Args:
        url: Page URL to crawl.
        max_characters: Upper bound on the amount of page text Exa returns.

    Returns:
        The parsed JSON response as a dict, or ``None`` when the subprocess
        is missing, exits non-zero, times out, or emits unparseable output.
    """
    try:
        # argv list (shell=False) avoids shell interpretation of the URL.
        result = subprocess.run(
            ['exa-mcp-server', 'call', 'crawling_exa',
             '--url', url,
             '--maxCharacters', str(max_characters)],
            capture_output=True,
            text=True,
            timeout=60
        )

        if result.returncode != 0:
            print(f"Error calling Exa: {result.stderr}")
            return None

        # Parse JSON output
        return json.loads(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: binary missing/not executable; SubprocessError covers
        # TimeoutExpired; ValueError covers json.JSONDecodeError.
        print(f"Exception calling Exa: {e}")
        return None
|
||
|
||
|
||
def extract_linkedin_profile_with_exa(linkedin_url: str, output_file: str, source_file: str = "", staff_id: str = "") -> bool:
    """Crawl a LinkedIn profile with Exa and persist it as structured JSON.

    Args:
        linkedin_url: Public profile URL to crawl.
        output_file: Destination path for the JSON document.
        source_file: Optional provenance marker recorded in the metadata.
        staff_id: Optional staff identifier recorded in the metadata.

    Returns:
        True when the profile was crawled and written, False otherwise.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    # Crawl the page through the Exa MCP tool.
    response = call_exa_crawling(linkedin_url, 50000)
    hits = response.get('results') if response else None
    if not hits:
        print(f"❌ Failed to extract profile from {linkedin_url}")
        return False

    # A direct-URL crawl yields a single result.
    hit = hits[0]
    page_text = hit.get('text', '')
    page_title = hit.get('title', '')
    resolved_url = hit.get('url', linkedin_url)

    # Turn the raw page text into the structured profile dict.
    profile_data = parse_linkedin_content(page_text, page_title, resolved_url)

    document = {
        "extraction_metadata": {
            "source_file": source_file or "manual_extraction",
            "staff_id": staff_id or "manual",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": resolved_url,
            "cost_usd": 0,  # Exa contents endpoint is free
            "request_id": hit.get('id', 'unknown'),
        },
        "profile_data": profile_data,
    }

    # Create parent directories on demand, then write pretty-printed JSON.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(
        json.dumps(document, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )

    print(f"✅ Profile saved to: {output_file}")
    print(f" Name: {profile_data.get('name', 'Unknown')}")
    print(f" Headline: {profile_data.get('headline', '')[:80]}...")
    return True
|
||
|
||
|
||
def parse_linkedin_content(content: str, title: str, url: str) -> Dict[str, Any]:
    """Parse a crawled LinkedIn profile page into a structured dict.

    Best-effort, regex-based scraping of the plain-text page returned by the
    crawler. Any field that cannot be located is left as an empty string,
    empty list, or ``None`` (for the image URL).

    Args:
        content: Raw page text from the crawler.
        title: Page title (typically ``"Name | LinkedIn"``).
        url: Canonical profile URL to embed in the result.

    Returns:
        Dict with keys: name, linkedin_url, headline, location, connections,
        about, experience, education, skills, languages, profile_image_url.
    """
    profile: Dict[str, Any] = {
        "name": "",
        "linkedin_url": url,
        "headline": "",
        "location": "",
        "connections": "",
        "about": "",
        "experience": [],
        "education": [],
        "skills": [],
        "languages": [],
        "profile_image_url": None
    }

    # Name: titles look like "Jane Doe | LinkedIn"; when extra "|"-separated
    # segments remain, the first segment is the name.
    if title:
        name = title.replace(' | LinkedIn', '').strip()
        if name and '|' in name:
            name = name.split('|')[0].strip()
        profile["name"] = name

    # Headline: first non-empty, non-markdown-heading line after line 0
    # (line 0 is usually the name again).
    lines = content.split('\n')
    for i, line in enumerate(lines):
        if line.strip() and not line.startswith('#') and i > 0:
            profile["headline"] = line.strip()
            break

    # Location. FIX: the original classes used \s, which matches newlines,
    # so a "location" could greedily swallow several unrelated lines. A
    # literal space keeps each part on one line.
    location_patterns = [
        r'([A-Za-z ]+, *[A-Za-z ]+, *[A-Za-z ]+)',  # City, Region, Country
        r'([A-Za-z ]+, *[A-Za-z ]+)',  # City, Country
    ]
    for pattern in location_patterns:
        match = re.search(pattern, content)
        if match:
            profile["location"] = match.group(1).strip()
            break

    # Connections, optionally augmented with a follower count.
    conn_match = re.search(r'(\d+(?:,\d+)*)\s+connections', content, re.IGNORECASE)
    if conn_match:
        connections = conn_match.group(1)
        follower_match = re.search(r'(\d+(?:,\d+)*)\s+followers', content, re.IGNORECASE)
        if follower_match:
            connections += f" • {follower_match.group(1)} followers"
        profile["connections"] = connections

    # About: text after the "About" heading, up to a blank line or the
    # next capitalised line.
    about_match = re.search(r'About\s*\n+(.+?)(?=\n\n|\n[A-Z]|\Z)', content, re.DOTALL | re.IGNORECASE)
    if about_match:
        profile["about"] = about_match.group(1).strip()

    # Experience section: entries are assumed to look like
    # "Title\n– Company\ndetails...".
    exp_section = re.search(r'Experience\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if exp_section:
        exp_content = exp_section.group(1)
        # NOTE: renamed the loop variable so it no longer shadows the
        # `title` parameter.
        exp_entries = re.findall(r'([A-Z][^-\n][^\n]*)\n–\s*([^\n]+)\n([^•\n]*(?:\n[^•\n]*)*?)', exp_content)
        for exp_title, company, details in exp_entries:
            exp = {
                "title": exp_title.strip(),
                "company": company.strip(),
                "duration": "",
                "location": "",
                "description": ""
            }

            # Duration like "Jan 2020 - Present" or "Jan 2020 - Mar 2022".
            dur_match = re.search(r'(\w+\s+\d{4})\s*-\s*(Present|\w+\s+\d{4})', details)
            if dur_match:
                exp["duration"] = f"{dur_match.group(1)} - {dur_match.group(2)}"

            # Same newline-safe class as the top-level location patterns.
            loc_match = re.search(r'([A-Za-z ]+, *[A-Za-z ]+)', details)
            if loc_match:
                exp["location"] = loc_match.group(1).strip()

            # Collapse whitespace runs for a one-line description.
            exp["description"] = re.sub(r'\s+', ' ', details).strip()

            profile["experience"].append(exp)

    # Education section: "School\n– Degree\ndetails...".
    edu_section = re.search(r'Education\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if edu_section:
        edu_content = edu_section.group(1)
        edu_entries = re.findall(r'([^\n]+)\n–\s*([^\n]+)\n([^•\n]*(?:\n[^•\n]*)*?)', edu_content)
        for school, degree, details in edu_entries:
            edu = {
                "school": school.strip(),
                "degree": degree.strip(),
                "years": "",
                "description": ""
            }

            years_match = re.search(r'(\d{4})\s*-\s*(\d{4}|Present)', details)
            if years_match:
                edu["years"] = f"{years_match.group(1)} - {years_match.group(2)}"

            edu["description"] = re.sub(r'\s+', ' ', details).strip()

            profile["education"].append(edu)

    # Skills: comma-, bullet- or newline-separated tokens (min length 2).
    skills_section = re.search(r'Skills\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if skills_section:
        skills = re.split(r'[,•\n]\s*', skills_section.group(1))
        profile["skills"] = [s.strip() for s in skills if s.strip() and len(s.strip()) > 1]

    # Languages: same tokenisation as skills.
    lang_match = re.search(r'Languages\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if lang_match:
        languages = re.split(r'[,•\n]\s*', lang_match.group(1))
        profile["languages"] = [lang.strip() for lang in languages if lang.strip() and len(lang.strip()) > 1]

    # Profile image: first LinkedIn media CDN URL found in the text.
    img_match = re.search(r'https://media\.licdn\.com/dms/image/[^\s\)]+', content)
    if img_match:
        profile["profile_image_url"] = img_match.group(0)

    return profile
|
||
|
||
|
||
def main():
    """CLI entry point: extract one LinkedIn profile and save it as JSON.

    Exits with status 1 on missing arguments or a failed extraction.
    """
    argv = sys.argv
    if len(argv) < 3:
        print("Usage: python extract_linkedin_profile_exa.py <linkedin_url> <output_file> [source_file] [staff_id]")
        print("\nExample:")
        print("python extract_linkedin_profile_exa.py https://www.linkedin.com/in/annelien-vos-keen-657b66223 /Users/kempersc/apps/glam/data/custodian/person/entity/annelien-vos-keen-657b66223_20251210T160000Z.json")
        sys.exit(1)

    # Required positionals, then two optional provenance arguments.
    linkedin_url, output_file = argv[1], argv[2]
    source_file = argv[3] if len(argv) > 3 else ""
    staff_id = argv[4] if len(argv) > 4 else ""

    if extract_linkedin_profile_with_exa(linkedin_url, output_file, source_file, staff_id):
        print("\n✅ Profile extraction completed successfully!")
    else:
        print("\n❌ Profile extraction failed!")
        sys.exit(1)
|
||
|
||
|
||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()