glam/scripts/extract_linkedin_profile_exa.py
2025-12-11 22:32:09 +01:00

251 lines
No EOL
9.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Extract LinkedIn profile using Exa crawler and save in proper structured format.
"""
import os
import json
import sys
import re
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional
def call_exa_crawling(url: str, max_characters: int = 50000) -> Optional[Dict[str, Any]]:
    """Call the Exa ``crawling_exa`` tool via the MCP server CLI.

    Args:
        url: Page URL to crawl.
        max_characters: Maximum number of characters of page text to request.

    Returns:
        The parsed JSON payload from the tool, or ``None`` on any failure
        (missing binary, timeout, non-zero exit, or malformed JSON output).
        The annotation is ``Optional`` because every error path returns None.
    """
    try:
        # argv list (shell=False) keeps the URL from being shell-interpreted.
        result = subprocess.run(
            ['exa-mcp-server', 'call', 'crawling_exa',
             '--url', url,
             '--maxCharacters', str(max_characters)],
            capture_output=True,
            text=True,
            timeout=60
        )
    except (subprocess.TimeoutExpired, OSError) as e:
        # OSError covers a missing or non-executable exa-mcp-server binary.
        print(f"Exception calling Exa: {e}")
        return None
    if result.returncode != 0:
        print(f"Error calling Exa: {result.stderr}")
        return None
    try:
        # The tool prints its JSON response on stdout.
        return json.loads(result.stdout)
    except json.JSONDecodeError as e:
        print(f"Exception calling Exa: {e}")
        return None
def extract_linkedin_profile_with_exa(linkedin_url: str, output_file: str, source_file: str = "", staff_id: str = "") -> bool:
    """Crawl a LinkedIn profile with Exa and persist it as structured JSON.

    Args:
        linkedin_url: Profile URL to crawl.
        output_file: Destination path for the structured JSON document.
        source_file: Optional provenance file recorded in the metadata.
        staff_id: Optional staff identifier recorded in the metadata.

    Returns:
        True when the profile was extracted and written, False otherwise.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    # Guard clause: bail out unless the crawl produced at least one result.
    crawl = call_exa_crawling(linkedin_url, 50000)
    results = (crawl or {}).get('results') or []
    if not results:
        print(f"❌ Failed to extract profile from {linkedin_url}")
        return False

    top = results[0]
    resolved_url = top.get('url', linkedin_url)
    parsed = parse_linkedin_content(top.get('text', ''), top.get('title', ''), resolved_url)

    payload = {
        "extraction_metadata": {
            "source_file": source_file or "manual_extraction",
            "staff_id": staff_id or "manual",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": resolved_url,
            "cost_usd": 0,  # Exa contents endpoint is free
            "request_id": top.get('id', 'unknown')
        },
        "profile_data": parsed
    }

    # Create the parent directory if needed, then write the JSON document.
    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding='utf-8')

    print(f"✅ Profile saved to: {output_file}")
    print(f" Name: {parsed.get('name', 'Unknown')}")
    print(f" Headline: {parsed.get('headline', '')[:80]}...")
    return True
def parse_linkedin_content(content: str, title: str, url: str) -> Dict[str, Any]:
    """Parse LinkedIn profile fields out of crawled page text.

    Args:
        content: Raw page text returned by the crawler.
        title: Page title, typically "<Name> | LinkedIn".
        url: Profile URL, stored verbatim in the result.

    Returns:
        Dict with keys: name, linkedin_url, headline, location, connections,
        about, experience, education, skills, languages, profile_image_url.
        Fields that cannot be found stay empty ("" / [] / None).

    NOTE(review): the regex heuristics below assume the crawler emits section
    headers ("About", "Experience", ...) on their own lines — confirm against
    real Exa output if parsing regresses.
    """
    profile: Dict[str, Any] = {
        "name": "",
        "linkedin_url": url,
        "headline": "",
        "location": "",
        "connections": "",
        "about": "",
        "experience": [],
        "education": [],
        "skills": [],
        "languages": [],
        "profile_image_url": None
    }

    # Name: strip the " | LinkedIn" suffix from the page title; if any other
    # pipe remains, keep only the first segment.
    if title:
        name = title.replace(' | LinkedIn', '').strip()
        if name and '|' in name:
            name = name.split('|')[0].strip()
        profile["name"] = name

    # Headline: first non-empty, non-heading line after the first line.
    lines = content.split('\n')
    for i, line in enumerate(lines):
        if line.strip() and not line.startswith('#') and i > 0:
            profile["headline"] = line.strip()
            break

    # Location: first "City, Region, Country" or "City, Country" shaped text.
    location_patterns = [
        r'([A-Za-z\s]+,\s*[A-Za-z\s]+,\s*[A-Za-z\s]+)',  # City, Region, Country
        r'([A-Za-z\s]+,\s*[A-Za-z\s]+)',  # City, Country
    ]
    for pattern in location_patterns:
        match = re.search(pattern, content)
        if match:
            profile["location"] = match.group(1).strip()
            break

    # Connections / followers, e.g. "500 connections", "1,000 followers".
    conn_match = re.search(r'(\d+(?:,\d+)*)\s+connections', content, re.IGNORECASE)
    if conn_match:
        connections = conn_match.group(1)
        follower_match = re.search(r'(\d+(?:,\d+)*)\s+followers', content, re.IGNORECASE)
        if follower_match:
            # BUG FIX: without a separator the two counts ran together,
            # e.g. "500" + "1,000 followers" -> "5001,000 followers".
            connections += f", {follower_match.group(1)} followers"
        profile["connections"] = connections

    # About section: text after an "About" header up to a blank line or the
    # next capitalised line.
    about_match = re.search(r'About\s*\n+(.+?)(?=\n\n|\n[A-Z]|\Z)', content, re.DOTALL | re.IGNORECASE)
    if about_match:
        profile["about"] = about_match.group(1).strip()

    # Experience section: best-effort split into title/company/detail triples.
    exp_section = re.search(r'Experience\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if exp_section:
        exp_content = exp_section.group(1)
        exp_entries = re.findall(r'([A-Z][^-\n][^\n]*)\n\s*([^\n]+)\n([^•\n]*(?:\n[^•\n]*)*?)', exp_content)
        # Loop variable renamed (was `title`, shadowing the parameter).
        for role, company, details in exp_entries:
            exp = {
                "title": role.strip(),
                "company": company.strip(),
                "duration": "",
                "location": "",
                "description": ""
            }
            # Duration like "Jan 2020 - Present" or "Jan 2020 - Dec 2021".
            dur_match = re.search(r'(\w+\s+\d{4})\s*-\s*(Present|\w+\s+\d{4})', details)
            if dur_match:
                exp["duration"] = f"{dur_match.group(1)} - {dur_match.group(2)}"
            loc_match = re.search(r'([A-Za-z\s]+,\s*[A-Za-z\s]+)', details)
            if loc_match:
                exp["location"] = loc_match.group(1).strip()
            # Collapse all whitespace runs in the free-text description.
            exp["description"] = re.sub(r'\s+', ' ', details).strip()
            profile["experience"].append(exp)

    # Education section: school/degree/detail triples plus a year range.
    edu_section = re.search(r'Education\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if edu_section:
        edu_content = edu_section.group(1)
        edu_entries = re.findall(r'([^\n]+)\n\s*([^\n]+)\n([^•\n]*(?:\n[^•\n]*)*?)', edu_content)
        for school, degree, details in edu_entries:
            edu = {
                "school": school.strip(),
                "degree": degree.strip(),
                "years": "",
                "description": ""
            }
            years_match = re.search(r'(\d{4})\s*-\s*(\d{4}|Present)', details)
            if years_match:
                edu["years"] = f"{years_match.group(1)} - {years_match.group(2)}"
            edu["description"] = re.sub(r'\s+', ' ', details).strip()
            profile["education"].append(edu)

    # Skills: comma/bullet/newline separated tokens longer than one character.
    skills_section = re.search(r'Skills\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if skills_section:
        skills = re.split(r'[,•\n]\s*', skills_section.group(1))
        profile["skills"] = [s.strip() for s in skills if s.strip() and len(s.strip()) > 1]

    # Languages: same tokenisation as skills.
    lang_match = re.search(r'Languages\s*\n+(.+?)(?=\n\n|\n[A-Z][a-z]+\s*\n|\Z)', content, re.DOTALL | re.IGNORECASE)
    if lang_match:
        languages = re.split(r'[,•\n]\s*', lang_match.group(1))
        # Loop variable renamed (was `l`, easily confused with `1`).
        profile["languages"] = [lang.strip() for lang in languages if lang.strip() and len(lang.strip()) > 1]

    # Profile image: first LinkedIn CDN image URL found in the text.
    img_match = re.search(r'https://media\.licdn\.com/dms/image/[^\s\)]+', content)
    if img_match:
        profile["profile_image_url"] = img_match.group(0)

    return profile
def main() -> None:
    """CLI entry point: validate argv, run the extraction, exit non-zero on failure."""
    args = sys.argv[1:]
    if len(args) < 2:
        print("Usage: python extract_linkedin_profile_exa.py <linkedin_url> <output_file> [source_file] [staff_id]")
        print("\nExample:")
        print("python extract_linkedin_profile_exa.py https://www.linkedin.com/in/annelien-vos-keen-657b66223 /Users/kempersc/apps/glam/data/custodian/person/entity/annelien-vos-keen-657b66223_20251210T160000Z.json")
        sys.exit(1)

    linkedin_url, output_file = args[0], args[1]
    # Optional provenance arguments default to empty strings.
    source_file = args[2] if len(args) > 2 else ""
    staff_id = args[3] if len(args) > 3 else ""

    if extract_linkedin_profile_with_exa(linkedin_url, output_file, source_file, staff_id):
        print("\n✅ Profile extraction completed successfully!")
    else:
        print("\n❌ Profile extraction failed!")
        sys.exit(1)


if __name__ == "__main__":
    main()