glam/scripts/extract_comprehensive_profiles.py
2025-12-11 22:32:09 +01:00

230 lines
No EOL
8.5 KiB
Python

#!/usr/bin/env python3
"""
Extract comprehensive LinkedIn profiles using Exa LinkedIn search service.
"""
import json
import os
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any
def call_exa_linkedin_search(name: str, linkedin_url: str = "") -> Optional[dict]:
    """Search for a LinkedIn profile via the exa-mcp-server tool.

    Args:
        name: Person's full name, used as the default search query.
        linkedin_url: Optional profile URL; when provided, a site-restricted
            query built from the URL's trailing slug is used instead.

    Returns:
        The tool's result list on success, or None on any failure
        (non-zero exit, unparseable output, missing binary, timeout).
    """
    try:
        # Build search query - try name first, then URL if needed
        query = name if not linkedin_url else f"site:linkedin.com/in/ {linkedin_url.split('/')[-1]}"
        request = json.dumps({
            "jsonrpc": "2.0",
            "method": "tools/call",
            "params": {
                "name": "linkedin_search_exa",
                "arguments": {
                    "query": query,
                    "searchType": "profiles",
                    "numResults": 5
                }
            },
            "id": 1
        })
        # Bug fix: the original ran ['echo', payload, '|', 'exa-mcp-server']
        # without a shell, so the '|' was a literal argument and the server
        # was never invoked. Feed the JSON-RPC request to the server's stdin
        # directly instead (shell=False also avoids any quoting/injection risk).
        result = subprocess.run(
            ['exa-mcp-server'],
            input=request,
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode != 0:
            print(f"Error calling Exa LinkedIn search: {result.stderr}")
            return None
        # Parse JSON-RPC response
        response = json.loads(result.stdout)
        if 'result' in response and 'results' in response['result']:
            return response['result']['results']
        else:
            print(f"No results from LinkedIn search: {response}")
            return None
    except Exception as e:
        print(f"Exception calling Exa LinkedIn search: {e}")
        return None
def call_exa_crawling(linkedin_url: str) -> Optional[dict]:
    """Fetch full page content for a URL via the exa-mcp-server crawling tool.

    Args:
        linkedin_url: The profile URL to crawl.

    Returns:
        The first crawl result dict on success, or None on any failure.
        (Annotation corrected from `dict` to `Optional[dict]`: every error
        path returns None.)
    """
    try:
        request = json.dumps({
            "jsonrpc": "2.0",
            "method": "tools/call",
            "params": {
                "name": "crawling_exa",
                "arguments": {
                    "url": linkedin_url,
                    "maxCharacters": 50000
                }
            },
            "id": 1
        })
        # Bug fix: the original ran ['echo', payload, '|', 'exa-mcp-server']
        # without a shell, so the pipe was never interpreted and the server
        # never ran. Send the JSON-RPC request on the server's stdin instead.
        result = subprocess.run(
            ['exa-mcp-server'],
            input=request,
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode != 0:
            print(f"Error calling Exa crawling: {result.stderr}")
            return None
        # Parse JSON-RPC response
        response = json.loads(result.stdout)
        if 'result' in response and 'results' in response['result']:
            return response['result']['results'][0]
        else:
            print(f"No results from crawling: {response}")
            return None
    except Exception as e:
        print(f"Exception calling Exa crawling: {e}")
        return None
def _truncate(text: str, limit: int = 2000) -> str:
    """Cut *text* to *limit* characters with a trailing ellipsis, else return it unchanged."""
    return text[:limit] + "..." if len(text) > limit else text


def extract_comprehensive_profile(name: str, linkedin_url: str, output_file: str, source_file: str, staff_id: str) -> bool:
    """Extract a LinkedIn profile via Exa crawl (with search fallback) and save it as JSON.

    Args:
        name: Person's display name.
        linkedin_url: Profile URL to crawl / search for.
        output_file: Destination path for the structured JSON output.
        source_file: Provenance: the staff file this person came from.
        staff_id: Provenance: the person's id within the staff file.

    Returns:
        True if a profile was obtained and written to disk, False otherwise.
    """
    print(f"Extracting comprehensive LinkedIn profile for: {name}")
    print(f" URL: {linkedin_url}")
    # First try to get full profile content via crawling
    profile_data = None
    # Try crawling first (most comprehensive)
    crawl_result = call_exa_crawling(linkedin_url)
    if crawl_result:
        print(f" ✅ Successfully crawled profile content")
        profile_data = {
            "name": name,
            "linkedin_url": linkedin_url,
            "headline": crawl_result.get("title", ""),
            "location": "",
            "connections": "",
            "about": _truncate(crawl_result.get("text", "")),
            "experience": [],
            "education": [],
            "skills": [],
            "languages": [],
            "profile_image_url": None
        }
    else:
        print(f" ⚠️ Crawling failed, trying LinkedIn search...")
        # Fallback to LinkedIn search
        search_results = call_exa_linkedin_search(name, linkedin_url)
        if search_results and len(search_results) > 0:
            # Find the best match
            best_match = search_results[0]  # Take first result
            print(f" ✅ Found profile via search")
            profile_data = {
                "name": name,
                "linkedin_url": linkedin_url,
                "headline": best_match.get("title", ""),
                "location": best_match.get("metadata", {}).get("location", ""),
                "connections": "",
                "about": _truncate(best_match.get("text", "")),
                "experience": [],
                "education": [],
                "skills": [],
                "languages": [],
                "profile_image_url": None
            }
        else:
            print(f" ❌ Both crawling and search failed")
            return False
    # Create structured output
    structured_data = {
        "extraction_metadata": {
            "source_file": source_file,
            "staff_id": staff_id,
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_linkedin_search_and_crawl",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": linkedin_url,
            "cost_usd": 0.002,  # Two API calls
            # UTC, for consistency with extraction_date (was naive local time)
            "request_id": f"search_crawl_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}"
        },
        "profile_data": profile_data
    }
    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Save to file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(structured_data, f, indent=2, ensure_ascii=False)
    print(f" ✅ Profile saved to: {output_path}")
    print(f" 📊 Name: {profile_data.get('name', 'Unknown')}")
    print(f" 📝 Headline: {profile_data.get('headline', '')[:100]}...")
    return True
def main():
    """Extract the hard-coded list of Academiehuis Grote Kerk Zwolle staff profiles.

    Iterates the profile specs, runs the crawl/search extraction for each,
    and prints a summary of successes and estimated API cost.
    """
    # Define specific profiles to extract from Academiehuis Grote Kerk Zwolle
    profiles = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'name': 'Anja van Hoorn',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef',
            'name': 'Inez van Kleef',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'name': 'Marga Edens',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens'
        }
    ]
    success_count = 0
    total_cost = 0.0
    for profile in profiles:
        if extract_comprehensive_profile(**profile):
            success_count += 1
            total_cost += 0.002
        # Delay between requests to respect rate limits.
        # (`import time` hoisted to the module import block; the original
        # re-imported it on every loop iteration.)
        time.sleep(3)
    print(f"\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print(f"📁 Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")


if __name__ == "__main__":
    main()