# glam/scripts/fetch_remaining_linkedin_profiles.py
# Last modified: 2025-12-11 22:32:09 +01:00 (306 lines, 11 KiB, Python)
#!/usr/bin/env python3
"""
Fetch LinkedIn profiles using Exa and store as properly formatted JSON entity files.
Uses threading for parallel processing and ensures consistent JSON structure.
"""
import json
import os
import sys
import time
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Any, Optional
def load_staff_files(file_paths: List[str]) -> List[Dict[str, Any]]:
    """Read each staff JSON file and flatten its 'staff' entries.

    Returns one record per staff member, shaped as
    ``{'file_path': <source file>, 'staff_data': <member dict>}``.
    Files that fail to load are reported and skipped.
    """
    records: List[Dict[str, Any]] = []
    for path in file_paths:
        print(f"Loading staff file: {path}")
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            # Files without a 'staff' key contribute nothing.
            for entry in payload.get('staff', []):
                records.append({'file_path': path, 'staff_data': entry})
        except Exception as exc:
            print(f"Error loading {path}: {exc}")
    return records
def call_exa_crawling(url: str, max_characters: int = 50000) -> Optional[Dict[str, Any]]:
    """Invoke the Exa crawling MCP tool for *url* and return its parsed JSON.

    Runs the built exa-mcp-server via node with a 60s timeout. Returns
    None on any failure: non-zero exit, timeout, missing binary, or
    unparseable stdout.
    """
    command = [
        'node',
        '/Users/kempersc/apps/glam/exa-mcp-server-source/.smithery/stdio/index.cjs',
        'call', 'exa_crawling_exa',
        '--url', url,
        '--maxCharacters', str(max_characters),
    ]
    try:
        proc = subprocess.run(command, capture_output=True, text=True, timeout=60)
        if proc.returncode != 0:
            print(f"Error calling Exa: {proc.stderr}")
            return None
        return json.loads(proc.stdout)
    except Exception as exc:
        print(f"Exception calling Exa: {exc}")
        return None
def extract_linkedin_profile(staff_member: Dict[str, Any]) -> Dict[str, Any]:
    """Fetch and parse the LinkedIn profile for one staff member via Exa.

    Expects *staff_member* shaped as produced by load_staff_files
    (``{'file_path': ..., 'staff_data': ...}``).

    Returns a result dict: on success it carries 'success': True plus
    'profile_data' and 'raw_content'; on failure it carries
    'success': False and an 'error' message.

    Fix: removed the unreachable duplicate implementation that followed the
    first return — it called an undefined name (`exa_crawling_exa`) and its
    error paths omitted the 'success' flag.
    """
    staff_data = staff_member['staff_data']
    # Staff files are inconsistent about the URL key; accept either spelling.
    linkedin_url = staff_data.get('linkedin_profile_url', '') or staff_data.get('linkedin_url', '')
    if not linkedin_url:
        return {
            'staff_id': staff_data.get('staff_id', ''),
            'error': 'No LinkedIn URL found',
            'linkedin_url': linkedin_url,
            'success': False
        }
    # Derive the profile slug, stripping query strings and trailing path parts.
    linkedin_slug = linkedin_url.split('linkedin.com/in/')[-1].split('?')[0].split('/')[0]
    print(f"Extracting profile for: {staff_data.get('name', 'Unknown')} ({linkedin_slug})")
    # Crawl the profile with a high character limit to capture it fully.
    exa_result = call_exa_crawling(linkedin_url, 50000)
    if not exa_result or 'results' not in exa_result or not exa_result['results']:
        return {
            'staff_id': staff_data.get('staff_id', ''),
            'error': 'No content returned from Exa',
            'linkedin_url': linkedin_url,
            'success': False
        }
    profile_content = exa_result['results'][0].get('text', '')
    parsed_profile = parse_profile_content(profile_content, staff_data)
    return {
        'staff_id': staff_data.get('staff_id', ''),
        'linkedin_url': linkedin_url,
        'linkedin_slug': linkedin_slug,
        'success': True,
        'profile_data': parsed_profile,
        'raw_content': profile_content
    }
def parse_profile_content(content: str, staff_data: Dict[str, Any]) -> Dict[str, Any]:
    """Build a structured profile dict from crawled content and staff data.

    This is a simplified parser: it currently derives everything from
    *staff_data* (name, role, company) with fixed defaults for location and
    connections; *content* is accepted for future parsing but not yet mined.

    Fixes: removed the dead line-scanning loop whose every branch was
    ``pass`` (pure wasted work), and made the LinkedIn URL lookup accept
    'linkedin_profile_url' as well, consistent with extract_linkedin_profile.
    """
    name = staff_data.get('name', 'Unknown')
    # Accept either URL key, matching extract_linkedin_profile.
    linkedin_url = staff_data.get('linkedin_profile_url', '') or staff_data.get('linkedin_url', '')
    headline = staff_data.get('role', name)  # fall back to role from staff data
    location = "Netherlands"  # default; not yet extracted from content
    connections = "500+ connections"  # default; not yet extracted from content
    # Build experience from staff data when both role and company are known.
    experience = []
    if 'role' in staff_data and 'company' in staff_data:
        experience.append({
            'title': staff_data.get('role', ''),
            'company': staff_data.get('company', ''),
            'duration': 'Current',
            'location': location,
            'heritage_relevant': True,
            'heritage_type': 'A'  # Archive type as default
        })
    # Education parsing not implemented yet.
    education = []
    return {
        'name': name,
        'linkedin_url': linkedin_url,
        'headline': headline,
        'location': location,
        'connections': connections,
        'about': f'Profile extracted for {name}',
        'experience': experience,
        'education': education,
        'skills': [],
        'languages': [],
        'profile_image_url': None
    }
def create_entity_file(extraction_result: Dict[str, Any], output_dir: Path) -> bool:
    """Write one extraction result to a timestamped JSON entity file.

    Skips (and returns False for) results that carry an 'error'; returns
    True only when the file was written successfully.

    Fix: the success/failure prints emitted the literal garbled text
    "(unknown)" instead of the file name — they now report the actual
    filename.
    """
    if extraction_result.get('error'):
        print(f"Skipping {extraction_result['staff_id']}: {extraction_result['error']}")
        return False
    staff_id = extraction_result['staff_id']
    linkedin_slug = extraction_result['linkedin_slug']
    profile_data = extraction_result['profile_data']
    # Timestamped name keeps reruns from clobbering earlier extractions.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    filename = f"{linkedin_slug}_{timestamp}.json"
    filepath = output_dir / filename
    entity_data = {
        "extraction_metadata": {
            # NOTE(review): 'source_file' is never set by the extractor, so
            # this is always '' — confirm whether it should carry
            # staff_member['file_path'].
            "source_file": extraction_result.get('source_file', ''),
            "staff_id": staff_id,
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": extraction_result['linkedin_url'],
            # Flat-rate cost/time estimates, not measured values.
            "cost_usd": 0.001,
            "extraction_time_seconds": 10.0
        },
        "profile_data": profile_data
    }
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(entity_data, f, indent=2, ensure_ascii=False)
        print(f"✅ Created: {filename}")
        return True
    except Exception as e:
        print(f"Error creating {filename}: {e}")
        return False
def main():
    """Fetch remaining LinkedIn profiles and store them as entity JSON files.

    Loads the hard-coded staff files, skips members whose entity file
    already exists on disk, then extracts the rest in parallel (3 workers)
    and writes one JSON entity file per successful extraction.

    Fix: the already-processed check previously read only 'linkedin_url',
    while the extractor prefers 'linkedin_profile_url' — members stored
    under the latter key were never deduplicated. Both keys are now
    accepted.
    """
    staff_files = [
        '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/acp-ica-archival-community-for-palestine_staff_20251210T155412Z.json',
        '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json'
    ]
    output_dir = Path('/Users/kempersc/apps/glam/data/custodian/person/entity')
    output_dir.mkdir(parents=True, exist_ok=True)

    print("Loading staff files...")
    staff_members = load_staff_files(staff_files)
    print(f"Found {len(staff_members)} staff members to process")

    # Skip profiles whose entity file already exists on disk.
    remaining_members = []
    for member in staff_members:
        data = member['staff_data']
        # Accept either URL key, matching extract_linkedin_profile.
        linkedin_url = data.get('linkedin_profile_url', '') or data.get('linkedin_url', '')
        if linkedin_url:
            linkedin_slug = linkedin_url.split('linkedin.com/in/')[-1].split('?')[0].split('/')[0]
            if list(output_dir.glob(f"{linkedin_slug}_*.json")):
                print(f"Skipping {data.get('name', 'Unknown')} - already processed")
                continue
        remaining_members.append(member)
    print(f"Processing {len(remaining_members)} remaining profiles...")

    # Extract in parallel; writes happen on the main thread as futures finish.
    success_count = 0
    total_cost = 0.0
    with ThreadPoolExecutor(max_workers=3) as executor:
        future_to_member = {
            executor.submit(extract_linkedin_profile, member): member
            for member in remaining_members
        }
        for future in as_completed(future_to_member):
            member = future_to_member[future]
            try:
                result = future.result()
                if create_entity_file(result, output_dir):
                    success_count += 1
                    total_cost += 0.001
                time.sleep(1)  # small delay to avoid overwhelming Exa
            except Exception as e:
                print(f"Error processing {member['staff_data'].get('name', 'Unknown')}: {e}")

    print(f"\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print(f"📁 Files saved to: {output_dir}")


if __name__ == "__main__":
    main()