306 lines
No EOL
11 KiB
Python
306 lines
No EOL
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fetch LinkedIn profiles using Exa and store as properly formatted JSON entity files.
|
|
Uses threading for parallel processing and ensures consistent JSON structure.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
import subprocess
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
def load_staff_files(file_paths: List[str]) -> List[Dict[str, Any]]:
    """Read staff JSON files and flatten their 'staff' arrays.

    Each returned item pairs the originating file path with a single staff
    member record so downstream steps can trace provenance. Files that fail
    to load or parse are reported and skipped rather than aborting the run.
    """
    members: List[Dict[str, Any]] = []

    for path in file_paths:
        print(f"Loading staff file: {path}")
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            if 'staff' in payload:
                members.extend(
                    {'file_path': path, 'staff_data': record}
                    for record in payload['staff']
                )
        except Exception as exc:
            # Best-effort loading: report and move on to the next file.
            print(f"Error loading {path}: {exc}")

    return members
|
|
|
|
def call_exa_crawling(url: str, max_characters: int = 50000) -> Optional[Dict[str, Any]]:
    """Crawl *url* through the local exa-mcp-server CLI and return parsed JSON.

    Invokes the built stdio server with node, capturing stdout as JSON.
    Returns None on any failure (non-zero exit, timeout, spawn error, or
    unparseable output); failures are printed rather than raised.
    """
    command = [
        'node',
        '/Users/kempersc/apps/glam/exa-mcp-server-source/.smithery/stdio/index.cjs',
        'call', 'exa_crawling_exa',
        '--url', url,
        '--maxCharacters', str(max_characters),
    ]

    try:
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=60,  # guard against a hung crawler process
        )
        if proc.returncode != 0:
            print(f"Error calling Exa: {proc.stderr}")
            return None
        return json.loads(proc.stdout)
    except Exception as exc:
        # Covers timeout, missing node binary, and malformed JSON output.
        print(f"Exception calling Exa: {exc}")
        return None
|
|
|
|
def extract_linkedin_profile(staff_member: Dict[str, Any]) -> Dict[str, Any]:
    """Fetch and parse one staff member's LinkedIn profile via Exa.

    Args:
        staff_member: dict with a 'staff_data' entry holding the staff
            record; the LinkedIn URL is read from 'linkedin_profile_url'
            first, then 'linkedin_url'.

    Returns:
        On success: dict with 'staff_id', 'linkedin_url', 'linkedin_slug',
        'success': True, 'profile_data' (structured), and 'raw_content'.
        On failure: dict with 'staff_id', 'error', 'linkedin_url', and
        'success': False.

    Note: a large duplicated branch that followed the success return was
    unreachable dead code (and called an undefined `exa_crawling_exa`);
    it has been removed.
    """
    staff_data = staff_member['staff_data']
    linkedin_url = (staff_data.get('linkedin_profile_url', '')
                    or staff_data.get('linkedin_url', ''))

    if not linkedin_url:
        return {
            'staff_id': staff_data.get('staff_id', ''),
            'error': 'No LinkedIn URL found',
            'linkedin_url': linkedin_url,
            'success': False,
        }

    # Slug is the path segment after 'linkedin.com/in/', stripped of any
    # query string or trailing path components.
    linkedin_slug = linkedin_url.split('linkedin.com/in/')[-1].split('?')[0].split('/')[0]

    print(f"Extracting profile for: {staff_data.get('name', 'Unknown')} ({linkedin_slug})")

    # Crawl the profile with a high character limit to capture the full page.
    exa_result = call_exa_crawling(linkedin_url, 50000)

    if not exa_result or not exa_result.get('results'):
        return {
            'staff_id': staff_data.get('staff_id', ''),
            'error': 'No content returned from Exa',
            'linkedin_url': linkedin_url,
            'success': False,
        }

    profile_content = exa_result['results'][0].get('text', '')

    # Turn the raw page text plus staff metadata into the structured form.
    parsed_profile = parse_profile_content(profile_content, staff_data)

    return {
        'staff_id': staff_data.get('staff_id', ''),
        'linkedin_url': linkedin_url,
        'linkedin_slug': linkedin_slug,
        'success': True,
        'profile_data': parsed_profile,
        'raw_content': profile_content,
    }
|
|
|
|
def parse_profile_content(content: str, staff_data: Dict[str, Any]) -> Dict[str, Any]:
    """Build a structured profile dict from crawled text plus staff metadata.

    NOTE(review): content parsing is still a stub — the structured fields
    are filled from ``staff_data`` and fixed defaults; ``content`` is
    accepted for future parsing but not yet inspected. (A previous no-op
    scanning loop whose branches were all ``pass`` has been removed; it
    had no effect on the output.)

    Args:
        content: raw page text returned by the crawler (currently unused).
        staff_data: staff record supplying name, role, company and URL.

    Returns:
        Dict with the profile schema expected by create_entity_file.
    """
    name = staff_data.get('name', 'Unknown')
    # Consistency fix: prefer 'linkedin_profile_url' then 'linkedin_url',
    # matching the key handling in extract_linkedin_profile.
    linkedin_url = (staff_data.get('linkedin_profile_url', '')
                    or staff_data.get('linkedin_url', ''))

    headline = staff_data.get('role', name)  # fall back to staff role, then name
    location = "Netherlands"                 # placeholder default — TODO parse from content
    connections = "500+ connections"         # placeholder default — TODO parse from content

    # Seed experience from the staff record when both role and company exist.
    experience: List[Dict[str, Any]] = []
    if 'role' in staff_data and 'company' in staff_data:
        experience.append({
            'title': staff_data.get('role', ''),
            'company': staff_data.get('company', ''),
            'duration': 'Current',
            'location': location,
            'heritage_relevant': True,
            'heritage_type': 'A',  # 'A' = archive type default — TODO confirm taxonomy
        })

    return {
        'name': name,
        'linkedin_url': linkedin_url,
        'headline': headline,
        'location': location,
        'connections': connections,
        'about': f'Profile extracted for {name}',
        'experience': experience,
        'education': [],   # education parsing not yet implemented
        'skills': [],
        'languages': [],
        'profile_image_url': None,
    }
|
|
|
|
def create_entity_file(extraction_result: Dict[str, Any], output_dir: Path) -> bool:
    """Write one extraction result as a timestamped JSON entity file.

    Args:
        extraction_result: output of extract_linkedin_profile; results
            carrying an 'error' key are skipped.
        output_dir: directory receiving "<slug>_<timestamp>.json" files.

    Returns:
        True when the file was written, False when the result carries an
        error or the write fails.
    """
    if extraction_result.get('error'):
        print(f"Skipping {extraction_result['staff_id']}: {extraction_result['error']}")
        return False

    staff_id = extraction_result['staff_id']
    linkedin_slug = extraction_result['linkedin_slug']
    profile_data = extraction_result['profile_data']

    # Timestamped filename keeps repeated extractions side by side.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    filename = f"{linkedin_slug}_{timestamp}.json"
    filepath = output_dir / filename

    entity_data = {
        "extraction_metadata": {
            # NOTE(review): extract_linkedin_profile never sets
            # 'source_file', so this is usually '' — TODO: thread the
            # staff file path through the result dict.
            "source_file": extraction_result.get('source_file', ''),
            "staff_id": staff_id,
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": extraction_result['linkedin_url'],
            "cost_usd": 0.001,                # flat per-call estimate, not measured
            "extraction_time_seconds": 10.0   # rough average, not measured
        },
        "profile_data": profile_data
    }

    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(entity_data, f, indent=2, ensure_ascii=False)

        # Fix: report the file actually written (the message previously
        # printed a hard-coded "(unknown)" placeholder).
        print(f"✅ Created: {filename}")
        return True

    except Exception as e:
        print(f"Error creating {filename}: {e}")
        return False
|
|
|
|
def main():
    """Fetch LinkedIn profiles for all configured staff files and store them.

    Pipeline: load staff records, skip members whose entity file already
    exists, crawl the rest in parallel (3 workers), and write one JSON
    entity file per successful extraction.
    """
    # Hard-coded inputs for this one-off run.
    staff_files = [
        '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/acp-ica-archival-community-for-palestine_staff_20251210T155412Z.json',
        '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json'
    ]

    output_dir = Path('/Users/kempersc/apps/glam/data/custodian/person/entity')
    output_dir.mkdir(parents=True, exist_ok=True)

    print("Loading staff files...")
    staff_members = load_staff_files(staff_files)
    print(f"Found {len(staff_members)} staff members to process")

    # Skip members whose entity file already exists on disk.
    remaining_members = []
    for member in staff_members:
        # Fix: use the same key preference as extract_linkedin_profile
        # ('linkedin_profile_url' first) — previously members carrying only
        # that key were never recognized as processed and were re-crawled.
        staff_data = member['staff_data']
        linkedin_url = (staff_data.get('linkedin_profile_url', '')
                        or staff_data.get('linkedin_url', ''))
        if linkedin_url:
            linkedin_slug = linkedin_url.split('linkedin.com/in/')[-1].split('?')[0].split('/')[0]
            existing_files = list(output_dir.glob(f"{linkedin_slug}_*.json"))
            if existing_files:
                print(f"Skipping {staff_data.get('name', 'Unknown')} - already processed")
                continue
        remaining_members.append(member)

    print(f"Processing {len(remaining_members)} remaining profiles...")

    success_count = 0
    total_cost = 0.0

    with ThreadPoolExecutor(max_workers=3) as executor:
        # Fan out one extraction task per remaining member.
        future_to_member = {
            executor.submit(extract_linkedin_profile, member): member
            for member in remaining_members
        }

        for future in as_completed(future_to_member):
            member = future_to_member[future]
            try:
                result = future.result()

                if create_entity_file(result, output_dir):
                    success_count += 1
                    total_cost += 0.001  # matches the flat per-call estimate

                # NOTE(review): this sleeps in the consumer loop, after the
                # crawl completed — it paces result handling, not the
                # concurrent requests themselves.
                time.sleep(1)

            except Exception as e:
                print(f"Error processing {member['staff_data'].get('name', 'Unknown')}: {e}")

    print(f"\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print(f"📁 Files saved to: {output_dir}")
|
|
|
|
# Entry-point guard: run the extraction pipeline only when executed as a script.
if __name__ == "__main__":
    main()