#!/usr/bin/env python3
"""
Fetch LinkedIn profiles using Exa and store as properly formatted JSON entity files.

Staff members are read from parsed staff JSON files, each profile URL is
crawled through the Exa MCP server (one ``node`` subprocess per profile, run
from a small thread pool), and every successful extraction is written as one
``<linkedin-slug>_<UTC timestamp>.json`` entity file.
"""

import json
import os
import sys
import time
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Any, Optional

# Entry point of the locally built exa-mcp-server that is invoked through node.
EXA_MCP_ENTRYPOINT = '/Users/kempersc/apps/glam/exa-mcp-server-source/.smithery/stdio/index.cjs'
# Seconds to wait for a single Exa crawl before giving up.
EXA_TIMEOUT_SECONDS = 60
# Flat cost recorded per stored profile (used in metadata and in the summary).
COST_PER_PROFILE_USD = 0.001
# Duration recorded when the extraction result does not carry a measured one.
DEFAULT_EXTRACTION_SECONDS = 10.0
# Upper bound on concurrent Exa subprocesses.
MAX_WORKERS = 3
# Pause after each handled result in main().
RESULT_DELAY_SECONDS = 1
# URL fragment that precedes the profile slug in a LinkedIn profile URL.
LINKEDIN_PROFILE_MARKER = 'linkedin.com/in/'

DEFAULT_STAFF_FILES = [
    '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/acp-ica-archival-community-for-palestine_staff_20251210T155412Z.json',
    '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
]
DEFAULT_OUTPUT_DIR = Path('/Users/kempersc/apps/glam/data/custodian/person/entity')


def _get_linkedin_url(staff_data: Dict[str, Any]) -> str:
    """Return the staff member's LinkedIn URL from either supported key, or ''."""
    url = staff_data.get('linkedin_profile_url') or staff_data.get('linkedin_url') or ''
    return url.strip() if isinstance(url, str) else ''


def _linkedin_slug(linkedin_url: str) -> str:
    """Return the profile slug of a LinkedIn URL, or '' when none can be derived."""
    _, marker, tail = linkedin_url.rpartition(LINKEDIN_PROFILE_MARKER)
    if not marker:
        # Not an /in/ URL: fall back to the last non-empty path segment instead
        # of the scheme ("https:"), which is what a plain split would yield.
        tail = linkedin_url.split('?')[0].split('#')[0].rstrip('/').rsplit('/', 1)[-1]
    return tail.split('?')[0].split('#')[0].split('/')[0]


def _already_processed(output_dir: Path, linkedin_slug: str) -> bool:
    """Return True when an entity file for exactly this slug already exists."""
    # Match the full timestamp shape so that slug "a" does not match "a_b_<ts>.json".
    pattern = f"{linkedin_slug}_????????T??????Z.json"
    return any(output_dir.glob(pattern))


def load_staff_files(file_paths: List[str]) -> List[Dict[str, Any]]:
    """Load staff data from JSON files.

    Args:
        file_paths: Paths of JSON files whose root object has a ``staff`` list.

    Returns:
        One ``{'file_path': ..., 'staff_data': ...}`` dict per staff entry.
        Files that cannot be read or parsed are reported and skipped.
    """
    staff_members = []

    for file_path in file_paths:
        print(f"Loading staff file: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and undecodable bytes.
            print(f"Error loading {file_path}: {e}")
            continue

        staff = data.get('staff') if isinstance(data, dict) else None
        if not isinstance(staff, list):
            continue
        staff_members.extend(
            {'file_path': file_path, 'staff_data': member}
            for member in staff
            if isinstance(member, dict)
        )

    return staff_members


def call_exa_crawling(url: str, max_characters: int = 50000) -> Optional[Dict[str, Any]]:
    """Call the Exa crawling tool via the MCP server.

    Args:
        url: Page to crawl.
        max_characters: Maximum number of characters requested from Exa.

    Returns:
        The parsed JSON object printed by the tool, or ``None`` when the
        subprocess fails, times out, or does not print a JSON object.
    """
    command = [
        'node', EXA_MCP_ENTRYPOINT,
        'call', 'exa_crawling_exa',
        '--url', url,
        '--maxCharacters', str(max_characters),
    ]
    try:
        result = subprocess.run(
            command, capture_output=True, text=True, timeout=EXA_TIMEOUT_SECONDS
        )
    except (subprocess.SubprocessError, OSError) as e:
        # TimeoutExpired, or node / the entrypoint could not be started.
        print(f"Exception calling Exa: {e}")
        return None

    if result.returncode != 0:
        print(f"Error calling Exa: {result.stderr}")
        return None

    try:
        parsed = json.loads(result.stdout)
    except ValueError as e:
        print(f"Exception calling Exa: {e}")
        return None

    if not isinstance(parsed, dict):
        print(f"Error calling Exa: unexpected output type {type(parsed).__name__}")
        return None
    return parsed


def extract_linkedin_profile(staff_member: Dict[str, Any]) -> Dict[str, Any]:
    """Extract the LinkedIn profile for one staff member.

    Args:
        staff_member: Entry produced by ``load_staff_files``.

    Returns:
        A result dict that always has ``staff_id``, ``source_file``,
        ``linkedin_url`` and ``success``. Failures add ``error``; successes
        add ``linkedin_slug``, ``profile_data``, ``raw_content`` and
        ``extraction_time_seconds``.
    """
    staff_data = staff_member['staff_data']
    linkedin_url = _get_linkedin_url(staff_data)
    base = {
        'staff_id': staff_data.get('staff_id', ''),
        # Carried along so create_entity_file can record where the entry came from.
        'source_file': staff_member.get('file_path', ''),
        'linkedin_url': linkedin_url,
    }

    if not linkedin_url:
        return {**base, 'error': 'No LinkedIn URL found', 'success': False}

    linkedin_slug = _linkedin_slug(linkedin_url)
    if not linkedin_slug:
        # Without a slug there is no usable file name for the entity.
        return {**base, 'error': 'Could not derive LinkedIn slug from URL', 'success': False}

    print(f"Extracting profile for: {staff_data.get('name', 'Unknown')} ({linkedin_slug})")

    started = time.monotonic()
    exa_result = call_exa_crawling(linkedin_url, 50000)
    elapsed = round(time.monotonic() - started, 3)

    results = exa_result.get('results') if isinstance(exa_result, dict) else None
    if not isinstance(results, list) or not results:
        return {**base, 'error': 'No content returned from Exa', 'success': False}

    first = results[0]
    profile_content = (first.get('text') if isinstance(first, dict) else '') or ''

    return {
        **base,
        'linkedin_slug': linkedin_slug,
        'success': True,
        'profile_data': parse_profile_content(profile_content, staff_data),
        'raw_content': profile_content,
        'extraction_time_seconds': elapsed,
    }


def parse_profile_content(content: str, staff_data: Dict[str, Any]) -> Dict[str, Any]:
    """Build the structured profile for a staff member.

    This is a simplified parser: every field currently comes from
    ``staff_data`` or a fixed default, and ``content`` is not inspected yet.

    Args:
        content: Raw profile text returned by Exa (currently unused).
        staff_data: The staff entry the profile belongs to.

    Returns:
        The ``profile_data`` dict stored in the entity file.
    """
    # TODO: extract headline, location, connections and education from `content`.
    name = staff_data.get('name', 'Unknown')
    headline = staff_data.get('role', name)  # Fall back to the name when no role is known
    location = "Netherlands"  # Default
    connections = "500+ connections"  # Default

    experience = []
    if 'role' in staff_data and 'company' in staff_data:
        experience.append({
            'title': staff_data.get('role', ''),
            'company': staff_data.get('company', ''),
            'duration': 'Current',
            'location': location,
            'heritage_relevant': True,
            'heritage_type': 'A',  # Archive type as default
        })

    return {
        'name': name,
        # Same key precedence as the extractor, so the stored URL is the crawled one.
        'linkedin_url': _get_linkedin_url(staff_data),
        'headline': headline,
        'location': location,
        'connections': connections,
        'about': f'Profile extracted for {name}',
        'experience': experience,
        'education': [],
        'skills': [],
        'languages': [],
        'profile_image_url': None,
    }


def create_entity_file(extraction_result: Dict[str, Any], output_dir: Path) -> bool:
    """Create a properly formatted entity file for one extraction result.

    Args:
        extraction_result: Result dict from ``extract_linkedin_profile``.
        output_dir: Directory the entity file is written to.

    Returns:
        True when the file was written, False when the result carries an
        error or the file could not be serialised or written.
    """
    if extraction_result.get('error'):
        print(f"Skipping {extraction_result.get('staff_id', '')}: {extraction_result['error']}")
        return False

    now = datetime.now(timezone.utc)
    filename = f"{extraction_result['linkedin_slug']}_{now.strftime('%Y%m%dT%H%M%SZ')}.json"
    filepath = output_dir / filename

    entity_data = {
        "extraction_metadata": {
            "source_file": extraction_result.get('source_file', ''),
            "staff_id": extraction_result['staff_id'],
            "extraction_date": now.isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": extraction_result['linkedin_url'],
            "cost_usd": COST_PER_PROFILE_USD,
            "extraction_time_seconds": extraction_result.get(
                'extraction_time_seconds', DEFAULT_EXTRACTION_SECONDS
            ),
        },
        "profile_data": extraction_result['profile_data'],
    }

    try:
        # Serialise first: a failure here must not leave a partial file behind,
        # because any existing file marks the profile as already processed.
        payload = json.dumps(entity_data, indent=2, ensure_ascii=False)
        output_dir.mkdir(parents=True, exist_ok=True)
        filepath.write_text(payload, encoding='utf-8')
    except (OSError, TypeError, ValueError) as e:
        print(f"Error creating {filename}: {e}")
        return False

    print(f"✅ Created: {filename}")
    return True


def main(staff_files: Optional[List[str]] = None, output_dir: Optional[Path] = None) -> None:
    """Fetch and store LinkedIn profiles.

    Args:
        staff_files: Staff JSON files to process; defaults to ``DEFAULT_STAFF_FILES``.
        output_dir: Destination directory; defaults to ``DEFAULT_OUTPUT_DIR``.
    """
    staff_files = DEFAULT_STAFF_FILES if staff_files is None else staff_files
    output_dir = DEFAULT_OUTPUT_DIR if output_dir is None else Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("Loading staff files...")
    staff_members = load_staff_files(staff_files)
    print(f"Found {len(staff_members)} staff members to process")

    # Drop profiles that already have an entity file or appear twice in this run.
    remaining_members = []
    queued_slugs = set()
    for member in staff_members:
        staff_data = member['staff_data']
        linkedin_url = _get_linkedin_url(staff_data)
        linkedin_slug = _linkedin_slug(linkedin_url) if linkedin_url else ''
        if linkedin_slug:
            if linkedin_slug in queued_slugs:
                print(f"Skipping {staff_data.get('name', 'Unknown')} - duplicate in this run")
                continue
            if _already_processed(output_dir, linkedin_slug):
                print(f"Skipping {staff_data.get('name', 'Unknown')} - already processed")
                continue
            queued_slugs.add(linkedin_slug)
        # Members without a usable URL stay in the list so the skip reason is reported.
        remaining_members.append(member)

    print(f"Processing {len(remaining_members)} remaining profiles...")

    success_count = 0
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_member = {
            executor.submit(extract_linkedin_profile, member): member
            for member in remaining_members
        }

        for future in as_completed(future_to_member):
            member = future_to_member[future]
            try:
                if create_entity_file(future.result(), output_dir):
                    success_count += 1
                # NOTE: all crawls are already submitted at this point, so this
                # pause only paces result handling; load on Exa is bounded by
                # MAX_WORKERS.
                time.sleep(RESULT_DELAY_SECONDS)
            except Exception as e:
                # Top-level boundary: one failing profile must not abort the run.
                print(f"Error processing {member['staff_data'].get('name', 'Unknown')}: {e}")

    total_cost = success_count * COST_PER_PROFILE_USD
    print("\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print(f"📁 Files saved to: {output_dir}")


if __name__ == "__main__":
    main()