#!/usr/bin/env python3
"""
Simple script to extract LinkedIn profiles using the working pattern from extract_linkedin_profile_exa.py
"""

import json
import os
import sys
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional


def call_exa_crawling(url: str, max_characters: int = 50000) -> Optional[Dict[str, Any]]:
    """Call the Exa crawling tool via the `mcp` CLI and return its parsed JSON.

    Args:
        url: Page URL to crawl.
        max_characters: Upper bound on the amount of text Exa returns.

    Returns:
        The decoded JSON response, or None on any failure (non-zero exit
        status, timeout, or unparseable stdout). Failures are printed rather
        than raised so callers can treat extraction as best-effort.
    """
    try:
        # Use the MCP tool directly; list-form argv avoids shell injection.
        result = subprocess.run(
            ['mcp', 'call', 'exa_crawling_exa',
             '--url', url, '--maxCharacters', str(max_characters)],
            capture_output=True,
            text=True,
            timeout=60,
        )
        if result.returncode != 0:
            print(f"Error calling Exa: {result.stderr}")
            return None
        # Parse JSON output
        return json.loads(result.stdout)
    except Exception as e:
        # Best-effort by design: report and return None instead of raising.
        print(f"Exception calling Exa: {e}")
        return None


def parse_linkedin_content(content: str, title: str, url: str) -> Dict[str, Any]:
    """Parse raw crawled LinkedIn profile text into a structured dict.

    Scans markdown-style '## Section' headers and accumulates the lines
    beneath each recognized section. Experience/education lines are only
    captured when they match the simple '<role> at <org>' pattern.

    Args:
        content: Raw page text returned by the crawler.
        title: Page title, used as the person's name.
        url: Canonical profile URL.

    Returns:
        Profile dict with name, about, experience, education, skills,
        languages, and placeholder fields (headline, location, connections,
        profile_image_url) that this simple parser does not populate.
    """
    profile: Dict[str, Any] = {
        "name": title,
        "linkedin_url": url,
        "headline": "",
        "location": "",
        "connections": "",
        "about": "",
        "experience": [],
        "education": [],
        "skills": [],
        "languages": [],
        "profile_image_url": None,
    }

    current_section: Optional[str] = None
    for line in content.split('\n'):
        line = line.strip()
        # Section headers switch the accumulation target.
        if line.startswith('## About'):
            current_section = 'about'
        elif line.startswith('## Experience'):
            current_section = 'experience'
        elif line.startswith('## Education'):
            current_section = 'education'
        elif line.startswith('## Skills'):
            current_section = 'skills'
        elif line.startswith('## Languages'):
            current_section = 'languages'
        elif line and not line.startswith('#') and current_section:
            if current_section == 'about':
                profile['about'] += line + ' '
            # BUGFIX: test for the ' at ' delimiter (with spaces) that the
            # split below actually uses; the old bare 'at' substring check
            # falsely matched words like 'National' or 'Matrix'.
            elif current_section == 'experience' and ' at ' in line:
                parts = line.split(' at ')
                if len(parts) >= 2:
                    profile['experience'].append({
                        'title': parts[0].strip(),
                        'company': parts[1].strip(),
                        'duration': 'Current'
                    })
            elif current_section == 'education' and ' at ' in line:
                parts = line.split(' at ')
                if len(parts) >= 2:
                    profile['education'].append({
                        'degree': parts[0].strip(),
                        'institution': parts[1].strip(),
                        'duration': 'Unknown'
                    })
            # BUGFIX: skills/languages sections were tracked but their
            # content lines were silently dropped, leaving the lists empty.
            elif current_section == 'skills':
                profile['skills'].append(line)
            elif current_section == 'languages':
                profile['languages'].append(line)

    # Clean up the trailing space left by about-line accumulation.
    profile['about'] = profile['about'].strip()
    return profile


def extract_linkedin_profile(linkedin_url: str, output_file: str,
                             source_file: str = "", staff_id: str = "") -> bool:
    """Extract one LinkedIn profile via Exa and save it as structured JSON.

    Args:
        linkedin_url: Profile URL to crawl.
        output_file: Destination JSON path (parent dirs are created).
        source_file: Optional provenance path recorded in metadata.
        staff_id: Optional staff identifier recorded in metadata.

    Returns:
        True when the profile was crawled, parsed, and written; False when
        the crawl returned no usable result.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    # Use Exa crawler to get profile content
    exa_result = call_exa_crawling(linkedin_url, 50000)
    if not exa_result or 'results' not in exa_result or not exa_result['results']:
        print(f"āŒ Failed to extract profile from {linkedin_url}")
        return False

    # Get first (and only) result
    result = exa_result['results'][0]
    raw_content = result.get('text', '')
    title = result.get('title', 'Unknown')
    url = result.get('url', linkedin_url)

    # Parse profile content
    profile_data = parse_linkedin_content(raw_content, title, url)

    # Create structured output with provenance metadata alongside the profile.
    structured_data = {
        "extraction_metadata": {
            "source_file": source_file or "manual_extraction",
            "staff_id": staff_id or "manual",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": url,
            "cost_usd": 0.001,
            "request_id": result.get('id', 'unknown')
        },
        "profile_data": profile_data
    }

    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Save to file; ensure_ascii=False keeps accented names readable.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(structured_data, f, indent=2, ensure_ascii=False)

    print(f"āœ… Profile saved to: {output_file}")
    print(f" Name: {profile_data.get('name', 'Unknown')}")
    print(f" Headline: {profile_data.get('headline', '')[:80]}...")
    return True


def main():
    """Extract a fixed list of LinkedIn profiles and print a summary."""
    # Define specific profiles to extract
    profiles_to_extract = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef-',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens'
        }
    ]

    success_count = 0
    total_cost = 0.0

    for profile in profiles_to_extract:
        if extract_linkedin_profile(**profile):
            success_count += 1
            total_cost += 0.001

        # Small delay to avoid overwhelming Exa
        # (import moved to file top per PEP 8; it was previously inside this loop)
        time.sleep(2)

    print(f"\nšŸ“Š Extraction Summary:")
    print(f"āœ… Successfully processed: {success_count}")
    print(f"šŸ’° Total cost: ${total_cost:.3f}")
    print(f"šŸ“ Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")


if __name__ == "__main__":
    main()