#!/usr/bin/env python3
"""Simple script to extract LinkedIn profiles using the working pattern."""

import json
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional

# Entry point of the Exa MCP server (invoked through node).
MCP_SERVER_PATH = '/Users/kempersc/apps/glam/exa-mcp-server-source/.smithery/stdio/index.cjs'

# Flat per-request cost estimate (USD); recorded in metadata and summed for the summary.
COST_PER_REQUEST_USD = 0.001


def call_exa_crawling(url: str, max_characters: int = 50000) -> Optional[Dict[str, Any]]:
    """Call the Exa crawling tool via the MCP server CLI.

    Args:
        url: Page URL to crawl.
        max_characters: Upper bound on the amount of text Exa returns.

    Returns:
        The parsed JSON response as a dict, or ``None`` on any failure
        (non-zero exit, timeout, missing node binary, malformed JSON).
    """
    try:
        result = subprocess.run(
            ['node', MCP_SERVER_PATH, 'call', 'exa_crawling_exa',
             '--url', url, '--maxCharacters', str(max_characters)],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except (subprocess.TimeoutExpired, OSError) as e:
        # OSError covers a missing node binary or a bad MCP script path.
        print(f"Exception calling Exa: {e}")
        return None

    if result.returncode != 0:
        print(f"Error calling Exa: {result.stderr}")
        return None

    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as e:
        print(f"Exception calling Exa: {e}")
        return None


def parse_linkedin_content(content: str, title: str, url: str) -> Dict[str, Any]:
    """Parse LinkedIn profile fields out of raw crawled text.

    The crawl output uses ``## Section`` markdown-style headers; lines under
    a recognized header are folded into the corresponding profile field.
    Only ``about``, ``experience`` and ``education`` lines are actually
    extracted; ``skills``/``languages`` sections are recognized but their
    content is not parsed (matches the original simple extractor).

    Args:
        content: Raw text returned by the crawler.
        title: Page title, used as the profile name.
        url: Canonical profile URL.

    Returns:
        A dict with the fixed profile schema (missing data left empty).
    """
    profile: Dict[str, Any] = {
        "name": title,
        "linkedin_url": url,
        "headline": "",
        "location": "",
        "connections": "",
        "about": "",
        "experience": [],
        "education": [],
        "skills": [],
        "languages": [],
        "profile_image_url": None,
    }

    # Recognized section headers -> internal section keys.
    section_headers = {
        '## About': 'about',
        '## Experience': 'experience',
        '## Education': 'education',
        '## Skills': 'skills',
        '## Languages': 'languages',
    }

    current_section: Optional[str] = None
    for raw_line in content.split('\n'):
        line = raw_line.strip()

        # Section header lines switch state and contribute no content.
        matched = next(
            (key for prefix, key in section_headers.items() if line.startswith(prefix)),
            None,
        )
        if matched is not None:
            current_section = matched
            continue

        # Skip blanks, unrecognized headers, and anything before the first section.
        if not line or line.startswith('#') or current_section is None:
            continue

        if current_section == 'about':
            profile['about'] += line + ' '
        elif current_section == 'experience' and ' at ' in line:
            # Simple "Title at Company" parsing; extra " at " pieces beyond
            # the second are intentionally dropped (original behavior).
            parts = line.split(' at ')
            profile['experience'].append({
                'title': parts[0].strip(),
                'company': parts[1].strip(),
                'duration': 'Current',
            })
        elif current_section == 'education' and ' at ' in line:
            # Simple "Degree at Institution" parsing.
            parts = line.split(' at ')
            profile['education'].append({
                'degree': parts[0].strip(),
                'institution': parts[1].strip(),
                'duration': 'Unknown',
            })

    profile['about'] = profile['about'].strip()
    return profile


def extract_linkedin_profile(linkedin_url: str, output_file: str,
                             source_file: str = "", staff_id: str = "") -> bool:
    """Extract a LinkedIn profile via the Exa crawler and save it as JSON.

    Args:
        linkedin_url: Profile URL to crawl.
        output_file: Destination path for the structured JSON document.
        source_file: Optional provenance file recorded in the metadata.
        staff_id: Optional staff identifier recorded in the metadata.

    Returns:
        True when the profile was extracted and written, False otherwise.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    # Crawl the profile page.
    exa_result = call_exa_crawling(linkedin_url, 50000)
    if not exa_result or 'results' not in exa_result or not exa_result['results']:
        print(f"āŒ Failed to extract profile from {linkedin_url}")
        return False

    # Only the first result is relevant for a single-URL crawl.
    result = exa_result['results'][0]
    raw_content = result.get('text', '')
    title = result.get('title', 'Unknown')
    url = result.get('url', linkedin_url)

    profile_data = parse_linkedin_content(raw_content, title, url)

    structured_data = {
        "extraction_metadata": {
            "source_file": source_file or "manual_extraction",
            "staff_id": staff_id or "manual",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": url,
            "cost_usd": COST_PER_REQUEST_USD,
            "request_id": result.get('id', 'unknown'),
        },
        "profile_data": profile_data,
    }

    # Create the destination directory if needed, then write the document.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(structured_data, f, indent=2, ensure_ascii=False)

    print(f"āœ… Profile saved to: {output_file}")
    print(f"   Name: {profile_data.get('name', 'Unknown')}")
    print(f"   Headline: {profile_data.get('headline', '')[:80]}...")
    return True


def main():
    """Extract a fixed list of LinkedIn profiles and print a summary."""
    # Hard-coded batch: profiles to extract with their provenance metadata.
    profiles_to_extract = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn',
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef',
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens',
        },
    ]

    success_count = 0
    total_cost = 0.0
    for profile in profiles_to_extract:
        if extract_linkedin_profile(**profile):
            success_count += 1
            total_cost += COST_PER_REQUEST_USD
        # Small delay to avoid overwhelming Exa
        time.sleep(2)

    print(f"\nšŸ“Š Extraction Summary:")
    print(f"āœ… Successfully processed: {success_count}")
    print(f"šŸ’° Total cost: ${total_cost:.3f}")
    print(f"šŸ“ Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")


if __name__ == "__main__":
    main()