#!/usr/bin/env python3
"""
Extract LinkedIn profiles using the working pattern from extract_linkedin_profile_exa.py
"""

import json
import os
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

# Defaults for the Exa crawl invoked through the `mcp` command-line client.
_EXA_TIMEOUT_SECONDS = 60
_EXA_MAX_CHARACTERS = 50000
# Flat per-request cost recorded in the metadata and summed in the summary.
_EXA_COST_PER_REQUEST_USD = 0.001
# Number of characters of crawled text kept in the "about" preview.
_ABOUT_PREVIEW_CHARS = 500
# Pause between consecutive crawl requests in main().
_REQUEST_DELAY_SECONDS = 2

# NOTE(review): the status prefixes in the printed messages below (e.g. "āŒ",
# "āœ…") look like mis-encoded emoji; they are reproduced as found - confirm the
# intended characters against the original file encoding.


def _crawl_profile(
    linkedin_url: str, *, timeout: float, max_characters: int
) -> Optional[Dict[str, Any]]:
    """Run the Exa crawler for *linkedin_url* and return its first result.

    Prints a diagnostic and returns ``None`` when the command cannot be run,
    exits non-zero, emits invalid JSON, or returns no usable result object.
    """
    cmd = [
        'mcp', 'call', 'exa_crawling_exa',
        '--url', linkedin_url,
        '--maxCharacters', str(max_characters),
    ]

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # SubprocessError covers TimeoutExpired; OSError covers a missing or
        # non-executable `mcp`; ValueError covers undecodable output and
        # arguments containing NUL bytes.
        print(f"Exception during extraction: {e}")
        return None

    if result.returncode != 0:
        print(f"āŒ Failed to extract profile from {linkedin_url}")
        print(f"Error: {result.stderr}")
        return None

    try:
        output = json.loads(result.stdout)
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON output: {e}")
        return None

    # Validate the shape explicitly instead of relying on a later exception.
    results = output.get('results') if isinstance(output, dict) else None
    if not isinstance(results, list) or not results or not isinstance(results[0], dict):
        print("āŒ No results returned from Exa")
        return None

    # Only the first result is used.
    return results[0]


def _build_structured_record(
    result_data: Dict[str, Any],
    linkedin_url: str,
    source_file: str,
    staff_id: str,
) -> Dict[str, Any]:
    """Build the ``extraction_metadata`` / ``profile_data`` document for one result."""
    text = result_data.get('text')
    # A JSON null (or any non-string) text is treated as "no content" so that
    # slicing and len() below cannot fail.
    raw_content = text if isinstance(text, str) else ''
    title = result_data.get('title') or ''
    # Fall back to the requested URL when the crawler omits or nulls the URL.
    url = result_data.get('url') or linkedin_url

    if len(raw_content) > _ABOUT_PREVIEW_CHARS:
        about = raw_content[:_ABOUT_PREVIEW_CHARS] + "..."
    else:
        about = raw_content

    # Minimal profile: only the name, URL and a text preview are populated.
    profile_data = {
        "name": title,
        "linkedin_url": url,
        "headline": "",
        "location": "",
        "connections": "",
        "about": about,
        "experience": [],
        "education": [],
        "skills": [],
        "languages": [],
        "profile_image_url": None,
    }

    return {
        "extraction_metadata": {
            "source_file": source_file or "manual_extraction",
            "staff_id": staff_id or "manual",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": url,
            "cost_usd": _EXA_COST_PER_REQUEST_USD,
            "request_id": result_data.get('id', 'unknown'),
        },
        "profile_data": profile_data,
    }


def _write_json(document: Dict[str, Any], output_file: str) -> None:
    """Serialize *document* as UTF-8 JSON to *output_file*, creating parent dirs."""
    # Serialize before touching the file so a serialization error cannot leave
    # a truncated file behind.
    payload = json.dumps(document, indent=2, ensure_ascii=False)
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(payload, encoding='utf-8')


def extract_linkedin_profile_with_exa(
    linkedin_url: str,
    output_file: str,
    source_file: str = "",
    staff_id: str = "",
    *,
    timeout: float = _EXA_TIMEOUT_SECONDS,
    max_characters: int = _EXA_MAX_CHARACTERS,
) -> bool:
    """Extract LinkedIn profile using Exa crawler and save in structured format.

    Args:
        linkedin_url: Profile URL passed to the crawler.
        output_file: Path of the JSON file to write; parent directories are
            created if missing.
        source_file: Recorded in the metadata; "manual_extraction" when empty.
        staff_id: Recorded in the metadata; "manual" when empty.
        timeout: Seconds to wait for the crawler command before giving up.
        max_characters: Value passed to the crawler's ``--maxCharacters`` flag.

    Returns:
        True when the profile was crawled and written, False otherwise. Crawl
        and file-system failures are reported on stdout rather than raised.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    result_data = _crawl_profile(
        linkedin_url, timeout=timeout, max_characters=max_characters
    )
    if result_data is None:
        return False

    structured_data = _build_structured_record(
        result_data, linkedin_url, source_file, staff_id
    )

    try:
        _write_json(structured_data, output_file)
    except (OSError, TypeError, ValueError) as e:
        # OSError: directory/file cannot be created or written;
        # TypeError/ValueError: the document could not be serialized.
        print(f"Exception during extraction: {e}")
        return False

    print(f"āœ… Profile saved to: {output_file}")
    print(f" Name: {structured_data['profile_data'].get('name', 'Unknown')}")
    return True


def main() -> None:
    """Main function to extract specific LinkedIn profiles."""
    # Define specific profiles to extract
    profiles_to_extract = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn',
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef',
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens',
        },
    ]

    success_count = 0
    total_cost = 0.0
    last_index = len(profiles_to_extract) - 1

    for index, profile in enumerate(profiles_to_extract):
        if extract_linkedin_profile_with_exa(**profile):
            success_count += 1
            total_cost += _EXA_COST_PER_REQUEST_USD

        # Small delay to avoid overwhelming Exa; pointless after the final request.
        if index < last_index:
            time.sleep(_REQUEST_DELAY_SECONDS)

    print("\nšŸ“Š Extraction Summary:")
    print(f"āœ… Successfully processed: {success_count}")
    print(f"šŸ’° Total cost: ${total_cost:.3f}")
    print("šŸ“ Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")


if __name__ == "__main__":
    main()