#!/usr/bin/env python3
"""
Direct extraction of LinkedIn profiles using subprocess pattern from working scripts.

Each profile URL is handed to the Exa crawling command line (a Node.js script),
the JSON it prints is reduced to a small structured record, and that record is
written to a JSON file.
"""

import json
import os
import sys
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional, Sequence

# Base command used to reach the Exa crawler; "--url" and "--maxCharacters"
# are appended per request by extract_profile_directly().
EXA_COMMAND = (
    "node",
    "/Users/kempersc/apps/glam/exa-mcp-server-source/.smithery/stdio/index.cjs",
    "call",
    "exa_crawling_exa",
)
EXA_MAX_CHARACTERS = "50000"
EXA_TIMEOUT_SECONDS = 60

# Flat per-request cost recorded in the metadata and summed in the summary.
COST_PER_REQUEST_USD = 0.001

# The "about" field keeps at most this many characters of the crawled text.
ABOUT_MAX_CHARS = 1000

# Pause between consecutive requests so the crawler is not overwhelmed.
REQUEST_DELAY_SECONDS = 3

ENTITY_DIR = "/Users/kempersc/apps/glam/data/custodian/person/entity"
STAFF_SOURCE_FILE = (
    "/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/"
    "academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json"
)


def _first_result(output: Any) -> Optional[Dict[str, Any]]:
    """Return the first entry of ``output["results"]`` or None if unusable."""
    if not isinstance(output, dict):
        return None
    results = output.get("results")
    if not isinstance(results, list) or not results:
        return None
    first = results[0]
    return first if isinstance(first, dict) else None


def _build_structured_data(
    result: Dict[str, Any],
    linkedin_url: str,
    source_file: str,
    staff_id: str,
) -> Dict[str, Any]:
    """Turn one crawl result into the metadata + profile record that is saved."""
    # A JSON null for "text"/"title" must not crash len()/slicing below.
    profile_content = result.get("text") or ""
    title = result.get("title") or "Unknown"

    about = profile_content
    if len(profile_content) > ABOUT_MAX_CHARS:
        about = profile_content[:ABOUT_MAX_CHARS] + "..."

    return {
        "extraction_metadata": {
            "source_file": source_file,
            "staff_id": staff_id,
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": linkedin_url,
            "cost_usd": COST_PER_REQUEST_USD,
            "request_id": result.get("id", "unknown"),
        },
        # Only name and about are filled from the crawl; the remaining fields
        # are placeholders that this script never populates.
        "profile_data": {
            "name": title,
            "linkedin_url": linkedin_url,
            "headline": "",
            "location": "",
            "connections": "",
            "about": about,
            "experience": [],
            "education": [],
            "skills": [],
            "languages": [],
            "profile_image_url": None,
        },
    }


def extract_profile_directly(
    linkedin_url: str,
    output_file: str,
    source_file: str = "",
    staff_id: str = "",
    *,
    exa_command: Optional[Sequence[str]] = None,
) -> bool:
    """Extract LinkedIn profile using direct subprocess call.

    Runs the Exa crawling command for ``linkedin_url``, parses the JSON it
    prints on stdout, and writes a structured record to ``output_file``
    (parent directories are created when missing).

    The URL is passed as a separate argv element rather than being spliced
    into generated source code, so quotes or other special characters in any
    argument cannot break or alter what is executed.

    Args:
        linkedin_url: Profile URL to crawl.
        output_file: Path of the JSON file to write.
        source_file: Recorded verbatim in the extraction metadata.
        staff_id: Recorded verbatim in the extraction metadata.
        exa_command: Optional replacement for ``EXA_COMMAND``; the
            ``--url`` / ``--maxCharacters`` arguments are appended to it.

    Returns:
        True if a profile record was written, False on any failure (command
        missing, timeout, non-zero exit, invalid JSON, empty results, or an
        error writing the file). Failures are reported on stdout.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    base_cmd = list(EXA_COMMAND if exa_command is None else exa_command)
    cmd = base_cmd + [
        "--url", linkedin_url,
        "--maxCharacters", EXA_MAX_CHARACTERS,
    ]

    try:
        completed = subprocess.run(
            cmd,
            capture_output=True,
            encoding="utf-8",
            errors="replace",
            timeout=EXA_TIMEOUT_SECONDS,
        )
    except (subprocess.SubprocessError, OSError) as e:
        # Covers a timeout as well as a missing/unexecutable command.
        print(f"Exception during extraction: {e}")
        return False

    if completed.returncode != 0:
        print(f"❌ Failed to extract profile: {completed.stderr}")
        return False

    try:
        output = json.loads(completed.stdout)
    except json.JSONDecodeError as e:
        print(f"Error parsing output: {e}")
        return False

    first = _first_result(output)
    if first is None:
        # A zero exit status with no results is still a failed extraction.
        print(f"❌ Failed to extract profile: no results returned for {linkedin_url}")
        return False

    structured_data = _build_structured_data(
        first, linkedin_url, source_file, staff_id
    )

    try:
        # Serialize before touching the file so a serialization error cannot
        # leave a truncated file behind.
        serialized = json.dumps(structured_data, indent=2, ensure_ascii=False)
        out_path = Path(output_file)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(serialized, encoding="utf-8")
    except (OSError, TypeError, ValueError) as e:
        print(f"Exception during extraction: {e}")
        return False

    print(f"✅ Profile saved to: {output_file}")
    print(f"   Name: {structured_data['profile_data']['name']}")
    print(f"✅ Successfully extracted profile for {linkedin_url}")
    return True


def main():
    """Main function to extract specific LinkedIn profiles."""
    # Define specific profiles to extract
    profiles_to_extract = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'output_file': f'{ENTITY_DIR}/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': STAFF_SOURCE_FILE,
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn',
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef',
            'output_file': f'{ENTITY_DIR}/inez-van-kleef_20251210T160000Z.json',
            'source_file': STAFF_SOURCE_FILE,
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef',
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'output_file': f'{ENTITY_DIR}/marga-edens-a284175_20251210T160000Z.json',
            'source_file': STAFF_SOURCE_FILE,
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens',
        },
    ]

    success_count = 0

    for index, profile in enumerate(profiles_to_extract):
        if index:
            # Small delay between requests to avoid overwhelming the crawler;
            # no need to wait before the first or after the last one.
            time.sleep(REQUEST_DELAY_SECONDS)
        if extract_profile_directly(**profile):
            success_count += 1

    total_cost = success_count * COST_PER_REQUEST_USD

    print("\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print(f"📁 Files saved to: {ENTITY_DIR}")


if __name__ == "__main__":
    main()