glam/scripts/extract_profiles_direct.py
2025-12-11 22:32:09 +01:00

158 lines
No EOL
5.8 KiB
Python

#!/usr/bin/env python3
"""
Direct extraction of LinkedIn profiles using subprocess pattern from working scripts.
"""
import json
import os
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
# Command prefix that invokes the Exa crawling tool through the local MCP
# server build; "--url" and "--maxCharacters" are appended for every call.
_EXA_CRAWL_COMMAND = [
    "node",
    "/Users/kempersc/apps/glam/exa-mcp-server-source/.smithery/stdio/index.cjs",
    "call",
    "exa_crawling_exa",
]

# Longest "about" text stored verbatim; longer text is cut and suffixed "...".
_ABOUT_MAX_CHARS = 1000


def _build_structured_profile(
    crawl_result: Dict[str, Any],
    linkedin_url: str,
    source_file: str,
    staff_id: str,
) -> Dict[str, Any]:
    """Turn one Exa crawl result into the JSON document written to disk."""
    # ``or`` also covers an explicit JSON null, which would break len()/slicing.
    content = crawl_result.get("text") or ""
    name = crawl_result.get("title") or "Unknown"
    if len(content) > _ABOUT_MAX_CHARS:
        about = content[:_ABOUT_MAX_CHARS] + "..."
    else:
        about = content

    return {
        "extraction_metadata": {
            "source_file": source_file,
            "staff_id": staff_id,
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": linkedin_url,
            # Fixed per-request figure recorded with every profile.
            "cost_usd": 0.001,
            "request_id": crawl_result.get("id", "unknown"),
        },
        # Only name and about are filled from the crawl; the remaining fields
        # are placeholders kept so every saved profile has the same keys.
        "profile_data": {
            "name": name,
            "linkedin_url": linkedin_url,
            "headline": "",
            "location": "",
            "connections": "",
            "about": about,
            "experience": [],
            "education": [],
            "skills": [],
            "languages": [],
            "profile_image_url": None,
        },
    }


def extract_profile_directly(
    linkedin_url: str,
    output_file: str,
    source_file: str = "",
    staff_id: str = "",
    exa_command: Optional[List[str]] = None,
) -> bool:
    """Crawl one LinkedIn profile with Exa and save it as structured JSON.

    Args:
        linkedin_url: Profile URL passed to the crawler via ``--url``.
        output_file: Path of the JSON file to write on success.
        source_file: Recorded in the metadata as the origin of this request.
        staff_id: Recorded in the metadata as the staff identifier.
        exa_command: Command prefix used to start the crawler. Defaults to the
            local ``node`` Exa MCP server invocation; ``--url`` and
            ``--maxCharacters`` are appended to whichever prefix is used.

    Returns:
        True only when the crawler returned at least one result and the
        profile file was written; False on any failure (crawler could not be
        started, timed out, exited non-zero, produced invalid JSON or no
        results, or the output file could not be written).
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    command = list(_EXA_CRAWL_COMMAND if exa_command is None else exa_command)
    command += ["--url", linkedin_url, "--maxCharacters", "50000"]

    try:
        crawl = subprocess.run(
            command,
            capture_output=True,
            # Decode explicitly so the result does not depend on the locale.
            encoding="utf-8",
            errors="replace",
            timeout=60,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # Covers a missing executable as well as subprocess.TimeoutExpired.
        print(f"Exception during extraction: {e}")
        return False

    if crawl.returncode != 0:
        print(f"❌ Failed to extract profile: Error calling Exa: {crawl.stderr}")
        return False

    try:
        output = json.loads(crawl.stdout)
    except json.JSONDecodeError as e:
        print(f"❌ Failed to extract profile: Failed to parse JSON output: {e}")
        return False

    # An empty or malformed result set must count as a failure; previously it
    # fell through with exit status 0 and was reported as a success even
    # though no file had been written.
    results = output.get("results") if isinstance(output, dict) else None
    if not isinstance(results, list) or not results or not isinstance(results[0], dict):
        print(f"❌ Failed to extract profile: no results returned for {linkedin_url}")
        return False

    structured_data = _build_structured_profile(
        results[0], linkedin_url, source_file, staff_id
    )

    try:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(structured_data, f, indent=2, ensure_ascii=False)
    except OSError as e:
        print(f"❌ Failed to extract profile: could not write {output_file}: {e}")
        return False

    print(f"✅ Profile saved to: {output_file}")
    print(f"   Name: {structured_data['profile_data']['name']}")
    print(f"✅ Successfully extracted profile for {linkedin_url}")
    return True
def main():
    """Run the extraction for a hard-coded batch of profiles and print a summary."""
    import time

    # Every person in this batch was listed in the same parsed staff file.
    staff_listing = '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json'

    batch = [
        dict(
            linkedin_url='https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            output_file='/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            source_file=staff_listing,
            staff_id='academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn',
        ),
        dict(
            linkedin_url='https://www.linkedin.com/in/inez-van-kleef',
            output_file='/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            source_file=staff_listing,
            staff_id='academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef',
        ),
        dict(
            linkedin_url='https://www.linkedin.com/in/marga-edens-a284175',
            output_file='/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            source_file=staff_listing,
            staff_id='academiehuis-grote-kerk-zwolle_staff_0003_marga_edens',
        ),
    ]

    extracted = 0
    spent_usd = 0.0
    for job in batch:
        saved = extract_profile_directly(**job)
        if saved:
            extracted += 1
            # Each successful request is booked at a flat 0.001 USD.
            spent_usd += 0.001
        # Pause after every request so the crawler is not hit back-to-back.
        time.sleep(3)

    print("\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {extracted}")
    print(f"💰 Total cost: ${spent_usd:.3f}")
    print("📁 Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")
# Only start the batch when this file is executed as a script, so importing
# the module for its functions has no side effects.
if __name__ == "__main__":
    main()