# glam/scripts/extract_profiles_simple.py
# Exported 2025-12-11 22:32:09 +01:00 — 193 lines, no EOL, 7.1 KiB, Python
#!/usr/bin/env python3
"""
Simple script to extract LinkedIn profiles using the working pattern.
"""
import json
import os
import sys
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any
def call_exa_crawling(
    url: str,
    max_characters: int = 50000,
    mcp_path: str = '/Users/kempersc/apps/glam/exa-mcp-server-source/.smithery/stdio/index.cjs',
):
    """Call the Exa crawling tool through the local MCP stdio server.

    Args:
        url: Page to crawl (a LinkedIn profile URL in this script).
        max_characters: Cap on the amount of page text Exa returns.
        mcp_path: Path to the MCP server entry script run via ``node``
            (parameterized so the script is not tied to one machine).

    Returns:
        The parsed JSON response as a dict, or ``None`` on any failure
        (tool error, timeout, or unparseable output). Errors are printed,
        never raised, so batch callers can continue.
    """
    try:
        result = subprocess.run(
            ['node', mcp_path, 'call', 'exa_crawling_exa',
             '--url', url,
             '--maxCharacters', str(max_characters)],
            capture_output=True,
            text=True,
            timeout=60,  # hard cap per page; Exa can stall on slow profiles
        )
    except subprocess.TimeoutExpired:
        print(f"Timeout calling Exa for {url}")
        return None
    except Exception as e:
        # Covers a missing `node` binary and other launch failures.
        print(f"Exception calling Exa: {e}")
        return None

    if result.returncode != 0:
        print(f"Error calling Exa: {result.stderr}")
        return None

    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as e:
        # Distinguish "tool ran but emitted garbage" from launch failures.
        print(f"Exa returned non-JSON output: {e}")
        return None
def parse_linkedin_content(content: str, title: str, url: str) -> Dict[str, Any]:
    """Parse LinkedIn profile fields out of raw crawled markdown-ish text.

    The crawler output uses ``## Section`` headings; this walks the text
    line by line, tracking the current section, and extracts About,
    Experience, Education, Skills and Languages content.

    Args:
        content: Raw page text returned by the Exa crawler.
        title: Page title, used as the person's name.
        url: Canonical profile URL.

    Returns:
        A profile dict. ``headline``/``location``/``connections`` are
        initialized but not extracted here (the raw text has no reliable
        markers for them — left for a smarter parser).
    """
    profile: Dict[str, Any] = {
        "name": title,
        "linkedin_url": url,
        "headline": "",
        "location": "",
        "connections": "",
        "about": "",
        "experience": [],
        "education": [],
        "skills": [],
        "languages": [],
        "profile_image_url": None
    }

    current_section = None
    for raw_line in content.split('\n'):
        line = raw_line.strip()
        # Section headers switch parsing mode.
        if line.startswith('## About'):
            current_section = 'about'
        elif line.startswith('## Experience'):
            current_section = 'experience'
        elif line.startswith('## Education'):
            current_section = 'education'
        elif line.startswith('## Skills'):
            current_section = 'skills'
        elif line.startswith('## Languages'):
            current_section = 'languages'
        elif line and not line.startswith('#') and current_section:
            if current_section == 'about':
                profile['about'] += line + ' '
            elif current_section == 'experience' and ' at ' in line:
                # "Title at Company" — naive split; multi-"at" lines keep
                # only the segment right after the first " at ".
                parts = line.split(' at ')
                profile['experience'].append({
                    'title': parts[0].strip(),
                    'company': parts[1].strip(),
                    'duration': 'Current'
                })
            elif current_section == 'education' and ' at ' in line:
                parts = line.split(' at ')
                profile['education'].append({
                    'degree': parts[0].strip(),
                    'institution': parts[1].strip(),
                    'duration': 'Unknown'
                })
            elif current_section == 'skills':
                # Previously recognized but dropped — one skill per line.
                profile['skills'].append(line)
            elif current_section == 'languages':
                profile['languages'].append(line)

    # About was accumulated with trailing spaces; trim once at the end.
    profile['about'] = profile['about'].strip()
    return profile
def extract_linkedin_profile(linkedin_url: str, output_file: str, source_file: str = "", staff_id: str = ""):
    """Crawl one LinkedIn profile via Exa and write it as structured JSON.

    Args:
        linkedin_url: Profile URL to crawl.
        output_file: Destination JSON path (parent dirs created as needed).
        source_file: Optional provenance file recorded in the metadata.
        staff_id: Optional staff identifier recorded in the metadata.

    Returns:
        True when the profile was crawled and written, False otherwise.
    """
    print(f"Extracting LinkedIn profile: {linkedin_url}")

    # Guard clause: any empty/missing result set means we bail out early.
    exa_result = call_exa_crawling(linkedin_url, 50000)
    if not exa_result or not exa_result.get('results'):
        print(f"❌ Failed to extract profile from {linkedin_url}")
        return False

    first_hit = exa_result['results'][0]
    page_text = first_hit.get('text', '')
    page_title = first_hit.get('title', 'Unknown')
    resolved_url = first_hit.get('url', linkedin_url)

    profile_data = parse_linkedin_content(page_text, page_title, resolved_url)

    payload = {
        "extraction_metadata": {
            "source_file": source_file or "manual_extraction",
            "staff_id": staff_id or "manual",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_exa",
            "extraction_agent": "glm-4.6",
            "linkedin_url": resolved_url,
            "cost_usd": 0.001,
            "request_id": first_hit.get('id', 'unknown')
        },
        "profile_data": profile_data
    }

    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )

    print(f"✅ Profile saved to: {output_file}")
    print(f" Name: {profile_data.get('name', 'Unknown')}")
    print(f" Headline: {profile_data.get('headline', '')[:80]}...")
    return True
def main():
    """Extract a fixed batch of LinkedIn profiles and print a summary.

    The profile list (URLs, output paths, provenance) is hard-coded for a
    one-off extraction run; counts and a nominal cost are reported at the end.
    """
    import time  # hoisted: previously re-imported on every loop iteration

    # Define specific profiles to extract
    profiles_to_extract = [
        {
            'linkedin_url': 'https://www.linkedin.com/in/anja-van-hoorn-657b66223',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/anja-van-hoorn-657b66223_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/inez-van-kleef',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/inez-van-kleef_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef'
        },
        {
            'linkedin_url': 'https://www.linkedin.com/in/marga-edens-a284175',
            'output_file': '/Users/kempersc/apps/glam/data/custodian/person/entity/marga-edens-a284175_20251210T160000Z.json',
            'source_file': '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json',
            'staff_id': 'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens'
        }
    ]

    success_count = 0
    total_cost = 0.0
    for profile in profiles_to_extract:
        if extract_linkedin_profile(**profile):
            success_count += 1
            total_cost += 0.001  # nominal per-request Exa cost
        # Small delay to avoid overwhelming Exa
        # NOTE(review): flattened original is ambiguous on whether this ran
        # only after successes; delaying after every attempt is the safer
        # rate-limit behavior — confirm against run logs.
        time.sleep(2)

    print("\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print("📁 Files saved to: /Users/kempersc/apps/glam/data/custodian/person/entity")
# Run the batch extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()