302 lines
No EOL
11 KiB
Python
Executable file
302 lines
No EOL
11 KiB
Python
Executable file
#!/usr/bin/env python3
"""
Fetch LinkedIn profile data for people in staff files.

Despite the module's historical name, profiles are extracted by prompting
the z.ai GLM-4.6 chat-completions API (not Exa). Uses threading for
efficiency and prevents duplicate entries.
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
import hashlib
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Set, Tuple
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
import re
|
|
|
|
import httpx
|
|
from tqdm import tqdm
|
|
|
|
|
|
def extract_linkedin_urls(staff_data: Dict) -> List[str]:
    """Collect unique LinkedIn URLs from one parsed staff record.

    Each entry under the 'staff' key may carry the URL in either the
    'linkedin_url' or 'linkedin_profile_url' field ('linkedin_url' wins
    when both exist). First-seen order is preserved; duplicates dropped.
    """
    collected: List[str] = []
    for entry in staff_data.get('staff', []):
        # Two field spellings occur across staff files; accept either.
        candidate = entry.get('linkedin_url') or entry.get('linkedin_profile_url')
        if candidate and candidate not in collected:
            collected.append(candidate)
    return collected
|
|
|
|
|
|
def extract_slug_from_url(url: str) -> Optional[str]:
    """Return the profile slug from a linkedin.com/in/<slug> URL, else None.

    Slugs are taken to be runs of ASCII letters, digits, and hyphens.
    """
    found = re.search(r"linkedin\.com/in/([a-zA-Z0-9\-]+)", url)
    return found.group(1) if found else None
|
|
|
|
|
|
def generate_filename(slug: str) -> str:
    """Build the '<slug>_<UTC compact timestamp>Z.json' output filename."""
    now_utc = datetime.now(timezone.utc)
    return "{}_{}.json".format(slug, now_utc.strftime("%Y%m%dT%H%M%SZ"))
|
|
|
|
|
|
def fetch_profile_with_exa(url: str, session: httpx.Client) -> Optional[Dict]:
    """Fetch profile data for *url* by prompting the GLM-4.6 chat API.

    NOTE(review): despite the name, this does not call an Exa crawling
    endpoint -- it POSTs a chat-completions request to api.z.ai and asks
    the model to return the profile fields as JSON. Consider renaming.

    Args:
        url: LinkedIn profile URL to extract.
        session: shared httpx.Client (reused across worker threads).

    Returns:
        The parsed profile dict on success. If the model's reply cannot
        be parsed as JSON, a fallback dict carrying the raw reply plus an
        'extraction_error' marker is returned instead. Returns None on a
        non-200 response or any exception; errors are printed, never
        raised, so one bad URL cannot kill the thread pool.
    """
    try:
        # POST a chat-completions request; the bearer token is read from
        # the environment on every call (main() verifies it is set).
        response = session.post(
            "https://api.z.ai/api/coding/paas/v4/chat/completions",
            headers={
                "Authorization": f"Bearer {os.environ.get('ZAI_API_TOKEN')}",
                "Content-Type": "application/json",
            },
            json={
                "model": "glm-4.6",
                "messages": [
                    {
                        "role": "system",
                        "content": "You are extracting LinkedIn profile data. Return complete profile information including name, headline, location, about section, experience, education, skills, languages, and any other relevant information. Structure the response as valid JSON."
                    },
                    {
                        "role": "user",
                        "content": f"Extract all profile information from this LinkedIn URL: {url}\n\nPlease return a JSON object with these fields:\n- name (full name)\n- headline (current position/title)\n- location (city, country)\n- about (summary/bio)\n- experience (list of jobs with title, company, dates)\n- education (list of schools with degree, dates)\n- skills (list of skills)\n- languages (list of languages)\n- profile_image_url (if available)"
                    }
                ],
                # Low temperature: we want deterministic extraction, not prose.
                "temperature": 0.1,
                "max_tokens": 4000
            }
        )

        if response.status_code == 200:
            result = response.json()
            content = result['choices'][0]['message']['content']

            # Greedy DOTALL match spans the first '{' to the last '}',
            # tolerating prose or code fences around the JSON payload.
            try:
                json_match = re.search(r'\{.*\}', content, re.DOTALL)
                if json_match:
                    profile_data = json.loads(json_match.group())
                else:
                    profile_data = json.loads(content)
            except json.JSONDecodeError:
                # Reply was not valid JSON: keep the raw text and derive a
                # best-effort display name from the URL slug.
                profile_data = {
                    "raw_content": content,
                    "source_url": url,
                    "extraction_method": "glm-4.6-chat",
                    "name": ((extract_slug_from_url(url) or "").replace('-', ' ').title() or "Unknown"),
                    "extraction_error": "Failed to parse JSON from LLM response"
                }

            return profile_data
        else:
            print(f"Error fetching {url}: {response.status_code}")
            return None

    except Exception as e:
        # Broad catch is deliberate: this runs inside a thread pool and a
        # single failing URL must not abort the whole batch.
        print(f"Exception fetching {url}: {e}")
        return None
|
|
|
|
|
|
def save_profile(slug: str, profile_data: Dict, source_url: str, entity_dir: Path) -> str:
    """Persist one fetched profile into *entity_dir*.

    Wraps the raw fields in the standard two-part schema
    (extraction_metadata + profile_data) and writes pretty-printed UTF-8
    JSON under a timestamped filename. Returns that filename.
    """
    filename = generate_filename(slug)

    metadata = {
        "source_file": "staff_parsing",
        "staff_id": f"{slug}_profile",
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "extraction_method": "exa_crawling_glm46",
        "extraction_agent": "claude-opus-4.5",
        "linkedin_url": source_url,
        "cost_usd": 0,
        # md5 of the source URL: a stable, repeatable request identifier.
        "request_id": hashlib.md5(source_url.encode()).hexdigest(),
    }
    fields = {
        "name": profile_data.get("name", ""),
        "linkedin_url": source_url,
        "headline": profile_data.get("headline", ""),
        "location": profile_data.get("location", ""),
        "connections": profile_data.get("connections", ""),
        "about": profile_data.get("about", ""),
        "experience": profile_data.get("experience", []),
        "education": profile_data.get("education", []),
        "skills": profile_data.get("skills", []),
        "languages": profile_data.get("languages", []),
        "profile_image_url": profile_data.get("profile_image_url", ""),
    }
    document = {"extraction_metadata": metadata, "profile_data": fields}

    target = entity_dir / filename
    target.write_text(
        json.dumps(document, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    return filename
|
|
|
|
|
|
def load_existing_profiles(entity_dir: Path) -> Set[str]:
    """Return the slugs already saved in *entity_dir*.

    Creates the directory (and parents) when it does not yet exist, in
    which case the result is empty. A slug counts as existing when a
    '<slug>_<YYYYMMDD>T<HHMMSS>Z.json' file is present.
    """
    if not entity_dir.exists():
        entity_dir.mkdir(parents=True, exist_ok=True)
        return set()

    name_pattern = re.compile(r"([a-zA-Z0-9\-]+)_\d{8}T\d{6}Z\.json")
    slugs: Set[str] = set()
    for candidate in entity_dir.glob("*.json"):
        hit = name_pattern.match(candidate.name)
        if hit is not None:
            slugs.add(hit.group(1))
    return slugs
|
|
|
|
|
|
def load_staff_files(directory: Path) -> List[str]:
    """Read every staff *.json in *directory* and gather LinkedIn URLs.

    Files that fail to load or parse are reported and skipped. Prints a
    short summary and returns the URLs de-duplicated in first-seen order.
    """
    gathered: List[str] = []
    processed = 0

    for staff_file in directory.glob("*.json"):
        processed += 1
        try:
            with open(staff_file, 'r', encoding='utf-8') as handle:
                gathered.extend(extract_linkedin_urls(json.load(handle)))
        except Exception as e:
            print(f"Error loading {staff_file}: {e}")

    # dict preserves insertion order (Python 3.7+), so this de-duplicates
    # while keeping the first occurrence of each URL.
    deduped = list(dict.fromkeys(gathered))

    print(f"Processed {processed} staff files")
    print(f"Found {len(gathered)} total LinkedIn URLs")
    print(f"Found {len(deduped)} unique LinkedIn URLs")

    return deduped
|
|
|
|
|
|
def process_person(url: str, session: httpx.Client, existing_profiles: Set[str], entity_dir: Path) -> Tuple[str, bool, str]:
    """Fetch-and-save one person's profile; worker unit for the pool.

    Returns (url, succeeded, detail) where detail is the saved filename
    on success, otherwise one of 'No slug found', 'Already exists', or
    'Failed to fetch'.
    """
    slug = extract_slug_from_url(url)
    if slug is None:
        return url, False, "No slug found"

    # Skip anything we already have on disk.
    if slug in existing_profiles:
        return url, False, "Already exists"

    fetched = fetch_profile_with_exa(url, session)
    if not fetched:
        return url, False, "Failed to fetch"
    return url, True, save_profile(slug, fetched, url, entity_dir)
|
|
|
|
|
|
def main():
    """CLI entry point: fetch missing LinkedIn profiles for staff files.

    Usage: fetch_linkedin_profiles_exa.py <staff_directory>

    Reads every staff *.json under <staff_directory>, collects LinkedIn
    URLs, skips slugs already saved under the entity directory, fetches
    the rest concurrently (capped at 50 per run while testing), and
    writes a per-run log file to the current working directory.
    """
    if len(sys.argv) != 2:
        print("Usage: python fetch_linkedin_profiles_exa.py <staff_directory>")
        print("\nExample:")
        print(" python fetch_linkedin_profiles_exa.py /Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed")
        sys.exit(1)

    staff_directory = sys.argv[1]

    # The API token is needed by fetch_profile_with_exa; fail fast here
    # instead of failing once per URL inside the thread pool.
    if not os.environ.get('ZAI_API_TOKEN'):
        print("Error: ZAI_API_TOKEN environment variable not set")
        print("Please set it in your environment or .env file")
        sys.exit(1)

    # NOTE(review): hard-coded, machine-specific output path -- consider
    # promoting this to a CLI argument or environment variable.
    entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

    # Slugs already on disk; used below to avoid re-fetching.
    print("Loading existing profiles...")
    existing_profiles = load_existing_profiles(entity_dir)
    print(f"Found {len(existing_profiles)} existing profiles")

    print(f"\nLoading staff files from: {staff_directory}")
    urls = load_staff_files(Path(staff_directory))

    if not urls:
        print("No LinkedIn URLs found to process.")
        return

    # Keep only URLs whose slug is parseable and not already saved.
    new_urls = []
    for url in urls:
        slug = extract_slug_from_url(url)
        if slug and slug not in existing_profiles:
            new_urls.append(url)

    print(f"\nNeed to fetch {len(new_urls)} new profiles")
    print(f"Skipping {len(urls) - len(new_urls)} already existing profiles")

    if not new_urls:
        print("All profiles already exist!")
        return

    success_count = 0
    failed_count = 0
    results = []

    # One shared HTTP client; three workers keeps request volume polite.
    with httpx.Client(timeout=60.0) as session:
        with ThreadPoolExecutor(max_workers=3) as executor:
            future_to_url = {
                executor.submit(process_person, url, session, existing_profiles, entity_dir): url
                for url in new_urls[:50]  # Limit to first 50 for testing
            }

            with tqdm(total=len(future_to_url), desc="Fetching profiles") as pbar:
                for future in as_completed(future_to_url):
                    url, success, result = future.result()
                    if success:
                        success_count += 1
                        results.append(f"✓ {url} -> {result}")
                    else:
                        failed_count += 1
                        results.append(f"✗ {url} -> {result}")
                    pbar.update(1)

    # Persist a per-run log in the current working directory.
    log_filename = f"fetch_log_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.txt"
    with open(log_filename, 'w') as f:
        f.write("LinkedIn Profile Fetch Results\n")
        f.write(f"Timestamp: {datetime.now(timezone.utc).isoformat()}\n")
        # Bug fix: this previously reported len(new_urls) even though only
        # the first 50 URLs are actually submitted to the pool.
        f.write(f"Total attempted: {len(future_to_url)}\n")
        f.write(f"Successful: {success_count}\n")
        f.write(f"Failed: {failed_count}\n")
        f.write("\n" + "="*60 + "\n")
        for line in results:
            f.write(line + "\n")

    print("\nCompleted!")
    print(f"Successfully fetched: {success_count}")
    print(f"Failed: {failed_count}")
    print(f"Profiles saved to: {entity_dir}")
    print(f"Results log: {log_filename}")
|
|
|
|
|
|
# Run the CLI only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()