glam/scripts/fetch_linkedin_profiles_exa_v2.py
2025-12-11 22:32:09 +01:00

265 lines
No EOL
8.9 KiB
Python
Executable file

#!/usr/bin/env python3
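"""
Fetch LinkedIn profile data for people listed in staff files.

Despite the "exa" in the script name, the only request made in this file is a
POST to the Z.AI chat-completions endpoint (model glm-4.6), which is asked to
return the profile as JSON.
Uses a thread pool for efficiency and skips slugs that already have a saved
profile to prevent duplicate entries.
"""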
import hashlib
import json
import os
import re
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

import httpx
from tqdm import tqdm


def extract_linkedin_urls(staff_data: Dict) -> List[str]:
    """Collect the distinct LinkedIn URLs listed in one staff document.

    Each entry under ``staff_data['staff']`` is checked for ``linkedin_url``
    first and ``linkedin_profile_url`` second; the first non-empty value wins.

    Args:
        staff_data: Parsed staff JSON. Anything that does not hold a list
            under ``'staff'`` yields an empty result instead of raising.

    Returns:
        URLs in first-seen order, without duplicates.
    """
    staff = staff_data.get('staff') if isinstance(staff_data, dict) else None
    if not isinstance(staff, (list, tuple)):
        return []

    # Dict keys give O(1) duplicate checks while keeping insertion order.
    unique: Dict[str, None] = {}
    for person in staff:
        if not isinstance(person, dict):
            continue  # malformed entry; nothing to read
        # Check both possible field names
        url = person.get('linkedin_url') or person.get('linkedin_profile_url')
        # Non-string values cannot be matched as URLs later on, so drop them.
        if url and isinstance(url, str):
            unique.setdefault(url, None)
    return list(unique)
def extract_slug_from_url(url: str) -> Optional[str]:
    """Return the profile slug that follows ``linkedin.com/in/`` in *url*.

    Only ASCII letters, digits and hyphens are captured, so the slug stops at
    the first other character. ``None`` is returned when the URL contains no
    such segment.
    """
    found = re.search(r"linkedin\.com/in/([a-zA-Z0-9\-]+)", url)
    return found.group(1) if found is not None else None
def generate_filename(slug: str) -> str:
    """Build ``<slug>_<UTC timestamp>.json`` for a profile saved right now.

    The timestamp uses the compact form ``YYYYMMDDTHHMMSSZ``.
    """
    now_utc = datetime.now(tz=timezone.utc)
    stamp = now_utc.strftime("%Y%m%dT%H%M%SZ")
    return f"{slug}_{stamp}.json"
def _first_json_object(text: str) -> Optional[Dict]:
    """Return the first JSON object embedded in *text*, or ``None``."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode tolerates trailing text after the object, unlike a
            # greedy "first brace to last brace" match.
            candidate, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    return None


def fetch_profile_with_exa(url: str, session: httpx.Client) -> Optional[Dict]:
    """Ask the chat-completions endpoint for structured profile data for *url*.

    NOTE(review): despite the function name, the only request made here is a
    POST to the Z.AI chat-completions endpoint with model ``glm-4.6``; no Exa
    call is visible in this function.

    Args:
        url: LinkedIn profile URL that is placed into the prompt.
        session: Shared HTTP client used for the POST.

    Returns:
        The first JSON object found in the model's reply. When the reply
        holds no JSON object, a dict with ``raw_content``, ``source_url`` and
        ``extraction_method`` so the result is always a dict. ``None`` on a
        non-200 status or on any exception (both are printed).
    """
    headers = {
        "Authorization": f"Bearer {os.environ.get('ZAI_API_TOKEN')}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "glm-4.6",
        "messages": [
            {
                "role": "system",
                "content": "Extract LinkedIn profile data and return as structured JSON."
            },
            {
                "role": "user",
                "content": f"Extract complete profile information from: {url}\n\nReturn JSON with: name, headline, location, about, experience, education, skills, languages, profile_image_url"
            }
        ],
        "temperature": 0.1,
        "max_tokens": 4000
    }
    try:
        response = session.post(
            "https://api.z.ai/api/coding/paas/v4/chat/completions",
            headers=headers,
            json=payload,
        )
        if response.status_code != 200:
            print(f"Error fetching {url}: {response.status_code}")
            return None

        content = response.json()['choices'][0]['message']['content']
        profile_data = _first_json_object(content)
        if profile_data is None:
            # No usable object in the reply: keep the raw text instead of
            # returning a list/number/string that callers cannot `.get` from.
            profile_data = {
                "raw_content": content,
                "source_url": url,
                "extraction_method": "glm-4.6-chat"
            }
        return profile_data
    except Exception as e:
        # Deliberately broad: this is a best-effort fetch, so transport
        # errors and unexpected response shapes are reported, not raised.
        print(f"Exception fetching {url}: {e}")
        return None
def save_profile(slug: str, profile_data: Dict, source_url: str, entity_dir: Path) -> str:
    """Write one profile as JSON into *entity_dir* and return the file name.

    Args:
        slug: LinkedIn slug; used for the file name and the ``staff_id``.
        profile_data: Extracted fields. Missing keys fall back to ``""`` or
            ``[]``. A ``raw_content`` entry (the unparsed model reply) is
            carried over so a failed extraction is not stored as a blank
            profile.
        source_url: LinkedIn URL the data was requested for.
        entity_dir: Target directory; created if it does not exist yet.

    Returns:
        The name (not the full path) of the file that was written.
    """
    entity_dir.mkdir(parents=True, exist_ok=True)
    filepath = entity_dir / generate_filename(slug)

    profile = {
        "name": profile_data.get("name", ""),
        "linkedin_url": source_url,
        "headline": profile_data.get("headline", ""),
        "location": profile_data.get("location", ""),
        "connections": profile_data.get("connections", ""),
        "about": profile_data.get("about", ""),
        "experience": profile_data.get("experience", []),
        "education": profile_data.get("education", []),
        "skills": profile_data.get("skills", []),
        "languages": profile_data.get("languages", []),
        "profile_image_url": profile_data.get("profile_image_url", "")
    }
    if "raw_content" in profile_data:
        # Only present for the unparsed-reply fallback; without this the
        # reply text would be lost and the saved profile would be empty.
        profile["raw_content"] = profile_data["raw_content"]

    # Structure the data according to the schema
    structured_data = {
        "extraction_metadata": {
            "source_file": "staff_parsing",
            "staff_id": f"{slug}_profile",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "exa_crawling_glm46",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": source_url,
            "cost_usd": 0,
            # Stable identifier derived from the URL; not a security hash.
            "request_id": hashlib.md5(source_url.encode()).hexdigest()
        },
        "profile_data": profile
    }

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(structured_data, f, indent=2, ensure_ascii=False)
    return filepath.name
def load_existing_profiles(entity_dir: Path) -> Set[str]:
    """Return the slugs that already have a saved profile in *entity_dir*.

    A missing directory is created (including parents) and reported as
    empty. Only ``*.json`` files whose name starts with
    ``<slug>_<YYYYMMDD>T<HHMMSS>Z.json`` contribute a slug.
    """
    if not entity_dir.exists():
        entity_dir.mkdir(parents=True, exist_ok=True)
        return set()

    name_pattern = re.compile(r"([a-zA-Z0-9\-]+)_\d{8}T\d{6}Z\.json")
    parsed = (name_pattern.match(entry.name) for entry in entity_dir.glob("*.json"))
    return {hit.group(1) for hit in parsed if hit}
def load_staff_files(directory: Path) -> List[str]:
    """Gather the LinkedIn URLs from every ``*.json`` file in *directory*.

    Files that cannot be read or parsed are reported on stdout and skipped.
    The result keeps the first occurrence of each URL, in the order found.
    """
    collected: List[str] = []
    for staff_file in directory.glob("*.json"):
        try:
            with open(staff_file, 'r', encoding='utf-8') as handle:
                document = json.load(handle)
            collected += extract_linkedin_urls(document)
        except Exception as e:
            print(f"Error loading {staff_file}: {e}")

    # Dict keys keep first-seen order, so later repeats simply drop out.
    return list(dict.fromkeys(collected))
# Serialises the check-then-claim on the slug set shared by worker threads.
_SLUG_CLAIM_LOCK = threading.Lock()


def process_person(url: str, session: httpx.Client, existing_profiles: Set[str], entity_dir: Path) -> Tuple[str, bool]:
    """Fetch and save the profile behind *url* unless its slug is already known.

    The slug is added to *existing_profiles* before the request is made, so a
    second URL that resolves to the same slug (for example with a trailing
    slash or query string) is skipped even while the first one is still in
    flight. The claim is released again if nothing was saved.

    Args:
        url: LinkedIn profile URL.
        session: Shared HTTP client passed on to the fetch helper.
        existing_profiles: Slugs that already have a profile; mutated here.
        entity_dir: Directory the profile file is written to.

    Returns:
        ``(url, True)`` when a profile file was written, ``(url, False)``
        when the URL has no slug, the slug is already known, or the fetch
        returned nothing.
    """
    slug = extract_slug_from_url(url)
    if not slug:
        return url, False

    # Check if we already have this profile, and claim it atomically if not
    with _SLUG_CLAIM_LOCK:
        if slug in existing_profiles:
            print(f"Profile already exists: {slug}")
            return url, False
        existing_profiles.add(slug)

    saved = False
    try:
        profile_data = fetch_profile_with_exa(url, session)
        # Rate limiting: pause after every request, including failed ones,
        # so errors do not make this worker hit the API back-to-back.
        time.sleep(1.0)
        if profile_data:
            filename = save_profile(slug, profile_data, url, entity_dir)
            print(f"Saved profile: {filename}")
            saved = True
    finally:
        if not saved:
            # Release the claim so a later attempt for this slug can retry.
            with _SLUG_CLAIM_LOCK:
                existing_profiles.discard(slug)
    return url, saved
def main():
    """Command-line entry point.

    Usage: ``<script> <staff_directory> [entity_directory]``.

    Reads every staff JSON file in ``staff_directory``, fetches the LinkedIn
    profiles that have no saved file yet using three worker threads, and
    writes them to the entity directory. The entity directory defaults to the
    project's fixed location and can be overridden with the optional second
    argument. Exits with status 1 on wrong arguments or when
    ``ZAI_API_TOKEN`` is not set.
    """
    script_name = Path(sys.argv[0]).name
    if len(sys.argv) not in (2, 3):
        print(f"Usage: python {script_name} <staff_directory> [entity_directory]")
        print("\nExample:")
        print(f"  python {script_name} /Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed")
        sys.exit(1)
    staff_directory = sys.argv[1]

    # Check if ZAI_API_TOKEN is set
    if not os.environ.get('ZAI_API_TOKEN'):
        print("Error: ZAI_API_TOKEN environment variable not set")
        print("Please set it in your environment or .env file")
        sys.exit(1)

    # Setup paths
    if len(sys.argv) == 3:
        entity_dir = Path(sys.argv[2])
    else:
        entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

    # Load existing profiles
    print("Loading existing profiles...")
    existing_profiles = load_existing_profiles(entity_dir)
    print(f"Found {len(existing_profiles)} existing profiles")

    # Load staff files
    print(f"\nLoading staff files from: {staff_directory}")
    urls = load_staff_files(Path(staff_directory))
    print(f"\nFound {len(urls)} unique LinkedIn URLs to process")
    if not urls:
        print("No LinkedIn URLs found to process.")
        return

    # Process with threading
    success_count = 0
    # The client is the outer context so it stays open until every worker
    # has finished.
    with httpx.Client(timeout=60.0) as session:
        with ThreadPoolExecutor(max_workers=3) as executor:
            # Submit all tasks
            future_to_url = {
                executor.submit(process_person, url, session, existing_profiles, entity_dir): url
                for url in urls
            }
            # Process with progress bar
            with tqdm(total=len(urls), desc="Fetching profiles") as pbar:
                for future in as_completed(future_to_url):
                    try:
                        url, success = future.result()
                    except Exception as exc:
                        # One failing profile must not abort the whole run.
                        url = future_to_url[future]
                        success = False
                        print(f"Error processing {url}: {exc}")
                    if success:
                        success_count += 1
                        # Update existing profiles set
                        slug = extract_slug_from_url(url)
                        if slug:
                            existing_profiles.add(slug)
                    pbar.update(1)

    print(f"\nCompleted! Successfully fetched {success_count}/{len(urls)} profiles")
    print(f"Profiles saved to: {entity_dir}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()