#!/usr/bin/env python3
"""
Fetch LinkedIn profile data using Exa API for people in staff files.

Uses threading for efficiency and prevents duplicate entries.
"""
# Standard library
import hashlib
import json
import os
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

# Third-party
import httpx
from tqdm import tqdm
def extract_linkedin_urls(staff_data: Dict) -> List[str]:
    """Collect unique LinkedIn URLs from one parsed staff file.

    Accepts either 'linkedin_url' or 'linkedin_profile_url' on each
    staff entry; preserves first-seen order and drops duplicates.
    """
    found: List[str] = []
    for person in staff_data.get('staff', []):
        # Staff files are inconsistent about which field name they use.
        candidate = person.get('linkedin_url') or person.get('linkedin_profile_url')
        if candidate and candidate not in found:
            found.append(candidate)
    return found
def extract_slug_from_url(url: str) -> Optional[str]:
    """Return the profile slug from a linkedin.com/in/<slug> URL, or None."""
    found = re.search(r"linkedin\.com/in/([a-zA-Z0-9\-]+)", url)
    return found.group(1) if found else None
def generate_filename(slug: str) -> str:
    """Build a '<slug>_<UTC timestamp>.json' filename for a profile dump."""
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    return "{}_{}.json".format(slug, stamp)
def fetch_profile_with_exa(url: str, session: httpx.Client) -> Optional[Dict]:
    """Fetch profile data for one LinkedIn URL; return a dict or None.

    Best-effort: any HTTP error or exception is printed and swallowed,
    returning None so the caller can skip the profile.

    NOTE(review): despite the name, this posts to the z.ai GLM chat
    endpoint (model glm-4.6) and asks the model to extract profile
    fields — confirm whether the "Exa" naming is historical.
    """
    request_headers = {
        "Authorization": f"Bearer {os.environ.get('ZAI_API_TOKEN')}",
        "Content-Type": "application/json",
    }
    request_body = {
        "model": "glm-4.6",
        "messages": [
            {
                "role": "system",
                "content": "Extract LinkedIn profile data and return as structured JSON."
            },
            {
                "role": "user",
                "content": f"Extract complete profile information from: {url}\n\nReturn JSON with: name, headline, location, about, experience, education, skills, languages, profile_image_url"
            }
        ],
        "temperature": 0.1,
        "max_tokens": 4000
    }

    try:
        response = session.post(
            "https://api.z.ai/api/coding/paas/v4/chat/completions",
            headers=request_headers,
            json=request_body,
        )
        if response.status_code != 200:
            print(f"Error fetching {url}: {response.status_code}")
            return None

        content = response.json()['choices'][0]['message']['content']

        # The model may wrap its JSON in prose; grab the outermost {...}
        # span first, otherwise try parsing the whole reply.
        try:
            embedded = re.search(r'\{.*\}', content, re.DOTALL)
            return json.loads(embedded.group() if embedded else content)
        except json.JSONDecodeError:
            # Keep the raw text rather than losing the response entirely.
            return {
                "raw_content": content,
                "source_url": url,
                "extraction_method": "glm-4.6-chat"
            }
    except Exception as e:  # best-effort boundary: log and skip this URL
        print(f"Exception fetching {url}: {e}")
        return None
def save_profile(slug: str, profile_data: Dict, source_url: str, entity_dir: Path):
    """Write one fetched profile into entity_dir and return its filename.

    The raw model output is wrapped in the project schema: an
    'extraction_metadata' envelope plus a normalized 'profile_data'
    section with empty-string/list defaults for any missing fields.
    """
    filename = generate_filename(slug)

    metadata = {
        "source_file": "staff_parsing",
        "staff_id": f"{slug}_profile",
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "extraction_method": "exa_crawling_glm46",
        "extraction_agent": "claude-opus-4.5",
        "linkedin_url": source_url,
        "cost_usd": 0,
        # md5 used as a stable request fingerprint, not for security.
        "request_id": hashlib.md5(source_url.encode()).hexdigest()
    }
    profile_section = {
        "name": profile_data.get("name", ""),
        "linkedin_url": source_url,
        "headline": profile_data.get("headline", ""),
        "location": profile_data.get("location", ""),
        "connections": profile_data.get("connections", ""),
        "about": profile_data.get("about", ""),
        "experience": profile_data.get("experience", []),
        "education": profile_data.get("education", []),
        "skills": profile_data.get("skills", []),
        "languages": profile_data.get("languages", []),
        "profile_image_url": profile_data.get("profile_image_url", "")
    }

    structured = {
        "extraction_metadata": metadata,
        "profile_data": profile_section,
    }

    with open(entity_dir / filename, 'w', encoding='utf-8') as f:
        json.dump(structured, f, indent=2, ensure_ascii=False)

    return filename
def load_existing_profiles(entity_dir: Path) -> Set[str]:
    """Return slugs that already have a saved profile JSON in entity_dir.

    Creates the directory (including parents) when it does not exist
    yet. Only files matching the '<slug>_<timestamp>Z.json' naming
    scheme are counted.
    """
    if not entity_dir.exists():
        entity_dir.mkdir(parents=True, exist_ok=True)
        return set()

    known: Set[str] = set()
    for candidate in entity_dir.glob("*.json"):
        parsed = re.match(r"([a-zA-Z0-9\-]+)_\d{8}T\d{6}Z\.json", candidate.name)
        if parsed:
            known.add(parsed.group(1))
    return known
def load_staff_files(directory: Path) -> List[str]:
    """Gather LinkedIn URLs from every staff JSON file in directory.

    Files that fail to parse are reported and skipped. The combined
    list is de-duplicated while keeping first-seen order.
    """
    collected: List[str] = []
    for file_path in directory.glob("*.json"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                parsed = json.load(f)
            collected.extend(extract_linkedin_urls(parsed))
        except Exception as e:
            print(f"Error loading {file_path}: {e}")

    # dict preserves insertion order, so this dedups without reordering.
    return list(dict.fromkeys(collected))
def process_person(url: str, session: httpx.Client, existing_profiles: Set[str], entity_dir: Path) -> Tuple[str, bool]:
    """Fetch and save one person's LinkedIn profile.

    Skips URLs without a recognizable slug and slugs already present in
    existing_profiles. Returns (url, True) only when a new profile was
    fetched and written to disk; (url, False) otherwise.
    """
    slug = extract_slug_from_url(url)
    if not slug:
        return url, False

    # Skip profiles fetched on a previous run (or earlier in this run).
    if slug in existing_profiles:
        print(f"Profile already exists: {slug}")
        return url, False

    profile_data = fetch_profile_with_exa(url, session)
    if profile_data:
        filename = save_profile(slug, profile_data, url, entity_dir)
        # BUG FIX: previously printed the literal placeholder
        # "(unknown)" instead of the saved filename.
        print(f"Saved profile: {filename}")
        time.sleep(1.0)  # crude per-worker rate limit between API calls
        return url, True

    return url, False
def main():
    """Main entry point.

    Usage: fetch_linkedin_profiles_exa.py <staff_directory>

    Loads staff JSON files from the given directory, skips slugs already
    saved under the hard-coded entity directory, and fetches the rest
    concurrently (3 worker threads) with a tqdm progress bar.
    """
    if len(sys.argv) != 2:
        print("Usage: python fetch_linkedin_profiles_exa.py <staff_directory>")
        print("\nExample:")
        print(" python fetch_linkedin_profiles_exa.py /Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed")
        sys.exit(1)

    staff_directory = sys.argv[1]

    # Check if ZAI_API_TOKEN is set (fetch_profile_with_exa needs it for
    # its Authorization header; fail fast before spawning workers).
    if not os.environ.get('ZAI_API_TOKEN'):
        print("Error: ZAI_API_TOKEN environment variable not set")
        print("Please set it in your environment or .env file")
        sys.exit(1)

    # Setup paths
    # NOTE(review): output directory is hard-coded to one machine's
    # filesystem — consider making it a CLI argument.
    entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

    # Load existing profiles
    print("Loading existing profiles...")
    existing_profiles = load_existing_profiles(entity_dir)
    print(f"Found {len(existing_profiles)} existing profiles")

    # Load staff files
    print(f"\nLoading staff files from: {staff_directory}")
    urls = load_staff_files(Path(staff_directory))

    print(f"\nFound {len(urls)} unique LinkedIn URLs to process")

    if not urls:
        print("No LinkedIn URLs found to process.")
        return

    # Process with threading
    success_count = 0
    # One shared httpx.Client across all worker threads.
    with httpx.Client(timeout=60.0) as session:
        with ThreadPoolExecutor(max_workers=3) as executor:
            # Submit all tasks
            future_to_url = {
                executor.submit(process_person, url, session, existing_profiles, entity_dir): url
                for url in urls
            }

            # Process with progress bar
            with tqdm(total=len(urls), desc="Fetching profiles") as pbar:
                for future in as_completed(future_to_url):
                    url, success = future.result()
                    if success:
                        success_count += 1
                        # Update existing profiles set
                        # NOTE(review): the set is only updated after a
                        # future completes, so duplicate in-flight slugs
                        # could both fetch; upstream URL de-dup makes
                        # this unlikely in practice — confirm.
                        slug = extract_slug_from_url(url)
                        if slug:
                            existing_profiles.add(slug)
                    pbar.update(1)

    print(f"\nCompleted! Successfully fetched {success_count}/{len(urls)} profiles")
    print(f"Profiles saved to: {entity_dir}")
# Run the CLI when executed as a script.
if __name__ == "__main__":
    main()