#!/usr/bin/env python3
"""
Complete LinkedIn profile fetching system that processes all staff directories
and fetches only new profiles not already in the entity directory.
"""
import hashlib
import json
import os
import re
import subprocess
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

import httpx
from tqdm import tqdm
def extract_linkedin_urls(staff_data: Dict) -> List[str]:
    """Extract unique LinkedIn URLs from parsed staff data.

    Args:
        staff_data: Parsed staff JSON; may contain a 'staff' list of person
            dicts. Each person may store the URL under either 'linkedin_url'
            or 'linkedin_profile_url' (staff files are inconsistent).

    Returns:
        De-duplicated LinkedIn URLs in first-seen order.
    """
    urls: List[str] = []
    seen: Set[str] = set()  # O(1) membership instead of O(n) list scan

    for person in staff_data.get('staff', []):
        # Check both possible field names used across staff files.
        url = person.get('linkedin_url') or person.get('linkedin_profile_url')
        if url and url not in seen:
            seen.add(url)
            urls.append(url)

    return urls
def extract_slug_from_url(url: str) -> Optional[str]:
    """Return the LinkedIn profile slug embedded in *url*, or None if absent."""
    found = re.search(r"linkedin\.com/in/([a-zA-Z0-9\-]+)", url)
    return found.group(1) if found else None
def generate_filename(slug: str) -> str:
    """Build a UTC-timestamped JSON filename for *slug*.

    Example: 'john-doe_20240101T120000Z.json'
    """
    now_utc = datetime.now(timezone.utc)
    return "{}_{}.json".format(slug, now_utc.strftime("%Y%m%dT%H%M%SZ"))
def fetch_profile_with_exa(url: str, session: httpx.Client) -> Optional[Dict]:
    """Fetch LinkedIn profile data for *url* via an LLM extraction call.

    NOTE(review): despite the name, this does NOT hit an Exa API — it posts a
    chat-completion request to the z.ai GLM-4.6 endpoint and asks the model
    itself to return the profile as structured JSON. Reads ZAI_API_TOKEN from
    the environment on every call.

    Args:
        url: LinkedIn profile URL to extract.
        session: Shared httpx.Client used for the POST (caller owns timeout).

    Returns:
        The parsed profile dict on success; a fallback dict carrying the raw
        model output (flagged via 'extraction_error') when the model reply is
        not valid JSON; or None on a non-200 response or any exception.
    """
    try:
        # Single chat-completion request; the model is prompted to do the
        # profile extraction (no separate crawling step happens here).
        response = session.post(
            "https://api.z.ai/api/coding/paas/v4/chat/completions",
            headers={
                "Authorization": f"Bearer {os.environ.get('ZAI_API_TOKEN')}",
                "Content-Type": "application/json",
            },
            json={
                "model": "glm-4.6",
                "messages": [
                    {
                        "role": "system",
                        "content": "Extract LinkedIn profile data and return as structured JSON."
                    },
                    {
                        "role": "user",
                        "content": f"Extract complete profile information from: {url}\n\nReturn JSON with: name, headline, location, about, experience, education, skills, languages, profile_image_url"
                    }
                ],
                "temperature": 0.1,  # low temperature: favor deterministic extraction
                "max_tokens": 4000
            }
        )

        if response.status_code == 200:
            result = response.json()
            content = result['choices'][0]['message']['content']

            # Try to extract JSON from the model's reply.
            try:
                # Greedy brace match: spans from the first '{' to the last '}'
                # so JSON wrapped in prose or markdown fences still parses.
                json_match = re.search(r'\{.*\}', content, re.DOTALL)
                if json_match:
                    profile_data = json.loads(json_match.group())
                else:
                    profile_data = json.loads(content)
            except json.JSONDecodeError:
                # If JSON parsing fails, keep the raw content so nothing is
                # lost, and synthesize a display name from the URL slug.
                slug = extract_slug_from_url(url) or "unknown"
                profile_data = {
                    "raw_content": content,
                    "source_url": url,
                    "extraction_method": "glm-4.6-chat",
                    "name": slug.replace('-', ' ').title(),
                    "extraction_error": "Failed to parse JSON from LLM response"
                }

            return profile_data
        else:
            print(f"Error fetching {url}: {response.status_code}")
            return None

    except Exception as e:
        # Broad catch is deliberate: any transport error or unexpected
        # response shape is reported and treated as a fetch failure.
        print(f"Exception fetching {url}: {e}")
        return None
def save_profile(slug: str, profile_data: Dict, source_url: str, entity_dir: Path) -> str:
    """Persist *profile_data* for *slug* as schema-structured JSON.

    Writes into *entity_dir* using the timestamped filename convention and
    returns the filename (not the full path).
    """
    filename = generate_filename(slug)

    # Metadata envelope required by the entity schema.
    extraction_metadata = {
        "source_file": "staff_parsing",
        "staff_id": f"{slug}_profile",
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "extraction_method": "exa_crawling_glm46",
        "extraction_agent": "claude-opus-4.5",
        "linkedin_url": source_url,
        "cost_usd": 0,
        "request_id": hashlib.md5(source_url.encode()).hexdigest()
    }

    # Normalized profile section: absent string fields default to "",
    # absent list fields default to [].
    profile_section = {
        "name": profile_data.get("name", ""),
        "linkedin_url": source_url,
        "headline": profile_data.get("headline", ""),
        "location": profile_data.get("location", ""),
        "connections": profile_data.get("connections", ""),
        "about": profile_data.get("about", ""),
        "experience": profile_data.get("experience", []),
        "education": profile_data.get("education", []),
        "skills": profile_data.get("skills", []),
        "languages": profile_data.get("languages", []),
        "profile_image_url": profile_data.get("profile_image_url", "")
    }

    document = {
        "extraction_metadata": extraction_metadata,
        "profile_data": profile_section,
    }

    with open(entity_dir / filename, 'w', encoding='utf-8') as out:
        json.dump(document, out, indent=2, ensure_ascii=False)

    return filename
def load_existing_profiles(entity_dir: Path) -> Set[str]:
    """Return the set of profile slugs already saved under *entity_dir*.

    Creates the directory (including parents) when it does not yet exist.
    Only files matching '<slug>_<YYYYMMDD>T<HHMMSS>Z.json' are counted.
    """
    if not entity_dir.exists():
        entity_dir.mkdir(parents=True, exist_ok=True)
        return set()

    slug_pattern = re.compile(r"([a-zA-Z0-9\-]+)_\d{8}T\d{6}Z\.json")
    slugs: Set[str] = set()
    for json_file in entity_dir.glob("*.json"):
        hit = slug_pattern.match(json_file.name)
        if hit is not None:
            slugs.add(hit.group(1))

    return slugs
def load_all_staff_files(staff_dirs: List[str]) -> List[str]:
    """Collect unique LinkedIn URLs from every staff JSON file in *staff_dirs*.

    Missing directories and unreadable files are reported and skipped.
    Prints a summary and returns URLs de-duplicated in first-seen order.
    """
    all_urls: List[str] = []
    file_count = 0

    for staff_dir in staff_dirs:
        dir_path = Path(staff_dir)
        if not dir_path.exists():
            print(f"Warning: Directory not found: {staff_dir}")
            continue

        for file_path in dir_path.glob("*.json"):
            file_count += 1
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                all_urls.extend(extract_linkedin_urls(data))
            except Exception as e:
                print(f"Error loading {file_path}: {e}")

    # dict preserves insertion order (3.7+), so this de-dupes first-seen.
    unique_urls = list(dict.fromkeys(all_urls))

    print(f"\nProcessed {file_count} staff files")
    print(f"Found {len(all_urls)} total LinkedIn URLs")
    print(f"Found {len(unique_urls)} unique LinkedIn URLs")

    return unique_urls
def process_person(url: str, session: httpx.Client, existing_profiles: Set[str], entity_dir: Path) -> Tuple[str, bool, str]:
    """Fetch and persist one person's LinkedIn profile.

    Returns (url, success, detail): detail is the saved filename on success,
    otherwise the reason the URL was skipped or failed.
    """
    slug = extract_slug_from_url(url)
    if slug is None:
        return url, False, "No slug found"

    if slug in existing_profiles:
        # Already captured in a previous run — skip the API call entirely.
        return url, False, "Already exists"

    profile_data = fetch_profile_with_exa(url, session)
    if not profile_data:
        return url, False, "Failed to fetch"

    return url, True, save_profile(slug, profile_data, url, entity_dir)
def main():
    """Main entry point: load staff files, ask how many profiles to fetch,
    fetch them concurrently, and write a results log."""
    # Fail fast if the API token is missing — every fetch would 401 otherwise.
    if not os.environ.get('ZAI_API_TOKEN'):
        print("Error: ZAI_API_TOKEN environment variable not set")
        print("Please set it in your environment or .env file")
        print("\nTo set it temporarily:")
        print(" export ZAI_API_TOKEN=your_token_here")
        print(" python fetch_linkedin_profiles_complete.py")
        sys.exit(1)

    # Setup paths.
    # NOTE(review): absolute macOS-specific path — not portable; consider an
    # env var or CLI argument.
    entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

    # Load existing profiles (slugs) so we can skip already-fetched people.
    print("Loading existing profiles...")
    existing_profiles = load_existing_profiles(entity_dir)
    print(f"Found {len(existing_profiles)} existing profiles")

    # Path to the staff directory.
    # NOTE(review): hard-coded like entity_dir above — confirm before reuse.
    staff_dir = "/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed"

    # Load all staff files and collect unique LinkedIn URLs.
    print("\n" + "="*60)
    print("LOADING STAFF FILES")
    print("="*60)
    urls = load_all_staff_files([staff_dir])

    if not urls:
        print("No LinkedIn URLs found to process.")
        return

    # Filter out URLs whose slug is already present in the entity directory.
    new_urls = []
    for url in urls:
        slug = extract_slug_from_url(url)
        if slug and slug not in existing_profiles:
            new_urls.append(url)

    print(f"\nNeed to fetch {len(new_urls)} new profiles")
    print(f"Skipping {len(urls) - len(new_urls)} already existing profiles")

    if not new_urls:
        print("\nAll profiles already exist!")
        return

    # Ask user how many to process; blank input or Ctrl-C means "all".
    print(f"\nThere are {len(new_urls)} new profiles to fetch.")
    print("Enter number to process (or press Enter for all): ", end="")
    try:
        response = input()
        if response.strip():
            limit = int(response.strip())
            new_urls = new_urls[:limit]
            print(f"Processing first {len(new_urls)} profiles...")
        else:
            print("Processing all profiles...")
    except (ValueError, KeyboardInterrupt):
        print("\nProcessing all profiles...")

    # Process with threading (counters are only touched in this main thread,
    # as results are consumed one future at a time).
    success_count = 0
    failed_count = 0
    results = []

    print("\n" + "="*60)
    print("FETCHING PROFILES")
    print("="*60)

    # One shared HTTP client across 3 workers; modest parallelism to stay
    # polite to the API.
    with httpx.Client(timeout=60.0) as session:
        with ThreadPoolExecutor(max_workers=3) as executor:
            # Submit all tasks up front; map future -> url for reporting.
            future_to_url = {
                executor.submit(process_person, url, session, existing_profiles, entity_dir): url
                for url in new_urls
            }

            # Drain futures as they complete, updating the progress bar.
            with tqdm(total=len(new_urls), desc="Fetching profiles") as pbar:
                for future in as_completed(future_to_url):
                    url, success, result = future.result()
                    if success:
                        success_count += 1
                        results.append(f"✓ {url} -> {result}")
                    else:
                        failed_count += 1
                        results.append(f"✗ {url} -> {result}")
                    pbar.update(1)

    # Save results log in the current working directory.
    log_filename = f"fetch_log_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.txt"
    with open(log_filename, 'w') as f:
        f.write(f"LinkedIn Profile Fetch Results\n")
        f.write(f"Timestamp: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"Total attempted: {len(new_urls)}\n")
        f.write(f"Successful: {success_count}\n")
        f.write(f"Failed: {failed_count}\n")
        f.write(f"\n" + "="*60 + "\n")
        for result in results:
            f.write(result + "\n")

    # Final summary to stdout.
    print("\n" + "="*60)
    print("RESULTS")
    print("="*60)
    print(f"Successfully fetched: {success_count}")
    print(f"Failed: {failed_count}")
    print(f"Profiles saved to: {entity_dir}")
    print(f"Results log: {log_filename}")

    # Show a sample of successful fetches for a quick sanity check.
    if success_count > 0:
        print(f"\nFirst 5 successfully fetched profiles:")
        for i, result in enumerate([r for r in results if r.startswith("✓")][:5]):
            print(f" {i+1}. {result}")
# Run the fetch pipeline only when executed as a script (safe to import).
if __name__ == "__main__":
    main()