glam/scripts/fetch_linkedin_profiles_complete.py
2025-12-12 00:40:26 +01:00

336 lines
No EOL
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Complete LinkedIn profile fetching system that processes all staff directories
and fetches only new profiles not already in entity directory.
"""
import json
import os
import sys
import time
import hashlib
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
import subprocess
import httpx
from tqdm import tqdm
def extract_linkedin_urls(staff_data: Dict) -> List[str]:
    """Collect the unique LinkedIn URLs listed for people in *staff_data*.

    Looks at each entry under the ``staff`` key and accepts either the
    ``linkedin_url`` or the ``linkedin_profile_url`` field name, preserving
    the order of first appearance.
    """
    found: List[str] = []
    for person in staff_data.get('staff', []):
        # Staff files are inconsistent about which field name they use.
        candidate = person.get('linkedin_url') or person.get('linkedin_profile_url')
        if candidate and candidate not in found:
            found.append(candidate)
    return found
def extract_slug_from_url(url: str) -> Optional[str]:
    """Return the profile slug from a ``linkedin.com/in/<slug>`` URL.

    Returns None when the URL does not contain a recognizable slug.
    """
    hit = re.search(r"linkedin\.com/in/([a-zA-Z0-9\-]+)", url)
    return hit.group(1) if hit else None
def generate_filename(slug: str) -> str:
    """Build a profile filename of the form ``<slug>_<UTC-timestamp>.json``."""
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    return "{}_{}.json".format(slug, stamp)
def fetch_profile_with_exa(url: str, session: httpx.Client) -> Optional[Dict]:
    """Fetch profile data using Exa crawling API.

    NOTE(review): despite the name and the original comments, this posts to
    the z.ai GLM-4.6 chat-completions endpoint and asks the model to emit
    the profile as JSON -- there is no Exa crawl call here. Requires the
    ZAI_API_TOKEN environment variable (main() checks it before calling).

    Returns the parsed profile dict, a fallback dict wrapping the raw model
    output when the response text is not valid JSON, or None on any HTTP
    error or transport exception.
    """
    try:
        # POST to the z.ai chat-completions endpoint (see docstring note).
        response = session.post(
            "https://api.z.ai/api/coding/paas/v4/chat/completions",
            headers={
                "Authorization": f"Bearer {os.environ.get('ZAI_API_TOKEN')}",
                "Content-Type": "application/json",
            },
            json={
                "model": "glm-4.6",
                "messages": [
                    {
                        "role": "system",
                        "content": "Extract LinkedIn profile data and return as structured JSON."
                    },
                    {
                        "role": "user",
                        "content": f"Extract complete profile information from: {url}\n\nReturn JSON with: name, headline, location, about, experience, education, skills, languages, profile_image_url"
                    }
                ],
                "temperature": 0.1,
                "max_tokens": 4000
            }
        )
        if response.status_code == 200:
            result = response.json()
            content = result['choices'][0]['message']['content']
            # Try to extract JSON from response
            try:
                # Greedy DOTALL match grabs the outermost {...} span so prose
                # around the model's JSON does not break parsing.
                json_match = re.search(r'\{.*\}', content, re.DOTALL)
                if json_match:
                    profile_data = json.loads(json_match.group())
                else:
                    profile_data = json.loads(content)
            except json.JSONDecodeError:
                # If JSON parsing fails, create a structured response with raw content
                # so the fetch is still recorded; a name is derived from the URL slug.
                slug = extract_slug_from_url(url) or "unknown"
                profile_data = {
                    "raw_content": content,
                    "source_url": url,
                    "extraction_method": "glm-4.6-chat",
                    "name": slug.replace('-', ' ').title(),
                    "extraction_error": "Failed to parse JSON from LLM response"
                }
            return profile_data
        else:
            # Non-200 responses are reported but not retried.
            print(f"Error fetching {url}: {response.status_code}")
            return None
    except Exception as e:
        # Broad best-effort catch keeps one bad URL from killing the batch.
        print(f"Exception fetching {url}: {e}")
        return None
def save_profile(slug: str, profile_data: Dict, source_url: str, entity_dir: Path) -> str:
    """Persist *profile_data* into *entity_dir* using the canonical schema.

    The file is named ``<slug>_<UTC-timestamp>.json`` and contains an
    ``extraction_metadata`` section plus a normalized ``profile_data``
    section. Returns the generated filename.
    """
    filename = generate_filename(slug)
    target = entity_dir / filename

    metadata = {
        "source_file": "staff_parsing",
        "staff_id": f"{slug}_profile",
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "extraction_method": "exa_crawling_glm46",
        "extraction_agent": "claude-opus-4.5",
        "linkedin_url": source_url,
        # md5 of the URL acts as a stable, non-cryptographic request id.
        "request_id": hashlib.md5(source_url.encode()).hexdigest(),
    }
    metadata["cost_usd"] = 0
    # Reorder to match the on-disk schema exactly (cost_usd before request_id).
    metadata = {
        "source_file": metadata["source_file"],
        "staff_id": metadata["staff_id"],
        "extraction_date": metadata["extraction_date"],
        "extraction_method": metadata["extraction_method"],
        "extraction_agent": metadata["extraction_agent"],
        "linkedin_url": metadata["linkedin_url"],
        "cost_usd": metadata["cost_usd"],
        "request_id": metadata["request_id"],
    }

    profile = {
        "name": profile_data.get("name", ""),
        "linkedin_url": source_url,
        "headline": profile_data.get("headline", ""),
        "location": profile_data.get("location", ""),
        "connections": profile_data.get("connections", ""),
        "about": profile_data.get("about", ""),
        "experience": profile_data.get("experience", []),
        "education": profile_data.get("education", []),
        "skills": profile_data.get("skills", []),
        "languages": profile_data.get("languages", []),
        "profile_image_url": profile_data.get("profile_image_url", ""),
    }

    document = {"extraction_metadata": metadata, "profile_data": profile}
    target.write_text(
        json.dumps(document, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )
    return filename
def load_existing_profiles(entity_dir: Path) -> Set[str]:
    """Return the slugs that already have a saved profile in *entity_dir*.

    Creates the directory (and parents) when it is missing so later saves
    cannot fail on a nonexistent path; in that case the set is empty.
    """
    if not entity_dir.exists():
        entity_dir.mkdir(parents=True, exist_ok=True)
        return set()
    # Filenames follow <slug>_<YYYYMMDDTHHMMSSZ>.json; anything else is ignored.
    name_pattern = re.compile(r"([a-zA-Z0-9\-]+)_\d{8}T\d{6}Z\.json")
    slugs: Set[str] = set()
    for candidate in entity_dir.glob("*.json"):
        hit = name_pattern.match(candidate.name)
        if hit:
            slugs.add(hit.group(1))
    return slugs
def load_all_staff_files(staff_dirs: List[str]) -> List[str]:
    """Gather LinkedIn URLs from every staff JSON file under *staff_dirs*.

    Prints a short summary and returns the URLs de-duplicated in order of
    first appearance. Missing directories and unreadable files are reported
    and skipped rather than aborting the run.
    """
    collected: List[str] = []
    files_seen = 0
    for directory in staff_dirs:
        root = Path(directory)
        if not root.exists():
            print(f"Warning: Directory not found: {directory}")
            continue
        for json_file in root.glob("*.json"):
            files_seen += 1
            try:
                with open(json_file, 'r', encoding='utf-8') as handle:
                    payload = json.load(handle)
                collected.extend(extract_linkedin_urls(payload))
            except Exception as e:
                print(f"Error loading {json_file}: {e}")
    # dict.fromkeys keeps insertion order, de-duplicating in one pass.
    unique_urls = list(dict.fromkeys(collected))
    print(f"\nProcessed {files_seen} staff files")
    print(f"Found {len(collected)} total LinkedIn URLs")
    print(f"Found {len(unique_urls)} unique LinkedIn URLs")
    return unique_urls
def process_person(url: str, session: httpx.Client, existing_profiles: Set[str], entity_dir: Path) -> Tuple[str, bool, str]:
    """Fetch and store a single profile.

    Returns ``(url, success, detail)`` where *detail* is the saved filename
    on success, or a short reason string on skip/failure.
    """
    slug = extract_slug_from_url(url)
    if slug is None:
        return url, False, "No slug found"
    if slug in existing_profiles:
        # Skip work already done on a previous run.
        return url, False, "Already exists"
    fetched = fetch_profile_with_exa(url, session)
    if not fetched:
        return url, False, "Failed to fetch"
    return url, True, save_profile(slug, fetched, url, entity_dir)
def main():
    """Run the end-to-end LinkedIn profile fetch pipeline.

    Steps: verify the API credential, discover LinkedIn URLs in the staff
    files, skip profiles already saved, optionally cap the batch via stdin,
    fetch the remainder on a small thread pool, and write a results log.
    """
    # Fail fast when the API credential is missing.
    if not os.environ.get('ZAI_API_TOKEN'):
        print("Error: ZAI_API_TOKEN environment variable not set")
        print("Please set it in your environment or .env file")
        print("\nTo set it temporarily:")
        print(" export ZAI_API_TOKEN=your_token_here")
        print(" python fetch_linkedin_profiles_complete.py")
        sys.exit(1)

    # NOTE(review): paths are hard-coded to one developer's machine; these
    # should become CLI arguments or environment-configurable settings.
    entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

    print("Loading existing profiles...")
    existing_profiles = load_existing_profiles(entity_dir)
    print(f"Found {len(existing_profiles)} existing profiles")

    staff_dir = "/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed"

    print("\n" + "="*60)
    print("LOADING STAFF FILES")
    print("="*60)
    urls = load_all_staff_files([staff_dir])
    if not urls:
        print("No LinkedIn URLs found to process.")
        return

    # Keep only URLs whose slug is not already on disk.
    new_urls = []
    for url in urls:
        slug = extract_slug_from_url(url)
        if slug and slug not in existing_profiles:
            new_urls.append(url)
    print(f"\nNeed to fetch {len(new_urls)} new profiles")
    print(f"Skipping {len(urls) - len(new_urls)} already existing profiles")
    if not new_urls:
        print("\nAll profiles already exist!")
        return

    # Optional interactive cap on the batch size.
    print(f"\nThere are {len(new_urls)} new profiles to fetch.")
    print("Enter number to process (or press Enter for all): ", end="")
    try:
        response = input()
        if response.strip():
            limit = int(response.strip())
            new_urls = new_urls[:limit]
            print(f"Processing first {len(new_urls)} profiles...")
        else:
            print("Processing all profiles...")
    except (ValueError, KeyboardInterrupt):
        # A non-numeric answer or Ctrl-C at the prompt means "process all".
        print("\nProcessing all profiles...")

    success_count = 0
    failed_count = 0
    results = []    # every outcome, written to the log file
    successes = []  # successful fetches only, for the console summary
    print("\n" + "="*60)
    print("FETCHING PROFILES")
    print("="*60)
    # One shared client; 3 workers keeps request pressure on the API modest.
    with httpx.Client(timeout=60.0) as session:
        with ThreadPoolExecutor(max_workers=3) as executor:
            # Submit all tasks up front, then drain in completion order.
            future_to_url = {
                executor.submit(process_person, url, session, existing_profiles, entity_dir): url
                for url in new_urls
            }
            with tqdm(total=len(new_urls), desc="Fetching profiles") as pbar:
                for future in as_completed(future_to_url):
                    url, success, result = future.result()
                    line = f"{url} -> {result}"
                    results.append(line)
                    if success:
                        success_count += 1
                        successes.append(line)
                    else:
                        failed_count += 1
                    pbar.update(1)

    # Persist a plain-text run log in the current working directory.
    log_filename = f"fetch_log_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.txt"
    with open(log_filename, 'w') as f:
        f.write(f"LinkedIn Profile Fetch Results\n")
        f.write(f"Timestamp: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"Total attempted: {len(new_urls)}\n")
        f.write(f"Successful: {success_count}\n")
        f.write(f"Failed: {failed_count}\n")
        f.write(f"\n" + "="*60 + "\n")
        for result in results:
            f.write(result + "\n")

    print("\n" + "="*60)
    print("RESULTS")
    print("="*60)
    print(f"Successfully fetched: {success_count}")
    print(f"Failed: {failed_count}")
    print(f"Profiles saved to: {entity_dir}")
    print(f"Results log: {log_filename}")

    # BUG FIX: the original filtered with r.startswith(""), which is always
    # True, so failures were listed as "successfully fetched". Use the
    # dedicated successes list instead.
    if success_count > 0:
        print(f"\nFirst 5 successfully fetched profiles:")
        for i, line in enumerate(successes[:5]):
            print(f" {i+1}. {line}")
# Script entry point: run the fetch pipeline when executed directly.
if __name__ == "__main__":
    main()