# glam/scripts/enrich_linkedin_profiles_unipile.py

#!/usr/bin/env python3
"""
Enrich LinkedIn profiles for Eye Filmmuseum using Unipile API.
This script enriches the extracted LinkedIn URLs with profile data.
"""

import asyncio
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import httpx
import yaml
from dotenv import load_dotenv

# Load environment variables via python-dotenv at import time, so settings
# read later with os.getenv (UNIPILE_API_KEY, UNIPILE_DSN) can be supplied
# without exporting them in the shell.
load_dotenv()

class LinkedInProfileEnricher:
    """Enrich LinkedIn profiles using the Unipile API.

    Profiles are fetched with ``GET {base_url}/users/{identifier}`` and
    authenticated through the ``X-API-KEY`` header.

    Attributes:
        api_key: Unipile API key sent with every request.
        dsn: ``host:port`` of the Unipile API server.
        base_url: Root URL of the v1 API, derived from ``dsn``.
        headers: Default HTTP headers sent with every request.
        client: Optional shared ``httpx.AsyncClient``. ``None`` (the default)
            means an async client is opened for each call instead.
    """

    # Per-request timeout in seconds.
    REQUEST_TIMEOUT = 60.0

    def __init__(self, api_key: str, dsn: str = "api1.unipile.com:13111"):
        """Store credentials and derive the API base URL.

        Args:
            api_key: Unipile API key.
            dsn: ``host:port`` of the Unipile API server.
        """
        self.api_key = api_key
        self.dsn = dsn
        self.base_url = f"https://{dsn}/api/v1"
        self.headers = {
            "accept": "application/json",
            "X-API-KEY": api_key
        }
        # Deliberately no eager client: a synchronous httpx.Client cannot be
        # awaited by the coroutines below and would never be closed. Callers
        # may assign an httpx.AsyncClient here to reuse one connection pool.
        self.client = None

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time in ISO-8601 form with a ``Z`` suffix."""
        # A timezone-aware "now" keeps the trailing "Z" truthful; a naive
        # datetime.now() is local time and must not be labelled as UTC.
        return datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    async def enrich_profile(
        self,
        identifier: str,
        client: Optional["httpx.AsyncClient"] = None,
    ) -> Optional[Dict[str, Any]]:
        """Fetch a single LinkedIn profile.

        Args:
            identifier: LinkedIn identifier placed in the request path.
            client: Async HTTP client to use. Falls back to ``self.client``
                and, if that is also ``None``, to a temporary client that is
                closed before returning.

        Returns:
            The decoded JSON body on HTTP 200, otherwise ``None``. Failures
            are printed rather than raised.
        """
        # NOTE(review): Unipile's "retrieve a profile" endpoint is documented
        # as taking an ``account_id`` query parameter — confirm whether it is
        # required for this account and add it if so.
        url = f"{self.base_url}/users/{identifier}"
        http = client if client is not None else self.client
        try:
            if http is None:
                async with httpx.AsyncClient(
                    timeout=self.REQUEST_TIMEOUT, headers=self.headers
                ) as temporary:
                    response = await temporary.get(url)
            else:
                response = await http.get(url)

            if response.status_code == 200:
                return response.json()
            print(f"Error fetching {identifier}: {response.status_code}")
            return None
        except Exception as e:
            # Deliberate best-effort boundary: one bad profile (network error,
            # invalid JSON, ...) must not abort the rest of the run.
            print(f"Exception fetching {identifier}: {e}")
            return None

    async def enrich_profiles(
        self,
        profiles: List[Dict[str, Any]],
        batch_size: int = 10,
        pause_seconds: float = 2,
    ) -> List[Dict[str, Any]]:
        """Enrich multiple LinkedIn profiles in rate-limited batches.

        Profiles without a truthy ``linkedin_identifier`` are skipped and do
        not appear in the result.

        Args:
            profiles: Profile dicts; ``linkedin_identifier`` selects the
                profile to fetch and ``name`` is used in failure messages.
            batch_size: Number of requests issued concurrently per batch.
            pause_seconds: Delay between consecutive batches.

        Returns:
            One dict per attempted profile: the original fields plus
            ``enriched`` and either ``enriched_data`` and
            ``enrichment_timestamp`` (success) or ``enrichment_error``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        pending = [
            (profile, profile.get('linkedin_identifier'))
            for profile in profiles
        ]
        pending = [(profile, ident) for profile, ident in pending if ident]
        if not pending:
            return []

        enriched = []
        total_batches = (len(pending) - 1) // batch_size + 1

        async with httpx.AsyncClient(
            timeout=self.REQUEST_TIMEOUT, headers=self.headers
        ) as client:
            for start in range(0, len(pending), batch_size):
                batch = pending[start:start + batch_size]
                print(f"Processing batch {start//batch_size + 1}/{total_batches}...")

                # Coroutines are created per batch so that only `batch_size`
                # requests are in flight at a time; creating every task up
                # front would start all requests immediately.
                results = await asyncio.gather(
                    *(self.enrich_profile(ident, client=client) for _, ident in batch),
                    return_exceptions=True,
                )

                for (profile, ident), result in zip(batch, results):
                    # BaseException also covers asyncio.CancelledError, which
                    # would otherwise be truthy and stored as profile data.
                    if isinstance(result, BaseException):
                        print(f"Failed to enrich {profile.get('name', ident)}: {result}")
                        enriched.append({
                            **profile,
                            'enrichment_error': str(result),
                            'enriched': False
                        })
                    elif result:
                        enriched.append({
                            **profile,
                            'enriched_data': result,
                            'enriched': True,
                            'enrichment_timestamp': self._utc_timestamp()
                        })
                    else:
                        enriched.append({
                            **profile,
                            'enriched': False,
                            'enrichment_error': 'No data returned'
                        })

                # Rate limiting between batches (not after the last one)
                if start + batch_size < len(pending):
                    await asyncio.sleep(pause_seconds)

        return enriched

    def extract_profile_info(self, api_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract the relevant fields from an API response.

        Args:
            api_data: Decoded JSON body returned for one profile.

        Returns:
            A flat dict of selected fields. Missing scalar fields are
            ``None``; missing or null list fields become ``[]``; and
            ``full_name`` is built from whichever name parts are present.
        """
        # `or ''` also covers explicit nulls, which would otherwise be
        # rendered as the literal text "None" in full_name.
        first_name = api_data.get('first_name') or ''
        last_name = api_data.get('last_name') or ''
        return {
            'first_name': api_data.get('first_name'),
            'last_name': api_data.get('last_name'),
            'full_name': f"{first_name} {last_name}".strip(),
            'headline': api_data.get('headline'),
            'location': api_data.get('location'),
            'industry': api_data.get('industry'),
            'summary': api_data.get('summary'),
            'connections_count': api_data.get('connections_count'),
            'followers_count': api_data.get('followers_count'),
            'profile_url': api_data.get('profile_url'),
            'profile_image_url': api_data.get('profile_image_url'),
            'company': api_data.get('company'),
            'job_title': api_data.get('job_title'),
            'experience': api_data.get('experience') or [],
            'education': api_data.get('education') or [],
            'skills': api_data.get('skills') or [],
            'languages': api_data.get('languages') or []
        }

def _utc_now_iso() -> str:
    """Return the current UTC time in ISO-8601 form with a ``Z`` suffix."""
    # A timezone-aware "now" keeps the trailing "Z" truthful; a naive
    # datetime.now() is local time and must not be labelled as UTC.
    return datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')


def _apply_enrichment(node, enrichment_map, extract_info, timestamp):
    """Recursively attach enriched data next to every known ``linkedin_url``.

    Args:
        node: Any YAML-loaded value; dicts and lists are walked in place.
        enrichment_map: Maps a LinkedIn URL to its raw API response.
        extract_info: Callable that reduces a raw API response to the fields
            stored in the output.
        timestamp: Value written to ``enrichment_timestamp``.
    """
    if isinstance(node, dict):
        # Iterate over a snapshot: keys are added to `node` below, and
        # changing a dict's size while iterating it raises RuntimeError.
        for key, value in list(node.items()):
            if key == 'linkedin_url' and isinstance(value, str) and value in enrichment_map:
                node['linkedin_enriched_data'] = extract_info(enrichment_map[value])
                node['enrichment_timestamp'] = timestamp
            elif isinstance(value, (dict, list)):
                _apply_enrichment(value, enrichment_map, extract_info, timestamp)
    elif isinstance(node, list):
        for item in node:
            _apply_enrichment(item, enrichment_map, extract_info, timestamp)


async def main():
    """Enrich the extracted Eye Filmmuseum LinkedIn profiles and save the result.

    Reads the bulk-extracted profiles from YAML, enriches them through the
    Unipile API, merges the results into a copy of the main custodian file,
    and writes that copy plus a JSON summary report. Returns early, after
    printing instructions, when ``UNIPILE_API_KEY`` is not set or when no
    profiles are found.
    """
    # Check for API credentials
    api_key = os.getenv('UNIPILE_API_KEY')
    if not api_key:
        print("\n" + "="*60)
        print("LINKEDIN PROFILE ENRICHER - UNIPILE API REQUIRED")
        print("="*60)
        print("\nERROR: UNIPILE_API_KEY environment variable not set")
        print("\nTo enrich LinkedIn profiles:")
        print("1. Sign up for Unipile free trial: https://dashboard.unipile.com/signup")
        print("2. Connect your LinkedIn account via Hosted Auth")
        print("3. Get your API key from dashboard")
        print("4. Set environment variable:")
        print("   export UNIPILE_API_KEY=your_api_key_here")
        print("\nAfter setting credentials, run this script again.")
        print("="*60)
        return

    # Initialize enricher
    dsn = os.getenv('UNIPILE_DSN', 'api1.unipile.com:13111')
    enricher = LinkedInProfileEnricher(api_key, dsn)

    # Load extracted LinkedIn profiles
    # NOTE(review): input/output locations are absolute paths on one
    # developer machine — consider making them configurable.
    profiles_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum_linkedin_enriched.yaml"

    print("Loading extracted LinkedIn profiles...")
    with open(profiles_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document
        data = yaml.safe_load(f) or {}

    # Get the bulk extracted profiles (tolerating null sections)
    linkedin_section = data.get('linkedin_enrichment') or {}
    bulk_extraction = linkedin_section.get('bulk_url_extraction') or {}
    profiles = bulk_extraction.get('profiles') or []

    if not profiles:
        print("No LinkedIn profiles found in extracted data!")
        return

    print(f"Found {len(profiles)} LinkedIn profiles to enrich")

    # Enrich profiles
    print("\nStarting enrichment with Unipile API...")
    enriched_profiles = await enricher.enrich_profiles(profiles)

    # Create enrichment summary
    successful = sum(1 for p in enriched_profiles if p.get('enriched', False))
    failed = len(enriched_profiles) - successful

    print("\nEnrichment complete!")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")

    # One timestamp for the whole run keeps every record of it consistent.
    timestamp = _utc_now_iso()

    # Update the main Eye Filmmuseum file with enriched data
    eye_file = Path("/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml")

    print("\nLoading main Eye Filmmuseum file...")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f) or {}

    # Add enriched LinkedIn data
    if eye_data.get('linkedin_enrichment') is None:
        eye_data['linkedin_enrichment'] = {}

    # Map each LinkedIn URL to its raw API response (successes only)
    enrichment_map = {
        profile['linkedin_url']: profile['enriched_data']
        for profile in enriched_profiles
        if profile.get('enriched') and profile.get('linkedin_url')
    }

    # Update existing LinkedIn URLs with enriched data
    _apply_enrichment(eye_data, enrichment_map, enricher.extract_profile_info, timestamp)

    # Add enrichment metadata
    # NOTE(review): the last note below says failed profiles are marked with
    # enrichment_error, but only successful enrichments are merged into this
    # file — confirm whether failures should be persisted as well.
    eye_data['linkedin_enrichment']['api_enrichment'] = {
        'enriched_timestamp': timestamp,
        'api_source': 'unipile',
        'total_profiles': len(profiles),
        'successful_enrichments': successful,
        'failed_enrichments': failed,
        # Record the endpoint actually used (honours UNIPILE_DSN and port)
        'api_endpoint': f"{enricher.base_url}/users/{{identifier}}",
        'notes': [
            "Profile enrichment completed via Unipile API",
            f"Successfully enriched {successful} out of {len(profiles)} profiles",
            "Failed profiles were marked with enrichment_error field"
        ]
    }

    # Update provenance
    if eye_data.get('provenance') is None:
        eye_data['provenance'] = {}
    if eye_data['provenance'].get('notes') is None:
        eye_data['provenance']['notes'] = []

    eye_data['provenance']['notes'].append(
        f"LinkedIn API enrichment on {timestamp} "
        f"({successful}/{len(profiles)} profiles successfully enriched)"
    )

    # Save enriched data next to the source file. Building the name from the
    # stem only touches the final extension, unlike str.replace('.yaml', ...).
    output_file = eye_file.with_name(f"{eye_file.stem}_linkedin_api_enriched.yaml")
    print(f"\nSaving enriched data to: {output_file}")

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print("\nEnrichment complete!")
    print(f"Output saved to: {output_file}")

    # Create summary report
    report = {
        'enrichment_timestamp': timestamp,
        'api_source': 'unipile',
        'total_profiles': len(profiles),
        'successful': successful,
        'failed': failed,
        'success_rate': f"{successful/len(profiles)*100:.1f}%" if profiles else "0%"
    }

    report_file = output_file.with_name(f"{output_file.stem}_enrichment_report.json")
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    print(f"Report saved to: {report_file}")

# Run the async entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())