#!/usr/bin/env python3
"""
Enrich LinkedIn profiles for Eye Filmmuseum using Unipile API.
This script enriches the extracted LinkedIn URLs with profile data.
"""

import asyncio
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional

import httpx
import yaml
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Input file holding the bulk-extracted LinkedIn profiles.
PROFILES_FILE = Path(
    "/Users/kempersc/apps/glam/data/custodian/"
    "NL-NH-AMS-U-EFM-eye_filmmuseum_linkedin_enriched.yaml"
)
# Main Eye Filmmuseum record that receives the enriched data.
EYE_FILE = Path(
    "/Users/kempersc/apps/glam/data/custodian/"
    "NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"
)


def _utc_timestamp() -> str:
    """Return the current UTC time as an ISO-8601 string ending in 'Z'.

    The 'Z' suffix denotes UTC, so the value must be derived from an aware
    UTC datetime rather than from naive local time.
    """
    return datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')


class LinkedInProfileEnricher:
    """Enrich LinkedIn profiles using Unipile API."""

    def __init__(self, api_key: str, dsn: str = "api1.unipile.com:13111"):
        """Store the credentials and derive the API base URL.

        Args:
            api_key: Unipile API key, sent in the ``X-API-KEY`` header.
            dsn: Host (and port) of the Unipile API instance.
        """
        self.api_key = api_key
        self.dsn = dsn
        self.base_url = f"https://{dsn}/api/v1"
        self.headers = {
            "accept": "application/json",
            "X-API-KEY": api_key
        }
        # No HTTP client is created here: the lookups are awaited, so they
        # need an httpx.AsyncClient, which is opened (and closed) per run.

    def _new_client(self) -> httpx.AsyncClient:
        """Build an async HTTP client carrying the API headers."""
        return httpx.AsyncClient(timeout=60.0, headers=self.headers)

    async def enrich_profile(
        self,
        identifier: str,
        client: Optional[httpx.AsyncClient] = None,
    ) -> Optional[Dict[str, Any]]:
        """Enrich a single LinkedIn profile.

        Args:
            identifier: LinkedIn identifier appended to the ``/users/`` path.
            client: Async client to reuse. When omitted, a temporary client
                is opened for this single request and closed afterwards.

        Returns:
            The decoded JSON body on HTTP 200, otherwise ``None``.

        NOTE(review): the Unipile profile endpoint may additionally require
        an ``account_id`` query parameter - confirm against the API
        reference.
        """
        url = f"{self.base_url}/users/{identifier}"
        try:
            if client is None:
                async with self._new_client() as own_client:
                    response = await own_client.get(url)
            else:
                response = await client.get(url)

            if response.status_code == 200:
                return response.json()
            print(f"Error fetching {identifier}: {response.status_code}")
            return None
        except Exception as e:
            # Deliberate best-effort boundary: one failing profile must not
            # abort the whole run.
            print(f"Exception fetching {identifier}: {e}")
            return None

    @staticmethod
    def _build_record(
        profile: Dict[str, Any],
        identifier: str,
        result: Any,
    ) -> Dict[str, Any]:
        """Merge one lookup result (data, ``None`` or exception) into *profile*."""
        if isinstance(result, BaseException):
            # gather(return_exceptions=True) may also hand back
            # BaseException subclasses such as CancelledError.
            label = profile.get('name', identifier)
            print(f"Failed to enrich {label}: {result}")
            return {
                **profile,
                'enrichment_error': str(result),
                'enriched': False
            }
        if result:
            return {
                **profile,
                'enriched_data': result,
                'enriched': True,
                'enrichment_timestamp': _utc_timestamp()
            }
        return {
            **profile,
            'enriched': False,
            'enrichment_error': 'No data returned'
        }

    async def enrich_profiles(
        self,
        profiles: List[Dict[str, Any]],
        batch_size: int = 10,
        pause_seconds: float = 2,
    ) -> List[Dict[str, Any]]:
        """Enrich multiple LinkedIn profiles.

        Requests are issued one batch at a time; the coroutines of a batch
        are only created when that batch starts, so the pause between
        batches limits the request rate.

        Args:
            profiles: Profile dicts; each is looked up by its
                ``linkedin_identifier`` value.
            batch_size: Number of concurrent requests per batch.
            pause_seconds: Delay between two consecutive batches.

        Returns:
            One record per input profile, each a copy of the profile with an
            ``enriched`` flag plus either ``enriched_data`` or
            ``enrichment_error``. Profiles without an identifier are
            reported as failed instead of being dropped.

        Raises:
            ValueError: If *batch_size* is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        enriched: List[Dict[str, Any]] = []
        pending = []
        for profile in profiles:
            identifier = profile.get('linkedin_identifier')
            if identifier:
                pending.append((profile, identifier))
            else:
                enriched.append({
                    **profile,
                    'enriched': False,
                    'enrichment_error': 'Missing linkedin_identifier'
                })

        if not pending:
            return enriched

        total_batches = (len(pending) - 1) // batch_size + 1
        async with self._new_client() as client:
            for start in range(0, len(pending), batch_size):
                batch = pending[start:start + batch_size]
                print(f"Processing batch {start // batch_size + 1}/{total_batches}...")

                results = await asyncio.gather(
                    *(self.enrich_profile(identifier, client) for _, identifier in batch),
                    return_exceptions=True,
                )
                for (profile, identifier), result in zip(batch, results):
                    enriched.append(self._build_record(profile, identifier, result))

                # Rate limiting between batches
                if start + batch_size < len(pending):
                    await asyncio.sleep(pause_seconds)

        return enriched

    def extract_profile_info(self, api_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract relevant information from API response.

        Missing or ``null`` name parts are skipped when building
        ``full_name``, and missing or ``null`` list fields become ``[]``.
        """
        first_name = api_data.get('first_name')
        last_name = api_data.get('last_name')
        full_name = f"{first_name or ''} {last_name or ''}".strip()
        return {
            'first_name': first_name,
            'last_name': last_name,
            'full_name': full_name,
            'headline': api_data.get('headline'),
            'location': api_data.get('location'),
            'industry': api_data.get('industry'),
            'summary': api_data.get('summary'),
            'connections_count': api_data.get('connections_count'),
            'followers_count': api_data.get('followers_count'),
            'profile_url': api_data.get('profile_url'),
            'profile_image_url': api_data.get('profile_image_url'),
            'company': api_data.get('company'),
            'job_title': api_data.get('job_title'),
            'experience': api_data.get('experience') or [],
            'education': api_data.get('education') or [],
            'skills': api_data.get('skills') or [],
            'languages': api_data.get('languages') or []
        }


def _load_yaml(path: Path) -> Dict[str, Any]:
    """Load a YAML mapping from *path*; an empty file yields an empty dict."""
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}


def _apply_enrichment(
    obj: Any,
    enrichment_map: Dict[str, Dict[str, Any]],
    extract: Callable[[Dict[str, Any]], Dict[str, Any]],
) -> None:
    """Recursively attach enriched data next to every known ``linkedin_url``.

    Dict items are snapshotted before iterating because new keys are added
    to the same dict; mutating it mid-iteration raises ``RuntimeError``.
    """
    if isinstance(obj, dict):
        for key, value in list(obj.items()):
            if key == 'linkedin_url' and isinstance(value, str) and value in enrichment_map:
                obj['linkedin_enriched_data'] = extract(enrichment_map[value])
                obj['enrichment_timestamp'] = _utc_timestamp()
            elif isinstance(value, (dict, list)):
                _apply_enrichment(value, enrichment_map, extract)
    elif isinstance(obj, list):
        for item in obj:
            _apply_enrichment(item, enrichment_map, extract)


async def main():
    """Main function.

    Loads the extracted profiles, enriches them through the Unipile API,
    merges the results into the main Eye Filmmuseum record and writes the
    enriched YAML plus a JSON summary report next to it.
    """
    # Check for API credentials
    api_key = os.getenv('UNIPILE_API_KEY')
    if not api_key:
        print("\n" + "="*60)
        print("LINKEDIN PROFILE ENRICHER - UNIPILE API REQUIRED")
        print("="*60)
        print("\nERROR: UNIPILE_API_KEY environment variable not set")
        print("\nTo enrich LinkedIn profiles:")
        print("1. Sign up for Unipile free trial: https://dashboard.unipile.com/signup")
        print("2. Connect your LinkedIn account via Hosted Auth")
        print("3. Get your API key from dashboard")
        print("4. Set environment variable:")
        print(" export UNIPILE_API_KEY=your_api_key_here")
        print("\nAfter setting credentials, run this script again.")
        print("="*60)
        return

    # Initialize enricher
    dsn = os.getenv('UNIPILE_DSN', 'api1.unipile.com:13111')
    enricher = LinkedInProfileEnricher(api_key, dsn)

    # Load extracted LinkedIn profiles
    print("Loading extracted LinkedIn profiles...")
    data = _load_yaml(PROFILES_FILE)

    # Get the bulk extracted profiles ("or" also covers explicit nulls)
    linkedin_section = data.get('linkedin_enrichment') or {}
    bulk_extraction = linkedin_section.get('bulk_url_extraction') or {}
    profiles = bulk_extraction.get('profiles') or []

    if not profiles:
        print("No LinkedIn profiles found in extracted data!")
        return

    print(f"Found {len(profiles)} LinkedIn profiles to enrich")

    # Enrich profiles
    print("\nStarting enrichment with Unipile API...")
    enriched_profiles = await enricher.enrich_profiles(profiles)

    # Create enrichment summary
    successful = sum(1 for p in enriched_profiles if p.get('enriched', False))
    failed = len(enriched_profiles) - successful

    print("\nEnrichment complete!")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")

    # Update the main Eye Filmmuseum file with enriched data
    print("\nLoading main Eye Filmmuseum file...")
    eye_data = _load_yaml(EYE_FILE)

    # Add enriched LinkedIn data
    if 'linkedin_enrichment' not in eye_data:
        eye_data['linkedin_enrichment'] = {}

    # Create a mapping from LinkedIn URL to enriched data
    enrichment_map = {
        profile['linkedin_url']: profile['enriched_data']
        for profile in enriched_profiles
        if profile.get('enriched') and profile.get('linkedin_url')
    }

    # Update existing LinkedIn URLs with enriched data
    _apply_enrichment(eye_data, enrichment_map, enricher.extract_profile_info)

    # Add enrichment metadata
    eye_data['linkedin_enrichment']['api_enrichment'] = {
        'enriched_timestamp': _utc_timestamp(),
        'api_source': 'unipile',
        'total_profiles': len(profiles),
        'successful_enrichments': successful,
        'failed_enrichments': failed,
        # Report the endpoint that was actually called (honours UNIPILE_DSN).
        'api_endpoint': f"{enricher.base_url}/users/{{identifier}}",
        'notes': [
            "Profile enrichment completed via Unipile API",
            f"Successfully enriched {successful} out of {len(profiles)} profiles",
            "Failed profiles were marked with enrichment_error field"
        ]
    }

    # Update provenance
    provenance = eye_data.setdefault('provenance', {})
    provenance.setdefault('notes', []).append(
        f"LinkedIn API enrichment on {_utc_timestamp()} "
        f"({successful}/{len(profiles)} profiles successfully enriched)"
    )

    # Save enriched data
    output_file = EYE_FILE.with_name(EYE_FILE.stem + '_linkedin_api_enriched.yaml')
    print(f"\nSaving enriched data to: {output_file}")

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print("\nEnrichment complete!")
    print(f"Output saved to: {output_file}")

    # Create summary report
    report = {
        'enrichment_timestamp': _utc_timestamp(),
        'api_source': 'unipile',
        'total_profiles': len(profiles),
        'successful': successful,
        'failed': failed,
        'success_rate': f"{successful/len(profiles)*100:.1f}%" if profiles else "0%"
    }

    report_file = output_file.with_name(output_file.stem + '_enrichment_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    print(f"Report saved to: {report_file}")


if __name__ == "__main__":
    asyncio.run(main())