glam/scripts/enrich_linkedin_profiles_unipile.py
2025-12-10 13:01:13 +01:00

257 lines
No EOL
10 KiB
Python

#!/usr/bin/env python3
"""
Enrich LinkedIn profiles for Eye Filmmuseum using Unipile API.
This script enriches the extracted LinkedIn URLs with profile data.
"""
import asyncio
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import httpx
import yaml
from dotenv import load_dotenv
# Load environment variables via python-dotenv before main() reads
# UNIPILE_API_KEY and UNIPILE_DSN with os.getenv().
load_dotenv()
class LinkedInProfileEnricher:
    """Fetch LinkedIn profile data through the Unipile REST API.

    Attributes:
        api_key: Unipile API key, sent in the ``X-API-KEY`` header.
        dsn: ``host:port`` of the Unipile API node.
        account_id: Optional Unipile account id; when set it is sent as the
            ``account_id`` query parameter on every profile request.
        base_url: ``https://{dsn}/api/v1``.
        headers: Default headers attached to every request.
    """

    def __init__(self, api_key: str, dsn: str = "api1.unipile.com:13111",
                 account_id: Optional[str] = None):
        """Store credentials and derive the base URL and request headers.

        No HTTP client is created here: clients are opened (and closed) by
        the async methods, so nothing is leaked if the instance is discarded.
        """
        self.api_key = api_key
        self.dsn = dsn
        # NOTE(review): Unipile's "retrieve a profile" endpoint documents an
        # ``account_id`` query parameter -- confirm whether it is mandatory
        # for your workspace and pass it here if so.
        self.account_id = account_id
        self.base_url = f"https://{dsn}/api/v1"
        self.headers = {
            "accept": "application/json",
            "X-API-KEY": api_key
        }

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as ISO-8601 with a ``Z`` suffix."""
        return datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    async def _fetch_profile(self, client, identifier: str) -> Optional[Dict[str, Any]]:
        """GET one profile using *client* (an ``httpx.AsyncClient``).

        Returns the decoded JSON body on HTTP 200, otherwise ``None``.
        Errors are reported on stdout and never raised: enrichment is
        best-effort and one bad profile must not abort the whole run.
        """
        url = f"{self.base_url}/users/{identifier}"
        params = {"account_id": self.account_id} if self.account_id else None
        try:
            response = await client.get(url, params=params)
            if response.status_code == 200:
                return response.json()
            print(f"Error fetching {identifier}: {response.status_code}")
            return None
        except Exception as e:
            print(f"Exception fetching {identifier}: {e}")
            return None

    async def enrich_profile(self, identifier: str) -> Optional[Dict[str, Any]]:
        """Enrich a single LinkedIn profile.

        Opens a short-lived async client for the request.

        Args:
            identifier: LinkedIn public identifier of the profile.

        Returns:
            The API response as a dict, or ``None`` if the request failed.
        """
        async with httpx.AsyncClient(timeout=60.0, headers=self.headers) as client:
            return await self._fetch_profile(client, identifier)

    async def enrich_profiles(self, profiles: List[Dict[str, Any]], *,
                              batch_size: int = 10,
                              delay_seconds: float = 2.0) -> List[Dict[str, Any]]:
        """Enrich multiple LinkedIn profiles in rate-limited batches.

        Args:
            profiles: Profile dicts; each should carry ``linkedin_identifier``.
            batch_size: Maximum number of requests in flight at once.
            delay_seconds: Pause between consecutive batches.

        Returns:
            One dict per input profile: a copy of the input plus ``enriched``
            (bool) and either ``enriched_data``/``enrichment_timestamp`` or
            ``enrichment_error``. Profiles lacking an identifier are reported
            first (as failures), followed by the requested ones in input order.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        enriched: List[Dict[str, Any]] = []
        pending: List[Dict[str, Any]] = []
        for profile in profiles:
            if profile.get('linkedin_identifier'):
                pending.append(profile)
            else:
                # Report instead of silently dropping, so callers' counts add up.
                enriched.append({
                    **profile,
                    'enriched': False,
                    'enrichment_error': 'Missing linkedin_identifier'
                })

        if not pending:
            return enriched

        total_batches = (len(pending) - 1) // batch_size + 1
        async with httpx.AsyncClient(timeout=60.0, headers=self.headers) as client:
            for batch_no, start in enumerate(range(0, len(pending), batch_size), 1):
                batch = pending[start:start + batch_size]
                print(f"Processing batch {batch_no}/{total_batches}...")
                # Coroutines are created per batch: creating them all up front
                # would start every request immediately and defeat the batching.
                results = await asyncio.gather(
                    *(self._fetch_profile(client, p['linkedin_identifier']) for p in batch),
                    return_exceptions=True,
                )
                for profile, result in zip(batch, results):
                    if isinstance(result, Exception):
                        label = profile.get('name', profile['linkedin_identifier'])
                        print(f"Failed to enrich {label}: {result}")
                        enriched.append({
                            **profile,
                            'enrichment_error': str(result),
                            'enriched': False
                        })
                    elif result:
                        enriched.append({
                            **profile,
                            'enriched_data': result,
                            'enriched': True,
                            'enrichment_timestamp': self._utc_timestamp()
                        })
                    else:
                        enriched.append({
                            **profile,
                            'enriched': False,
                            'enrichment_error': 'No data returned'
                        })
                # Rate limiting between batches (not after the last one)
                if start + batch_size < len(pending):
                    await asyncio.sleep(delay_seconds)
        return enriched

    def extract_profile_info(self, api_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract the relevant fields from a raw API response.

        Missing or null name parts are skipped when building ``full_name``;
        missing or null list fields are normalised to empty lists.
        """
        first_name = api_data.get('first_name')
        last_name = api_data.get('last_name')
        full_name = " ".join(str(part) for part in (first_name, last_name) if part).strip()
        return {
            'first_name': first_name,
            'last_name': last_name,
            'full_name': full_name,
            'headline': api_data.get('headline'),
            'location': api_data.get('location'),
            'industry': api_data.get('industry'),
            'summary': api_data.get('summary'),
            'connections_count': api_data.get('connections_count'),
            'followers_count': api_data.get('followers_count'),
            'profile_url': api_data.get('profile_url'),
            'profile_image_url': api_data.get('profile_image_url'),
            'company': api_data.get('company'),
            'job_title': api_data.get('job_title'),
            'experience': api_data.get('experience') or [],
            'education': api_data.get('education') or [],
            'skills': api_data.get('skills') or [],
            'languages': api_data.get('languages') or []
        }
def _utc_timestamp() -> str:
    """Return the current UTC time as ISO-8601 with a ``Z`` suffix."""
    return datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')


def _apply_enrichment(node, enrichment_map, extract_info, timestamp) -> None:
    """Recursively attach enriched data to every dict holding a known ``linkedin_url``.

    Children are visited over a snapshot of the values and the dict is only
    mutated afterwards: adding keys while iterating a dict raises
    ``RuntimeError: dictionary changed size during iteration``.
    """
    if isinstance(node, dict):
        for value in list(node.values()):
            if isinstance(value, (dict, list)):
                _apply_enrichment(value, enrichment_map, extract_info, timestamp)
        url = node.get('linkedin_url')
        if isinstance(url, str) and url in enrichment_map:
            node['linkedin_enriched_data'] = extract_info(enrichment_map[url])
            node['enrichment_timestamp'] = timestamp
    elif isinstance(node, list):
        for item in node:
            _apply_enrichment(item, enrichment_map, extract_info, timestamp)


async def main(
    profiles_file: str = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum_linkedin_enriched.yaml",
    eye_file: str = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml",
) -> None:
    """Enrich the extracted LinkedIn profiles and write the merged result.

    Steps: read the API key from the environment, load the extracted profile
    list, fetch each profile via Unipile, merge the results into the main
    custodian YAML, then write an enriched YAML copy plus a JSON report next
    to ``eye_file``.

    Args:
        profiles_file: YAML file holding
            ``linkedin_enrichment.bulk_url_extraction.profiles``.
        eye_file: Main custodian YAML file to merge the enrichment into.
    """
    # Check for API credentials
    api_key = os.getenv('UNIPILE_API_KEY')
    if not api_key:
        print("\n" + "="*60)
        print("LINKEDIN PROFILE ENRICHER - UNIPILE API REQUIRED")
        print("="*60)
        print("\nERROR: UNIPILE_API_KEY environment variable not set")
        print("\nTo enrich LinkedIn profiles:")
        print("1. Sign up for Unipile free trial: https://dashboard.unipile.com/signup")
        print("2. Connect your LinkedIn account via Hosted Auth")
        print("3. Get your API key from dashboard")
        print("4. Set environment variable:")
        print(" export UNIPILE_API_KEY=your_api_key_here")
        print("\nAfter setting credentials, run this script again.")
        print("="*60)
        return

    # Initialize enricher
    dsn = os.getenv('UNIPILE_DSN', 'api1.unipile.com:13111')
    enricher = LinkedInProfileEnricher(api_key, dsn)

    # Load extracted LinkedIn profiles
    print("Loading extracted LinkedIn profiles...")
    with open(profiles_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document
        data = yaml.safe_load(f) or {}

    # Get the bulk extracted profiles
    bulk_extraction = (data.get('linkedin_enrichment') or {}).get('bulk_url_extraction') or {}
    profiles = bulk_extraction.get('profiles') or []
    if not profiles:
        print("No LinkedIn profiles found in extracted data!")
        return
    print(f"Found {len(profiles)} LinkedIn profiles to enrich")

    # Enrich profiles
    print("\nStarting enrichment with Unipile API...")
    enriched_profiles = await enricher.enrich_profiles(profiles)

    # Create enrichment summary. Failures are counted against the input list
    # so that profiles which never produced a result are not lost.
    successful = sum(1 for p in enriched_profiles if p.get('enriched', False))
    failed = len(profiles) - successful
    print("\nEnrichment complete!")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")

    # Update the main Eye Filmmuseum file with enriched data
    print("\nLoading main Eye Filmmuseum file...")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f) or {}

    # Add enriched LinkedIn data
    if eye_data.get('linkedin_enrichment') is None:
        eye_data['linkedin_enrichment'] = {}

    # Create a mapping from LinkedIn URL to enriched data
    enrichment_map = {
        profile['linkedin_url']: profile['enriched_data']
        for profile in enriched_profiles
        if profile.get('enriched') and profile.get('linkedin_url')
    }

    # One timestamp for the whole run keeps every record consistent
    run_timestamp = _utc_timestamp()

    # Update existing LinkedIn URLs with enriched data
    _apply_enrichment(eye_data, enrichment_map, enricher.extract_profile_info, run_timestamp)

    # Add enrichment metadata
    eye_data['linkedin_enrichment']['api_enrichment'] = {
        'enriched_timestamp': run_timestamp,
        'api_source': 'unipile',
        'total_profiles': len(profiles),
        'successful_enrichments': successful,
        'failed_enrichments': failed,
        # Record the endpoint actually used (honours UNIPILE_DSN)
        'api_endpoint': f"{enricher.base_url}/users/{{identifier}}",
        'notes': [
            "Profile enrichment completed via Unipile API",
            f"Successfully enriched {successful} out of {len(profiles)} profiles",
            "Failed profiles were marked with enrichment_error field"
        ]
    }

    # Update provenance
    if eye_data.get('provenance') is None:
        eye_data['provenance'] = {}
    if eye_data['provenance'].get('notes') is None:
        eye_data['provenance']['notes'] = []
    eye_data['provenance']['notes'].append(
        f"LinkedIn API enrichment on {run_timestamp} "
        f"({successful}/{len(profiles)} profiles successfully enriched)"
    )

    # Save enriched data. Deriving the name from the stem guarantees the
    # output path differs from the input, whatever its extension.
    eye_path = Path(eye_file)
    output_file = eye_path.with_name(f"{eye_path.stem}_linkedin_api_enriched.yaml")
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print("\nEnrichment complete!")
    print(f"Output saved to: {output_file}")

    # Create summary report
    report = {
        'enrichment_timestamp': run_timestamp,
        'api_source': 'unipile',
        'total_profiles': len(profiles),
        'successful': successful,
        'failed': failed,
        'success_rate': f"{successful/len(profiles)*100:.1f}%" if profiles else "0%"
    }
    report_file = output_file.with_name(f"{output_file.stem}_enrichment_report.json")
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    print(f"Report saved to: {report_file}")
# Script entry point: drive the async main() coroutine to completion.
if __name__ == "__main__":
    asyncio.run(main())