257 lines
No EOL
10 KiB
Python
257 lines
No EOL
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich LinkedIn profiles for Eye Filmmuseum using Unipile API.
|
|
This script enriches the extracted LinkedIn URLs with profile data.
|
|
"""
|
|
|
|
import asyncio
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import httpx
import yaml
from dotenv import load_dotenv
|
|
|
|
# Read a local .env file (if any) into the process environment so that the
# os.getenv() lookups later in this script can see values defined there.
load_dotenv()
|
|
|
|
class LinkedInProfileEnricher:
    """Enrich LinkedIn profiles using the Unipile API.

    Profiles are fetched with ``GET {base_url}/users/{identifier}``; the API
    key is sent in the ``X-API-KEY`` request header.
    """

    def __init__(self, api_key: str, dsn: str = "api1.unipile.com:13111"):
        """Store the credentials and derive the base URL and request headers.

        Args:
            api_key: Unipile API key, sent as the ``X-API-KEY`` header.
            dsn: ``host:port`` of the Unipile server used to build ``base_url``.
        """
        self.api_key = api_key
        self.dsn = dsn
        self.base_url = f"https://{dsn}/api/v1"
        self.headers = {
            "accept": "application/json",
            "X-API-KEY": api_key,
        }
        # Synchronous client, kept only so existing code reading ``self.client``
        # keeps working. The async methods below must not ``await`` it: a sync
        # ``httpx.Client.get`` returns a plain Response, which is not awaitable.
        self.client = httpx.Client(timeout=60.0, headers=self.headers)

    @staticmethod
    def _utc_now_iso() -> str:
        """Return the current UTC time as an ISO-8601 string ending in ``Z``."""
        return datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    async def enrich_profile(
        self,
        identifier: str,
        client: Optional[Any] = None,
    ) -> Optional[Dict[str, Any]]:
        """Fetch a single LinkedIn profile.

        Args:
            identifier: Profile identifier appended to ``{base_url}/users/``.
            client: Optional async HTTP client whose ``get(url, headers=...)``
                is awaitable (e.g. ``httpx.AsyncClient``). When omitted, a
                temporary ``httpx.AsyncClient`` is opened for this request.

        Returns:
            The decoded JSON body when the server answers 200, otherwise
            ``None``. Errors are printed rather than raised (best effort).
        """
        url = f"{self.base_url}/users/{identifier}"
        try:
            if client is None:
                async with httpx.AsyncClient(
                    timeout=60.0, headers=self.headers
                ) as own_client:
                    response = await own_client.get(url)
            else:
                # Send the headers per request: a caller-supplied client is
                # not guaranteed to carry the API key already.
                response = await client.get(url, headers=self.headers)

            if response.status_code == 200:
                return response.json()
            print(f"Error fetching {identifier}: {response.status_code}")
            return None
        except Exception as e:
            # Deliberate best effort: one bad profile must not abort the run.
            print(f"Exception fetching {identifier}: {e}")
            return None

    def _build_result(
        self,
        profile: Dict[str, Any],
        identifier: str,
        result: Any,
    ) -> Dict[str, Any]:
        """Merge one fetch outcome into a copy of ``profile``."""
        # BaseException (not Exception) so a CancelledError handed back by
        # gather(return_exceptions=True) is not mistaken for profile data.
        if isinstance(result, BaseException):
            print(f"Failed to enrich {profile.get('name', identifier)}: {result}")
            return {
                **profile,
                'enrichment_error': str(result),
                'enriched': False,
            }
        if result:
            return {
                **profile,
                'enriched_data': result,
                'enriched': True,
                'enrichment_timestamp': self._utc_now_iso(),
            }
        return {
            **profile,
            'enriched': False,
            'enrichment_error': 'No data returned',
        }

    async def enrich_profiles(
        self,
        profiles: List[Dict[str, Any]],
        batch_size: int = 10,
        batch_delay: float = 2.0,
    ) -> List[Dict[str, Any]]:
        """Enrich multiple LinkedIn profiles in rate-limited batches.

        Args:
            profiles: Profile dicts; each is looked up by its
                ``linkedin_identifier`` value.
            batch_size: Maximum number of concurrent requests per batch.
            batch_delay: Seconds to sleep between consecutive batches.

        Returns:
            One dict per input profile, in input order. Each is a copy of the
            input plus ``enriched`` (bool) and either ``enriched_data`` and
            ``enrichment_timestamp`` or ``enrichment_error``. Profiles without
            a ``linkedin_identifier`` are reported as failed, not dropped.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        outcomes: Dict[int, Dict[str, Any]] = {}
        pending = []
        for index, profile in enumerate(profiles):
            identifier = profile.get('linkedin_identifier')
            if identifier:
                pending.append((index, profile, identifier))
            else:
                outcomes[index] = {
                    **profile,
                    'enriched': False,
                    'enrichment_error': 'No linkedin_identifier',
                }

        if pending:
            total_batches = (len(pending) - 1) // batch_size + 1
            async with httpx.AsyncClient(timeout=60.0, headers=self.headers) as client:
                for start in range(0, len(pending), batch_size):
                    batch = pending[start:start + batch_size]
                    print(f"Processing batch {start // batch_size + 1}/{total_batches}...")

                    # Coroutines are created per batch. Creating every task
                    # up front would start all requests at once and make the
                    # pause between batches meaningless.
                    results = await asyncio.gather(
                        *(
                            self.enrich_profile(identifier, client)
                            for _, _, identifier in batch
                        ),
                        return_exceptions=True,
                    )

                    for (index, profile, identifier), result in zip(batch, results):
                        outcomes[index] = self._build_result(profile, identifier, result)

                    # Rate limiting between batches (not after the last one).
                    if start + batch_size < len(pending):
                        await asyncio.sleep(batch_delay)

        return [outcomes[index] for index in range(len(profiles))]

    def extract_profile_info(self, api_data: Dict[str, Any]) -> Dict[str, Any]:
        """Pick the fields of interest out of an API response dict.

        Args:
            api_data: Decoded JSON object returned for one profile.

        Returns:
            A dict with a fixed set of keys. Scalar fields that are absent
            become ``None``; the list fields (``experience``, ``education``,
            ``skills``, ``languages``) default to ``[]``. ``full_name`` joins
            the non-empty name parts, so a null or missing part never shows up
            as the text "None".
        """
        first_name = api_data.get('first_name')
        last_name = api_data.get('last_name')
        full_name = ' '.join(
            str(part) for part in (first_name, last_name) if part
        ).strip()
        return {
            'first_name': first_name,
            'last_name': last_name,
            'full_name': full_name,
            'headline': api_data.get('headline'),
            'location': api_data.get('location'),
            'industry': api_data.get('industry'),
            'summary': api_data.get('summary'),
            'connections_count': api_data.get('connections_count'),
            'followers_count': api_data.get('followers_count'),
            'profile_url': api_data.get('profile_url'),
            'profile_image_url': api_data.get('profile_image_url'),
            'company': api_data.get('company'),
            'job_title': api_data.get('job_title'),
            'experience': api_data.get('experience', []),
            'education': api_data.get('education', []),
            'skills': api_data.get('skills', []),
            'languages': api_data.get('languages', []),
        }
|
|
|
|
def _utc_timestamp() -> str:
    """Return the current UTC time as an ISO-8601 string ending in ``Z``."""
    return datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')


def _apply_enrichment(node, enrichment_map, extract_profile_info):
    """Recursively attach enriched data next to matching ``linkedin_url`` keys.

    Every dict that holds a ``linkedin_url`` found in ``enrichment_map`` gains
    ``linkedin_enriched_data`` and ``enrichment_timestamp`` entries.
    """
    if isinstance(node, dict):
        # Iterate over a snapshot: keys are added to ``node`` below, and
        # growing a dict while iterating it raises RuntimeError.
        for key, value in list(node.items()):
            if key == 'linkedin_url' and isinstance(value, str) and value in enrichment_map:
                node['linkedin_enriched_data'] = extract_profile_info(enrichment_map[value])
                node['enrichment_timestamp'] = _utc_timestamp()
            elif isinstance(value, (dict, list)):
                _apply_enrichment(value, enrichment_map, extract_profile_info)
    elif isinstance(node, list):
        for item in node:
            _apply_enrichment(item, enrichment_map, extract_profile_info)


async def main():
    """Enrich the extracted Eye Filmmuseum LinkedIn profiles and save results.

    Steps: read ``UNIPILE_API_KEY`` / ``UNIPILE_DSN`` from the environment,
    load the bulk-extracted profiles from YAML, fetch each profile through
    ``LinkedInProfileEnricher``, merge the results into the main custodian
    YAML, and write an enriched YAML copy plus a JSON summary report.

    Returns ``None`` early (after printing instructions) when no API key is
    set, or when the extraction file contains no profiles.
    """
    # Check for API credentials
    api_key = os.getenv('UNIPILE_API_KEY')
    if not api_key:
        print("\n" + "="*60)
        print("LINKEDIN PROFILE ENRICHER - UNIPILE API REQUIRED")
        print("="*60)
        print("\nERROR: UNIPILE_API_KEY environment variable not set")
        print("\nTo enrich LinkedIn profiles:")
        print("1. Sign up for Unipile free trial: https://dashboard.unipile.com/signup")
        print("2. Connect your LinkedIn account via Hosted Auth")
        print("3. Get your API key from dashboard")
        print("4. Set environment variable:")
        print(" export UNIPILE_API_KEY=your_api_key_here")
        print("\nAfter setting credentials, run this script again.")
        print("="*60)
        return

    # Initialize enricher
    dsn = os.getenv('UNIPILE_DSN', 'api1.unipile.com:13111')
    enricher = LinkedInProfileEnricher(api_key, dsn)

    # Load extracted LinkedIn profiles
    profiles_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum_linkedin_enriched.yaml"

    print("Loading extracted LinkedIn profiles...")
    with open(profiles_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    # Get the bulk extracted profiles (tolerating null intermediate values)
    linkedin_section = data.get('linkedin_enrichment') or {}
    bulk_extraction = linkedin_section.get('bulk_url_extraction') or {}
    profiles = bulk_extraction.get('profiles') or []

    if not profiles:
        print("No LinkedIn profiles found in extracted data!")
        return

    print(f"Found {len(profiles)} LinkedIn profiles to enrich")

    # Enrich profiles
    print("\nStarting enrichment with Unipile API...")
    enriched_profiles = await enricher.enrich_profiles(profiles)

    # Create enrichment summary
    successful = sum(1 for p in enriched_profiles if p.get('enriched', False))
    failed = len(enriched_profiles) - successful

    print("\nEnrichment complete!")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")

    # Update the main Eye Filmmuseum file with enriched data
    eye_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"

    print("\nLoading main Eye Filmmuseum file...")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f) or {}

    # Map LinkedIn URL -> raw enriched data; profiles without a URL cannot be
    # matched back into the main file and are skipped instead of raising.
    enrichment_map = {
        profile['linkedin_url']: profile['enriched_data']
        for profile in enriched_profiles
        if profile.get('enriched') and profile.get('linkedin_url')
    }

    # Update existing LinkedIn URLs with enriched data
    _apply_enrichment(eye_data, enrichment_map, enricher.extract_profile_info)

    # Add enrichment metadata
    run_timestamp = _utc_timestamp()
    if not isinstance(eye_data.get('linkedin_enrichment'), dict):
        eye_data['linkedin_enrichment'] = {}
    eye_data['linkedin_enrichment']['api_enrichment'] = {
        'enriched_timestamp': run_timestamp,
        'api_source': 'unipile',
        'total_profiles': len(profiles),
        'successful_enrichments': successful,
        'failed_enrichments': failed,
        # Built from the configured DSN so the record matches the server that
        # was actually queried; "{identifier}" stays a literal placeholder.
        'api_endpoint': f"{enricher.base_url}/users/{{identifier}}",
        'notes': [
            "Profile enrichment completed via Unipile API",
            f"Successfully enriched {successful} out of {len(profiles)} profiles",
            "Failed profiles were marked with enrichment_error field",
        ],
    }

    # Update provenance
    if not isinstance(eye_data.get('provenance'), dict):
        eye_data['provenance'] = {}
    provenance = eye_data['provenance']
    if not isinstance(provenance.get('notes'), list):
        provenance['notes'] = []
    provenance['notes'].append(
        f"LinkedIn API enrichment on {run_timestamp} "
        f"({successful}/{len(profiles)} profiles successfully enriched)"
    )

    # Save enriched data
    output_file = eye_file.replace('.yaml', '_linkedin_api_enriched.yaml')
    print(f"\nSaving enriched data to: {output_file}")

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print("\nEnrichment complete!")
    print(f"Output saved to: {output_file}")

    # Create summary report
    report = {
        'enrichment_timestamp': _utc_timestamp(),
        'api_source': 'unipile',
        'total_profiles': len(profiles),
        'successful': successful,
        'failed': failed,
        'success_rate': f"{successful/len(profiles)*100:.1f}%" if profiles else "0%",
    }

    report_file = output_file.replace('.yaml', '_enrichment_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    print(f"Report saved to: {report_file}")
|
|
|
|
# Script entry point: run the async workflow to completion, but only when
# this file is executed directly rather than imported.
if __name__ == "__main__":
    asyncio.run(main())