glam/scripts/fetch_linkedin_connections_unipile.py
2025-12-10 13:01:13 +01:00

421 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Fetch LinkedIn connections for Eye Filmmuseum staff via Unipile API.
This script uses the Unipile API to:
1. Get profile info for each Eye staff member with a LinkedIn URL
2. Fetch all 1st-degree connections of the connected LinkedIn account
   (Unipile cannot list the connections of arbitrary profiles)
3. Store results for integration into foaf_knows network data
REQUIREMENTS:
1. Sign up for Unipile free trial: https://dashboard.unipile.com/signup
2. Connect your LinkedIn account via Hosted Auth
3. Set UNIPILE_API_KEY and UNIPILE_DSN in .env or environment
Author: AI Agent for GLAM project
Date: 2024-12-10
"""
import os
import json
import time
import httpx
import yaml
import re
from pathlib import Path
from datetime import datetime
from typing import Optional, Dict, List, Any
from dataclasses import dataclass, asdict
from dotenv import load_dotenv
# Load environment variables (python-dotenv) before the os.getenv() lookups
# below are evaluated.
load_dotenv()

# Unipile API Configuration
# UNIPILE_API_KEY has no default: it is None when unset, which main() reports
# as an error before doing any work.
UNIPILE_API_KEY = os.getenv("UNIPILE_API_KEY")
# DSN is interpolated into "https://{dsn}/api/v1" by UnipileClient.
UNIPILE_DSN = os.getenv("UNIPILE_DSN", "api1.unipile.com:13111")

# Project paths
# PROJECT_ROOT is two directory levels above this script file.
PROJECT_ROOT = Path(__file__).parent.parent
# YAML file read by load_eye_staff_linkedin_urls().
EYE_YAML_PATH = PROJECT_ROOT / "data" / "custodian" / "NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"
# Directory that main() creates and writes its YAML exports into.
OUTPUT_DIR = PROJECT_ROOT / "data" / "linkedin_connections"
@dataclass
class LinkedInConnection:
    """One LinkedIn contact, flattened to plain string fields for export."""

    # --- required -----------------------------------------------------
    identifier: str  # public_identifier (vanity URL slug)
    name: str

    # --- optional; None when the value was not supplied ----------------
    headline: Optional[str] = None
    profile_url: Optional[str] = None
    company: Optional[str] = None
    location: Optional[str] = None
    connected_at: Optional[str] = None  # If available from API
@dataclass
class StaffConnections:
    """An Eye staff member paired with the LinkedIn connections found for them."""

    # Who the record is about.
    staff_name: str
    linkedin_url: str
    linkedin_identifier: str

    # What was collected, when, and how many entries there are.
    connections: List[LinkedInConnection]
    scraped_at: str
    total_connections: int
def extract_linkedin_identifier(url: str) -> Optional[str]:
    """
    Extract the public identifier (vanity URL slug) from a LinkedIn profile URL.

    The identifier is the path segment that follows ``/in/`` (or the legacy
    ``/pub/``). Anything after it - further path segments, a query string,
    a ``#fragment`` or trailing whitespace - is not part of the identifier.

    Examples:
        https://www.linkedin.com/in/giovannafossati/ -> giovannafossati
        https://linkedin.com/in/julian-ross-6b91a812/ -> julian-ross-6b91a812
        https://www.linkedin.com/in/jane-doe#experience -> jane-doe

    Args:
        url: LinkedIn profile URL. A non-string value yields None instead of
            raising, so raw values loaded from YAML can be passed directly.

    Returns:
        The identifier, or None when the URL contains no profile path.
    """
    if not isinstance(url, str):
        return None

    # Stop at '/', '?', '#' or whitespace so fragments and stray newlines do
    # not leak into the identifier. /in/ is tried before /pub/, as before.
    patterns = (
        r'linkedin\.com/in/([^/?#\s]+)',
        r'linkedin\.com/pub/([^/?#\s]+)',
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
def load_eye_staff_linkedin_urls() -> List[Dict[str, str]]:
    """
    Load all LinkedIn URLs from the Eye Filmmuseum YAML file.

    Reads EYE_YAML_PATH and walks ``pico_staff -> staff_observations ->
    person_observed``. Entries without a ``linkedin_url``, or whose URL does
    not yield an identifier, are skipped. Missing or null sections are
    treated as empty rather than raising.

    Returns:
        List of {staff_name, linkedin_url, linkedin_identifier} dicts, in
        file order.

    Raises:
        OSError: if the YAML file cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    with open(EYE_YAML_PATH, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    # `or` fallbacks (rather than .get defaults) also cover keys that are
    # present in the YAML but explicitly null.
    pico_staff = data.get('pico_staff') or {}
    observations = pico_staff.get('staff_observations') or []

    staff_urls = []
    for staff_entry in observations:
        person = (staff_entry or {}).get('person_observed') or {}
        linkedin_url = person.get('linkedin_url')
        if not linkedin_url:
            continue
        identifier = extract_linkedin_identifier(linkedin_url)
        if not identifier:
            continue
        staff_urls.append({
            'staff_name': person.get('name', 'Unknown'),
            'linkedin_url': linkedin_url,
            'linkedin_identifier': identifier
        })
    return staff_urls
class UnipileClient:
    """Small synchronous wrapper around the Unipile REST API (v1)."""

    def __init__(self, api_key: str, dsn: str):
        """
        Args:
            api_key: Value sent in the X-API-KEY header.
            dsn: Host (and port) interpolated into "https://{dsn}/api/v1".
        """
        self.api_key = api_key
        self.base_url = f"https://{dsn}/api/v1"
        self.headers = {
            "accept": "application/json",
            "X-API-KEY": api_key
        }
        self.client = httpx.Client(timeout=60.0, headers=self.headers)
        self.account_id = None

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Close the underlying HTTP client when leaving the `with` block.
        self.client.close()

    def _get_json(self, path: str, params: Optional[Dict] = None):
        """GET base_url + path, raise on an HTTP error status, return the decoded JSON."""
        reply = self.client.get(f"{self.base_url}{path}", params=params)
        reply.raise_for_status()
        return reply.json()

    def list_accounts(self) -> List[Dict]:
        """Return the 'items' list of the /accounts endpoint ([] if absent)."""
        return self._get_json("/accounts").get('items', [])

    def get_linkedin_account_id(self) -> Optional[str]:
        """Return the id of the first account whose type is 'LINKEDIN', else None."""
        return next(
            (
                entry.get('id')
                for entry in self.list_accounts()
                if entry.get('type') == 'LINKEDIN'
            ),
            None,
        )

    def get_own_relations(self, account_id: str, cursor: Optional[str] = None, limit: int = 100) -> Dict:
        """
        Fetch one page of relations for the connected LinkedIn account.

        Note: these are the connected account's OWN connections, not the
        connections of some other user.

        Args:
            account_id: Unipile account ID for LinkedIn
            cursor: Pagination cursor; omitted from the request when falsy
            limit: Max results per page
        """
        query = {"account_id": account_id, "limit": limit}
        if cursor:
            query["cursor"] = cursor
        return self._get_json("/users/relations", params=query)

    def get_profile(self, identifier: str, account_id: str) -> Dict:
        """
        Fetch a LinkedIn profile by identifier.

        Args:
            identifier: LinkedIn public identifier (vanity URL slug)
            account_id: Unipile account ID
        """
        return self._get_json(f"/users/{identifier}", params={"account_id": account_id})

    def get_all_relations(self, account_id: str) -> List[Dict]:
        """Collect every relation by following the pagination cursor page by page."""
        collected: List[Dict] = []
        next_cursor = None
        more_pages = True
        while more_pages:
            print(f" Fetching relations batch (cursor: {next_cursor or 'start'})...")
            page = self.get_own_relations(account_id, cursor=next_cursor)
            batch = page.get('items', [])
            collected.extend(batch)
            print(f" Got {len(batch)} relations (total: {len(collected)})")
            # Continue only while the API hands back a cursor AND the page had data.
            next_cursor = page.get('cursor')
            more_pages = bool(next_cursor) and bool(batch)
            if more_pages:
                # Rate limiting - be conservative
                time.sleep(1)
        return collected
def fetch_connections_for_staff(
    client: UnipileClient,
    staff: Dict[str, str],
    account_id: str
) -> Optional[StaffConnections]:
    """
    Fetch profile info for a single staff member and wrap it in a
    StaffConnections record with an empty connection list.

    Note: Unipile's "List all relations" endpoint returns YOUR connections,
    not connections of an arbitrary profile. To get connections of Eye staff:
      Option A: Use LinkedIn Search to find mutual connections
      Option B: Use the staff member's own LinkedIn account connected to Unipile
    For now, this function fetches profile info and prepares for later enrichment.

    Args:
        client: Client used to look up the profile.
        staff: Dict with 'staff_name', 'linkedin_url' and 'linkedin_identifier'.
        account_id: Unipile account ID used for the lookup.

    Returns:
        A StaffConnections with no connections and total_connections=0, or
        None when the profile request fails with an HTTP error status.
    """
    # Function-scope import: the module-level import only exposes `datetime`.
    from datetime import timezone

    identifier = staff['linkedin_identifier']
    staff_name = staff['staff_name']
    print(f"\nProcessing: {staff_name} ({identifier})")

    # Keep the try body to the one call that can raise HTTPStatusError.
    try:
        profile = client.get_profile(identifier, account_id)
    except httpx.HTTPStatusError as e:
        print(f" Error fetching profile: {e.response.status_code}")
        return None

    print(f" Profile found: {profile.get('first_name')} {profile.get('last_name')}")
    print(f" Headline: {profile.get('headline', 'N/A')}")

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop tzinfo so the "...Z" output format stays the same.
    scraped_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    # Note: We can't get THEIR connections directly via Unipile
    # We can only get OUR (the connected account's) connections
    # For a full network analysis, you'd need each person to connect their own account
    return StaffConnections(
        staff_name=staff_name,
        linkedin_url=staff['linkedin_url'],
        linkedin_identifier=identifier,
        connections=[],  # Would need different approach to populate
        scraped_at=scraped_at,
        total_connections=0
    )
def export_own_connections_to_yaml(relations: List[Dict], output_path: Path):
    """
    Export your own LinkedIn connections to YAML for analysis.

    Each relation dict is reduced to the LinkedInConnection fields. The
    output document has the keys 'scraped_at', 'total_connections' and
    'connections'.

    Args:
        relations: Relation dicts as returned by the relations endpoint;
            reads 'public_identifier', 'first_name', 'last_name',
            'headline' and 'location'.
        output_path: Destination file; overwritten if it exists.
    """
    # Function-scope import: the module-level import only exposes `datetime`.
    from datetime import timezone

    connections = []
    for rel in relations:
        # `or ''` also covers keys that are present but null; a bare
        # .get(key, '') would let None through and render as the text "None".
        identifier = rel.get('public_identifier') or ''
        first_name = rel.get('first_name') or ''
        last_name = rel.get('last_name') or ''
        conn = LinkedInConnection(
            identifier=identifier,
            name=f"{first_name} {last_name}".strip(),
            headline=rel.get('headline'),
            # No identifier -> no URL, instead of ".../in//".
            profile_url=f"https://www.linkedin.com/in/{identifier}/" if identifier else None,
            company=None,  # Would need to parse from headline or fetch profile
            location=rel.get('location'),
        )
        connections.append(asdict(conn))

    output = {
        # utcnow() is deprecated since Python 3.12; same "...Z" format as before.
        'scraped_at': datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
        'total_connections': len(connections),
        'connections': connections
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(output, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f"Exported {len(connections)} connections to {output_path}")
def main():
    """
    Interactive entry point.

    Steps:
      1. Verify UNIPILE_API_KEY is set (prints setup help and returns if not).
      2. Resolve the connected LinkedIn account and load the staff URL list.
      3. On request, export the connected account's own relations to YAML.
      4. On request, fetch each staff member's profile and export to YAML.
         A failed lookup (HTTP error status or transport failure) is recorded
         with an 'error' key and the loop continues, so one bad request does
         not discard the profiles already collected.
      5. Print a note on the limits of connection scraping via Unipile.
    """
    # Function-scope import: the module-level import only exposes `datetime`.
    from datetime import timezone

    print("=" * 60)
    print("LinkedIn Connections Fetcher via Unipile API")
    print("=" * 60)

    # Check credentials
    if not UNIPILE_API_KEY:
        print("\nERROR: UNIPILE_API_KEY not set!")
        print("\nTo use this script:")
        print("1. Sign up for Unipile free trial: https://dashboard.unipile.com/signup")
        print("2. Connect your LinkedIn account via Hosted Auth")
        print("3. Get your API key from the dashboard")
        print("4. Add to .env: UNIPILE_API_KEY=your_api_key_here")
        print("5. Optionally set UNIPILE_DSN if different from default")
        return

    print(f"\nUsing DSN: {UNIPILE_DSN}")

    # Create output directory
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    with UnipileClient(UNIPILE_API_KEY, UNIPILE_DSN) as client:
        # Get LinkedIn account ID
        print("\nFetching connected accounts...")
        account_id = client.get_linkedin_account_id()
        if not account_id:
            print("ERROR: No LinkedIn account connected to Unipile!")
            print("Please connect your LinkedIn account via the Unipile dashboard.")
            return
        print(f"LinkedIn account ID: {account_id}")

        # Load Eye staff LinkedIn URLs
        print("\nLoading Eye Filmmuseum staff LinkedIn URLs...")
        staff_urls = load_eye_staff_linkedin_urls()
        print(f"Found {len(staff_urls)} staff with LinkedIn URLs")

        # OPTION 1: Export YOUR own connections
        # This is what Unipile's "List all relations" actually does
        print("\n" + "=" * 60)
        print("OPTION 1: Export YOUR LinkedIn connections")
        print("=" * 60)
        user_input = input("\nFetch your own connections? (y/n): ").strip().lower()
        if user_input == 'y':
            print("\nFetching your connections...")
            relations = client.get_all_relations(account_id)
            output_path = OUTPUT_DIR / f"my_connections_{datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
            export_own_connections_to_yaml(relations, output_path)

        # OPTION 2: Get profile info for each Eye staff member
        # Note: This doesn't get THEIR connections, just their profile
        print("\n" + "=" * 60)
        print("OPTION 2: Fetch Eye staff profile info")
        print("=" * 60)
        user_input = input("\nFetch Eye staff profiles? (y/n): ").strip().lower()
        if user_input == 'y':
            profiles = []
            for i, staff in enumerate(staff_urls, 1):
                print(f"\n[{i}/{len(staff_urls)}] {staff['staff_name']}")
                try:
                    profile = client.get_profile(staff['linkedin_identifier'], account_id)
                except httpx.HTTPError as e:
                    # HTTPStatusError (4xx/5xx) carries a response with a
                    # status code; transport failures such as timeouts
                    # (RequestError) do not, so report the exception text.
                    if isinstance(e, httpx.HTTPStatusError):
                        print(f" Error: {e.response.status_code}")
                    else:
                        print(f" Error: {e}")
                    profiles.append({
                        'staff_name': staff['staff_name'],
                        'linkedin_url': staff['linkedin_url'],
                        'linkedin_identifier': staff['linkedin_identifier'],
                        'error': str(e)
                    })
                else:
                    profiles.append({
                        'staff_name': staff['staff_name'],
                        'linkedin_url': staff['linkedin_url'],
                        'linkedin_identifier': staff['linkedin_identifier'],
                        'profile': {
                            'first_name': profile.get('first_name'),
                            'last_name': profile.get('last_name'),
                            'headline': profile.get('headline'),
                            'location': profile.get('location'),
                            'industry': profile.get('industry'),
                            'summary': profile.get('summary'),
                            'connections_count': profile.get('connections_count'),
                        }
                    })
                    print(f" OK: {profile.get('first_name')} {profile.get('last_name')}")
                # Rate limiting
                time.sleep(2)  # Conservative rate limit

            # Export profiles
            output_path = OUTPUT_DIR / f"eye_staff_profiles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
            with open(output_path, 'w', encoding='utf-8') as f:
                yaml.dump({
                    # utcnow() is deprecated since Python 3.12; same "...Z" format.
                    'scraped_at': datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
                    'total_profiles': len(profiles),
                    'profiles': profiles
                }, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            print(f"\nExported {len(profiles)} profiles to {output_path}")

    print("\n" + "=" * 60)
    print("IMPORTANT NOTE ON CONNECTION SCRAPING")
    print("=" * 60)
    print("""
Unipile's "List all relations" endpoint returns YOUR connections,
not the connections of arbitrary LinkedIn profiles.
To build a full network graph of Eye Filmmuseum staff connections:
1. MANUAL APPROACH:
- View each staff member's profile on LinkedIn
- Check "Mutual connections" section
- Record shared connections manually
2. LINKEDIN SEARCH APPROACH:
- Use Unipile's search API to find mutual connections
- Query: people who are connected to BOTH you and the staff member
3. COLLABORATIVE APPROACH:
- Have each staff member connect their LinkedIn to Unipile
- Then fetch each person's connections directly
For the GLAM project, Option 1 (manual) combined with the profile
data fetched above may be sufficient for FOAF enrichment.
""")
# Run the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()