421 lines
15 KiB
Python
421 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fetch LinkedIn connections for Eye Filmmuseum staff via Unipile API.
|
|
|
|
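This script uses the Unipile API to:
1. Export the 1st-degree connections of the LinkedIn account connected to Unipile
2. Get profile info for each Eye staff member with a LinkedIn URL
3. Store results for integration into foaf_knows network data

Note: Unipile only lists the connected account's own relations; it cannot
list the connections of arbitrary profiles (see the note printed by main()).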
|
|
|
|
REQUIREMENTS:
|
|
1. Sign up for Unipile free trial: https://dashboard.unipile.com/signup
|
|
2. Connect your LinkedIn account via Hosted Auth
|
|
3. Set UNIPILE_API_KEY and UNIPILE_DSN in .env or environment
|
|
|
|
Author: AI Agent for GLAM project
|
|
Date: 2024-12-10
|
|
"""
|
|
|
|
import os
import json
import time
import re
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, List, Any

import httpx
import yaml
from dotenv import load_dotenv
|
|
|
|
# Load environment variables (python-dotenv) before any setting is read
load_dotenv()

# Unipile API configuration: the key has no default, the DSN falls back to
# the host:port below when UNIPILE_DSN is not set.
UNIPILE_API_KEY = os.environ.get("UNIPILE_API_KEY")
UNIPILE_DSN = os.environ.get("UNIPILE_DSN", "api1.unipile.com:13111")

# Project paths, all derived from the directory two levels above this file
PROJECT_ROOT = Path(__file__).parent.parent
EYE_YAML_PATH = PROJECT_ROOT.joinpath(
    "data", "custodian", "NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"
)
OUTPUT_DIR = PROJECT_ROOT.joinpath("data", "linkedin_connections")
|
|
|
|
|
|
@dataclass
class LinkedInConnection:
    """Represents a LinkedIn connection.

    Only ``identifier`` and ``name`` are required; every other field is
    optional and defaults to None.
    """
    identifier: str  # public_identifier (vanity URL slug)
    name: str  # display name; export_own_connections_to_yaml builds it as "first_name last_name"
    headline: Optional[str] = None  # profile headline text
    profile_url: Optional[str] = None  # full LinkedIn profile URL
    company: Optional[str] = None  # export_own_connections_to_yaml always leaves this None
    location: Optional[str] = None  # location value as returned for the relation
    connected_at: Optional[str] = None  # If available from API; no code visible in this file sets it
|
|
|
|
|
|
@dataclass
class StaffConnections:
    """Represents an Eye staff member and their LinkedIn connections.

    All fields are required. fetch_connections_for_staff currently creates
    instances with an empty ``connections`` list and ``total_connections`` 0.
    """
    staff_name: str  # name of the staff member
    linkedin_url: str  # LinkedIn profile URL of the staff member
    linkedin_identifier: str  # vanity URL slug taken from linkedin_url
    connections: List[LinkedInConnection]  # connections recorded for this person
    scraped_at: str  # UTC timestamp string, ISO format with a trailing "Z"
    total_connections: int  # number of connections recorded
|
|
|
|
|
|
def extract_linkedin_identifier(url: str) -> Optional[str]:
    """
    Extract the public identifier (vanity URL slug) from a LinkedIn profile URL.

    Both ``/in/<slug>`` and ``/pub/<slug>`` paths are recognised, with
    ``/in/`` taking precedence. The slug stops at the first ``/``, ``?``,
    ``#`` or whitespace character, so query strings, fragments and stray
    whitespace never end up in the identifier. The host name is matched
    case-insensitively; the slug is returned exactly as written.

    Args:
        url: LinkedIn profile URL.

    Returns:
        The slug, or None when ``url`` is not a string or contains no
        recognisable profile path.

    Examples:
        https://www.linkedin.com/in/giovannafossati/ -> giovannafossati
        https://linkedin.com/in/julian-ross-6b91a812/ -> julian-ross-6b91a812
        https://www.linkedin.com/in/someone#about -> someone
    """
    # Values read from YAML may be None or non-string; treat those as "no URL".
    if not isinstance(url, str):
        return None

    patterns = (
        r'linkedin\.com/in/([^/?#\s]+)',
        r'linkedin\.com/pub/([^/?#\s]+)',
    )
    for pattern in patterns:
        match = re.search(pattern, url, re.IGNORECASE)
        if match:
            return match.group(1)
    return None
|
|
|
|
|
|
def load_eye_staff_linkedin_urls() -> List[Dict[str, str]]:
    """
    Load all LinkedIn URLs from the Eye Filmmuseum YAML file.

    Reads EYE_YAML_PATH and walks ``pico_staff -> staff_observations``,
    taking ``name`` and ``linkedin_url`` from each entry's
    ``person_observed`` mapping. Entries without a LinkedIn URL, or whose URL
    yields no identifier, are skipped. Sections that are missing or
    explicitly null in the YAML are treated as empty instead of raising.

    Returns:
        List of {staff_name, linkedin_url, linkedin_identifier} dicts.
        ``staff_name`` falls back to 'Unknown' when the name is missing or
        empty.

    Raises:
        OSError: if the YAML file cannot be opened.
    """
    with open(EYE_YAML_PATH, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    # `x.get(key) or default` also covers keys that are present but null,
    # which `x.get(key, default)` would pass through as None.
    pico_staff = data.get('pico_staff') or {}
    observations = pico_staff.get('staff_observations') or []

    staff_urls = []
    for staff_entry in observations:
        person = (staff_entry or {}).get('person_observed') or {}

        linkedin_url = person.get('linkedin_url')
        if not linkedin_url:
            continue

        identifier = extract_linkedin_identifier(linkedin_url)
        if not identifier:
            continue

        staff_urls.append({
            'staff_name': person.get('name') or 'Unknown',
            'linkedin_url': linkedin_url,
            'linkedin_identifier': identifier,
        })

    return staff_urls
|
|
|
|
|
|
class UnipileClient:
    """Client for Unipile API.

    Wraps an ``httpx.Client`` that sends the API key in the ``X-API-KEY``
    header on every request. Usable as a context manager; leaving the
    ``with`` block closes the underlying HTTP client.
    """

    def __init__(self, api_key: str, dsn: str):
        """
        Args:
            api_key: Unipile API key.
            dsn: Unipile DSN (``host:port``), used to build the base URL.
        """
        self.api_key = api_key
        self.base_url = f"https://{dsn}/api/v1"
        self.headers = {
            "accept": "application/json",
            "X-API-KEY": api_key,
        }
        self.client = httpx.Client(timeout=60.0, headers=self.headers)
        # Remembered by get_linkedin_account_id() once an account is found.
        self.account_id: Optional[str] = None

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.client.close()

    def list_accounts(self) -> List[Dict]:
        """List all connected accounts.

        Returns:
            The ``items`` list of the response, or an empty list when the
            key is missing or null.

        Raises:
            httpx.HTTPStatusError: on a 4xx/5xx response.
        """
        response = self.client.get(f"{self.base_url}/accounts")
        response.raise_for_status()
        return response.json().get('items') or []

    def get_linkedin_account_id(self) -> Optional[str]:
        """Get the LinkedIn account ID (first LinkedIn account found).

        The ID is also stored on ``self.account_id``. Returns None, leaving
        ``self.account_id`` untouched, when no account has type 'LINKEDIN'.
        """
        for account in self.list_accounts():
            if account.get('type') == 'LINKEDIN':
                self.account_id = account.get('id')
                return self.account_id
        return None

    def get_own_relations(self, account_id: str, cursor: Optional[str] = None, limit: int = 100) -> Dict:
        """
        Get connections for the connected LinkedIn account.

        Note: This gets YOUR connections, not another user's connections.
        To get another user's connections, we need a different approach.

        Args:
            account_id: Unipile account ID for LinkedIn
            cursor: Pagination cursor
            limit: Max results per page

        Returns:
            The decoded JSON body of one page.

        Raises:
            httpx.HTTPStatusError: on a 4xx/5xx response.
        """
        params = {
            "account_id": account_id,
            "limit": limit,
        }
        if cursor:
            params["cursor"] = cursor

        response = self.client.get(
            f"{self.base_url}/users/relations",
            params=params
        )
        response.raise_for_status()
        return response.json()

    def get_profile(self, identifier: str, account_id: str) -> Dict:
        """
        Get a LinkedIn profile by identifier.

        Args:
            identifier: LinkedIn public identifier (vanity URL slug)
            account_id: Unipile account ID

        Returns:
            The decoded JSON body of the response.

        Raises:
            httpx.HTTPStatusError: on a 4xx/5xx response.
        """
        params = {"account_id": account_id}
        response = self.client.get(
            f"{self.base_url}/users/{identifier}",
            params=params
        )
        response.raise_for_status()
        return response.json()

    def get_all_relations(self, account_id: str, limit: int = 100, page_delay: float = 1.0) -> List[Dict]:
        """Get all connections with pagination handling.

        Pages are requested until the response carries no cursor, carries no
        items, or repeats a cursor that was already returned (which would
        otherwise loop forever).

        Args:
            account_id: Unipile account ID for LinkedIn
            limit: Max results per page
            page_delay: Seconds to wait between page requests; 0 disables
                the wait.

        Returns:
            The items of all pages, in the order received.
        """
        all_relations: List[Dict] = []
        cursor: Optional[str] = None
        seen_cursors = set()

        while True:
            print(f" Fetching relations batch (cursor: {cursor or 'start'})...")
            result = self.get_own_relations(account_id, cursor=cursor, limit=limit)

            # A null "items" value must count as an empty page, not crash extend().
            items = result.get('items') or []
            all_relations.extend(items)

            print(f" Got {len(items)} relations (total: {len(all_relations)})")

            # Check for next page
            cursor = result.get('cursor')
            if not cursor or not items or cursor in seen_cursors:
                break
            seen_cursors.add(cursor)

            # Rate limiting - be conservative
            if page_delay > 0:
                time.sleep(page_delay)

        return all_relations
|
|
|
|
|
|
def fetch_connections_for_staff(
    client: UnipileClient,
    staff: Dict[str, str],
    account_id: str
) -> Optional[StaffConnections]:
    """
    Fetch connections for a single staff member.

    Note: Unipile's "List all relations" endpoint returns YOUR connections,
    not connections of an arbitrary profile. To get connections of Eye staff:

    Option A: Use LinkedIn Search to find mutual connections
    Option B: Use the staff member's own LinkedIn account connected to Unipile

    For now, this function fetches profile info and prepares for later enrichment.

    Args:
        client: Unipile API client.
        staff: Dict with 'staff_name', 'linkedin_url' and
            'linkedin_identifier' keys.
        account_id: Unipile account ID used for the profile lookup.

    Returns:
        A StaffConnections record with an empty connection list, or None
        when the profile request fails (HTTP error status, timeout or
        connection error).
    """
    identifier = staff['linkedin_identifier']
    staff_name = staff['staff_name']

    print(f"\nProcessing: {staff_name} ({identifier})")

    # Get profile info first
    try:
        profile = client.get_profile(identifier, account_id)
    except httpx.HTTPStatusError as e:
        print(f" Error fetching profile: {e.response.status_code}")
        return None
    except httpx.RequestError as e:
        # Timeouts and connection failures carry no response to report.
        print(f" Error fetching profile: {e}")
        return None

    print(f" Profile found: {profile.get('first_name')} {profile.get('last_name')}")
    print(f" Headline: {profile.get('headline', 'N/A')}")

    # Note: We can't get THEIR connections directly via Unipile
    # We can only get OUR (the connected account's) connections
    # For a full network analysis, you'd need each person to connect their own account

    return StaffConnections(
        staff_name=staff_name,
        linkedin_url=staff['linkedin_url'],
        linkedin_identifier=identifier,
        connections=[],  # Would need different approach to populate
        # Aware UTC "now", rendered in the same "...Z" form as before.
        scraped_at=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        total_connections=0
    )
|
|
|
|
|
|
def export_own_connections_to_yaml(relations: List[Dict], output_path: Path):
    """Export your own LinkedIn connections to YAML for analysis.

    Each relation dict is mapped onto a LinkedInConnection. Missing or null
    name parts are treated as empty strings, and ``profile_url`` is left as
    None when the relation has no ``public_identifier`` (instead of writing
    a URL with an empty slug).

    Args:
        relations: Relation dicts as returned by the Unipile relations
            listing; the keys read are 'public_identifier', 'first_name',
            'last_name', 'headline' and 'location'.
        output_path: Destination YAML file; its parent directory is created
            if needed.
    """
    connections = []

    for rel in relations:
        identifier = rel.get('public_identifier') or ''
        first_name = rel.get('first_name') or ''
        last_name = rel.get('last_name') or ''

        conn = LinkedInConnection(
            identifier=identifier,
            name=f"{first_name} {last_name}".strip(),
            headline=rel.get('headline'),
            profile_url=f"https://www.linkedin.com/in/{identifier}/" if identifier else None,
            company=None,  # Would need to parse from headline or fetch profile
            location=rel.get('location'),
        )
        connections.append(asdict(conn))

    output = {
        # Aware UTC "now", rendered in the same "...Z" form as before.
        'scraped_at': datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        'total_connections': len(connections),
        'connections': connections
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(output, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"Exported {len(connections)} connections to {output_path}")
|
|
|
|
|
|
def _collect_staff_profiles(client: "UnipileClient", staff_urls: List[Dict[str, str]], account_id: str) -> List[Dict]:
    """Fetch the profile of every staff entry, recording failures instead of aborting."""
    profiles = []
    total = len(staff_urls)

    for i, staff in enumerate(staff_urls, 1):
        print(f"\n[{i}/{total}] {staff['staff_name']}")

        record = {
            'staff_name': staff['staff_name'],
            'linkedin_url': staff['linkedin_url'],
            'linkedin_identifier': staff['linkedin_identifier'],
        }

        try:
            profile = client.get_profile(staff['linkedin_identifier'], account_id)
        except httpx.HTTPError as e:
            # HTTPStatusError carries a response; transport errors (timeouts,
            # connection failures) do not, so report the error class instead.
            if isinstance(e, httpx.HTTPStatusError):
                print(f" Error: {e.response.status_code}")
            else:
                print(f" Error: {type(e).__name__}")
            record['error'] = str(e)
        else:
            record['profile'] = {
                'first_name': profile.get('first_name'),
                'last_name': profile.get('last_name'),
                'headline': profile.get('headline'),
                'location': profile.get('location'),
                'industry': profile.get('industry'),
                'summary': profile.get('summary'),
                'connections_count': profile.get('connections_count'),
            }
            print(f" OK: {profile.get('first_name')} {profile.get('last_name')}")

        profiles.append(record)

        # Rate limiting: conservative pause between requests, not after the last one
        if i < total:
            time.sleep(2)

    return profiles


def main():
    """Main entry point.

    Interactive workflow:
      1. Check that UNIPILE_API_KEY is set and print setup help otherwise.
      2. Look up the LinkedIn account connected to Unipile.
      3. Load the Eye staff LinkedIn URLs from the YAML file.
      4. On request, export the connected account's own connections to YAML.
      5. On request, fetch each staff member's profile and export to YAML.
      6. Print a note on the limits of connection scraping via Unipile.

    HTTP failures are reported on stdout; a failed profile lookup is stored
    as an 'error' entry so the remaining profiles are still exported.
    """
    print("=" * 60)
    print("LinkedIn Connections Fetcher via Unipile API")
    print("=" * 60)

    # Check credentials
    if not UNIPILE_API_KEY:
        print("\nERROR: UNIPILE_API_KEY not set!")
        print("\nTo use this script:")
        print("1. Sign up for Unipile free trial: https://dashboard.unipile.com/signup")
        print("2. Connect your LinkedIn account via Hosted Auth")
        print("3. Get your API key from the dashboard")
        print("4. Add to .env: UNIPILE_API_KEY=your_api_key_here")
        print("5. Optionally set UNIPILE_DSN if different from default")
        return

    print(f"\nUsing DSN: {UNIPILE_DSN}")

    # Create output directory
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    with UnipileClient(UNIPILE_API_KEY, UNIPILE_DSN) as client:
        # Get LinkedIn account ID
        print("\nFetching connected accounts...")
        try:
            account_id = client.get_linkedin_account_id()
        except httpx.HTTPError as e:
            # e.g. invalid API key, wrong DSN, or no network
            print(f"ERROR: Could not list Unipile accounts: {e}")
            return

        if not account_id:
            print("ERROR: No LinkedIn account connected to Unipile!")
            print("Please connect your LinkedIn account via the Unipile dashboard.")
            return

        print(f"LinkedIn account ID: {account_id}")

        # Load Eye staff LinkedIn URLs
        print("\nLoading Eye Filmmuseum staff LinkedIn URLs...")
        staff_urls = load_eye_staff_linkedin_urls()
        print(f"Found {len(staff_urls)} staff with LinkedIn URLs")

        # OPTION 1: Export YOUR own connections
        # This is what Unipile's "List all relations" actually does
        print("\n" + "=" * 60)
        print("OPTION 1: Export YOUR LinkedIn connections")
        print("=" * 60)

        user_input = input("\nFetch your own connections? (y/n): ").strip().lower()
        if user_input == 'y':
            print("\nFetching your connections...")
            try:
                relations = client.get_all_relations(account_id)
            except httpx.HTTPError as e:
                # Report and carry on so Option 2 is still offered.
                print(f"ERROR: Could not fetch connections: {e}")
            else:
                output_path = OUTPUT_DIR / f"my_connections_{datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
                export_own_connections_to_yaml(relations, output_path)

        # OPTION 2: Get profile info for each Eye staff member
        # Note: This doesn't get THEIR connections, just their profile
        print("\n" + "=" * 60)
        print("OPTION 2: Fetch Eye staff profile info")
        print("=" * 60)

        user_input = input("\nFetch Eye staff profiles? (y/n): ").strip().lower()
        if user_input == 'y':
            profiles = _collect_staff_profiles(client, staff_urls, account_id)

            # Export profiles
            output_path = OUTPUT_DIR / f"eye_staff_profiles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
            with open(output_path, 'w', encoding='utf-8') as f:
                yaml.dump({
                    # Aware UTC "now", rendered in the same "...Z" form as before.
                    'scraped_at': datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
                    'total_profiles': len(profiles),
                    'profiles': profiles
                }, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

            print(f"\nExported {len(profiles)} profiles to {output_path}")

    print("\n" + "=" * 60)
    print("IMPORTANT NOTE ON CONNECTION SCRAPING")
    print("=" * 60)
    print("""
Unipile's "List all relations" endpoint returns YOUR connections,
not the connections of arbitrary LinkedIn profiles.

To build a full network graph of Eye Filmmuseum staff connections:

1. MANUAL APPROACH:
- View each staff member's profile on LinkedIn
- Check "Mutual connections" section
- Record shared connections manually

2. LINKEDIN SEARCH APPROACH:
- Use Unipile's search API to find mutual connections
- Query: people who are connected to BOTH you and the staff member

3. COLLABORATIVE APPROACH:
- Have each staff member connect their LinkedIn to Unipile
- Then fetch each person's connections directly

For the GLAM project, Option 1 (manual) combined with the profile
data fetched above may be sufficient for FOAF enrichment.
""")
|
|
|
|
|
|
# Run the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|