#!/usr/bin/env python3
"""
Fetch LinkedIn data for Eye Filmmuseum staff via the Unipile API.

This script uses the Unipile API to:
1. Export the 1st-degree connections of the LinkedIn account that is
   connected to Unipile (Unipile cannot list another member's connections)
2. Get profile info for each Eye staff member with a LinkedIn URL
3. Store results for integration into foaf_knows network data

REQUIREMENTS:
1. Sign up for Unipile free trial: https://dashboard.unipile.com/signup
2. Connect your LinkedIn account via Hosted Auth
3. Set UNIPILE_API_KEY and UNIPILE_DSN in .env or environment

Author: AI Agent for GLAM project
Date: 2024-12-10
"""

import json
import os
import re
import time
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import httpx
import yaml
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Unipile API Configuration
UNIPILE_API_KEY = os.getenv("UNIPILE_API_KEY")
UNIPILE_DSN = os.getenv("UNIPILE_DSN", "api1.unipile.com:13111")

# Project paths
PROJECT_ROOT = Path(__file__).parent.parent
EYE_YAML_PATH = PROJECT_ROOT / "data" / "custodian" / "NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"
OUTPUT_DIR = PROJECT_ROOT / "data" / "linkedin_connections"

# Tried in order; the first pattern that matches wins. The capture stops at
# '/', '?' and '#' so query strings and fragments never leak into the slug.
_LINKEDIN_ID_PATTERNS = (
    re.compile(r'linkedin\.com/in/([^/?#]+)'),
    re.compile(r'linkedin\.com/pub/([^/?#]+)'),
)

# Printed at the end of every run to explain what the API can and cannot do.
_CONNECTION_SCRAPING_NOTE = """
Unipile's "List all relations" endpoint returns YOUR connections,
not the connections of arbitrary LinkedIn profiles.

To build a full network graph of Eye Filmmuseum staff connections:

1. MANUAL APPROACH:
   - View each staff member's profile on LinkedIn
   - Check "Mutual connections" section
   - Record shared connections manually

2. LINKEDIN SEARCH APPROACH:
   - Use Unipile's search API to find mutual connections
   - Query: people who are connected to BOTH you and the staff member

3. COLLABORATIVE APPROACH:
   - Have each staff member connect their LinkedIn to Unipile
   - Then fetch each person's connections directly

For the GLAM project, Option 1 (manual) combined with the profile data
fetched above may be sufficient for FOAF enrichment.
"""


@dataclass
class LinkedInConnection:
    """Represents a LinkedIn connection."""
    identifier: str  # public_identifier (vanity URL slug)
    name: str
    headline: Optional[str] = None
    profile_url: Optional[str] = None
    company: Optional[str] = None
    location: Optional[str] = None
    connected_at: Optional[str] = None  # If available from API


@dataclass
class StaffConnections:
    """Represents an Eye staff member and their LinkedIn connections."""
    staff_name: str
    linkedin_url: str
    linkedin_identifier: str
    connections: List[LinkedInConnection]
    scraped_at: str
    total_connections: int


def _utc_timestamp() -> str:
    """Return the current UTC time as an ISO-8601 string with a 'Z' suffix.

    Produces the same format as the deprecated
    ``datetime.utcnow().isoformat() + "Z"``.
    """
    # Drop tzinfo so isoformat() does not append "+00:00" before the "Z".
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"


def _dump_yaml(payload: Dict[str, Any], output_path: Path) -> None:
    """Write *payload* to *output_path* as block-style, key-order-preserving YAML."""
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(payload, f, default_flow_style=False, allow_unicode=True, sort_keys=False)


def extract_linkedin_identifier(url: str) -> Optional[str]:
    """
    Extract the public identifier (vanity URL) from a LinkedIn profile URL.

    Examples:
        https://www.linkedin.com/in/giovannafossati/ -> giovannafossati
        https://linkedin.com/in/julian-ross-6b91a812/ -> julian-ross-6b91a812

    Args:
        url: LinkedIn profile URL (``/in/`` or ``/pub/`` form).

    Returns:
        The path segment following ``/in/`` or ``/pub/``, without any query
        string or fragment, or None when the URL is empty or matches neither
        form.
    """
    if not url:
        return None
    for pattern in _LINKEDIN_ID_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1)
    return None


def load_eye_staff_linkedin_urls() -> List[Dict[str, str]]:
    """
    Load all LinkedIn URLs from the Eye Filmmuseum YAML file.

    Reads ``pico_staff.staff_observations[*].person_observed`` from
    EYE_YAML_PATH. Entries without a LinkedIn URL, or whose URL yields no
    identifier, are skipped.

    Returns list of {staff_name, linkedin_url, linkedin_identifier}.
    """
    with open(EYE_YAML_PATH, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    # Navigate to pico_staff section; `or` also covers keys that are
    # present but explicitly null in the YAML.
    pico_staff = data.get('pico_staff') or {}

    staff_urls = []
    for staff_entry in pico_staff.get('staff_observations') or []:
        person = staff_entry.get('person_observed') or {}
        linkedin_url = person.get('linkedin_url')
        if not linkedin_url:
            continue
        identifier = extract_linkedin_identifier(linkedin_url)
        if not identifier:
            continue
        staff_urls.append({
            'staff_name': person.get('name', 'Unknown'),
            'linkedin_url': linkedin_url,
            'linkedin_identifier': identifier,
        })

    return staff_urls


class UnipileClient:
    """Client for Unipile API.

    Wraps an ``httpx.Client`` that sends the API key in the ``X-API-KEY``
    header. Usable as a context manager; the HTTP client is closed on exit.
    """

    def __init__(self, api_key: str, dsn: str):
        self.api_key = api_key
        self.base_url = f"https://{dsn}/api/v1"
        self.headers = {
            "accept": "application/json",
            "X-API-KEY": api_key
        }
        self.client = httpx.Client(timeout=60.0, headers=self.headers)
        self.account_id = None

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.client.close()

    def list_accounts(self) -> List[Dict]:
        """List all connected accounts.

        Raises:
            httpx.HTTPStatusError: on a 4xx/5xx response.
        """
        response = self.client.get(f"{self.base_url}/accounts")
        response.raise_for_status()
        return response.json().get('items', [])

    def get_linkedin_account_id(self) -> Optional[str]:
        """Get the LinkedIn account ID (first LinkedIn account found)."""
        for account in self.list_accounts():
            if account.get('type') == 'LINKEDIN':
                return account.get('id')
        return None

    def get_own_relations(self, account_id: str, cursor: Optional[str] = None, limit: int = 100) -> Dict:
        """
        Get connections for the connected LinkedIn account.

        Note: This gets YOUR connections, not another user's connections.
        To get another user's connections, we need a different approach.

        Args:
            account_id: Unipile account ID for LinkedIn
            cursor: Pagination cursor
            limit: Max results per page

        Returns:
            The decoded JSON response body for one page.

        Raises:
            httpx.HTTPStatusError: on a 4xx/5xx response.
        """
        params = {
            "account_id": account_id,
            "limit": limit
        }
        if cursor:
            params["cursor"] = cursor

        response = self.client.get(
            f"{self.base_url}/users/relations",
            params=params
        )
        response.raise_for_status()
        return response.json()

    def get_profile(self, identifier: str, account_id: str) -> Dict:
        """
        Get a LinkedIn profile by identifier.

        Args:
            identifier: LinkedIn public identifier (vanity URL slug)
            account_id: Unipile account ID

        Returns:
            The decoded JSON response body.

        Raises:
            httpx.HTTPStatusError: on a 4xx/5xx response.
        """
        params = {"account_id": account_id}
        response = self.client.get(
            f"{self.base_url}/users/{identifier}",
            params=params
        )
        response.raise_for_status()
        return response.json()

    def get_all_relations(self, account_id: str) -> List[Dict]:
        """Get all connections with pagination handling.

        Follows the ``cursor`` field of each page until it is absent or a
        page comes back empty, sleeping one second between requests.
        """
        all_relations = []
        cursor = None

        while True:
            print(f" Fetching relations batch (cursor: {cursor or 'start'})...")
            result = self.get_own_relations(account_id, cursor=cursor)

            items = result.get('items', [])
            all_relations.extend(items)
            print(f" Got {len(items)} relations (total: {len(all_relations)})")

            # Check for next page
            cursor = result.get('cursor')
            if not cursor or not items:
                break

            # Rate limiting - be conservative
            time.sleep(1)

        return all_relations


def fetch_connections_for_staff(
    client: UnipileClient,
    staff: Dict[str, str],
    account_id: str
) -> Optional[StaffConnections]:
    """
    Fetch connections for a single staff member.

    Note: Unipile's "List all relations" endpoint returns YOUR connections,
    not connections of an arbitrary profile.

    To get connections of Eye staff:
    Option A: Use LinkedIn Search to find mutual connections
    Option B: Use the staff member's own LinkedIn account connected to Unipile

    For now, this function fetches profile info and prepares for later enrichment.

    Args:
        client: Open Unipile client.
        staff: Mapping with 'staff_name', 'linkedin_url' and
            'linkedin_identifier' keys.
        account_id: Unipile account ID used to perform the lookup.

    Returns:
        A StaffConnections record with an empty connection list, or None
        when the profile request returns an HTTP error status.
    """
    identifier = staff['linkedin_identifier']
    staff_name = staff['staff_name']

    print(f"\nProcessing: {staff_name} ({identifier})")

    try:
        # Get profile info first
        profile = client.get_profile(identifier, account_id)
    except httpx.HTTPStatusError as e:
        print(f" Error fetching profile: {e.response.status_code}")
        return None

    print(f" Profile found: {profile.get('first_name')} {profile.get('last_name')}")
    print(f" Headline: {profile.get('headline', 'N/A')}")

    # Note: We can't get THEIR connections directly via Unipile
    # We can only get OUR (the connected account's) connections
    # For a full network analysis, you'd need each person to connect their own account
    return StaffConnections(
        staff_name=staff_name,
        linkedin_url=staff['linkedin_url'],
        linkedin_identifier=identifier,
        connections=[],  # Would need different approach to populate
        scraped_at=_utc_timestamp(),
        total_connections=0
    )


def export_own_connections_to_yaml(relations: List[Dict], output_path: Path) -> None:
    """Export your own LinkedIn connections to YAML for analysis.

    Args:
        relations: Relation dicts as returned by the Unipile relations
            endpoint; reads 'public_identifier', 'first_name', 'last_name',
            'headline' and 'location'.
        output_path: Destination YAML file (overwritten).
    """
    connections = []
    for rel in relations:
        # `or ''` also covers keys that are present but null, which would
        # otherwise render as the literal text "None".
        identifier = rel.get('public_identifier') or ''
        first_name = rel.get('first_name') or ''
        last_name = rel.get('last_name') or ''
        conn = LinkedInConnection(
            identifier=identifier,
            name=f"{first_name} {last_name}".strip(),
            headline=rel.get('headline'),
            # Without an identifier there is no valid profile URL to build.
            profile_url=f"https://www.linkedin.com/in/{identifier}/" if identifier else None,
            company=None,  # Would need to parse from headline or fetch profile
            location=rel.get('location'),
        )
        connections.append(asdict(conn))

    _dump_yaml(
        {
            'scraped_at': _utc_timestamp(),
            'total_connections': len(connections),
            'connections': connections,
        },
        output_path,
    )

    print(f"Exported {len(connections)} connections to {output_path}")


def _collect_staff_profiles(
    client: UnipileClient,
    staff_urls: List[Dict[str, str]],
    account_id: str,
) -> List[Dict[str, Any]]:
    """Fetch one profile per staff entry, recording failures instead of aborting."""
    profiles = []
    for i, staff in enumerate(staff_urls, 1):
        print(f"\n[{i}/{len(staff_urls)}] {staff['staff_name']}")
        record = {
            'staff_name': staff['staff_name'],
            'linkedin_url': staff['linkedin_url'],
            'linkedin_identifier': staff['linkedin_identifier'],
        }
        try:
            profile = client.get_profile(staff['linkedin_identifier'], account_id)
        except httpx.HTTPStatusError as e:
            print(f" Error: {e.response.status_code}")
            record['error'] = str(e)
        except httpx.RequestError as e:
            # Timeouts / connection failures: keep what was already fetched
            # and move on to the next staff member.
            print(f" Error: {e}")
            record['error'] = str(e)
        else:
            record['profile'] = {
                'first_name': profile.get('first_name'),
                'last_name': profile.get('last_name'),
                'headline': profile.get('headline'),
                'location': profile.get('location'),
                'industry': profile.get('industry'),
                'summary': profile.get('summary'),
                'connections_count': profile.get('connections_count'),
            }
            print(f" OK: {profile.get('first_name')} {profile.get('last_name')}")
        profiles.append(record)

        # Rate limiting
        time.sleep(2)  # Conservative rate limit

    return profiles


def main():
    """Main entry point.

    Interactively offers to (1) export the connected account's own
    connections and (2) fetch profile info for every Eye staff member with
    a LinkedIn URL, writing timestamped YAML files into OUTPUT_DIR.
    """
    print("=" * 60)
    print("LinkedIn Connections Fetcher via Unipile API")
    print("=" * 60)

    # Check credentials
    if not UNIPILE_API_KEY:
        print("\nERROR: UNIPILE_API_KEY not set!")
        print("\nTo use this script:")
        print("1. Sign up for Unipile free trial: https://dashboard.unipile.com/signup")
        print("2. Connect your LinkedIn account via Hosted Auth")
        print("3. Get your API key from the dashboard")
        print("4. Add to .env: UNIPILE_API_KEY=your_api_key_here")
        print("5. Optionally set UNIPILE_DSN if different from default")
        return

    print(f"\nUsing DSN: {UNIPILE_DSN}")

    # Create output directory
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    with UnipileClient(UNIPILE_API_KEY, UNIPILE_DSN) as client:
        # Get LinkedIn account ID
        print("\nFetching connected accounts...")
        account_id = client.get_linkedin_account_id()

        if not account_id:
            print("ERROR: No LinkedIn account connected to Unipile!")
            print("Please connect your LinkedIn account via the Unipile dashboard.")
            return

        print(f"LinkedIn account ID: {account_id}")

        # Load Eye staff LinkedIn URLs
        print("\nLoading Eye Filmmuseum staff LinkedIn URLs...")
        staff_urls = load_eye_staff_linkedin_urls()
        print(f"Found {len(staff_urls)} staff with LinkedIn URLs")

        # OPTION 1: Export YOUR own connections
        # This is what Unipile's "List all relations" actually does
        print("\n" + "=" * 60)
        print("OPTION 1: Export YOUR LinkedIn connections")
        print("=" * 60)

        user_input = input("\nFetch your own connections? (y/n): ").strip().lower()
        if user_input == 'y':
            print("\nFetching your connections...")
            relations = client.get_all_relations(account_id)

            output_path = OUTPUT_DIR / f"my_connections_{datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
            export_own_connections_to_yaml(relations, output_path)

        # OPTION 2: Get profile info for each Eye staff member
        # Note: This doesn't get THEIR connections, just their profile
        print("\n" + "=" * 60)
        print("OPTION 2: Fetch Eye staff profile info")
        print("=" * 60)

        user_input = input("\nFetch Eye staff profiles? (y/n): ").strip().lower()
        if user_input == 'y':
            profiles = _collect_staff_profiles(client, staff_urls, account_id)

            # Export profiles
            output_path = OUTPUT_DIR / f"eye_staff_profiles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
            _dump_yaml(
                {
                    'scraped_at': _utc_timestamp(),
                    'total_profiles': len(profiles),
                    'profiles': profiles,
                },
                output_path,
            )

            print(f"\nExported {len(profiles)} profiles to {output_path}")

    print("\n" + "=" * 60)
    print("IMPORTANT NOTE ON CONNECTION SCRAPING")
    print("=" * 60)
    print(_CONNECTION_SCRAPING_NOTE)


if __name__ == "__main__":
    main()