#!/usr/bin/env python3
"""
ULTIMATE LinkedIn extraction for Eye Filmmuseum.

This script performs the most comprehensive extraction of ALL LinkedIn URLs.
"""

import csv
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

import yaml

# Matches a LinkedIn URL fragment starting at the bare domain; whitespace and
# a closing parenthesis terminate the match so URLs embedded in prose or
# markdown-style links are captured cleanly. Compiled once at module level.
_LINKEDIN_URL_RE = re.compile(r'linkedin\.com/[^\s\)]+')


def ultimate_extract_linkedin_urls(data: Any, path: str = "") -> List[Dict[str, Any]]:
    """Recursively extract every LinkedIn URL found anywhere in *data*.

    Walks nested dicts, lists and strings. Each unique URL is reported once
    (first occurrence wins) as a dict with keys ``name``, ``linkedin_url``,
    ``path``, ``field`` and ``context``.

    Args:
        data: Arbitrary nested structure (typically ``yaml.safe_load`` output).
        path: Optional path prefix used in the reported ``path`` values.

    Returns:
        List of profile dicts, in document order of first occurrence.
    """
    urls: List[Dict[str, Any]] = []
    seen_urls: Set[str] = set()

    def extract_from_value(value: Any) -> List[str]:
        """Return ALL previously unseen, normalized LinkedIn URLs in *value*.

        BUG FIX: the original returned only the first unseen URL per string,
        silently dropping any further URLs in the same value.
        """
        if not isinstance(value, str):
            return []
        found: List[str] = []
        for match in _LINKEDIN_URL_RE.findall(value):
            url = match.strip()
            # findall() matches begin at the bare domain, so in practice the
            # scheme is always prepended; the http/'//' guards are kept for
            # safety should the pattern ever be widened.
            if url.startswith('http'):
                clean_url = url
            elif url.startswith('//'):
                clean_url = f"https:{url}"
            else:
                clean_url = f"https://{url}"
            if clean_url not in seen_urls:
                seen_urls.add(clean_url)
                found.append(clean_url)
        return found

    def find_name_in_context(obj: Dict) -> str:
        """Best-effort human-readable name for a URL found inside *obj*.

        Scans common name-like fields of the dict that directly contains the
        URL. (The original also tried to walk "parent" objects by splitting
        the top-level ``path`` argument — that code could never navigate
        anywhere and has been removed.)
        """
        name_fields = (
            'name', 'full_name', 'staff_name', 'person_name',
            'title', 'label', 'organization', 'company',
        )
        for field in name_fields:
            value = obj.get(field)
            if isinstance(value, str) and value.strip():
                return value.strip()
        return 'Unknown'

    def extract_from_object(obj: Any, context_path: str = "") -> None:
        """Depth-first walk recording every unseen LinkedIn URL."""
        if isinstance(obj, dict):
            for key, value in obj.items():
                current_path = f"{context_path}.{key}" if context_path else key
                # One pass per value: seen_urls already guarantees each URL is
                # recorded once, so the original's separate "linkedin"-key
                # branch (a de-duplicated no-op) is folded into this check.
                for url in extract_from_value(value):
                    urls.append({
                        'name': find_name_in_context(obj),
                        'linkedin_url': url,
                        'path': current_path,
                        'field': key,
                        # NOTE: stores a reference to the containing dict; if
                        # the result is re-embedded into the source document,
                        # serializers must tolerate shared references.
                        'context': obj,
                    })
                extract_from_object(value, current_path)
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                current_path = f"{context_path}[{i}]" if context_path else f"[{i}]"
                extract_from_object(item, current_path)
        elif isinstance(obj, str):
            # Standalone string (e.g. a list element).
            # BUG FIX: report the current walk position instead of the stale
            # top-level ``path`` argument.
            for url in extract_from_value(obj):
                urls.append({
                    'name': 'Unknown',
                    'linkedin_url': url,
                    'path': context_path or path,
                    'field': 'string_value',
                    'context': obj,
                })

    extract_from_object(data, path)
    return urls


def create_ultimate_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create the ultimate LinkedIn enrichment structure.

    Categorizes extracted profiles into personal / company / unknown and
    wraps them with extraction statistics and a UTC timestamp.

    Args:
        linkedin_data: Profile dicts from :func:`ultimate_extract_linkedin_urls`.

    Returns:
        Enrichment dict ready to be embedded in the custodian YAML.
    """
    personal_profiles: List[Dict[str, Any]] = []
    company_profiles: List[Dict[str, Any]] = []
    unknown_profiles: List[Dict[str, Any]] = []

    for item in linkedin_data:
        profile = {
            'name': item.get('name', 'Unknown'),
            'linkedin_url': item.get('linkedin_url'),
            'path': item.get('path'),
            'field': item.get('field'),
        }
        if '/company/' in item['linkedin_url']:
            company_profiles.append(profile)
        elif item['name'] != 'Unknown':
            personal_profiles.append(profile)
        else:
            unknown_profiles.append(profile)

    named = sum(1 for p in linkedin_data if p['name'] != 'Unknown')
    # BUG FIX: the original appended a literal 'Z' (UTC designator) to a
    # *naive local* datetime.now(); use an actual UTC timestamp.
    timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    return {
        'extraction_timestamp': timestamp,
        'extraction_method': 'ultimate_deep_extraction_v3',
        'extraction_stats': {
            'total_profiles': len(linkedin_data),
            'personal_profiles': len(personal_profiles),
            'company_profiles': len(company_profiles),
            'unknown_profiles': len(unknown_profiles),
            'high_confidence': named,
            'medium_confidence': len(linkedin_data) - named,
        },
        'profiles_by_category': {
            'personal': personal_profiles,
            'company': company_profiles,
            'unknown': unknown_profiles,
        },
        'all_raw_data': linkedin_data,
    }


def main():
    """Run the extraction end-to-end and write YAML/JSON/CSV/report outputs."""
    # Path to Eye Filmmuseum file
    eye_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"

    print("=" * 80)
    print("ULTIMATE LINKEDIN EXTRACTION FOR EYE FILMMUSEUM")
    print("=" * 80)
    print(f"\nLoading data from: {eye_file}")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("\nPerforming ultimate deep extraction of ALL LinkedIn URLs...")
    linkedin_data = ultimate_extract_linkedin_urls(eye_data)
    print(f"\n✓ Found {len(linkedin_data)} LinkedIn profiles!")

    # Show breakdown by category
    personal = sum(
        1 for item in linkedin_data
        if '/company/' not in item['linkedin_url'] and item['name'] != 'Unknown'
    )
    company = sum(1 for item in linkedin_data if '/company/' in item['linkedin_url'])
    unknown = sum(1 for item in linkedin_data if item['name'] == 'Unknown')
    print(f" - Personal profiles: {personal}")
    print(f" - Company profiles: {company}")
    print(f" - Unknown names: {unknown}")

    # Show first 15 results
    print("\nFirst 15 profiles found:")
    for i, item in enumerate(linkedin_data[:15]):
        print(f" {i+1:2d}. {item.get('name', 'Unknown')}")
        print(f" URL: {item.get('linkedin_url', 'N/A')}")
        print(f" Path: {item.get('path', 'N/A')}")
        print(f" Field: {item.get('field', 'N/A')}")
        print()
    if len(linkedin_data) > 15:
        print(f" ... and {len(linkedin_data) - 15} more")

    # Create enrichment
    print("\nCreating ultimate enrichment structure...")
    enrichment = create_ultimate_enrichment(linkedin_data)

    # Add to existing data (merge with any previous enrichment)
    eye_data.setdefault('linkedin_enrichment', {})
    eye_data['linkedin_enrichment']['ultimate_extraction'] = enrichment
    eye_data['linkedin_enrichment']['extraction_notes'] = [
        f"Ultimate LinkedIn extraction completed on {enrichment['extraction_timestamp']}",
        f"Total profiles found: {enrichment['extraction_stats']['total_profiles']}",
        f"Personal profiles: {enrichment['extraction_stats']['personal_profiles']}",
        f"Company profiles: {enrichment['extraction_stats']['company_profiles']}",
        "Deep extraction scans ALL YAML fields including conservators, volunteers, interns",
        "Ready for API enrichment with Unipile when credentials are available",
    ]

    # Update provenance
    eye_data.setdefault('provenance', {}).setdefault('notes', []).append(
        f"Ultimate LinkedIn deep extraction on {enrichment['extraction_timestamp']}"
    )

    # Save enriched data
    output_file = eye_file.replace('.yaml', '_linkedin_ultimate.yaml')
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Save profiles-only file. default=str deliberately stringifies any
    # YAML-derived date/datetime values hiding in the raw 'context' objects,
    # which json cannot serialize natively.
    profiles_file = output_file.replace('.yaml', '_all_profiles.json')
    with open(profiles_file, 'w', encoding='utf-8') as f:
        json.dump({
            'extraction_timestamp': enrichment['extraction_timestamp'],
            'total_profiles': len(linkedin_data),
            'profiles': linkedin_data,
        }, f, indent=2, default=str)

    # Create comprehensive CSV.
    # BUG FIX: use the csv module so names/paths containing commas or quotes
    # are escaped correctly (the original f.write() rows produced malformed
    # CSV for such values).
    csv_file = output_file.replace('.yaml', '_profiles_ultimate.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "LinkedIn URL", "Type", "Path", "Field", "Confidence"])
        for item in linkedin_data:
            profile_type = 'company' if '/company/' in item['linkedin_url'] else 'personal'
            confidence = 'high' if item.get('name', 'Unknown') != 'Unknown' else 'medium'
            writer.writerow([
                item.get('name', 'Unknown'),
                item.get('linkedin_url', 'N/A'),
                profile_type,
                item.get('path', 'N/A'),
                item.get('field', 'N/A'),
                confidence,
            ])

    # Create detailed report
    report = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'method': 'ultimate_deep_extraction_v3',
        'stats': enrichment['extraction_stats'],
        'files_created': {
            'main_yaml': output_file,
            'profiles_json': profiles_file,
            'profiles_csv': csv_file,
        },
        'sample_profiles': linkedin_data[:20],  # First 20 as sample
    }
    report_file = output_file.replace('.yaml', '_ultimate_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, default=str)

    print("\n" + "=" * 80)
    print("ULTIMATE EXTRACTION COMPLETE!")
    print("=" * 80)
    print(f"Total LinkedIn profiles: {len(linkedin_data)}")
    print(f" - Personal: {enrichment['extraction_stats']['personal_profiles']}")
    print(f" - Company: {enrichment['extraction_stats']['company_profiles']}")
    print(f" - Unknown: {enrichment['extraction_stats']['unknown_profiles']}")
    print(f"\nFiles created:")
    print(f" 1. Main YAML: {output_file}")
    print(f" 2. Profiles JSON: {profiles_file}")
    print(f" 3. Profiles CSV: {csv_file}")
    print(f" 4. Report JSON: {report_file}")
    print("\n" + "=" * 80)
    print("READY FOR API ENRICHMENT")
    print("=" * 80)
    print("""
To enrich these profiles with detailed data using Unipile API:

1. Set up Unipile account:
   - Sign up: https://dashboard.unipile.com/signup
   - Connect your LinkedIn account via Hosted Auth
   - Get API key from dashboard

2. Set environment variables:
   export UNIPILE_API_KEY=your_api_key_here
   export UNIPILE_DSN=api1.unipile.com:13111

3. Run enrichment script:
   python scripts/enrich_linkedin_ultimate.py

This will fetch comprehensive profile data including:
- Full name and professional headline
- Location and industry
- Summary and about section
- Connection count and follower count
- Work experience history
- Education background
- Skills and languages
- Profile image URL

The enriched data will be seamlessly integrated into the Eye Filmmuseum YAML.
""")


if __name__ == "__main__":
    main()