#!/usr/bin/env python3
"""
Comprehensive LinkedIn enrichment for Eye Filmmuseum.

This script extracts all LinkedIn data from the Eye Filmmuseum YAML file and
creates a structured enrichment section, plus standalone JSON/CSV exports of
the unique LinkedIn profiles found.
"""

import csv
import json
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

# NOTE: `yaml` (PyYAML, third-party) is imported inside main() so that the
# pure extraction helpers in this module can be imported without PyYAML
# being installed.

# Staff/people sections of the YAML that may carry LinkedIn URLs.
SECTIONS = [
    'management', 'board_of_trustees', 'department_heads', 'former_directors',
    'chief_curator', 'collection_specialists', 'curators',
    'archivists_and_film_specialists', 'programmers', 'pico_staff',
    'deceased_staff'
]


def extract_all_linkedin_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """Extract all LinkedIn-related data from Eye Filmmuseum YAML.

    Args:
        data: Parsed YAML document (arbitrary nesting of dicts/lists).

    Returns:
        Mapping with one list per staff section (each entry describes one
        LinkedIn URL found), a 'company_page' dict (empty if none), and a
        'foaf_knows' list of all network connections that carry a URL.
    """
    linkedin_data: Dict[str, Any] = {section: [] for section in SECTIONS}
    linkedin_data['company_page'] = {}
    linkedin_data['foaf_knows'] = []

    def extract_linkedin_info(obj, path=""):
        """Depth-first search for the first LinkedIn URL under *obj*.

        Returns a dict describing the match (url, name, path, and — for
        foaf_knows entries — role/organization/relationship/wikidata_id),
        or an empty dict when nothing is found.
        """
        info: Dict[str, Any] = {}
        if isinstance(obj, dict):
            # Direct LinkedIn URL on the object itself.
            if 'linkedin_url' in obj:
                info['linkedin_url'] = obj['linkedin_url']
                info['name'] = obj.get(
                    'name', obj.get('person_observed', {}).get('name', 'Unknown'))
                info['path'] = path
            # A foaf_knows entry with a URL takes precedence over the direct
            # URL (preserves the original extraction priority).
            if 'foaf_knows' in obj and isinstance(obj['foaf_knows'], list):
                for foaf in obj['foaf_knows']:
                    if isinstance(foaf, dict) and 'linkedin_url' in foaf:
                        info['linkedin_url'] = foaf['linkedin_url']
                        info['name'] = foaf.get('name', foaf.get('type', 'Unknown'))
                        info['path'] = f"{path}.foaf_knows"
                        info['role'] = foaf.get('type')
                        info['organization'] = foaf.get('organization')
                        info['relationship'] = foaf.get('relationship')
                        info['wikidata_id'] = foaf.get('wikidata_id')
                        break
            # Bugfix: if this object itself yielded a match, return it now.
            # Previously a URL buried in a nested sub-object would override
            # the item's own URL.
            if info:
                return info
            # Otherwise recurse into nested values.
            for key, value in obj.items():
                if key not in ['linkedin_url', 'foaf_knows']:
                    nested = extract_linkedin_info(
                        value, f"{path}.{key}" if path else key)
                    if nested:
                        return nested
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                nested = extract_linkedin_info(
                    item, f"{path}[{i}]" if path else f"[{i}]")
                if nested:
                    return nested
        return info

    # Extract from each staff section (may be a dict of entries or a list).
    for section in SECTIONS:
        section_data = data.get(section, [])
        if isinstance(section_data, dict):
            # Single-entry sections keyed by name.
            for key, value in section_data.items():
                info = extract_linkedin_info(value, f"{section}.{key}")
                if info:
                    info['section_key'] = key
                    linkedin_data[section].append(info)
        elif isinstance(section_data, list):
            for i, item in enumerate(section_data):
                info = extract_linkedin_info(item, f"{section}[{i}]")
                if info:
                    info['section_index'] = i
                    linkedin_data[section].append(info)

    # Extract company-page LinkedIn info from a prior enrichment pass.
    if 'linkedin_enrichment' in data:
        company_data = data['linkedin_enrichment']
        if 'company_linkedin_url' in company_data:
            linkedin_data['company_page'] = {
                'linkedin_url': company_data['company_linkedin_url'],
                'employee_count': company_data.get(
                    'company_stats', {}).get('employee_count_linkedin'),
                'source': 'linkedin_enrichment.company_linkedin_url'
            }

    def extract_foaf_with_linkedin(obj, path=""):
        """Collect ALL foaf_knows entries (at any depth) that carry a URL."""
        results = []
        if isinstance(obj, dict):
            if 'foaf_knows' in obj and isinstance(obj['foaf_knows'], list):
                for foaf in obj['foaf_knows']:
                    if isinstance(foaf, dict) and 'linkedin_url' in foaf:
                        results.append({
                            'name': foaf.get('name'),
                            'type': foaf.get('type'),
                            'organization': foaf.get('organization'),
                            'relationship': foaf.get('relationship'),
                            'linkedin_url': foaf['linkedin_url'],
                            'wikidata_id': foaf.get('wikidata_id'),
                            'path': f"{path}.foaf_knows"
                        })
            for key, value in obj.items():
                results.extend(extract_foaf_with_linkedin(
                    value, f"{path}.{key}" if path else key))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                results.extend(extract_foaf_with_linkedin(
                    item, f"{path}[{i}]" if path else f"[{i}]"))
        return results

    linkedin_data['foaf_knows'] = extract_foaf_with_linkedin(data)
    return linkedin_data


def create_linkedin_identifier(url: str) -> Optional[str]:
    """Create a LinkedIn identifier (vanity slug) from a profile URL.

    Returns None for empty input or URLs that don't match a known
    linkedin.com profile pattern.
    """
    if not url:
        return None
    patterns = [
        r'linkedin\.com/in/([^/?]+)',
        r'linkedin\.com/pub/([^/?]+)',
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            # Defensive cleanup; the character class already excludes / and ?.
            return match.group(1).rstrip('/').split('?')[0]
    return None


def create_comprehensive_linkedin_enrichment(
        linkedin_data: Dict[str, Any]) -> Dict[str, Any]:
    """Create comprehensive LinkedIn enrichment structure.

    Args:
        linkedin_data: Output of :func:`extract_all_linkedin_data`.

    Returns:
        Summary dict with per-section entries, total/unique URL counts,
        and a UTC extraction timestamp.
    """
    enrichment: Dict[str, Any] = {
        # Bugfix: use a real UTC timestamp; the previous code stamped naive
        # local time with a 'Z' (UTC) designator.
        'extraction_timestamp': datetime.now(timezone.utc)
            .isoformat().replace('+00:00', 'Z'),
        'extraction_method': 'comprehensive_yaml_extraction',
        'total_sections_with_linkedin': 0,
        'total_linkedin_urls': 0,
        'unique_linkedin_urls': set(),
        'sections': {}
    }

    for section_name, items in linkedin_data.items():
        if section_name == 'company_page':
            if items:
                enrichment['sections'][section_name] = items
                enrichment['total_linkedin_urls'] += 1
                enrichment['unique_linkedin_urls'].add(items['linkedin_url'])
                enrichment['total_sections_with_linkedin'] += 1
        elif section_name == 'foaf_knows':
            # Group network connections by their 'type' field.
            foaf_by_type = defaultdict(list)
            for item in items:
                foaf_by_type[item['type']].append(item)
            enrichment['sections'][section_name] = {
                'total_entries': len(items),
                'by_type': dict(foaf_by_type),
                'sample_entries': items[:10]  # First 10 as sample
            }
            enrichment['total_linkedin_urls'] += len(items)
            for item in items:
                enrichment['unique_linkedin_urls'].add(item['linkedin_url'])
            if items:
                enrichment['total_sections_with_linkedin'] += 1
        elif items:  # Regular (staff) sections with at least one entry.
            processed_items = []
            for item in items:
                processed_items.append({
                    'name': item.get('name'),
                    'linkedin_url': item.get('linkedin_url'),
                    'linkedin_identifier':
                        create_linkedin_identifier(item.get('linkedin_url')),
                    'path': item.get('path'),
                    'role': item.get('role'),
                    'section_key': item.get('section_key'),
                    'section_index': item.get('section_index')
                })
                enrichment['unique_linkedin_urls'].add(item.get('linkedin_url'))
            enrichment['sections'][section_name] = {
                'total_entries': len(processed_items),
                'entries': processed_items
            }
            enrichment['total_linkedin_urls'] += len(items)
            enrichment['total_sections_with_linkedin'] += 1

    # Replace the working set with a serializable list + count.
    enrichment['unique_linkedin_urls_count'] = len(enrichment['unique_linkedin_urls'])
    enrichment['unique_linkedin_urls'] = list(enrichment['unique_linkedin_urls'])
    return enrichment


def main():
    """Main function: load YAML, extract, enrich, and write outputs."""
    import yaml  # third-party (PyYAML); deferred so helpers import cleanly

    # Path to Eye Filmmuseum file.
    eye_file = ("/Users/kempersc/apps/glam/data/custodian/"
                "NL-NH-AMS-U-EFM-eye_filmmuseum.yaml")

    print("=" * 70)
    print("COMPREHENSIVE LINKEDIN ENRICHMENT FOR EYE FILMMUSEUM")
    print("=" * 70)
    print(f"\nLoading Eye Filmmuseum data from: {eye_file}")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("\nExtracting all LinkedIn data...")
    linkedin_data = extract_all_linkedin_data(eye_data)

    print("\nCreating comprehensive LinkedIn enrichment...")
    enrichment = create_comprehensive_linkedin_enrichment(linkedin_data)

    # ---- Print summary --------------------------------------------------
    print("\n" + "=" * 50)
    print("LINKEDIN DATA SUMMARY")
    print("=" * 50)
    print(f"Total sections with LinkedIn data: "
          f"{enrichment['total_sections_with_linkedin']}")
    print(f"Total LinkedIn URLs found: {enrichment['total_linkedin_urls']}")
    print(f"Unique LinkedIn URLs: {enrichment['unique_linkedin_urls_count']}")

    print("\nBreakdown by section:")
    for section, data in enrichment['sections'].items():
        if section == 'foaf_knows':
            print(f"\n{section.upper()}:")
            print(f"  Total entries: {data['total_entries']}")
            print("  By type:")
            for type_name, items in data['by_type'].items():
                print(f"    - {type_name}: {len(items)}")
            if data['sample_entries']:
                print("  Sample entries:")
                for item in data['sample_entries'][:3]:
                    print(f"    - {item['name']} ({item.get('type', 'Unknown')})")
        elif isinstance(data, dict) and 'total_entries' in data:
            print(f"\n{section.upper()}: {data['total_entries']} entries")
            if data['entries']:
                for item in data['entries'][:3]:
                    print(f"    - {item['name']}")
        elif isinstance(data, dict):
            # Company page (single entry, no 'total_entries' key).
            print(f"\n{section.upper()}: 1 entry")
            if 'linkedin_url' in data:
                print(f"  - Company page: {data['linkedin_url']}")

    # ---- Merge into the document ----------------------------------------
    if 'linkedin_enrichment' not in eye_data:
        eye_data['linkedin_enrichment'] = {}
    existing = eye_data['linkedin_enrichment']
    existing.update({
        'comprehensive_extraction': enrichment,
        'extraction_notes': [
            f"Comprehensive LinkedIn extraction completed on "
            f"{enrichment['extraction_timestamp']}",
            f"Found {enrichment['total_linkedin_urls']} LinkedIn URLs across "
            f"{enrichment['total_sections_with_linkedin']} sections",
            f"Unique LinkedIn profiles: "
            f"{enrichment['unique_linkedin_urls_count']}",
            "Data ready for API enrichment with Unipile when credentials "
            "are available",
            "Extraction includes: management, board, staff, curators, "
            "foaf_knows network"
        ]
    })

    # Update provenance trail.
    if 'provenance' not in eye_data:
        eye_data['provenance'] = {}
    if 'notes' not in eye_data['provenance']:
        eye_data['provenance']['notes'] = []
    eye_data['provenance']['notes'].append(
        f"Comprehensive LinkedIn extraction on "
        f"{enrichment['extraction_timestamp']}"
    )

    # ---- Save enriched YAML ----------------------------------------------
    output_file = eye_file.replace('.yaml', '_linkedin_comprehensive.yaml')
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False,
                  allow_unicode=True, sort_keys=False)

    # ---- Save profiles-only JSON -----------------------------------------
    profiles_file = output_file.replace('.yaml', '_profiles_only.json')
    profiles = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'total_profiles': enrichment['unique_linkedin_urls_count'],
        'profiles': []
    }

    # Collect all profile candidates from every section.
    all_profiles = []
    for section_data in linkedin_data.values():
        if isinstance(section_data, list):
            all_profiles.extend(section_data)
        elif isinstance(section_data, dict) and 'linkedin_url' in section_data:
            all_profiles.append(section_data)

    # Deduplicate by LinkedIn URL.
    seen_urls = set()
    unique_profiles = []
    for profile in all_profiles:
        url = profile.get('linkedin_url')
        if url and url not in seen_urls:
            seen_urls.add(url)
            unique_profiles.append({
                'name': profile.get('name'),
                'linkedin_url': url,
                'linkedin_identifier': create_linkedin_identifier(url),
                'section': (profile.get('path', '').split('.')[0]
                            if profile.get('path') else 'unknown'),
                # Bugfix: carry the full path so the CSV Path column is
                # populated (it was always empty before).
                'path': profile.get('path', '')
            })
    profiles['profiles'] = unique_profiles

    with open(profiles_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps accented (e.g. Dutch) names readable.
        json.dump(profiles, f, indent=2, ensure_ascii=False)
    print(f"\nLinkedIn profiles saved to: {profiles_file}")

    # ---- Save CSV ---------------------------------------------------------
    csv_file = output_file.replace('.yaml', '_profiles.csv')
    # Bugfix: use the csv module so names containing commas/quotes are
    # escaped correctly instead of corrupting the row.
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "LinkedIn URL", "Identifier", "Section", "Path"])
        for profile in unique_profiles:
            writer.writerow([
                profile['name'], profile['linkedin_url'],
                profile['linkedin_identifier'], profile['section'],
                profile.get('path', '')
            ])
    print(f"CSV saved to: {csv_file}")

    print("\n" + "=" * 70)
    print("ENRICHMENT COMPLETE!")
    print("=" * 70)
    print(f"Total unique LinkedIn profiles: {len(unique_profiles)}")
    print(f"Main enriched file: {output_file}")
    print(f"Profiles-only JSON: {profiles_file}")
    print(f"Profiles CSV: {csv_file}")

    # Instructions for next steps.
    print("\n" + "=" * 70)
    print("NEXT STEPS FOR API ENRICHMENT")
    print("=" * 70)
    print("""
To enrich these profiles with Unipile API:

1. Set up Unipile account:
   - Sign up: https://dashboard.unipile.com/signup
   - Connect LinkedIn account via Hosted Auth
   - Get API key from dashboard

2. Set environment variables:
   export UNIPILE_API_KEY=your_api_key_here
   export UNIPILE_DSN=api1.unipile.com:13111

3. Run enrichment script:
   python scripts/enrich_linkedin_profiles_unipile.py

This will fetch detailed profile information for each LinkedIn URL
including: name, headline, location, industry, summary, connection count.
""")


if __name__ == "__main__":
    main()