#!/usr/bin/env python3
"""
FINAL LinkedIn extraction for Eye Filmmuseum.

This script performs deep extraction of ALL LinkedIn URLs from the
complex YAML structure, then writes an enriched YAML file plus JSON/CSV
side products and an extraction report.
"""

import csv
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set


def deep_extract_linkedin_urls(data: Any, path: str = "") -> List[Dict[str, Any]]:
    """Deep extraction of LinkedIn URLs from a nested data structure.

    Walks every dict, list and string in ``data`` and returns one record per
    unique LinkedIn URL found (first occurrence wins), each with:
    ``name`` (best-effort associated name), ``linkedin_url`` (normalized to
    https), ``path`` (dotted/indexed location in the structure), ``field``
    (the dict key it was found under, or ``'string_value'``) and ``context``
    (the containing object).

    Args:
        data: Arbitrarily nested dict/list/scalar structure (e.g. parsed YAML).
        path: Optional path prefix for the reported locations.

    Returns:
        List of record dicts, deduplicated by URL.
    """

    def normalize_url(value: Any) -> Optional[str]:
        """Return a normalized https URL if *value* is a LinkedIn URL string."""
        if isinstance(value, str) and 'linkedin.com' in value:
            url = value.strip()
            if url.startswith('http'):
                return url
            if url.startswith('//'):
                return f"https:{url}"
            return f"https://{url}"
        return None

    def find_associated_name(obj: Dict[str, Any]) -> str:
        """Best-effort lookup of a human-readable name in the same mapping."""
        name_fields = [
            'name', 'full_name', 'staff_name', 'person_name',
            'title', 'label', 'organization', 'company',
        ]
        for field in name_fields:
            if field in obj and isinstance(obj[field], str):
                return obj[field]
        # Fall back to a nested person_observed.name structure if present.
        person = obj.get('person_observed')
        if isinstance(person, dict) and isinstance(person.get('name'), str):
            return person['name']
        return 'Unknown'

    def walk(obj: Any, context_path: str = "") -> List[Dict[str, Any]]:
        """Recursively collect LinkedIn URL records from *obj*."""
        results: List[Dict[str, Any]] = []
        if isinstance(obj, dict):
            for key, value in obj.items():
                current_path = f"{context_path}.{key}" if context_path else key
                if isinstance(value, str):
                    # One check covers both dedicated LinkedIn/url fields and
                    # LinkedIn URLs embedded in arbitrary string fields.
                    url = normalize_url(value)
                    if url:
                        results.append({
                            'name': find_associated_name(obj),
                            'linkedin_url': url,
                            'path': current_path,
                            'field': key,
                            'context': obj,
                        })
                else:
                    results.extend(walk(value, current_path))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                current_path = f"{context_path}[{i}]" if context_path else f"[{i}]"
                results.extend(walk(item, current_path))
        elif isinstance(obj, str):
            # Bare string (e.g. a list element) that may itself be a URL.
            url = normalize_url(obj)
            if url:
                results.append({
                    'name': 'Unknown',
                    'linkedin_url': url,
                    'path': context_path,
                    'field': 'string_value',
                    'context': obj,
                })
        return results

    # Deduplicate by URL, keeping the first (most context-rich) record.
    seen_urls: Set[str] = set()
    unique_results: List[Dict[str, Any]] = []
    for result in walk(data, path):
        url = result['linkedin_url']
        if url not in seen_urls:
            seen_urls.add(url)
            unique_results.append(result)
    return unique_results


def extract_linkedin_identifier(url: str) -> Optional[str]:
    """Extract the profile/company slug from a LinkedIn URL.

    Handles ``/in/``, ``/pub/``, ``/company/`` and ``/school/`` URL forms.
    Returns None for empty input or non-LinkedIn URLs.
    """
    if not url:
        return None
    patterns = [
        r'linkedin\.com/in/([^/?]+)',
        r'linkedin\.com/pub/([^/?]+)',
        r'linkedin\.com/company/([^/?]+)',
        r'linkedin\.com/school/([^/?]+)',
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            # Strip trailing slash and any query-string residue.
            return match.group(1).rstrip('/').split('?')[0]
    return None


def create_final_linkedin_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create the final LinkedIn enrichment structure.

    Args:
        linkedin_data: Records as produced by :func:`deep_extract_linkedin_urls`.

    Returns:
        Dict with a UTC timestamp, extraction stats, and the profiles grouped
        into personal vs. company lists.
    """
    personal_profiles: List[Dict[str, Any]] = []
    company_profiles: List[Dict[str, Any]] = []

    for item in linkedin_data:
        profile = {
            'name': item['name'],
            'linkedin_url': item['linkedin_url'],
            'linkedin_identifier': extract_linkedin_identifier(item['linkedin_url']),
            'extraction_path': item['path'],
            'field_found': item['field'],
        }
        # Classify as personal or company profile.
        if '/company/' in item['linkedin_url']:
            company_profiles.append(profile)
        else:
            personal_profiles.append(profile)

    # BUG FIX: the identifier count must be computed from the built profiles
    # (which carry 'linkedin_identifier'); the raw extraction records do not
    # have that key and previously raised KeyError here.
    all_profiles = personal_profiles + company_profiles
    unique_identifiers = {
        p['linkedin_identifier'] for p in all_profiles if p['linkedin_identifier']
    }

    # UTC timestamp with an explicit 'Z' suffix (previously local time was
    # mislabelled as Zulu/UTC by appending 'Z' to a naive datetime).
    timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    return {
        'extraction_timestamp': timestamp,
        'extraction_method': 'deep_yaml_extraction_v2',
        'extraction_stats': {
            'total_profiles_found': len(linkedin_data),
            'personal_profiles': len(personal_profiles),
            'company_profiles': len(company_profiles),
            'unique_identifiers': len(unique_identifiers),
        },
        'personal_profiles': personal_profiles,
        'company_profiles': company_profiles,
        'all_profiles': linkedin_data,
    }


def main():
    """Main function: load the YAML file, extract, enrich, and write outputs."""
    # Third-party dependency imported locally so the pure extraction helpers
    # above remain importable without PyYAML installed.
    import yaml

    # Path to Eye Filmmuseum file; an alternative YAML file may be supplied
    # as the first command-line argument.
    if len(sys.argv) > 1:
        eye_file = sys.argv[1]
    else:
        eye_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"

    print("=" * 80)
    print("FINAL LINKEDIN EXTRACTION FOR EYE FILMMUSEUM")
    print("=" * 80)
    print(f"\nLoading data from: {eye_file}")

    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("\nPerforming deep extraction of LinkedIn URLs...")
    linkedin_data = deep_extract_linkedin_urls(eye_data)
    print(f"\n✓ Found {len(linkedin_data)} LinkedIn profiles!")

    # Show first few results.
    print("\nFirst 10 profiles found:")
    for i, item in enumerate(linkedin_data[:10]):
        print(f"  {i+1:2d}. {item['name']}")
        print(f"      URL: {item['linkedin_url']}")
        print(f"      Path: {item['path']}")
        print(f"      Field: {item['field']}")
        print()
    if len(linkedin_data) > 10:
        print(f"  ... and {len(linkedin_data) - 10} more profiles")

    # Create enrichment.
    print("\nCreating enrichment structure...")
    enrichment = create_final_linkedin_enrichment(linkedin_data)

    # Add to existing data, merging with any prior enrichment section.
    if 'linkedin_enrichment' not in eye_data:
        eye_data['linkedin_enrichment'] = {}
    eye_data['linkedin_enrichment']['final_extraction'] = enrichment
    eye_data['linkedin_enrichment']['extraction_notes'] = [
        f"Final deep LinkedIn extraction completed on {enrichment['extraction_timestamp']}",
        f"Total profiles found: {enrichment['extraction_stats']['total_profiles_found']}",
        f"Personal profiles: {enrichment['extraction_stats']['personal_profiles']}",
        f"Company profiles: {enrichment['extraction_stats']['company_profiles']}",
        "Deep extraction scans all YAML fields and nested structures",
        "Ready for API enrichment with Unipile when credentials available",
    ]

    # Update provenance.
    if 'provenance' not in eye_data:
        eye_data['provenance'] = {}
    if 'notes' not in eye_data['provenance']:
        eye_data['provenance']['notes'] = []
    eye_data['provenance']['notes'].append(
        f"Final LinkedIn deep extraction on {enrichment['extraction_timestamp']}"
    )

    # Save enriched data.
    output_file = eye_file.replace('.yaml', '_linkedin_final.yaml')
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False,
                  allow_unicode=True, sort_keys=False)

    # Save profiles-only file.
    profiles_file = output_file.replace('.yaml', '_all_profiles.json')
    with open(profiles_file, 'w', encoding='utf-8') as f:
        json.dump({
            'extraction_timestamp': enrichment['extraction_timestamp'],
            'total_profiles': len(linkedin_data),
            'profiles': linkedin_data,
        }, f, indent=2)

    # Create CSV for easy viewing. The csv module handles quoting, so names
    # or paths containing commas no longer corrupt the output (the previous
    # hand-rolled f.write version did not escape anything).
    csv_file = output_file.replace('.yaml', '_profiles.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "LinkedIn URL", "Identifier", "Type", "Path", "Field"])
        for item in linkedin_data:
            profile_type = 'company' if '/company/' in item['linkedin_url'] else 'personal'
            writer.writerow([
                item['name'],
                item['linkedin_url'],
                extract_linkedin_identifier(item['linkedin_url']),
                profile_type,
                item['path'],
                item['field'],
            ])

    # Create summary report.
    report = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'method': 'deep_yaml_extraction_v2',
        'stats': enrichment['extraction_stats'],
        'files_created': {
            'main_yaml': output_file,
            'profiles_json': profiles_file,
            'profiles_csv': csv_file,
        },
    }
    report_file = output_file.replace('.yaml', '_extraction_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    print("\n" + "=" * 80)
    print("EXTRACTION COMPLETE!")
    print("=" * 80)
    print(f"Total LinkedIn profiles: {len(linkedin_data)}")
    print(f"  - Personal profiles: {enrichment['extraction_stats']['personal_profiles']}")
    print(f"  - Company profiles: {enrichment['extraction_stats']['company_profiles']}")
    print("\nFiles created:")
    print(f"  1. Main YAML: {output_file}")
    print(f"  2. Profiles JSON: {profiles_file}")
    print(f"  3. Profiles CSV: {csv_file}")
    print(f"  4. Report JSON: {report_file}")

    print("\n" + "=" * 80)
    print("READY FOR API ENRICHMENT")
    print("=" * 80)
    print("""
To enrich these profiles with detailed data using Unipile API:

1. Set up Unipile account:
   - Sign up: https://dashboard.unipile.com/signup
   - Connect your LinkedIn account via Hosted Auth
   - Get API key from dashboard

2. Set environment variables:
   export UNIPILE_API_KEY=your_api_key_here
   export UNIPILE_DSN=api1.unipile.com:13111

3. Run the enrichment script:
   python scripts/enrich_linkedin_with_api.py

This will fetch:
- Profile names, headlines, locations
- Connection counts
- Industry and summary information
- Work experience and education
- Skills and languages (if available)

The enriched data will be added back to the Eye Filmmuseum YAML file.
""")


if __name__ == "__main__":
    main()