glam/scripts/linkedin_ultimate_extraction.py
2025-12-10 13:01:13 +01:00

333 lines
No EOL
13 KiB
Python

#!/usr/bin/env python3
"""
ULTIMATE LinkedIn extraction for Eye Filmmuseum.
This script performs the most comprehensive extraction of ALL LinkedIn URLs.
"""
import os
import sys
import json
import yaml
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional, Set
def ultimate_extract_linkedin_urls(data: Any, path: str = "") -> List[Dict[str, Any]]:
    """Collect every distinct LinkedIn URL found anywhere inside ``data``.

    The structure is walked depth-first. Every string (dict values, list
    items, or ``data`` itself) is scanned with a regular expression, so URLs
    embedded in free text are found as well as dedicated LinkedIn fields.
    All URLs in a string are reported, not just the first one.

    Each match is normalised to ``https://linkedin.com/...``: the scheme and
    any sub-domain in front of ``linkedin.com`` are discarded and trailing
    prose punctuation is stripped. Duplicates (after normalisation) are
    reported only once, at the first place they are seen.

    Args:
        data: Arbitrarily nested combination of dicts, lists and scalars.
        path: Path label for ``data`` itself; it prefixes every reported
            ``path``. Leave empty when ``data`` is the document root.

    Returns:
        One dict per distinct URL, in discovery order, with the keys:

        * ``name`` - first non-empty string among the known name fields of
          the dict holding the URL, else of the nearest enclosing dict,
          else ``'Unknown'``.
        * ``linkedin_url`` - the normalised URL.
        * ``path`` - dotted/indexed location, e.g. ``staff[0].linkedin``.
        * ``field`` - the dict key holding the string, or ``'string_value'``
          for a string that is a list item or ``data`` itself.
        * ``context`` - the dict holding the string, or the string itself.
    """
    name_fields = (
        'name', 'full_name', 'staff_name', 'person_name',
        'title', 'label', 'organization', 'company',
    )
    url_pattern = re.compile(r'linkedin\.com/[^\s\)]+')
    # Characters that belong to the surrounding sentence/markup, not the URL.
    trailing_chars = '.,;:!?\'">]'

    urls: List[Dict[str, Any]] = []
    seen_urls: Set[str] = set()

    def new_urls(text: str) -> List[str]:
        """Return the normalised URLs in ``text`` that were not seen before."""
        fresh = []
        for match in url_pattern.findall(text):
            # A match always starts at "linkedin.com", so the scheme is
            # never part of it and can simply be prepended.
            clean_url = "https://" + match.rstrip(trailing_chars)
            if clean_url not in seen_urls:
                seen_urls.add(clean_url)
                fresh.append(clean_url)
        return fresh

    def find_name(scopes: tuple) -> str:
        """Return the first usable name, searching ``scopes`` nearest-first."""
        for scope in scopes:
            for field in name_fields:
                candidate = scope.get(field)
                if isinstance(candidate, str) and candidate.strip():
                    return candidate.strip()
        return 'Unknown'

    def record(text: str, scopes: tuple, context_path: Any,
               field: Any, context: Any) -> None:
        """Append one result entry per new URL found in ``text``."""
        found = new_urls(text)
        if not found:
            return
        name = find_name(scopes)
        for url in found:
            urls.append({
                'name': name,
                'linkedin_url': url,
                'path': context_path,
                'field': field,
                'context': context,
            })

    def walk(obj: Any, context_path: str, enclosing: tuple) -> None:
        """Recurse through ``obj``; ``enclosing`` holds outer dicts, nearest first."""
        if isinstance(obj, dict):
            scopes = (obj,) + enclosing
            for key, value in obj.items():
                # Keys are formatted, never assumed to be strings: YAML
                # mappings may use ints, dates, etc. as keys.
                current_path = f"{context_path}.{key}" if context_path else str(key)
                if isinstance(value, str):
                    record(value, scopes, current_path, key, obj)
                else:
                    walk(value, current_path, scopes)
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                walk(item, f"{context_path}[{i}]", enclosing)
        elif isinstance(obj, str):
            record(obj, enclosing, context_path, 'string_value', obj)

    walk(data, path, ())
    return urls
def create_ultimate_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Build the enrichment record (stats, categorised profiles, raw data).

    Each entry is put in exactly one category:

    * ``company``  - its URL contains ``/company/``;
    * ``personal`` - not a company URL and its name is not ``'Unknown'``;
    * ``unknown``  - everything else.

    A missing ``name`` is treated as ``'Unknown'`` and a missing or ``None``
    ``linkedin_url`` as a non-company URL.

    Args:
        linkedin_data: Entries as produced by
            ``ultimate_extract_linkedin_urls`` (dicts with ``name``,
            ``linkedin_url``, ``path`` and ``field`` keys).

    Returns:
        Dict with ``extraction_timestamp`` (UTC, ISO 8601 with a ``Z``
        suffix), ``extraction_method``, ``extraction_stats``,
        ``profiles_by_category`` and ``all_raw_data`` (the input list itself,
        not a copy).
    """
    # Local import keeps this function self-contained; the module only
    # imports ``datetime`` from the datetime package.
    from datetime import timezone

    personal_profiles = []
    company_profiles = []
    unknown_profiles = []
    named_count = 0

    for item in linkedin_data:
        name = item.get('name', 'Unknown')
        url = item.get('linkedin_url')
        profile = {
            'name': name,
            'linkedin_url': url,
            'path': item.get('path'),
            'field': item.get('field'),
        }
        has_name = name != 'Unknown'
        if has_name:
            named_count += 1

        if '/company/' in (url or ''):
            company_profiles.append(profile)
        elif has_name:
            personal_profiles.append(profile)
        else:
            unknown_profiles.append(profile)

    # The trailing "Z" designates UTC, so the clock must be read in UTC
    # rather than in local time.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z'

    return {
        'extraction_timestamp': timestamp,
        'extraction_method': 'ultimate_deep_extraction_v3',
        'extraction_stats': {
            'total_profiles': len(linkedin_data),
            'personal_profiles': len(personal_profiles),
            'company_profiles': len(company_profiles),
            'unknown_profiles': len(unknown_profiles),
            # Confidence is based solely on whether a name was found.
            'high_confidence': named_count,
            'medium_confidence': len(linkedin_data) - named_count,
        },
        'profiles_by_category': {
            'personal': personal_profiles,
            'company': company_profiles,
            'unknown': unknown_profiles,
        },
        'all_raw_data': linkedin_data,
    }
def main():
    """Extract LinkedIn URLs from the Eye Filmmuseum YAML and write all outputs.

    Steps:
      1. Load the hard-coded custodian YAML file.
      2. Extract every LinkedIn URL and print a summary to stdout.
      3. Attach the enrichment record and notes to the loaded data.
      4. Write four files next to the input: the enriched YAML, a JSON file
         with all profiles, a CSV overview and a JSON report.
    """
    # Local import: only the CSV export below needs it.
    import csv

    # Path to Eye Filmmuseum file
    eye_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"

    print("=" * 80)
    print("ULTIMATE LINKEDIN EXTRACTION FOR EYE FILMMUSEUM")
    print("=" * 80)
    print(f"\nLoading data from: {eye_file}")

    with open(eye_file, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; fall back to an empty
        # mapping so the enrichment below can still be attached.
        eye_data = yaml.safe_load(f) or {}

    print("\nPerforming ultimate deep extraction of ALL LinkedIn URLs...")
    linkedin_data = ultimate_extract_linkedin_urls(eye_data)
    print(f"\n✓ Found {len(linkedin_data)} LinkedIn profiles!")

    # Show breakdown by category
    personal = sum(
        1 for item in linkedin_data
        if '/company/' not in item['linkedin_url'] and item['name'] != 'Unknown'
    )
    company = sum(1 for item in linkedin_data if '/company/' in item['linkedin_url'])
    unknown = sum(1 for item in linkedin_data if item['name'] == 'Unknown')
    print(f" - Personal profiles: {personal}")
    print(f" - Company profiles: {company}")
    print(f" - Unknown names: {unknown}")

    # Show first 15 results
    print("\nFirst 15 profiles found:")
    for i, item in enumerate(linkedin_data[:15]):
        print(f" {i+1:2d}. {item.get('name', 'Unknown')}")
        print(f" URL: {item.get('linkedin_url', 'N/A')}")
        print(f" Path: {item.get('path', 'N/A')}")
        print(f" Field: {item.get('field', 'N/A')}")
        print()
    if len(linkedin_data) > 15:
        print(f" ... and {len(linkedin_data) - 15} more")

    # Create enrichment
    print("\nCreating ultimate enrichment structure...")
    enrichment = create_ultimate_enrichment(linkedin_data)

    # Merge with existing data, creating the section if it is missing
    linkedin_section = eye_data.setdefault('linkedin_enrichment', {})
    linkedin_section['ultimate_extraction'] = enrichment
    linkedin_section['extraction_notes'] = [
        f"Ultimate LinkedIn extraction completed on {enrichment['extraction_timestamp']}",
        f"Total profiles found: {enrichment['extraction_stats']['total_profiles']}",
        f"Personal profiles: {enrichment['extraction_stats']['personal_profiles']}",
        f"Company profiles: {enrichment['extraction_stats']['company_profiles']}",
        "Deep extraction scans ALL YAML fields including conservators, volunteers, interns",
        "Ready for API enrichment with Unipile when credentials are available"
    ]

    # Update provenance
    eye_data.setdefault('provenance', {}).setdefault('notes', []).append(
        f"Ultimate LinkedIn deep extraction on {enrichment['extraction_timestamp']}"
    )

    # Save enriched data
    output_file = eye_file.replace('.yaml', '_linkedin_ultimate.yaml')
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Save profiles-only files.
    # Each profile carries a 'context' dict taken from the YAML data, which
    # may hold date/datetime values created by the YAML loader; default=str
    # writes those as text instead of failing the whole export.
    profiles_file = output_file.replace('.yaml', '_all_profiles.json')
    with open(profiles_file, 'w', encoding='utf-8') as f:
        json.dump({
            'extraction_timestamp': enrichment['extraction_timestamp'],
            'total_profiles': len(linkedin_data),
            'profiles': linkedin_data
        }, f, indent=2, default=str)

    # Create comprehensive CSV.
    # csv.writer quotes fields containing commas, quotes or newlines, which
    # names and free-text paths can contain.
    csv_file = output_file.replace('.yaml', '_profiles_ultimate.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(["Name", "LinkedIn URL", "Type", "Path", "Field", "Confidence"])
        for item in linkedin_data:
            profile_type = 'company' if '/company/' in item['linkedin_url'] else 'personal'
            confidence = 'high' if item.get('name', 'Unknown') != 'Unknown' else 'medium'
            writer.writerow([
                item.get('name', 'Unknown'),
                item.get('linkedin_url', 'N/A'),
                profile_type,
                item.get('path', 'N/A'),
                item.get('field', 'N/A'),
                confidence,
            ])

    # Create detailed report
    report = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'method': 'ultimate_deep_extraction_v3',
        'stats': enrichment['extraction_stats'],
        'files_created': {
            'main_yaml': output_file,
            'profiles_json': profiles_file,
            'profiles_csv': csv_file
        },
        'sample_profiles': linkedin_data[:20]  # First 20 as sample
    }
    report_file = output_file.replace('.yaml', '_ultimate_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        # Same reason for default=str as in the profiles export above.
        json.dump(report, f, indent=2, default=str)

    print("\n" + "=" * 80)
    print("ULTIMATE EXTRACTION COMPLETE!")
    print("=" * 80)
    print(f"Total LinkedIn profiles: {len(linkedin_data)}")
    print(f" - Personal: {enrichment['extraction_stats']['personal_profiles']}")
    print(f" - Company: {enrichment['extraction_stats']['company_profiles']}")
    print(f" - Unknown: {enrichment['extraction_stats']['unknown_profiles']}")
    print(f"\nFiles created:")
    print(f" 1. Main YAML: {output_file}")
    print(f" 2. Profiles JSON: {profiles_file}")
    print(f" 3. Profiles CSV: {csv_file}")
    print(f" 4. Report JSON: {report_file}")
    print("\n" + "=" * 80)
    print("READY FOR API ENRICHMENT")
    print("=" * 80)
    print("""
To enrich these profiles with detailed data using Unipile API:
1. Set up Unipile account:
- Sign up: https://dashboard.unipile.com/signup
- Connect your LinkedIn account via Hosted Auth
- Get API key from dashboard
2. Set environment variables:
export UNIPILE_API_KEY=your_api_key_here
export UNIPILE_DSN=api1.unipile.com:13111
3. Run enrichment script:
python scripts/enrich_linkedin_ultimate.py
This will fetch comprehensive profile data including:
- Full name and professional headline
- Location and industry
- Summary and about section
- Connection count and follower count
- Work experience history
- Education background
- Skills and languages
- Profile image URL
The enriched data will be seamlessly integrated into the Eye Filmmuseum YAML.
""")
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()