#!/usr/bin/env python3
"""
ULTIMATE LinkedIn extraction for Eye Filmmuseum.

This script performs the most comprehensive extraction of ALL LinkedIn URLs.
"""

import os
import sys
import json
import yaml
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional, Set

def ultimate_extract_linkedin_urls(data: Any, path: str = "") -> List[Dict[str, Any]]:
    """Recursively extract every LinkedIn URL found anywhere in *data*.

    Walks nested dicts, lists and strings, matching ``linkedin.com/...``
    fragments inside any string value.  Each unique normalized URL is
    reported exactly once, together with the best display name found in
    the object it was discovered in.

    Args:
        data: Arbitrary nested structure (dicts / lists / strings) to scan,
            e.g. a parsed YAML document.
        path: Dotted path prefix used in the reported ``path`` field
            (empty string for the root).

    Returns:
        A list of record dicts with keys ``name``, ``linkedin_url``,
        ``path``, ``field`` and ``context`` (the object the URL was in).
    """
    urls: List[Dict[str, Any]] = []
    seen_urls: Set[str] = set()
    # Compiled once and reused; the match starts at "linkedin.com", so a
    # leading scheme/"www." in the source text is intentionally dropped.
    # Whitespace and ')' terminate a match so markdown links stay clean.
    linkedin_re = re.compile(r'linkedin\.com/[^\s\)]+')

    def extract_from_value(value: Any, context_path: str) -> Optional[str]:
        """Return the first not-yet-seen LinkedIn URL in *value*, normalized.

        Returns None when *value* is not a string or holds no new URL.
        """
        if not isinstance(value, str):
            return None
        for match in linkedin_re.findall(value):
            # Strip punctuation that regularly clings to URLs inside prose
            # ("see linkedin.com/in/x." would otherwise keep the dot).
            url = match.strip().rstrip('.,;:\'"')
            if url.startswith('http'):
                clean_url = url
            elif url.startswith('//'):
                clean_url = f"https:{url}"
            else:
                clean_url = f"https://{url}"
            if clean_url not in seen_urls:
                seen_urls.add(clean_url)
                return clean_url
        return None

    def find_name_in_context(obj: Dict, field_key: str) -> str:
        """Pick the most plausible display name from *obj*'s own fields.

        NOTE(review): the original also tried to walk "parent" objects, but
        it navigated downward from the local dict using the root-level
        ``path`` parts, so it could never reach a real parent; that dead
        code is removed here.
        """
        name_fields = [
            'name', 'full_name', 'staff_name', 'person_name',
            'title', 'label', 'organization', 'company'
        ]
        for field in name_fields:
            candidate = obj.get(field)
            if isinstance(candidate, str) and candidate.strip():
                return candidate.strip()
        return 'Unknown'

    def extract_from_object(obj: Any, context_path: str = "") -> None:
        """Depth-first walk appending one record per unseen URL."""
        if isinstance(obj, dict):
            for key, value in obj.items():
                current_path = f"{context_path}.{key}" if context_path else key
                # Loop so that a single string holding several LinkedIn
                # URLs yields a record for each (the original captured at
                # most two per string and silently dropped the rest).
                while True:
                    url = extract_from_value(value, current_path)
                    if url is None:
                        break
                    urls.append({
                        'name': find_name_in_context(obj, key),
                        'linkedin_url': url,
                        'path': current_path,
                        'field': key,
                        'context': obj
                    })
                extract_from_object(value, current_path)
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                current_path = f"{context_path}[{i}]" if context_path else f"[{i}]"
                extract_from_object(item, current_path)
        elif isinstance(obj, str):
            # Standalone string (e.g. the root itself is a string).  The
            # original mistakenly used the outer ``path`` here instead of
            # the walker's ``context_path``.
            while True:
                url = extract_from_value(obj, context_path)
                if url is None:
                    break
                urls.append({
                    'name': 'Unknown',
                    'linkedin_url': url,
                    'path': context_path,
                    'field': 'string_value',
                    'context': obj
                })

    extract_from_object(data, path)
    return urls
def create_ultimate_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create the ultimate LinkedIn enrichment structure.

    Categorizes extracted profile records into personal / company / unknown
    buckets and wraps them with extraction statistics and a UTC timestamp.

    Args:
        linkedin_data: Records as produced by ``ultimate_extract_linkedin_urls``;
            missing ``name`` keys are treated as 'Unknown' and missing
            ``linkedin_url`` keys no longer raise (the original indexed
            ``item['linkedin_url']`` directly).

    Returns:
        A dict with ``extraction_timestamp``, ``extraction_method``,
        ``extraction_stats``, ``profiles_by_category`` and ``all_raw_data``.
    """
    from datetime import timezone  # local: module header only imports datetime

    personal_profiles: List[Dict[str, Any]] = []
    company_profiles: List[Dict[str, Any]] = []
    unknown_profiles: List[Dict[str, Any]] = []

    for item in linkedin_data:
        name = item.get('name', 'Unknown')
        url = item.get('linkedin_url') or ''
        profile = {
            'name': name,
            'linkedin_url': item.get('linkedin_url'),
            'path': item.get('path'),
            'field': item.get('field')
        }
        if '/company/' in url:
            company_profiles.append(profile)
        elif name != 'Unknown':
            personal_profiles.append(profile)
        else:
            unknown_profiles.append(profile)

    # A record is "high confidence" when a display name could be resolved.
    named = sum(1 for p in linkedin_data if p.get('name', 'Unknown') != 'Unknown')

    return {
        # Genuinely UTC now: the original stamped *local* time and appended
        # 'Z', mislabelling it as UTC.
        'extraction_timestamp': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        'extraction_method': 'ultimate_deep_extraction_v3',
        'extraction_stats': {
            'total_profiles': len(linkedin_data),
            'personal_profiles': len(personal_profiles),
            'company_profiles': len(company_profiles),
            'unknown_profiles': len(unknown_profiles),
            'high_confidence': named,
            'medium_confidence': len(linkedin_data) - named
        },
        'profiles_by_category': {
            'personal': personal_profiles,
            'company': company_profiles,
            'unknown': unknown_profiles
        },
        'all_raw_data': linkedin_data
    }
def main():
    """Run the ultimate LinkedIn extraction for Eye Filmmuseum.

    Loads the custodian YAML, extracts every LinkedIn URL, and writes four
    artifacts next to the input: an enriched YAML, a profiles JSON, a
    profiles CSV and a JSON report.  An alternative input file may be
    given as the first command-line argument (defaults to the original
    hard-coded path, so existing invocations are unchanged).
    """
    import csv  # local: only needed for the profiles CSV export

    # Path to Eye Filmmuseum file (overridable via argv for reuse).
    eye_file = sys.argv[1] if len(sys.argv) > 1 else \
        "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"

    print("=" * 80)
    print("ULTIMATE LINKEDIN EXTRACTION FOR EYE FILMMUSEUM")
    print("=" * 80)

    print(f"\nLoading data from: {eye_file}")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("\nPerforming ultimate deep extraction of ALL LinkedIn URLs...")
    linkedin_data = ultimate_extract_linkedin_urls(eye_data)

    print(f"\n✓ Found {len(linkedin_data)} LinkedIn profiles!")

    # Breakdown by category (defensive .get: records normally carry 'name').
    personal = sum(1 for item in linkedin_data
                   if '/company/' not in item['linkedin_url']
                   and item.get('name', 'Unknown') != 'Unknown')
    company = sum(1 for item in linkedin_data if '/company/' in item['linkedin_url'])
    unknown = sum(1 for item in linkedin_data if item.get('name', 'Unknown') == 'Unknown')

    print(f" - Personal profiles: {personal}")
    print(f" - Company profiles: {company}")
    print(f" - Unknown names: {unknown}")

    # Show first 15 results
    print("\nFirst 15 profiles found:")
    for i, item in enumerate(linkedin_data[:15]):
        print(f" {i+1:2d}. {item.get('name', 'Unknown')}")
        print(f" URL: {item.get('linkedin_url', 'N/A')}")
        print(f" Path: {item.get('path', 'N/A')}")
        print(f" Field: {item.get('field', 'N/A')}")
        print()

    if len(linkedin_data) > 15:
        print(f" ... and {len(linkedin_data) - 15} more")

    # Create enrichment
    print("\nCreating ultimate enrichment structure...")
    enrichment = create_ultimate_enrichment(linkedin_data)

    # Add to existing data
    if 'linkedin_enrichment' not in eye_data:
        eye_data['linkedin_enrichment'] = {}

    # Merge with existing data
    eye_data['linkedin_enrichment']['ultimate_extraction'] = enrichment
    eye_data['linkedin_enrichment']['extraction_notes'] = [
        f"Ultimate LinkedIn extraction completed on {enrichment['extraction_timestamp']}",
        f"Total profiles found: {enrichment['extraction_stats']['total_profiles']}",
        f"Personal profiles: {enrichment['extraction_stats']['personal_profiles']}",
        f"Company profiles: {enrichment['extraction_stats']['company_profiles']}",
        "Deep extraction scans ALL YAML fields including conservators, volunteers, interns",
        "Ready for API enrichment with Unipile when credentials are available"
    ]

    # Update provenance
    if 'provenance' not in eye_data:
        eye_data['provenance'] = {}
    if 'notes' not in eye_data['provenance']:
        eye_data['provenance']['notes'] = []

    eye_data['provenance']['notes'].append(
        f"Ultimate LinkedIn deep extraction on {enrichment['extraction_timestamp']}"
    )

    # Save enriched data
    output_file = eye_file.replace('.yaml', '_linkedin_ultimate.yaml')
    print(f"\nSaving enriched data to: {output_file}")

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Save profiles-only file.  default=str: yaml.safe_load yields
    # datetime.date objects for ISO dates, which json cannot serialize.
    profiles_file = output_file.replace('.yaml', '_all_profiles.json')
    with open(profiles_file, 'w', encoding='utf-8') as f:
        json.dump({
            'extraction_timestamp': enrichment['extraction_timestamp'],
            'total_profiles': len(linkedin_data),
            'profiles': linkedin_data
        }, f, indent=2, default=str)

    # Comprehensive CSV via csv.writer: the original wrote raw f-strings,
    # so any name/path containing a comma or quote corrupted the file.
    csv_file = output_file.replace('.yaml', '_profiles_ultimate.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "LinkedIn URL", "Type", "Path", "Field", "Confidence"])
        for item in linkedin_data:
            profile_type = 'company' if '/company/' in item['linkedin_url'] else 'personal'
            confidence = 'high' if item.get('name', 'Unknown') != 'Unknown' else 'medium'
            writer.writerow([
                item.get('name', 'Unknown'),
                item.get('linkedin_url', 'N/A'),
                profile_type,
                item.get('path', 'N/A'),
                item.get('field', 'N/A'),
                confidence
            ])

    # Create detailed report
    report = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'method': 'ultimate_deep_extraction_v3',
        'stats': enrichment['extraction_stats'],
        'files_created': {
            'main_yaml': output_file,
            'profiles_json': profiles_file,
            'profiles_csv': csv_file
        },
        'sample_profiles': linkedin_data[:20]  # First 20 as sample
    }

    report_file = output_file.replace('.yaml', '_ultimate_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, default=str)

    print("\n" + "=" * 80)
    print("ULTIMATE EXTRACTION COMPLETE!")
    print("=" * 80)
    print(f"Total LinkedIn profiles: {len(linkedin_data)}")
    print(f" - Personal: {enrichment['extraction_stats']['personal_profiles']}")
    print(f" - Company: {enrichment['extraction_stats']['company_profiles']}")
    print(f" - Unknown: {enrichment['extraction_stats']['unknown_profiles']}")
    print(f"\nFiles created:")
    print(f" 1. Main YAML: {output_file}")
    print(f" 2. Profiles JSON: {profiles_file}")
    print(f" 3. Profiles CSV: {csv_file}")
    print(f" 4. Report JSON: {report_file}")

    print("\n" + "=" * 80)
    print("READY FOR API ENRICHMENT")
    print("=" * 80)
    print("""
To enrich these profiles with detailed data using Unipile API:

1. Set up Unipile account:
 - Sign up: https://dashboard.unipile.com/signup
 - Connect your LinkedIn account via Hosted Auth
 - Get API key from dashboard

2. Set environment variables:
 export UNIPILE_API_KEY=your_api_key_here
 export UNIPILE_DSN=api1.unipile.com:13111

3. Run enrichment script:
 python scripts/enrich_linkedin_ultimate.py

This will fetch comprehensive profile data including:
- Full name and professional headline
- Location and industry
- Summary and about section
- Connection count and follower count
- Work experience history
- Education background
- Skills and languages
- Profile image URL

The enriched data will be seamlessly integrated into the Eye Filmmuseum YAML.
""")
# Script entry point: run the extraction only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()