#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive LinkedIn enrichment for Eye Filmmuseum.
|
|
This script extracts all LinkedIn data and creates a structured enrichment section.
|
|
"""
|
|
|
|
import csv
import json
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional

import yaml
|
|
|
|
def extract_all_linkedin_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """Extract all LinkedIn-related data from Eye Filmmuseum YAML.

    Scans every known staff/board section for entries that carry a
    ``linkedin_url`` — either directly on the entry or inside its
    ``foaf_knows`` contact list — and additionally collects the
    organisation's own company page plus the full foaf_knows network.

    Args:
        data: Parsed YAML document (mapping of section name -> entries).

    Returns:
        Dict with one list of hits per staff section, a ``company_page``
        dict, and a flat ``foaf_knows`` list of network contacts.
    """
    # Known section names; also fixes the key order of the result dict.
    sections = [
        'management',
        'board_of_trustees',
        'department_heads',
        'former_directors',
        'chief_curator',
        'collection_specialists',
        'curators',
        'archivists_and_film_specialists',
        'programmers',
        'pico_staff',
        'deceased_staff',
    ]

    linkedin_data: Dict[str, Any] = {name: [] for name in sections}
    linkedin_data['company_page'] = {}
    linkedin_data['foaf_knows'] = []

    def extract_linkedin_info(obj, path=""):
        """Return info for the first LinkedIn profile found under ``obj``.

        A ``linkedin_url`` on the object itself (or in its ``foaf_knows``
        list, which deliberately takes precedence over the direct URL,
        matching the original extraction order) wins over matches found
        in nested children.
        """
        info: Dict[str, Any] = {}

        if isinstance(obj, dict):
            # Direct LinkedIn URL on this entry.
            if 'linkedin_url' in obj:
                info['linkedin_url'] = obj['linkedin_url']
                info['name'] = obj.get(
                    'name', obj.get('person_observed', {}).get('name', 'Unknown'))
                info['path'] = path

            # First foaf_knows contact with a URL overrides the direct match.
            if 'foaf_knows' in obj and isinstance(obj['foaf_knows'], list):
                for foaf in obj['foaf_knows']:
                    if isinstance(foaf, dict) and 'linkedin_url' in foaf:
                        info['linkedin_url'] = foaf['linkedin_url']
                        info['name'] = foaf.get('name', foaf.get('type', 'Unknown'))
                        info['path'] = f"{path}.foaf_knows"
                        info['role'] = foaf.get('type')
                        info['organization'] = foaf.get('organization')
                        info['relationship'] = foaf.get('relationship')
                        info['wikidata_id'] = foaf.get('wikidata_id')
                        break

            # BUG FIX: the nested search below previously ran even when this
            # object had already matched, and its `return nested` silently
            # discarded the more specific info collected above.
            if info:
                return info

            # Recurse into nested values looking for a match.
            for key, value in obj.items():
                if key not in ('linkedin_url', 'foaf_knows'):
                    nested = extract_linkedin_info(
                        value, f"{path}.{key}" if path else key)
                    if nested:
                        return nested

        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                nested = extract_linkedin_info(
                    item, f"{path}[{i}]" if path else f"[{i}]")
                if nested:
                    return nested

        return info

    # Extract from each section: dict sections hold named single entries,
    # list sections hold positional entries.
    for section in sections:
        section_data = data.get(section, [])
        if isinstance(section_data, dict):
            for key, value in section_data.items():
                info = extract_linkedin_info(value, f"{section}.{key}")
                if info:
                    info['section_key'] = key
                    linkedin_data[section].append(info)
        elif isinstance(section_data, list):
            for i, item in enumerate(section_data):
                info = extract_linkedin_info(item, f"{section}[{i}]")
                if info:
                    info['section_index'] = i
                    linkedin_data[section].append(info)

    # Company page LinkedIn info, written by a prior enrichment pass.
    if 'linkedin_enrichment' in data:
        company_data = data['linkedin_enrichment']
        if 'company_linkedin_url' in company_data:
            linkedin_data['company_page'] = {
                'linkedin_url': company_data['company_linkedin_url'],
                'employee_count': company_data.get(
                    'company_stats', {}).get('employee_count_linkedin'),
                'source': 'linkedin_enrichment.company_linkedin_url',
            }

    def extract_foaf_with_linkedin(obj, path=""):
        """Recursively collect every foaf_knows contact with a LinkedIn URL."""
        results = []
        if isinstance(obj, dict):
            if 'foaf_knows' in obj and isinstance(obj['foaf_knows'], list):
                for foaf in obj['foaf_knows']:
                    if isinstance(foaf, dict) and 'linkedin_url' in foaf:
                        results.append({
                            'name': foaf.get('name'),
                            'type': foaf.get('type'),
                            'organization': foaf.get('organization'),
                            'relationship': foaf.get('relationship'),
                            'linkedin_url': foaf['linkedin_url'],
                            'wikidata_id': foaf.get('wikidata_id'),
                            'path': f"{path}.foaf_knows",
                        })

            # Recurse into every value (including foaf_knows itself, as the
            # original did; foaf entries without their own foaf_knows key
            # contribute nothing, so no duplicates arise).
            for key, value in obj.items():
                results.extend(extract_foaf_with_linkedin(
                    value, f"{path}.{key}" if path else key))

        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                results.extend(extract_foaf_with_linkedin(
                    item, f"{path}[{i}]" if path else f"[{i}]"))

        return results

    linkedin_data['foaf_knows'] = extract_foaf_with_linkedin(data)

    return linkedin_data
|
|
|
|
def create_linkedin_identifier(url: str) -> Optional[str]:
    """Derive the profile slug from a LinkedIn URL.

    Handles both the modern ``/in/<slug>`` and the legacy ``/pub/<slug>``
    URL shapes; returns ``None`` for empty input or unrecognised URLs.
    """
    if not url:
        return None

    # Try each known profile-URL shape in turn.
    for kind in ('in', 'pub'):
        match = re.search(rf'linkedin\.com/{kind}/([^/?]+)', url)
        if match is None:
            continue
        slug = match.group(1)
        # Defensive trim: the pattern already stops at '/' and '?',
        # so this mirrors the original belt-and-braces cleanup.
        return slug.rstrip('/').split('?')[0]

    return None
|
|
|
|
def create_comprehensive_linkedin_enrichment(linkedin_data: Dict[str, Any]) -> Dict[str, Any]:
    """Create comprehensive LinkedIn enrichment structure.

    Aggregates the per-section extraction results into one summary with
    totals, per-section entry details and the set of unique profile URLs.

    Args:
        linkedin_data: Output of ``extract_all_linkedin_data``.

    Returns:
        Enrichment dict. ``unique_linkedin_urls`` is converted from the
        working set to a plain list before returning so the structure is
        YAML/JSON serialisable; the count is kept in
        ``unique_linkedin_urls_count``.
    """
    enrichment = {
        # BUG FIX: previously stamped *local* time and appended a literal
        # 'Z' (which denotes UTC); emit a real timezone-aware UTC timestamp.
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'extraction_method': 'comprehensive_yaml_extraction',
        'total_sections_with_linkedin': 0,
        'total_linkedin_urls': 0,
        'unique_linkedin_urls': set(),  # working set; converted to list at the end
        'sections': {}
    }

    # Process each section; company_page and foaf_knows have bespoke shapes.
    for section_name, items in linkedin_data.items():
        if section_name == 'company_page':
            if items:
                enrichment['sections'][section_name] = items
                enrichment['total_linkedin_urls'] += 1
                enrichment['unique_linkedin_urls'].add(items['linkedin_url'])
                enrichment['total_sections_with_linkedin'] += 1
        elif section_name == 'foaf_knows':
            # Group foaf_knows contacts by their 'type' label.
            foaf_by_type = defaultdict(list)
            for item in items:
                foaf_by_type[item['type']].append(item)

            enrichment['sections'][section_name] = {
                'total_entries': len(items),
                'by_type': dict(foaf_by_type),
                'sample_entries': items[:10]  # first 10 as a sample
            }
            enrichment['total_linkedin_urls'] += len(items)
            for item in items:
                enrichment['unique_linkedin_urls'].add(item['linkedin_url'])
            if items:
                enrichment['total_sections_with_linkedin'] += 1
        elif items:
            # Regular staff/board sections: normalise each hit.
            processed_items = []
            for item in items:
                processed = {
                    'name': item.get('name'),
                    'linkedin_url': item.get('linkedin_url'),
                    'linkedin_identifier': create_linkedin_identifier(item.get('linkedin_url')),
                    'path': item.get('path'),
                    'role': item.get('role'),
                    'section_key': item.get('section_key'),
                    'section_index': item.get('section_index')
                }
                processed_items.append(processed)
                enrichment['unique_linkedin_urls'].add(item.get('linkedin_url'))

            enrichment['sections'][section_name] = {
                'total_entries': len(processed_items),
                'entries': processed_items
            }
            enrichment['total_linkedin_urls'] += len(items)
            enrichment['total_sections_with_linkedin'] += 1

    # Convert the working set into a count + serialisable list.
    enrichment['unique_linkedin_urls_count'] = len(enrichment['unique_linkedin_urls'])
    enrichment['unique_linkedin_urls'] = list(enrichment['unique_linkedin_urls'])

    return enrichment
|
|
|
|
def main():
    """Run the full extraction pipeline for the Eye Filmmuseum file.

    Loads the YAML, extracts every LinkedIn reference, merges the
    enrichment back into the document, and writes three outputs: the
    enriched YAML, a profiles-only JSON, and a profiles CSV.
    """
    # Path to Eye Filmmuseum file (one-off script; hard-coded path).
    eye_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"

    print("=" * 70)
    print("COMPREHENSIVE LINKEDIN ENRICHMENT FOR EYE FILMMUSEUM")
    print("=" * 70)

    print(f"\nLoading Eye Filmmuseum data from: {eye_file}")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("\nExtracting all LinkedIn data...")
    linkedin_data = extract_all_linkedin_data(eye_data)

    # Create comprehensive enrichment
    print("\nCreating comprehensive LinkedIn enrichment...")
    enrichment = create_comprehensive_linkedin_enrichment(linkedin_data)

    # Print summary
    print("\n" + "=" * 50)
    print("LINKEDIN DATA SUMMARY")
    print("=" * 50)
    print(f"Total sections with LinkedIn data: {enrichment['total_sections_with_linkedin']}")
    print(f"Total LinkedIn URLs found: {enrichment['total_linkedin_urls']}")
    print(f"Unique LinkedIn URLs: {enrichment['unique_linkedin_urls_count']}")

    print("\nBreakdown by section:")
    for section, data in enrichment['sections'].items():
        if section == 'foaf_knows':
            print(f"\n{section.upper()}:")
            print(f"  Total entries: {data['total_entries']}")
            print("  By type:")
            for type_name, items in data['by_type'].items():
                print(f"    - {type_name}: {len(items)}")
            if data['sample_entries']:
                print("  Sample entries:")
                for item in data['sample_entries'][:3]:
                    print(f"    - {item['name']} ({item.get('type', 'Unknown')})")
        elif isinstance(data, dict) and 'total_entries' in data:
            print(f"\n{section.upper()}: {data['total_entries']} entries")
            if data['entries']:
                for item in data['entries'][:3]:
                    print(f"  - {item['name']}")
        elif isinstance(data, dict):
            # company_page falls through here (no 'total_entries' key).
            print(f"\n{section.upper()}: 1 entry")
            if 'linkedin_url' in data:
                print(f"  - Company page: {data['linkedin_url']}")

    # Add to existing data
    if 'linkedin_enrichment' not in eye_data:
        eye_data['linkedin_enrichment'] = {}

    # Merge with existing LinkedIn enrichment
    existing = eye_data['linkedin_enrichment']
    existing.update({
        'comprehensive_extraction': enrichment,
        'extraction_notes': [
            f"Comprehensive LinkedIn extraction completed on {enrichment['extraction_timestamp']}",
            f"Found {enrichment['total_linkedin_urls']} LinkedIn URLs across {enrichment['total_sections_with_linkedin']} sections",
            f"Unique LinkedIn profiles: {enrichment['unique_linkedin_urls_count']}",
            "Data ready for API enrichment with Unipile when credentials are available",
            "Extraction includes: management, board, staff, curators, foaf_knows network"
        ]
    })

    # Update provenance
    if 'provenance' not in eye_data:
        eye_data['provenance'] = {}
    if 'notes' not in eye_data['provenance']:
        eye_data['provenance']['notes'] = []

    eye_data['provenance']['notes'].append(
        f"Comprehensive LinkedIn extraction on {enrichment['extraction_timestamp']}"
    )

    # Save enriched data
    output_file = eye_file.replace('.yaml', '_linkedin_comprehensive.yaml')
    print(f"\nSaving enriched data to: {output_file}")

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Save separate LinkedIn profiles file for easy access
    profiles_file = output_file.replace('.yaml', '_profiles_only.json')
    profiles = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'total_profiles': enrichment['unique_linkedin_urls_count'],
        'profiles': []
    }

    # Collect all unique profiles from every section (lists of hits plus
    # the single company_page dict).
    all_profiles = []
    for section_data in linkedin_data.values():
        if isinstance(section_data, list):
            all_profiles.extend(section_data)
        elif isinstance(section_data, dict) and 'linkedin_url' in section_data:
            all_profiles.append(section_data)

    # Deduplicate by LinkedIn URL (first occurrence wins).
    seen_urls = set()
    unique_profiles = []
    for profile in all_profiles:
        url = profile.get('linkedin_url')
        if url and url not in seen_urls:
            seen_urls.add(url)
            unique_profiles.append({
                'name': profile.get('name'),
                'linkedin_url': url,
                'linkedin_identifier': create_linkedin_identifier(url),
                'section': profile.get('path', '').split('.')[0] if profile.get('path') else 'unknown'
            })

    profiles['profiles'] = unique_profiles

    with open(profiles_file, 'w', encoding='utf-8') as f:
        json.dump(profiles, f, indent=2)

    print(f"\nLinkedIn profiles saved to: {profiles_file}")

    # Create CSV for easy viewing.
    # BUG FIX: rows were previously assembled with a bare f-string, which
    # corrupted the CSV for any field containing a comma or quote; use
    # csv.writer for proper quoting, and newline='' per the csv docs.
    csv_file = output_file.replace('.yaml', '_profiles.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "LinkedIn URL", "Identifier", "Section", "Path"])
        for profile in unique_profiles:
            writer.writerow([
                profile['name'],
                profile['linkedin_url'],
                profile['linkedin_identifier'],
                profile['section'],
                # NOTE(review): deduped profiles carry no 'path' key, so this
                # column is always empty — kept for column compatibility.
                profile.get('path', ''),
            ])

    print(f"CSV saved to: {csv_file}")

    print("\n" + "=" * 70)
    print("ENRICHMENT COMPLETE!")
    print("=" * 70)
    print(f"Total unique LinkedIn profiles: {len(unique_profiles)}")
    print(f"Main enriched file: {output_file}")
    print(f"Profiles-only JSON: {profiles_file}")
    print(f"Profiles CSV: {csv_file}")

    # Instructions for next steps
    print("\n" + "=" * 70)
    print("NEXT STEPS FOR API ENRICHMENT")
    print("=" * 70)
    print("""
To enrich these profiles with Unipile API:

1. Set up Unipile account:
   - Sign up: https://dashboard.unipile.com/signup
   - Connect LinkedIn account via Hosted Auth
   - Get API key from dashboard

2. Set environment variables:
   export UNIPILE_API_KEY=your_api_key_here
   export UNIPILE_DSN=api1.unipile.com:13111

3. Run enrichment script:
   python scripts/enrich_linkedin_profiles_unipile.py

This will fetch detailed profile information for each LinkedIn URL
including: name, headline, location, industry, summary, connection count.
""")
|
|
|
|
# Run the enrichment pipeline only when executed as a script.
if __name__ == "__main__":
    main()