glam/scripts/linkedin_comprehensive_extraction.py
2025-12-10 13:01:13 +01:00

397 lines
No EOL
15 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive LinkedIn enrichment for Eye Filmmuseum.
This script extracts all LinkedIn data and creates a structured enrichment section.
"""
import csv
import json
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml
def extract_all_linkedin_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """Extract all LinkedIn-related data from Eye Filmmuseum YAML.

    Walks the known staff/role sections of the parsed YAML mapping, the
    ``linkedin_enrichment`` company block, and every nested ``foaf_knows``
    list, collecting each entry that carries a ``linkedin_url``.

    Args:
        data: Parsed YAML document (top-level mapping).

    Returns:
        Dict keyed by section name. Staff sections map to lists of info
        dicts; ``company_page`` is a single dict (empty if absent);
        ``foaf_knows`` is a flat list of every acquaintance with a URL.
    """
    sections = [
        'management',
        'board_of_trustees',
        'department_heads',
        'former_directors',
        'chief_curator',
        'collection_specialists',
        'curators',
        'archivists_and_film_specialists',
        'programmers',
        'pico_staff',
        'deceased_staff',
    ]
    linkedin_data: Dict[str, Any] = {section: [] for section in sections}
    linkedin_data['company_page'] = {}
    linkedin_data['foaf_knows'] = []

    def extract_linkedin_info(obj, path=""):
        """Return info for the first LinkedIn URL found in obj (depth-first).

        A direct ``linkedin_url`` on the object itself takes priority over
        URLs found in its ``foaf_knows`` list or in nested values (previously
        a foaf/nested hit could overwrite the person's own URL).
        """
        if isinstance(obj, dict):
            if 'linkedin_url' in obj:
                return {
                    'linkedin_url': obj['linkedin_url'],
                    # Fall back to the observed person's name, then 'Unknown'.
                    'name': obj.get('name', obj.get('person_observed', {}).get('name', 'Unknown')),
                    'path': path,
                }
            if isinstance(obj.get('foaf_knows'), list):
                for foaf in obj['foaf_knows']:
                    if isinstance(foaf, dict) and 'linkedin_url' in foaf:
                        return {
                            'linkedin_url': foaf['linkedin_url'],
                            'name': foaf.get('name', foaf.get('type', 'Unknown')),
                            'path': f"{path}.foaf_knows",
                            'role': foaf.get('type'),
                            'organization': foaf.get('organization'),
                            'relationship': foaf.get('relationship'),
                            'wikidata_id': foaf.get('wikidata_id'),
                        }
            # Depth-first search of the remaining nested values.
            for key, value in obj.items():
                if key not in ('linkedin_url', 'foaf_knows'):
                    nested = extract_linkedin_info(value, f"{path}.{key}" if path else key)
                    if nested:
                        return nested
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                nested = extract_linkedin_info(item, f"{path}[{i}]" if path else f"[{i}]")
                if nested:
                    return nested
        return {}

    # Extract from each staff/role section; sections may be mappings
    # (keyed entries) or lists (indexed entries).
    for section in sections:
        section_data = data.get(section, [])
        if isinstance(section_data, dict):
            for key, value in section_data.items():
                info = extract_linkedin_info(value, f"{section}.{key}")
                if info:
                    info['section_key'] = key
                    linkedin_data[section].append(info)
        elif isinstance(section_data, list):
            for i, item in enumerate(section_data):
                info = extract_linkedin_info(item, f"{section}[{i}]")
                if info:
                    info['section_index'] = i
                    linkedin_data[section].append(info)

    # Extract company page LinkedIn info from an earlier enrichment pass.
    if 'linkedin_enrichment' in data:
        company_data = data['linkedin_enrichment']
        if 'company_linkedin_url' in company_data:
            linkedin_data['company_page'] = {
                'linkedin_url': company_data['company_linkedin_url'],
                'employee_count': company_data.get('company_stats', {}).get('employee_count_linkedin'),
                'source': 'linkedin_enrichment.company_linkedin_url',
            }

    def extract_foaf_with_linkedin(obj, path=""):
        """Collect every foaf_knows entry anywhere in obj that has a URL."""
        results = []
        if isinstance(obj, dict):
            if isinstance(obj.get('foaf_knows'), list):
                for foaf in obj['foaf_knows']:
                    if isinstance(foaf, dict) and 'linkedin_url' in foaf:
                        results.append({
                            'name': foaf.get('name'),
                            'type': foaf.get('type'),
                            'organization': foaf.get('organization'),
                            'relationship': foaf.get('relationship'),
                            'linkedin_url': foaf['linkedin_url'],
                            'wikidata_id': foaf.get('wikidata_id'),
                            'path': f"{path}.foaf_knows",
                        })
            for key, value in obj.items():
                results.extend(extract_foaf_with_linkedin(value, f"{path}.{key}" if path else key))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                results.extend(extract_foaf_with_linkedin(item, f"{path}[{i}]" if path else f"[{i}]"))
        return results

    linkedin_data['foaf_knows'] = extract_foaf_with_linkedin(data)
    return linkedin_data
def create_linkedin_identifier(url: str) -> Optional[str]:
    """Return the profile slug from a LinkedIn URL, or None if not found.

    Recognizes both ``linkedin.com/in/<slug>`` and ``linkedin.com/pub/<slug>``
    style profile URLs.
    """
    if not url:
        return None
    for segment in ('in', 'pub'):
        match = re.search(rf'linkedin\.com/{segment}/([^/?]+)', url)
        if match:
            # Defensive cleanup of any trailing slash or query remnants.
            return match.group(1).rstrip('/').split('?')[0]
    return None
def create_comprehensive_linkedin_enrichment(linkedin_data: Dict[str, Any]) -> Dict[str, Any]:
    """Create comprehensive LinkedIn enrichment structure.

    Aggregates the per-section extraction results into one summary dict
    with totals, a deduplicated (sorted) URL list, and per-section
    breakdowns.

    Args:
        linkedin_data: Output of ``extract_all_linkedin_data``.

    Returns:
        Enrichment dict ready to be merged into the YAML document.
    """
    enrichment: Dict[str, Any] = {
        # Timezone-aware UTC timestamp; previously local time was
        # incorrectly labelled with the 'Z' (UTC) designator.
        'extraction_timestamp': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        'extraction_method': 'comprehensive_yaml_extraction',
        'total_sections_with_linkedin': 0,
        'total_linkedin_urls': 0,
        'unique_linkedin_urls': set(),
        'sections': {},
    }
    for section_name, items in linkedin_data.items():
        if section_name == 'company_page':
            if items:
                enrichment['sections'][section_name] = items
                enrichment['total_linkedin_urls'] += 1
                enrichment['unique_linkedin_urls'].add(items['linkedin_url'])
                enrichment['total_sections_with_linkedin'] += 1
        elif section_name == 'foaf_knows':
            # Group the acquaintance network by declared type/role.
            foaf_by_type = defaultdict(list)
            for item in items:
                foaf_by_type[item.get('type')].append(item)
            enrichment['sections'][section_name] = {
                'total_entries': len(items),
                'by_type': dict(foaf_by_type),
                'sample_entries': items[:10],  # first 10 as a sample
            }
            enrichment['total_linkedin_urls'] += len(items)
            for item in items:
                enrichment['unique_linkedin_urls'].add(item['linkedin_url'])
            if items:
                enrichment['total_sections_with_linkedin'] += 1
        elif items:
            # Regular staff/role sections.
            processed_items = []
            for item in items:
                url = item.get('linkedin_url')
                processed_items.append({
                    'name': item.get('name'),
                    'linkedin_url': url,
                    'linkedin_identifier': create_linkedin_identifier(url),
                    'path': item.get('path'),
                    'role': item.get('role'),
                    'section_key': item.get('section_key'),
                    'section_index': item.get('section_index'),
                })
                if url:  # don't pollute the unique set with None
                    enrichment['unique_linkedin_urls'].add(url)
            enrichment['sections'][section_name] = {
                'total_entries': len(processed_items),
                'entries': processed_items,
            }
            enrichment['total_linkedin_urls'] += len(items)
            enrichment['total_sections_with_linkedin'] += 1
    # Freeze the set into a deterministic, YAML-serializable list.
    enrichment['unique_linkedin_urls_count'] = len(enrichment['unique_linkedin_urls'])
    enrichment['unique_linkedin_urls'] = sorted(enrichment['unique_linkedin_urls'])
    return enrichment
def main():
    """Run the full extraction pipeline for the Eye Filmmuseum file.

    Loads the YAML, extracts all LinkedIn data, merges an enrichment
    section back in, and writes three outputs: the enriched YAML, a
    profiles-only JSON, and a CSV summary.
    """
    # Path to Eye Filmmuseum file (hard-coded to the local checkout).
    eye_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"
    print("=" * 70)
    print("COMPREHENSIVE LINKEDIN ENRICHMENT FOR EYE FILMMUSEUM")
    print("=" * 70)
    print(f"\nLoading Eye Filmmuseum data from: {eye_file}")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("\nExtracting all LinkedIn data...")
    linkedin_data = extract_all_linkedin_data(eye_data)

    # Create comprehensive enrichment
    print("\nCreating comprehensive LinkedIn enrichment...")
    enrichment = create_comprehensive_linkedin_enrichment(linkedin_data)

    # Print summary
    print("\n" + "=" * 50)
    print("LINKEDIN DATA SUMMARY")
    print("=" * 50)
    print(f"Total sections with LinkedIn data: {enrichment['total_sections_with_linkedin']}")
    print(f"Total LinkedIn URLs found: {enrichment['total_linkedin_urls']}")
    print(f"Unique LinkedIn URLs: {enrichment['unique_linkedin_urls_count']}")
    print("\nBreakdown by section:")
    for section, data in enrichment['sections'].items():
        if section == 'foaf_knows':
            print(f"\n{section.upper()}:")
            print(f"  Total entries: {data['total_entries']}")
            print("  By type:")
            for type_name, items in data['by_type'].items():
                print(f"    - {type_name}: {len(items)}")
            if data['sample_entries']:
                print("  Sample entries:")
                for item in data['sample_entries'][:3]:
                    print(f"    - {item['name']} ({item.get('type', 'Unknown')})")
        elif isinstance(data, dict) and 'total_entries' in data:
            print(f"\n{section.upper()}: {data['total_entries']} entries")
            if data['entries']:
                for item in data['entries'][:3]:
                    print(f"  - {item['name']}")
        elif isinstance(data, dict):
            # company_page has no 'total_entries' key — it is a single entry.
            print(f"\n{section.upper()}: 1 entry")
            if 'linkedin_url' in data:
                print(f"  - Company page: {data['linkedin_url']}")

    # Merge the new extraction into any existing LinkedIn enrichment.
    if 'linkedin_enrichment' not in eye_data:
        eye_data['linkedin_enrichment'] = {}
    existing = eye_data['linkedin_enrichment']
    existing.update({
        'comprehensive_extraction': enrichment,
        'extraction_notes': [
            f"Comprehensive LinkedIn extraction completed on {enrichment['extraction_timestamp']}",
            f"Found {enrichment['total_linkedin_urls']} LinkedIn URLs across {enrichment['total_sections_with_linkedin']} sections",
            f"Unique LinkedIn profiles: {enrichment['unique_linkedin_urls_count']}",
            "Data ready for API enrichment with Unipile when credentials are available",
            "Extraction includes: management, board, staff, curators, foaf_knows network"
        ]
    })

    # Update provenance with a note about this run.
    if 'provenance' not in eye_data:
        eye_data['provenance'] = {}
    if 'notes' not in eye_data['provenance']:
        eye_data['provenance']['notes'] = []
    eye_data['provenance']['notes'].append(
        f"Comprehensive LinkedIn extraction on {enrichment['extraction_timestamp']}"
    )

    # Save enriched data alongside the original (never overwrite it).
    output_file = eye_file.replace('.yaml', '_linkedin_comprehensive.yaml')
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Save separate LinkedIn profiles file for easy access.
    profiles_file = output_file.replace('.yaml', '_profiles_only.json')
    profiles = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'total_profiles': enrichment['unique_linkedin_urls_count'],
        'profiles': []
    }
    # Collect all profile dicts from every section (lists) plus company page.
    all_profiles = []
    for section_data in linkedin_data.values():
        if isinstance(section_data, list):
            all_profiles.extend(section_data)
        elif isinstance(section_data, dict) and 'linkedin_url' in section_data:
            all_profiles.append(section_data)
    # Deduplicate by LinkedIn URL (first occurrence wins).
    seen_urls = set()
    unique_profiles = []
    for profile in all_profiles:
        url = profile.get('linkedin_url')
        if url and url not in seen_urls:
            seen_urls.add(url)
            unique_profiles.append({
                'name': profile.get('name'),
                'linkedin_url': url,
                'linkedin_identifier': create_linkedin_identifier(url),
                # First dotted path segment identifies the source section.
                'section': profile.get('path', '').split('.')[0] if profile.get('path') else 'unknown'
            })
    profiles['profiles'] = unique_profiles
    with open(profiles_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps accented names readable in the JSON.
        json.dump(profiles, f, indent=2, ensure_ascii=False)
    print(f"\nLinkedIn profiles saved to: {profiles_file}")

    # Create CSV for easy viewing. csv.writer quotes fields that contain
    # commas — the previous hand-rolled f.write produced broken rows for
    # names like "Doe, Jane".
    csv_file = output_file.replace('.yaml', '_profiles.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "LinkedIn URL", "Identifier", "Section", "Path"])
        for profile in unique_profiles:
            writer.writerow([
                profile['name'],
                profile['linkedin_url'],
                profile['linkedin_identifier'],
                profile['section'],
                # NOTE(review): unique_profiles entries carry no 'path' key,
                # so this column is always empty; kept for format stability.
                profile.get('path', ''),
            ])
    print(f"CSV saved to: {csv_file}")

    print("\n" + "=" * 70)
    print("ENRICHMENT COMPLETE!")
    print("=" * 70)
    print(f"Total unique LinkedIn profiles: {len(unique_profiles)}")
    print(f"Main enriched file: {output_file}")
    print(f"Profiles-only JSON: {profiles_file}")
    print(f"Profiles CSV: {csv_file}")

    # Instructions for next steps
    print("\n" + "=" * 70)
    print("NEXT STEPS FOR API ENRICHMENT")
    print("=" * 70)
    print("""
To enrich these profiles with Unipile API:
1. Set up Unipile account:
- Sign up: https://dashboard.unipile.com/signup
- Connect LinkedIn account via Hosted Auth
- Get API key from dashboard
2. Set environment variables:
export UNIPILE_API_KEY=your_api_key_here
export UNIPILE_DSN=api1.unipile.com:13111
3. Run enrichment script:
python scripts/enrich_linkedin_profiles_unipile.py
This will fetch detailed profile information for each LinkedIn URL
including: name, headline, location, industry, summary, connection count.
""")


if __name__ == "__main__":
    main()