glam/scripts/extract_linkedin_profiles.py
2025-12-10 13:01:13 +01:00

164 lines
No EOL
5.8 KiB
Python

#!/usr/bin/env python3
"""
Extract and enrich LinkedIn profiles from Eye Filmmuseum data.
This script works with existing data to extract LinkedIn URLs and prepare enrichment data.
"""
import json
import os
import re
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml
# Add project root to path so sibling project packages are importable
# when this script is executed directly (not via `python -m`).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def extract_linkedin_urls(data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Walk the Eye Filmmuseum data tree and collect every LinkedIn entry.

    Returns a list of records, each carrying the person's name (falling
    back to the dict key when no 'name' field exists), the dotted section
    path where the entry was found, the URL itself, and the raw source
    mapping it came from.
    """
    found: List[Dict[str, Any]] = []

    def walk(path: str, node: Any) -> None:
        # Dict children may themselves be profile entries; list children
        # are scanned element by element with an index suffix on the path.
        if not isinstance(node, dict):
            return
        for key, child in node.items():
            if isinstance(child, dict):
                if 'linkedin_url' in child:
                    found.append({
                        'name': child.get('name', key),
                        'role': path,
                        'linkedin_url': child['linkedin_url'],
                        'source': child,
                    })
                # Recurse regardless — nested sections can hold more entries.
                walk(f"{path}.{key}", child)
            elif isinstance(child, list):
                for idx, element in enumerate(child):
                    walk(f"{path}.{key}[{idx}]", element)

    for top_name, top_value in data.items():
        walk(top_name, top_value)
    return found
def extract_linkedin_identifier(url: str) -> Optional[str]:
    """Return the profile slug from a LinkedIn URL, or None if not found.

    Both personal (``/in/``) and legacy public (``/pub/``) profile URL
    shapes are recognised, checked in that order.
    """
    for pattern in (r'linkedin\.com/in/([^/?]+)', r'linkedin\.com/pub/([^/?]+)'):
        if (m := re.search(pattern, url)) is not None:
            slug = m.group(1)
            # Defensive trim of trailing slash / query remnants; the char
            # class already excludes both, so this is a no-op safeguard.
            return slug.rstrip('/').split('?')[0]
    return None
def create_linkedin_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Build the LinkedIn enrichment payload for a list of extracted profiles.

    Args:
        linkedin_data: records from extract_linkedin_urls(), each carrying
            'name', 'role', 'linkedin_url' and the raw 'source' mapping.

    Returns:
        Dict with extraction metadata plus a 'profiles' list; every profile
        gains a 'linkedin_identifier' parsed from its URL (None when the
        URL is not a recognised profile link).
    """
    # Bug fix: the original used datetime.now().isoformat() + 'Z', which
    # stamps naive *local* time but labels it UTC. Use an aware UTC
    # timestamp and normalise the '+00:00' offset to the 'Z' designator.
    timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    return {
        'extraction_timestamp': timestamp,
        'extraction_method': 'yaml_extraction',
        'total_profiles_found': len(linkedin_data),
        'profiles': [
            {
                'name': item['name'],
                'role': item['role'],
                'linkedin_url': item['linkedin_url'],
                'linkedin_identifier': extract_linkedin_identifier(item['linkedin_url']),
                'source_data': item['source'],
            }
            for item in linkedin_data
        ],
    }
def main() -> None:
    """Extract LinkedIn profiles from an Eye Filmmuseum YAML file.

    Usage: extract_linkedin_profiles.py [path-to-yaml]

    Writes two artifacts next to the input file: an enriched YAML copy
    (*_linkedin_bulk_extracted.yaml) and a JSON summary report grouping
    profile counts by section path.
    """
    # Allow the input file to be overridden on the command line; fall back
    # to the historical hard-coded location for backward compatibility.
    default_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"
    eye_file = sys.argv[1] if len(sys.argv) > 1 else default_file

    print("Loading Eye Filmmuseum data...")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("Extracting LinkedIn URLs...")
    linkedin_data = extract_linkedin_urls(eye_data)
    print(f"Found {len(linkedin_data)} LinkedIn profiles:")
    for item in linkedin_data[:10]:  # preview only the first 10
        print(f" - {item['name']} ({item['role']}): {item['linkedin_url']}")
    if len(linkedin_data) > 10:
        print(f" ... and {len(linkedin_data) - 10} more")

    enrichment = create_linkedin_enrichment(linkedin_data)

    # Merge into any pre-existing linkedin_enrichment section rather than
    # clobbering it.
    eye_data.setdefault('linkedin_enrichment', {}).update({
        'bulk_extraction': enrichment,
        'extraction_notes': [
            f"Bulk LinkedIn URL extraction completed on {enrichment['extraction_timestamp']}",
            f"Found {enrichment['total_profiles_found']} total LinkedIn profiles across all sections",
            "Profiles can be enriched with Unipile API when credentials are available",
        ],
    })

    # Record this run in the provenance trail.
    eye_data.setdefault('provenance', {}).setdefault('notes', []).append(
        f"LinkedIn bulk URL extraction on {enrichment['extraction_timestamp']}"
    )

    # Save enriched data alongside the input file.
    output_file = eye_file.replace('.yaml', '_linkedin_bulk_extracted.yaml')
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print("\nExtraction complete!")
    print(f"Total LinkedIn profiles extracted: {len(linkedin_data)}")

    # Summary report: profile counts grouped by section path. Counter
    # replaces the original manual increment loop.
    report_file = output_file.replace('.yaml', '_report.json')
    section_counts = Counter(item['role'] for item in linkedin_data)
    report = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'total_profiles': len(linkedin_data),
        'profiles_by_section': dict(section_counts),
    }
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    print(f"Report saved to: {report_file}")

    print("\nProfiles by section:")
    for section, count in sorted(report['profiles_by_section'].items()):
        print(f" {section}: {count}")


if __name__ == "__main__":
    main()