#!/usr/bin/env python3
|
|
"""
|
|
Extract and enrich LinkedIn profiles from Eye Filmmuseum data.
|
|
This script works with existing data to extract LinkedIn URLs and prepare enrichment data.
|
|
"""
|
|
|
|
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml
|
|
|
|
# Add project root to path
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
def extract_linkedin_urls(data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Extract all LinkedIn URLs and associated info from Eye Filmmuseum data."""
|
|
linkedin_data = []
|
|
|
|
def extract_from_section(section_name: str, section_data: Any):
|
|
"""Extract LinkedIn URLs from any section."""
|
|
if isinstance(section_data, dict):
|
|
for key, value in section_data.items():
|
|
if isinstance(value, dict):
|
|
# Check for linkedin_url field
|
|
if 'linkedin_url' in value:
|
|
linkedin_data.append({
|
|
'name': value.get('name', key),
|
|
'role': section_name,
|
|
'linkedin_url': value['linkedin_url'],
|
|
'source': value
|
|
})
|
|
# Check nested objects
|
|
extract_from_section(f"{section_name}.{key}", value)
|
|
elif isinstance(value, list):
|
|
for i, item in enumerate(value):
|
|
extract_from_section(f"{section_name}.{key}[{i}]", item)
|
|
|
|
# Extract from all sections
|
|
for section_name, section_data in data.items():
|
|
extract_from_section(section_name, section_data)
|
|
|
|
return linkedin_data
|
|
|
|
def extract_linkedin_identifier(url: str) -> Optional[str]:
|
|
"""Extract LinkedIn identifier from URL."""
|
|
patterns = [
|
|
r'linkedin\.com/in/([^/?]+)',
|
|
r'linkedin\.com/pub/([^/?]+)',
|
|
]
|
|
for pattern in patterns:
|
|
match = re.search(pattern, url)
|
|
if match:
|
|
return match.group(1).rstrip('/').split('?')[0]
|
|
return None
|
|
|
|
def create_linkedin_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
|
|
"""Create LinkedIn enrichment structure."""
|
|
enrichment = {
|
|
'extraction_timestamp': datetime.now().isoformat() + 'Z',
|
|
'extraction_method': 'yaml_extraction',
|
|
'total_profiles_found': len(linkedin_data),
|
|
'profiles': []
|
|
}
|
|
|
|
for item in linkedin_data:
|
|
profile = {
|
|
'name': item['name'],
|
|
'role': item['role'],
|
|
'linkedin_url': item['linkedin_url'],
|
|
'linkedin_identifier': extract_linkedin_identifier(item['linkedin_url']),
|
|
'source_data': item['source']
|
|
}
|
|
enrichment['profiles'].append(profile)
|
|
|
|
return enrichment
|
|
|
|
def main():
|
|
"""Extract LinkedIn profiles from Eye Filmmuseum data."""
|
|
# Path to Eye Filmmuseum file
|
|
eye_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"
|
|
|
|
print("Loading Eye Filmmuseum data...")
|
|
with open(eye_file, 'r', encoding='utf-8') as f:
|
|
eye_data = yaml.safe_load(f)
|
|
|
|
print("Extracting LinkedIn URLs...")
|
|
linkedin_data = extract_linkedin_urls(eye_data)
|
|
|
|
print(f"Found {len(linkedin_data)} LinkedIn profiles:")
|
|
for item in linkedin_data[:10]: # Show first 10
|
|
print(f" - {item['name']} ({item['role']}): {item['linkedin_url']}")
|
|
|
|
if len(linkedin_data) > 10:
|
|
print(f" ... and {len(linkedin_data) - 10} more")
|
|
|
|
# Create enrichment
|
|
enrichment = create_linkedin_enrichment(linkedin_data)
|
|
|
|
# Add to existing data
|
|
if 'linkedin_enrichment' not in eye_data:
|
|
eye_data['linkedin_enrichment'] = {}
|
|
|
|
# Merge with existing LinkedIn enrichment
|
|
existing = eye_data['linkedin_enrichment']
|
|
existing.update({
|
|
'bulk_extraction': enrichment,
|
|
'extraction_notes': [
|
|
f"Bulk LinkedIn URL extraction completed on {enrichment['extraction_timestamp']}",
|
|
f"Found {enrichment['total_profiles_found']} total LinkedIn profiles across all sections",
|
|
"Profiles can be enriched with Unipile API when credentials are available"
|
|
]
|
|
})
|
|
|
|
# Update provenance
|
|
if 'provenance' not in eye_data:
|
|
eye_data['provenance'] = {}
|
|
if 'notes' not in eye_data['provenance']:
|
|
eye_data['provenance']['notes'] = []
|
|
|
|
eye_data['provenance']['notes'].append(
|
|
f"LinkedIn bulk URL extraction on {enrichment['extraction_timestamp']}"
|
|
)
|
|
|
|
# Save enriched data
|
|
output_file = eye_file.replace('.yaml', '_linkedin_bulk_extracted.yaml')
|
|
print(f"\nSaving enriched data to: {output_file}")
|
|
|
|
with open(output_file, 'w', encoding='utf-8') as f:
|
|
yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
print("\nExtraction complete!")
|
|
print(f"Total LinkedIn profiles extracted: {len(linkedin_data)}")
|
|
|
|
# Create summary report
|
|
report_file = output_file.replace('.yaml', '_report.json')
|
|
report = {
|
|
'extraction_timestamp': enrichment['extraction_timestamp'],
|
|
'total_profiles': len(linkedin_data),
|
|
'profiles_by_section': {}
|
|
}
|
|
|
|
# Count by section
|
|
for item in linkedin_data:
|
|
section = item['role']
|
|
if section not in report['profiles_by_section']:
|
|
report['profiles_by_section'][section] = 0
|
|
report['profiles_by_section'][section] += 1
|
|
|
|
with open(report_file, 'w', encoding='utf-8') as f:
|
|
json.dump(report, f, indent=2)
|
|
|
|
print(f"Report saved to: {report_file}")
|
|
|
|
# Show section breakdown
|
|
print("\nProfiles by section:")
|
|
for section, count in sorted(report['profiles_by_section'].items()):
|
|
print(f" {section}: {count}")
|
|
|
|
if __name__ == "__main__":
|
|
main() |