192 lines
No EOL
7 KiB
Python
192 lines
No EOL
7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Extract and enrich LinkedIn profiles from Eye Filmmuseum data.
|
|
This script works with existing data to extract LinkedIn URLs and prepare enrichment data.
|
|
"""
|
|
|
|
import csv
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml

# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
def extract_linkedin_urls(data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Extract all LinkedIn URLs and associated info from Eye Filmmuseum data.

    Walks the parsed YAML tree depth-first and collects every string-valued
    ``linkedin_url`` field, together with a best-effort person name taken
    from the record that holds the URL and a dotted/indexed path describing
    where in the document it was found.
    """
    found: List[Dict[str, Any]] = []

    def _name_for(record: Dict[str, Any], record_path: str) -> str:
        """Best-effort lookup of a name inside the record holding the URL."""
        # Flat string fields, checked in priority order.
        for field in ('name', 'staff_name', 'person_observed', 'role'):
            value = record.get(field)
            if isinstance(value, str):
                return value

        # `person_observed` may itself be a mapping that carries the name.
        person = record.get('person_observed')
        if isinstance(person, dict) and 'name' in person:
            return person['name']

        return "Unknown"

    def _walk(node: Any, node_path: str = "") -> None:
        """Depth-first traversal collecting every `linkedin_url` field."""
        if isinstance(node, dict):
            for key, value in node.items():
                child_path = f"{node_path}.{key}" if node_path else key

                if key == 'linkedin_url' and isinstance(value, str):
                    found.append({
                        'name': _name_for(node, node_path),
                        'path': child_path,
                        'linkedin_url': value,
                        # Keep a reference to the enclosing record for
                        # downstream enrichment.
                        'context': node,
                    })

                _walk(value, child_path)
        elif isinstance(node, list):
            for index, element in enumerate(node):
                _walk(element, f"{node_path}[{index}]" if node_path else f"[{index}]")

    _walk(data)
    return found
|
|
|
|
def extract_linkedin_identifier(url: str) -> Optional[str]:
    """Extract the LinkedIn profile identifier (slug) from *url*.

    Recognizes both modern ``/in/<slug>`` and legacy ``/pub/<slug>/...``
    profile URLs; returns ``None`` when neither form matches.
    """
    for prefix in ('in', 'pub'):
        match = re.search(rf'linkedin\.com/{prefix}/([^/?]+)', url)
        if match is None:
            continue
        # Defensive cleanup; the character class already excludes '/' and '?'.
        return match.group(1).rstrip('/').split('?')[0]
    return None
|
|
|
|
def create_linkedin_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create LinkedIn enrichment structure.

    Args:
        linkedin_data: Records produced by ``extract_linkedin_urls``; each
            must carry ``name``, ``path`` and ``linkedin_url`` keys.

    Returns:
        A dict with extraction metadata plus one entry per profile under
        ``profiles``.
    """
    # BUG FIX: the original stamped *local* time via datetime.now() and then
    # appended 'Z', falsely claiming UTC.  Use an aware UTC timestamp and
    # keep the conventional 'Z' suffix.
    timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    return {
        'extraction_timestamp': timestamp,
        'extraction_method': 'yaml_bulk_extraction',
        'total_profiles_found': len(linkedin_data),
        'profiles': [
            {
                'name': item['name'],
                'path_in_yaml': item['path'],
                'linkedin_url': item['linkedin_url'],
                'linkedin_identifier': extract_linkedin_identifier(item['linkedin_url']),
                'extracted_from': item['path'],
            }
            for item in linkedin_data
        ],
    }
|
|
|
|
def _merge_enrichment(eye_data: Dict[str, Any], enrichment: Dict[str, Any]) -> None:
    """Attach bulk-extraction results and a provenance note to *eye_data* in place."""
    existing = eye_data.setdefault('linkedin_enrichment', {})
    existing.update({
        'bulk_url_extraction': enrichment,
        'extraction_notes': [
            f"Bulk LinkedIn URL extraction completed on {enrichment['extraction_timestamp']}",
            f"Found {enrichment['total_profiles_found']} total LinkedIn profiles across all sections",
            "Profiles can be enriched with Unipile API when credentials are available",
            "Note: These URLs were extracted from various sections including management, curators, collection_specialists, etc.",
        ],
    })

    provenance = eye_data.setdefault('provenance', {})
    provenance.setdefault('notes', []).append(
        f"LinkedIn bulk URL extraction on {enrichment['extraction_timestamp']}"
    )


def _write_report(output_file: str, enrichment: Dict[str, Any],
                  linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Write a JSON summary report next to *output_file* and return the report dict."""
    sections: Dict[str, int] = {}
    for item in linkedin_data:
        section = item['path'].split('.')[0]  # top-level YAML section
        sections[section] = sections.get(section, 0) + 1

    report = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'total_profiles': len(linkedin_data),
        'profiles_by_section': sections,
    }

    report_file = output_file.replace('.yaml', '_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    print(f"Report saved to: {report_file}")
    return report


def _write_csv(output_file: str, linkedin_data: List[Dict[str, Any]]) -> None:
    """Write a CSV of the extracted profiles for easy viewing.

    BUG FIX: the original hand-rolled writer produced broken rows whenever a
    name or path contained a comma; csv.writer quotes fields correctly.
    Missing identifiers are written as empty cells rather than "None".
    """
    csv_file = output_file.replace('.yaml', '_profiles.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "Path", "LinkedIn URL", "Identifier"])
        for item in linkedin_data:
            writer.writerow([
                item['name'],
                item['path'],
                item['linkedin_url'],
                extract_linkedin_identifier(item['linkedin_url']),
            ])
    print(f"CSV saved to: {csv_file}")


def main(
    eye_file: str = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml",
) -> None:
    """Extract LinkedIn profiles from Eye Filmmuseum data.

    Args:
        eye_file: Path to the source YAML file.  Defaults to the original
            hard-coded location so existing invocations keep working.
    """
    print("Loading Eye Filmmuseum data...")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("Extracting LinkedIn URLs...")
    linkedin_data = extract_linkedin_urls(eye_data)

    print(f"Found {len(linkedin_data)} LinkedIn profiles:")
    for item in linkedin_data[:20]:  # Show first 20
        print(f"  - {item['name']} ({item['path']}): {item['linkedin_url']}")
    if len(linkedin_data) > 20:
        print(f"  ... and {len(linkedin_data) - 20} more")

    enrichment = create_linkedin_enrichment(linkedin_data)
    _merge_enrichment(eye_data, enrichment)

    # Save enriched data alongside the source file.
    output_file = eye_file.replace('.yaml', '_linkedin_enriched.yaml')
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print("\nExtraction complete!")
    print(f"Total LinkedIn profiles extracted: {len(linkedin_data)}")

    report = _write_report(output_file, enrichment, linkedin_data)

    print("\nProfiles by section:")
    for section, count in sorted(report['profiles_by_section'].items()):
        print(f"  {section}: {count}")

    _write_csv(output_file, linkedin_data)


if __name__ == "__main__":
    main()