#!/usr/bin/env python3
"""
FINAL LinkedIn extraction for Eye Filmmuseum.

This script performs deep extraction of ALL LinkedIn URLs from the complex YAML structure.
"""
import os
import sys
import json
import yaml
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional, Set
def deep_extract_linkedin_urls(data: Any, path: str = "") -> List[Dict[str, Any]]:
    """Deep extraction of LinkedIn URLs from nested data structure.

    Walks an arbitrarily nested combination of dicts, lists and strings (as
    produced by ``yaml.safe_load``) and collects every string containing
    ``linkedin.com``, normalising it to an ``https://`` URL.  Results are
    de-duplicated by URL, keeping the first occurrence (which carries the
    richest name context).

    Args:
        data: Parsed YAML data (dicts/lists/strings/other scalars).
        path: Path prefix used for provenance in the results.  NOTE: the
            original implementation accepted this parameter but ignored it;
            it is now honoured (default "" keeps old behaviour).

    Returns:
        One dict per unique LinkedIn URL with keys ``name``,
        ``linkedin_url``, ``path``, ``field`` and ``context`` (the object
        the URL was found in).
    """

    def normalize_url(value: Any) -> Optional[str]:
        """Return a normalised https URL if *value* is a LinkedIn string, else None."""
        if isinstance(value, str) and 'linkedin.com' in value:
            url = value.strip()
            if url.startswith('http'):
                return url
            if url.startswith('//'):
                return f"https:{url}"
            return f"https://{url}"
        return None

    def find_associated_name(obj: Any, field_key: str) -> str:
        """Best-effort lookup of a human-readable name near a URL field."""
        name_fields = [
            'name', 'full_name', 'staff_name', 'person_name',
            'title', 'label', 'organization', 'company'
        ]
        for field in name_fields:
            if field in obj and isinstance(obj[field], str):
                return obj[field]
        # Some records nest the person under a 'person_observed' key.
        if isinstance(obj, dict):
            person = obj.get('person_observed')
            if isinstance(person, dict) and 'name' in person:
                return person['name']
        return 'Unknown'

    def walk(obj: Any, context_path: str = "") -> List[Dict[str, Any]]:
        """Recursively collect LinkedIn URL records from *obj*."""
        results: List[Dict[str, Any]] = []

        if isinstance(obj, dict):
            for key, value in obj.items():
                current_path = f"{context_path}.{key}" if context_path else key
                if isinstance(value, str):
                    # BUG FIX: the original appended the same hit up to three
                    # times (direct-field branch, string branch, and again via
                    # recursion into the string) and relied on the final dedup
                    # to mask it.  Record each string value exactly once.
                    url = normalize_url(value)
                    if url:
                        results.append({
                            'name': find_associated_name(obj, key),
                            'linkedin_url': url,
                            'path': current_path,
                            'field': key,
                            'context': obj
                        })
                else:
                    results.extend(walk(value, current_path))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                current_path = f"{context_path}[{i}]" if context_path else f"[{i}]"
                results.extend(walk(item, current_path))
        elif isinstance(obj, str):
            # A bare string reached without a surrounding dict has no name.
            url = normalize_url(obj)
            if url:
                results.append({
                    'name': 'Unknown',
                    'linkedin_url': url,
                    'path': context_path,
                    'field': 'string_value',
                    'context': obj
                })

        return results

    # Deduplicate by LinkedIn URL, keeping the first occurrence.
    seen_urls: Set[str] = set()
    unique_results: List[Dict[str, Any]] = []
    for result in walk(data, path):
        url = result['linkedin_url']
        if url and url not in seen_urls:
            seen_urls.add(url)
            unique_results.append(result)

    return unique_results
def extract_linkedin_identifier(url: str) -> Optional[str]:
    """Return the slug of a LinkedIn URL, or None if it has no known shape.

    Recognises personal (/in/, /pub/), company (/company/) and school
    (/school/) profile URLs; the first matching pattern wins.
    """
    if not url:
        return None

    # Ordered URL shapes; first match wins.
    slug_patterns = (
        r'linkedin\.com/in/([^/?]+)',
        r'linkedin\.com/pub/([^/?]+)',
        r'linkedin\.com/company/([^/?]+)',
        r'linkedin\.com/school/([^/?]+)',
    )

    hit = next(
        (m for m in (re.search(p, url) for p in slug_patterns) if m),
        None,
    )
    if hit is None:
        return None

    # Defensive cleanup of trailing slash / query string remnants.
    return hit.group(1).rstrip('/').split('?')[0]
def create_final_linkedin_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create the final LinkedIn enrichment structure.

    Args:
        linkedin_data: Records as returned by ``deep_extract_linkedin_urls``
            (each has at least 'name', 'linkedin_url', 'path', 'field').

    Returns:
        Dict with extraction metadata, stats, and the profiles partitioned
        into ``personal_profiles`` vs ``company_profiles`` lists, plus the
        raw input records under ``all_profiles``.
    """
    personal_profiles: List[Dict[str, Any]] = []
    company_profiles: List[Dict[str, Any]] = []

    for item in linkedin_data:
        profile = {
            'name': item['name'],
            'linkedin_url': item['linkedin_url'],
            'linkedin_identifier': extract_linkedin_identifier(item['linkedin_url']),
            'extraction_path': item['path'],
            'field_found': item['field']
        }
        # /company/ URLs are organisations; everything else is a person.
        if '/company/' in item['linkedin_url']:
            company_profiles.append(profile)
        else:
            personal_profiles.append(profile)

    # BUG FIX: the original computed unique_identifiers from the *input*
    # records, which have no 'linkedin_identifier' key, so any non-empty
    # input raised KeyError.  Compute from the enriched profiles instead.
    enriched = personal_profiles + company_profiles
    unique_identifiers = {
        p['linkedin_identifier'] for p in enriched if p['linkedin_identifier']
    }

    enrichment = {
        # NOTE(review): naive local time with a literal 'Z' suffix, kept
        # byte-compatible with the original output; a true UTC timestamp
        # would be preferable — confirm before changing downstream parsers.
        'extraction_timestamp': datetime.now().isoformat() + 'Z',
        'extraction_method': 'deep_yaml_extraction_v2',
        'extraction_stats': {
            'total_profiles_found': len(linkedin_data),
            'personal_profiles': len(personal_profiles),
            'company_profiles': len(company_profiles),
            'unique_identifiers': len(unique_identifiers)
        },
        'personal_profiles': personal_profiles,
        'company_profiles': company_profiles,
        'all_profiles': linkedin_data
    }

    return enrichment
def main():
    """Run the full extraction pipeline for the Eye Filmmuseum YAML file.

    Loads the custodian YAML, deep-extracts all LinkedIn URLs, merges the
    enrichment back into the data, writes four output files (enriched YAML,
    profiles JSON, profiles CSV, extraction report JSON) next to the input,
    and prints a summary.  An alternative input path may be supplied as the
    first command-line argument (the hard-coded default is kept for
    backward compatibility).
    """
    import csv  # stdlib; used for properly quoted CSV output below

    # Path to Eye Filmmuseum file (overridable via argv for reuse).
    eye_file = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"
    )

    print("=" * 80)
    print("FINAL LINKEDIN EXTRACTION FOR EYE FILMMUSEUM")
    print("=" * 80)

    print(f"\nLoading data from: {eye_file}")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("\nPerforming deep extraction of LinkedIn URLs...")
    linkedin_data = deep_extract_linkedin_urls(eye_data)

    print(f"\n✓ Found {len(linkedin_data)} LinkedIn profiles!")

    # Show first few results
    print("\nFirst 10 profiles found:")
    for i, item in enumerate(linkedin_data[:10]):
        print(f" {i+1:2d}. {item['name']}")
        print(f" URL: {item['linkedin_url']}")
        print(f" Path: {item['path']}")
        print(f" Field: {item['field']}")
        print()

    if len(linkedin_data) > 10:
        print(f" ... and {len(linkedin_data) - 10} more profiles")

    # Create enrichment
    print("\nCreating enrichment structure...")
    enrichment = create_final_linkedin_enrichment(linkedin_data)

    # Add to existing data
    if 'linkedin_enrichment' not in eye_data:
        eye_data['linkedin_enrichment'] = {}

    # Merge with existing data
    eye_data['linkedin_enrichment']['final_extraction'] = enrichment
    eye_data['linkedin_enrichment']['extraction_notes'] = [
        f"Final deep LinkedIn extraction completed on {enrichment['extraction_timestamp']}",
        f"Total profiles found: {enrichment['extraction_stats']['total_profiles_found']}",
        f"Personal profiles: {enrichment['extraction_stats']['personal_profiles']}",
        f"Company profiles: {enrichment['extraction_stats']['company_profiles']}",
        "Deep extraction scans all YAML fields and nested structures",
        "Ready for API enrichment with Unipile when credentials available"
    ]

    # Update provenance
    if 'provenance' not in eye_data:
        eye_data['provenance'] = {}
    if 'notes' not in eye_data['provenance']:
        eye_data['provenance']['notes'] = []

    eye_data['provenance']['notes'].append(
        f"Final LinkedIn deep extraction on {enrichment['extraction_timestamp']}"
    )

    # Save enriched data
    output_file = eye_file.replace('.yaml', '_linkedin_final.yaml')
    print(f"\nSaving enriched data to: {output_file}")

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Save profiles-only file
    profiles_file = output_file.replace('.yaml', '_all_profiles.json')
    with open(profiles_file, 'w', encoding='utf-8') as f:
        json.dump({
            'extraction_timestamp': enrichment['extraction_timestamp'],
            'total_profiles': len(linkedin_data),
            'profiles': linkedin_data
        }, f, indent=2)

    # Create CSV for easy viewing.
    # BUG FIX: the original wrote raw comma-joined f-strings, which produced
    # a corrupt CSV whenever a name/path contained a comma or quote;
    # csv.writer applies correct quoting.
    csv_file = output_file.replace('.yaml', '_profiles.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "LinkedIn URL", "Identifier", "Type", "Path", "Field"])
        for item in linkedin_data:
            profile_type = 'company' if '/company/' in item['linkedin_url'] else 'personal'
            identifier = extract_linkedin_identifier(item['linkedin_url'])
            writer.writerow([
                item['name'], item['linkedin_url'], identifier,
                profile_type, item['path'], item['field']
            ])

    # Create summary report
    report = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'method': 'deep_yaml_extraction_v2',
        'stats': enrichment['extraction_stats'],
        'files_created': {
            'main_yaml': output_file,
            'profiles_json': profiles_file,
            'profiles_csv': csv_file
        }
    }

    report_file = output_file.replace('.yaml', '_extraction_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    print("\n" + "=" * 80)
    print("EXTRACTION COMPLETE!")
    print("=" * 80)
    print(f"Total LinkedIn profiles: {len(linkedin_data)}")
    print(f" - Personal profiles: {enrichment['extraction_stats']['personal_profiles']}")
    print(f" - Company profiles: {enrichment['extraction_stats']['company_profiles']}")
    print(f"\nFiles created:")
    print(f" 1. Main YAML: {output_file}")
    print(f" 2. Profiles JSON: {profiles_file}")
    print(f" 3. Profiles CSV: {csv_file}")
    print(f" 4. Report JSON: {report_file}")

    print("\n" + "=" * 80)
    print("READY FOR API ENRICHMENT")
    print("=" * 80)
    print("""
To enrich these profiles with detailed data using Unipile API:

1. Set up Unipile account:
 - Sign up: https://dashboard.unipile.com/signup
 - Connect your LinkedIn account via Hosted Auth
 - Get API key from dashboard

2. Set environment variables:
 export UNIPILE_API_KEY=your_api_key_here
 export UNIPILE_DSN=api1.unipile.com:13111

3. Run the enrichment script:
 python scripts/enrich_linkedin_with_api.py

This will fetch:
- Profile names, headlines, locations
- Connection counts
- Industry and summary information
- Work experience and education
- Skills and languages (if available)

The enriched data will be added back to the Eye Filmmuseum YAML file.
""")
# Entry point: run the extraction only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    main()