glam/scripts/linkedin_final_extraction.py
2025-12-10 13:01:13 +01:00

327 lines
No EOL
12 KiB
Python

#!/usr/bin/env python3
"""
FINAL LinkedIn extraction for Eye Filmmuseum.
This script performs deep extraction of ALL LinkedIn URLs from the complex YAML structure.
"""
import csv
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

import yaml
def deep_extract_linkedin_urls(data: Any, path: str = "") -> List[Dict[str, Any]]:
    """Recursively scan a nested YAML/JSON structure for LinkedIn URLs.

    Walks every dict value, list element and bare string in *data*, collects
    any string containing ``linkedin.com`` and normalises it to an
    ``https://`` URL.  Results are de-duplicated by URL, keeping the first
    occurrence found (which carries the most specific name and context).

    Args:
        data: Parsed structure of dicts, lists and scalars.
        path: Reserved root-path prefix; currently unused (kept for
            backward compatibility with existing callers).

    Returns:
        List of unique hits, each a dict with keys ``name`` (best-effort
        associated name, or ``'Unknown'``), ``linkedin_url``, ``path``
        (dotted/indexed location inside *data*), ``field`` and ``context``.
    """

    def normalize_url(value: Any) -> Optional[str]:
        """Return a normalised https URL if *value* is a LinkedIn string, else None."""
        if not (isinstance(value, str) and 'linkedin.com' in value):
            return None
        url = value.strip()
        if url.startswith('http'):
            return url
        if url.startswith('//'):  # protocol-relative URL
            return f"https:{url}"
        return f"https://{url}"

    def find_associated_name(obj: Dict[str, Any], field_key: str) -> str:
        """Best-effort lookup of a human-readable name in the dict holding a URL."""
        name_fields = (
            'name', 'full_name', 'staff_name', 'person_name',
            'title', 'label', 'organization', 'company',
        )
        for field in name_fields:
            if field in obj and isinstance(obj[field], str):
                return obj[field]
        # Fall back to a nested person_observed.name structure if present.
        if isinstance(obj, dict):
            person = obj.get('person_observed')
            if isinstance(person, dict) and 'name' in person:
                return person['name']
        return 'Unknown'

    def walk(obj: Any, context_path: str = "") -> List[Dict[str, Any]]:
        """Depth-first traversal collecting every LinkedIn hit."""
        results: List[Dict[str, Any]] = []
        if isinstance(obj, dict):
            for key, value in obj.items():
                current_path = f"{context_path}.{key}" if context_path else key
                # NOTE: a previous version also special-cased keys containing
                # 'linkedin' (or == 'url'); that check matched exactly the same
                # string values as the generic check below and only produced
                # duplicates that dedup removed, so it was dropped.
                if isinstance(value, str):
                    url = normalize_url(value)
                    if url:
                        results.append({
                            'name': find_associated_name(obj, key),
                            'linkedin_url': url,
                            'path': current_path,
                            'field': key,
                            'context': obj,
                        })
                # Recurse into nested structures.  A string value re-found by
                # the recursion is a duplicate and is removed by dedup below.
                results.extend(walk(value, current_path))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                current_path = f"{context_path}[{i}]" if context_path else f"[{i}]"
                results.extend(walk(item, current_path))
        elif isinstance(obj, str):
            url = normalize_url(obj)
            if url:
                results.append({
                    'name': 'Unknown',
                    'linkedin_url': url,
                    'path': context_path,
                    'field': 'string_value',
                    'context': obj,
                })
        return results

    # Deduplicate by URL, keeping the first (most contextual) occurrence.
    seen_urls: Set[str] = set()
    unique_results: List[Dict[str, Any]] = []
    for result in walk(data):
        url = result['linkedin_url']
        if url and url not in seen_urls:
            seen_urls.add(url)
            unique_results.append(result)
    return unique_results
def extract_linkedin_identifier(url: str) -> Optional[str]:
    """Return the slug (profile/company identifier) from a LinkedIn URL.

    Recognises ``/in/``, ``/pub/``, ``/company/`` and ``/school/`` URL
    forms, tried in that order; returns ``None`` for a falsy *url* or any
    URL that matches none of them.
    """
    if not url:
        return None
    for kind in ('in', 'pub', 'company', 'school'):
        match = re.search(rf'linkedin\.com/{kind}/([^/?]+)', url)
        if match:
            # Defensive cleanup; the char class already excludes '/' and '?'.
            return match.group(1).rstrip('/').split('?')[0]
    return None
def create_final_linkedin_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Build the final enrichment payload from extracted LinkedIn profiles.

    Args:
        linkedin_data: Items as produced by ``deep_extract_linkedin_urls``;
            each must carry 'name', 'linkedin_url', 'path' and 'field' keys.

    Returns:
        Dict with an extraction timestamp, method tag, summary stats, and
        the profiles split into personal vs company lists (classified by
        the presence of '/company/' in the URL).
    """
    personal_profiles: List[Dict[str, Any]] = []
    company_profiles: List[Dict[str, Any]] = []
    for item in linkedin_data:
        profile = {
            'name': item['name'],
            'linkedin_url': item['linkedin_url'],
            'linkedin_identifier': extract_linkedin_identifier(item['linkedin_url']),
            'extraction_path': item['path'],
            'field_found': item['field'],
        }
        if '/company/' in item['linkedin_url']:
            company_profiles.append(profile)
        else:
            personal_profiles.append(profile)

    all_profiles = personal_profiles + company_profiles
    return {
        # Use a real UTC timestamp; previously 'Z' was appended to local time,
        # mislabelling it as UTC.  Keep the trailing-'Z' format.
        'extraction_timestamp': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        'extraction_method': 'deep_yaml_extraction_v2',
        'extraction_stats': {
            'total_profiles_found': len(linkedin_data),
            'personal_profiles': len(personal_profiles),
            'company_profiles': len(company_profiles),
            # BUG FIX: this previously read p['linkedin_identifier'] from the
            # raw input items, which never carry that key (KeyError on any
            # non-empty input).  Count from the computed profiles instead.
            'unique_identifiers': len({
                p['linkedin_identifier'] for p in all_profiles if p['linkedin_identifier']
            }),
        },
        'personal_profiles': personal_profiles,
        'company_profiles': company_profiles,
        'all_profiles': linkedin_data,
    }
def main():
    """Run the full LinkedIn extraction pipeline for the Eye Filmmuseum file.

    Loads the source YAML, deep-extracts every LinkedIn URL, merges an
    enrichment structure back into the document, and writes four outputs
    next to the source file: enriched YAML, profiles JSON, profiles CSV
    and a JSON extraction report.  Prints a progress summary throughout.
    """
    # NOTE(review): hard-coded absolute path — consider an argv/env override.
    eye_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"
    print("=" * 80)
    print("FINAL LINKEDIN EXTRACTION FOR EYE FILMMUSEUM")
    print("=" * 80)
    print(f"\nLoading data from: {eye_file}")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("\nPerforming deep extraction of LinkedIn URLs...")
    linkedin_data = deep_extract_linkedin_urls(eye_data)
    print(f"\n✓ Found {len(linkedin_data)} LinkedIn profiles!")

    # Preview the first few hits so the operator can sanity-check them.
    print("\nFirst 10 profiles found:")
    for i, item in enumerate(linkedin_data[:10]):
        print(f" {i+1:2d}. {item['name']}")
        print(f" URL: {item['linkedin_url']}")
        print(f" Path: {item['path']}")
        print(f" Field: {item['field']}")
        print()
    if len(linkedin_data) > 10:
        print(f" ... and {len(linkedin_data) - 10} more profiles")

    print("\nCreating enrichment structure...")
    enrichment = create_final_linkedin_enrichment(linkedin_data)

    # Merge enrichment into the loaded document without clobbering existing data.
    eye_data.setdefault('linkedin_enrichment', {})
    eye_data['linkedin_enrichment']['final_extraction'] = enrichment
    eye_data['linkedin_enrichment']['extraction_notes'] = [
        f"Final deep LinkedIn extraction completed on {enrichment['extraction_timestamp']}",
        f"Total profiles found: {enrichment['extraction_stats']['total_profiles_found']}",
        f"Personal profiles: {enrichment['extraction_stats']['personal_profiles']}",
        f"Company profiles: {enrichment['extraction_stats']['company_profiles']}",
        "Deep extraction scans all YAML fields and nested structures",
        "Ready for API enrichment with Unipile when credentials available"
    ]
    # Record provenance of this run.
    eye_data.setdefault('provenance', {}).setdefault('notes', []).append(
        f"Final LinkedIn deep extraction on {enrichment['extraction_timestamp']}"
    )

    # 1) Enriched YAML (full document + enrichment).
    output_file = eye_file.replace('.yaml', '_linkedin_final.yaml')
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # 2) Profiles-only JSON.
    profiles_file = output_file.replace('.yaml', '_all_profiles.json')
    with open(profiles_file, 'w', encoding='utf-8') as f:
        json.dump({
            'extraction_timestamp': enrichment['extraction_timestamp'],
            'total_profiles': len(linkedin_data),
            'profiles': linkedin_data
        }, f, indent=2)

    # 3) CSV for easy viewing.  BUG FIX: use csv.writer so names/paths that
    # contain commas or quotes are escaped correctly (the previous hand-rolled
    # f.write produced malformed rows for such values).
    csv_file = output_file.replace('.yaml', '_profiles.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "LinkedIn URL", "Identifier", "Type", "Path", "Field"])
        for item in linkedin_data:
            profile_type = 'company' if '/company/' in item['linkedin_url'] else 'personal'
            writer.writerow([
                item['name'],
                item['linkedin_url'],
                extract_linkedin_identifier(item['linkedin_url']),
                profile_type,
                item['path'],
                item['field'],
            ])

    # 4) Summary report JSON.
    report = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'method': 'deep_yaml_extraction_v2',
        'stats': enrichment['extraction_stats'],
        'files_created': {
            'main_yaml': output_file,
            'profiles_json': profiles_file,
            'profiles_csv': csv_file
        }
    }
    report_file = output_file.replace('.yaml', '_extraction_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    print("\n" + "=" * 80)
    print("EXTRACTION COMPLETE!")
    print("=" * 80)
    print(f"Total LinkedIn profiles: {len(linkedin_data)}")
    print(f" - Personal profiles: {enrichment['extraction_stats']['personal_profiles']}")
    print(f" - Company profiles: {enrichment['extraction_stats']['company_profiles']}")
    print(f"\nFiles created:")
    print(f" 1. Main YAML: {output_file}")
    print(f" 2. Profiles JSON: {profiles_file}")
    print(f" 3. Profiles CSV: {csv_file}")
    print(f" 4. Report JSON: {report_file}")
    print("\n" + "=" * 80)
    print("READY FOR API ENRICHMENT")
    print("=" * 80)
    print("""
To enrich these profiles with detailed data using Unipile API:
1. Set up Unipile account:
- Sign up: https://dashboard.unipile.com/signup
- Connect your LinkedIn account via Hosted Auth
- Get API key from dashboard
2. Set environment variables:
export UNIPILE_API_KEY=your_api_key_here
export UNIPILE_DSN=api1.unipile.com:13111
3. Run the enrichment script:
python scripts/enrich_linkedin_with_api.py
This will fetch:
- Profile names, headlines, locations
- Connection counts
- Industry and summary information
- Work experience and education
- Skills and languages (if available)
The enriched data will be added back to the Eye Filmmuseum YAML file.
""")
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()