glam/scripts/linkedin_ultimate_extraction.py

#!/usr/bin/env python3
"""
ULTIMATE LinkedIn extraction for Eye Filmmuseum.
This script performs the most comprehensive extraction of ALL LinkedIn URLs.
"""

import os
import sys
import json
import yaml
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional, Set

def ultimate_extract_linkedin_urls(data: Any, path: str = "") -> List[Dict[str, Any]]:
    """Ultimate extraction of LinkedIn URLs from any data structure."""
    urls = []
    seen_urls = set()

    def extract_from_value(value: Any, context_path: str) -> Optional[str]:
        """Extract and normalize LinkedIn URL from a single value."""
        if isinstance(value, str):
            # Look for LinkedIn URLs in text
            import re as re_module
            # Find all LinkedIn URLs in the text
            linkedin_matches = re_module.findall(r'linkedin\.com/[^\s\)]+', value)

            for match in linkedin_matches:
                # Clean and normalize URL
                url = match.strip()
                if url.startswith('http'):
                    clean_url = url
                elif url.startswith('//'):
                    clean_url = f"https:{url}"
                else:
                    clean_url = f"https://{url}"

                if clean_url not in seen_urls:
                    seen_urls.add(clean_url)
                    return clean_url
        return None

    def extract_from_object(obj: Any, context_path: str = "") -> None:
        """Recursively extract LinkedIn URLs from object."""
        if isinstance(obj, dict):
            # Check all string values for LinkedIn URLs
            for key, value in obj.items():
                current_path = f"{context_path}.{key}" if context_path else key

                # Direct LinkedIn URL fields
                if 'linkedin' in key.lower() and isinstance(value, str):
                    url = extract_from_value(value, current_path)
                    if url:
                        name = find_name_in_context(obj, key)
                        urls.append({
                            'name': name,
                            'linkedin_url': url,
                            'path': current_path,
                            'field': key,
                            'context': obj
                        })

                # Check any string value for LinkedIn URLs
                url = extract_from_value(value, current_path)
                if url:
                    name = find_name_in_context(obj, key)
                    urls.append({
                        'name': name,
                        'linkedin_url': url,
                        'path': current_path,
                        'field': key,
                        'context': obj
                    })

                # Recurse into nested structures
                extract_from_object(value, current_path)

        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                current_path = f"{context_path}[{i}]" if context_path else f"[{i}]"
                extract_from_object(item, current_path)

        elif isinstance(obj, str):
            # Check for LinkedIn URLs in standalone strings
            url = extract_from_value(obj, path)
            if url:
                urls.append({
                    'name': 'Unknown',
                    'linkedin_url': url,
                    'path': path,
                    'field': 'string_value',
                    'context': obj
                })

    def find_name_in_context(obj: Dict, field_key: str) -> str:
        """Find the most relevant name for a LinkedIn URL."""
        # Try various name fields
        name_fields = [
            'name', 'full_name', 'staff_name', 'person_name',
            'title', 'label', 'organization', 'company'
        ]

        for field in name_fields:
            if field in obj and isinstance(obj[field], str) and obj[field].strip():
                return obj[field].strip()

        # Check parent objects for names
        current_parts = path.split('.')
        for i in range(len(current_parts), 0, -1):
            parent_path = '.'.join(current_parts[:i])
            # Navigate up the structure
            parent = obj
            for part in current_parts[:i]:
                if isinstance(parent, dict) and part in parent:
                    parent = parent[part]
                elif isinstance(parent, list) and part.isdigit() and int(part) < len(parent):
                    parent = parent[int(part)]
                else:
                    break

            if isinstance(parent, dict):
                for field in name_fields:
                    if field in parent and isinstance(parent[field], str) and parent[field].strip():
                        return parent[field].strip()

        return 'Unknown'

    # Start extraction
    extract_from_object(data)

    return urls

def create_ultimate_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create the ultimate LinkedIn enrichment structure."""

    # Categorize profiles
    personal_profiles = []
    company_profiles = []
    unknown_profiles = []

    for item in linkedin_data:
        profile = {
            'name': item.get('name', 'Unknown'),
            'linkedin_url': item.get('linkedin_url'),
            'path': item.get('path'),
            'field': item.get('field')
        }

        if '/company/' in item['linkedin_url']:
            company_profiles.append(profile)
        elif item['name'] != 'Unknown':
            personal_profiles.append(profile)
        else:
            unknown_profiles.append(profile)

    enrichment = {
        'extraction_timestamp': datetime.now().isoformat() + 'Z',
        'extraction_method': 'ultimate_deep_extraction_v3',
        'extraction_stats': {
            'total_profiles': len(linkedin_data),
            'personal_profiles': len(personal_profiles),
            'company_profiles': len(company_profiles),
            'unknown_profiles': len(unknown_profiles),
            'high_confidence': len([p for p in linkedin_data if p['name'] != 'Unknown']),
            'medium_confidence': len([p for p in linkedin_data if p['name'] == 'Unknown'])
        },
        'profiles_by_category': {
            'personal': personal_profiles,
            'company': company_profiles,
            'unknown': unknown_profiles
        },
        'all_raw_data': linkedin_data
    }

    return enrichment

def main():
    """Main function."""
    # Path to Eye Filmmuseum file
    eye_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"

    print("=" * 80)
    print("ULTIMATE LINKEDIN EXTRACTION FOR EYE FILMMUSEUM")
    print("=" * 80)

    print(f"\nLoading data from: {eye_file}")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("\nPerforming ultimate deep extraction of ALL LinkedIn URLs...")
    linkedin_data = ultimate_extract_linkedin_urls(eye_data)

    print(f"\n✓ Found {len(linkedin_data)} LinkedIn profiles!")

    # Show breakdown by category
    personal = sum(1 for item in linkedin_data if '/company/' not in item['linkedin_url'] and item['name'] != 'Unknown')
    company = sum(1 for item in linkedin_data if '/company/' in item['linkedin_url'])
    unknown = sum(1 for item in linkedin_data if item['name'] == 'Unknown')

    print(f"  - Personal profiles: {personal}")
    print(f"  - Company profiles: {company}")
    print(f"  - Unknown names: {unknown}")

    # Show first 15 results
    print("\nFirst 15 profiles found:")
    for i, item in enumerate(linkedin_data[:15]):
        print(f"  {i+1:2d}. {item.get('name', 'Unknown')}")
        print(f"      URL: {item.get('linkedin_url', 'N/A')}")
        print(f"      Path: {item.get('path', 'N/A')}")
        print(f"      Field: {item.get('field', 'N/A')}")
        print()

    if len(linkedin_data) > 15:
        print(f"  ... and {len(linkedin_data) - 15} more")

    # Create enrichment
    print("\nCreating ultimate enrichment structure...")
    enrichment = create_ultimate_enrichment(linkedin_data)

    # Add to existing data
    if 'linkedin_enrichment' not in eye_data:
        eye_data['linkedin_enrichment'] = {}

    # Merge with existing data
    eye_data['linkedin_enrichment']['ultimate_extraction'] = enrichment
    eye_data['linkedin_enrichment']['extraction_notes'] = [
        f"Ultimate LinkedIn extraction completed on {enrichment['extraction_timestamp']}",
        f"Total profiles found: {enrichment['extraction_stats']['total_profiles']}",
        f"Personal profiles: {enrichment['extraction_stats']['personal_profiles']}",
        f"Company profiles: {enrichment['extraction_stats']['company_profiles']}",
        "Deep extraction scans ALL YAML fields including conservators, volunteers, interns",
        "Ready for API enrichment with Unipile when credentials are available"
    ]

    # Update provenance
    if 'provenance' not in eye_data:
        eye_data['provenance'] = {}
    if 'notes' not in eye_data['provenance']:
        eye_data['provenance']['notes'] = []

    eye_data['provenance']['notes'].append(
        f"Ultimate LinkedIn deep extraction on {enrichment['extraction_timestamp']}"
    )

    # Save enriched data
    output_file = eye_file.replace('.yaml', '_linkedin_ultimate.yaml')
    print(f"\nSaving enriched data to: {output_file}")

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Save profiles-only files
    profiles_file = output_file.replace('.yaml', '_all_profiles.json')
    with open(profiles_file, 'w', encoding='utf-8') as f:
        json.dump({
            'extraction_timestamp': enrichment['extraction_timestamp'],
            'total_profiles': len(linkedin_data),
            'profiles': linkedin_data
        }, f, indent=2)

    # Create comprehensive CSV
    csv_file = output_file.replace('.yaml', '_profiles_ultimate.csv')
    with open(csv_file, 'w', encoding='utf-8') as f:
        f.write("Name,LinkedIn URL,Type,Path,Field,Confidence\n")
        for item in linkedin_data:
            profile_type = 'company' if '/company/' in item['linkedin_url'] else 'personal'
            confidence = 'high' if item.get('name', 'Unknown') != 'Unknown' else 'medium'
            f.write(f"{item.get('name', 'Unknown')},{item.get('linkedin_url', 'N/A')},{profile_type},{item.get('path', 'N/A')},{item.get('field', 'N/A')},{confidence}\n")

    # Create detailed report
    report = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'method': 'ultimate_deep_extraction_v3',
        'stats': enrichment['extraction_stats'],
        'files_created': {
            'main_yaml': output_file,
            'profiles_json': profiles_file,
            'profiles_csv': csv_file
        },
        'sample_profiles': linkedin_data[:20]  # First 20 as sample
    }

    report_file = output_file.replace('.yaml', '_ultimate_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    print("\n" + "=" * 80)
    print("ULTIMATE EXTRACTION COMPLETE!")
    print("=" * 80)
    print(f"Total LinkedIn profiles: {len(linkedin_data)}")
    print(f"  - Personal: {enrichment['extraction_stats']['personal_profiles']}")
    print(f"  - Company: {enrichment['extraction_stats']['company_profiles']}")
    print(f"  - Unknown: {enrichment['extraction_stats']['unknown_profiles']}")
    print(f"\nFiles created:")
    print(f"  1. Main YAML: {output_file}")
    print(f"  2. Profiles JSON: {profiles_file}")
    print(f"  3. Profiles CSV: {csv_file}")
    print(f"  4. Report JSON: {report_file}")

    print("\n" + "=" * 80)
    print("READY FOR API ENRICHMENT")
    print("=" * 80)
    print("""
To enrich these profiles with detailed data using Unipile API:

1. Set up Unipile account:
   - Sign up: https://dashboard.unipile.com/signup
   - Connect your LinkedIn account via Hosted Auth
   - Get API key from dashboard

2. Set environment variables:
   export UNIPILE_API_KEY=your_api_key_here
   export UNIPILE_DSN=api1.unipile.com:13111

3. Run enrichment script:
   python scripts/enrich_linkedin_ultimate.py

This will fetch comprehensive profile data including:
- Full name and professional headline
- Location and industry
- Summary and about section
- Connection count and follower count
- Work experience history
- Education background
- Skills and languages
- Profile image URL

The enriched data will be seamlessly integrated into the Eye Filmmuseum YAML.
""")

if __name__ == "__main__":
    main()