glam/scripts/linkedin_comprehensive_extraction.py
2025-12-10 13:01:13 +01:00

397 lines
No EOL
15 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive LinkedIn enrichment for Eye Filmmuseum.
This script extracts all LinkedIn data and creates a structured enrichment section.
"""
import csv
import json
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml
def extract_all_linkedin_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """Extract all LinkedIn-related data from Eye Filmmuseum YAML.

    Walks the known staff/role sections of the parsed YAML mapping, the
    ``linkedin_enrichment`` company block, and every nested ``foaf_knows``
    list, collecting each entry that carries a ``linkedin_url``.

    Args:
        data: Parsed YAML document (top-level mapping).

    Returns:
        Dict keyed by section name. Staff sections map to lists of info
        dicts; ``company_page`` is a single dict (empty if absent);
        ``foaf_knows`` is a flat list of every acquaintance with a URL.
    """
    sections = [
        'management',
        'board_of_trustees',
        'department_heads',
        'former_directors',
        'chief_curator',
        'collection_specialists',
        'curators',
        'archivists_and_film_specialists',
        'programmers',
        'pico_staff',
        'deceased_staff',
    ]
    linkedin_data: Dict[str, Any] = {section: [] for section in sections}
    linkedin_data['company_page'] = {}
    linkedin_data['foaf_knows'] = []

    def extract_linkedin_info(obj, path=""):
        """Return info for the first LinkedIn URL found in obj (depth-first).

        A direct ``linkedin_url`` on the object itself takes priority over
        URLs found in its ``foaf_knows`` list or in nested values (previously
        a foaf/nested hit could overwrite the person's own URL).
        """
        if isinstance(obj, dict):
            if 'linkedin_url' in obj:
                return {
                    'linkedin_url': obj['linkedin_url'],
                    # Fall back to the observed person's name, then 'Unknown'.
                    'name': obj.get('name', obj.get('person_observed', {}).get('name', 'Unknown')),
                    'path': path,
                }
            if isinstance(obj.get('foaf_knows'), list):
                for foaf in obj['foaf_knows']:
                    if isinstance(foaf, dict) and 'linkedin_url' in foaf:
                        return {
                            'linkedin_url': foaf['linkedin_url'],
                            'name': foaf.get('name', foaf.get('type', 'Unknown')),
                            'path': f"{path}.foaf_knows",
                            'role': foaf.get('type'),
                            'organization': foaf.get('organization'),
                            'relationship': foaf.get('relationship'),
                            'wikidata_id': foaf.get('wikidata_id'),
                        }
            # Depth-first search of the remaining nested values.
            for key, value in obj.items():
                if key not in ('linkedin_url', 'foaf_knows'):
                    nested = extract_linkedin_info(value, f"{path}.{key}" if path else key)
                    if nested:
                        return nested
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                nested = extract_linkedin_info(item, f"{path}[{i}]" if path else f"[{i}]")
                if nested:
                    return nested
        return {}

    # Extract from each staff/role section; sections may be mappings
    # (keyed entries) or lists (indexed entries).
    for section in sections:
        section_data = data.get(section, [])
        if isinstance(section_data, dict):
            for key, value in section_data.items():
                info = extract_linkedin_info(value, f"{section}.{key}")
                if info:
                    info['section_key'] = key
                    linkedin_data[section].append(info)
        elif isinstance(section_data, list):
            for i, item in enumerate(section_data):
                info = extract_linkedin_info(item, f"{section}[{i}]")
                if info:
                    info['section_index'] = i
                    linkedin_data[section].append(info)

    # Extract company page LinkedIn info from an earlier enrichment pass.
    if 'linkedin_enrichment' in data:
        company_data = data['linkedin_enrichment']
        if 'company_linkedin_url' in company_data:
            linkedin_data['company_page'] = {
                'linkedin_url': company_data['company_linkedin_url'],
                'employee_count': company_data.get('company_stats', {}).get('employee_count_linkedin'),
                'source': 'linkedin_enrichment.company_linkedin_url',
            }

    def extract_foaf_with_linkedin(obj, path=""):
        """Collect every foaf_knows entry anywhere in obj that has a URL."""
        results = []
        if isinstance(obj, dict):
            if isinstance(obj.get('foaf_knows'), list):
                for foaf in obj['foaf_knows']:
                    if isinstance(foaf, dict) and 'linkedin_url' in foaf:
                        results.append({
                            'name': foaf.get('name'),
                            'type': foaf.get('type'),
                            'organization': foaf.get('organization'),
                            'relationship': foaf.get('relationship'),
                            'linkedin_url': foaf['linkedin_url'],
                            'wikidata_id': foaf.get('wikidata_id'),
                            'path': f"{path}.foaf_knows",
                        })
            for key, value in obj.items():
                results.extend(extract_foaf_with_linkedin(value, f"{path}.{key}" if path else key))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                results.extend(extract_foaf_with_linkedin(item, f"{path}[{i}]" if path else f"[{i}]"))
        return results

    linkedin_data['foaf_knows'] = extract_foaf_with_linkedin(data)
    return linkedin_data
def create_linkedin_identifier(url: str) -> Optional[str]:
    """Return the profile slug from a LinkedIn URL, or None if not found.

    Recognizes both ``linkedin.com/in/<slug>`` and ``linkedin.com/pub/<slug>``
    style profile URLs.
    """
    if not url:
        return None
    for segment in ('in', 'pub'):
        match = re.search(rf'linkedin\.com/{segment}/([^/?]+)', url)
        if match:
            # Defensive cleanup of any trailing slash or query remnants.
            return match.group(1).rstrip('/').split('?')[0]
    return None
def create_comprehensive_linkedin_enrichment(linkedin_data: Dict[str, Any]) -> Dict[str, Any]:
    """Create comprehensive LinkedIn enrichment structure.

    Aggregates the per-section extraction results into one summary dict
    with totals, a deduplicated (sorted) URL list, and per-section
    breakdowns.

    Args:
        linkedin_data: Output of ``extract_all_linkedin_data``.

    Returns:
        Enrichment dict ready to be merged into the YAML document.
    """
    enrichment: Dict[str, Any] = {
        # Timezone-aware UTC timestamp; previously local time was
        # incorrectly labelled with the 'Z' (UTC) designator.
        'extraction_timestamp': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        'extraction_method': 'comprehensive_yaml_extraction',
        'total_sections_with_linkedin': 0,
        'total_linkedin_urls': 0,
        'unique_linkedin_urls': set(),
        'sections': {},
    }
    for section_name, items in linkedin_data.items():
        if section_name == 'company_page':
            if items:
                enrichment['sections'][section_name] = items
                enrichment['total_linkedin_urls'] += 1
                enrichment['unique_linkedin_urls'].add(items['linkedin_url'])
                enrichment['total_sections_with_linkedin'] += 1
        elif section_name == 'foaf_knows':
            # Group the acquaintance network by declared type/role.
            foaf_by_type = defaultdict(list)
            for item in items:
                foaf_by_type[item.get('type')].append(item)
            enrichment['sections'][section_name] = {
                'total_entries': len(items),
                'by_type': dict(foaf_by_type),
                'sample_entries': items[:10],  # first 10 as a sample
            }
            enrichment['total_linkedin_urls'] += len(items)
            for item in items:
                enrichment['unique_linkedin_urls'].add(item['linkedin_url'])
            if items:
                enrichment['total_sections_with_linkedin'] += 1
        elif items:
            # Regular staff/role sections.
            processed_items = []
            for item in items:
                url = item.get('linkedin_url')
                processed_items.append({
                    'name': item.get('name'),
                    'linkedin_url': url,
                    'linkedin_identifier': create_linkedin_identifier(url),
                    'path': item.get('path'),
                    'role': item.get('role'),
                    'section_key': item.get('section_key'),
                    'section_index': item.get('section_index'),
                })
                if url:  # don't pollute the unique set with None
                    enrichment['unique_linkedin_urls'].add(url)
            enrichment['sections'][section_name] = {
                'total_entries': len(processed_items),
                'entries': processed_items,
            }
            enrichment['total_linkedin_urls'] += len(items)
            enrichment['total_sections_with_linkedin'] += 1
    # Freeze the set into a deterministic, YAML-serializable list.
    enrichment['unique_linkedin_urls_count'] = len(enrichment['unique_linkedin_urls'])
    enrichment['unique_linkedin_urls'] = sorted(enrichment['unique_linkedin_urls'])
    return enrichment
def main():
    """Run the full extraction pipeline for the Eye Filmmuseum file.

    Loads the YAML, extracts all LinkedIn data, merges an enrichment
    section back in, and writes three outputs: the enriched YAML, a
    profiles-only JSON, and a CSV summary.
    """
    # Path to Eye Filmmuseum file (hard-coded to the local checkout).
    eye_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"
    print("=" * 70)
    print("COMPREHENSIVE LINKEDIN ENRICHMENT FOR EYE FILMMUSEUM")
    print("=" * 70)
    print(f"\nLoading Eye Filmmuseum data from: {eye_file}")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("\nExtracting all LinkedIn data...")
    linkedin_data = extract_all_linkedin_data(eye_data)

    # Create comprehensive enrichment
    print("\nCreating comprehensive LinkedIn enrichment...")
    enrichment = create_comprehensive_linkedin_enrichment(linkedin_data)

    # Print summary
    print("\n" + "=" * 50)
    print("LINKEDIN DATA SUMMARY")
    print("=" * 50)
    print(f"Total sections with LinkedIn data: {enrichment['total_sections_with_linkedin']}")
    print(f"Total LinkedIn URLs found: {enrichment['total_linkedin_urls']}")
    print(f"Unique LinkedIn URLs: {enrichment['unique_linkedin_urls_count']}")
    print("\nBreakdown by section:")
    for section, data in enrichment['sections'].items():
        if section == 'foaf_knows':
            print(f"\n{section.upper()}:")
            print(f"  Total entries: {data['total_entries']}")
            print("  By type:")
            for type_name, items in data['by_type'].items():
                print(f"    - {type_name}: {len(items)}")
            if data['sample_entries']:
                print("  Sample entries:")
                for item in data['sample_entries'][:3]:
                    print(f"    - {item['name']} ({item.get('type', 'Unknown')})")
        elif isinstance(data, dict) and 'total_entries' in data:
            print(f"\n{section.upper()}: {data['total_entries']} entries")
            if data['entries']:
                for item in data['entries'][:3]:
                    print(f"  - {item['name']}")
        elif isinstance(data, dict):
            # company_page has no 'total_entries' key — it is a single entry.
            print(f"\n{section.upper()}: 1 entry")
            if 'linkedin_url' in data:
                print(f"  - Company page: {data['linkedin_url']}")

    # Merge the new extraction into any existing LinkedIn enrichment.
    if 'linkedin_enrichment' not in eye_data:
        eye_data['linkedin_enrichment'] = {}
    existing = eye_data['linkedin_enrichment']
    existing.update({
        'comprehensive_extraction': enrichment,
        'extraction_notes': [
            f"Comprehensive LinkedIn extraction completed on {enrichment['extraction_timestamp']}",
            f"Found {enrichment['total_linkedin_urls']} LinkedIn URLs across {enrichment['total_sections_with_linkedin']} sections",
            f"Unique LinkedIn profiles: {enrichment['unique_linkedin_urls_count']}",
            "Data ready for API enrichment with Unipile when credentials are available",
            "Extraction includes: management, board, staff, curators, foaf_knows network"
        ]
    })

    # Update provenance with a note about this run.
    if 'provenance' not in eye_data:
        eye_data['provenance'] = {}
    if 'notes' not in eye_data['provenance']:
        eye_data['provenance']['notes'] = []
    eye_data['provenance']['notes'].append(
        f"Comprehensive LinkedIn extraction on {enrichment['extraction_timestamp']}"
    )

    # Save enriched data alongside the original (never overwrite it).
    output_file = eye_file.replace('.yaml', '_linkedin_comprehensive.yaml')
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # Save separate LinkedIn profiles file for easy access.
    profiles_file = output_file.replace('.yaml', '_profiles_only.json')
    profiles = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'total_profiles': enrichment['unique_linkedin_urls_count'],
        'profiles': []
    }
    # Collect all profile dicts from every section (lists) plus company page.
    all_profiles = []
    for section_data in linkedin_data.values():
        if isinstance(section_data, list):
            all_profiles.extend(section_data)
        elif isinstance(section_data, dict) and 'linkedin_url' in section_data:
            all_profiles.append(section_data)
    # Deduplicate by LinkedIn URL (first occurrence wins).
    seen_urls = set()
    unique_profiles = []
    for profile in all_profiles:
        url = profile.get('linkedin_url')
        if url and url not in seen_urls:
            seen_urls.add(url)
            unique_profiles.append({
                'name': profile.get('name'),
                'linkedin_url': url,
                'linkedin_identifier': create_linkedin_identifier(url),
                # First dotted path segment identifies the source section.
                'section': profile.get('path', '').split('.')[0] if profile.get('path') else 'unknown'
            })
    profiles['profiles'] = unique_profiles
    with open(profiles_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps accented names readable in the JSON.
        json.dump(profiles, f, indent=2, ensure_ascii=False)
    print(f"\nLinkedIn profiles saved to: {profiles_file}")

    # Create CSV for easy viewing. csv.writer quotes fields that contain
    # commas — the previous hand-rolled f.write produced broken rows for
    # names like "Doe, Jane".
    csv_file = output_file.replace('.yaml', '_profiles.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "LinkedIn URL", "Identifier", "Section", "Path"])
        for profile in unique_profiles:
            writer.writerow([
                profile['name'],
                profile['linkedin_url'],
                profile['linkedin_identifier'],
                profile['section'],
                # NOTE(review): unique_profiles entries carry no 'path' key,
                # so this column is always empty; kept for format stability.
                profile.get('path', ''),
            ])
    print(f"CSV saved to: {csv_file}")

    print("\n" + "=" * 70)
    print("ENRICHMENT COMPLETE!")
    print("=" * 70)
    print(f"Total unique LinkedIn profiles: {len(unique_profiles)}")
    print(f"Main enriched file: {output_file}")
    print(f"Profiles-only JSON: {profiles_file}")
    print(f"Profiles CSV: {csv_file}")

    # Instructions for next steps
    print("\n" + "=" * 70)
    print("NEXT STEPS FOR API ENRICHMENT")
    print("=" * 70)
    print("""
To enrich these profiles with Unipile API:
1. Set up Unipile account:
- Sign up: https://dashboard.unipile.com/signup
- Connect LinkedIn account via Hosted Auth
- Get API key from dashboard
2. Set environment variables:
export UNIPILE_API_KEY=your_api_key_here
export UNIPILE_DSN=api1.unipile.com:13111
3. Run enrichment script:
python scripts/enrich_linkedin_profiles_unipile.py
This will fetch detailed profile information for each LinkedIn URL
including: name, headline, location, industry, summary, connection count.
""")


if __name__ == "__main__":
    main()