glam/scripts/linkedin_final_extraction.py
2025-12-10 13:01:13 +01:00

327 lines
No EOL
12 KiB
Python

#!/usr/bin/env python3
"""
FINAL LinkedIn extraction for Eye Filmmuseum.
This script performs deep extraction of ALL LinkedIn URLs from the complex YAML structure.
"""
import csv
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

import yaml
def deep_extract_linkedin_urls(data: Any, path: str = "") -> List[Dict[str, Any]]:
    """Recursively scan a nested YAML/JSON structure for LinkedIn URLs.

    Walks every dict value, list element and bare string in *data*, collects
    any string containing ``linkedin.com`` and normalises it to an
    ``https://`` URL.  Results are de-duplicated by URL, keeping the first
    occurrence found (which carries the most specific name and context).

    Args:
        data: Parsed structure of dicts, lists and scalars.
        path: Reserved root-path prefix; currently unused (kept for
            backward compatibility with existing callers).

    Returns:
        List of unique hits, each a dict with keys ``name`` (best-effort
        associated name, or ``'Unknown'``), ``linkedin_url``, ``path``
        (dotted/indexed location inside *data*), ``field`` and ``context``.
    """

    def normalize_url(value: Any) -> Optional[str]:
        """Return a normalised https URL if *value* is a LinkedIn string, else None."""
        if not (isinstance(value, str) and 'linkedin.com' in value):
            return None
        url = value.strip()
        if url.startswith('http'):
            return url
        if url.startswith('//'):  # protocol-relative URL
            return f"https:{url}"
        return f"https://{url}"

    def find_associated_name(obj: Dict[str, Any], field_key: str) -> str:
        """Best-effort lookup of a human-readable name in the dict holding a URL."""
        name_fields = (
            'name', 'full_name', 'staff_name', 'person_name',
            'title', 'label', 'organization', 'company',
        )
        for field in name_fields:
            if field in obj and isinstance(obj[field], str):
                return obj[field]
        # Fall back to a nested person_observed.name structure if present.
        if isinstance(obj, dict):
            person = obj.get('person_observed')
            if isinstance(person, dict) and 'name' in person:
                return person['name']
        return 'Unknown'

    def walk(obj: Any, context_path: str = "") -> List[Dict[str, Any]]:
        """Depth-first traversal collecting every LinkedIn hit."""
        results: List[Dict[str, Any]] = []
        if isinstance(obj, dict):
            for key, value in obj.items():
                current_path = f"{context_path}.{key}" if context_path else key
                # NOTE: a previous version also special-cased keys containing
                # 'linkedin' (or == 'url'); that check matched exactly the same
                # string values as the generic check below and only produced
                # duplicates that dedup removed, so it was dropped.
                if isinstance(value, str):
                    url = normalize_url(value)
                    if url:
                        results.append({
                            'name': find_associated_name(obj, key),
                            'linkedin_url': url,
                            'path': current_path,
                            'field': key,
                            'context': obj,
                        })
                # Recurse into nested structures.  A string value re-found by
                # the recursion is a duplicate and is removed by dedup below.
                results.extend(walk(value, current_path))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                current_path = f"{context_path}[{i}]" if context_path else f"[{i}]"
                results.extend(walk(item, current_path))
        elif isinstance(obj, str):
            url = normalize_url(obj)
            if url:
                results.append({
                    'name': 'Unknown',
                    'linkedin_url': url,
                    'path': context_path,
                    'field': 'string_value',
                    'context': obj,
                })
        return results

    # Deduplicate by URL, keeping the first (most contextual) occurrence.
    seen_urls: Set[str] = set()
    unique_results: List[Dict[str, Any]] = []
    for result in walk(data):
        url = result['linkedin_url']
        if url and url not in seen_urls:
            seen_urls.add(url)
            unique_results.append(result)
    return unique_results
def extract_linkedin_identifier(url: str) -> Optional[str]:
    """Return the slug (profile/company identifier) from a LinkedIn URL.

    Recognises ``/in/``, ``/pub/``, ``/company/`` and ``/school/`` URL
    forms, tried in that order; returns ``None`` for a falsy *url* or any
    URL that matches none of them.
    """
    if not url:
        return None
    for kind in ('in', 'pub', 'company', 'school'):
        match = re.search(rf'linkedin\.com/{kind}/([^/?]+)', url)
        if match:
            # Defensive cleanup; the char class already excludes '/' and '?'.
            return match.group(1).rstrip('/').split('?')[0]
    return None
def create_final_linkedin_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Build the final enrichment payload from extracted LinkedIn profiles.

    Args:
        linkedin_data: Items as produced by ``deep_extract_linkedin_urls``;
            each must carry 'name', 'linkedin_url', 'path' and 'field' keys.

    Returns:
        Dict with an extraction timestamp, method tag, summary stats, and
        the profiles split into personal vs company lists (classified by
        the presence of '/company/' in the URL).
    """
    personal_profiles: List[Dict[str, Any]] = []
    company_profiles: List[Dict[str, Any]] = []
    for item in linkedin_data:
        profile = {
            'name': item['name'],
            'linkedin_url': item['linkedin_url'],
            'linkedin_identifier': extract_linkedin_identifier(item['linkedin_url']),
            'extraction_path': item['path'],
            'field_found': item['field'],
        }
        if '/company/' in item['linkedin_url']:
            company_profiles.append(profile)
        else:
            personal_profiles.append(profile)

    all_profiles = personal_profiles + company_profiles
    return {
        # Use a real UTC timestamp; previously 'Z' was appended to local time,
        # mislabelling it as UTC.  Keep the trailing-'Z' format.
        'extraction_timestamp': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        'extraction_method': 'deep_yaml_extraction_v2',
        'extraction_stats': {
            'total_profiles_found': len(linkedin_data),
            'personal_profiles': len(personal_profiles),
            'company_profiles': len(company_profiles),
            # BUG FIX: this previously read p['linkedin_identifier'] from the
            # raw input items, which never carry that key (KeyError on any
            # non-empty input).  Count from the computed profiles instead.
            'unique_identifiers': len({
                p['linkedin_identifier'] for p in all_profiles if p['linkedin_identifier']
            }),
        },
        'personal_profiles': personal_profiles,
        'company_profiles': company_profiles,
        'all_profiles': linkedin_data,
    }
def main():
    """Run the full LinkedIn extraction pipeline for the Eye Filmmuseum file.

    Loads the source YAML, deep-extracts every LinkedIn URL, merges an
    enrichment structure back into the document, and writes four outputs
    next to the source file: enriched YAML, profiles JSON, profiles CSV
    and a JSON extraction report.  Prints a progress summary throughout.
    """
    # NOTE(review): hard-coded absolute path — consider an argv/env override.
    eye_file = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml"
    print("=" * 80)
    print("FINAL LINKEDIN EXTRACTION FOR EYE FILMMUSEUM")
    print("=" * 80)
    print(f"\nLoading data from: {eye_file}")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("\nPerforming deep extraction of LinkedIn URLs...")
    linkedin_data = deep_extract_linkedin_urls(eye_data)
    print(f"\n✓ Found {len(linkedin_data)} LinkedIn profiles!")

    # Preview the first few hits so the operator can sanity-check them.
    print("\nFirst 10 profiles found:")
    for i, item in enumerate(linkedin_data[:10]):
        print(f" {i+1:2d}. {item['name']}")
        print(f" URL: {item['linkedin_url']}")
        print(f" Path: {item['path']}")
        print(f" Field: {item['field']}")
        print()
    if len(linkedin_data) > 10:
        print(f" ... and {len(linkedin_data) - 10} more profiles")

    print("\nCreating enrichment structure...")
    enrichment = create_final_linkedin_enrichment(linkedin_data)

    # Merge enrichment into the loaded document without clobbering existing data.
    eye_data.setdefault('linkedin_enrichment', {})
    eye_data['linkedin_enrichment']['final_extraction'] = enrichment
    eye_data['linkedin_enrichment']['extraction_notes'] = [
        f"Final deep LinkedIn extraction completed on {enrichment['extraction_timestamp']}",
        f"Total profiles found: {enrichment['extraction_stats']['total_profiles_found']}",
        f"Personal profiles: {enrichment['extraction_stats']['personal_profiles']}",
        f"Company profiles: {enrichment['extraction_stats']['company_profiles']}",
        "Deep extraction scans all YAML fields and nested structures",
        "Ready for API enrichment with Unipile when credentials available"
    ]
    # Record provenance of this run.
    eye_data.setdefault('provenance', {}).setdefault('notes', []).append(
        f"Final LinkedIn deep extraction on {enrichment['extraction_timestamp']}"
    )

    # 1) Enriched YAML (full document + enrichment).
    output_file = eye_file.replace('.yaml', '_linkedin_final.yaml')
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # 2) Profiles-only JSON.
    profiles_file = output_file.replace('.yaml', '_all_profiles.json')
    with open(profiles_file, 'w', encoding='utf-8') as f:
        json.dump({
            'extraction_timestamp': enrichment['extraction_timestamp'],
            'total_profiles': len(linkedin_data),
            'profiles': linkedin_data
        }, f, indent=2)

    # 3) CSV for easy viewing.  BUG FIX: use csv.writer so names/paths that
    # contain commas or quotes are escaped correctly (the previous hand-rolled
    # f.write produced malformed rows for such values).
    csv_file = output_file.replace('.yaml', '_profiles.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "LinkedIn URL", "Identifier", "Type", "Path", "Field"])
        for item in linkedin_data:
            profile_type = 'company' if '/company/' in item['linkedin_url'] else 'personal'
            writer.writerow([
                item['name'],
                item['linkedin_url'],
                extract_linkedin_identifier(item['linkedin_url']),
                profile_type,
                item['path'],
                item['field'],
            ])

    # 4) Summary report JSON.
    report = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'method': 'deep_yaml_extraction_v2',
        'stats': enrichment['extraction_stats'],
        'files_created': {
            'main_yaml': output_file,
            'profiles_json': profiles_file,
            'profiles_csv': csv_file
        }
    }
    report_file = output_file.replace('.yaml', '_extraction_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    print("\n" + "=" * 80)
    print("EXTRACTION COMPLETE!")
    print("=" * 80)
    print(f"Total LinkedIn profiles: {len(linkedin_data)}")
    print(f" - Personal profiles: {enrichment['extraction_stats']['personal_profiles']}")
    print(f" - Company profiles: {enrichment['extraction_stats']['company_profiles']}")
    print(f"\nFiles created:")
    print(f" 1. Main YAML: {output_file}")
    print(f" 2. Profiles JSON: {profiles_file}")
    print(f" 3. Profiles CSV: {csv_file}")
    print(f" 4. Report JSON: {report_file}")
    print("\n" + "=" * 80)
    print("READY FOR API ENRICHMENT")
    print("=" * 80)
    print("""
To enrich these profiles with detailed data using Unipile API:
1. Set up Unipile account:
- Sign up: https://dashboard.unipile.com/signup
- Connect your LinkedIn account via Hosted Auth
- Get API key from dashboard
2. Set environment variables:
export UNIPILE_API_KEY=your_api_key_here
export UNIPILE_DSN=api1.unipile.com:13111
3. Run the enrichment script:
python scripts/enrich_linkedin_with_api.py
This will fetch:
- Profile names, headlines, locations
- Connection counts
- Industry and summary information
- Work experience and education
- Skills and languages (if available)
The enriched data will be added back to the Eye Filmmuseum YAML file.
""")
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()