glam/scripts/extract_linkedin_profiles_v2.py
2025-12-10 13:01:13 +01:00

192 lines
No EOL
7 KiB
Python

#!/usr/bin/env python3
"""
Extract and enrich LinkedIn profiles from Eye Filmmuseum data.
This script works with existing data to extract LinkedIn URLs and prepare enrichment data.
"""
import csv
import json
import os
import re
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml
# Add project root to path
# Inserts the grandparent directory of this file (the project root, given the
# scripts/ layout) so project-local packages resolve when run as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def extract_linkedin_urls(data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Extract all LinkedIn URLs and associated info from Eye Filmmuseum data.

    Walks the parsed YAML depth-first and collects every string-valued
    ``linkedin_url`` field, together with its dotted/indexed path, a
    best-effort person name from the surrounding mapping, and a reference
    to that mapping as context.
    """
    found: List[Dict[str, Any]] = []

    def resolve_name(node: Dict[str, Any]) -> str:
        """Best-effort lookup of a person name in the mapping holding the URL."""
        # Direct string-valued name-like fields, checked in priority order.
        for candidate in ('name', 'staff_name', 'person_observed', 'role'):
            value = node.get(candidate)
            if isinstance(value, str):
                return value
        # person_observed may itself be a mapping carrying the name.
        person = node.get('person_observed')
        if isinstance(person, dict) and 'name' in person:
            return person['name']
        return "Unknown"

    def walk(node: Any, prefix: str = "") -> None:
        """Depth-first traversal collecting every linkedin_url occurrence."""
        if isinstance(node, dict):
            for field, value in node.items():
                location = f"{prefix}.{field}" if prefix else field
                if field == 'linkedin_url' and isinstance(value, str):
                    found.append({
                        'name': resolve_name(node),
                        'path': location,
                        'linkedin_url': value,
                        'context': node,
                    })
                # Descend regardless; non-container values are ignored below.
                walk(value, location)
        elif isinstance(node, list):
            for idx, element in enumerate(node):
                walk(element, f"{prefix}[{idx}]" if prefix else f"[{idx}]")

    walk(data)
    return found
def extract_linkedin_identifier(url: str) -> Optional[str]:
    """Extract the profile identifier (slug) from a LinkedIn URL.

    Recognizes both ``/in/`` (personal) and ``/pub/`` (legacy public)
    profile URLs. Matching is now case-insensitive, so mixed-case hosts
    such as "LinkedIn.com" are handled too.

    Args:
        url: Any URL string; it need not be a LinkedIn URL.

    Returns:
        The identifier segment, or None when no LinkedIn profile path
        is found in the URL.
    """
    # The character class excludes '/' and '?', so the captured group is
    # already free of trailing slashes and query strings — the previous
    # rstrip('/').split('?') post-processing was redundant.
    match = re.search(r'linkedin\.com/(?:in|pub)/([^/?]+)', url, re.IGNORECASE)
    return match.group(1) if match else None
def create_linkedin_enrichment(linkedin_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Create LinkedIn enrichment structure.

    Args:
        linkedin_data: Items as produced by extract_linkedin_urls(); each
            must carry 'name', 'path' and 'linkedin_url' keys.

    Returns:
        A dict with extraction metadata and one profile entry per item.
    """
    # Genuine UTC timestamp. The previous code appended 'Z' to
    # datetime.now() — local time — thereby mislabelling it as UTC.
    timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    return {
        'extraction_timestamp': timestamp,
        'extraction_method': 'yaml_bulk_extraction',
        'total_profiles_found': len(linkedin_data),
        'profiles': [
            {
                'name': item['name'],
                'path_in_yaml': item['path'],
                'linkedin_url': item['linkedin_url'],
                'linkedin_identifier': extract_linkedin_identifier(item['linkedin_url']),
                'extracted_from': item['path'],
            }
            for item in linkedin_data
        ],
    }
def main(eye_file: str = "/Users/kempersc/apps/glam/data/custodian/NL-NH-AMS-U-EFM-eye_filmmuseum.yaml") -> None:
    """Extract LinkedIn profiles from Eye Filmmuseum data.

    Loads the source YAML, collects every linkedin_url, then writes three
    sibling files: an enriched YAML copy, a JSON summary report, and a CSV
    listing of the profiles.

    Args:
        eye_file: Path to the Eye Filmmuseum YAML file. Defaults to the
            previously hard-coded location so existing callers are
            unaffected; new callers can now point it elsewhere.
    """
    print("Loading Eye Filmmuseum data...")
    with open(eye_file, 'r', encoding='utf-8') as f:
        eye_data = yaml.safe_load(f)

    print("Extracting LinkedIn URLs...")
    linkedin_data = extract_linkedin_urls(eye_data)
    print(f"Found {len(linkedin_data)} LinkedIn profiles:")
    for item in linkedin_data[:20]:  # Show first 20
        print(f" - {item['name']} ({item['path']}): {item['linkedin_url']}")
    if len(linkedin_data) > 20:
        print(f" ... and {len(linkedin_data) - 20} more")

    # Create enrichment
    enrichment = create_linkedin_enrichment(linkedin_data)

    # Merge with any existing LinkedIn enrichment section in the data.
    existing = eye_data.setdefault('linkedin_enrichment', {})
    existing.update({
        'bulk_url_extraction': enrichment,
        'extraction_notes': [
            f"Bulk LinkedIn URL extraction completed on {enrichment['extraction_timestamp']}",
            f"Found {enrichment['total_profiles_found']} total LinkedIn profiles across all sections",
            "Profiles can be enriched with Unipile API when credentials are available",
            "Note: These URLs were extracted from various sections including management, curators, collection_specialists, etc."
        ]
    })

    # Update provenance with a note about this run.
    eye_data.setdefault('provenance', {}).setdefault('notes', []).append(
        f"LinkedIn bulk URL extraction on {enrichment['extraction_timestamp']}"
    )

    # Save enriched data next to the source file.
    output_file = eye_file.replace('.yaml', '_linkedin_enriched.yaml')
    print(f"\nSaving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(eye_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print("\nExtraction complete!")
    print(f"Total LinkedIn profiles extracted: {len(linkedin_data)}")

    # JSON summary report with per-section counts.
    report_file = output_file.replace('.yaml', '_report.json')
    # The first dotted component of each path is the top-level YAML section.
    section_counts = Counter(item['path'].split('.')[0] for item in linkedin_data)
    report = {
        'extraction_timestamp': enrichment['extraction_timestamp'],
        'total_profiles': len(linkedin_data),
        'profiles_by_section': dict(section_counts),
    }
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"Report saved to: {report_file}")

    # Show section breakdown
    print("\nProfiles by section:")
    for section, count in sorted(report['profiles_by_section'].items()):
        print(f" {section}: {count}")

    # CSV for easy viewing. csv.writer quotes fields as needed, so names or
    # paths containing commas/quotes no longer corrupt rows (the previous
    # manual f.write() approach performed no escaping at all).
    csv_file = output_file.replace('.yaml', '_profiles.csv')
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Name', 'Path', 'LinkedIn URL', 'Identifier'])
        for item in linkedin_data:
            writer.writerow([
                item['name'],
                item['path'],
                item['linkedin_url'],
                extract_linkedin_identifier(item['linkedin_url']),
            ])
    print(f"CSV saved to: {csv_file}")
# Run the extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()