# glam/scripts/extract_zwolle_profiles.py

#!/usr/bin/env python3
"""
Simple script to extract LinkedIn profiles using existing working pattern.
"""
import json
import os
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

def extract_profile_simple(linkedin_url: str, name: str, output_file: str, source_file: str, staff_id: str) -> bool:
    """Run the LinkedIn extraction script for one profile in a subprocess.

    Invokes ``scripts/extract_linkedin_profile_exa.py`` with the profile URL
    and output path as positional arguments, followed by ``--source_file``
    and ``--staff_id``. Progress and failure messages are printed to stdout.

    Args:
        linkedin_url: URL of the LinkedIn profile, passed as first argument.
        name: Human-readable name, used only in the printed messages.
        output_file: Path handed to the extraction script as second argument.
        source_file: Value passed to the script's ``--source_file`` option.
        staff_id: Value passed to the script's ``--staff_id`` option.

    Returns:
        True if the subprocess exited with return code 0; False if it exited
        with a non-zero code, timed out, or could not be started.
    """
    print(f"Extracting LinkedIn profile for: {name}")

    # Launch the child with the interpreter that is running this script, so
    # it shares the same environment; a bare 'python' may be missing or may
    # resolve to a different installation. sys.executable can be empty or
    # None in unusual embeddings, hence the fallback to the old behavior.
    interpreter = sys.executable or 'python'

    # The script path is relative, so it is resolved against the current
    # working directory of this process.
    cmd = [
        interpreter, 'scripts/extract_linkedin_profile_exa.py',
        linkedin_url,
        output_file,
        '--source_file', source_file,
        '--staff_id', staff_id,
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except Exception as e:
        # Deliberately broad: this is a best-effort batch step, so a timeout,
        # a missing executable, or bad arguments must not abort the caller.
        print(f"❌ Exception extracting profile for {name}: {e}")
        return False

    if result.returncode != 0:
        print(f"❌ Failed to extract profile for {name}: {result.stderr}")
        return False

    print(f"✅ Successfully extracted profile for {name}")
    return True

def main():
    """Extract a fixed list of LinkedIn profiles and print a summary.

    Calls ``extract_profile_simple`` once per hard-coded profile of
    Academiehuis Grote Kerk Zwolle, pausing between consecutive requests,
    then prints the number of successes, a cost total, and the output
    directory.
    """
    # Shared locations, spelled once so the per-profile entries and the
    # summary line cannot drift apart.
    entity_dir = '/Users/kempersc/apps/glam/data/custodian/person/entity'
    source_file = (
        '/Users/kempersc/apps/glam/data/custodian/person/affiliated/parsed/'
        'academiehuis-grote-kerk-zwolle_staff_20251210T155412Z.json'
    )
    output_stamp = '20251210T160000Z'

    # (LinkedIn slug, display name, staff id) for each profile to extract.
    staff = [
        (
            'anja-van-hoorn-657b66223',
            'Anja van Hoorn',
            'academiehuis-grote-kerk-zwolle_staff_0001_anja_van_hoorn',
        ),
        (
            'inez-van-kleef',
            'Inez van Kleef',
            'academiehuis-grote-kerk-zwolle_staff_0002_inez_van_kleef',
        ),
        (
            'marga-edens-a284175',
            'Marga Edens',
            'academiehuis-grote-kerk-zwolle_staff_0003_marga_edens',
        ),
    ]

    profiles = [
        {
            'linkedin_url': f'https://www.linkedin.com/in/{slug}',
            'name': name,
            'output_file': f'{entity_dir}/{slug}_{output_stamp}.json',
            'source_file': source_file,
            'staff_id': staff_id,
        }
        for slug, name, staff_id in staff
    ]

    # NOTE(review): presumably the estimated API cost in USD of one
    # successful extraction — confirm against the extraction script.
    cost_per_success = 0.001
    delay_seconds = 3

    success_count = 0
    total_cost = 0.0

    for index, profile in enumerate(profiles):
        # Delay only *between* requests: there is nothing to wait for before
        # the first one or after the last one.
        if index > 0:
            time.sleep(delay_seconds)
        if extract_profile_simple(**profile):
            success_count += 1
            total_cost += cost_per_success

    print("\n📊 Extraction Summary:")
    print(f"✅ Successfully processed: {success_count}")
    print(f"💰 Total cost: ${total_cost:.3f}")
    print(f"📁 Files saved to: {entity_dir}")

if __name__ == "__main__":
    main()