# glam/scripts/enrich_profiles_linkup.py
# Last modified: 2026-01-02 02:11:04 +01:00
# 392 lines, 14 KiB, Python
#!/usr/bin/env python3
"""
Enrich basic person profiles using Linkup API.
This script takes fallback_basic and privacy_restricted_fallback profiles
and enriches them with additional data from Linkup searches and page fetches.
Provenance is tracked according to AGENTS.md Rule 27.
"""
import json
import os
import sys
import time
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
import subprocess
# Configuration
ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
BATCH_SIZE = 10 # Process in batches
DELAY_BETWEEN_REQUESTS = 1.5 # seconds between API calls
def load_basic_profiles(entity_dir: Optional[Path] = None) -> List[Path]:
    """Return paths of profile JSON files that still need Linkup enrichment.

    A profile qualifies when its ``extraction_metadata.extraction_method`` is
    'fallback_basic' or 'privacy_restricted_fallback' AND it has no
    ``timeline_enrichment`` section yet (i.e. it was not enriched by a
    previous run).

    Args:
        entity_dir: Directory to scan for ``*.json`` profiles. Defaults to
            the module-level ENTITY_DIR, preserving the original behavior.

    Returns:
        List of paths to enrich (order follows ``Path.glob``, which is not
        guaranteed to be sorted).
    """
    root = ENTITY_DIR if entity_dir is None else entity_dir
    profiles_to_enrich: List[Path] = []
    for file_path in root.glob("*.json"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Skip profiles already enriched with Linkup.
            if data.get('timeline_enrichment', {}):
                continue
            method = data.get('extraction_metadata', {}).get('extraction_method', '')
            if method in ('fallback_basic', 'privacy_restricted_fallback'):
                profiles_to_enrich.append(file_path)
        except Exception as e:
            # Best-effort scan: report unreadable/corrupt files and move on.
            print(f"Error reading {file_path}: {e}")
    return profiles_to_enrich
def _linkup_post(endpoint: str, payload: Dict, label: str) -> Optional[Dict]:
    """POST ``payload`` to a Linkup API endpoint and return the parsed JSON.

    Shared transport for the search/fetch wrappers below (they were
    previously near-duplicate code). Auth comes from the LINKUP_API_KEY
    environment variable; an empty bearer token is sent when it is unset.

    Args:
        endpoint: Path segment under https://api.linkup.so/v1/ (e.g. 'search').
        payload: JSON body for the request.
        label: Human-readable name used in error messages ('search'/'fetch').

    Returns:
        Parsed JSON dict on HTTP 200, otherwise None. All errors (HTTP and
        transport, including httpx being unavailable) are printed, never
        raised.
    """
    try:
        # Local import keeps httpx an optional dependency at module load.
        import httpx
        response = httpx.post(
            f"https://api.linkup.so/v1/{endpoint}",
            headers={
                "Authorization": f"Bearer {os.environ.get('LINKUP_API_KEY', '')}",
                "Content-Type": "application/json"
            },
            json=payload,
            timeout=30.0
        )
        if response.status_code == 200:
            return response.json()
        print(f"Linkup {label} error: {response.status_code} - {response.text}")
        return None
    except Exception as e:
        print(f"Error calling Linkup {label}: {e}")
        return None


def call_linkup_search(query: str) -> Optional[Dict]:
    """Run a Linkup web search for ``query`` (standard depth, raw results).

    Returns the API's JSON response (expected to contain a 'results' list)
    or None on any failure.
    """
    return _linkup_post(
        "search",
        {"q": query, "depth": "standard", "outputType": "searchResults"},
        "search",
    )


def call_linkup_fetch(url: str) -> Optional[Dict]:
    """Fetch ``url`` through Linkup without JavaScript rendering.

    Returns the API's JSON response (expected to contain a 'markdown' key)
    or None on any failure.
    """
    return _linkup_post("fetch", {"url": url, "renderJs": False}, "fetch")
def extract_profile_data_from_markdown(markdown: str, linkedin_url: str) -> Dict:
    """Extract structured profile data from a LinkedIn page's markdown dump.

    Args:
        markdown: Markdown rendering of a LinkedIn profile page (as returned
            by the Linkup fetch API).
        linkedin_url: The profile URL. Currently unused; kept for interface
            compatibility with existing callers.

    Returns:
        Dict with keys name, headline, location, connections,
        current_company, education, experience, volunteer_experience,
        skills, about — plus 'followers' when the markdown contains a
        follower count. Fields that cannot be parsed stay None / empty.
    """
    data = {
        'name': None,
        'headline': None,
        'location': None,
        'connections': None,
        'current_company': None,
        'education': [],
        'experience': [],
        'volunteer_experience': [],
        'skills': [],
        'about': None
    }
    # Name: top-level markdown heading at the very start of the document.
    name_match = re.search(r'^# ([^\n]+)', markdown)
    if name_match:
        data['name'] = name_match.group(1).strip()
    # Header summary: "Experience: X · Education: Y · Location: Z · N connections".
    header_match = re.search(r'Experience: ([^·]+)·\s*Education: ([^·]+)·\s*Location: ([^·]+)·\s*(\d+) connections', markdown)
    if header_match:
        data['current_company'] = header_match.group(1).strip()
        education_name = header_match.group(2).strip()
        if education_name:
            data['education'].append({'school': education_name})
        data['location'] = header_match.group(3).strip()
        data['connections'] = int(header_match.group(4))
    # "N followers M connections" overrides the header connection count.
    followers_match = re.search(r'(\d+) followers (\d+) connections', markdown)
    if followers_match:
        data['followers'] = int(followers_match.group(1))
        data['connections'] = int(followers_match.group(2))
    # Volunteer sections: "### Title / #### [Org](...) / duration / description".
    volunteer_sections = re.findall(
        r'### ([^\n]+)\n\n#### \[([^\]]+)\][^\n]*\n\n([^\n]+)\n\n([^#]*?)(?=\n- |$)',
        markdown,
        re.MULTILINE
    )
    for title, org, duration, description in volunteer_sections:
        # FIX: match 'volunteer' case-insensitively in BOTH title and
        # description (previously only a capitalized 'Volunteer' matched
        # in titles, inconsistent with the description check).
        if 'volunteer' in title.lower() or 'volunteer' in description.lower():
            data['volunteer_experience'].append({
                'title': title.strip(),
                'organization': org.strip(),
                'duration': duration.strip(),
                'description': description.strip()[:500]  # Limit description length
            })
    # Experience entries from the "Experience & Education" section.
    exp_section = re.search(r'## Experience & Education\n(.*?)(?=## |$)', markdown, re.DOTALL)
    if exp_section:
        exp_items = re.findall(
            r'### ([^\n]+)\n\n#### \n\n([^\n]+)',
            exp_section.group(1)
        )
        for company, role in exp_items:
            # Skip markdown-escaped bullet artifacts ('\*...') posing as names.
            if company.strip() and not company.startswith('\\*'):
                data['experience'].append({
                    'company': company.strip(),
                    'role': role.strip() if not role.startswith('\\*') else None
                })
    return data
def create_linkup_provenance(url: str, search_query: Optional[str] = None) -> Dict:
    """Build a provenance record for one piece of Linkup-sourced data.

    Args:
        url: The source URL the data came from.
        search_query: The query used when the data came from a search;
            None (the default) indicates a direct page fetch.

    Returns:
        Dict tagging the retrieval agent, a UTC ISO-8601 timestamp, the
        source URL, the query (or None), and the retrieval method
        ('linkup_search' when a query was given, else 'linkup_fetch').
    """
    record = {
        'retrieval_agent': 'linkup',
        'retrieval_timestamp': datetime.now(timezone.utc).isoformat(),
        'source_url': url,
        'search_query': search_query,
    }
    # Key order matters for the serialized JSON, so 'method' stays last.
    record['method'] = 'linkup_search' if search_query else 'linkup_fetch'
    return record
def enrich_profile(profile_path: Path) -> bool:
    """Enrich a single profile using Linkup.

    Reads the profile JSON at ``profile_path``, attempts a direct Linkup
    fetch of its stored LinkedIn URL, then runs a Linkup web search for the
    person, and — if either step yields data — writes a
    ``timeline_enrichment`` section (with provenance entries) plus selected
    ``profile_data`` updates back to the same file.

    Returns:
        True when the file was updated with enrichment data, False when
        nothing new was found or an error occurred (errors are printed,
        never raised).
    """
    try:
        with open(profile_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Get profile info for search
        profile_data = data.get('profile_data', {})
        source_info = data.get('source_staff_info', {})
        extraction_meta = data.get('extraction_metadata', {})
        # Prefer the richest available name, falling back through known fields.
        name = (
            profile_data.get('name') or
            profile_data.get('full_name') or
            source_info.get('name') or
            'Unknown'
        )
        headline = (
            profile_data.get('headline') or
            source_info.get('headline') or
            ''
        )
        custodian = source_info.get('custodian', '')
        linkedin_url = extraction_meta.get('linkedin_url', '')
        # Build search query: name + employer + (short) headline, scoped to LinkedIn.
        search_parts = [name]
        if custodian:
            search_parts.append(custodian)
        if headline and len(headline) < 50:
            # Long headlines tend to dilute the query rather than sharpen it.
            search_parts.append(headline)
        search_parts.append('site:linkedin.com')
        search_query = ' '.join(search_parts)
        print(f"Enriching: {name} ({custodian})")
        # Try direct LinkedIn fetch first
        enriched_data = {}
        provenance_list = []
        if linkedin_url:
            print(f" Fetching: {linkedin_url}")
            fetch_result = call_linkup_fetch(linkedin_url)
            if fetch_result and fetch_result.get('markdown'):
                markdown = fetch_result['markdown']
                extracted = extract_profile_data_from_markdown(markdown, linkedin_url)
                # Add provenance
                provenance_list.append(create_linkup_provenance(linkedin_url))
                # Update enriched data
                enriched_data['fetch_result'] = extracted
                enriched_data['raw_markdown_length'] = len(markdown)
        # Also do a search to potentially find additional info
        print(f" Searching: {search_query[:80]}...")
        search_result = call_linkup_search(search_query)
        if search_result and search_result.get('results'):
            results = search_result['results']
            # Find the most relevant result (top 5 hits only).
            relevant_results = []
            for r in results[:5]:
                url = r.get('url', '')
                content = r.get('content', '')
                title = r.get('name', '')
                # Check if it's a LinkedIn profile
                if 'linkedin.com/in/' in url:
                    relevant_results.append({
                        'url': url,
                        'title': title,
                        'content': content[:500],  # Limit content
                        'relevance': 'linkedin_profile'
                    })
                elif name.lower() in title.lower() or name.lower() in content.lower():
                    # Weaker signal: the name merely appears in the hit.
                    relevant_results.append({
                        'url': url,
                        'title': title,
                        'content': content[:300],
                        'relevance': 'name_match'
                    })
            if relevant_results:
                # Provenance is recorded against the first relevant hit only.
                provenance_list.append(create_linkup_provenance(
                    url=relevant_results[0]['url'],
                    search_query=search_query
                ))
                enriched_data['search_results'] = relevant_results
        # Only update if we found something
        if enriched_data:
            # Add Linkup enrichment section
            data['timeline_enrichment'] = {
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'provenance': provenance_list,
                'data': enriched_data
            }
            # Update profile_data if we got better info
            if enriched_data.get('fetch_result'):
                fetch_data = enriched_data['fetch_result']
                # Location only fills a gap; connections/education from the
                # fetch overwrite existing values.
                if fetch_data.get('location') and not profile_data.get('location'):
                    profile_data['location'] = fetch_data['location']
                if fetch_data.get('connections'):
                    profile_data['connections'] = fetch_data['connections']
                if fetch_data.get('education'):
                    profile_data['education'] = fetch_data['education']
                if fetch_data.get('experience'):
                    if not profile_data.get('career_history'):
                        profile_data['career_history'] = []
                    # Append fetched roles; no de-duplication is done here.
                    for exp in fetch_data['experience']:
                        if exp.get('company'):
                            profile_data['career_history'].append(exp)
                if fetch_data.get('volunteer_experience'):
                    profile_data['volunteer_experience'] = fetch_data['volunteer_experience']
            data['profile_data'] = profile_data
            # Update extraction metadata
            data['extraction_metadata']['linkup_enriched'] = True
            data['extraction_metadata']['timeline_enrichment_date'] = datetime.now(timezone.utc).isoformat()
            # Write back (in place; not atomic — a crash here can truncate the file).
            with open(profile_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f" ✓ Enriched with {len(provenance_list)} sources")
            return True
        else:
            print(f" - No additional data found")
            return False
    except Exception as e:
        print(f"Error enriching {profile_path}: {e}")
        return False
def main():
    """Run the Linkup enrichment workflow over all pending profiles.

    Loads every profile that needs enrichment, enriches them sequentially
    with a fixed delay between profiles, prints periodic progress, and ends
    with a summary. Profiles that yielded no new data are counted together
    with hard failures ('Failed/No data').
    """
    print("=" * 60)
    print("Linkup Profile Enrichment Script")
    print("=" * 60)
    # Fail-fast visibility: both API callers read LINKUP_API_KEY directly,
    # so without it every Linkup request will error out. (The previous
    # message claimed an MCP fallback that this script does not implement.)
    if not os.environ.get('LINKUP_API_KEY'):
        print("Warning: LINKUP_API_KEY not set. Linkup API calls will fail.")
    # Load profiles to enrich
    profiles = load_basic_profiles()
    print(f"\nFound {len(profiles)} profiles to enrich")
    if not profiles:
        print("No profiles need enrichment.")
        return
    # Sequential processing (BATCH_SIZE is reserved for future batching).
    enriched_count = 0
    failed_count = 0
    for i, profile_path in enumerate(profiles):
        print(f"\n[{i+1}/{len(profiles)}] ", end="")
        try:
            success = enrich_profile(profile_path)
            if success:
                enriched_count += 1
            else:
                failed_count += 1
        except Exception as e:
            # enrich_profile already guards internally; this is a last resort.
            print(f"Error: {e}")
            failed_count += 1
        # Rate limiting between consecutive profiles (skip after the last one).
        if i < len(profiles) - 1:
            time.sleep(DELAY_BETWEEN_REQUESTS)
        # Progress report every 100 profiles
        if (i + 1) % 100 == 0:
            print(f"\n--- Progress: {i+1}/{len(profiles)} processed, {enriched_count} enriched ---\n")
    # Final report
    print("\n" + "=" * 60)
    print("Enrichment Complete")
    print("=" * 60)
    print(f"Total profiles processed: {len(profiles)}")
    print(f"Successfully enriched: {enriched_count}")
    print(f"Failed/No data: {failed_count}")


if __name__ == '__main__':
    main()