#!/usr/bin/env python3
"""
Enrich basic person profiles using Linkup API.

This script takes fallback_basic and privacy_restricted_fallback profiles
and enriches them with additional data from Linkup searches and page
fetches. Provenance is tracked according to AGENTS.md Rule 27.
"""

import json
import os
import sys
import time
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
import subprocess

# Configuration
ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
BATCH_SIZE = 10  # Process in batches (informational; the main loop is sequential)
DELAY_BETWEEN_REQUESTS = 1.5  # seconds between API calls


def load_basic_profiles() -> List[Path]:
    """Load all profiles that need enrichment (fallback_basic or privacy_restricted_fallback).

    Returns paths of entity JSON files whose extraction_method marks them as
    basic/fallback profiles and which do not yet carry a timeline_enrichment
    block. Unreadable files are logged and skipped.
    """
    profiles_to_enrich = []
    for file_path in ENTITY_DIR.glob("*.json"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            extraction_method = data.get('extraction_metadata', {}).get('extraction_method', '')
            # Check if already enriched with Linkup
            timeline_enrichment = data.get('timeline_enrichment', {})
            if timeline_enrichment:
                continue  # Already enriched
            if extraction_method in ('fallback_basic', 'privacy_restricted_fallback'):
                profiles_to_enrich.append(file_path)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
    return profiles_to_enrich


def _linkup_post(endpoint: str, payload: Dict, label: str) -> Optional[Dict]:
    """POST `payload` to the Linkup `endpoint`; return parsed JSON or None.

    Shared transport for search/fetch. `label` is used only in error messages
    so the two public wrappers keep their original output text. Auth comes
    from the LINKUP_API_KEY environment variable (empty string if unset).
    """
    # This is a placeholder - in production, use the actual MCP client.
    try:
        # httpx imported lazily so the module loads even without it installed.
        import httpx
        response = httpx.post(
            f"https://api.linkup.so/v1/{endpoint}",
            headers={
                "Authorization": f"Bearer {os.environ.get('LINKUP_API_KEY', '')}",
                "Content-Type": "application/json"
            },
            json=payload,
            timeout=30.0
        )
        if response.status_code == 200:
            return response.json()
        print(f"Linkup {label} error: {response.status_code} - {response.text}")
        return None
    except Exception as e:
        print(f"Error calling Linkup {label}: {e}")
        return None


def call_linkup_search(query: str) -> Optional[Dict]:
    """Call Linkup search API via MCP."""
    return _linkup_post(
        "search",
        {"q": query, "depth": "standard", "outputType": "searchResults"},
        "search",
    )


def call_linkup_fetch(url: str) -> Optional[Dict]:
    """Call Linkup fetch API via MCP."""
    return _linkup_post("fetch", {"url": url, "renderJs": False}, "fetch")


def extract_profile_data_from_markdown(markdown: str, linkedin_url: str) -> Dict:
    """Extract structured profile data from LinkedIn markdown.

    `linkedin_url` is accepted for interface stability but not currently used.
    Returns a dict with name/headline/location/connections/experience/etc.;
    fields that cannot be parsed stay None or empty.
    """
    data = {
        'name': None,
        'headline': None,
        'location': None,
        'connections': None,
        'current_company': None,
        'education': [],
        'experience': [],
        'volunteer_experience': [],
        'skills': [],
        'about': None
    }

    # Extract name from title (H1 at start of document)
    name_match = re.search(r'^# ([^\n]+)', markdown)
    if name_match:
        data['name'] = name_match.group(1).strip()

    # Extract from header line ("Experience: X · Education: Y · Location: Z · N connections")
    header_match = re.search(
        r'Experience: ([^·]+)·\s*Education: ([^·]+)·\s*Location: ([^·]+)·\s*(\d+) connections',
        markdown
    )
    if header_match:
        data['current_company'] = header_match.group(1).strip()
        education_name = header_match.group(2).strip()
        if education_name:
            data['education'].append({'school': education_name})
        data['location'] = header_match.group(3).strip()
        data['connections'] = int(header_match.group(4))

    # Extract followers/connections (overrides connections from header if present)
    followers_match = re.search(r'(\d+) followers (\d+) connections', markdown)
    if followers_match:
        data['followers'] = int(followers_match.group(1))
        data['connections'] = int(followers_match.group(2))

    # Extract volunteer experience sections
    volunteer_sections = re.findall(
        r'### ([^\n]+)\n\n#### \[([^\]]+)\][^\n]*\n\n([^\n]+)\n\n([^#]*?)(?=\n- |$)',
        markdown,
        re.MULTILINE
    )
    for title, org, duration, description in volunteer_sections:
        if 'Volunteer' in title or 'volunteer' in description.lower():
            data['volunteer_experience'].append({
                'title': title.strip(),
                'organization': org.strip(),
                'duration': duration.strip(),
                'description': description.strip()[:500]  # Limit description length
            })

    # Extract experience from Experience & Education section
    exp_section = re.search(r'## Experience & Education\n(.*?)(?=## |$)', markdown, re.DOTALL)
    if exp_section:
        exp_items = re.findall(
            r'### ([^\n]+)\n\n#### \n\n([^\n]+)',
            exp_section.group(1)
        )
        for company, role in exp_items:
            # Skip escaped-asterisk placeholder entries emitted by the fetcher
            if company.strip() and not company.startswith('\\*'):
                data['experience'].append({
                    'company': company.strip(),
                    'role': role.strip() if not role.startswith('\\*') else None
                })
    return data


def create_linkup_provenance(url: str, search_query: Optional[str] = None) -> Dict:
    """Create provenance block for Linkup enrichment.

    Method is 'linkup_search' when a search query is supplied, otherwise
    'linkup_fetch'. Timestamp is UTC ISO-8601.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    return {
        'retrieval_agent': 'linkup',
        'retrieval_timestamp': timestamp,
        'source_url': url,
        'search_query': search_query,
        'method': 'linkup_fetch' if not search_query else 'linkup_search'
    }


def enrich_profile(profile_path: Path) -> bool:
    """Enrich a single profile using Linkup.

    Fetches the profile's LinkedIn page (if a URL is known) and runs a Linkup
    search; merges any findings into the JSON file in place with provenance.
    Returns True if new data was written, False on no-data or any error.
    """
    try:
        with open(profile_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Get profile info for search
        profile_data = data.get('profile_data', {})
        source_info = data.get('source_staff_info', {})
        extraction_meta = data.get('extraction_metadata', {})

        name = (
            profile_data.get('name')
            or profile_data.get('full_name')
            or source_info.get('name')
            or 'Unknown'
        )
        headline = (
            profile_data.get('headline')
            or source_info.get('headline')
            or ''
        )
        custodian = source_info.get('custodian', '')
        linkedin_url = extraction_meta.get('linkedin_url', '')

        # Build search query (short headlines only, to keep the query focused)
        search_parts = [name]
        if custodian:
            search_parts.append(custodian)
        if headline and len(headline) < 50:
            search_parts.append(headline)
        search_parts.append('site:linkedin.com')
        search_query = ' '.join(search_parts)

        print(f"Enriching: {name} ({custodian})")

        # Try direct LinkedIn fetch first
        enriched_data = {}
        provenance_list = []
        if linkedin_url:
            print(f" Fetching: {linkedin_url}")
            fetch_result = call_linkup_fetch(linkedin_url)
            if fetch_result and fetch_result.get('markdown'):
                markdown = fetch_result['markdown']
                extracted = extract_profile_data_from_markdown(markdown, linkedin_url)
                # Add provenance
                provenance_list.append(create_linkup_provenance(linkedin_url))
                # Update enriched data
                enriched_data['fetch_result'] = extracted
                enriched_data['raw_markdown_length'] = len(markdown)

        # Also do a search to potentially find additional info
        print(f" Searching: {search_query[:80]}...")
        search_result = call_linkup_search(search_query)
        if search_result and search_result.get('results'):
            results = search_result['results']
            # Find the most relevant results among the top five hits
            relevant_results = []
            for r in results[:5]:
                url = r.get('url', '')
                content = r.get('content', '')
                title = r.get('name', '')
                # Check if it's a LinkedIn profile
                if 'linkedin.com/in/' in url:
                    relevant_results.append({
                        'url': url,
                        'title': title,
                        'content': content[:500],  # Limit content
                        'relevance': 'linkedin_profile'
                    })
                elif name.lower() in title.lower() or name.lower() in content.lower():
                    relevant_results.append({
                        'url': url,
                        'title': title,
                        'content': content[:300],
                        'relevance': 'name_match'
                    })
            if relevant_results:
                provenance_list.append(create_linkup_provenance(
                    url=relevant_results[0]['url'],
                    search_query=search_query
                ))
                enriched_data['search_results'] = relevant_results

        # Only update if we found something
        if enriched_data:
            # Add Linkup enrichment section
            data['timeline_enrichment'] = {
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'provenance': provenance_list,
                'data': enriched_data
            }

            # Update profile_data if we got better info
            if enriched_data.get('fetch_result'):
                fetch_data = enriched_data['fetch_result']
                if fetch_data.get('location') and not profile_data.get('location'):
                    profile_data['location'] = fetch_data['location']
                if fetch_data.get('connections'):
                    profile_data['connections'] = fetch_data['connections']
                if fetch_data.get('education'):
                    profile_data['education'] = fetch_data['education']
                if fetch_data.get('experience'):
                    if not profile_data.get('career_history'):
                        profile_data['career_history'] = []
                    for exp in fetch_data['experience']:
                        if exp.get('company'):
                            profile_data['career_history'].append(exp)
                if fetch_data.get('volunteer_experience'):
                    profile_data['volunteer_experience'] = fetch_data['volunteer_experience']
                data['profile_data'] = profile_data

            # Update extraction metadata
            data['extraction_metadata']['linkup_enriched'] = True
            data['extraction_metadata']['timeline_enrichment_date'] = datetime.now(timezone.utc).isoformat()

            # Write back
            with open(profile_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f" ✓ Enriched with {len(provenance_list)} sources")
            return True
        else:
            print(" - No additional data found")
            return False
    except Exception as e:
        print(f"Error enriching {profile_path}: {e}")
        return False


def main():
    """Main enrichment workflow."""
    print("=" * 60)
    print("Linkup Profile Enrichment Script")
    print("=" * 60)

    # Check for API key (warn only; per-request errors are reported later)
    if not os.environ.get('LINKUP_API_KEY'):
        print("Warning: LINKUP_API_KEY not set. Using MCP tools instead.")

    # Load profiles to enrich
    profiles = load_basic_profiles()
    print(f"\nFound {len(profiles)} profiles to enrich")
    if not profiles:
        print("No profiles need enrichment.")
        return

    # Process in batches
    enriched_count = 0
    failed_count = 0
    for i, profile_path in enumerate(profiles):
        print(f"\n[{i+1}/{len(profiles)}] ", end="")
        try:
            success = enrich_profile(profile_path)
            if success:
                enriched_count += 1
            else:
                failed_count += 1
        except Exception as e:
            print(f"Error: {e}")
            failed_count += 1

        # Rate limiting (skip the sleep after the final profile)
        if i < len(profiles) - 1:
            time.sleep(DELAY_BETWEEN_REQUESTS)

        # Progress report every 100 profiles
        if (i + 1) % 100 == 0:
            print(f"\n--- Progress: {i+1}/{len(profiles)} processed, {enriched_count} enriched ---\n")

    # Final report
    print("\n" + "=" * 60)
    print("Enrichment Complete")
    print("=" * 60)
    print(f"Total profiles processed: {len(profiles)}")
    print(f"Successfully enriched: {enriched_count}")
    print(f"Failed/No data: {failed_count}")


if __name__ == '__main__':
    main()