#!/usr/bin/env python3
"""
Enrich basic person profiles using Linkup API data.

This script takes search results from Linkup and enriches fallback_basic
profiles with clear provenance statements according to AGENTS.md Rule 27.

Usage:
    python3 scripts/enrich_single_profile_linkup.py <profile_path> <search_results_json> [search_query]
"""

import json
import sys
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any


# Compiled once at import time; the extraction helpers run per search result.
_EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

# Substrings that mark an address as a placeholder / system address.
_EMAIL_BLOCKLIST = ('example.com', 'linkedin.com', 'noreply', 'no-reply')

# Phone patterns (international formats).
_PHONE_RES = tuple(
    re.compile(pattern)
    for pattern in (
        r'\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}',  # +1 (234) 567-8900
        r'\(\d{3}\)\s*\d{3}[-.\s]?\d{4}',  # (234) 567-8900
        r'\d{3}[-.\s]\d{3}[-.\s]\d{4}',  # 234-567-8900
        r'\+\d{10,14}',  # +12345678900
    )
)

_LINKEDIN_SLUG_RE = re.compile(r'linkedin\.com/in/([^/?]+)')

# Title format: "Name - Title | LinkedIn".  The name ends at the first
# *spaced* dash or at a pipe, so hyphenated names ("Jean-Pierre") stay whole.
_TITLE_NAME_RE = re.compile(r'^(.*?)(?:\s+-\s+|\||$)')
_TITLE_HEADLINE_RE = re.compile(r' - ([^|]+)')

# Snippet format: "Experience: X · Education: Y · Location: Z · N connections"
_EXPERIENCE_RE = re.compile(r'Experience:\s*([^·]+)')
_EDUCATION_RE = re.compile(r'Education:\s*([^·]+)')
_LOCATION_RE = re.compile(r'Location:\s*([^·]+)')
# The optional "+" lets capped counts such as "500+ connections" match.
_CONNECTIONS_RE = re.compile(r'(\d+)\+?\s*connections')


def _dedupe(items: List[str]) -> List[str]:
    """Return *items* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def extract_contact_details(content: str) -> Dict[str, List[str]]:
    """Extract email addresses and phone numbers from content.

    Args:
        content: Free text to scan. ``None`` or an empty string yields
            empty result lists.

    Returns:
        A dict with two keys, ``'emails'`` and ``'phones'``, each holding a
        de-duplicated list of matches in first-seen order. Emails containing
        any of the blocklisted substrings (``example.com``, ``linkedin.com``,
        ``noreply``, ``no-reply``) are dropped.
    """
    text = content or ''

    emails = [
        email
        for email in _EMAIL_RE.findall(text)
        if not any(blocked in email.lower() for blocked in _EMAIL_BLOCKLIST)
    ]

    phones: List[str] = []
    for phone_re in _PHONE_RES:
        phones.extend(phone_re.findall(text))

    return {'emails': _dedupe(emails), 'phones': _dedupe(phones)}


def _collect_contacts(extracted: Dict, content: str) -> None:
    """Append the contact details found in *content* to ``extracted['contacts']``."""
    found = extract_contact_details(content)
    extracted['contacts']['emails'].extend(found['emails'])
    extracted['contacts']['phones'].extend(found['phones'])


def extract_profile_info_from_search_results(results: List[Dict], target_name: str, target_custodian: str) -> Dict:
    """Extract profile information from Linkup search results.

    LinkedIn profile results (URL contains ``linkedin.com/in/``) are used only
    when they appear to belong to the target: either the full target name
    occurs in the result title, or one of the first two name parts (longer
    than two characters) occurs in the URL slug. Results whose URL is not a
    LinkedIn profile URL are scanned for contact details only.

    Args:
        results: Search result dicts; the keys read are ``'url'``,
            ``'content'`` and ``'name'`` (the result title). Entries that are
            not dicts are skipped.
        target_name: Name of the person being enriched. An empty name matches
            no LinkedIn profile.
        target_custodian: Accepted for interface compatibility; not used by
            the matching logic.

    Returns:
        A dict with keys ``name``, ``headline``, ``location``, ``education``,
        ``experience``, ``connections``, ``linkedin_url``, ``source_snippets``
        and ``contacts``. When several LinkedIn results match, scalar fields
        hold the values from the last match and list fields accumulate.
    """
    extracted = {
        'name': None,
        'headline': None,
        'location': None,
        'education': [],
        'experience': [],
        'connections': None,
        'linkedin_url': None,
        'source_snippets': [],
        'contacts': {'emails': [], 'phones': []},
    }

    target_name_lower = target_name.lower() if target_name else ''
    slug_name_parts = [part for part in target_name_lower.split()[:2] if len(part) > 2]

    for result in results:
        if not isinstance(result, dict):
            continue

        # ``or ''`` also covers keys that are present but set to null.
        url = result.get('url') or ''
        content = result.get('content') or ''
        title = result.get('name') or ''

        if 'linkedin.com/in/' not in url:
            # Also check non-LinkedIn results for contact info
            if content:
                _collect_contacts(extracted, content)
            continue

        slug_match = _LINKEDIN_SLUG_RE.search(url)
        if not slug_match:
            continue
        slug = slug_match.group(1).lower()

        # An empty target name must not match: '' is a substring of every
        # title, which would accept every LinkedIn result.
        name_in_title = bool(target_name_lower) and target_name_lower in title.lower()
        name_in_slug = any(part in slug for part in slug_name_parts)
        if not (name_in_title or name_in_slug):
            continue

        extracted['linkedin_url'] = url

        name_match = _TITLE_NAME_RE.match(title)
        if name_match and name_match.group(1).strip():
            extracted['name'] = name_match.group(1).strip()

        headline_match = _TITLE_HEADLINE_RE.search(title)
        if headline_match:
            extracted['headline'] = headline_match.group(1).strip()

        exp_match = _EXPERIENCE_RE.search(content)
        if exp_match:
            extracted['experience'].append({
                'company': exp_match.group(1).strip(),
                'source': 'linkup_search',
            })

        edu_match = _EDUCATION_RE.search(content)
        if edu_match:
            extracted['education'].append({
                'school': edu_match.group(1).strip(),
                'source': 'linkup_search',
            })

        loc_match = _LOCATION_RE.search(content)
        if loc_match:
            extracted['location'] = loc_match.group(1).strip()

        conn_match = _CONNECTIONS_RE.search(content)
        if conn_match:
            extracted['connections'] = int(conn_match.group(1))

        # Store relevant content snippet
        if len(content) > 50:
            extracted['source_snippets'].append({
                'url': url,
                'content': content[:500],
            })

        _collect_contacts(extracted, content)

    # Deduplicate contacts gathered across results, keeping first-seen order
    # so the JSON written to disk is stable between runs.
    extracted['contacts']['emails'] = _dedupe(extracted['contacts']['emails'])
    extracted['contacts']['phones'] = _dedupe(extracted['contacts']['phones'])

    return extracted


def create_linkup_provenance(url: str, search_query: str, timestamp: str) -> Dict:
    """Create provenance block per AGENTS.md Rule 27.

    Args:
        url: Source URL the enrichment data is attributed to.
        search_query: Query string that produced the search results.
        timestamp: Retrieval time, stored verbatim as ``retrieved_on``.

    Returns:
        The provenance dict embedded in the profile's ``timeline_enrichment``.
    """
    return {
        'source_url': url,
        'retrieved_on': timestamp,
        'retrieval_agent': 'linkup',
        'search_query': search_query,
        'method': 'linkup-search',
        'xpath': None,  # Linkup returns processed content, not raw HTML
        'notes': 'Enriched via Linkup search API. Content extracted from search result snippets.'
    }


def _merge_unique(existing: List[Dict], new_items: List[Dict], key: str) -> None:
    """Append each of *new_items* to *existing* unless its *key* value is already present.

    Comparison is case-insensitive. Existing entries with a missing or null
    *key* are treated as an empty string.
    """
    seen = {
        (entry.get(key) or '').lower()
        for entry in existing
        if isinstance(entry, dict)
    }
    for item in new_items:
        value = item[key].lower()
        if value not in seen:
            existing.append(item)
            seen.add(value)


def enrich_profile(profile_path: Path, search_results: List[Dict], search_query: str) -> bool:
    """Enrich a single profile with Linkup search results.

    Reads the profile JSON at *profile_path*, extracts information from
    *search_results*, and, if anything useful was found, writes the enriched
    profile back to the same path.

    Args:
        profile_path: Path of the profile JSON file (read and overwritten).
        search_results: Linkup search result dicts.
        search_query: Query that produced the results; recorded in provenance.

    Returns:
        ``True`` if the profile was enriched and written back, ``False`` if no
        headline, education, experience or location was found (the file is
        left untouched in that case).
    """
    # Load existing profile
    with open(profile_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # ``or {}`` also covers sections that are present but null.
    profile_data = data.get('profile_data') or {}
    source_info = data.get('source_staff_info') or {}

    target_name = (
        profile_data.get('name') or
        profile_data.get('full_name') or
        source_info.get('name') or
        ''
    )
    target_custodian = source_info.get('custodian', '')

    extracted = extract_profile_info_from_search_results(
        search_results, target_name, target_custodian
    )

    # Skip if no useful data found
    if not any([extracted['headline'], extracted['education'],
                extracted['experience'], extracted['location']]):
        print(f"No useful data found for {target_name}")
        return False

    timestamp = datetime.now(timezone.utc).isoformat()

    # Attribute the data to the matched profile URL when there is one.
    source_url = extracted.get('linkedin_url') or (
        extracted['source_snippets'][0]['url']
        if extracted['source_snippets'] else 'https://api.linkup.so'
    )
    provenance = create_linkup_provenance(source_url, search_query, timestamp)

    # Add timeline_enrichment section
    data['timeline_enrichment'] = {
        'enrichment_date': timestamp,
        'provenance': provenance,
        'extracted_data': {
            'headline': extracted['headline'],
            'location': extracted['location'],
            'connections': extracted['connections'],
            'education': extracted['education'],
            'experience': extracted['experience'],
            'contacts': extracted['contacts']
        },
        'source_snippets': extracted['source_snippets'][:3]  # Keep top 3 snippets
    }

    # Existing headline/location values are never overwritten.
    if extracted['headline'] and not profile_data.get('headline'):
        profile_data['headline'] = extracted['headline']

    if extracted['location'] and not profile_data.get('location'):
        profile_data['location'] = extracted['location']

    if extracted['connections']:
        profile_data['connections'] = extracted['connections']

    if extracted['education']:
        if not profile_data.get('education'):
            profile_data['education'] = []
        _merge_unique(profile_data['education'], extracted['education'], 'school')

    if extracted['experience']:
        if not profile_data.get('career_history'):
            profile_data['career_history'] = []
        _merge_unique(profile_data['career_history'], extracted['experience'], 'company')

    # Add contacts to profile_data if found
    if extracted['contacts']['emails'] or extracted['contacts']['phones']:
        if not profile_data.get('contacts'):
            profile_data['contacts'] = {}
        if extracted['contacts']['emails']:
            profile_data['contacts']['emails'] = extracted['contacts']['emails']
        if extracted['contacts']['phones']:
            profile_data['contacts']['phones'] = extracted['contacts']['phones']

    data['profile_data'] = profile_data

    # Create the metadata section on demand: a profile without one must not
    # abort the enrichment with a KeyError.
    metadata = data.get('extraction_metadata')
    if not isinstance(metadata, dict):
        metadata = {}
        data['extraction_metadata'] = metadata

    # Update corrected LinkedIn URL if found
    if extracted['linkedin_url']:
        metadata['linkedin_url_verified'] = extracted['linkedin_url']
    metadata['linkup_enriched'] = True
    metadata['timeline_enrichment_date'] = timestamp

    # Write back
    with open(profile_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"✓ Enriched {target_name}")
    print(f" - Headline: {extracted['headline']}")
    print(f" - Location: {extracted['location']}")
    print(f" - Education: {len(extracted['education'])} entries")
    print(f" - Experience: {len(extracted['experience'])} entries")
    print(f" - Emails: {len(extracted['contacts']['emails'])}")
    print(f" - Phones: {len(extracted['contacts']['phones'])}")

    return True


def main():
    """Command-line entry point.

    Arguments (from ``sys.argv``): the profile path, the search results as a
    JSON string (either a list of results or an object with a ``results``
    list), and an optional search query. Exits with status 0 when the profile
    was enriched and 1 otherwise.
    """
    if len(sys.argv) < 3:
        print(
            "Usage: python3 enrich_single_profile_linkup.py "
            "<profile_path> <search_results_json> [search_query]"
        )
        sys.exit(1)

    profile_path = Path(sys.argv[1])
    search_results_json = sys.argv[2]
    search_query = sys.argv[3] if len(sys.argv) > 3 else "LinkedIn profile search"

    if not profile_path.exists():
        print(f"Error: Profile not found: {profile_path}")
        sys.exit(1)

    try:
        search_results_raw = json.loads(search_results_json)
    except json.JSONDecodeError as e:
        print(f"Error parsing search results: {e}")
        sys.exit(1)

    search_results: Optional[List[Dict[str, Any]]] = None
    if isinstance(search_results_raw, dict) and 'results' in search_results_raw:
        search_results = search_results_raw['results']
    elif isinstance(search_results_raw, list):
        search_results = search_results_raw

    # Also rejects {"results": null} and similar non-list payloads.
    if not isinstance(search_results, list):
        print("Error: Unexpected search results format")
        sys.exit(1)

    success = enrich_profile(profile_path, search_results, search_query)
    sys.exit(0 if success else 1)


if __name__ == '__main__':
    main()