#!/usr/bin/env python3
"""
Enrich basic person profiles using Linkup API data.

This script takes search results from Linkup and enriches fallback_basic profiles
with clear provenance statements according to AGENTS.md Rule 27.

Usage:
    python3 scripts/enrich_single_profile_linkup.py <profile_path> <search_results_json>
"""

import json
import sys
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any

def extract_contact_details(content: str) -> Dict[str, List[str]]:
    """Extract email addresses and phone numbers from free-form text.

    Args:
        content: Arbitrary text, e.g. a search-result snippet.

    Returns:
        Dict with keys 'emails' and 'phones', each a deduplicated list in
        first-occurrence order. (BUG FIX: the original used list(set(...)),
        whose order varies across runs due to hash randomization; the result
        is persisted to JSON, so the output should be deterministic.)
    """
    contacts: Dict[str, List[str]] = {
        'emails': [],
        'phones': []
    }

    # Email pattern: conservative match, good enough for snippet text.
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, content)
    # Filter out common false positives (placeholder domains, mailer addresses)
    filtered_emails = [
        e for e in emails
        if not any(x in e.lower() for x in ['example.com', 'linkedin.com', 'noreply', 'no-reply'])
    ]
    # dict.fromkeys dedupes while preserving first-occurrence order.
    contacts['emails'] = list(dict.fromkeys(filtered_emails))

    # Phone patterns (international formats)
    phone_patterns = [
        r'\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}',  # +1 (234) 567-8900
        r'\(\d{3}\)\s*\d{3}[-.\s]?\d{4}',  # (234) 567-8900
        r'\d{3}[-.\s]\d{3}[-.\s]\d{4}',  # 234-567-8900
        r'\+\d{10,14}',  # +12345678900
    ]
    phones: List[str] = []
    for pattern in phone_patterns:
        phones.extend(re.findall(pattern, content))
    contacts['phones'] = list(dict.fromkeys(phones))

    return contacts


def extract_profile_info_from_search_results(results: List[Dict], target_name: str, target_custodian: str) -> Dict:
    """Extract profile information from Linkup search results.

    Scans *results* for LinkedIn profile pages that match *target_name* and
    pulls out name, headline, location, education, experience, connection
    count and contact details. Non-LinkedIn results are mined for contact
    details only.

    Args:
        results: Search result dicts with 'url', 'content' and 'name' keys.
        target_name: Person name used for matching (case-insensitive).
        target_custodian: Currently unused; kept for interface stability.
            NOTE(review): presumably intended for custodian-based matching —
            confirm with callers before removing.

    Returns:
        Dict with keys: name, headline, location, education, experience,
        connections, linkedin_url, source_snippets, contacts.
    """
    extracted: Dict[str, Any] = {
        'name': None,
        'headline': None,
        'location': None,
        'education': [],
        'experience': [],
        'connections': None,
        'linkedin_url': None,
        'source_snippets': [],
        'contacts': {'emails': [], 'phones': []}
    }

    target_name_lower = target_name.lower() if target_name else ''

    for result in results:
        url = result.get('url', '')
        content = result.get('content', '')
        title = result.get('name', '')

        # Check if this is a relevant LinkedIn profile
        if 'linkedin.com/in/' in url:
            # Extract slug from URL
            slug_match = re.search(r'linkedin\.com/in/([^/?]+)', url)
            if slug_match:
                slug = slug_match.group(1).lower()

                # Match either the full name in the result title, or any of
                # the first two name parts (>2 chars, to skip initials) in
                # the URL slug.
                name_in_title = target_name_lower in title.lower()
                name_in_slug = any(part in slug for part in target_name_lower.split()[:2] if len(part) > 2)

                if name_in_title or name_in_slug:
                    extracted['linkedin_url'] = url

                    # Extract name from title (format: "Name - Title | LinkedIn")
                    name_match = re.match(r'^([^-|]+)', title)
                    if name_match:
                        extracted['name'] = name_match.group(1).strip()

                    # Extract headline from title
                    headline_match = re.search(r' - ([^|]+)', title)
                    if headline_match:
                        extracted['headline'] = headline_match.group(1).strip()

                    # Extract info from content
                    # Pattern: "Experience: X · Education: Y · Location: Z · N connections"
                    exp_match = re.search(r'Experience:\s*([^·]+)', content)
                    if exp_match:
                        company = exp_match.group(1).strip()
                        # BUG FIX: skip duplicates when several search results
                        # describe the same profile (original appended blindly).
                        if company.lower() not in {e['company'].lower() for e in extracted['experience']}:
                            extracted['experience'].append({
                                'company': company,
                                'source': 'linkup_search'
                            })

                    edu_match = re.search(r'Education:\s*([^·]+)', content)
                    if edu_match:
                        school = edu_match.group(1).strip()
                        # Same duplicate guard as for experience.
                        if school.lower() not in {e['school'].lower() for e in extracted['education']}:
                            extracted['education'].append({
                                'school': school,
                                'source': 'linkup_search'
                            })

                    loc_match = re.search(r'Location:\s*([^·]+)', content)
                    if loc_match:
                        extracted['location'] = loc_match.group(1).strip()

                    conn_match = re.search(r'(\d+)\s*connections', content)
                    if conn_match:
                        extracted['connections'] = int(conn_match.group(1))

                    # Store relevant content snippet (short snippets are noise)
                    if content and len(content) > 50:
                        extracted['source_snippets'].append({
                            'url': url,
                            'content': content[:500]
                        })

                    # Extract contact details from content
                    contacts = extract_contact_details(content)
                    extracted['contacts']['emails'].extend(contacts['emails'])
                    extracted['contacts']['phones'].extend(contacts['phones'])

        # Also check non-LinkedIn results for contact info
        elif content:
            contacts = extract_contact_details(content)
            extracted['contacts']['emails'].extend(contacts['emails'])
            extracted['contacts']['phones'].extend(contacts['phones'])

    # Deduplicate contacts preserving first-occurrence order, so output is
    # deterministic across runs (list(set(...)) order varies with hashing).
    extracted['contacts']['emails'] = list(dict.fromkeys(extracted['contacts']['emails']))
    extracted['contacts']['phones'] = list(dict.fromkeys(extracted['contacts']['phones']))

    return extracted


def create_linkup_provenance(url: str, search_query: str, timestamp: str) -> Dict:
    """Build a provenance record per AGENTS.md Rule 27 for Linkup-sourced data.

    Args:
        url: Source URL the data was derived from.
        search_query: Query string that produced the search results.
        timestamp: ISO-8601 retrieval timestamp.

    Returns:
        Provenance dict describing where and how the data was retrieved.
    """
    provenance = dict(
        source_url=url,
        retrieved_on=timestamp,
        retrieval_agent='linkup',
        search_query=search_query,
        method='linkup-search',
        # Linkup returns processed content, not raw HTML
        xpath=None,
        notes='Enriched via Linkup search API. Content extracted from search result snippets.',
    )
    return provenance


def enrich_profile(profile_path: Path, search_results: List[Dict], search_query: str) -> bool:
    """Enrich a single profile JSON file with Linkup search results.

    Loads the profile at *profile_path*, extracts data from *search_results*,
    records provenance per AGENTS.md Rule 27, merges new fields into
    profile_data (existing values win), and rewrites the file in place.

    Args:
        profile_path: Path to the profile JSON file (read and rewritten).
        search_results: Linkup search result dicts.
        search_query: Query string recorded in the provenance block.

    Returns:
        True if the profile was enriched and saved; False if no useful data
        was found (file left untouched).
    """

    # Load existing profile
    with open(profile_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Get target info
    profile_data = data.get('profile_data', {})
    source_info = data.get('source_staff_info', {})

    target_name = (
        profile_data.get('name') or
        profile_data.get('full_name') or
        source_info.get('name', '')
    )
    target_custodian = source_info.get('custodian', '')

    # Extract info from search results
    extracted = extract_profile_info_from_search_results(
        search_results,
        target_name,
        target_custodian
    )

    # Skip if no useful data found
    if not any([extracted['headline'], extracted['education'], extracted['experience'], extracted['location']]):
        print(f"No useful data found for {target_name}")
        return False

    # Create timestamp
    timestamp = datetime.now(timezone.utc).isoformat()

    # Provenance source: verified LinkedIn URL, else first snippet URL, else the API root.
    source_url = extracted.get('linkedin_url') or (extracted['source_snippets'][0]['url'] if extracted['source_snippets'] else 'https://api.linkup.so')
    provenance = create_linkup_provenance(source_url, search_query, timestamp)

    # Add timeline_enrichment section
    data['timeline_enrichment'] = {
        'enrichment_date': timestamp,
        'provenance': provenance,
        'extracted_data': {
            'headline': extracted['headline'],
            'location': extracted['location'],
            'connections': extracted['connections'],
            'education': extracted['education'],
            'experience': extracted['experience'],
            'contacts': extracted['contacts']
        },
        'source_snippets': extracted['source_snippets'][:3]  # Keep top 3 snippets
    }

    # Update profile_data with new info; never overwrite existing values.
    if extracted['headline'] and not profile_data.get('headline'):
        profile_data['headline'] = extracted['headline']

    if extracted['location'] and not profile_data.get('location'):
        profile_data['location'] = extracted['location']

    if extracted['connections']:
        profile_data['connections'] = extracted['connections']

    if extracted['education']:
        if not profile_data.get('education'):
            profile_data['education'] = []
        # Maintain the seen-set incrementally instead of rebuilding the list
        # of existing schools on every iteration (same dedupe behavior).
        seen_schools = {e.get('school', '').lower() for e in profile_data['education']}
        for edu in extracted['education']:
            school_key = edu['school'].lower()
            if school_key not in seen_schools:
                profile_data['education'].append(edu)
                seen_schools.add(school_key)

    if extracted['experience']:
        if not profile_data.get('career_history'):
            profile_data['career_history'] = []
        seen_companies = {e.get('company', '').lower() for e in profile_data['career_history']}
        for exp in extracted['experience']:
            company_key = exp['company'].lower()
            if company_key not in seen_companies:
                profile_data['career_history'].append(exp)
                seen_companies.add(company_key)

    # BUG FIX: the original indexed data['extraction_metadata'] directly,
    # raising KeyError for profiles that lack that section; setdefault is safe.
    metadata = data.setdefault('extraction_metadata', {})

    # Update corrected LinkedIn URL if found
    if extracted['linkedin_url']:
        metadata['linkedin_url_verified'] = extracted['linkedin_url']

    # Add contacts to profile_data if found
    if extracted['contacts']['emails'] or extracted['contacts']['phones']:
        if not profile_data.get('contacts'):
            profile_data['contacts'] = {}
        if extracted['contacts']['emails']:
            profile_data['contacts']['emails'] = extracted['contacts']['emails']
        if extracted['contacts']['phones']:
            profile_data['contacts']['phones'] = extracted['contacts']['phones']

    data['profile_data'] = profile_data

    # Update extraction metadata
    metadata['linkup_enriched'] = True
    metadata['timeline_enrichment_date'] = timestamp

    # Write back
    with open(profile_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"✓ Enriched {target_name}")
    print(f" - Headline: {extracted['headline']}")
    print(f" - Location: {extracted['location']}")
    print(f" - Education: {len(extracted['education'])} entries")
    print(f" - Experience: {len(extracted['experience'])} entries")
    print(f" - Emails: {len(extracted['contacts']['emails'])}")
    print(f" - Phones: {len(extracted['contacts']['phones'])}")

    return True


def main():
    """CLI entry point.

    argv: <profile_path> <search_results_json> [search_query]

    Accepts the search results either as a raw JSON list or as a dict with a
    'results' key (Linkup API envelope). Exits 0 on successful enrichment,
    1 on bad arguments, missing profile, parse failure, or no data found.
    """
    if len(sys.argv) < 3:
        print("Usage: python3 enrich_single_profile_linkup.py <profile_path> <search_results_json>")
        sys.exit(1)

    profile_path = Path(sys.argv[1])
    search_results_json = sys.argv[2]
    search_query = sys.argv[3] if len(sys.argv) > 3 else "LinkedIn profile search"

    if not profile_path.exists():
        print(f"Error: Profile not found: {profile_path}")
        sys.exit(1)

    try:
        search_results_raw = json.loads(search_results_json)
        if isinstance(search_results_raw, dict) and 'results' in search_results_raw:
            search_results: List[Dict[str, Any]] = search_results_raw['results']
        elif isinstance(search_results_raw, list):
            search_results = search_results_raw
        else:
            # FIX: was an f-string with no placeholders (ruff F541).
            print("Error: Unexpected search results format")
            sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error parsing search results: {e}")
        sys.exit(1)

    success = enrich_profile(profile_path, search_results, search_query)
    sys.exit(0 if success else 1)


if __name__ == '__main__':
    main()