glam/scripts/enrich_single_profile_linkup.py
2026-01-02 02:11:04 +01:00

304 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Enrich basic person profiles using Linkup API data.
This script takes search results from Linkup and enriches fallback_basic profiles
with clear provenance statements according to AGENTS.md Rule 27.
Usage:
python3 scripts/enrich_single_profile_linkup.py <profile_path> <search_results_json>
"""
import json
import sys
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
def extract_contact_details(content: str) -> Dict[str, List[str]]:
    """Extract email addresses and phone numbers from free-form text.

    Args:
        content: Arbitrary text, e.g. a search-result snippet.

    Returns:
        Dict with 'emails' and 'phones' lists. Duplicates are removed while
        preserving first-seen order, so the output is deterministic (the
        original used set(), whose iteration order varies between runs).
    """
    contacts: Dict[str, List[str]] = {
        'emails': [],
        'phones': []
    }

    # Pragmatic RFC-lite email matcher — good enough for snippet scraping.
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, content)
    # Filter out common false positives (placeholder domains, bounce addresses).
    filtered_emails = [
        e for e in emails
        if not any(x in e.lower() for x in ['example.com', 'linkedin.com', 'noreply', 'no-reply'])
    ]
    # dict.fromkeys dedupes while keeping insertion order.
    contacts['emails'] = list(dict.fromkeys(filtered_emails))

    # Phone patterns covering common international and US formats.
    phone_patterns = [
        r'\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}',  # +1 (234) 567-8900
        r'\(\d{3}\)\s*\d{3}[-.\s]?\d{4}',  # (234) 567-8900
        r'\d{3}[-.\s]\d{3}[-.\s]\d{4}',  # 234-567-8900
        r'\+\d{10,14}',  # +12345678900
    ]
    phones: List[str] = []
    for pattern in phone_patterns:
        phones.extend(re.findall(pattern, content))
    contacts['phones'] = list(dict.fromkeys(phones))
    return contacts
def extract_profile_info_from_search_results(results: List[Dict], target_name: str, target_custodian: str) -> Dict:
    """Extract profile information from Linkup search results.

    Args:
        results: Linkup search result dicts with 'url', 'content', 'name' keys.
        target_name: Name of the person we are trying to match.
        target_custodian: Custodian name (currently unused; kept for
            interface compatibility with callers).

    Returns:
        Dict of extracted fields (name, headline, location, education,
        experience, connections, linkedin_url, source_snippets, contacts).
        Fields stay None/empty when nothing matched.
    """
    extracted: Dict[str, Any] = {
        'name': None,
        'headline': None,
        'location': None,
        'education': [],
        'experience': [],
        'connections': None,
        'linkedin_url': None,
        'source_snippets': [],
        'contacts': {'emails': [], 'phones': []}
    }
    target_name_lower = target_name.lower() if target_name else ''

    for result in results:
        url = result.get('url', '')
        content = result.get('content', '')
        title = result.get('name', '')

        # Check if this is a relevant LinkedIn profile
        if 'linkedin.com/in/' in url:
            # Extract slug from URL
            slug_match = re.search(r'linkedin\.com/in/([^/?]+)', url)
            if slug_match:
                slug = slug_match.group(1).lower()
                # BUG FIX: guard against an empty target name — '' is a
                # substring of every title, so the original matched ANY
                # LinkedIn result when no name was known.
                name_in_title = bool(target_name_lower) and target_name_lower in title.lower()
                name_in_slug = any(part in slug for part in target_name_lower.split()[:2] if len(part) > 2)
                if name_in_title or name_in_slug:
                    extracted['linkedin_url'] = url
                    # Extract name from title (format: "Name - Title | LinkedIn")
                    name_match = re.match(r'^([^-|]+)', title)
                    if name_match:
                        extracted['name'] = name_match.group(1).strip()
                    # Extract headline from title
                    headline_match = re.search(r' - ([^|]+)', title)
                    if headline_match:
                        extracted['headline'] = headline_match.group(1).strip()

                    # Extract info from content.
                    # Pattern: "Experience: X · Education: Y · Location: Z · N connections"
                    exp_match = re.search(r'Experience:\s*([^·]+)', content)
                    if exp_match:
                        extracted['experience'].append({
                            'company': exp_match.group(1).strip(),
                            'source': 'linkup_search'
                        })
                    edu_match = re.search(r'Education:\s*([^·]+)', content)
                    if edu_match:
                        extracted['education'].append({
                            'school': edu_match.group(1).strip(),
                            'source': 'linkup_search'
                        })
                    loc_match = re.search(r'Location:\s*([^·]+)', content)
                    if loc_match:
                        extracted['location'] = loc_match.group(1).strip()
                    conn_match = re.search(r'(\d+)\s*connections', content)
                    if conn_match:
                        extracted['connections'] = int(conn_match.group(1))

                    # Store relevant content snippet (skip tiny fragments).
                    if content and len(content) > 50:
                        extracted['source_snippets'].append({
                            'url': url,
                            'content': content[:500]
                        })

                    # Extract contact details from content
                    contacts = extract_contact_details(content)
                    extracted['contacts']['emails'].extend(contacts['emails'])
                    extracted['contacts']['phones'].extend(contacts['phones'])
        # Also check non-LinkedIn results for contact info
        elif content:
            contacts = extract_contact_details(content)
            extracted['contacts']['emails'].extend(contacts['emails'])
            extracted['contacts']['phones'].extend(contacts['phones'])

    # Deduplicate contacts, preserving first-seen order for determinism.
    extracted['contacts']['emails'] = list(dict.fromkeys(extracted['contacts']['emails']))
    extracted['contacts']['phones'] = list(dict.fromkeys(extracted['contacts']['phones']))
    return extracted
def create_linkup_provenance(url: str, search_query: str, timestamp: str) -> Dict:
    """Build a provenance record for Linkup-sourced data (AGENTS.md Rule 27).

    Args:
        url: Source URL the data was extracted from.
        search_query: Query submitted to the Linkup search API.
        timestamp: ISO-8601 retrieval time.

    Returns:
        Provenance dict with fixed agent/method fields; 'xpath' is always
        None because Linkup returns processed content, not raw HTML.
    """
    return dict(
        source_url=url,
        retrieved_on=timestamp,
        retrieval_agent='linkup',
        search_query=search_query,
        method='linkup-search',
        xpath=None,
        notes='Enriched via Linkup search API. Content extracted from search result snippets.',
    )
def enrich_profile(profile_path: Path, search_results: List[Dict], search_query: str) -> bool:
    """Enrich a single profile JSON file with Linkup search results.

    Loads the profile, extracts data from the search results, merges new
    fields without clobbering existing values, records provenance per
    AGENTS.md Rule 27, and writes the file back in place.

    Args:
        profile_path: Path to the profile JSON file to update.
        search_results: Linkup search result dicts.
        search_query: Query used to obtain the results (stored in provenance).

    Returns:
        True if the profile was enriched and saved; False if no useful data
        was found (the file is left untouched in that case).
    """
    # Load existing profile
    with open(profile_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Resolve the target's name from whichever field is populated.
    profile_data = data.get('profile_data', {})
    source_info = data.get('source_staff_info', {})
    target_name = (
        profile_data.get('name') or
        profile_data.get('full_name') or
        source_info.get('name', '')
    )
    target_custodian = source_info.get('custodian', '')

    # Extract info from search results
    extracted = extract_profile_info_from_search_results(
        search_results,
        target_name,
        target_custodian
    )

    # Skip (leaving the file unmodified) if no useful data was found.
    if not any([extracted['headline'], extracted['education'], extracted['experience'], extracted['location']]):
        print(f"No useful data found for {target_name}")
        return False

    # Timezone-aware UTC timestamp shared by every provenance field below.
    timestamp = datetime.now(timezone.utc).isoformat()

    # Provenance source: prefer the verified LinkedIn URL, then the first
    # snippet's URL, then the API root as a last resort.
    source_url = extracted.get('linkedin_url') or (extracted['source_snippets'][0]['url'] if extracted['source_snippets'] else 'https://api.linkup.so')
    provenance = create_linkup_provenance(source_url, search_query, timestamp)

    # Add timeline_enrichment section
    data['timeline_enrichment'] = {
        'enrichment_date': timestamp,
        'provenance': provenance,
        'extracted_data': {
            'headline': extracted['headline'],
            'location': extracted['location'],
            'connections': extracted['connections'],
            'education': extracted['education'],
            'experience': extracted['experience'],
            'contacts': extracted['contacts']
        },
        'source_snippets': extracted['source_snippets'][:3]  # Keep top 3 snippets
    }

    # Merge into profile_data, never overwriting values already present
    # (except 'connections', which is always refreshed when found).
    if extracted['headline'] and not profile_data.get('headline'):
        profile_data['headline'] = extracted['headline']
    if extracted['location'] and not profile_data.get('location'):
        profile_data['location'] = extracted['location']
    if extracted['connections']:
        profile_data['connections'] = extracted['connections']

    if extracted['education']:
        if not profile_data.get('education'):
            profile_data['education'] = []
        for edu in extracted['education']:
            # Case-insensitive dedupe; recomputed each pass so entries just
            # appended are also considered.
            existing_schools = [e.get('school', '').lower() for e in profile_data['education']]
            if edu['school'].lower() not in existing_schools:
                profile_data['education'].append(edu)

    if extracted['experience']:
        if not profile_data.get('career_history'):
            profile_data['career_history'] = []
        for exp in extracted['experience']:
            # Avoid duplicates (case-insensitive company match).
            existing_companies = [e.get('company', '').lower() for e in profile_data.get('career_history', [])]
            if exp['company'].lower() not in existing_companies:
                profile_data['career_history'].append(exp)

    # BUG FIX: 'extraction_metadata' may be absent from the profile; the
    # original indexed data['extraction_metadata'] directly and raised
    # KeyError. setdefault keeps any existing metadata intact.
    metadata = data.setdefault('extraction_metadata', {})

    # Update corrected LinkedIn URL if found
    if extracted['linkedin_url']:
        metadata['linkedin_url_verified'] = extracted['linkedin_url']

    # Add contacts to profile_data if found
    if extracted['contacts']['emails'] or extracted['contacts']['phones']:
        if not profile_data.get('contacts'):
            profile_data['contacts'] = {}
        if extracted['contacts']['emails']:
            profile_data['contacts']['emails'] = extracted['contacts']['emails']
        if extracted['contacts']['phones']:
            profile_data['contacts']['phones'] = extracted['contacts']['phones']

    data['profile_data'] = profile_data

    # Update extraction metadata
    metadata['linkup_enriched'] = True
    metadata['timeline_enrichment_date'] = timestamp

    # Write back in place (UTF-8, human-readable JSON).
    with open(profile_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"✓ Enriched {target_name}")
    print(f" - Headline: {extracted['headline']}")
    print(f" - Location: {extracted['location']}")
    print(f" - Education: {len(extracted['education'])} entries")
    print(f" - Experience: {len(extracted['experience'])} entries")
    print(f" - Emails: {len(extracted['contacts']['emails'])}")
    print(f" - Phones: {len(extracted['contacts']['phones'])}")
    return True
def main():
    """CLI entry point: parse args, decode search results, run enrichment.

    Usage:
        enrich_single_profile_linkup.py <profile_path> <search_results_json> [search_query]

    Exits 0 on successful enrichment, 1 on any error or when no useful
    data was found. Error messages go to stderr (the original wrote them
    to stdout, which pollutes pipelines consuming this script's output).
    """
    if len(sys.argv) < 3:
        print("Usage: python3 enrich_single_profile_linkup.py <profile_path> <search_results_json>", file=sys.stderr)
        sys.exit(1)

    profile_path = Path(sys.argv[1])
    search_results_json = sys.argv[2]
    # Optional third argument: the query string recorded in provenance.
    search_query = sys.argv[3] if len(sys.argv) > 3 else "LinkedIn profile search"

    if not profile_path.exists():
        print(f"Error: Profile not found: {profile_path}", file=sys.stderr)
        sys.exit(1)

    try:
        search_results_raw = json.loads(search_results_json)
        # Accept either the Linkup envelope ({'results': [...]}) or a bare list.
        if isinstance(search_results_raw, dict) and 'results' in search_results_raw:
            search_results: List[Dict[str, Any]] = search_results_raw['results']
        elif isinstance(search_results_raw, list):
            search_results = search_results_raw
        else:
            print("Error: Unexpected search results format", file=sys.stderr)
            sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error parsing search results: {e}", file=sys.stderr)
        sys.exit(1)

    success = enrich_profile(profile_path, search_results, search_query)
    sys.exit(0 if success else 1)


if __name__ == '__main__':
    main()