# glam/scripts/enrich_profiles_linkup.py
# Last modified: 2026-01-02 02:11:04 +01:00
# 392 lines, 14 KiB, Python
#!/usr/bin/env python3
"""
Enrich basic person profiles using Linkup API.
This script takes fallback_basic and privacy_restricted_fallback profiles
and enriches them with additional data from Linkup searches and page fetches.
Provenance is tracked according to AGENTS.md Rule 27.
"""
import json
import os
import sys
import time
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
import subprocess
# Configuration
ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
BATCH_SIZE = 10 # Process in batches
DELAY_BETWEEN_REQUESTS = 1.5 # seconds between API calls
def load_basic_profiles(entity_dir: Optional[Path] = None) -> List[Path]:
    """Return paths of profile JSON files that still need Linkup enrichment.

    A profile qualifies when its ``extraction_metadata.extraction_method`` is
    'fallback_basic' or 'privacy_restricted_fallback' AND it has no
    ``timeline_enrichment`` section yet (i.e. it was not enriched by a
    previous run).

    Args:
        entity_dir: Directory to scan for ``*.json`` profiles. Defaults to
            the module-level ENTITY_DIR, preserving the original behavior.

    Returns:
        List of paths to enrich (order follows ``Path.glob``, which is not
        guaranteed to be sorted).
    """
    root = ENTITY_DIR if entity_dir is None else entity_dir
    profiles_to_enrich: List[Path] = []
    for file_path in root.glob("*.json"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Skip profiles already enriched with Linkup.
            if data.get('timeline_enrichment', {}):
                continue
            method = data.get('extraction_metadata', {}).get('extraction_method', '')
            if method in ('fallback_basic', 'privacy_restricted_fallback'):
                profiles_to_enrich.append(file_path)
        except Exception as e:
            # Best-effort scan: report unreadable/corrupt files and move on.
            print(f"Error reading {file_path}: {e}")
    return profiles_to_enrich
def _linkup_post(endpoint: str, payload: Dict, label: str) -> Optional[Dict]:
    """POST ``payload`` to a Linkup API endpoint and return the parsed JSON.

    Shared transport for the search/fetch wrappers below (they were
    previously near-duplicate code). Auth comes from the LINKUP_API_KEY
    environment variable; an empty bearer token is sent when it is unset.

    Args:
        endpoint: Path segment under https://api.linkup.so/v1/ (e.g. 'search').
        payload: JSON body for the request.
        label: Human-readable name used in error messages ('search'/'fetch').

    Returns:
        Parsed JSON dict on HTTP 200, otherwise None. All errors (HTTP and
        transport, including httpx being unavailable) are printed, never
        raised.
    """
    try:
        # Local import keeps httpx an optional dependency at module load.
        import httpx
        response = httpx.post(
            f"https://api.linkup.so/v1/{endpoint}",
            headers={
                "Authorization": f"Bearer {os.environ.get('LINKUP_API_KEY', '')}",
                "Content-Type": "application/json"
            },
            json=payload,
            timeout=30.0
        )
        if response.status_code == 200:
            return response.json()
        print(f"Linkup {label} error: {response.status_code} - {response.text}")
        return None
    except Exception as e:
        print(f"Error calling Linkup {label}: {e}")
        return None


def call_linkup_search(query: str) -> Optional[Dict]:
    """Run a Linkup web search for ``query`` (standard depth, raw results).

    Returns the API's JSON response (expected to contain a 'results' list)
    or None on any failure.
    """
    return _linkup_post(
        "search",
        {"q": query, "depth": "standard", "outputType": "searchResults"},
        "search",
    )


def call_linkup_fetch(url: str) -> Optional[Dict]:
    """Fetch ``url`` through Linkup without JavaScript rendering.

    Returns the API's JSON response (expected to contain a 'markdown' key)
    or None on any failure.
    """
    return _linkup_post("fetch", {"url": url, "renderJs": False}, "fetch")
def extract_profile_data_from_markdown(markdown: str, linkedin_url: str) -> Dict:
    """Extract structured profile data from a LinkedIn page's markdown dump.

    Args:
        markdown: Markdown rendering of a LinkedIn profile page (as returned
            by the Linkup fetch API).
        linkedin_url: The profile URL. Currently unused; kept for interface
            compatibility with existing callers.

    Returns:
        Dict with keys name, headline, location, connections,
        current_company, education, experience, volunteer_experience,
        skills, about — plus 'followers' when the markdown contains a
        follower count. Fields that cannot be parsed stay None / empty.
    """
    data = {
        'name': None,
        'headline': None,
        'location': None,
        'connections': None,
        'current_company': None,
        'education': [],
        'experience': [],
        'volunteer_experience': [],
        'skills': [],
        'about': None
    }
    # Name: top-level markdown heading at the very start of the document.
    name_match = re.search(r'^# ([^\n]+)', markdown)
    if name_match:
        data['name'] = name_match.group(1).strip()
    # Header summary: "Experience: X · Education: Y · Location: Z · N connections".
    header_match = re.search(r'Experience: ([^·]+)·\s*Education: ([^·]+)·\s*Location: ([^·]+)·\s*(\d+) connections', markdown)
    if header_match:
        data['current_company'] = header_match.group(1).strip()
        education_name = header_match.group(2).strip()
        if education_name:
            data['education'].append({'school': education_name})
        data['location'] = header_match.group(3).strip()
        data['connections'] = int(header_match.group(4))
    # "N followers M connections" overrides the header connection count.
    followers_match = re.search(r'(\d+) followers (\d+) connections', markdown)
    if followers_match:
        data['followers'] = int(followers_match.group(1))
        data['connections'] = int(followers_match.group(2))
    # Volunteer sections: "### Title / #### [Org](...) / duration / description".
    volunteer_sections = re.findall(
        r'### ([^\n]+)\n\n#### \[([^\]]+)\][^\n]*\n\n([^\n]+)\n\n([^#]*?)(?=\n- |$)',
        markdown,
        re.MULTILINE
    )
    for title, org, duration, description in volunteer_sections:
        # FIX: match 'volunteer' case-insensitively in BOTH title and
        # description (previously only a capitalized 'Volunteer' matched
        # in titles, inconsistent with the description check).
        if 'volunteer' in title.lower() or 'volunteer' in description.lower():
            data['volunteer_experience'].append({
                'title': title.strip(),
                'organization': org.strip(),
                'duration': duration.strip(),
                'description': description.strip()[:500]  # Limit description length
            })
    # Experience entries from the "Experience & Education" section.
    exp_section = re.search(r'## Experience & Education\n(.*?)(?=## |$)', markdown, re.DOTALL)
    if exp_section:
        exp_items = re.findall(
            r'### ([^\n]+)\n\n#### \n\n([^\n]+)',
            exp_section.group(1)
        )
        for company, role in exp_items:
            # Skip markdown-escaped bullet artifacts ('\*...') posing as names.
            if company.strip() and not company.startswith('\\*'):
                data['experience'].append({
                    'company': company.strip(),
                    'role': role.strip() if not role.startswith('\\*') else None
                })
    return data
def create_linkup_provenance(url: str, search_query: Optional[str] = None) -> Dict:
    """Build a provenance record for one piece of Linkup-sourced data.

    Args:
        url: The source URL the data came from.
        search_query: The query used when the data came from a search;
            None (the default) indicates a direct page fetch.

    Returns:
        Dict tagging the retrieval agent, a UTC ISO-8601 timestamp, the
        source URL, the query (or None), and the retrieval method
        ('linkup_search' when a query was given, else 'linkup_fetch').
    """
    record = {
        'retrieval_agent': 'linkup',
        'retrieval_timestamp': datetime.now(timezone.utc).isoformat(),
        'source_url': url,
        'search_query': search_query,
    }
    # Key order matters for the serialized JSON, so 'method' stays last.
    record['method'] = 'linkup_search' if search_query else 'linkup_fetch'
    return record
def enrich_profile(profile_path: Path) -> bool:
    """Enrich a single profile using Linkup.

    Reads the profile JSON at ``profile_path``, attempts a direct Linkup
    fetch of its stored LinkedIn URL, then runs a Linkup web search for the
    person, and — if either step yields data — writes a
    ``timeline_enrichment`` section (with provenance entries) plus selected
    ``profile_data`` updates back to the same file.

    Returns:
        True when the file was updated with enrichment data, False when
        nothing new was found or an error occurred (errors are printed,
        never raised).
    """
    try:
        with open(profile_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Get profile info for search
        profile_data = data.get('profile_data', {})
        source_info = data.get('source_staff_info', {})
        extraction_meta = data.get('extraction_metadata', {})
        # Prefer the richest available name, falling back through known fields.
        name = (
            profile_data.get('name') or
            profile_data.get('full_name') or
            source_info.get('name') or
            'Unknown'
        )
        headline = (
            profile_data.get('headline') or
            source_info.get('headline') or
            ''
        )
        custodian = source_info.get('custodian', '')
        linkedin_url = extraction_meta.get('linkedin_url', '')
        # Build search query: name + employer + (short) headline, scoped to LinkedIn.
        search_parts = [name]
        if custodian:
            search_parts.append(custodian)
        if headline and len(headline) < 50:
            # Long headlines tend to dilute the query rather than sharpen it.
            search_parts.append(headline)
        search_parts.append('site:linkedin.com')
        search_query = ' '.join(search_parts)
        print(f"Enriching: {name} ({custodian})")
        # Try direct LinkedIn fetch first
        enriched_data = {}
        provenance_list = []
        if linkedin_url:
            print(f" Fetching: {linkedin_url}")
            fetch_result = call_linkup_fetch(linkedin_url)
            if fetch_result and fetch_result.get('markdown'):
                markdown = fetch_result['markdown']
                extracted = extract_profile_data_from_markdown(markdown, linkedin_url)
                # Add provenance
                provenance_list.append(create_linkup_provenance(linkedin_url))
                # Update enriched data
                enriched_data['fetch_result'] = extracted
                enriched_data['raw_markdown_length'] = len(markdown)
        # Also do a search to potentially find additional info
        print(f" Searching: {search_query[:80]}...")
        search_result = call_linkup_search(search_query)
        if search_result and search_result.get('results'):
            results = search_result['results']
            # Find the most relevant result (top 5 hits only).
            relevant_results = []
            for r in results[:5]:
                url = r.get('url', '')
                content = r.get('content', '')
                title = r.get('name', '')
                # Check if it's a LinkedIn profile
                if 'linkedin.com/in/' in url:
                    relevant_results.append({
                        'url': url,
                        'title': title,
                        'content': content[:500],  # Limit content
                        'relevance': 'linkedin_profile'
                    })
                elif name.lower() in title.lower() or name.lower() in content.lower():
                    # Weaker signal: the name merely appears in the hit.
                    relevant_results.append({
                        'url': url,
                        'title': title,
                        'content': content[:300],
                        'relevance': 'name_match'
                    })
            if relevant_results:
                # Provenance is recorded against the first relevant hit only.
                provenance_list.append(create_linkup_provenance(
                    url=relevant_results[0]['url'],
                    search_query=search_query
                ))
                enriched_data['search_results'] = relevant_results
        # Only update if we found something
        if enriched_data:
            # Add Linkup enrichment section
            data['timeline_enrichment'] = {
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'provenance': provenance_list,
                'data': enriched_data
            }
            # Update profile_data if we got better info
            if enriched_data.get('fetch_result'):
                fetch_data = enriched_data['fetch_result']
                # Location only fills a gap; connections/education from the
                # fetch overwrite existing values.
                if fetch_data.get('location') and not profile_data.get('location'):
                    profile_data['location'] = fetch_data['location']
                if fetch_data.get('connections'):
                    profile_data['connections'] = fetch_data['connections']
                if fetch_data.get('education'):
                    profile_data['education'] = fetch_data['education']
                if fetch_data.get('experience'):
                    if not profile_data.get('career_history'):
                        profile_data['career_history'] = []
                    # Append fetched roles; no de-duplication is done here.
                    for exp in fetch_data['experience']:
                        if exp.get('company'):
                            profile_data['career_history'].append(exp)
                if fetch_data.get('volunteer_experience'):
                    profile_data['volunteer_experience'] = fetch_data['volunteer_experience']
            data['profile_data'] = profile_data
            # Update extraction metadata
            data['extraction_metadata']['linkup_enriched'] = True
            data['extraction_metadata']['timeline_enrichment_date'] = datetime.now(timezone.utc).isoformat()
            # Write back (in place; not atomic — a crash here can truncate the file).
            with open(profile_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f" ✓ Enriched with {len(provenance_list)} sources")
            return True
        else:
            print(f" - No additional data found")
            return False
    except Exception as e:
        print(f"Error enriching {profile_path}: {e}")
        return False
def main():
    """Run the Linkup enrichment workflow over all pending profiles.

    Loads every profile that needs enrichment, enriches them sequentially
    with a fixed delay between profiles, prints periodic progress, and ends
    with a summary. Profiles that yielded no new data are counted together
    with hard failures ('Failed/No data').
    """
    print("=" * 60)
    print("Linkup Profile Enrichment Script")
    print("=" * 60)
    # Fail-fast visibility: both API callers read LINKUP_API_KEY directly,
    # so without it every Linkup request will error out. (The previous
    # message claimed an MCP fallback that this script does not implement.)
    if not os.environ.get('LINKUP_API_KEY'):
        print("Warning: LINKUP_API_KEY not set. Linkup API calls will fail.")
    # Load profiles to enrich
    profiles = load_basic_profiles()
    print(f"\nFound {len(profiles)} profiles to enrich")
    if not profiles:
        print("No profiles need enrichment.")
        return
    # Sequential processing (BATCH_SIZE is reserved for future batching).
    enriched_count = 0
    failed_count = 0
    for i, profile_path in enumerate(profiles):
        print(f"\n[{i+1}/{len(profiles)}] ", end="")
        try:
            success = enrich_profile(profile_path)
            if success:
                enriched_count += 1
            else:
                failed_count += 1
        except Exception as e:
            # enrich_profile already guards internally; this is a last resort.
            print(f"Error: {e}")
            failed_count += 1
        # Rate limiting between consecutive profiles (skip after the last one).
        if i < len(profiles) - 1:
            time.sleep(DELAY_BETWEEN_REQUESTS)
        # Progress report every 100 profiles
        if (i + 1) % 100 == 0:
            print(f"\n--- Progress: {i+1}/{len(profiles)} processed, {enriched_count} enriched ---\n")
    # Final report
    print("\n" + "=" * 60)
    print("Enrichment Complete")
    print("=" * 60)
    print(f"Total profiles processed: {len(profiles)}")
    print(f"Successfully enriched: {enriched_count}")
    print(f"Failed/No data: {failed_count}")


if __name__ == '__main__':
    main()