#!/usr/bin/env python3
"""
Enrich basic person profiles using Linkup API.

This script takes fallback_basic and privacy_restricted_fallback profiles
and enriches them with additional data from Linkup searches and page fetches.

Provenance is tracked according to AGENTS.md Rule 27.
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Any
|
|
import subprocess
|
|
|
|
# Configuration
# Directory holding one JSON file per person entity profile.
ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
# NOTE(review): BATCH_SIZE is currently unused — main() iterates profiles sequentially.
BATCH_SIZE = 10  # Process in batches
DELAY_BETWEEN_REQUESTS = 1.5  # seconds between API calls
|
|
def load_basic_profiles() -> List[Path]:
    """Return paths of profiles that still need enrichment.

    Scans ENTITY_DIR for JSON profiles whose extraction method is
    'fallback_basic' or 'privacy_restricted_fallback' and that do not yet
    carry a 'linkup_enrichment' section. Unreadable files are reported
    and skipped.
    """
    pending: List[Path] = []

    for candidate in ENTITY_DIR.glob("*.json"):
        try:
            with open(candidate, 'r', encoding='utf-8') as fh:
                record = json.load(fh)

            # Skip anything Linkup has already touched.
            if record.get('linkup_enrichment', {}):
                continue

            method = record.get('extraction_metadata', {}).get('extraction_method', '')
            if method in ('fallback_basic', 'privacy_restricted_fallback'):
                pending.append(candidate)
        except Exception as e:
            print(f"Error reading {candidate}: {e}")

    return pending
|
|
|
|
|
def _linkup_post(endpoint: str, payload: Dict[str, Any], label: str) -> Optional[Dict]:
    """POST *payload* to the Linkup API *endpoint* and return the parsed JSON.

    Shared transport for search/fetch: same auth header (LINKUP_API_KEY from
    the environment), same 30s timeout, same error reporting. *label* names
    the operation in error messages ('search' or 'fetch'). Returns None on
    any non-200 response or exception (including httpx being unavailable).
    """
    try:
        # httpx imported lazily so the script still loads when it is absent.
        import httpx

        response = httpx.post(
            f"https://api.linkup.so/v1/{endpoint}",
            headers={
                "Authorization": f"Bearer {os.environ.get('LINKUP_API_KEY', '')}",
                "Content-Type": "application/json"
            },
            json=payload,
            timeout=30.0
        )

        if response.status_code == 200:
            return response.json()
        print(f"Linkup {label} error: {response.status_code} - {response.text}")
        return None
    except Exception as e:
        print(f"Error calling Linkup {label}: {e}")
        return None


def call_linkup_search(query: str) -> Optional[Dict]:
    """Call the Linkup search API; return the JSON result or None on failure."""
    return _linkup_post(
        "search",
        {"q": query, "depth": "standard", "outputType": "searchResults"},
        "search",
    )


def call_linkup_fetch(url: str) -> Optional[Dict]:
    """Fetch a URL via the Linkup fetch API; return the JSON result or None."""
    return _linkup_post("fetch", {"url": url, "renderJs": False}, "fetch")
|
|
|
|
|
def extract_profile_data_from_markdown(markdown: str, linkedin_url: str) -> Dict:
    """Parse structured profile fields out of LinkedIn page markdown.

    Returns a dict with name/headline/location/connections/current_company,
    plus education, experience, volunteer_experience, skills and about.
    Fields that cannot be found remain None or empty; a 'followers' key is
    added only when a followers count is present in the markdown.
    linkedin_url is currently unused and kept for interface compatibility.
    """
    result: Dict[str, Any] = {
        'name': None,
        'headline': None,
        'location': None,
        'connections': None,
        'current_company': None,
        'education': [],
        'experience': [],
        'volunteer_experience': [],
        'skills': [],
        'about': None,
    }

    # Page title ("# Full Name") at the very start of the document.
    if (m := re.search(r'^# ([^\n]+)', markdown)):
        result['name'] = m.group(1).strip()

    # Summary line: "Experience: X · Education: Y · Location: Z · N connections".
    if (m := re.search(r'Experience: ([^·]+)·\s*Education: ([^·]+)·\s*Location: ([^·]+)·\s*(\d+) connections', markdown)):
        company, school, place, conn = m.groups()
        result['current_company'] = company.strip()
        if school.strip():
            result['education'].append({'school': school.strip()})
        result['location'] = place.strip()
        result['connections'] = int(conn)

    # "N followers M connections" — overrides the connection count above.
    if (m := re.search(r'(\d+) followers (\d+) connections', markdown)):
        result['followers'] = int(m.group(1))
        result['connections'] = int(m.group(2))

    # Volunteer sections: "### Title / #### [Org](...) / duration / description".
    vol_pattern = r'### ([^\n]+)\n\n#### \[([^\]]+)\][^\n]*\n\n([^\n]+)\n\n([^#]*?)(?=\n- |$)'
    for title, org, duration, description in re.findall(vol_pattern, markdown, re.MULTILINE):
        if 'Volunteer' in title or 'volunteer' in description.lower():
            result['volunteer_experience'].append({
                'title': title.strip(),
                'organization': org.strip(),
                'duration': duration.strip(),
                'description': description.strip()[:500],  # cap description length
            })

    # "## Experience & Education" section: company/role pairs.
    if (sec := re.search(r'## Experience & Education\n(.*?)(?=## |$)', markdown, re.DOTALL)):
        for company, role in re.findall(r'### ([^\n]+)\n\n#### \n\n([^\n]+)', sec.group(1)):
            # Skip escaped-markdown placeholder entries ("\*...").
            if company.strip() and not company.startswith('\\*'):
                result['experience'].append({
                    'company': company.strip(),
                    'role': role.strip() if not role.startswith('\\*') else None,
                })

    return result
|
|
|
|
|
def create_linkup_provenance(url: str, search_query: Optional[str] = None) -> Dict:
    """Build a provenance record for a Linkup retrieval.

    The method is 'linkup_search' when a search query is supplied, otherwise
    'linkup_fetch'. The timestamp is the current UTC time in ISO format.
    """
    return {
        'retrieval_agent': 'linkup',
        'retrieval_timestamp': datetime.now(timezone.utc).isoformat(),
        'source_url': url,
        'search_query': search_query,
        'method': 'linkup_search' if search_query else 'linkup_fetch',
    }
|
|
|
|
|
def enrich_profile(profile_path: Path) -> bool:
|
|
"""Enrich a single profile using Linkup."""
|
|
try:
|
|
with open(profile_path, 'r', encoding='utf-8') as f:
|
|
data = json.load(f)
|
|
|
|
# Get profile info for search
|
|
profile_data = data.get('profile_data', {})
|
|
source_info = data.get('source_staff_info', {})
|
|
extraction_meta = data.get('extraction_metadata', {})
|
|
|
|
name = (
|
|
profile_data.get('name') or
|
|
profile_data.get('full_name') or
|
|
source_info.get('name') or
|
|
'Unknown'
|
|
)
|
|
|
|
headline = (
|
|
profile_data.get('headline') or
|
|
source_info.get('headline') or
|
|
''
|
|
)
|
|
|
|
custodian = source_info.get('custodian', '')
|
|
linkedin_url = extraction_meta.get('linkedin_url', '')
|
|
|
|
# Build search query
|
|
search_parts = [name]
|
|
if custodian:
|
|
search_parts.append(custodian)
|
|
if headline and len(headline) < 50:
|
|
search_parts.append(headline)
|
|
search_parts.append('site:linkedin.com')
|
|
|
|
search_query = ' '.join(search_parts)
|
|
|
|
print(f"Enriching: {name} ({custodian})")
|
|
|
|
# Try direct LinkedIn fetch first
|
|
enriched_data = {}
|
|
provenance_list = []
|
|
|
|
if linkedin_url:
|
|
print(f" Fetching: {linkedin_url}")
|
|
fetch_result = call_linkup_fetch(linkedin_url)
|
|
|
|
if fetch_result and fetch_result.get('markdown'):
|
|
markdown = fetch_result['markdown']
|
|
extracted = extract_profile_data_from_markdown(markdown, linkedin_url)
|
|
|
|
# Add provenance
|
|
provenance_list.append(create_linkup_provenance(linkedin_url))
|
|
|
|
# Update enriched data
|
|
enriched_data['fetch_result'] = extracted
|
|
enriched_data['raw_markdown_length'] = len(markdown)
|
|
|
|
# Also do a search to potentially find additional info
|
|
print(f" Searching: {search_query[:80]}...")
|
|
search_result = call_linkup_search(search_query)
|
|
|
|
if search_result and search_result.get('results'):
|
|
results = search_result['results']
|
|
|
|
# Find the most relevant result
|
|
relevant_results = []
|
|
for r in results[:5]:
|
|
url = r.get('url', '')
|
|
content = r.get('content', '')
|
|
title = r.get('name', '')
|
|
|
|
# Check if it's a LinkedIn profile
|
|
if 'linkedin.com/in/' in url:
|
|
relevant_results.append({
|
|
'url': url,
|
|
'title': title,
|
|
'content': content[:500], # Limit content
|
|
'relevance': 'linkedin_profile'
|
|
})
|
|
elif name.lower() in title.lower() or name.lower() in content.lower():
|
|
relevant_results.append({
|
|
'url': url,
|
|
'title': title,
|
|
'content': content[:300],
|
|
'relevance': 'name_match'
|
|
})
|
|
|
|
if relevant_results:
|
|
provenance_list.append(create_linkup_provenance(
|
|
url=relevant_results[0]['url'],
|
|
search_query=search_query
|
|
))
|
|
enriched_data['search_results'] = relevant_results
|
|
|
|
# Only update if we found something
|
|
if enriched_data:
|
|
# Add Linkup enrichment section
|
|
data['linkup_enrichment'] = {
|
|
'enrichment_date': datetime.now(timezone.utc).isoformat(),
|
|
'provenance': provenance_list,
|
|
'data': enriched_data
|
|
}
|
|
|
|
# Update profile_data if we got better info
|
|
if enriched_data.get('fetch_result'):
|
|
fetch_data = enriched_data['fetch_result']
|
|
|
|
if fetch_data.get('location') and not profile_data.get('location'):
|
|
profile_data['location'] = fetch_data['location']
|
|
|
|
if fetch_data.get('connections'):
|
|
profile_data['connections'] = fetch_data['connections']
|
|
|
|
if fetch_data.get('education'):
|
|
profile_data['education'] = fetch_data['education']
|
|
|
|
if fetch_data.get('experience'):
|
|
if not profile_data.get('career_history'):
|
|
profile_data['career_history'] = []
|
|
for exp in fetch_data['experience']:
|
|
if exp.get('company'):
|
|
profile_data['career_history'].append(exp)
|
|
|
|
if fetch_data.get('volunteer_experience'):
|
|
profile_data['volunteer_experience'] = fetch_data['volunteer_experience']
|
|
|
|
data['profile_data'] = profile_data
|
|
|
|
# Update extraction metadata
|
|
data['extraction_metadata']['linkup_enriched'] = True
|
|
data['extraction_metadata']['linkup_enrichment_date'] = datetime.now(timezone.utc).isoformat()
|
|
|
|
# Write back
|
|
with open(profile_path, 'w', encoding='utf-8') as f:
|
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
|
|
|
print(f" ✓ Enriched with {len(provenance_list)} sources")
|
|
return True
|
|
else:
|
|
print(f" - No additional data found")
|
|
return False
|
|
|
|
except Exception as e:
|
|
print(f"Error enriching {profile_path}: {e}")
|
|
return False
|
|
|
|
|
|
def main():
    """Main enrichment workflow: find basic profiles and enrich each via Linkup."""
    banner = "=" * 60
    print(banner)
    print("Linkup Profile Enrichment Script")
    print(banner)

    # Warn (but continue) when no API key is configured.
    if not os.environ.get('LINKUP_API_KEY'):
        print("Warning: LINKUP_API_KEY not set. Using MCP tools instead.")

    profiles = load_basic_profiles()
    total = len(profiles)
    print(f"\nFound {total} profiles to enrich")

    if not profiles:
        print("No profiles need enrichment.")
        return

    enriched_count = 0
    failed_count = 0

    for index, profile_path in enumerate(profiles, start=1):
        print(f"\n[{index}/{total}] ", end="")

        try:
            if enrich_profile(profile_path):
                enriched_count += 1
            else:
                failed_count += 1
        except Exception as e:
            print(f"Error: {e}")
            failed_count += 1

        # Rate limiting between requests (skipped after the last profile).
        if index < total:
            time.sleep(DELAY_BETWEEN_REQUESTS)

        # Periodic progress report.
        if index % 100 == 0:
            print(f"\n--- Progress: {index}/{total} processed, {enriched_count} enriched ---\n")

    print("\n" + banner)
    print("Enrichment Complete")
    print(banner)
    print(f"Total profiles processed: {total}")
    print(f"Successfully enriched: {enriched_count}")
    print(f"Failed/No data: {failed_count}")
|
|
|
|
|
# Script entry point.
if __name__ == '__main__':
    main()
|