411 lines
13 KiB
Python
Executable file
411 lines
13 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Enrich person entity profiles that have empty experience data using Linkup API.
|
|
|
|
This script:
|
|
1. Identifies profiles with is_heritage_relevant=true but empty experience
|
|
2. Uses Linkup search to find career information
|
|
3. Stores raw responses in data/custodian/web/linkedin/{slug}/
|
|
4. Updates entity profiles with enrichment data
|
|
|
|
Per AGENTS.md Rule 27: Person entity files are the SINGLE SOURCE OF TRUTH.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
import re
|
|
import httpx
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Any
|
|
|
|
# Configuration
|
|
ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
|
|
WEB_DIR = Path("/Users/kempersc/apps/glam/data/custodian/web/linkedin")
|
|
BATCH_SIZE = 10
|
|
DELAY_BETWEEN_REQUESTS = 2.0 # seconds between API calls
|
|
|
|
# Linkup API configuration
|
|
LINKUP_API_BASE = "https://api.linkup.so/v1"
|
|
|
|
|
|
def get_linkup_api_key() -> Optional[str]:
    """Return the Linkup API key from the LINKUP_API_KEY env var, or None if unset."""
    return os.getenv('LINKUP_API_KEY')
|
|
|
|
|
|
def search_linkup(query: str, depth: str = "deep") -> Optional[Dict]:
    """Run a web search through the Linkup API.

    Args:
        query: Search query string.
        depth: Search depth, either "standard" or "deep".

    Returns:
        The decoded JSON response dict on HTTP 200, otherwise None
        (missing API key, non-200 status, or any transport error).
    """
    api_key = get_linkup_api_key()
    if not api_key:
        print(" ERROR: LINKUP_API_KEY not set")
        return None

    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "q": query,
        "depth": depth,
        "outputType": "searchResults",
    }

    try:
        with httpx.Client(timeout=60.0) as client:
            response = client.post(
                f"{LINKUP_API_BASE}/search",
                headers=request_headers,
                json=payload,
            )
            if response.status_code == 200:
                return response.json()
            print(f" Linkup search error: {response.status_code}")
            return None
    except Exception as e:
        # Best-effort: log and signal failure rather than abort the batch.
        print(f" Error calling Linkup: {e}")
        return None
|
|
|
|
|
|
def fetch_linkup(url: str, render_js: bool = False) -> Optional[Dict]:
    """Fetch a URL's content through the Linkup fetch endpoint.

    Args:
        url: URL to fetch.
        render_js: Whether Linkup should render JavaScript before returning.

    Returns:
        The decoded JSON response dict on HTTP 200, otherwise None
        (missing API key, non-200 status, or any transport error).
    """
    api_key = get_linkup_api_key()
    if not api_key:
        print(" ERROR: LINKUP_API_KEY not set")
        return None

    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "url": url,
        "renderJs": render_js,
    }

    try:
        with httpx.Client(timeout=60.0) as client:
            response = client.post(
                f"{LINKUP_API_BASE}/fetch",
                headers=request_headers,
                json=payload,
            )
            if response.status_code == 200:
                return response.json()
            print(f" Linkup fetch error: {response.status_code}")
            return None
    except Exception as e:
        # Best-effort: log and signal failure rather than abort the batch.
        print(f" Error calling Linkup fetch: {e}")
        return None
|
|
|
|
|
|
def extract_linkedin_slug(url: str) -> str:
    """Extract the profile slug from a LinkedIn URL.

    Fixed to strip query strings and fragments first: the previous
    version turned ``.../in/jane-doe?originalSubdomain=nl`` into
    ``jane-doe?originalSubdomain=nl``, which then became an invalid /
    misleading directory name in save_raw_response().

    Args:
        url: Full LinkedIn profile URL, or an already-bare slug.

    Returns:
        The slug with no trailing slash, query string, or fragment.
    """
    # Drop query string and fragment before slicing out the slug.
    base = url.split('?', 1)[0].split('#', 1)[0]
    if '/in/' in base:
        return base.split('/in/')[-1].rstrip('/')
    return base.rstrip('/')
|
|
|
|
|
|
def load_profiles_needing_enrichment() -> List[Dict]:
    """Collect entity profiles that still need Linkup enrichment.

    A profile qualifies when all of the following hold:
    1. heritage_relevance.is_heritage_relevant is true;
    2. profile_data.experience is empty or absent;
    3. a valid LinkedIn profile URL is present;
    4. no timeline_enrichment.enriched_on marker exists yet.

    Returns:
        A list of lightweight candidate dicts (file path, name, headline,
        LinkedIn URL, slug, custodian, heritage type).
    """
    candidates: List[Dict] = []

    for entity_file in ENTITY_DIR.glob("*.json"):
        try:
            with open(entity_file, 'r', encoding='utf-8') as fh:
                entity = json.load(fh)

            # Only heritage-relevant people are worth an API call.
            relevance = entity.get('heritage_relevance', {})
            if not relevance.get('is_heritage_relevant', False):
                continue

            # Skip anyone who already carries experience data.
            details = entity.get('profile_data', {})
            if details.get('experience', []):
                continue

            # A usable LinkedIn profile URL is required for the search.
            url = (
                entity.get('extraction_metadata', {}).get('linkedin_url')
                or details.get('linkedin_url')
            )
            if not url or 'linkedin.com/in/' not in url:
                continue

            # Skip profiles already touched by a previous Linkup run.
            if entity.get('timeline_enrichment', {}).get('enriched_on'):
                continue

            # Source staff info improves the search query downstream.
            staff_info = entity.get('source_staff_info', {})

            candidates.append({
                'file_path': str(entity_file),
                'name': details.get('name') or staff_info.get('name', 'Unknown'),
                'headline': details.get('headline') or staff_info.get('headline', ''),
                'linkedin_url': url,
                'slug': extract_linkedin_slug(url),
                'custodian': staff_info.get('custodian', ''),
                'heritage_type': staff_info.get('heritage_type', ''),
            })
        except Exception as e:
            print(f"Error loading {entity_file}: {e}")

    return candidates
|
|
|
|
|
|
def build_search_query(profile: Dict) -> str:
    """Compose the Linkup search query for one candidate profile.

    Uses the custodian as the organisation when present; otherwise tries
    to pull an organisation name out of the headline.
    """
    person = profile['name']
    headline = profile.get('headline', '')
    organisation = profile.get('custodian', '')

    if not organisation and headline:
        # Headlines commonly look like "Title at Organization",
        # "Title bij Organization" (Dutch), or use | / - separators.
        for separator in (' at ', ' bij ', ' | ', ' - '):
            if separator in headline:
                pieces = headline.split(separator)
                if len(pieces) > 1:
                    organisation = pieces[-1].split('|')[0].strip()
                    break

    # Quote the name for an exact-phrase match, then add context terms.
    terms = [f'"{person}"']
    if organisation:
        terms.append(organisation)
    terms.append('linkedin.com/in career experience education')

    return ' '.join(terms)
|
|
|
|
|
|
def parse_experience_from_results(results: List[Dict], name: str) -> List[Dict]:
    """Heuristically extract (title, company) pairs from Linkup results.

    Only results whose text mentions *name* (case-insensitive) are
    considered. Matches are deduplicated case-insensitively, in
    first-seen order, and capped at 10 entries.
    """
    # Patterns for "Title at Company", "Title bij Company" (Dutch),
    # and "Company · Title" style fragments.
    job_patterns = [
        r'(?:as|is|was)\s+(?:a\s+)?([^·|]+?)\s+(?:at|for|bij)\s+([^·|]+)',
        r'([A-Z][^·|]+?)\s+(?:at|@|bij)\s+([A-Z][^·|]+)',
        r'([^·]+)\s+·\s+([A-Z][^·\n]+)',
    ]

    found: List[Dict] = []
    for item in results:
        text = item.get('content', '') or item.get('snippet', '')

        # Ignore hits that never mention the person at all.
        if name.lower() not in text.lower():
            continue

        for pattern in job_patterns:
            for groups in re.findall(pattern, text, re.IGNORECASE):
                if len(groups) < 2:
                    continue
                role = groups[0].strip()[:100]
                employer = groups[1].strip()[:100]
                # Drop fragments too short to be meaningful.
                if len(role) < 3 or len(employer) < 3:
                    continue
                found.append({
                    'title': role,
                    'company': employer,
                    'source': 'linkup_search',
                    'current': False,
                })

    # Case-insensitive dedup, preserving first-seen order.
    seen = set()
    deduped: List[Dict] = []
    for entry in found:
        fingerprint = (entry['title'].lower(), entry['company'].lower())
        if fingerprint not in seen:
            seen.add(fingerprint)
            deduped.append(entry)

    return deduped[:10]  # Limit to 10 positions
|
|
|
|
|
|
def save_raw_response(slug: str, response_type: str, data: Dict) -> Path:
    """Archive a raw Linkup API response under the web/linkedin directory.

    Args:
        slug: LinkedIn profile slug, used as the directory name.
        response_type: Kind of call ("search", "fetch", ...) for the filename.
        data: Raw response payload to persist.

    Returns:
        Path of the JSON file that was written.
    """
    target_dir = WEB_DIR / slug
    target_dir.mkdir(parents=True, exist_ok=True)

    now = datetime.now(timezone.utc)
    out_path = target_dir / f"linkup_{response_type}_{now.strftime('%Y%m%dT%H%M%SZ')}.json"

    # Wrap the payload with retrieval metadata for provenance tracking.
    envelope = {
        'metadata': {
            'retrieval_agent': 'linkup',
            'retrieval_timestamp': now.isoformat(),
            'response_type': response_type,
        },
        'data': data,
    }
    with open(out_path, 'w', encoding='utf-8') as fh:
        json.dump(envelope, fh, indent=2, ensure_ascii=False)

    return out_path
|
|
|
|
|
|
def enrich_profile(profile: Dict) -> bool:
    """Enrich a single profile using Linkup search.

    Steps: build a query, run a deep Linkup search, archive the raw
    response, heuristically extract experience entries, and write the
    enrichment back into the entity JSON file (the single source of
    truth per AGENTS.md Rule 27).

    Fix: the original indexed entity_data['profile_data'] and
    entity_data['extraction_metadata'] directly, raising KeyError for
    entity files missing either section (the loader only used .get on
    them); setdefault now creates the sections on demand.

    Args:
        profile: Candidate dict produced by load_profiles_needing_enrichment().

    Returns:
        True if at least one experience entry was extracted and stored.
    """
    name = profile['name']
    slug = profile['slug']
    file_path = Path(profile['file_path'])

    print(f" Enriching: {name} ({slug})")

    query = build_search_query(profile)
    print(f" Query: {query[:60]}...")

    # Deep search yields richer snippets for the regex-based extraction.
    search_result = search_linkup(query, depth="deep")
    if not search_result:
        print(f" No search results")
        return False

    # Archive the raw response before parsing, so a bad parse can always
    # be re-run against the stored payload.
    raw_path = save_raw_response(slug, "search", search_result)
    print(f" Raw response saved: {raw_path.name}")

    results = search_result.get('results', [])
    if not results:
        print(f" No results in response")
        return False

    experience = parse_experience_from_results(results, name)

    # Count LinkedIn-specific hits for the enrichment metadata.
    linkedin_results = [r for r in results if 'linkedin.com/in/' in r.get('url', '')]

    # Load the current entity file (single source of truth).
    with open(file_path, 'r', encoding='utf-8') as f:
        entity_data = json.load(f)

    timestamp = datetime.now(timezone.utc).isoformat()

    entity_data['timeline_enrichment'] = {
        'enriched_on': timestamp,
        'retrieval_agent': 'linkup',
        'search_query': query,
        'results_count': len(results),
        'linkedin_results_count': len(linkedin_results),
        'raw_response_path': str(raw_path.relative_to(Path('/Users/kempersc/apps/glam'))),
        'extracted_experience_count': len(experience)
    }

    if experience:
        # setdefault guards against entities missing these sections.
        entity_data.setdefault('profile_data', {})['experience'] = experience
        metadata = entity_data.setdefault('extraction_metadata', {})
        metadata['extraction_method'] = (
            metadata.get('extraction_method', 'unknown') + '_linkup_enriched'
        )
        metadata['notes'] = (
            metadata.get('notes', '') +
            f" Enriched via Linkup search on {timestamp[:10]} with {len(experience)} experience entries."
        )

    # Write the updated entity back in place.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(entity_data, f, indent=2, ensure_ascii=False)

    print(f" ✓ Enriched with {len(experience)} experience entries")
    return len(experience) > 0
|
|
|
|
|
|
def main():
    """Run the Linkup enrichment workflow over one batch of profiles."""
    banner = "=" * 60
    print(banner)
    print("Linkup Enrichment for Profiles with Empty Experience")
    print(banner)

    # Fail fast when the API key is missing.
    if not get_linkup_api_key():
        print("\nERROR: LINKUP_API_KEY environment variable not set")
        print("Please set it before running this script.")
        sys.exit(1)

    profiles = load_profiles_needing_enrichment()
    print(f"\nFound {len(profiles)} profiles needing enrichment")

    if not profiles:
        print("No profiles need enrichment.")
        return

    # Only process a bounded batch per run.
    batch = profiles[:BATCH_SIZE]
    print(f"Processing batch of {len(batch)} profiles\n")

    enriched = 0
    failed = 0
    for index, profile in enumerate(batch, 1):
        print(f"\n[{index}/{len(batch)}]")
        try:
            success = enrich_profile(profile)
        except Exception as e:
            print(f" ERROR: {e}")
            success = False
        if success:
            enriched += 1
        else:
            failed += 1

        # Rate-limit between API calls; no need to wait after the last one.
        if index < len(batch):
            time.sleep(DELAY_BETWEEN_REQUESTS)

    print("\n" + banner)
    print("Enrichment Complete")
    print(banner)
    print(f"Total processed: {len(batch)}")
    print(f"Successfully enriched: {enriched}")
    print(f"Failed/No data: {failed}")
    print(f"Remaining profiles: {len(profiles) - len(batch)}")
|
|
|
|
|
|
# Script entry point: run the batch enrichment workflow.
if __name__ == '__main__':
    main()
|