357 lines
12 KiB
Python
357 lines
12 KiB
Python
#!/usr/bin/env python3
"""
Fetch LinkedIn profiles for confirmed entity resolution matches that are missing entity files.

This script:
1. Identifies confirmed matches (review_decision == 'match') without entity files
2. Uses Exa API to fetch LinkedIn profile data
3. Creates entity files with proper structure including WCMS identifiers

Usage:
    python scripts/fetch_missing_confirmed_profiles.py [--dry-run] [--limit N]
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
import argparse
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, List
|
|
from urllib.parse import unquote
|
|
|
|
import httpx
|
|
from tqdm import tqdm
|
|
|
|
|
|
# Configuration
# Exa "contents" endpoint: returns crawled page content for the given URLs.
EXA_API_URL = "https://api.exa.ai/contents"
# Output directory for the generated person entity files.
ENTITY_DIR = Path("data/custodian/person/entity")
# Entity-resolution review output (candidates carrying a review_decision).
CANDIDATES_FILE = Path("data/entity_resolution/entity_resolution_candidates.json")
# Directory of WCMS person records, one JSON file per ppid.
WCMS_DIR = Path("data/person")
RATE_LIMIT_DELAY = 1.0  # seconds between requests
|
|
|
|
|
|
def get_exa_api_key() -> str:
    """Return the Exa API key from the ``EXA_API_KEY`` environment variable.

    Raises:
        ValueError: if the variable is unset or empty.
    """
    api_key = os.environ.get("EXA_API_KEY")
    if api_key:
        return api_key
    raise ValueError("EXA_API_KEY environment variable not set")
|
|
|
|
|
|
def normalize_slug(slug: str) -> str:
    """Return *slug* decoded and folded to a filename-safe form.

    Percent-escapes are decoded, check marks are dropped, and a fixed set of
    accented Latin letters is mapped to ASCII equivalents.
    """
    decoded = unquote(slug)
    # Single-pass translation: None removes the character, strings replace it.
    table = str.maketrans({
        '✓': None, '✔': None,
        'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
        'á': 'a', 'à': 'a', 'â': 'a', 'ä': 'a',
        'í': 'i', 'ì': 'i', 'î': 'i', 'ï': 'i',
        'ó': 'o', 'ò': 'o', 'ô': 'o', 'ö': 'o',
        'ú': 'u', 'ù': 'u', 'û': 'u', 'ü': 'u',
        'ñ': 'n', 'ç': 'c',
    })
    return decoded.translate(table)
|
|
|
|
|
|
# Timestamp suffix this script appends to entity filenames (see main()).
_TIMESTAMP_SUFFIX = re.compile(r'_\d{8}T\d{6}Z$')


def get_missing_profiles() -> List[Dict]:
    """Return confirmed entity-resolution matches that have no entity file.

    Reads CANDIDATES_FILE, keeps candidates whose review_decision is 'match',
    and compares their LinkedIn slugs (raw and normalized) against the slugs
    of existing files in ENTITY_DIR.

    Returns:
        List of dicts with slug, normalized_slug, WCMS fields, and the
        reconstructed LinkedIn profile URL.
    """
    with open(CANDIDATES_FILE, 'r') as f:
        data = json.load(f)

    confirmed = [c for c in data['candidates'] if c.get('review_decision') == 'match']

    # Map each LinkedIn slug to its candidate record (slug-less entries skipped).
    slug_to_candidate = {
        c['linkedin_slug']: c for c in confirmed if c.get('linkedin_slug')
    }

    # Collect slugs that already have an entity file. Filenames look like
    # "<slug>_YYYYMMDDThhmmssZ.json"; strip the timestamp with an anchored
    # regex. (The previous "'_202' in name" heuristic misfired on slugs
    # containing "_202", would break in 2030, and skipped files without a
    # timestamp suffix entirely.)
    existing_slugs = set()
    for entity_file in ENTITY_DIR.glob('*.json'):
        existing_slugs.add(_TIMESTAMP_SUFFIX.sub('', entity_file.stem))

    # A candidate is missing when neither its raw nor normalized slug exists.
    missing = []
    for slug, candidate in slug_to_candidate.items():
        norm_slug = normalize_slug(slug)
        if slug in existing_slugs or norm_slug in existing_slugs:
            continue
        missing.append({
            'slug': slug,
            'normalized_slug': norm_slug,
            'wcms_ppid': candidate.get('wcms_ppid'),
            'wcms_name': candidate.get('wcms_name'),
            'wcms_email': candidate.get('wcms_email'),
            'linkedin_url': f'https://www.linkedin.com/in/{slug}',
        })

    return missing
|
|
|
|
|
|
def fetch_profile_exa(url: str, api_key: str, client: httpx.Client) -> Optional[Dict]:
    """Fetch LinkedIn profile content for *url* via the Exa contents API.

    Args:
        url: Full LinkedIn profile URL.
        api_key: Exa API bearer token.
        client: Shared httpx client (connection reuse across calls).

    Returns:
        The first Exa result dict, or None on error / empty response.
        On HTTP 429 the call backs off 30 seconds and retries once
        (the previous version slept but then gave up without retrying).
    """
    for attempt in range(2):
        try:
            response = client.post(
                EXA_API_URL,
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "ids": [url],
                    "text": True,
                    "livecrawl": "preferred"
                },
                timeout=60.0
            )
        except Exception as e:
            print(f"\nException fetching {url}: {e}")
            return None

        if response.status_code == 200:
            results = response.json().get('results') or []
            # 200 with an empty result list means nothing was crawled.
            return results[0] if results else None

        print(f"\nError fetching {url}: HTTP {response.status_code}")
        if response.status_code == 429 and attempt == 0:
            print("Rate limited - waiting 30 seconds...")
            time.sleep(30)
            continue  # single retry after backoff
        return None

    return None
|
|
|
|
|
|
def parse_profile_text(text: str) -> Dict:
    """Heuristically extract structured fields from raw crawled profile text.

    Returns a dict with 'name', 'headline', 'location', 'about', and empty
    'experience'/'education'/'skills'/'languages' lists; fields that cannot
    be identified stay None/empty.
    """
    result: Dict = {
        'name': None,
        'headline': None,
        'location': None,
        'about': None,
        'experience': [],
        'education': [],
        'skills': [],
        'languages': [],
    }

    if not text:
        return result

    rows = text.strip().split('\n')

    # Name: first short, non-URL, non-boilerplate line among the first five.
    for raw in rows[:5]:
        candidate = raw.strip()
        if not candidate or candidate.startswith('http') or len(candidate) >= 100:
            continue
        if any(marker in candidate.lower() for marker in ('linkedin', 'sign in', 'join')):
            continue
        result['name'] = candidate
        break

    # Headline: the line immediately following the one containing the name.
    if result['name']:
        name_idx = next(
            (i for i, row in enumerate(rows) if result['name'] in row), None
        )
        if name_idx is not None and name_idx + 1 < len(rows):
            headline = rows[name_idx + 1].strip()
            if headline and len(headline) < 200:
                result['headline'] = headline

    # Location: first short line mentioning a known Dutch place name.
    places = ('Netherlands', 'Amsterdam', 'Rotterdam', 'Utrecht', 'Den Haag', 'The Hague')
    for row in rows:
        if len(row) < 100 and any(place in row for place in places):
            result['location'] = row.strip()
            break

    return result
|
|
|
|
|
|
def get_wcms_identifiers(wcms_ppid: str) -> Optional[Dict]:
    """Load WCMS identifier fields from the person file for *wcms_ppid*.

    Top-level fields are preferred; a nested 'wcms_identifiers' mapping is
    consulted only when none are found. Returns None when the file is
    missing, unreadable, or contains no identifiers.
    """
    identifier_keys = ('user_id', 'username', 'username_url', 'abs_id', 'crm_id')

    wcms_file = WCMS_DIR / f"{wcms_ppid}.json"
    if not wcms_file.exists():
        return None

    try:
        with open(wcms_file, 'r') as fh:
            record = json.load(fh)

        # Prefer identifiers stored at the top level of the record.
        identifiers = {k: record[k] for k in identifier_keys if record.get(k)}

        # Fall back to the nested structure only when nothing was found above.
        if not identifiers:
            nested = record.get('wcms_identifiers', {})
            identifiers = {k: nested[k] for k in identifier_keys if nested.get(k)}

        return identifiers or None
    except Exception as e:
        print(f"Error loading WCMS file {wcms_ppid}: {e}")
        return None
|
|
|
|
|
|
def create_entity_file(profile_info: Dict, exa_result: Optional[Dict], wcms_ids: Optional[Dict]) -> Dict:
    """Assemble the entity-file dict for one confirmed match.

    Fields parsed from the Exa crawl text (name/headline/location) override
    the WCMS defaults when present. When *wcms_ids* is given it is attached
    under 'wcms_identifiers'.
    """
    slug = profile_info['normalized_slug']
    now = datetime.now(timezone.utc)

    # Start from WCMS-known values; Exa-derived fields fill in below.
    profile_data = {
        'name': profile_info.get('wcms_name'),
        'linkedin_url': profile_info['linkedin_url'],
        'headline': None,
        'location': None,
        'connections': None,
        'about': None,
        'experience': [],
        'education': [],
        'skills': [],
        'languages': [],
        'profile_image_url': None,
    }

    if exa_result:
        parsed = parse_profile_text(exa_result.get('text', ''))
        for field in ('name', 'headline', 'location'):
            if parsed.get(field):
                profile_data[field] = parsed[field]

    entity = {
        'person_id': slug,
        'extraction_metadata': {
            'extraction_agent': 'fetch_missing_confirmed_profiles.py',
            'extraction_date': now.isoformat(),
            'extraction_source': 'Exa API crawl + entity resolution',
            'schema_version': '1.0.0',
            'notes': f'Created for confirmed entity resolution match. WCMS: {profile_info.get("wcms_ppid")}'
        },
        'profile_data': profile_data,
        'heritage_relevance': {
            'is_heritage_relevant': True,
            'heritage_types': ['O'],  # Default to Official/unknown
            'rationale': 'Confirmed match via entity resolution review'
        },
        'affiliations': [],
        'web_claims': [
            {
                'claim_type': 'linkedin_url',
                'claim_value': profile_info['linkedin_url'],
                'source_url': 'entity_resolution_review',
                'retrieved_on': now.isoformat(),
                'statement_created_at': now.isoformat(),
                'source_archived_at': now.isoformat(),
                'retrieval_agent': 'fetch_missing_confirmed_profiles.py'
            }
        ],
    }

    # WCMS identifiers are optional; omit the key entirely when absent.
    if wcms_ids:
        entity['wcms_identifiers'] = wcms_ids

    return entity
|
|
|
|
|
|
def main():
    """CLI entry point: fetch missing profiles and write entity files.

    Exits with status 1 when EXA_API_KEY is not configured. With --dry-run,
    only lists the profiles that would be fetched.
    """
    parser = argparse.ArgumentParser(description='Fetch missing LinkedIn profiles for confirmed matches')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be fetched without fetching')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of profiles to fetch')
    args = parser.parse_args()

    # Get missing profiles
    missing = get_missing_profiles()
    print(f"Found {len(missing)} confirmed matches without entity files")

    # Compare against None so an explicit "--limit 0" is honored; the old
    # truthiness check treated 0 as "no limit" and fetched everything.
    if args.limit is not None:
        missing = missing[:args.limit]
        print(f"Limited to {len(missing)} profiles")

    if args.dry_run:
        print("\nDry run - profiles that would be fetched:")
        for p in missing:
            print(f" {p['slug']} -> {p['wcms_ppid']}")
        return

    # Ensure output directory exists
    ENTITY_DIR.mkdir(parents=True, exist_ok=True)

    # Fail fast if the API key is missing before doing any work.
    try:
        api_key = get_exa_api_key()
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)

    success_count = 0
    error_count = 0

    with httpx.Client() as client:
        for profile in tqdm(missing, desc="Fetching profiles"):
            slug = profile['normalized_slug']
            url = profile['linkedin_url']

            # Fetch profile text from Exa (may be None on error).
            exa_result = fetch_profile_exa(url, api_key, client)

            # Attach WCMS identifiers when the candidate has a ppid.
            wcms_ids = None
            if profile.get('wcms_ppid'):
                wcms_ids = get_wcms_identifiers(profile['wcms_ppid'])

            # An entity file is written even without Exa data (WCMS-only).
            entity = create_entity_file(profile, exa_result, wcms_ids)

            timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
            filepath = ENTITY_DIR / f"{slug}_{timestamp}.json"

            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(entity, f, indent=2, ensure_ascii=False)

            if exa_result:
                success_count += 1
            else:
                error_count += 1

            # Rate limiting between Exa requests.
            time.sleep(RATE_LIMIT_DELAY)

    print(f"\nDone! Created {success_count + error_count} entity files")
    print(f" With Exa data: {success_count}")
    print(f" Without Exa data (WCMS only): {error_count}")
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|