glam/scripts/fetch_missing_confirmed_profiles.py
2026-01-16 12:50:50 +01:00

357 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Fetch LinkedIn profiles for confirmed entity resolution matches that are missing entity files.
This script:
1. Identifies confirmed matches (review_decision == 'match') without entity files
2. Uses Exa API to fetch LinkedIn profile data
3. Creates entity files with proper structure including WCMS identifiers
Usage:
python scripts/fetch_missing_confirmed_profiles.py [--dry-run] [--limit N]
"""
import json
import os
import re
import sys
import time
import argparse
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, List
from urllib.parse import unquote
import httpx
from tqdm import tqdm
# Configuration
# Exa "contents" endpoint used to crawl LinkedIn profile pages.
EXA_API_URL = "https://api.exa.ai/contents"
# Output directory for generated person entity JSON files.
ENTITY_DIR = Path("data/custodian/person/entity")
# Entity-resolution review output; each candidate carries a 'review_decision'.
CANDIDATES_FILE = Path("data/entity_resolution/entity_resolution_candidates.json")
# Directory of WCMS person records, one "<ppid>.json" file per person.
WCMS_DIR = Path("data/person")
RATE_LIMIT_DELAY = 1.0 # seconds between requests
def get_exa_api_key() -> str:
    """Return the Exa API key from the EXA_API_KEY environment variable.

    Raises:
        ValueError: if the variable is unset (or empty).
    """
    api_key = os.environ.get("EXA_API_KEY")
    if api_key:
        return api_key
    raise ValueError("EXA_API_KEY environment variable not set")
# Single-pass transliteration table for common accented Latin letters.
_ACCENT_TRANSLATION = str.maketrans({
    'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
    'á': 'a', 'à': 'a', 'â': 'a', 'ä': 'a',
    'í': 'i', 'ì': 'i', 'î': 'i', 'ï': 'i',
    'ó': 'o', 'ò': 'o', 'ô': 'o', 'ö': 'o',
    'ú': 'u', 'ù': 'u', 'û': 'u', 'ü': 'u',
    'ñ': 'n', 'ç': 'c',
})


def normalize_slug(slug: str) -> str:
    """Normalize a LinkedIn slug for safe use in a filename.

    Percent-decodes the slug, strips invisible characters (zero-width
    space/joiners, BOM), and transliterates common accented Latin letters
    to ASCII.

    Args:
        slug: Raw LinkedIn profile slug, possibly percent-encoded.

    Returns:
        The normalized, ASCII-friendly slug.
    """
    # URL decode percent-encoded characters (UTF-8).
    slug = unquote(slug)
    # Remove problematic invisible characters.
    # NOTE(review): the original had two no-op replace('', '') calls here —
    # almost certainly zero-width characters lost in transit; stripping the
    # usual suspects restores the apparent intent.
    for invisible in ('\u200b', '\u200c', '\u200d', '\ufeff'):
        slug = slug.replace(invisible, '')
    # Replace accented characters with ASCII equivalents in one pass.
    return slug.translate(_ACCENT_TRANSLATION)
def get_missing_profiles() -> List[Dict]:
    """List confirmed entity-resolution matches that lack an entity file.

    Loads the candidates file, keeps candidates whose review_decision is
    'match', and returns descriptors for every match whose LinkedIn slug
    (raw or normalized) has no timestamped JSON file in ENTITY_DIR.
    """
    with open(CANDIDATES_FILE, 'r') as handle:
        payload = json.load(handle)
    # slug -> candidate for every confirmed match that has a slug
    # (on duplicate slugs the last candidate wins, as before).
    slug_to_candidate = {
        cand['linkedin_slug']: cand
        for cand in payload['candidates']
        if cand.get('review_decision') == 'match' and cand.get('linkedin_slug')
    }
    # Entity filenames look like "<slug>_20260116T125050Z.json";
    # strip everything from the "_202..." timestamp onward.
    existing_slugs = {
        path.stem.rsplit('_202', 1)[0]
        for path in ENTITY_DIR.glob('*.json')
        if '_202' in path.stem
    }
    missing = []
    for slug, candidate in slug_to_candidate.items():
        normalized = normalize_slug(slug)
        if slug in existing_slugs or normalized in existing_slugs:
            continue
        missing.append({
            'slug': slug,
            'normalized_slug': normalized,
            'wcms_ppid': candidate.get('wcms_ppid'),
            'wcms_name': candidate.get('wcms_name'),
            'wcms_email': candidate.get('wcms_email'),
            'linkedin_url': f'https://www.linkedin.com/in/{slug}',
        })
    return missing
def fetch_profile_exa(url: str, api_key: str, client: httpx.Client) -> Optional[Dict]:
    """Fetch a LinkedIn profile via the Exa contents API.

    Retries once after a 429 rate limit. (Previously the code slept 30 s
    on a 429 and then returned None anyway, so the wait was wasted.)

    Args:
        url: Full LinkedIn profile URL.
        api_key: Exa API bearer token.
        client: Shared httpx client to reuse connections.

    Returns:
        The first Exa result dict, or None on any failure or empty result.
    """
    for attempt in range(2):
        try:
            response = client.post(
                EXA_API_URL,
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "ids": [url],
                    "text": True,
                    "livecrawl": "preferred"
                },
                timeout=60.0
            )
            if response.status_code == 200:
                data = response.json()
                if data.get('results') and len(data['results']) > 0:
                    return data['results'][0]
                return None  # 200 but empty result set
            print(f"\nError fetching {url}: HTTP {response.status_code}")
            if response.status_code == 429 and attempt == 0:
                print("Rate limited - waiting 30 seconds...")
                time.sleep(30)
                continue  # retry once after backing off
            return None
        except Exception as e:
            # Best-effort fetch: log and give up on network/JSON errors.
            print(f"\nException fetching {url}: {e}")
            return None
    return None
def parse_profile_text(text: str) -> Dict:
    """Heuristically extract structured fields from raw profile page text.

    Args:
        text: Crawled page text (may be empty or None-ish).

    Returns:
        Dict with 'name', 'headline', 'location', 'about' (str or None) and
        'experience', 'education', 'skills', 'languages' (lists; currently
        always left empty by these heuristics).
    """
    parsed = {
        'name': None,
        'headline': None,
        'location': None,
        'about': None,
        'experience': [],
        'education': [],
        'skills': [],
        'languages': [],
    }
    if not text:
        return parsed
    lines = text.strip().split('\n')
    # Name: first short, non-URL, non-boilerplate line among the first five.
    boilerplate = ('linkedin', 'sign in', 'join')
    for raw in lines[:5]:
        candidate = raw.strip()
        if not candidate or candidate.startswith('http') or len(candidate) >= 100:
            continue
        if any(token in candidate.lower() for token in boilerplate):
            continue
        parsed['name'] = candidate
        break
    # Headline: the line right after the first line that mentions the name.
    if parsed['name']:
        for idx, raw in enumerate(lines):
            if parsed['name'] in raw:
                if idx + 1 < len(lines):
                    follower = lines[idx + 1].strip()
                    if follower and len(follower) < 200:
                        parsed['headline'] = follower
                break
    # Location: first short line mentioning a known Dutch place name.
    places = ('Netherlands', 'Amsterdam', 'Rotterdam', 'Utrecht', 'Den Haag', 'The Hague')
    for raw in lines:
        if any(place in raw for place in places) and len(raw) < 100:
            parsed['location'] = raw.strip()
            break
    return parsed
def get_wcms_identifiers(wcms_ppid: str) -> Optional[Dict]:
    """Load WCMS identifiers for a person from their WCMS JSON record.

    Looks for the identifier fields at the top level of the record first,
    then falls back to a nested 'wcms_identifiers' mapping.

    Args:
        wcms_ppid: WCMS person id; resolved to WCMS_DIR/<ppid>.json.

    Returns:
        Dict of the identifiers found, or None when the file is missing,
        unreadable, or contains none of the expected keys.
    """
    record_path = WCMS_DIR / f"{wcms_ppid}.json"
    if not record_path.exists():
        return None
    keys = ('user_id', 'username', 'username_url', 'abs_id', 'crm_id')
    try:
        with open(record_path, 'r') as handle:
            record = json.load(handle)
        # Top-level fields take precedence; only truthy values are kept.
        identifiers = {k: record[k] for k in keys if record.get(k)}
        if not identifiers:
            # Fall back to the nested structure some records use.
            nested = record.get('wcms_identifiers', {})
            identifiers = {k: nested[k] for k in keys if nested.get(k)}
        return identifiers or None
    except Exception as exc:
        # Best-effort load: log and treat the record as absent.
        print(f"Error loading WCMS file {wcms_ppid}: {exc}")
        return None
def create_entity_file(profile_info: Dict, exa_result: Optional[Dict], wcms_ids: Optional[Dict]) -> Dict:
    """Assemble the entity-file dict for one confirmed match.

    Args:
        profile_info: Descriptor from get_missing_profiles (slug, URLs,
            WCMS name/ppid).
        exa_result: Raw Exa result for the profile, or None if the fetch
            failed (a WCMS-only record is still produced).
        wcms_ids: WCMS identifiers dict, or None if unavailable.

    Returns:
        The entity dict, ready to be serialized to JSON.
    """
    timestamp_iso = datetime.now(timezone.utc).isoformat()
    # Profile skeleton; WCMS name is the fallback when Exa yields nothing.
    profile_data = {
        'name': profile_info.get('wcms_name'),
        'linkedin_url': profile_info['linkedin_url'],
        'headline': None,
        'location': None,
        'connections': None,
        'about': None,
        'experience': [],
        'education': [],
        'skills': [],
        'languages': [],
        'profile_image_url': None,
    }
    # Overlay whatever the crawl heuristics managed to parse out.
    if exa_result:
        parsed = parse_profile_text(exa_result.get('text', ''))
        for field in ('name', 'headline', 'location'):
            if parsed.get(field):
                profile_data[field] = parsed[field]
    entity = {
        'person_id': profile_info['normalized_slug'],
        'extraction_metadata': {
            'extraction_agent': 'fetch_missing_confirmed_profiles.py',
            'extraction_date': timestamp_iso,
            'extraction_source': 'Exa API crawl + entity resolution',
            'schema_version': '1.0.0',
            'notes': f'Created for confirmed entity resolution match. WCMS: {profile_info.get("wcms_ppid")}'
        },
        'profile_data': profile_data,
        'heritage_relevance': {
            'is_heritage_relevant': True,
            'heritage_types': ['O'],  # Default to Official/unknown
            'rationale': 'Confirmed match via entity resolution review'
        },
        'affiliations': [],
        'web_claims': [
            {
                'claim_type': 'linkedin_url',
                'claim_value': profile_info['linkedin_url'],
                'source_url': 'entity_resolution_review',
                'retrieved_on': timestamp_iso,
                'statement_created_at': timestamp_iso,
                'source_archived_at': timestamp_iso,
                'retrieval_agent': 'fetch_missing_confirmed_profiles.py'
            }
        ],
    }
    # WCMS identifiers are attached only when we actually found some.
    if wcms_ids:
        entity['wcms_identifiers'] = wcms_ids
    return entity
def main() -> None:
    """CLI entry point.

    Finds confirmed matches lacking entity files, fetches each profile
    through Exa, and writes a timestamped entity JSON file per profile.
    Supports --dry-run (list only) and --limit N (cap the batch size).
    """
    parser = argparse.ArgumentParser(description='Fetch missing LinkedIn profiles for confirmed matches')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be fetched without fetching')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of profiles to fetch')
    args = parser.parse_args()
    # Get missing profiles (confirmed matches without entity files).
    missing = get_missing_profiles()
    print(f"Found {len(missing)} confirmed matches without entity files")
    # NOTE(review): --limit 0 is falsy and is treated as "no limit".
    if args.limit:
        missing = missing[:args.limit]
        print(f"Limited to {len(missing)} profiles")
    if args.dry_run:
        print("\nDry run - profiles that would be fetched:")
        for p in missing:
            print(f" {p['slug']} -> {p['wcms_ppid']}")
        return
    # Ensure output directory exists
    ENTITY_DIR.mkdir(parents=True, exist_ok=True)
    # Get API key; abort early if it is not configured.
    try:
        api_key = get_exa_api_key()
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)
    # Fetch profiles, reusing one HTTP client for the whole batch.
    success_count = 0
    error_count = 0
    with httpx.Client() as client:
        for profile in tqdm(missing, desc="Fetching profiles"):
            slug = profile['normalized_slug']
            url = profile['linkedin_url']
            # Fetch from Exa (may return None on failure).
            exa_result = fetch_profile_exa(url, api_key, client)
            # Get WCMS identifiers, if a ppid is known.
            wcms_ids = None
            if profile.get('wcms_ppid'):
                wcms_ids = get_wcms_identifiers(profile['wcms_ppid'])
            # Create entity file. An entity is written even when the Exa
            # fetch failed (WCMS-only record) — intentional, see summary.
            entity = create_entity_file(profile, exa_result, wcms_ids)
            # Save file with a UTC timestamp suffix (matches the
            # "_202..." convention get_missing_profiles parses).
            timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
            filename = f"{slug}_{timestamp}.json"
            filepath = ENTITY_DIR / filename
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(entity, f, indent=2, ensure_ascii=False)
            if exa_result:
                success_count += 1
            else:
                error_count += 1
            # Rate limiting between Exa requests.
            time.sleep(RATE_LIMIT_DELAY)
    print(f"\nDone! Created {success_count + error_count} entity files")
    print(f" With Exa data: {success_count}")
    print(f" Without Exa data (WCMS only): {error_count}")
# Script entry guard: run only when executed directly, not on import.
if __name__ == '__main__':
    main()