#!/usr/bin/env python3
"""
Fetch LinkedIn profiles for confirmed entity resolution matches that are
missing entity files.

This script:
1. Identifies confirmed matches (review_decision == 'match') without entity files
2. Uses Exa API to fetch LinkedIn profile data
3. Creates entity files with proper structure including WCMS identifiers

Usage:
    python scripts/fetch_missing_confirmed_profiles.py [--dry-run] [--limit N]
"""

import argparse
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import unquote

import httpx
from tqdm import tqdm

# Configuration
EXA_API_URL = "https://api.exa.ai/contents"
ENTITY_DIR = Path("data/custodian/person/entity")
CANDIDATES_FILE = Path("data/entity_resolution/entity_resolution_candidates.json")
WCMS_DIR = Path("data/person")
RATE_LIMIT_DELAY = 1.0  # seconds between requests

# Single-pass accent/diacritic folding table for LinkedIn slugs.
_SLUG_TRANSLATION = str.maketrans({
    'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
    'á': 'a', 'à': 'a', 'â': 'a', 'ä': 'a',
    'í': 'i', 'ì': 'i', 'î': 'i', 'ï': 'i',
    'ó': 'o', 'ò': 'o', 'ô': 'o', 'ö': 'o',
    'ú': 'u', 'ù': 'u', 'û': 'u', 'ü': 'u',
    'ñ': 'n', 'ç': 'c',
    # Checkmark characters sometimes appear in scraped slugs; drop them.
    '✓': None, '✔': None,
})


def get_exa_api_key() -> str:
    """Get Exa API key from environment.

    Raises:
        ValueError: if EXA_API_KEY is not set.
    """
    key = os.environ.get("EXA_API_KEY")
    if not key:
        raise ValueError("EXA_API_KEY environment variable not set")
    return key


def normalize_slug(slug: str) -> str:
    """Normalize LinkedIn slug for filename use.

    URL-decodes percent-encoded characters, strips checkmark glyphs, and
    folds common accented characters to ASCII equivalents.
    """
    return unquote(slug).translate(_SLUG_TRANSLATION)


def get_missing_profiles() -> List[Dict]:
    """Get confirmed matches that don't have entity files.

    Reads the candidates file, keeps entries with review_decision == 'match',
    and returns those whose LinkedIn slug (raw or normalized) has no existing
    entity file in ENTITY_DIR.
    """
    with open(CANDIDATES_FILE, 'r') as f:
        data = json.load(f)

    confirmed = [c for c in data['candidates'] if c.get('review_decision') == 'match']

    # Build slug -> candidate mapping
    slug_to_candidate = {}
    for c in confirmed:
        slug = c.get('linkedin_slug')
        if slug:
            slug_to_candidate[slug] = c

    # Find existing entity file slugs. Entity files are named
    # "<slug>_<timestamp>.json" where the timestamp starts with "202",
    # so splitting on '_202' recovers the slug portion.
    existing_slugs = set()
    for f in ENTITY_DIR.glob('*.json'):
        name = f.stem
        if '_202' in name:
            slug = name.rsplit('_202', 1)[0]
            existing_slugs.add(slug)

    # Find missing profiles: neither the raw nor the normalized slug
    # matches an existing entity file.
    missing = []
    for slug, candidate in slug_to_candidate.items():
        norm_slug = normalize_slug(slug)
        if slug not in existing_slugs and norm_slug not in existing_slugs:
            missing.append({
                'slug': slug,
                'normalized_slug': norm_slug,
                'wcms_ppid': candidate.get('wcms_ppid'),
                'wcms_name': candidate.get('wcms_name'),
                'wcms_email': candidate.get('wcms_email'),
                'linkedin_url': f'https://www.linkedin.com/in/{slug}',
            })
    return missing


def fetch_profile_exa(url: str, api_key: str, client: httpx.Client) -> Optional[Dict]:
    """Fetch LinkedIn profile using Exa contents API.

    Returns the first result dict from the Exa response, or None on any
    failure (HTTP error, empty result set, or exception).
    """
    try:
        response = client.post(
            EXA_API_URL,
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            json={
                "ids": [url],
                "text": True,
                "livecrawl": "preferred"
            },
            timeout=60.0
        )
        if response.status_code == 200:
            data = response.json()
            if data.get('results') and len(data['results']) > 0:
                return data['results'][0]
        else:
            print(f"\nError fetching {url}: HTTP {response.status_code}")
            if response.status_code == 429:
                # Back off on rate limiting. NOTE(review): this does not
                # retry the request — the profile is simply skipped.
                print("Rate limited - waiting 30 seconds...")
                time.sleep(30)
        return None
    except Exception as e:
        print(f"\nException fetching {url}: {e}")
        return None


def parse_profile_text(text: str) -> Dict:
    """Parse profile text to extract structured data.

    Heuristic best-effort parse of Exa's crawled page text: tries to pick
    out the name (first short non-URL line), the headline (line following
    the name), and a Dutch location line. Experience/education/skills/
    languages are returned empty — they are not extracted here.
    """
    profile = {
        'name': None,
        'headline': None,
        'location': None,
        'about': None,
        'experience': [],
        'education': [],
        'skills': [],
        'languages': [],
    }
    if not text:
        return profile

    lines = text.strip().split('\n')

    # Try to extract name from first non-empty line
    for line in lines[:5]:
        line = line.strip()
        if line and not line.startswith('http') and len(line) < 100:
            # Likely the name, unless it's site chrome text
            if not any(x in line.lower() for x in ['linkedin', 'sign in', 'join']):
                profile['name'] = line
                break

    # Look for headline (usually follows name)
    if profile['name']:
        name_idx = None
        for i, line in enumerate(lines):
            if profile['name'] in line:
                name_idx = i
                break
        if name_idx is not None and name_idx + 1 < len(lines):
            headline = lines[name_idx + 1].strip()
            if headline and len(headline) < 200:
                profile['headline'] = headline

    # Extract location if present (Dutch cities/country only)
    for line in lines:
        if any(loc in line for loc in ['Netherlands', 'Amsterdam', 'Rotterdam',
                                       'Utrecht', 'Den Haag', 'The Hague']):
            if len(line) < 100:
                profile['location'] = line.strip()
                break

    return profile


def get_wcms_identifiers(wcms_ppid: str) -> Optional[Dict]:
    """Load WCMS identifiers from WCMS person file.

    Looks for known identifier keys at the top level first, then falls back
    to a nested 'wcms_identifiers' object. Returns None if the file is
    missing, unreadable, or contains no identifiers.
    """
    wcms_file = WCMS_DIR / f"{wcms_ppid}.json"
    if not wcms_file.exists():
        return None
    try:
        with open(wcms_file, 'r') as f:
            data = json.load(f)

        # Extract relevant WCMS identifiers
        identifiers = {}
        if data.get('user_id'):
            identifiers['user_id'] = data['user_id']
        if data.get('username'):
            identifiers['username'] = data['username']
        if data.get('username_url'):
            identifiers['username_url'] = data['username_url']
        if data.get('abs_id'):
            identifiers['abs_id'] = data['abs_id']
        if data.get('crm_id'):
            identifiers['crm_id'] = data['crm_id']

        # Also check nested structure
        if not identifiers:
            for key in ['user_id', 'username', 'username_url', 'abs_id', 'crm_id']:
                if data.get('wcms_identifiers', {}).get(key):
                    identifiers[key] = data['wcms_identifiers'][key]

        return identifiers if identifiers else None
    except Exception as e:
        print(f"Error loading WCMS file {wcms_ppid}: {e}")
        return None


def create_entity_file(profile_info: Dict, exa_result: Optional[Dict],
                       wcms_ids: Optional[Dict]) -> Dict:
    """Create entity file structure.

    Args:
        profile_info: dict from get_missing_profiles() with slug/WCMS fields.
        exa_result: raw Exa result (may be None); its text is parsed for
            name/headline/location overrides.
        wcms_ids: identifiers from get_wcms_identifiers() (may be None).

    Returns:
        The entity dict ready to be serialized to JSON.
    """
    slug = profile_info['normalized_slug']
    now = datetime.now(timezone.utc)

    # Parse profile data from Exa result; WCMS name is the fallback.
    profile_data = {
        'name': profile_info.get('wcms_name'),
        'linkedin_url': profile_info['linkedin_url'],
        'headline': None,
        'location': None,
        'connections': None,
        'about': None,
        'experience': [],
        'education': [],
        'skills': [],
        'languages': [],
        'profile_image_url': None,
    }

    if exa_result:
        parsed = parse_profile_text(exa_result.get('text', ''))
        if parsed.get('name'):
            profile_data['name'] = parsed['name']
        if parsed.get('headline'):
            profile_data['headline'] = parsed['headline']
        if parsed.get('location'):
            profile_data['location'] = parsed['location']

    entity = {
        'person_id': slug,
        'extraction_metadata': {
            'extraction_agent': 'fetch_missing_confirmed_profiles.py',
            'extraction_date': now.isoformat(),
            'extraction_source': 'Exa API crawl + entity resolution',
            'schema_version': '1.0.0',
            'notes': f'Created for confirmed entity resolution match. WCMS: {profile_info.get("wcms_ppid")}'
        },
        'profile_data': profile_data,
        'heritage_relevance': {
            'is_heritage_relevant': True,
            'heritage_types': ['O'],  # Default to Official/unknown
            'rationale': 'Confirmed match via entity resolution review'
        },
        'affiliations': [],
        'web_claims': [
            {
                'claim_type': 'linkedin_url',
                'claim_value': profile_info['linkedin_url'],
                'source_url': 'entity_resolution_review',
                'retrieved_on': now.isoformat(),
                'statement_created_at': now.isoformat(),
                'source_archived_at': now.isoformat(),
                'retrieval_agent': 'fetch_missing_confirmed_profiles.py'
            }
        ],
    }

    # Add WCMS identifiers if available
    if wcms_ids:
        entity['wcms_identifiers'] = wcms_ids

    return entity


def main():
    parser = argparse.ArgumentParser(
        description='Fetch missing LinkedIn profiles for confirmed matches')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be fetched without fetching')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of profiles to fetch')
    args = parser.parse_args()

    # Get missing profiles
    missing = get_missing_profiles()
    print(f"Found {len(missing)} confirmed matches without entity files")

    # Use "is not None" so an explicit --limit 0 is honored.
    if args.limit is not None:
        missing = missing[:args.limit]
        print(f"Limited to {len(missing)} profiles")

    if args.dry_run:
        print("\nDry run - profiles that would be fetched:")
        for p in missing:
            print(f" {p['slug']} -> {p['wcms_ppid']}")
        return

    # Ensure output directory exists
    ENTITY_DIR.mkdir(parents=True, exist_ok=True)

    # Get API key
    try:
        api_key = get_exa_api_key()
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)

    # Fetch profiles
    success_count = 0
    error_count = 0
    with httpx.Client() as client:
        for profile in tqdm(missing, desc="Fetching profiles"):
            slug = profile['normalized_slug']
            url = profile['linkedin_url']

            # Fetch from Exa
            exa_result = fetch_profile_exa(url, api_key, client)

            # Get WCMS identifiers
            wcms_ids = None
            if profile.get('wcms_ppid'):
                wcms_ids = get_wcms_identifiers(profile['wcms_ppid'])

            # Create entity file
            entity = create_entity_file(profile, exa_result, wcms_ids)

            # Save file; an entity file is written even when the Exa
            # fetch failed (WCMS-only data), matching the summary below.
            timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
            filename = f"{slug}_{timestamp}.json"
            filepath = ENTITY_DIR / filename
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(entity, f, indent=2, ensure_ascii=False)

            if exa_result:
                success_count += 1
            else:
                error_count += 1

            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)

    print(f"\nDone! Created {success_count + error_count} entity files")
    print(f" With Exa data: {success_count}")
    print(f" Without Exa data (WCMS only): {error_count}")


if __name__ == '__main__':
    main()