#!/usr/bin/env python3
"""
Re-process LinkedIn profiles that have exa_raw_response but are missing pico_structured.

This script:
1. Scans entity directory for profiles with exa_raw_response but no pico_structured
2. Calls GLM-4.6 with the LinkedIn-specific PiCo prompt to structure the data
3. Updates the profile files in-place with pico_structured data

Usage:
    python scripts/reprocess_pico_structured.py [--dry-run] [--limit N] [--verbose]
"""
|
import json
import os
import sys
import re
import argparse
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple

# Load environment variables from .env file
# (must run before ZAI_API_TOKEN is read later in main()/structure_with_pico_annotator()).
from dotenv import load_dotenv
load_dotenv()

import httpx
import yaml
from tqdm import tqdm


# Path to PiCo integration module with GLM system prompt
# (linkedin_system_prompt lives under glm_annotator_config in this YAML).
PICO_MODULE_PATH = Path("/Users/kempersc/apps/glam/data/entity_annotation/modules/integrations/pico.yaml")
# Directory of per-person entity JSON files scanned for reprocessing.
ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

# Cache for system prompt — populated lazily by get_linkedin_system_prompt().
_CACHED_LINKEDIN_PROMPT: Optional[str] = None
|
def load_linkedin_system_prompt() -> Optional[str]:
    """Read the LinkedIn-specific system prompt out of the PiCo YAML module.

    Returns:
        The stripped prompt string, or None when the module file is missing,
        the key is absent, or any error occurs while reading/parsing.
    """
    try:
        if not PICO_MODULE_PATH.exists():
            print(f"Error: PiCo module not found at {PICO_MODULE_PATH}")
            return None

        with open(PICO_MODULE_PATH, 'r', encoding='utf-8') as fh:
            module_config = yaml.safe_load(fh)

        prompt = module_config.get('glm_annotator_config', {}).get('linkedin_system_prompt')
        if not prompt:
            print("Warning: linkedin_system_prompt not found in PiCo module")
            return None

        return prompt.strip()
    except Exception as e:
        print(f"Error loading LinkedIn prompt: {e}")
        return None
|
def get_linkedin_system_prompt() -> Optional[str]:
    """Return the LinkedIn-specific system prompt, loading and caching it on first use."""
    global _CACHED_LINKEDIN_PROMPT
    if _CACHED_LINKEDIN_PROMPT is not None:
        return _CACHED_LINKEDIN_PROMPT
    _CACHED_LINKEDIN_PROMPT = load_linkedin_system_prompt()
    return _CACHED_LINKEDIN_PROMPT
|
def _call_glm_api(user_message: str, system_prompt: str, zai_api_token: str, session: httpx.Client, timeout: float = 180.0, model: str = "glm-4.6") -> Tuple[Optional[str], Optional[str]]:
    """Issue a single chat-completion request to the GLM endpoint.

    Returns:
        Tuple of (content_string, error_message) — exactly one side is non-None.
    """
    request_headers = {
        "Authorization": f"Bearer {zai_api_token}",
        "Content-Type": "application/json",
    }
    request_payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ],
        # Low temperature: we want deterministic, structured output.
        "temperature": 0.1
    }

    try:
        response = session.post(
            "https://api.z.ai/api/coding/paas/v4/chat/completions",
            headers=request_headers,
            json=request_payload,
            timeout=timeout
        )

        if response.status_code != 200:
            return None, f"GLM API error: {response.status_code} - {response.text[:200]}"

        body = response.json()
        content = body.get('choices', [{}])[0].get('message', {}).get('content', '')

        if content:
            return content, None
        return None, f"Empty GLM response (model={model})"

    except httpx.TimeoutException:
        return None, f"GLM request timed out ({timeout}s)"
    except httpx.RequestError as e:
        return None, f"HTTP error: {e}"
    except Exception as e:
        return None, f"Exception: {e}"
|
def _parse_glm_json(glm_content: str) -> Tuple[Optional[Dict], Optional[str]]:
|
|
"""Parse JSON from GLM response content.
|
|
|
|
Returns:
|
|
Tuple of (parsed_dict, error_message)
|
|
"""
|
|
# First try: direct JSON parse
|
|
try:
|
|
structured = json.loads(glm_content)
|
|
return structured, None
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
# Second try: extract from markdown code block
|
|
json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', glm_content)
|
|
if json_match:
|
|
try:
|
|
structured = json.loads(json_match.group(1))
|
|
return structured, None
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
# Third try: find JSON object in text
|
|
json_match = re.search(r'\{[\s\S]*\}', glm_content)
|
|
if json_match:
|
|
try:
|
|
structured = json.loads(json_match.group())
|
|
return structured, None
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
return None, f"Could not parse GLM response as JSON (len={len(glm_content)})"
|
|
|
|
|
|
def structure_with_pico_annotator(
    exa_content: str,
    url: str,
    session: httpx.Client,
    verbose: bool = False,
    max_retries: int = 3,
    base_delay: float = 5.0
) -> Tuple[Optional[Dict], Optional[str]]:
    """Use GLM with LinkedIn-specific PiCo prompt to structure profile data.

    Implements exponential backoff retry logic to handle GLM API inconsistency:
    - Retry on empty responses
    - Retry on timeouts
    - Exponential backoff: 5s, 10s, 20s delays between retries
    - Falls back to glm-4.5 if glm-4.6 returns empty

    Args:
        exa_content: Raw profile text from Exa (only the first 12,000 chars are sent).
        url: Source URL embedded in the user message for context.
        session: Shared httpx client used for the API calls.
        verbose: When True, print per-attempt progress to stdout.
        max_retries: Attempts per model before moving to the next model.
        base_delay: Seed (seconds) for the exponential backoff delay.

    Returns:
        Tuple of (structured_data, error_message)
        - (dict, None) on success
        - (None, error_string) on failure
    """
    import time

    zai_api_token = os.environ.get('ZAI_API_TOKEN')
    if not zai_api_token:
        return None, "ZAI_API_TOKEN not set"

    system_prompt = get_linkedin_system_prompt()
    if not system_prompt:
        return None, "Could not load LinkedIn system prompt"

    # Content is truncated to 12,000 chars to bound the prompt size.
    user_message = f"""Source Type: modern_digital
Source URL: {url}

Content:
{exa_content[:12000]}"""

    if verbose:
        print(f" → Calling GLM with {len(exa_content)} chars of content...")

    last_error = None

    # Model fallback order: try glm-4.6 first, then glm-4.5
    models = ["glm-4.6", "glm-4.5"]

    for model in models:
        for attempt in range(max_retries):
            if attempt > 0:
                delay = base_delay * (2 ** (attempt - 1))  # Exponential backoff: 5s, 10s, 20s
                if verbose:
                    print(f" → Retry {attempt}/{max_retries-1} with {model} after {delay}s delay (last error: {last_error})")
                time.sleep(delay)
            elif model != models[0] and verbose:
                print(f" → Falling back to {model}...")

            # Increase timeout on retries
            timeout = 180.0 + (attempt * 60)  # 180s, 240s, 300s

            glm_content, error = _call_glm_api(user_message, system_prompt, zai_api_token, session, timeout=timeout, model=model)

            if error:
                last_error = error
                # If empty response, try next model instead of retrying same model
                if "Empty GLM response" in error:
                    break  # Exit retry loop, move to next model
                # Retry on timeouts
                if "timed out" in error:
                    continue
                # Don't retry on non-recoverable errors
                if "GLM API error: 4" in error:  # 4xx errors
                    return None, error
                # Any other error (5xx, transport): retry with backoff.
                continue

            if verbose:
                print(f" → {model} returned {len(glm_content or '')} chars")

            if not glm_content:
                last_error = "Empty content after successful API call"
                break  # Try next model

            # Parse the response
            structured, parse_error = _parse_glm_json(glm_content)

            if parse_error:
                last_error = parse_error
                continue  # Retry on parse failures (might be truncated response)

            # Success!
            return structured, None

    # All retries exhausted for all models
    return None, f"Failed after trying all models. Last error: {last_error}"
|
def find_profiles_needing_pico(entity_dir: Path) -> List[Path]:
    """Return entity JSON files that hold raw Exa data but no PiCo annotation.

    A file qualifies when it has a non-empty exa_raw_response whose first
    result contains text, and no pico_structured key yet. Files that cannot
    be read or parsed are reported and skipped.
    """
    pending: List[Path] = []

    for candidate in entity_dir.glob("*.json"):
        try:
            with open(candidate, 'r', encoding='utf-8') as fh:
                profile = json.load(fh)

            # Skip files without raw Exa data or that are already annotated.
            if not profile.get('exa_raw_response') or profile.get('pico_structured'):
                continue

            # Require actual text content in the first Exa result.
            exa_results = profile['exa_raw_response'].get('results', [])
            if exa_results and exa_results[0].get('text'):
                pending.append(candidate)

        except Exception as e:
            print(f"Error reading {candidate}: {e}")

    return pending
|
def process_profile(
|
|
file_path: Path,
|
|
session: httpx.Client,
|
|
dry_run: bool = False,
|
|
verbose: bool = False,
|
|
max_retries: int = 3
|
|
) -> Tuple[bool, str]:
|
|
"""Process a single profile file and add pico_structured.
|
|
|
|
Returns:
|
|
Tuple of (success, message)
|
|
"""
|
|
try:
|
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
data = json.load(f)
|
|
|
|
# Extract content from Exa response
|
|
results = data.get('exa_raw_response', {}).get('results', [])
|
|
if not results:
|
|
return False, "No results in exa_raw_response"
|
|
|
|
exa_content = results[0].get('text', '')
|
|
if not exa_content or len(exa_content) < 50:
|
|
return False, f"Insufficient content (len={len(exa_content)})"
|
|
|
|
# Get URL
|
|
url = data.get('extraction_metadata', {}).get('linkedin_url', '')
|
|
if not url:
|
|
url = results[0].get('url', '')
|
|
|
|
if dry_run:
|
|
return True, f"Would process {len(exa_content)} chars from {url}"
|
|
|
|
# Call GLM to structure with retry logic
|
|
structured, error = structure_with_pico_annotator(
|
|
exa_content, url, session,
|
|
verbose=verbose,
|
|
max_retries=max_retries
|
|
)
|
|
|
|
if error:
|
|
return False, error
|
|
|
|
if not structured:
|
|
return False, "Empty structured result"
|
|
|
|
# Update the file
|
|
data['pico_structured'] = structured
|
|
|
|
# Update extraction method
|
|
if 'extraction_metadata' in data:
|
|
old_method = data['extraction_metadata'].get('extraction_method', '')
|
|
if 'pico' not in old_method.lower():
|
|
data['extraction_metadata']['extraction_method'] = old_method + '+pico_glm_reprocessed'
|
|
data['extraction_metadata']['pico_reprocess_date'] = datetime.now(timezone.utc).isoformat()
|
|
|
|
# Write back
|
|
with open(file_path, 'w', encoding='utf-8') as f:
|
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
|
|
|
# Calculate heritage experience count
|
|
persons = structured.get('persons', [])
|
|
heritage_exp_count = 0
|
|
if persons:
|
|
heritage_exp_count = len(persons[0].get('heritage_relevant_experience', []))
|
|
|
|
return True, f"Added pico_structured with {heritage_exp_count} heritage experiences"
|
|
|
|
except Exception as e:
|
|
return False, f"Exception: {e}"
|
|
|
|
|
|
def main():
    """CLI entry point: find profiles missing pico_structured and fill them in.

    Parses arguments, validates credentials and the system prompt, selects the
    profiles to process (a single --file or a directory scan), runs each one
    through process_profile with rate limiting, and prints a summary.
    """
    parser = argparse.ArgumentParser(
        description="Re-process profiles that have exa_raw_response but no pico_structured"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be processed without making changes"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Maximum number of profiles to process"
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed progress"
    )
    parser.add_argument(
        "--file",
        type=str,
        default=None,
        help="Process a specific file (full path or just the filename)"
    )
    parser.add_argument(
        "--max-retries",
        type=int,
        default=3,
        help="Maximum number of retries per profile on GLM failures (default: 3)"
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=3.0,
        help="Delay between processing profiles in seconds (default: 3.0)"
    )
    args = parser.parse_args()

    # Fail fast on missing credentials (not needed for a dry run).
    if not args.dry_run and not os.environ.get('ZAI_API_TOKEN'):
        print("Error: ZAI_API_TOKEN environment variable not set")
        sys.exit(1)

    # Fail fast if the system prompt cannot be loaded (also warms the cache).
    prompt = get_linkedin_system_prompt()
    if not prompt:
        print("Error: Could not load LinkedIn system prompt from pico.yaml")
        sys.exit(1)

    print(f"✓ Loaded LinkedIn system prompt ({len(prompt)} chars)")
    print(f"✓ Using max {args.max_retries} retries per profile with exponential backoff")

    # Find or specify files to process
    if args.file:
        # Process a specific file; bare filenames resolve against ENTITY_DIR.
        file_path = Path(args.file)
        if not file_path.is_absolute():
            file_path = ENTITY_DIR / args.file
        if not file_path.exists():
            print(f"Error: File not found: {file_path}")
            sys.exit(1)
        profiles_to_process = [file_path]
    else:
        # Find all profiles needing processing
        print(f"\nScanning {ENTITY_DIR} for profiles needing PiCo annotation...")
        profiles_to_process = find_profiles_needing_pico(ENTITY_DIR)
        print(f"Found {len(profiles_to_process)} profiles to process")

    if not profiles_to_process:
        print("No profiles need processing!")
        return

    # Apply limit
    if args.limit:
        profiles_to_process = profiles_to_process[:args.limit]
        print(f"Processing first {len(profiles_to_process)} profiles (--limit {args.limit})")

    if args.dry_run:
        print("\n[DRY RUN - no changes will be made]\n")

    # Hoisted out of the loop — was previously re-imported on every iteration.
    import time

    # Process profiles
    success_count = 0
    failed_count = 0
    results = []

    with httpx.Client(timeout=300.0) as session:
        for i, file_path in enumerate(tqdm(profiles_to_process, desc="Processing profiles")):
            success, message = process_profile(
                file_path,
                session,
                dry_run=args.dry_run,
                verbose=args.verbose,
                max_retries=args.max_retries
            )

            # BUG FIX: messages previously printed a literal "(unknown)"
            # placeholder; use the actual filename so results are traceable.
            filename = file_path.name
            if success:
                success_count += 1
                results.append(f"✓ {filename}: {message}")
                if args.verbose:
                    print(f" ✓ {filename}: {message}")
            else:
                failed_count += 1
                results.append(f"✗ {filename}: {message}")
                if args.verbose:
                    print(f" ✗ {filename}: {message}")

            # Rate limiting between profiles (not on dry run or last profile)
            if not args.dry_run and i < len(profiles_to_process) - 1:
                time.sleep(args.delay)

    # Print summary
    print("\n" + "="*60)
    print("RESULTS")
    print("="*60)
    print(f"Successful: {success_count}")
    print(f"Failed: {failed_count}")

    # Show failed profiles
    if failed_count > 0:
        print(f"\nFailed profiles:")
        for result in results:
            if result.startswith("✗"):
                print(f"  {result}")

    # Show sample of successful profiles
    if success_count > 0 and not args.dry_run:
        print(f"\nSuccessfully processed profiles (showing first 5):")
        for result in [r for r in results if r.startswith("✓")][:5]:
            print(f"  {result}")
|
# Run only when executed as a script; importing this module has no side effects
# beyond load_dotenv() at import time.
if __name__ == "__main__":
    main()