# glam/scripts/reprocess_pico_structured.py
# Snapshot metadata (from source view): 2025-12-14 17:09:55 +01:00,
# 480 lines, 16 KiB, Python, executable file.
#!/usr/bin/env python3
"""
Re-process LinkedIn profiles that have exa_raw_response but are missing pico_structured.
This script:
1. Scans entity directory for profiles with exa_raw_response but no pico_structured
2. Calls GLM-4.6 with the LinkedIn-specific PiCo prompt to structure the data
3. Updates the profile files in-place with pico_structured data
Usage:
python scripts/reprocess_pico_structured.py [--dry-run] [--limit N] [--verbose]
"""
import json
import os
import sys
import re
import argparse
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()
import httpx
import yaml
from tqdm import tqdm
# Path to PiCo integration module with GLM system prompt
PICO_MODULE_PATH = Path("/Users/kempersc/apps/glam/data/entity_annotation/modules/integrations/pico.yaml")
ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")
# Cache for system prompt
_CACHED_LINKEDIN_PROMPT: Optional[str] = None
def load_linkedin_system_prompt() -> Optional[str]:
    """Read the LinkedIn-specific GLM system prompt from the PiCo YAML module.

    Returns the stripped prompt text, or None when the module file is
    missing, the key is absent, or any error occurs while reading/parsing.
    Problems are reported to stdout rather than raised.
    """
    try:
        if not PICO_MODULE_PATH.exists():
            print(f"Error: PiCo module not found at {PICO_MODULE_PATH}")
            return None
        with open(PICO_MODULE_PATH, 'r', encoding='utf-8') as fh:
            module_doc = yaml.safe_load(fh)
        # Prompt lives under glm_annotator_config.linkedin_system_prompt.
        prompt_text = module_doc.get('glm_annotator_config', {}).get('linkedin_system_prompt')
        if not prompt_text:
            print("Warning: linkedin_system_prompt not found in PiCo module")
            return None
        return prompt_text.strip()
    except Exception as exc:
        print(f"Error loading LinkedIn prompt: {exc}")
        return None
def get_linkedin_system_prompt() -> Optional[str]:
    """Return the LinkedIn system prompt, loading and memoizing it on first use.

    Note: a failed load yields None, which is not cached, so later calls
    will attempt the load again.
    """
    global _CACHED_LINKEDIN_PROMPT
    if _CACHED_LINKEDIN_PROMPT is not None:
        return _CACHED_LINKEDIN_PROMPT
    _CACHED_LINKEDIN_PROMPT = load_linkedin_system_prompt()
    return _CACHED_LINKEDIN_PROMPT
def _call_glm_api(user_message: str, system_prompt: str, zai_api_token: str, session: httpx.Client, timeout: float = 180.0, model: str = "glm-4.6") -> Tuple[Optional[str], Optional[str]]:
    """Make a single chat-completion call to the Z.AI GLM endpoint.

    Args:
        user_message: User-role content (the profile text to structure).
        system_prompt: System-role content (LinkedIn PiCo instructions).
        zai_api_token: Bearer token for the Z.AI API.
        session: Shared httpx client used for the POST.
        timeout: Per-request timeout in seconds.
        model: Model identifier to request.

    Returns:
        Tuple of (content_string, error_message); exactly one is non-None.
    """
    try:
        glm_response = session.post(
            "https://api.z.ai/api/coding/paas/v4/chat/completions",
            headers={
                "Authorization": f"Bearer {zai_api_token}",
                "Content-Type": "application/json",
            },
            json={
                "model": model,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_message}
                ],
                "temperature": 0.1  # low temperature for stable structured output
            },
            timeout=timeout
        )
        if glm_response.status_code != 200:
            return None, f"GLM API error: {glm_response.status_code} - {glm_response.text[:200]}"
        result = glm_response.json()
        # Fix: guard against an empty 'choices' list. The previous
        # result.get('choices', [{}])[0] only defaulted when the key was
        # *missing*; "choices": [] raised IndexError and surfaced as a
        # generic "Exception: ..." instead of the empty-response path.
        choices = result.get('choices') or [{}]
        glm_content = choices[0].get('message', {}).get('content', '')
        if not glm_content:
            return None, f"Empty GLM response (model={model})"
        return glm_content, None
    except httpx.TimeoutException:
        return None, f"GLM request timed out ({timeout}s)"
    except httpx.RequestError as e:
        return None, f"HTTP error: {e}"
    except Exception as e:
        return None, f"Exception: {e}"
def _parse_glm_json(glm_content: str) -> Tuple[Optional[Dict], Optional[str]]:
"""Parse JSON from GLM response content.
Returns:
Tuple of (parsed_dict, error_message)
"""
# First try: direct JSON parse
try:
structured = json.loads(glm_content)
return structured, None
except json.JSONDecodeError:
pass
# Second try: extract from markdown code block
json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', glm_content)
if json_match:
try:
structured = json.loads(json_match.group(1))
return structured, None
except json.JSONDecodeError:
pass
# Third try: find JSON object in text
json_match = re.search(r'\{[\s\S]*\}', glm_content)
if json_match:
try:
structured = json.loads(json_match.group())
return structured, None
except json.JSONDecodeError:
pass
return None, f"Could not parse GLM response as JSON (len={len(glm_content)})"
def structure_with_pico_annotator(
    exa_content: str,
    url: str,
    session: httpx.Client,
    verbose: bool = False,
    max_retries: int = 3,
    base_delay: float = 5.0
) -> Tuple[Optional[Dict], Optional[str]]:
    """Use GLM with LinkedIn-specific PiCo prompt to structure profile data.

    Implements exponential backoff retry logic to handle GLM API inconsistency:
    - Retry on timeouts and on unparsable (possibly truncated) responses
    - Skip straight to the next model on empty responses
    - Exponential backoff: base_delay, then doubling (5s, 10s with defaults)
    - Falls back to glm-4.5 if glm-4.6 returns empty

    Args:
        exa_content: Raw profile text from Exa (truncated to 12000 chars
            before sending).
        url: Source URL, embedded in the user message for context.
        session: Shared httpx client used for API calls.
        verbose: Print per-attempt progress when True.
        max_retries: Attempts per model before moving to the next model.
        base_delay: Seconds before the first retry; doubles each retry.

    Returns:
        Tuple of (structured_data, error_message)
        - (dict, None) on success
        - (None, error_string) on failure
    """
    import time  # local import; only needed for retry sleeps
    zai_api_token = os.environ.get('ZAI_API_TOKEN')
    if not zai_api_token:
        return None, "ZAI_API_TOKEN not set"
    system_prompt = get_linkedin_system_prompt()
    if not system_prompt:
        return None, "Could not load LinkedIn system prompt"
    user_message = f"""Source Type: modern_digital
Source URL: {url}
Content:
{exa_content[:12000]}"""
    if verbose:
        print(f" → Calling GLM with {len(exa_content)} chars of content...")
    last_error = None
    # Model fallback order: try glm-4.6 first, then glm-4.5
    models = ["glm-4.6", "glm-4.5"]
    for model in models:
        for attempt in range(max_retries):
            if attempt > 0:
                # Exponential backoff: base_delay * 2^(attempt-1) → 5s, 10s
                # with the default max_retries=3 (attempts 0, 1, 2).
                delay = base_delay * (2 ** (attempt - 1))
                if verbose:
                    print(f" → Retry {attempt}/{max_retries-1} with {model} after {delay}s delay (last error: {last_error})")
                time.sleep(delay)
            elif model != models[0] and verbose:
                print(f" → Falling back to {model}...")
            # Increase timeout on retries: 180s, 240s, 300s for attempts 0-2
            timeout = 180.0 + (attempt * 60)
            glm_content, error = _call_glm_api(user_message, system_prompt, zai_api_token, session, timeout=timeout, model=model)
            if error:
                last_error = error
                # If empty response, try next model instead of retrying same model
                if "Empty GLM response" in error:
                    break  # Exit retry loop, move to next model
                # Retry on timeouts
                if "timed out" in error:
                    continue
                # Don't retry on non-recoverable errors
                # NOTE(review): this substring match treats every 4xx as
                # fatal, including 429 (rate limit), which is arguably
                # retryable — confirm intended behavior.
                if "GLM API error: 4" in error:  # 4xx errors
                    return None, error
                continue
            if verbose:
                print(f"{model} returned {len(glm_content or '')} chars")
            if not glm_content:
                last_error = "Empty content after successful API call"
                break  # Try next model
            # Parse the response
            structured, parse_error = _parse_glm_json(glm_content)
            if parse_error:
                last_error = parse_error
                continue  # Retry on parse failures (might be truncated response)
            # Success!
            return structured, None
    # All retries exhausted for all models
    return None, f"Failed after trying all models. Last error: {last_error}"
def find_profiles_needing_pico(entity_dir: Path) -> List[Path]:
    """Scan *entity_dir* for profile JSON files that still need PiCo structuring.

    A file qualifies when it holds a non-empty 'exa_raw_response' whose first
    result carries text, and has no 'pico_structured' key yet. Files that
    cannot be read or parsed are reported to stdout and skipped.
    """
    def _needs_pico(profile: Dict) -> bool:
        # Must have raw Exa data but no structured output yet.
        if not profile.get('exa_raw_response') or profile.get('pico_structured'):
            return False
        hits = profile['exa_raw_response'].get('results', [])
        # The first result must actually contain text to process.
        return bool(hits) and bool(hits[0].get('text'))

    pending: List[Path] = []
    for candidate in entity_dir.glob("*.json"):
        try:
            with open(candidate, 'r', encoding='utf-8') as handle:
                if _needs_pico(json.load(handle)):
                    pending.append(candidate)
        except Exception as exc:
            print(f"Error reading {candidate}: {exc}")
    return pending
def process_profile(
    file_path: Path,
    session: httpx.Client,
    dry_run: bool = False,
    verbose: bool = False,
    max_retries: int = 3
) -> Tuple[bool, str]:
    """Structure one profile file with GLM and write 'pico_structured' back in-place.

    Returns:
        Tuple of (success, message) describing the outcome; all exceptions
        are caught and reported as a failure message.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            profile = json.load(handle)

        # Pull the raw profile text out of the stored Exa response.
        hits = profile.get('exa_raw_response', {}).get('results', [])
        if not hits:
            return False, "No results in exa_raw_response"
        exa_content = hits[0].get('text', '')
        if not exa_content or len(exa_content) < 50:
            return False, f"Insufficient content (len={len(exa_content)})"

        # Prefer the LinkedIn URL recorded at extraction time; fall back to
        # the URL on the Exa result itself.
        url = profile.get('extraction_metadata', {}).get('linkedin_url', '')
        if not url:
            url = hits[0].get('url', '')

        if dry_run:
            return True, f"Would process {len(exa_content)} chars from {url}"

        # Ask GLM (with retry/fallback logic) for the structured form.
        structured, error = structure_with_pico_annotator(
            exa_content, url, session,
            verbose=verbose,
            max_retries=max_retries
        )
        if error:
            return False, error
        if not structured:
            return False, "Empty structured result"

        profile['pico_structured'] = structured
        # Record that this file went through the reprocessing pass.
        if 'extraction_metadata' in profile:
            meta = profile['extraction_metadata']
            old_method = meta.get('extraction_method', '')
            if 'pico' not in old_method.lower():
                meta['extraction_method'] = old_method + '+pico_glm_reprocessed'
            meta['pico_reprocess_date'] = datetime.now(timezone.utc).isoformat()

        with open(file_path, 'w', encoding='utf-8') as handle:
            json.dump(profile, handle, indent=2, ensure_ascii=False)

        # Summarize how much heritage-relevant experience was extracted.
        persons = structured.get('persons', [])
        heritage_exp_count = len(persons[0].get('heritage_relevant_experience', [])) if persons else 0
        return True, f"Added pico_structured with {heritage_exp_count} heritage experiences"
    except Exception as e:
        return False, f"Exception: {e}"
def main():
    """CLI entry point: find profiles missing pico_structured and process them."""
    import time  # hoisted here; previously imported inside the processing loop
    parser = argparse.ArgumentParser(
        description="Re-process profiles that have exa_raw_response but no pico_structured"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be processed without making changes"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Maximum number of profiles to process"
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed progress"
    )
    parser.add_argument(
        "--file",
        type=str,
        default=None,
        help="Process a specific file (full path or just the filename)"
    )
    parser.add_argument(
        "--max-retries",
        type=int,
        default=3,
        help="Maximum number of retries per profile on GLM failures (default: 3)"
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=3.0,
        help="Delay between processing profiles in seconds (default: 3.0)"
    )
    args = parser.parse_args()

    # Check for required API key (a dry run makes no API calls)
    if not args.dry_run and not os.environ.get('ZAI_API_TOKEN'):
        print("Error: ZAI_API_TOKEN environment variable not set")
        sys.exit(1)

    # Fail fast if the LinkedIn prompt cannot be loaded
    prompt = get_linkedin_system_prompt()
    if not prompt:
        print("Error: Could not load LinkedIn system prompt from pico.yaml")
        sys.exit(1)
    print(f"✓ Loaded LinkedIn system prompt ({len(prompt)} chars)")
    print(f"✓ Using max {args.max_retries} retries per profile with exponential backoff")

    # Find or specify files to process
    if args.file:
        # Process a specific file; bare filenames resolve under ENTITY_DIR
        file_path = Path(args.file)
        if not file_path.is_absolute():
            file_path = ENTITY_DIR / args.file
        if not file_path.exists():
            print(f"Error: File not found: {file_path}")
            sys.exit(1)
        profiles_to_process = [file_path]
    else:
        # Find all profiles needing processing
        print(f"\nScanning {ENTITY_DIR} for profiles needing PiCo annotation...")
        profiles_to_process = find_profiles_needing_pico(ENTITY_DIR)
        print(f"Found {len(profiles_to_process)} profiles to process")

    if not profiles_to_process:
        print("No profiles need processing!")
        return

    # Apply limit
    if args.limit:
        profiles_to_process = profiles_to_process[:args.limit]
        print(f"Processing first {len(profiles_to_process)} profiles (--limit {args.limit})")

    if args.dry_run:
        print("\n[DRY RUN - no changes will be made]\n")

    # Process profiles
    success_count = 0
    failed_count = 0
    results = []
    with httpx.Client(timeout=300.0) as session:
        for i, file_path in enumerate(tqdm(profiles_to_process, desc="Processing profiles")):
            success, message = process_profile(
                file_path,
                session,
                dry_run=args.dry_run,
                verbose=args.verbose,
                max_retries=args.max_retries
            )
            filename = file_path.name
            # Fix: log the actual filename with a ✓/✗ status marker. The
            # previous code recorded a literal "(unknown)" placeholder (the
            # computed filename was never used) and the summary filters
            # below used startswith("") — always True — so failures and
            # successes could not be told apart.
            marker = "✓" if success else "✗"
            if success:
                success_count += 1
            else:
                failed_count += 1
            results.append(f"{marker} {filename}: {message}")
            if args.verbose:
                print(f"{marker} {filename}: {message}")
            # Rate limiting between profiles (not on dry run or last profile)
            if not args.dry_run and i < len(profiles_to_process) - 1:
                time.sleep(args.delay)

    # Print summary
    print("\n" + "="*60)
    print("RESULTS")
    print("="*60)
    print(f"Successful: {success_count}")
    print(f"Failed: {failed_count}")

    # Show failed profiles
    if failed_count > 0:
        print(f"\nFailed profiles:")
        for result in results:
            if result.startswith("✗"):
                print(f"  {result}")

    # Show sample of successful profiles
    if success_count > 0 and not args.dry_run:
        print(f"\nSuccessfully processed profiles (showing first 5):")
        for result in [r for r in results if r.startswith("✓")][:5]:
            print(f"  {result}")
if __name__ == "__main__":
main()