#!/usr/bin/env python3
"""
Convert raw Exa LinkedIn profile dumps to properly structured JSON format.

Rule 20 (AGENTS.md): Person entity profiles MUST be stored in proper
structured format, not as raw content dumps.

Proper format includes:
- extraction_metadata: source, method, timestamps, costs
- profile_data: parsed name, headline, location, experience[], education[], skills[]
"""
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Default location of person entity profiles; main() accepts an override
# via the first command-line argument.
DEFAULT_ENTITY_DIR = Path('/Users/kempersc/apps/glam/data/custodian/person/entity')


def parse_experience_section(content: str) -> list:
    """Parse experience entries from raw LinkedIn markdown content.

    Entries are expected as '### <title> at [<company>](<url>)' headers
    followed by free-form detail lines (duration, location, company blurb)
    running until the next header.

    Args:
        content: Raw markdown dump of the profile.

    Returns:
        List of dicts with keys 'title' and 'company', plus 'duration',
        'location', and 'company_details' when those details are found.
    """
    experiences = []
    # '[^#]+?' keeps the lazy details group from swallowing the next header;
    # the lookahead stops at a level-2/3 header or end of string.
    exp_pattern = r'### ([^\n]+) at \[([^\]]+)\](?:\([^\)]+\))?\n([^#]+?)(?=###|\Z|## )'
    for title, company, details in re.findall(exp_pattern, content, re.DOTALL):
        exp = {
            "title": title.strip(),
            "company": company.strip(),
        }
        # Duration line, e.g. 'Jan 2020 - Present • Full-time'
        duration_match = re.search(
            r'(\w+ \d{4}) - (Present|\w+ \d{4})(?: • (.+?))?(?:\n|$)', details)
        if duration_match:
            exp["duration"] = f"{duration_match.group(1)} - {duration_match.group(2)}"
            if duration_match.group(3):
                exp["duration"] += f" • {duration_match.group(3)}"
        # Location on its own line, e.g. 'City, Region'
        location_match = re.search(r'\n([^•\n]+, [^•\n]+)\n', details)
        if location_match:
            exp["location"] = location_match.group(1).strip()
        # Optional 'Company: ...' blurb
        company_match = re.search(r'Company: (.+?)(?:\n|$)', details)
        if company_match:
            exp["company_details"] = company_match.group(1).strip()
        experiences.append(exp)
    return experiences


def parse_education_section(content: str) -> list:
    """Parse the '## Education' section from raw LinkedIn markdown content.

    Args:
        content: Raw markdown dump of the profile.

    Returns:
        List of dicts with keys 'degree' and 'school', plus 'years' when a
        'YYYY - YYYY' range is found.
    """
    education = []
    # BUG FIX: the previous section terminator '(?=## |\Z)' also matched
    # *inside* the '### ' entry headers (offset 1 yields '## '), truncating
    # the captured section to a single '#' so no entries were ever parsed.
    # Anchoring on '\n## ' (as the About regex already does) cannot match
    # within a '\n###' level-3 header.
    edu_section_match = re.search(
        r'## Education\n(.+?)(?=\n## |\Z)', content, re.DOTALL)
    if edu_section_match:
        edu_content = edu_section_match.group(1)
        edu_pattern = r'### ([^\n]+) at ([^\n]+)\n([^#]+?)(?=###|\Z|## )'
        for degree, school, details in re.findall(edu_pattern, edu_content, re.DOTALL):
            edu = {
                # rstrip('.') drops a trailing period some dumps append to the degree
                "degree": degree.strip().rstrip('.'),
                "school": school.strip(),
            }
            years_match = re.search(r'(\d{4}) - (\d{4})', details)
            if years_match:
                edu["years"] = f"{years_match.group(1)} - {years_match.group(2)}"
            education.append(edu)
    return education


def parse_raw_profile(raw_json: dict, source_file: Optional[str] = None) -> dict:
    """Convert a raw Exa profile dump to the proper structured format.

    Args:
        raw_json: Parsed JSON of the raw dump; expects a 'profile_data' dict
            with 'raw_content'/'summary'/'title'/'url', and optionally
            'exa_search_metadata' and 'linkedin_profile_url'.
        source_file: Path of the originating file, recorded for provenance.

    Returns:
        Dict with 'extraction_metadata' and 'profile_data' keys (Rule 20).
    """
    profile = raw_json.get('profile_data', {})
    raw_content = profile.get('raw_content', '')
    summary = profile.get('summary', '')
    title = profile.get('title', '')
    url = profile.get('url', '') or raw_json.get('linkedin_profile_url', '')

    # Name: prefer the title text before '|'; fall back to a leading '# ' H1.
    name = ''
    if title:
        name_match = re.match(r'^([^|]+)', title)
        if name_match:
            name = name_match.group(1).strip()
    if not name and raw_content:
        name_match = re.match(r'^# (.+?)$', raw_content, re.MULTILINE)
        if name_match:
            name = name_match.group(1).strip()

    # Headline: title text after '|', else the second content line.
    headline = ''
    if title and '|' in title:
        headline = title.split('|', 1)[1].strip()
    elif raw_content:
        lines = raw_content.split('\n')
        if len(lines) > 1:
            headline = lines[1].strip()

    # Location: a 'City, Region, Country' style line.
    location = ''
    location_match = re.search(r'\n([^,\n]+, [^,\n]+, [^\n(]+)\n', raw_content)
    if location_match:
        location = location_match.group(1).strip()

    # Connections / followers summary, e.g. '500 connections • 600 followers'.
    connections = ''
    conn_match = re.search(r'(\d+ connections[^•\n]*(?:• \d+ followers)?)', raw_content)
    if conn_match:
        connections = conn_match.group(1).strip()

    # About section, terminated by the next level-2 header or the
    # 'Total Experience:' summary line.
    about = ''
    about_match = re.search(
        r'## About\n(.+?)(?=\n## |\nTotal Experience:|\Z)', raw_content, re.DOTALL)
    if about_match:
        about = about_match.group(1).strip()

    experience = parse_experience_section(raw_content)
    education = parse_education_section(raw_content)

    exa_meta = raw_json.get('exa_search_metadata', {})
    return {
        "extraction_metadata": {
            "source_file": source_file or "unknown",
            "staff_id": None,
            "extraction_date": exa_meta.get('timestamp', datetime.now(timezone.utc).isoformat()),
            "extraction_method": "exa_contents",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": url,
            "cost_usd": exa_meta.get('cost', 0),
            "request_id": exa_meta.get('request_id', 'unknown')
        },
        "profile_data": {
            "name": name,
            "linkedin_url": url,
            "headline": headline,
            "location": location,
            "connections": connections,
            "about": about,
            "summary": summary,
            "experience": experience,
            "education": education
        }
    }


def format_profile_file(filepath: Path) -> bool:
    """Format a single profile file in place.

    Args:
        filepath: Path to a JSON profile file.

    Returns:
        True if the file was rewritten into structured form; False if it is
        already formatted or not a recognizable raw dump.

    Raises:
        OSError, json.JSONDecodeError, etc.: propagated to the caller.
        (BUG FIX: previously every exception was swallowed here and reported
        as a skip, so main()'s error counter could never increment.)
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Already structured? 'extraction_agent' is only written by this
    # formatter, so its presence marks a processed file. The isinstance
    # guard avoids a TypeError on a malformed non-dict metadata value.
    meta = data.get('extraction_metadata') if isinstance(data, dict) else None
    if isinstance(meta, dict) and 'extraction_agent' in meta:
        return False  # Already formatted

    # Raw Exa dump with actual content → convert and rewrite in place.
    if isinstance(data, dict) and 'exa_search_metadata' in data and 'profile_data' in data:
        raw_content = data.get('profile_data', {}).get('raw_content', '')
        if raw_content:
            formatted = parse_raw_profile(data, str(filepath))
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(formatted, f, indent=2, ensure_ascii=False)
            return True
    return False


def main() -> None:
    """Format all raw profile files in the entity directory.

    The directory defaults to DEFAULT_ENTITY_DIR and may be overridden by
    passing a path as the first command-line argument.
    """
    entity_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_ENTITY_DIR
    formatted_count = 0
    skipped_count = 0
    error_count = 0

    # sorted() gives deterministic processing/reporting order across runs.
    for filepath in sorted(entity_dir.glob('*.json')):
        try:
            if format_profile_file(filepath):
                formatted_count += 1
                print(f"✓ Formatted: {filepath.name}")
            else:
                skipped_count += 1
        except Exception as e:
            # format_profile_file now propagates failures, so they are
            # counted as errors here instead of being tallied as skips.
            error_count += 1
            print(f"✗ Error: {filepath.name}: {e}", file=sys.stderr)

    print("\nSummary:")
    print(f"  Formatted: {formatted_count}")
    print(f"  Skipped (already formatted): {skipped_count}")
    print(f"  Errors: {error_count}")


if __name__ == '__main__':
    main()