glam/scripts/format_linkedin_profile.py
2025-12-11 22:32:09 +01:00

220 lines
7.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Convert raw Exa LinkedIn profile dumps to properly structured JSON format.
Rule 20 (AGENTS.md): Person entity profiles MUST be stored in proper structured format,
not as raw content dumps.
Proper format includes:
- extraction_metadata: source, method, timestamps, costs
- profile_data: parsed name, headline, location, experience[], education[], skills[]
"""
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
def parse_experience_section(content: str) -> list:
    """Parse the experience entries out of a raw LinkedIn markdown dump.

    Entries are expected as ``### <title> at [<company>](<url>)`` headers
    followed by free-form detail lines (duration, location, company info).

    Args:
        content: Raw markdown dump of a LinkedIn profile.

    Returns:
        A list of dicts with keys ``title`` and ``company`` and, when the
        corresponding lines are present, ``duration``, ``location`` and
        ``company_details``.
    """
    experiences = []
    # One match per "### <title> at [<company>](...)" header; the link target
    # is optional. The details group runs until the next header or the end.
    exp_pattern = r'### ([^\n]+) at \[([^\]]+)\](?:\([^\)]+\))?\n([^#]+?)(?=###|\Z|## )'
    for title, company, details in re.findall(exp_pattern, content, re.DOTALL):
        exp = {
            "title": title.strip(),
            "company": company.strip(),
        }
        # Duration line, e.g. "Jan 2020 - Present • Full-time".
        duration_match = re.search(r'(\w+ \d{4}) - (Present|\w+ \d{4})(?: • (.+?))?(?:\n|$)', details)
        if duration_match:
            exp["duration"] = f"{duration_match.group(1)} - {duration_match.group(2)}"
            if duration_match.group(3):
                # BUG FIX: the regex consumes the " • " separator, so it must
                # be re-inserted when appending the employment-type suffix
                # (previously produced e.g. "Jan 2020 - PresentFull-time").
                exp["duration"] += f" • {duration_match.group(3)}"
        # Location line, e.g. "Berlin, Germany" — a comma-separated line with
        # no bullet characters.
        location_match = re.search(r'\n([^•\n]+, [^•\n]+)\n', details)
        if location_match:
            exp["location"] = location_match.group(1).strip()
        # Optional "Company: ..." descriptive line.
        company_match = re.search(r'Company: (.+?)(?:\n|$)', details)
        if company_match:
            exp["company_details"] = company_match.group(1).strip()
        experiences.append(exp)
    return experiences
def parse_education_section(content: str) -> list:
"""Parse education section from raw LinkedIn content."""
education = []
# Look for education entries
edu_pattern = r'### ([^\n]+) at ([^\n]+)\n([^#]+?)(?=###|\Z|## )'
# Find the education section
edu_section_match = re.search(r'## Education\n(.+?)(?=## |\Z)', content, re.DOTALL)
if edu_section_match:
edu_content = edu_section_match.group(1)
matches = re.findall(edu_pattern, edu_content, re.DOTALL)
for degree, school, details in matches:
edu = {
"degree": degree.strip().rstrip('.'),
"school": school.strip(),
}
# Extract years
years_match = re.search(r'(\d{4}) - (\d{4})', details)
if years_match:
edu["years"] = f"{years_match.group(1)} - {years_match.group(2)}"
education.append(edu)
return education
def parse_raw_profile(raw_json: dict, source_file: str = None) -> dict:
    """Convert a raw Exa profile dump into the structured profile format.

    Args:
        raw_json: Raw dump as loaded from disk; expected to carry a
            ``profile_data`` mapping and optional ``exa_search_metadata``.
        source_file: Path of the originating file, recorded in the metadata.

    Returns:
        A dict with ``extraction_metadata`` and ``profile_data`` keys.
    """
    profile = raw_json.get('profile_data', {})
    raw_content = profile.get('raw_content', '')
    summary = profile.get('summary', '')
    title = profile.get('title', '')
    url = profile.get('url', '') or raw_json.get('linkedin_profile_url', '')

    # Name: prefer the part of the page title before "|", then fall back to
    # the first markdown H1 in the raw content.
    name = ''
    if title:
        head = re.match(r'^([^|]+)', title)
        if head:
            name = head.group(1).strip()
    if not name and raw_content:
        h1 = re.match(r'^# (.+?)$', raw_content, re.MULTILINE)
        if h1:
            name = h1.group(1).strip()

    # Headline: text after "|" in the title, else the second content line.
    headline = ''
    if title and '|' in title:
        headline = title.split('|', 1)[1].strip()
    elif raw_content:
        content_lines = raw_content.split('\n')
        if len(content_lines) > 1:
            headline = content_lines[1].strip()

    # Location: a "City, Region, Country"-shaped line.
    location = ''
    loc = re.search(r'\n([^,\n]+, [^,\n]+, [^\n(]+)\n', raw_content)
    if loc:
        location = loc.group(1).strip()

    # Connections / followers summary line.
    connections = ''
    conn = re.search(r'(\d+ connections[^•\n]*(?:• \d+ followers)?)', raw_content)
    if conn:
        connections = conn.group(1).strip()

    # Free-text "About" section body.
    about = ''
    about_m = re.search(r'## About\n(.+?)(?=\n## |\nTotal Experience:|\Z)', raw_content, re.DOTALL)
    if about_m:
        about = about_m.group(1).strip()

    exa_meta = raw_json.get('exa_search_metadata', {})
    return {
        "extraction_metadata": {
            "source_file": source_file or "unknown",
            "staff_id": None,
            "extraction_date": exa_meta.get('timestamp', datetime.now(timezone.utc).isoformat()),
            "extraction_method": "exa_contents",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": url,
            "cost_usd": exa_meta.get('cost', 0),
            "request_id": exa_meta.get('request_id', 'unknown')
        },
        "profile_data": {
            "name": name,
            "linkedin_url": url,
            "headline": headline,
            "location": location,
            "connections": connections,
            "about": about,
            "summary": summary,
            "experience": parse_experience_section(raw_content),
            "education": parse_education_section(raw_content)
        }
    }
def format_profile_file(filepath: Path) -> bool:
    """Reformat one profile JSON file in place if it is a raw dump.

    Args:
        filepath: Path to the JSON profile file.

    Returns:
        True when the file was rewritten into the structured format;
        False when it was already formatted, is not a raw dump, or could
        not be read/parsed (the error is reported on stderr).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            data = json.load(handle)
        # Already-converted files carry an extraction_agent marker.
        if 'extraction_metadata' in data and 'extraction_agent' in data.get('extraction_metadata', {}):
            return False
        # Raw dumps carry Exa metadata plus a non-empty raw_content blob.
        if 'exa_search_metadata' not in data or 'profile_data' not in data:
            return False
        if not data.get('profile_data', {}).get('raw_content', ''):
            return False
        structured = parse_raw_profile(data, str(filepath))
        with open(filepath, 'w', encoding='utf-8') as handle:
            json.dump(structured, handle, indent=2, ensure_ascii=False)
        return True
    except Exception as e:
        print(f"Error processing {filepath}: {e}", file=sys.stderr)
        return False
def main(entity_dir=None):
    """Format all raw profile files in an entity directory.

    Args:
        entity_dir: Directory holding the ``*.json`` profile files. When
            omitted, the first CLI argument is used, falling back to the
            historical hard-coded path for backward compatibility (the
            machine-specific absolute path was previously the only option).
    """
    if entity_dir is None:
        # Allow the directory to be passed on the command line instead of
        # relying on a developer-machine absolute path.
        entity_dir = (sys.argv[1] if len(sys.argv) > 1
                      else '/Users/kempersc/apps/glam/data/custodian/person/entity')
    entity_dir = Path(entity_dir)
    formatted_count = 0
    skipped_count = 0
    error_count = 0
    # sorted() makes the processing (and output) order deterministic;
    # glob order is filesystem-dependent.
    for filepath in sorted(entity_dir.glob('*.json')):
        try:
            if format_profile_file(filepath):
                formatted_count += 1
                print(f"✓ Formatted: {filepath.name}")
            else:
                skipped_count += 1
        except Exception as e:
            # NOTE(review): format_profile_file currently catches its own
            # exceptions and returns False, so failures are tallied under
            # "skipped" and this branch is unlikely to fire — confirm whether
            # errors should be surfaced separately.
            error_count += 1
            print(f"✗ Error: {filepath.name}: {e}")
    print("\nSummary:")
    print(f" Formatted: {formatted_count}")
    print(f" Skipped (already formatted): {skipped_count}")
    print(f" Errors: {error_count}")
if __name__ == '__main__':
    main()