glam/scripts/format_linkedin_profile.py
2025-12-11 22:32:09 +01:00

220 lines
7.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Convert raw Exa LinkedIn profile dumps to properly structured JSON format.
Rule 20 (AGENTS.md): Person entity profiles MUST be stored in proper structured format,
not as raw content dumps.
Proper format includes:
- extraction_metadata: source, method, timestamps, costs
- profile_data: parsed name, headline, location, experience[], education[], skills[]
"""
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
def parse_experience_section(content: str) -> list:
    """Parse the experience entries out of a raw LinkedIn markdown dump.

    Entries are expected as ``### <title> at [<company>](<url>)`` headers
    followed by free-form detail lines (duration, location, company info).

    Args:
        content: Raw markdown dump of a LinkedIn profile.

    Returns:
        A list of dicts with keys ``title`` and ``company`` and, when the
        corresponding lines are present, ``duration``, ``location`` and
        ``company_details``.
    """
    experiences = []
    # One match per "### <title> at [<company>](...)" header; the link target
    # is optional. The details group runs until the next header or the end.
    exp_pattern = r'### ([^\n]+) at \[([^\]]+)\](?:\([^\)]+\))?\n([^#]+?)(?=###|\Z|## )'
    for title, company, details in re.findall(exp_pattern, content, re.DOTALL):
        exp = {
            "title": title.strip(),
            "company": company.strip(),
        }
        # Duration line, e.g. "Jan 2020 - Present • Full-time".
        duration_match = re.search(r'(\w+ \d{4}) - (Present|\w+ \d{4})(?: • (.+?))?(?:\n|$)', details)
        if duration_match:
            exp["duration"] = f"{duration_match.group(1)} - {duration_match.group(2)}"
            if duration_match.group(3):
                # BUG FIX: the regex consumes the " • " separator, so it must
                # be re-inserted when appending the employment-type suffix
                # (previously produced e.g. "Jan 2020 - PresentFull-time").
                exp["duration"] += f" • {duration_match.group(3)}"
        # Location line, e.g. "Berlin, Germany" — a comma-separated line with
        # no bullet characters.
        location_match = re.search(r'\n([^•\n]+, [^•\n]+)\n', details)
        if location_match:
            exp["location"] = location_match.group(1).strip()
        # Optional "Company: ..." descriptive line.
        company_match = re.search(r'Company: (.+?)(?:\n|$)', details)
        if company_match:
            exp["company_details"] = company_match.group(1).strip()
        experiences.append(exp)
    return experiences
def parse_education_section(content: str) -> list:
"""Parse education section from raw LinkedIn content."""
education = []
# Look for education entries
edu_pattern = r'### ([^\n]+) at ([^\n]+)\n([^#]+?)(?=###|\Z|## )'
# Find the education section
edu_section_match = re.search(r'## Education\n(.+?)(?=## |\Z)', content, re.DOTALL)
if edu_section_match:
edu_content = edu_section_match.group(1)
matches = re.findall(edu_pattern, edu_content, re.DOTALL)
for degree, school, details in matches:
edu = {
"degree": degree.strip().rstrip('.'),
"school": school.strip(),
}
# Extract years
years_match = re.search(r'(\d{4}) - (\d{4})', details)
if years_match:
edu["years"] = f"{years_match.group(1)} - {years_match.group(2)}"
education.append(edu)
return education
def parse_raw_profile(raw_json: dict, source_file: str = None) -> dict:
    """Convert a raw Exa profile dump into the structured profile format.

    Args:
        raw_json: Raw dump as loaded from disk; expected to carry a
            ``profile_data`` mapping and optional ``exa_search_metadata``.
        source_file: Path of the originating file, recorded in the metadata.

    Returns:
        A dict with ``extraction_metadata`` and ``profile_data`` keys.
    """
    profile = raw_json.get('profile_data', {})
    raw_content = profile.get('raw_content', '')
    summary = profile.get('summary', '')
    title = profile.get('title', '')
    url = profile.get('url', '') or raw_json.get('linkedin_profile_url', '')

    # Name: prefer the part of the page title before "|", then fall back to
    # the first markdown H1 in the raw content.
    name = ''
    if title:
        head = re.match(r'^([^|]+)', title)
        if head:
            name = head.group(1).strip()
    if not name and raw_content:
        h1 = re.match(r'^# (.+?)$', raw_content, re.MULTILINE)
        if h1:
            name = h1.group(1).strip()

    # Headline: text after "|" in the title, else the second content line.
    headline = ''
    if title and '|' in title:
        headline = title.split('|', 1)[1].strip()
    elif raw_content:
        content_lines = raw_content.split('\n')
        if len(content_lines) > 1:
            headline = content_lines[1].strip()

    # Location: a "City, Region, Country"-shaped line.
    location = ''
    loc = re.search(r'\n([^,\n]+, [^,\n]+, [^\n(]+)\n', raw_content)
    if loc:
        location = loc.group(1).strip()

    # Connections / followers summary line.
    connections = ''
    conn = re.search(r'(\d+ connections[^•\n]*(?:• \d+ followers)?)', raw_content)
    if conn:
        connections = conn.group(1).strip()

    # Free-text "About" section body.
    about = ''
    about_m = re.search(r'## About\n(.+?)(?=\n## |\nTotal Experience:|\Z)', raw_content, re.DOTALL)
    if about_m:
        about = about_m.group(1).strip()

    exa_meta = raw_json.get('exa_search_metadata', {})
    return {
        "extraction_metadata": {
            "source_file": source_file or "unknown",
            "staff_id": None,
            "extraction_date": exa_meta.get('timestamp', datetime.now(timezone.utc).isoformat()),
            "extraction_method": "exa_contents",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": url,
            "cost_usd": exa_meta.get('cost', 0),
            "request_id": exa_meta.get('request_id', 'unknown')
        },
        "profile_data": {
            "name": name,
            "linkedin_url": url,
            "headline": headline,
            "location": location,
            "connections": connections,
            "about": about,
            "summary": summary,
            "experience": parse_experience_section(raw_content),
            "education": parse_education_section(raw_content)
        }
    }
def format_profile_file(filepath: Path) -> bool:
    """Reformat one profile JSON file in place if it is a raw dump.

    Args:
        filepath: Path to the JSON profile file.

    Returns:
        True when the file was rewritten into the structured format;
        False when it was already formatted, is not a raw dump, or could
        not be read/parsed (the error is reported on stderr).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            data = json.load(handle)
        # Already-converted files carry an extraction_agent marker.
        if 'extraction_metadata' in data and 'extraction_agent' in data.get('extraction_metadata', {}):
            return False
        # Raw dumps carry Exa metadata plus a non-empty raw_content blob.
        if 'exa_search_metadata' not in data or 'profile_data' not in data:
            return False
        if not data.get('profile_data', {}).get('raw_content', ''):
            return False
        structured = parse_raw_profile(data, str(filepath))
        with open(filepath, 'w', encoding='utf-8') as handle:
            json.dump(structured, handle, indent=2, ensure_ascii=False)
        return True
    except Exception as e:
        print(f"Error processing {filepath}: {e}", file=sys.stderr)
        return False
def main(entity_dir=None):
    """Format all raw profile files in an entity directory.

    Args:
        entity_dir: Directory holding the ``*.json`` profile files. When
            omitted, the first CLI argument is used, falling back to the
            historical hard-coded path for backward compatibility (the
            machine-specific absolute path was previously the only option).
    """
    if entity_dir is None:
        # Allow the directory to be passed on the command line instead of
        # relying on a developer-machine absolute path.
        entity_dir = (sys.argv[1] if len(sys.argv) > 1
                      else '/Users/kempersc/apps/glam/data/custodian/person/entity')
    entity_dir = Path(entity_dir)
    formatted_count = 0
    skipped_count = 0
    error_count = 0
    # sorted() makes the processing (and output) order deterministic;
    # glob order is filesystem-dependent.
    for filepath in sorted(entity_dir.glob('*.json')):
        try:
            if format_profile_file(filepath):
                formatted_count += 1
                print(f"✓ Formatted: {filepath.name}")
            else:
                skipped_count += 1
        except Exception as e:
            # NOTE(review): format_profile_file currently catches its own
            # exceptions and returns False, so failures are tallied under
            # "skipped" and this branch is unlikely to fire — confirm whether
            # errors should be surfaced separately.
            error_count += 1
            print(f"✗ Error: {filepath.name}: {e}")
    print("\nSummary:")
    print(f" Formatted: {formatted_count}")
    print(f" Skipped (already formatted): {skipped_count}")
    print(f" Errors: {error_count}")
if __name__ == '__main__':
    main()