#!/usr/bin/env python3
"""
Convert raw Exa LinkedIn profile dumps to properly structured JSON format.

Rule 20 (AGENTS.md): Person entity profiles MUST be stored in proper structured format,
not as raw content dumps.

Proper format includes:
- extraction_metadata: source, method, timestamps, costs
- profile_data: parsed name, headline, location, experience[], education[], skills[]
"""

import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

def parse_experience_section(content: str) -> list:
    """Extract structured experience entries from raw LinkedIn markdown.

    Each '### <title> at [<company>](link)' header starts one entry;
    duration, location, and company details are pulled from the lines
    that follow it when present.

    Args:
        content: Raw markdown profile dump.

    Returns:
        List of dicts with 'title' and 'company', plus optional
        'duration', 'location', and 'company_details' keys.
    """
    entry_header = r'### ([^\n]+) at \[([^\]]+)\](?:\([^\)]+\))?\n([^#]+?)(?=###|\Z|## )'
    duration_re = r'(\w+ \d{4}) - (Present|\w+ \d{4})(?: • (.+?))?(?:\n|$)'
    location_re = r'\n([^•\n]+, [^•\n]+)\n'
    company_re = r'Company: (.+?)(?:\n|$)'

    parsed = []
    for role, employer, body in re.findall(entry_header, content, re.DOTALL):
        entry = {
            "title": role.strip(),
            "company": employer.strip(),
        }

        when = re.search(duration_re, body)
        if when:
            span = f"{when.group(1)} - {when.group(2)}"
            if when.group(3):
                span = f"{span} • {when.group(3)}"
            entry["duration"] = span

        where = re.search(location_re, body)
        if where:
            entry["location"] = where.group(1).strip()

        firm = re.search(company_re, body)
        if firm:
            entry["company_details"] = firm.group(1).strip()

        parsed.append(entry)

    return parsed

def parse_education_section(content: str) -> list:
    """Parse education entries from raw LinkedIn markdown content.

    Locates the '## Education' section, then extracts each
    '### <degree> at <school>' entry plus a 'YYYY - YYYY' year range
    when present.

    Args:
        content: Raw markdown profile dump.

    Returns:
        List of dicts with 'degree' and 'school', plus an optional
        'years' key.
    """
    education = []

    # Bug fix: the section terminator must be '\n## ' (a following H2 at
    # the start of a line), not '## '. The old lookahead matched inside
    # the '### ' entry headers themselves ('###' contains '## '), which
    # truncated the captured section to a single '#' and always yielded
    # zero entries. This now matches the '## About' handling elsewhere.
    edu_section_match = re.search(r'## Education\n(.+?)(?=\n## |\Z)', content, re.DOTALL)
    if not edu_section_match:
        return education

    edu_pattern = r'### ([^\n]+) at ([^\n]+)\n([^#]+?)(?=###|\Z|## )'
    for degree, school, details in re.findall(edu_pattern, edu_section_match.group(1), re.DOTALL):
        edu = {
            # rstrip('.') drops a trailing period some dumps append to degrees
            "degree": degree.strip().rstrip('.'),
            "school": school.strip(),
        }

        years_match = re.search(r'(\d{4}) - (\d{4})', details)
        if years_match:
            edu["years"] = f"{years_match.group(1)} - {years_match.group(2)}"

        education.append(edu)

    return education

def parse_raw_profile(raw_json: dict, source_file: "str | None" = None) -> dict:
    """Convert a raw Exa profile dump to the proper structured format.

    Args:
        raw_json: Parsed JSON of the raw Exa dump. Recognized keys (all
            optional): 'profile_data' (with 'raw_content', 'summary',
            'title', 'url'), 'linkedin_profile_url', 'exa_search_metadata'
            (with 'timestamp', 'cost', 'request_id').
        source_file: Path the dump was read from, recorded in the
            extraction metadata; "unknown" when omitted.

    Returns:
        Dict with 'extraction_metadata' and 'profile_data' sections as
        required by Rule 20 (AGENTS.md).
    """
    # Hoisted: read the nested profile dict once instead of per field.
    profile = raw_json.get('profile_data', {})
    raw_content = profile.get('raw_content', '')
    summary = profile.get('summary', '')
    title = profile.get('title', '')
    url = profile.get('url', '') or raw_json.get('linkedin_profile_url', '')

    # Name: prefer the page title ("Name | headline"); otherwise fall back
    # to the first markdown H1 in the content. Bug fix: use re.search, not
    # re.match — re.match anchors at position 0, so the MULTILINE '^# '
    # fallback never fired unless the H1 was the very first line.
    name = ''
    if title:
        name_match = re.match(r'^([^|]+)', title)
        if name_match:
            name = name_match.group(1).strip()
    if not name and raw_content:
        name_match = re.search(r'^# (.+?)$', raw_content, re.MULTILINE)
        if name_match:
            name = name_match.group(1).strip()

    # Headline: the part of the title after the first '|', else the
    # second line of the raw content.
    headline = ''
    if title and '|' in title:
        headline = title.split('|', 1)[1].strip()
    elif raw_content:
        lines = raw_content.split('\n')
        if len(lines) > 1:
            headline = lines[1].strip()

    # Location: a "City, Region, Country" line on its own.
    location = ''
    location_match = re.search(r'\n([^,\n]+, [^,\n]+, [^\n(]+)\n', raw_content)
    if location_match:
        location = location_match.group(1).strip()

    # Connection/follower counts, e.g. "500 connections • 1200 followers".
    connections = ''
    conn_match = re.search(r'(\d+ connections[^•\n]*(?:• \d+ followers)?)', raw_content)
    if conn_match:
        connections = conn_match.group(1).strip()

    # Free-text "About" section, up to the next H2 or the experience total.
    about = ''
    about_match = re.search(r'## About\n(.+?)(?=\n## |\nTotal Experience:|\Z)', raw_content, re.DOTALL)
    if about_match:
        about = about_match.group(1).strip()

    experience = parse_experience_section(raw_content)
    education = parse_education_section(raw_content)

    exa_meta = raw_json.get('exa_search_metadata', {})

    return {
        "extraction_metadata": {
            "source_file": source_file or "unknown",
            "staff_id": None,
            # Prefer the dump's own timestamp; stamp "now" (UTC) otherwise.
            "extraction_date": exa_meta.get('timestamp', datetime.now(timezone.utc).isoformat()),
            "extraction_method": "exa_contents",
            "extraction_agent": "claude-opus-4.5",
            "linkedin_url": url,
            "cost_usd": exa_meta.get('cost', 0),
            "request_id": exa_meta.get('request_id', 'unknown')
        },
        "profile_data": {
            "name": name,
            "linkedin_url": url,
            "headline": headline,
            "location": location,
            "connections": connections,
            "about": about,
            "summary": summary,
            "experience": experience,
            "education": education
        }
    }

def format_profile_file(filepath: Path) -> bool:
    """Rewrite one raw Exa profile dump as structured JSON, in place.

    Returns:
        True only when the file was converted; False when it is already
        structured, is not a recognized raw dump, or errors out (the
        error is reported on stderr).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)

        # Already in the target format — nothing to do.
        meta = payload.get('extraction_metadata', {})
        if 'extraction_metadata' in payload and 'extraction_agent' in meta:
            return False

        # Only convert recognizable raw dumps that actually carry content.
        if 'exa_search_metadata' not in payload or 'profile_data' not in payload:
            return False
        if not payload.get('profile_data', {}).get('raw_content', ''):
            return False

        structured = parse_raw_profile(payload, str(filepath))
        with open(filepath, 'w', encoding='utf-8') as fh:
            json.dump(structured, fh, indent=2, ensure_ascii=False)
        return True

    except Exception as e:
        # Best-effort batch processing: report and keep going.
        print(f"Error processing {filepath}: {e}", file=sys.stderr)
        return False

def main():
    """Format all raw profile files in the entity directory.

    The directory may be overridden by passing it as the first
    command-line argument; otherwise the default person-entity store
    is used (backward compatible with the original hard-coded path).
    """
    default_dir = '/Users/kempersc/apps/glam/data/custodian/person/entity'
    entity_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(default_dir)

    formatted_count = 0
    skipped_count = 0
    error_count = 0

    # Sort for a deterministic, reproducible processing order.
    for filepath in sorted(entity_dir.glob('*.json')):
        try:
            if format_profile_file(filepath):
                formatted_count += 1
                print(f"✓ Formatted: {filepath.name}")
            else:
                skipped_count += 1
        except Exception as e:
            # format_profile_file already guards itself; this is a last
            # line of defense so one bad file never aborts the batch.
            error_count += 1
            print(f"✗ Error: {filepath.name}: {e}")

    print("\nSummary:")
    print(f"  Formatted: {formatted_count}")
    print(f"  Skipped (already formatted): {skipped_count}")
    print(f"  Errors: {error_count}")

# Run only when executed as a script, keeping the module importable
# (e.g. for tests) without side effects.
if __name__ == '__main__':
    main()