#!/usr/bin/env python3
"""
Generate SQL INSERT statements for person entity files.
Outputs SQL to stdout - pipe to psql.

Usage:
    python generate_person_sql.py /path/to/entity/dir | sudo -u postgres psql -d glam
"""
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
def extract_linkedin_slug(filename: str) -> str:
    """Extract the LinkedIn slug from an entity filename.

    Filenames look like ``<slug>_<YYYYMMDD>T<HHMMSS>Z.json``; everything
    before the first timestamp suffix is treated as the slug.

    Args:
        filename: Base name of the entity JSON file.

    Returns:
        The slug portion of the filename (the whole stem when no
        timestamp suffix is present).
    """
    # Strip only a trailing ".json" extension. The previous
    # str.replace('.json', '') removed EVERY occurrence, which corrupted
    # slugs that happen to contain ".json" in the middle of the name.
    base = filename.removesuffix('.json')
    # re.split always returns at least one element, so index 0 is safe
    # (the old "if parts else base" fallback was unreachable).
    return re.split(r'_\d{8}T\d{6}Z', base)[0]
def escape_sql_string(s: str | None) -> str:
|
|
"""Escape single quotes for SQL."""
|
|
if s is None:
|
|
return 'NULL'
|
|
return "'" + s.replace("'", "''") + "'"
|
|
|
|
|
|
def parse_extraction_date(data: dict) -> str:
    """Extract the extraction timestamp from the JSON data.

    Checks the known metadata sections in priority order and takes the
    timestamp field from the FIRST section present, even if that field
    is missing or empty within it.

    Args:
        data: Parsed entity JSON document.

    Returns:
        A quoted SQL string literal for the timestamp, or ``'NULL'``
        when no timestamp can be found.
    """
    # (section key, timestamp field) pairs, highest priority first.
    candidates = (
        ('exa_search_metadata', 'timestamp'),
        ('extraction_metadata', 'extraction_date'),
        ('provenance', 'extraction_date'),
    )
    for section, field in candidates:
        if section in data:
            ts = data[section].get(field)
            break
    else:
        ts = None

    return escape_sql_string(ts) if ts else 'NULL'
def main() -> None:
    """Generate idempotent SQL for every ``*.json`` file in ``argv[1]``.

    Prints one ``INSERT ... ON CONFLICT DO UPDATE`` statement per file,
    wrapped in a single BEGIN/COMMIT transaction, to stdout. All
    diagnostics go to stderr so stdout can be piped straight into psql.
    """
    if len(sys.argv) < 2:
        print("Usage: python generate_person_sql.py /path/to/entity/dir", file=sys.stderr)
        sys.exit(1)

    entity_dir = Path(sys.argv[1])

    if not entity_dir.exists():
        print(f"Error: Directory not found: {entity_dir}", file=sys.stderr)
        sys.exit(1)

    json_files = list(entity_dir.glob("*.json"))
    print(f"-- Processing {len(json_files)} JSON files", file=sys.stderr)

    # Begin transaction
    print("BEGIN;")

    success = 0  # files that produced an INSERT
    errors = 0   # files skipped due to parse/processing failures

    for filepath in json_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # The slug comes from the filename, not the JSON payload.
            linkedin_slug = extract_linkedin_slug(filepath.name)
            profile_data = data.get('profile_data', {})

            # Prefer 'full_name'; fall back to 'name', then empty string.
            name = profile_data.get('full_name') or profile_data.get('name', '')
            headline = profile_data.get('headline', '')
            location = profile_data.get('location', '')
            # Already SQL-quoted ('...' or the literal NULL) by the helper.
            extraction_date = parse_extraction_date(data)

            # JSON escape for profile_data
            # NOTE(review): only single quotes are doubled here; this assumes
            # the server runs with standard_conforming_strings on (PostgreSQL
            # default since 9.1) so backslashes in json.dumps output pass
            # through literally — confirm for the target database.
            json_str = json.dumps(data).replace("'", "''")

            # Text fields are truncated (500/1000/500 chars) before quoting;
            # empty strings are stored as NULL via the `if ... else None` form.
            print(f"""
INSERT INTO person_entity (linkedin_slug, name, headline, location, profile_data, extraction_date, updated_date)
VALUES (
    {escape_sql_string(linkedin_slug)},
    {escape_sql_string(name[:500] if name else None)},
    {escape_sql_string(headline[:1000] if headline else None)},
    {escape_sql_string(location[:500] if location else None)},
    '{json_str}'::jsonb,
    {extraction_date}::timestamptz,
    NOW()
)
ON CONFLICT (linkedin_slug) DO UPDATE SET
    name = EXCLUDED.name,
    headline = EXCLUDED.headline,
    location = EXCLUDED.location,
    profile_data = EXCLUDED.profile_data,
    extraction_date = EXCLUDED.extraction_date,
    updated_date = NOW();
""")
            success += 1

        except json.JSONDecodeError as e:
            # Malformed JSON: report on stderr and continue with the next file.
            print(f"-- Error parsing {filepath.name}: {e}", file=sys.stderr)
            errors += 1
        except Exception as e:
            # Best-effort batch job: any other per-file failure is logged,
            # counted, and skipped rather than aborting the whole run.
            print(f"-- Error processing {filepath.name}: {e}", file=sys.stderr)
            errors += 1

    print("COMMIT;")
    print(f"-- Processed: {success} success, {errors} errors", file=sys.stderr)
# Script entry point: only generate SQL when run directly, not on import.
if __name__ == '__main__':
    main()