#!/usr/bin/env python3
"""
Generate SQL INSERT statements for person entity files.
Outputs SQL to stdout - pipe to psql.

Usage:
    python generate_person_sql.py /path/to/entity/dir | sudo -u postgres psql -d glam
"""
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
def extract_linkedin_slug(filename: str) -> str:
    """Extract the LinkedIn slug from an entity filename.

    Filenames look like ``<slug>_<YYYYMMDD>T<HHMMSS>Z.json``; everything
    before the first timestamp suffix is treated as the slug.

    Args:
        filename: Base name of the entity JSON file.

    Returns:
        The slug portion of the filename (the whole stem when no
        timestamp suffix is present).
    """
    # Strip only a trailing ".json" extension. The previous
    # str.replace('.json', '') removed EVERY occurrence, which corrupted
    # slugs that happen to contain ".json" in the middle of the name.
    base = filename.removesuffix('.json')
    # re.split always returns at least one element, so index 0 is safe
    # (the old "if parts else base" fallback was unreachable).
    return re.split(r'_\d{8}T\d{6}Z', base)[0]
def escape_sql_string(s: str | None) -> str:
|
|
"""Escape single quotes for SQL."""
|
|
if s is None:
|
|
return 'NULL'
|
|
return "'" + s.replace("'", "''") + "'"
|
|
|
|
|
|
def parse_extraction_date(data: dict) -> str:
    """Extract the extraction timestamp from the JSON data.

    Checks the known metadata sections in priority order and takes the
    timestamp field from the FIRST section present, even if that field
    is missing or empty within it.

    Args:
        data: Parsed entity JSON document.

    Returns:
        A quoted SQL string literal for the timestamp, or ``'NULL'``
        when no timestamp can be found.
    """
    # (section key, timestamp field) pairs, highest priority first.
    candidates = (
        ('exa_search_metadata', 'timestamp'),
        ('extraction_metadata', 'extraction_date'),
        ('provenance', 'extraction_date'),
    )
    for section, field in candidates:
        if section in data:
            ts = data[section].get(field)
            break
    else:
        ts = None

    return escape_sql_string(ts) if ts else 'NULL'
def main() -> None:
    """Generate idempotent SQL for every ``*.json`` file in ``argv[1]``.

    Prints one ``INSERT ... ON CONFLICT DO UPDATE`` statement per file,
    wrapped in a single BEGIN/COMMIT transaction, to stdout. All
    diagnostics go to stderr so stdout can be piped straight into psql.
    """
    if len(sys.argv) < 2:
        print("Usage: python generate_person_sql.py /path/to/entity/dir", file=sys.stderr)
        sys.exit(1)

    entity_dir = Path(sys.argv[1])

    if not entity_dir.exists():
        print(f"Error: Directory not found: {entity_dir}", file=sys.stderr)
        sys.exit(1)

    json_files = list(entity_dir.glob("*.json"))
    print(f"-- Processing {len(json_files)} JSON files", file=sys.stderr)

    # Begin transaction
    print("BEGIN;")

    success = 0  # files that produced an INSERT
    errors = 0   # files skipped due to parse/processing failures

    for filepath in json_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # The slug comes from the filename, not the JSON payload.
            linkedin_slug = extract_linkedin_slug(filepath.name)
            profile_data = data.get('profile_data', {})

            # Prefer 'full_name'; fall back to 'name', then empty string.
            name = profile_data.get('full_name') or profile_data.get('name', '')
            headline = profile_data.get('headline', '')
            location = profile_data.get('location', '')
            # Already SQL-quoted ('...' or the literal NULL) by the helper.
            extraction_date = parse_extraction_date(data)

            # JSON escape for profile_data
            # NOTE(review): only single quotes are doubled here; this assumes
            # the server runs with standard_conforming_strings on (PostgreSQL
            # default since 9.1) so backslashes in json.dumps output pass
            # through literally — confirm for the target database.
            json_str = json.dumps(data).replace("'", "''")

            # Text fields are truncated (500/1000/500 chars) before quoting;
            # empty strings are stored as NULL via the `if ... else None` form.
            print(f"""
INSERT INTO person_entity (linkedin_slug, name, headline, location, profile_data, extraction_date, updated_date)
VALUES (
    {escape_sql_string(linkedin_slug)},
    {escape_sql_string(name[:500] if name else None)},
    {escape_sql_string(headline[:1000] if headline else None)},
    {escape_sql_string(location[:500] if location else None)},
    '{json_str}'::jsonb,
    {extraction_date}::timestamptz,
    NOW()
)
ON CONFLICT (linkedin_slug) DO UPDATE SET
    name = EXCLUDED.name,
    headline = EXCLUDED.headline,
    location = EXCLUDED.location,
    profile_data = EXCLUDED.profile_data,
    extraction_date = EXCLUDED.extraction_date,
    updated_date = NOW();
""")
            success += 1

        except json.JSONDecodeError as e:
            # Malformed JSON: report on stderr and continue with the next file.
            print(f"-- Error parsing {filepath.name}: {e}", file=sys.stderr)
            errors += 1
        except Exception as e:
            # Best-effort batch job: any other per-file failure is logged,
            # counted, and skipped rather than aborting the whole run.
            print(f"-- Error processing {filepath.name}: {e}", file=sys.stderr)
            errors += 1

    print("COMMIT;")
    print(f"-- Processed: {success} success, {errors} errors", file=sys.stderr)
# Script entry point: only generate SQL when run directly, not on import.
if __name__ == '__main__':
    main()