glam/scripts/generate_person_sql.py
2025-12-12 12:51:10 +01:00

117 lines
3.5 KiB
Python

#!/usr/bin/env python3
"""
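Generate SQL upsert (INSERT ... ON CONFLICT) statements for person entity files.

Reads every *.json file in the given directory and writes one statement per
file to stdout, wrapped in a single BEGIN/COMMIT transaction - pipe to psql.
Progress and per-file errors are written to stderr.

Usage:
    python generate_person_sql.py /path/to/entity/dir | sudo -u postgres psql -d glam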
"""
import json
import os
import sys
import re
from datetime import datetime
from pathlib import Path
def extract_linkedin_slug(filename: str) -> str:
    """Extract LinkedIn slug from filename.

    The trailing ``.json`` extension is removed, then everything from the
    first ``_<8 digits>T<6 digits>Z`` timestamp marker onwards is discarded.
    A filename without such a marker yields the whole base name.

    Args:
        filename: Bare file name (not a full path), e.g.
            ``john-doe_20251212T125110Z.json``.

    Returns:
        The part of the name preceding the timestamp marker.
    """
    # Only strip the extension itself: str.replace() would also delete any
    # ".json" that happens to occur inside the slug.
    base = filename.removesuffix('.json')
    # re.split always returns at least one element, so [0] is safe; only the
    # text before the first marker is needed.
    return re.split(r'_\d{8}T\d{6}Z', base, maxsplit=1)[0]
def escape_sql_string(s: str | None) -> str:
"""Escape single quotes for SQL."""
if s is None:
return 'NULL'
return "'" + s.replace("'", "''") + "'"
def parse_extraction_date(data: dict) -> str:
    """Extract the extraction timestamp from the JSON data as a SQL literal.

    The following locations are tried in priority order, and the first one
    holding a non-empty value wins:

    1. ``exa_search_metadata.timestamp``
    2. ``extraction_metadata.extraction_date``
    3. ``provenance.extraction_date``

    Args:
        data: Parsed entity document.

    Returns:
        The timestamp as a quoted SQL string literal, or ``NULL`` when none
        of the locations provides a value.
    """
    # (section key, field inside that section), highest priority first.
    sources = (
        ('exa_search_metadata', 'timestamp'),
        ('extraction_metadata', 'extraction_date'),
        ('provenance', 'extraction_date'),
    )
    for section_key, field_name in sources:
        section = data.get(section_key)
        # A section that is present but null / not an object carries no
        # timestamp; skip it instead of failing on .get().
        if not isinstance(section, dict):
            continue
        ts = section.get(field_name)
        if ts:
            return escape_sql_string(ts)
        # Section exists but has no timestamp: fall through to the next
        # source rather than giving up with NULL.
    return 'NULL'
def _build_upsert_sql(linkedin_slug: str, data: dict) -> str:
    """Render the INSERT ... ON CONFLICT statement for one entity document."""
    # An explicit `"profile_data": null` is treated like a missing key.
    profile_data = data.get('profile_data') or {}
    name = profile_data.get('full_name') or profile_data.get('name', '')
    headline = profile_data.get('headline', '')
    location = profile_data.get('location', '')
    extraction_date = parse_extraction_date(data)
    # JSON escape for profile_data: the whole document is stored, with single
    # quotes doubled so it can sit inside a SQL string literal.
    json_str = json.dumps(data).replace("'", "''")
    # Empty strings are stored as NULL; the slices cap each field's length
    # (500 / 1000 / 500 characters).
    return f"""
INSERT INTO person_entity (linkedin_slug, name, headline, location, profile_data, extraction_date, updated_date)
VALUES (
{escape_sql_string(linkedin_slug)},
{escape_sql_string(name[:500] if name else None)},
{escape_sql_string(headline[:1000] if headline else None)},
{escape_sql_string(location[:500] if location else None)},
'{json_str}'::jsonb,
{extraction_date}::timestamptz,
NOW()
)
ON CONFLICT (linkedin_slug) DO UPDATE SET
name = EXCLUDED.name,
headline = EXCLUDED.headline,
location = EXCLUDED.location,
profile_data = EXCLUDED.profile_data,
extraction_date = EXCLUDED.extraction_date,
updated_date = NOW();
"""


def main() -> None:
    """Emit one upsert statement per ``*.json`` file in the given directory.

    The directory is taken from ``sys.argv[1]``. SQL goes to stdout wrapped in
    ``BEGIN;`` / ``COMMIT;``; progress and per-file errors go to stderr. Files
    that cannot be read or rendered are reported, counted and skipped.

    Exits with status 1 when the argument is missing or is not a directory.
    """
    if len(sys.argv) < 2:
        print("Usage: python generate_person_sql.py /path/to/entity/dir", file=sys.stderr)
        sys.exit(1)
    entity_dir = Path(sys.argv[1])
    # is_dir() also rejects a path that exists but is a regular file.
    if not entity_dir.is_dir():
        print(f"Error: Directory not found: {entity_dir}", file=sys.stderr)
        sys.exit(1)
    # glob() yields entries in arbitrary order. Sorting makes the output
    # reproducible and, when several timestamped files share a slug, makes the
    # lexicographically last (latest timestamp) file the final upsert.
    json_files = sorted(entity_dir.glob("*.json"))
    print(f"-- Processing {len(json_files)} JSON files", file=sys.stderr)
    # Begin transaction
    print("BEGIN;")
    success = 0
    errors = 0
    for filepath in json_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not isinstance(data, dict):
                raise ValueError("top-level JSON value is not an object")
            statement = _build_upsert_sql(extract_linkedin_slug(filepath.name), data)
        except json.JSONDecodeError as e:
            print(f"-- Error parsing {filepath.name}: {e}", file=sys.stderr)
            errors += 1
        except Exception as e:
            # Best-effort batch: one malformed file must not stop the rest.
            print(f"-- Error processing {filepath.name}: {e}", file=sys.stderr)
            errors += 1
        else:
            # Printed outside the try so that a failure writing to stdout
            # (e.g. a closed pipe) is not mistaken for a bad input file.
            print(statement)
            success += 1
    print("COMMIT;")
    print(f"-- Processed: {success} success, {errors} errors", file=sys.stderr)
# Script entry point: run the generator only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()