#!/usr/bin/env python3 """ Generate SQL INSERT statements for person entity files. Outputs SQL to stdout - pipe to psql. Usage: python generate_person_sql.py /path/to/entity/dir | sudo -u postgres psql -d glam """ import json import os import sys import re from datetime import datetime from pathlib import Path def extract_linkedin_slug(filename: str) -> str: """Extract LinkedIn slug from filename.""" base = filename.replace('.json', '') parts = re.split(r'_\d{8}T\d{6}Z', base) return parts[0] if parts else base def escape_sql_string(s: str | None) -> str: """Escape single quotes for SQL.""" if s is None: return 'NULL' return "'" + s.replace("'", "''") + "'" def parse_extraction_date(data: dict) -> str: """Extract the extraction timestamp from the JSON data.""" ts = None if 'exa_search_metadata' in data: ts = data['exa_search_metadata'].get('timestamp') elif 'extraction_metadata' in data: ts = data['extraction_metadata'].get('extraction_date') elif 'provenance' in data: ts = data['provenance'].get('extraction_date') if ts: return escape_sql_string(ts) return 'NULL' def main(): if len(sys.argv) < 2: print("Usage: python generate_person_sql.py /path/to/entity/dir", file=sys.stderr) sys.exit(1) entity_dir = Path(sys.argv[1]) if not entity_dir.exists(): print(f"Error: Directory not found: {entity_dir}", file=sys.stderr) sys.exit(1) json_files = list(entity_dir.glob("*.json")) print(f"-- Processing {len(json_files)} JSON files", file=sys.stderr) # Begin transaction print("BEGIN;") success = 0 errors = 0 for filepath in json_files: try: with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) linkedin_slug = extract_linkedin_slug(filepath.name) profile_data = data.get('profile_data', {}) name = profile_data.get('full_name') or profile_data.get('name', '') headline = profile_data.get('headline', '') location = profile_data.get('location', '') extraction_date = parse_extraction_date(data) # JSON escape for profile_data json_str = json.dumps(data).replace("'", "''") print(f""" INSERT INTO person_entity (linkedin_slug, name, headline, location, profile_data, extraction_date, updated_date) VALUES ( {escape_sql_string(linkedin_slug)}, {escape_sql_string(name[:500] if name else None)}, {escape_sql_string(headline[:1000] if headline else None)}, {escape_sql_string(location[:500] if location else None)}, '{json_str}'::jsonb, {extraction_date}::timestamptz, NOW() ) ON CONFLICT (linkedin_slug) DO UPDATE SET name = EXCLUDED.name, headline = EXCLUDED.headline, location = EXCLUDED.location, profile_data = EXCLUDED.profile_data, extraction_date = EXCLUDED.extraction_date, updated_date = NOW(); """) success += 1 except json.JSONDecodeError as e: print(f"-- Error parsing {filepath.name}: {e}", file=sys.stderr) errors += 1 except Exception as e: print(f"-- Error processing {filepath.name}: {e}", file=sys.stderr) errors += 1 print("COMMIT;") print(f"-- Processed: {success} success, {errors} errors", file=sys.stderr) if __name__ == '__main__': main()