#!/usr/bin/env python3
"""
PostgreSQL Person Sync Module - Sync person/staff JSON files to PostgreSQL.

This module syncs all person data (staff lists and entity profiles) to
PostgreSQL for querying heritage institution personnel.

Usage:
    python -m scripts.sync.postgres_person_sync [--dry-run] [--limit N]

Data Sources:
    - data/custodian/person/entity/*.json - Individual person profile files
"""

import argparse
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Use try/except for imports that may not resolve in all environments
try:
    from scripts.sync import BaseSyncer, SyncResult, SyncStatus
except ImportError:
    from . import BaseSyncer, SyncResult, SyncStatus

# Configuration (each value overridable via environment variable)
DATABASE_CONFIG = {
    'host': os.getenv("POSTGRES_HOST", "localhost"),
    'port': int(os.getenv("POSTGRES_PORT", "5432")),
    'database': os.getenv("POSTGRES_DB", "glam_heritage"),
    'user': os.getenv("POSTGRES_USER", "glam_api"),
    'password': os.getenv("POSTGRES_PASSWORD", "glam_api_password"),
}

# Number of upserts between commits during sync
BATCH_SIZE = 100

# Person data directory
PERSON_DATA_DIR = PROJECT_ROOT / "data" / "custodian" / "person" / "entity"

# SQL statements
CREATE_PERSONS_TABLE = """
CREATE TABLE IF NOT EXISTS persons (
    id SERIAL PRIMARY KEY,
    staff_id VARCHAR(255) UNIQUE NOT NULL,
    name VARCHAR(500) NOT NULL,
    headline TEXT,
    location VARCHAR(500),
    country_code VARCHAR(10),
    custodian_slug VARCHAR(255),
    custodian_name VARCHAR(500),
    linkedin_url VARCHAR(1000),
    profile_image_url TEXT,
    heritage_relevant BOOLEAN DEFAULT TRUE,
    heritage_types JSONB DEFAULT '[]'::jsonb,
    experience JSONB DEFAULT '[]'::jsonb,
    education JSONB DEFAULT '[]'::jsonb,
    skills JSONB DEFAULT '[]'::jsonb,
    languages JSONB DEFAULT '[]'::jsonb,
    about TEXT,
    connections VARCHAR(255),
    extraction_date TIMESTAMP WITH TIME ZONE,
    extraction_method VARCHAR(100),
    source_file VARCHAR(1000),
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- Indexes for common queries
CREATE INDEX IF NOT EXISTS idx_persons_custodian_slug ON persons(custodian_slug);
CREATE INDEX IF NOT EXISTS idx_persons_heritage_relevant ON persons(heritage_relevant);
CREATE INDEX IF NOT EXISTS idx_persons_name ON persons(name);
CREATE INDEX IF NOT EXISTS idx_persons_country_code ON persons(country_code);
CREATE INDEX IF NOT EXISTS idx_persons_heritage_types ON persons USING GIN (heritage_types);

-- Full text search index
CREATE INDEX IF NOT EXISTS idx_persons_search ON persons USING GIN (
    to_tsvector('english',
        COALESCE(name, '') || ' ' ||
        COALESCE(headline, '') || ' ' ||
        COALESCE(custodian_name, ''))
);
"""

UPSERT_PERSON = """
INSERT INTO persons (
    staff_id, name, headline, location, country_code,
    custodian_slug, custodian_name, linkedin_url, profile_image_url,
    heritage_relevant, heritage_types, experience, education,
    skills, languages, about, connections,
    extraction_date, extraction_method, source_file
) VALUES (
    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
)
ON CONFLICT (staff_id) DO UPDATE SET
    name = EXCLUDED.name,
    headline = EXCLUDED.headline,
    location = EXCLUDED.location,
    country_code = EXCLUDED.country_code,
    custodian_slug = EXCLUDED.custodian_slug,
    custodian_name = EXCLUDED.custodian_name,
    linkedin_url = EXCLUDED.linkedin_url,
    profile_image_url = EXCLUDED.profile_image_url,
    heritage_relevant = EXCLUDED.heritage_relevant,
    heritage_types = EXCLUDED.heritage_types,
    experience = EXCLUDED.experience,
    education = EXCLUDED.education,
    skills = EXCLUDED.skills,
    languages = EXCLUDED.languages,
    about = EXCLUDED.about,
    connections = EXCLUDED.connections,
    extraction_date = EXCLUDED.extraction_date,
    extraction_method = EXCLUDED.extraction_method,
    source_file = EXCLUDED.source_file,
    updated_at = NOW()
"""

# Column order MUST match the placeholder order in UPSERT_PERSON.
PERSON_COLUMNS = (
    'staff_id', 'name', 'headline', 'location', 'country_code',
    'custodian_slug', 'custodian_name', 'linkedin_url', 'profile_image_url',
    'heritage_relevant', 'heritage_types', 'experience', 'education',
    'skills', 'languages', 'about', 'connections',
    'extraction_date', 'extraction_method', 'source_file',
)

# Matches an explicit two-letter country code in parentheses, e.g. "(NL)".
# Compiled once at module load instead of on every call.
_COUNTRY_CODE_RE = re.compile(r'\(([A-Z]{2})\)')

# Fallback: case-insensitive country-name substrings mapped to ISO codes.
_COUNTRY_NAME_MAP = {
    'netherlands': 'NL',
    'united states': 'US',
    'united kingdom': 'GB',
    'germany': 'DE',
    'france': 'FR',
    'belgium': 'BE',
    'spain': 'ES',
    'italy': 'IT',
}


def extract_country_code(location: str) -> Optional[str]:
    """Extract a two-letter country code from a location string.

    Handles strings like 'Amsterdam, North Holland, Netherlands (NL)':
    an explicit parenthesized code wins; otherwise a known country name
    substring is mapped to its code.

    Args:
        location: Free-text location string (may be empty or None-ish).

    Returns:
        Upper-case ISO-3166-style code, or None if nothing matched.
    """
    if not location:
        return None

    # Prefer an explicit code in parentheses
    match = _COUNTRY_CODE_RE.search(location)
    if match:
        return match.group(1)

    # Fall back to matching a known country name anywhere in the string
    lower_loc = location.lower()
    for country, code in _COUNTRY_NAME_MAP.items():
        if country in lower_loc:
            return code

    return None


def parse_person_file(file_path: Path) -> Optional[dict]:
    """Parse a person JSON file and extract data for database insertion.

    Args:
        file_path: Path to a single person entity JSON file.

    Returns:
        A dict keyed exactly by PERSON_COLUMNS (JSONB fields pre-serialized
        with json.dumps), or None if the file is unreadable, malformed, or
        has no usable name.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, OSError):
        # Unreadable/malformed file: caller treats None as a parse failure.
        return None

    extraction_meta = data.get('extraction_metadata', {})
    source_staff = data.get('source_staff_info', {})
    profile_data = data.get('profile_data', {})
    heritage_rel = data.get('heritage_relevance', {})

    # Get staff_id (required); fall back to the filename prefix
    staff_id = extraction_meta.get('staff_id')
    if not staff_id:
        staff_id = file_path.stem.split('_')[0]

    # Get name (required) -- profile data wins over source staff info
    name = profile_data.get('name') or source_staff.get('name')
    if not name:
        return None

    # Extract location and country code
    location = profile_data.get('location', '')
    country_code = extract_country_code(location)

    # Parse extraction date ('Z' suffix normalized for fromisoformat)
    extraction_date = None
    date_str = extraction_meta.get('extraction_date')
    if date_str:
        try:
            extraction_date = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
        except ValueError:
            extraction_date = None

    # Build custodian slug from custodian name
    custodian_name = source_staff.get('custodian', '')
    custodian_slug = custodian_name.lower().replace(' ', '-') if custodian_name else None

    return {
        'staff_id': staff_id,
        'name': name,
        'headline': profile_data.get('headline') or source_staff.get('headline'),
        'location': location,
        'country_code': country_code,
        'custodian_slug': custodian_slug,
        'custodian_name': custodian_name,
        'linkedin_url': profile_data.get('linkedin_url') or extraction_meta.get('linkedin_url'),
        'profile_image_url': profile_data.get('profile_image_url'),
        'heritage_relevant': heritage_rel.get('is_heritage_relevant', True),
        # JSONB columns are passed as JSON strings
        'heritage_types': json.dumps(heritage_rel.get('heritage_types', [])),
        'experience': json.dumps(profile_data.get('experience', [])),
        'education': json.dumps(profile_data.get('education', [])),
        'skills': json.dumps(profile_data.get('skills', [])),
        'languages': json.dumps(profile_data.get('languages', [])),
        'about': profile_data.get('about'),
        'connections': profile_data.get('connections'),
        'extraction_date': extraction_date,
        'extraction_method': extraction_meta.get('extraction_method'),
        'source_file': str(file_path.relative_to(PROJECT_ROOT)),
    }


class PostgresPersonSyncer(BaseSyncer):
    """Sync person/staff JSON files to PostgreSQL database."""

    database_name = "postgres_persons"

    def __init__(
        self,
        db_config: dict | None = None,
        person_dir: Path = PERSON_DATA_DIR,
        batch_size: int = BATCH_SIZE,
        **kwargs,
    ):
        """Initialize the syncer.

        Args:
            db_config: Connection parameters; defaults to DATABASE_CONFIG.
            person_dir: Directory containing person entity JSON files.
            batch_size: Number of upserts between commits.
            **kwargs: Forwarded to BaseSyncer.
        """
        super().__init__(**kwargs)
        self.db_config = db_config or DATABASE_CONFIG
        self.person_dir = person_dir
        self.batch_size = batch_size
        self._conn = None

    def _connect(self, connect_timeout: Optional[int] = None):
        """Open a NEW psycopg2 connection from the configured credentials.

        Single source of truth for connection kwargs (previously duplicated
        across three methods).
        """
        import psycopg2
        kwargs: dict[str, Any] = {
            'host': self.db_config['host'],
            'port': self.db_config['port'],
            'database': self.db_config['database'],
            'user': self.db_config['user'],
            'password': self.db_config['password'],
        }
        if connect_timeout is not None:
            kwargs['connect_timeout'] = connect_timeout
        return psycopg2.connect(**kwargs)

    def _get_connection(self):
        """Get (and cache) the database connection used for syncing."""
        if self._conn is None:
            self._conn = self._connect()
        return self._conn

    def check_connection(self) -> bool:
        """Check if PostgreSQL is available.

        Returns:
            True if a short-timeout connection succeeds, False otherwise.
        """
        try:
            conn = self._connect(connect_timeout=5)
            conn.close()
            return True
        except ImportError:
            self.logger.error("psycopg2 not installed. Run: pip install psycopg2-binary")
            return False
        except Exception as e:
            self.logger.error(f"Connection check failed: {e}")
            return False

    def get_status(self) -> dict:
        """Get PostgreSQL persons table status.

        Returns:
            Dict with 'status' plus row count and heritage-type distribution
            when available; {'status': 'error', 'error': ...} on failure.
        """
        try:
            import psycopg2
            conn = self._connect(connect_timeout=5)
            cursor = conn.cursor()

            # Check if table exists and get count
            try:
                cursor.execute("SELECT COUNT(*) FROM persons")
                count = cursor.fetchone()[0]

                # Get heritage types distribution
                cursor.execute("""
                    SELECT heritage_types, COUNT(*)
                    FROM persons
                    GROUP BY heritage_types
                    ORDER BY COUNT(*) DESC
                    LIMIT 10
                """)
                type_dist = cursor.fetchall()

                # psycopg2 decodes JSONB arrays to Python lists, which are
                # unhashable and cannot be dict keys -- serialize each value
                # back to a JSON string for the report.
                heritage_types = {
                    json.dumps(types): n for types, n in type_dist
                } if type_dist else {}

                status = {
                    "status": "healthy",
                    "persons_count": count,
                    "heritage_types": heritage_types,
                }
            except psycopg2.errors.UndefinedTable:
                status = {
                    "status": "healthy",
                    "persons_count": 0,
                    "note": "Table does not exist yet",
                }

            cursor.close()
            conn.close()
            return status
        except Exception as e:
            return {"status": "error", "error": str(e)}

    def _ensure_table(self, conn):
        """Ensure the persons table (and its indexes) exists."""
        cursor = conn.cursor()
        cursor.execute(CREATE_PERSONS_TABLE)
        conn.commit()
        cursor.close()

    def _list_person_files(self) -> list[Path]:
        """List all person JSON files, sorted for deterministic ordering."""
        if not self.person_dir.exists():
            self.logger.error(f"Person directory not found: {self.person_dir}")
            return []
        return sorted(self.person_dir.glob("*.json"))

    def _upsert_person(self, cursor, person_data: dict) -> None:
        """Upsert one person inside a savepoint.

        PostgreSQL aborts the entire transaction when any statement fails;
        without a savepoint, one bad record would poison every following
        upsert in the batch (InFailedSqlTransaction). Rolling back to the
        savepoint keeps prior uncommitted upserts intact.
        """
        cursor.execute("SAVEPOINT person_upsert")
        try:
            cursor.execute(
                UPSERT_PERSON,
                tuple(person_data[col] for col in PERSON_COLUMNS),
            )
            cursor.execute("RELEASE SAVEPOINT person_upsert")
        except Exception:
            cursor.execute("ROLLBACK TO SAVEPOINT person_upsert")
            raise

    def sync(self, limit: Optional[int] = None, dry_run: bool = False) -> SyncResult:
        """Sync all person JSON files to PostgreSQL.

        Args:
            limit: If given, process at most this many files (0 means none).
            dry_run: Parse/count files but perform no database writes.

        Returns:
            SyncResult with per-record success/failure counts.
        """
        result = SyncResult(
            database="postgres_persons",
            status=SyncStatus.IN_PROGRESS,
            start_time=datetime.now(timezone.utc),
        )

        # Check connection
        if not dry_run and not self.check_connection():
            result.status = SyncStatus.FAILED
            result.error_message = "Cannot connect to PostgreSQL"
            result.end_time = datetime.now(timezone.utc)
            return result

        # Find all person files ('is not None' so --limit 0 means zero files)
        person_files = self._list_person_files()
        if limit is not None:
            person_files = person_files[:limit]

        self.progress.total_files = len(person_files)
        self.progress.current_database = "postgres_persons"
        result.records_processed = len(person_files)

        if not person_files:
            self.logger.warning("No person files found to sync")
            result.status = SyncStatus.SUCCESS
            result.end_time = datetime.now(timezone.utc)
            return result

        self.logger.info(f"Found {len(person_files)} person files to sync")

        if dry_run:
            self.logger.info(f"[DRY RUN] Would sync {len(person_files)} person files to PostgreSQL")
            result.status = SyncStatus.SUCCESS
            result.records_succeeded = len(person_files)
            result.end_time = datetime.now(timezone.utc)
            result.details["dry_run"] = True
            return result

        try:
            conn = self._get_connection()

            # Ensure table exists
            self._ensure_table(conn)

            cursor = conn.cursor()
            succeeded = 0
            failed = 0

            for i, file_path in enumerate(person_files):
                self.progress.processed_files = i + 1
                self.progress.current_file = file_path.name
                self._report_progress()

                try:
                    person_data = parse_person_file(file_path)
                    if not person_data:
                        self.logger.warning(f"Could not parse: {file_path.name}")
                        failed += 1
                        continue

                    # Insert/update person (savepoint-protected)
                    self._upsert_person(cursor, person_data)
                    succeeded += 1

                    # Commit in batches
                    if (i + 1) % self.batch_size == 0:
                        conn.commit()
                        self.logger.info(f"Committed batch: {i + 1}/{len(person_files)}")
                except Exception as e:
                    self.logger.error(f"Error processing {file_path.name}: {e}")
                    self.progress.errors.append(f"{file_path.name}: {e}")
                    failed += 1

            # Final commit
            conn.commit()
            cursor.close()

            result.records_succeeded = succeeded
            result.records_failed = failed
            result.status = SyncStatus.SUCCESS if failed == 0 else SyncStatus.PARTIAL
            self.logger.info(f"Sync complete: {succeeded} succeeded, {failed} failed")
        except Exception as e:
            self.logger.error(f"Sync failed: {e}")
            result.status = SyncStatus.FAILED
            result.error_message = str(e)
        finally:
            # Always release the cached connection, even on failure
            if self._conn:
                self._conn.close()
                self._conn = None

        result.end_time = datetime.now(timezone.utc)
        return result


def main():
    """CLI entry point: parse args, report status, run the sync."""
    parser = argparse.ArgumentParser(description="Sync person JSON files to PostgreSQL")
    parser.add_argument("--dry-run", action="store_true", help="Parse files but don't upload")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    args = parser.parse_args()

    import logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    syncer = PostgresPersonSyncer()

    print("=" * 60)
    print("PostgreSQL Person Sync")
    print("=" * 60)

    if not args.dry_run:
        print("Checking connection...")
        status = syncer.get_status()
        print(f"  Status: {status.get('status', 'unknown')}")
        if status.get('persons_count'):
            print(f"  Current persons: {status['persons_count']}")

    result = syncer.sync(limit=args.limit, dry_run=args.dry_run)

    print("\n" + "=" * 60)
    print(f"Sync Result: {result.status.value.upper()}")
    print(f"  Processed: {result.records_processed}")
    print(f"  Succeeded: {result.records_succeeded}")
    print(f"  Failed: {result.records_failed}")
    print(f"  Duration: {result.duration_seconds:.2f}s")
    if result.error_message:
        print(f"  Error: {result.error_message}")
    print("=" * 60)


if __name__ == "__main__":
    main()