#!/usr/bin/env python3
"""
Import person data from entity and staff files into PostgreSQL persons table.

This script reads:
1. Entity files from data/custodian/person/entity/ (primary - rich profile data)
2. Staff files from data/custodian/person/affiliated/parsed/ (secondary - custodian associations)

And imports them into the glam_geo.persons table.

Usage:
    python scripts/import_persons_to_postgres.py [--dry-run] [--remote]
"""

import json
import os
import sys
import argparse
import re
from pathlib import Path
from typing import Dict, List, Any, Optional

import psycopg2
from psycopg2.extras import execute_values

# Configuration
ENTITY_DIR = Path("data/custodian/person/entity")
PARSED_DIR = Path("data/custodian/person/affiliated/parsed")

# For remote execution
REMOTE_ENTITY_DIR = Path("/mnt/data/custodian/person/entity")
REMOTE_PARSED_DIR = Path("/mnt/data/custodian/person/affiliated/parsed")

DB_CONFIG = {
    "host": os.getenv("GEO_POSTGRES_HOST", "localhost"),
    "port": int(os.getenv("GEO_POSTGRES_PORT", "5432")),
    "database": os.getenv("GEO_POSTGRES_DB", "glam_geo"),
    "user": os.getenv("GEO_POSTGRES_USER", "glam_api"),
    "password": os.getenv("GEO_POSTGRES_PASSWORD", "glam_secret_2025"),
}

# Cached custodian types from database (lowercase name -> type code).
_custodian_type_cache: Dict[str, str] = {}
_custodian_cache_loaded: bool = False


def load_custodian_types_from_db() -> Dict[str, str]:
    """
    Load custodian types from the database custodians table.

    This provides AUTHORITATIVE heritage types based on verified
    institutional data, not heuristics based on name patterns or
    headline keywords.

    Returns a dict mapping lowercase custodian names to their type code
    (M, A, L, etc.). The result is cached module-wide; on failure the
    (possibly empty) cache is still marked loaded so we don't retry on
    every lookup.
    """
    global _custodian_type_cache, _custodian_cache_loaded

    if _custodian_cache_loaded:
        return _custodian_type_cache

    try:
        conn = psycopg2.connect(**DB_CONFIG)
        try:
            cur = conn.cursor()
            # Get all custodians with verified types
            cur.execute("""
                SELECT DISTINCT LOWER(name), type
                FROM custodians
                WHERE type IS NOT NULL
                AND name IS NOT NULL
            """)
            for row in cur.fetchall():
                name_lower, type_code = row
                if name_lower and type_code:
                    _custodian_type_cache[name_lower] = type_code.strip()
            cur.close()
        finally:
            # Always release the connection, even if the query fails
            # (the original leaked it on the error path).
            conn.close()
        print(f"  Loaded {len(_custodian_type_cache)} custodian types from database")
    except Exception as e:
        print(f"  Warning: Could not load custodian types from database: {e}")
        print("  Heritage types will be NULL for persons without verified custodian data")

    _custodian_cache_loaded = True  # Don't retry
    return _custodian_type_cache


def get_custodian_type_from_db(custodian_name: Optional[str]) -> Optional[str]:
    """
    Look up the authoritative heritage type for a custodian from the database.

    This ensures that people at "Van Gogh Museum" are tagged as 'M' (Museum)
    based on verified institutional data, not heuristics.

    Returns None if the custodian is not found in the database - we do NOT
    guess or infer types from name patterns. No data is better than wrong data.
    """
    if not custodian_name:
        return None

    cache = load_custodian_types_from_db()

    # Try exact match (lowercase)
    name_lower = custodian_name.lower().strip()
    if name_lower in cache:
        return cache[name_lower]

    # Try partial match for common variations
    # e.g., "Van Gogh Museum" should match "Van Gogh Museum Amsterdam"
    for db_name, type_code in cache.items():
        if name_lower in db_name or db_name in name_lower:
            return type_code

    # No match found - return None (don't guess!)
    return None


def extract_country_from_location(location: Optional[str]) -> Optional[str]:
    """Extract ISO country code from location string.

    Accepts a plain string or a dict with city/region/country keys.
    Matching is substring-based and ordered, so earlier entries win.
    NOTE(review): substring patterns are fragile (e.g. "uk" also occurs
    in "ukraine") — acceptable for this dataset, but keep in mind when
    adding patterns. Falls back to an explicit "(XX)" code in the text.
    """
    if not location:
        return None

    # Handle dict locations
    if isinstance(location, dict):
        location = f"{location.get('city', '')} {location.get('region', '')} {location.get('country', '')}"

    if not isinstance(location, str):
        return None

    location_lower = location.lower()

    # Country mappings
    country_patterns = {
        "NL": ["netherlands", "nederland", "amsterdam", "rotterdam", "den haag",
               "utrecht", "noord-holland", "zuid-holland", "gelderland", "brabant",
               "limburg", "overijssel", "friesland", "groningen", "drenthe",
               "zeeland", "flevoland"],
        "BE": ["belgium", "belgië", "belgique", "brussels", "bruxelles",
               "flanders", "wallonia", "antwerp", "ghent", "bruges"],
        "ID": ["indonesia", "jakarta", "bandung", "surabaya", "aceh", "java", "sumatra"],
        "US": ["united states", "usa", "california", "new york", "washington", "texas"],
        "GB": ["united kingdom", "uk", "england", "london", "scotland", "wales"],
        "DE": ["germany", "deutschland", "berlin", "munich", "frankfurt"],
        "FR": ["france", "paris", "lyon", "marseille"],
        "IL": ["israel", "tel aviv", "jerusalem"],
        "MA": ["morocco", "rabat", "casablanca"],
        "PH": ["philippines", "manila"],
        "IN": ["india", " india", "mumbai", "delhi", "bangalore"],
        "AU": ["australia", "sydney", "melbourne"],
        "CA": ["canada", "toronto", "vancouver"],
        "JP": ["japan", "tokyo", "osaka"],
        "SG": ["singapore"],
        "HK": ["hong kong"],
        "TW": ["taiwan", "taipei"],
        "KR": ["korea", "seoul"],
        "CN": ["china", "beijing", "shanghai"],
        "BR": ["brazil", "são paulo", "rio"],
        "MX": ["mexico", "mexico city"],
        "ZA": ["south africa", "cape town", "johannesburg"],
        "PS": ["palestine", "gaza", "west bank"],
    }

    for code, patterns in country_patterns.items():
        for pattern in patterns:
            if pattern in location_lower:
                return code

    # Check for (XX) country code pattern
    match = re.search(r'\(([A-Z]{2})\)', location)
    if match:
        return match.group(1)

    return None


def load_entity_files(entity_dir: Path) -> Dict[str, Dict[str, Any]]:
    """Load all entity JSON files, keyed by LinkedIn slug.

    Files without a discoverable LinkedIn URL are skipped; unreadable
    files are reported and skipped (best-effort load).
    """
    entities = {}

    for json_file in entity_dir.glob("*.json"):
        # Skip non-entity files
        if json_file.name.startswith(("extraction_log", "batch_", ".")):
            continue
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Extract LinkedIn slug from filename or data
            linkedin_url = data.get("extraction_metadata", {}).get("linkedin_url", "")
            if not linkedin_url:
                linkedin_url = data.get("profile_data", {}).get("linkedin_url", "")

            if linkedin_url:
                # Extract slug from URL
                slug = linkedin_url.rstrip("/").split("/")[-1]
                if slug and slug != "in":
                    entities[slug] = {
                        "file": json_file.name,
                        "data": data
                    }
        except Exception as e:
            # Covers json.JSONDecodeError and any other read error.
            print(f"  Warning: Error loading {json_file.name}: {e}")
            continue

    return entities


def load_staff_files(parsed_dir: Path) -> List[Dict[str, Any]]:
    """Load all parsed staff JSON files.

    Returns a flat list of person dicts, one per staff entry, annotated
    with the custodian (institution) that listed them. Entries without a
    staff_id, anonymous names, and the custodian itself are skipped.
    """
    all_staff = []

    for json_file in parsed_dir.glob("*_staff_*.json"):
        # Skip batch result files and analysis files
        if any(skip in json_file.name for skip in ["batch_results", "cross_custodian", "missing_entity"]):
            continue
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            custodian_meta = data.get("custodian_metadata", {})
            custodian_name = custodian_meta.get("custodian_name", "")
            custodian_slug = custodian_meta.get("custodian_slug", "")
            location = custodian_meta.get("location", {})
            city = location.get("city", "")
            region = location.get("region", "")

            for staff in data.get("staff", []):
                # Skip organization entries (e.g., company itself listed as staff)
                if staff.get("name") == custodian_name:
                    continue
                # Skip anonymous names
                if staff.get("name_type") == "anonymous":
                    continue

                # Get LinkedIn slug
                linkedin_url = staff.get("linkedin_profile_url", "")
                linkedin_slug = staff.get("linkedin_slug", "")
                if not linkedin_slug and linkedin_url:
                    linkedin_slug = linkedin_url.rstrip("/").split("/")[-1]

                person = {
                    "staff_id": staff.get("staff_id"),
                    "name": staff.get("name"),
                    "headline": staff.get("headline", ""),
                    "location": f"{city}, {region}" if city and region else (city or region or None),
                    "custodian_slug": custodian_slug,
                    "custodian_name": custodian_name,
                    "linkedin_url": linkedin_url,
                    "linkedin_slug": linkedin_slug,
                    "heritage_relevant": staff.get("heritage_relevant", False),
                    "heritage_type": staff.get("heritage_type"),
                }
                if person["staff_id"]:
                    all_staff.append(person)
        except Exception as e:
            # Covers json.JSONDecodeError and any other read error.
            print(f"  Warning: Error loading {json_file.name}: {e}")
            continue

    return all_staff


def merge_data(entities: Dict[str, Dict], staff_list: List[Dict]) -> List[Dict]:
    """Merge entity data with staff data.

    Entity profiles are the primary source (rich fields); staff entries
    supply custodian associations. Staff entries without a matching
    entity file become minimal person records. Result is keyed (and
    de-duplicated) by staff_id.
    """
    persons = {}

    # First, index staff by LinkedIn slug
    staff_by_slug = {}
    for staff in staff_list:
        slug = staff.get("linkedin_slug")
        if slug:
            if slug not in staff_by_slug:
                staff_by_slug[slug] = []
            staff_by_slug[slug].append(staff)

    # Process entities first (they have richer data)
    for slug, entity_info in entities.items():
        data = entity_info["data"]
        meta = data.get("extraction_metadata", {})
        profile = data.get("profile_data", {})

        # Get name
        name = profile.get("name") or profile.get("full_name")
        if not name:
            continue

        # Get location
        location = profile.get("location")
        # Handle dict locations
        if isinstance(location, dict):
            location = f"{location.get('city', '')} {location.get('region', '')} {location.get('country', '')}".strip()
        if not location:
            # Try to build from city/region
            city = profile.get("city", "")
            region = profile.get("region", "")
            if isinstance(city, dict):
                city = city.get("name", "")
            if isinstance(region, dict):
                region = region.get("name", "")
            location = f"{city}, {region}" if city and region else (city or region or None)

        # Get country code
        country_code = extract_country_from_location(location)

        # Get headline
        headline = profile.get("headline") or profile.get("current_position")
        if isinstance(headline, dict):
            headline = headline.get("title", "")

        # Get staff associations from staff_by_slug
        staff_entries = staff_by_slug.get(slug, [])
        custodian_slug = None
        custodian_name = None
        heritage_relevant = False
        heritage_types = []
        staff_id = meta.get("staff_id")

        if staff_entries:
            # Use first staff entry for custodian info
            first_staff = staff_entries[0]
            custodian_slug = first_staff.get("custodian_slug")
            custodian_name = first_staff.get("custodian_name")
            staff_id = staff_id or first_staff.get("staff_id")
            # Aggregate heritage info from all entries
            for se in staff_entries:
                if se.get("heritage_relevant"):
                    heritage_relevant = True
                ht = se.get("heritage_type")
                if ht and ht not in heritage_types:
                    heritage_types.append(ht)

        # Look up authoritative heritage_type from custodians database table
        # This ensures Van Gogh Museum staff are tagged as 'M' not 'E' or 'A'
        # We do NOT use heuristics - only verified institutional data
        custodian_type = get_custodian_type_from_db(custodian_name)
        if custodian_type:
            heritage_relevant = True
            # Replace first heritage_type with custodian-inferred type, or insert at front
            if heritage_types and heritage_types[0] != custodian_type:
                # Only keep the custodian type - headline-based types were often wrong
                heritage_types = [custodian_type]
            elif not heritage_types:
                heritage_types = [custodian_type]

        # Generate staff_id if missing
        if not staff_id:
            name_slug = re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')
            staff_id = f"entity_{name_slug}_{slug}"

        # Get profile image
        profile_image_url = None
        if "exa_raw_response" in data:
            results = data["exa_raw_response"].get("results", [])
            if results:
                profile_image_url = results[0].get("image")
        if not profile_image_url:
            profile_image_url = profile.get("profile_image_url")

        # Truncate location for the DB column (original had two identical
        # branches here - both computed str(location)[:200]).
        location_str = str(location)[:200] if location else None

        # Extract rich profile data from entity files
        about = profile.get("about")
        if about and isinstance(about, str):
            about = about[:5000]  # Truncate very long about sections

        experience = profile.get("experience")
        if experience and isinstance(experience, list):
            # Ensure it's JSON-serializable
            experience = json.dumps(experience)
        else:
            experience = None

        education = profile.get("education")
        if education and isinstance(education, list):
            education = json.dumps(education)
        else:
            education = None

        skills = profile.get("skills")
        if skills and isinstance(skills, list):
            # Ensure all items are strings
            skills = [str(s) for s in skills if s]
        else:
            skills = None

        languages = profile.get("languages")
        if languages and isinstance(languages, list):
            # Languages can be strings or dicts - convert dicts to JSON strings
            processed_languages = []
            for l in languages:
                if l:
                    if isinstance(l, dict):
                        processed_languages.append(json.dumps(l))
                    else:
                        processed_languages.append(str(l))
            languages = processed_languages if processed_languages else None
        else:
            languages = None

        # Coerce connections to a truncated string (or None).
        connections = profile.get("connections")
        connections = str(connections)[:200] if connections else None

        # Extraction metadata
        extraction_date = meta.get("extraction_date")
        extraction_method = meta.get("extraction_method")
        source_file = entity_info.get("file") or meta.get("source_file")

        persons[staff_id] = {
            "staff_id": staff_id,
            "name": name,
            "headline": headline[:500] if headline and isinstance(headline, str) else None,
            "location": location_str,
            "country_code": country_code,
            "custodian_slug": custodian_slug,
            "custodian_name": custodian_name,
            "linkedin_url": meta.get("linkedin_url") or profile.get("linkedin_url"),
            "profile_image_url": profile_image_url,
            "heritage_relevant": heritage_relevant,
            "heritage_types": heritage_types if heritage_types else None,
            # New rich profile fields
            "about": about,
            "experience": experience,
            "education": education,
            "skills": skills,
            "languages": languages,
            "connections": connections,
            "extraction_date": extraction_date,
            "extraction_method": extraction_method,
            "source_file": source_file,
        }

    # Then add staff entries that don't have entity files
    for staff in staff_list:
        slug = staff.get("linkedin_slug")
        staff_id = staff.get("staff_id")

        # Skip if we already have this person from entity
        if slug and slug in entities:
            continue
        # Skip if we already have this staff_id
        if staff_id in persons:
            continue
        if not staff_id:
            continue

        # Get country code
        country_code = extract_country_from_location(staff.get("location"))

        heritage_types = []
        if staff.get("heritage_type"):
            heritage_types.append(staff["heritage_type"])

        persons[staff_id] = {
            "staff_id": staff_id,
            "name": staff.get("name"),
            "headline": staff.get("headline", "")[:500] if staff.get("headline") else None,
            "location": staff.get("location"),
            "country_code": country_code,
            "custodian_slug": staff.get("custodian_slug"),
            "custodian_name": staff.get("custodian_name"),
            "linkedin_url": staff.get("linkedin_url"),
            "profile_image_url": None,
            "heritage_relevant": staff.get("heritage_relevant", False),
            "heritage_types": heritage_types if heritage_types else None,
            # Staff-only entries don't have rich profile data
            "about": None,
            "experience": None,
            "education": None,
            "skills": None,
            "languages": None,
            "connections": None,
            "extraction_date": None,
            "extraction_method": None,
            "source_file": None,
        }

    return list(persons.values())


def import_to_postgres(persons: List[Dict], dry_run: bool = False) -> int:
    """Import person data into PostgreSQL.

    Truncates the persons table and bulk-inserts all records (the
    ON CONFLICT clause is a safety net for duplicate staff_ids within
    the batch itself). Returns the number of rows imported, or the
    would-be count in dry-run mode.
    """
    if dry_run:
        print(f"\n[DRY RUN] Would import {len(persons)} persons")
        # Show sample
        for person in persons[:10]:
            img = "📸" if person.get("profile_image_url") else "  "
            hr = "🏛️" if person.get("heritage_relevant") else "  "
            print(f"  {img}{hr} {person['name'][:40]:<40} | {(person.get('custodian_name') or 'No custodian')[:35]}")
        return len(persons)

    # Connect to database
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()

    try:
        # Clear existing data
        cur.execute("TRUNCATE TABLE persons")

        # Prepare data for batch insert
        columns = [
            "staff_id", "name", "headline", "location", "country_code",
            "custodian_slug", "custodian_name", "linkedin_url",
            "profile_image_url", "heritage_relevant", "heritage_types",
            # New rich profile columns
            "about", "experience", "education", "skills", "languages",
            "connections", "extraction_date", "extraction_method", "source_file"
        ]

        values = [
            (
                p["staff_id"],
                p["name"],
                p["headline"],
                p["location"],
                p["country_code"],
                p["custodian_slug"],
                p["custodian_name"],
                p["linkedin_url"],
                p["profile_image_url"],
                p["heritage_relevant"],
                p["heritage_types"],
                # New rich profile values
                p.get("about"),
                p.get("experience"),
                p.get("education"),
                p.get("skills"),
                p.get("languages"),
                p.get("connections"),
                p.get("extraction_date"),
                p.get("extraction_method"),
                p.get("source_file"),
            )
            for p in persons
        ]

        # Batch insert
        insert_query = f"""
            INSERT INTO persons ({', '.join(columns)})
            VALUES %s
            ON CONFLICT (staff_id) DO UPDATE SET
                name = EXCLUDED.name,
                headline = EXCLUDED.headline,
                location = EXCLUDED.location,
                country_code = EXCLUDED.country_code,
                custodian_slug = EXCLUDED.custodian_slug,
                custodian_name = EXCLUDED.custodian_name,
                linkedin_url = EXCLUDED.linkedin_url,
                profile_image_url = EXCLUDED.profile_image_url,
                heritage_relevant = EXCLUDED.heritage_relevant,
                heritage_types = EXCLUDED.heritage_types,
                about = EXCLUDED.about,
                experience = EXCLUDED.experience,
                education = EXCLUDED.education,
                skills = EXCLUDED.skills,
                languages = EXCLUDED.languages,
                connections = EXCLUDED.connections,
                extraction_date = EXCLUDED.extraction_date,
                extraction_method = EXCLUDED.extraction_method,
                source_file = EXCLUDED.source_file,
                updated_at = CURRENT_TIMESTAMP
        """
        execute_values(cur, insert_query, values, page_size=1000)
        conn.commit()

        # Get final counts
        cur.execute("SELECT COUNT(*) FROM persons")
        total = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM persons WHERE heritage_relevant = true")
        heritage_relevant = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM persons WHERE profile_image_url IS NOT NULL")
        with_image = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM persons WHERE about IS NOT NULL")
        with_about = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM persons WHERE experience IS NOT NULL")
        with_experience = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM persons WHERE education IS NOT NULL")
        with_education = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM persons WHERE skills IS NOT NULL")
        with_skills = cur.fetchone()[0]

        print(f"\n✓ Imported {total} persons")
        print(f"  - Heritage-relevant: {heritage_relevant}")
        print(f"  - With profile image: {with_image}")
        print(f"  - With about section: {with_about}")
        print(f"  - With experience: {with_experience}")
        print(f"  - With education: {with_education}")
        print(f"  - With skills: {with_skills}")

        return total

    except Exception as e:
        conn.rollback()
        print(f"Error importing data: {e}")
        raise
    finally:
        cur.close()
        conn.close()


def main():
    parser = argparse.ArgumentParser(description="Import persons to PostgreSQL")
    parser.add_argument("--dry-run", action="store_true", help="Don't actually import")
    parser.add_argument("--remote", action="store_true", help="Use remote paths (for server execution)")
    args = parser.parse_args()

    # Set paths
    entity_dir = REMOTE_ENTITY_DIR if args.remote else ENTITY_DIR
    parsed_dir = REMOTE_PARSED_DIR if args.remote else PARSED_DIR

    print(f"Loading entity files from {entity_dir}...")
    entities = load_entity_files(entity_dir)
    print(f"  Loaded {len(entities)} entity profiles")

    print(f"\nLoading staff files from {parsed_dir}...")
    staff = load_staff_files(parsed_dir)
    print(f"  Loaded {len(staff)} staff entries")

    print("\nMerging data...")
    persons = merge_data(entities, staff)
    print(f"  Total unique persons: {len(persons)}")

    # Stats
    heritage_relevant = sum(1 for p in persons if p["heritage_relevant"])
    with_linkedin = sum(1 for p in persons if p["linkedin_url"])
    with_image = sum(1 for p in persons if p["profile_image_url"])
    with_custodian = sum(1 for p in persons if p["custodian_name"])

    by_country = {}
    for p in persons:
        cc = p["country_code"] or "Unknown"
        by_country[cc] = by_country.get(cc, 0) + 1

    print("\nStats:")
    print(f"  Heritage-relevant: {heritage_relevant}")
    print(f"  With LinkedIn URL: {with_linkedin}")
    print(f"  With profile image: {with_image}")
    print(f"  With custodian link: {with_custodian}")
    print(f"  By country (top 10):")
    for cc, count in sorted(by_country.items(), key=lambda x: -x[1])[:10]:
        print(f"    {cc}: {count}")

    import_to_postgres(persons, dry_run=args.dry_run)


if __name__ == "__main__":
    main()