#!/usr/bin/env python3
"""
Load NDE institution data into PostgreSQL

This script:
1. Creates the institutions table with proper schema
2. Loads data from nde_institutions.json
3. Creates useful indexes for querying

Usage:
    python load_nde_data.py [--json-file PATH] [--drop-existing]
"""

import argparse
import asyncio
import json
import os
from pathlib import Path
from typing import Any, Dict, List

import asyncpg

# Default configuration
DEFAULT_JSON_FILE = "/var/www/glam-frontend/data/nde_institutions.json"
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "glam")
POSTGRES_USER = os.getenv("POSTGRES_USER", "glam_api")
# NOTE(review): hard-coded fallback credential checked into source — prefer
# failing fast when POSTGRES_PASSWORD is unset in production deployments.
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "glam_secret_2025")

CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS institutions (
    id SERIAL PRIMARY KEY,

    -- Core identity
    name TEXT NOT NULL,
    verified_name TEXT,
    name_source TEXT,

    -- Location
    lat DOUBLE PRECISION,
    lon DOUBLE PRECISION,
    city TEXT,
    province TEXT,
    address TEXT,

    -- Classification
    type CHAR(1),
    type_name TEXT,
    color VARCHAR(10),

    -- External IDs
    wikidata_id TEXT,
    google_place_id TEXT,

    -- GHCID (Global Heritage Custodian ID)
    ghcid TEXT,
    ghcid_uuid UUID,
    ghcid_numeric NUMERIC(20),

    -- Metadata
    website TEXT,
    description TEXT,
    phone TEXT,

    -- Ratings (from Google Maps)
    rating REAL,
    total_ratings INTEGER,
    business_status TEXT,

    -- Founding information
    founding_year INTEGER,
    founding_decade INTEGER,

    -- Wikidata types (as JSON array)
    wikidata_types JSONB,

    -- Reviews (as JSON array)
    reviews JSONB,

    -- Full identifiers (as JSON array)
    identifiers JSONB,

    -- Genealogie werkbalk data (as JSON object)
    genealogiewerkbalk JSONB,

    -- Street view URL
    street_view_url TEXT,

    -- Timestamps
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- Create indexes for common queries
CREATE INDEX IF NOT EXISTS idx_institutions_type ON institutions(type);
CREATE INDEX IF NOT EXISTS idx_institutions_province ON institutions(province);
CREATE INDEX IF NOT EXISTS idx_institutions_city ON institutions(city);
CREATE INDEX IF NOT EXISTS idx_institutions_ghcid ON institutions(ghcid);
CREATE INDEX IF NOT EXISTS idx_institutions_wikidata_id ON institutions(wikidata_id);
CREATE INDEX IF NOT EXISTS idx_institutions_rating ON institutions(rating);

-- Create index for full-text search on name
CREATE INDEX IF NOT EXISTS idx_institutions_name_gin ON institutions
    USING GIN (to_tsvector('simple', name));
"""

# Separate spatial SQL - only run if PostGIS is available
SPATIAL_INDEX_SQL = """
-- Create a spatial index (requires PostGIS)
CREATE INDEX IF NOT EXISTS idx_institutions_location ON institutions
    USING GIST (
        ST_SetSRID(ST_MakePoint(lon, lat), 4326)
    ) WHERE lat IS NOT NULL AND lon IS NOT NULL;
"""

INSERT_SQL = """
INSERT INTO institutions (
    name, verified_name, name_source,
    lat, lon, city, province, address,
    type, type_name, color,
    wikidata_id, google_place_id,
    ghcid, ghcid_uuid, ghcid_numeric,
    website, description, phone,
    rating, total_ratings, business_status,
    founding_year, founding_decade,
    wikidata_types, reviews, identifiers,
    genealogiewerkbalk, street_view_url
) VALUES (
    $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
    $11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
    $21, $22, $23, $24, $25, $26, $27, $28, $29
)
"""


def extract_values(inst: Dict[str, Any]) -> tuple:
    """Extract values from institution dict, in INSERT_SQL parameter order.

    Empty strings in the GHCID sub-dict are normalized to NULL; JSON-typed
    columns are serialized to strings for asyncpg's default JSONB codec.
    """
    # `or {}` (not a default arg) so an explicit `"ghcid": null` in the
    # JSON does not crash with AttributeError on .get().
    ghcid_data = inst.get("ghcid") or {}
    return (
        inst.get("name"),
        inst.get("verified_name"),
        inst.get("name_source"),
        inst.get("lat"),
        inst.get("lon"),
        inst.get("city"),
        inst.get("province"),
        inst.get("address"),
        inst.get("type"),
        inst.get("type_name"),
        inst.get("color"),
        inst.get("wikidata_id"),
        inst.get("google_place_id"),
        # `or None` maps empty strings to NULL (UUID/NUMERIC columns
        # reject "" outright).
        ghcid_data.get("current") or None,
        ghcid_data.get("uuid") or None,
        ghcid_data.get("numeric") or None,
        inst.get("website"),
        inst.get("description"),
        inst.get("phone"),
        inst.get("rating"),
        inst.get("total_ratings"),
        inst.get("business_status"),
        inst.get("founding_year"),
        inst.get("founding_decade"),
        json.dumps(inst.get("wikidata_types", [])),
        json.dumps(inst.get("reviews", [])),
        json.dumps(inst.get("identifiers", [])),
        json.dumps(inst["genealogiewerkbalk"]) if inst.get("genealogiewerkbalk") else None,
        inst.get("street_view_url"),
    )


async def load_data(json_file: str, drop_existing: bool = False) -> None:
    """Create the institutions table and bulk-load it from a JSON file.

    Args:
        json_file: Path to the nde_institutions.json array.
        drop_existing: If True, drop and recreate the table first.
    """
    print(f"Connecting to PostgreSQL at {POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}...")
    conn = await asyncpg.connect(
        host=POSTGRES_HOST,
        port=POSTGRES_PORT,
        database=POSTGRES_DB,
        user=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
    )
    try:
        if drop_existing:
            print("Dropping existing institutions table...")
            await conn.execute("DROP TABLE IF EXISTS institutions CASCADE")

        print("Creating institutions table...")
        # Execute table + index statements one at a time so the spatial
        # index can be skipped gracefully when PostGIS is not installed.
        # BUG FIX: SPATIAL_INDEX_SQL was previously defined but never run,
        # so the GIST index was never created even with PostGIS present.
        statements = (CREATE_TABLE_SQL + SPATIAL_INDEX_SQL).split(";")
        for stmt in statements:
            stmt = stmt.strip()
            if not stmt:
                continue
            try:
                await conn.execute(stmt)
            except Exception as e:
                if "ST_SetSRID" in stmt or "ST_MakePoint" in stmt:
                    print(f"  Note: Spatial index skipped (PostGIS not installed): {e}")
                else:
                    raise

        print(f"Loading data from {json_file}...")
        with open(json_file, 'r', encoding='utf-8') as f:
            institutions = json.load(f)
        print(f"Found {len(institutions)} institutions")

        # Insert in batches; executemany runs each batch in one round trip.
        batch_size = 100
        inserted = 0
        for i in range(0, len(institutions), batch_size):
            batch = institutions[i:i + batch_size]
            values = [extract_values(inst) for inst in batch]
            await conn.executemany(INSERT_SQL, values)
            inserted += len(batch)
            if inserted % 500 == 0 or inserted == len(institutions):
                print(f"  Inserted {inserted}/{len(institutions)} institutions...")

        # Get final count
        count = await conn.fetchval("SELECT COUNT(*) FROM institutions")
        print(f"\nSuccess! Loaded {count} institutions into PostgreSQL.")

        # Show sample
        print("\nSample data:")
        sample = await conn.fetch("""
            SELECT name, city, province, type_name, rating
            FROM institutions
            WHERE rating IS NOT NULL
            ORDER BY rating DESC
            LIMIT 5
        """)
        for row in sample:
            print(f"  - {row['name']} ({row['city']}, {row['province']}) - "
                  f"{row['type_name']}, Rating: {row['rating']}")
    finally:
        # Always release the connection, even on load failure.
        await conn.close()


def main() -> None:
    """Parse CLI arguments and run the async loader."""
    parser = argparse.ArgumentParser(description="Load NDE institution data into PostgreSQL")
    parser.add_argument(
        "--json-file",
        default=DEFAULT_JSON_FILE,
        help=f"Path to nde_institutions.json (default: {DEFAULT_JSON_FILE})"
    )
    parser.add_argument(
        "--drop-existing",
        action="store_true",
        help="Drop existing table before loading"
    )
    args = parser.parse_args()
    asyncio.run(load_data(args.json_file, args.drop_existing))


if __name__ == "__main__":
    main()