274 lines
8.1 KiB
Python
274 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Load NDE institution data into PostgreSQL
|
|
|
|
This script:
|
|
1. Creates the institutions table with proper schema
|
|
2. Loads data from nde_institutions.json
|
|
3. Creates useful indexes for querying
|
|
|
|
Usage:
|
|
python load_nde_data.py [--json-file PATH] [--drop-existing]
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List
|
|
|
|
import asyncpg
|
|
|
|
|
|
# Default configuration.
# All connection settings are overridable via the environment; the literals
# below are developer-friendly fallbacks for local runs.
DEFAULT_JSON_FILE = "/var/www/glam-frontend/data/nde_institutions.json"
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "glam")
POSTGRES_USER = os.getenv("POSTGRES_USER", "glam_api")
# NOTE(review): hard-coded fallback credential — acceptable only for local
# development; production deployments must supply POSTGRES_PASSWORD via the
# environment rather than rely on this default.
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "glam_secret_2025")
|
|
|
|
|
|
CREATE_TABLE_SQL = """
|
|
CREATE TABLE IF NOT EXISTS institutions (
|
|
id SERIAL PRIMARY KEY,
|
|
|
|
-- Core identity
|
|
name TEXT NOT NULL,
|
|
verified_name TEXT,
|
|
name_source TEXT,
|
|
|
|
-- Location
|
|
lat DOUBLE PRECISION,
|
|
lon DOUBLE PRECISION,
|
|
city TEXT,
|
|
province TEXT,
|
|
address TEXT,
|
|
|
|
-- Classification
|
|
type CHAR(1),
|
|
type_name TEXT,
|
|
color VARCHAR(10),
|
|
|
|
-- External IDs
|
|
wikidata_id TEXT,
|
|
google_place_id TEXT,
|
|
|
|
-- GHCID (Global Heritage Custodian ID)
|
|
ghcid TEXT,
|
|
ghcid_uuid UUID,
|
|
ghcid_numeric NUMERIC(20),
|
|
|
|
-- Metadata
|
|
website TEXT,
|
|
description TEXT,
|
|
phone TEXT,
|
|
|
|
-- Ratings (from Google Maps)
|
|
rating REAL,
|
|
total_ratings INTEGER,
|
|
business_status TEXT,
|
|
|
|
-- Founding information
|
|
founding_year INTEGER,
|
|
founding_decade INTEGER,
|
|
|
|
-- Wikidata types (as JSON array)
|
|
wikidata_types JSONB,
|
|
|
|
-- Reviews (as JSON array)
|
|
reviews JSONB,
|
|
|
|
-- Full identifiers (as JSON array)
|
|
identifiers JSONB,
|
|
|
|
-- Genealogie werkbalk data (as JSON object)
|
|
genealogiewerkbalk JSONB,
|
|
|
|
-- Street view URL
|
|
street_view_url TEXT,
|
|
|
|
-- Timestamps
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
|
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
|
|
);
|
|
|
|
-- Create indexes for common queries
|
|
CREATE INDEX IF NOT EXISTS idx_institutions_type ON institutions(type);
|
|
CREATE INDEX IF NOT EXISTS idx_institutions_province ON institutions(province);
|
|
CREATE INDEX IF NOT EXISTS idx_institutions_city ON institutions(city);
|
|
CREATE INDEX IF NOT EXISTS idx_institutions_ghcid ON institutions(ghcid);
|
|
CREATE INDEX IF NOT EXISTS idx_institutions_wikidata_id ON institutions(wikidata_id);
|
|
CREATE INDEX IF NOT EXISTS idx_institutions_rating ON institutions(rating);
|
|
|
|
-- Create index for full-text search on name
|
|
CREATE INDEX IF NOT EXISTS idx_institutions_name_gin ON institutions USING GIN (to_tsvector('simple', name));
|
|
"""
|
|
|
|
# Separate spatial SQL - only run if PostGIS is available.
# NOTE(review): as of this file, no code visible here actually executes
# SPATIAL_INDEX_SQL — confirm the spatial index is created elsewhere, or wire
# this constant into load_data() behind a try/except.
SPATIAL_INDEX_SQL = """
-- Create a spatial index (requires PostGIS)
CREATE INDEX IF NOT EXISTS idx_institutions_location ON institutions USING GIST (
    ST_SetSRID(ST_MakePoint(lon, lat), 4326)
) WHERE lat IS NOT NULL AND lon IS NOT NULL;
"""
|
|
|
|
|
|
INSERT_SQL = """
|
|
INSERT INTO institutions (
|
|
name, verified_name, name_source,
|
|
lat, lon, city, province, address,
|
|
type, type_name, color,
|
|
wikidata_id, google_place_id,
|
|
ghcid, ghcid_uuid, ghcid_numeric,
|
|
website, description, phone,
|
|
rating, total_ratings, business_status,
|
|
founding_year, founding_decade,
|
|
wikidata_types, reviews, identifiers, genealogiewerkbalk,
|
|
street_view_url
|
|
) VALUES (
|
|
$1, $2, $3,
|
|
$4, $5, $6, $7, $8,
|
|
$9, $10, $11,
|
|
$12, $13,
|
|
$14, $15, $16,
|
|
$17, $18, $19,
|
|
$20, $21, $22,
|
|
$23, $24,
|
|
$25, $26, $27, $28,
|
|
$29
|
|
)
|
|
"""
|
|
|
|
|
|
def extract_values(inst: Dict[str, Any]) -> tuple:
    """Map one institution dict onto the positional parameters of INSERT_SQL.

    Returns a 29-tuple in the exact column order of INSERT_SQL. Missing keys
    become None; the JSON-array fields are serialized with json.dumps so they
    can be bound to JSONB columns.
    """
    get = inst.get

    # The nested "ghcid" object may be absent, None, or carry empty strings;
    # normalize every falsy component to None so it becomes SQL NULL.
    ghcid = get("ghcid") or {}
    ghcid_current = ghcid.get("current") or None
    ghcid_uuid = ghcid.get("uuid") or None  # empty-string UUIDs appear in the feed
    ghcid_numeric = ghcid.get("numeric") or None

    genealogie = get("genealogiewerkbalk")

    return (
        get("name"), get("verified_name"), get("name_source"),
        get("lat"), get("lon"), get("city"), get("province"), get("address"),
        get("type"), get("type_name"), get("color"),
        get("wikidata_id"), get("google_place_id"),
        ghcid_current, ghcid_uuid, ghcid_numeric,
        get("website"), get("description"), get("phone"),
        get("rating"), get("total_ratings"), get("business_status"),
        get("founding_year"), get("founding_decade"),
        json.dumps(get("wikidata_types", [])),
        json.dumps(get("reviews", [])),
        json.dumps(get("identifiers", [])),
        json.dumps(genealogie) if genealogie else None,
        get("street_view_url"),
    )
|
|
|
|
|
|
async def load_data(json_file: str, drop_existing: bool = False):
    """Create the institutions schema and bulk-load records from *json_file*.

    Connects with the POSTGRES_* module constants. When *drop_existing* is
    true, the table is dropped (CASCADE) and rebuilt first. The JSON file is
    expected to hold a list of institution dicts; rows are inserted in
    batches of 100 via executemany.
    """
    print(f"Connecting to PostgreSQL at {POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}...")

    conn = await asyncpg.connect(
        host=POSTGRES_HOST,
        port=POSTGRES_PORT,
        database=POSTGRES_DB,
        user=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
    )

    try:
        if drop_existing:
            print("Dropping existing institutions table...")
            await conn.execute("DROP TABLE IF EXISTS institutions CASCADE")

        print("Creating institutions table...")
        # asyncpg's execute() runs one statement at a time, so split the DDL
        # script on ";" (safe: no semicolons inside literals in that script).
        for stmt in CREATE_TABLE_SQL.split(";"):
            stmt = stmt.strip()
            if stmt:
                await conn.execute(stmt)

        # The spatial index requires PostGIS. Fix: SPATIAL_INDEX_SQL was
        # previously defined but never executed (the old ST_SetSRID guard in
        # the loop above was dead code). Attempt it here and treat failure —
        # e.g. PostGIS not installed — as non-fatal.
        try:
            await conn.execute(SPATIAL_INDEX_SQL)
        except Exception as e:
            print(f"  Note: Spatial index skipped (PostGIS not installed): {e}")

        print(f"Loading data from {json_file}...")
        with open(json_file, 'r', encoding='utf-8') as f:
            institutions = json.load(f)

        print(f"Found {len(institutions)} institutions")

        # Insert in batches so each executemany() round-trip covers 100 rows.
        batch_size = 100
        inserted = 0

        for i in range(0, len(institutions), batch_size):
            batch = institutions[i:i + batch_size]
            values = [extract_values(inst) for inst in batch]

            await conn.executemany(INSERT_SQL, values)
            inserted += len(batch)

            if inserted % 500 == 0 or inserted == len(institutions):
                print(f"  Inserted {inserted}/{len(institutions)} institutions...")

        # Final count comes from the table itself (not the insert counter) so
        # it reflects what actually landed.
        count = await conn.fetchval("SELECT COUNT(*) FROM institutions")
        print(f"\nSuccess! Loaded {count} institutions into PostgreSQL.")

        # Smoke test: show the five highest-rated rows.
        print("\nSample data:")
        sample = await conn.fetch("""
            SELECT name, city, province, type_name, rating
            FROM institutions
            WHERE rating IS NOT NULL
            ORDER BY rating DESC
            LIMIT 5
        """)
        for row in sample:
            print(f"  - {row['name']} ({row['city']}, {row['province']}) - {row['type_name']}, Rating: {row['rating']}")

    finally:
        await conn.close()
|
|
|
|
|
|
def main():
    """CLI entry point: parse command-line options and run the async loader."""
    arg_parser = argparse.ArgumentParser(
        description="Load NDE institution data into PostgreSQL",
    )
    arg_parser.add_argument(
        "--json-file",
        default=DEFAULT_JSON_FILE,
        help=f"Path to nde_institutions.json (default: {DEFAULT_JSON_FILE})",
    )
    arg_parser.add_argument(
        "--drop-existing",
        action="store_true",
        help="Drop existing table before loading",
    )

    opts = arg_parser.parse_args()

    # Drive the coroutine to completion on a fresh event loop.
    asyncio.run(load_data(opts.json_file, opts.drop_existing))


if __name__ == "__main__":
    main()
|