glam/backend/postgres/load_nde_data.py
2025-12-06 19:50:04 +01:00

274 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""
Load NDE institution data into PostgreSQL
This script:
1. Creates the institutions table with proper schema
2. Loads data from nde_institutions.json
3. Creates useful indexes for querying
Usage:
python load_nde_data.py [--json-file PATH] [--drop-existing]
"""
import argparse
import asyncio
import json
import os
from pathlib import Path
from typing import Any, Dict, List
import asyncpg
# Default configuration
DEFAULT_JSON_FILE = "/var/www/glam-frontend/data/nde_institutions.json"
# PostgreSQL connection settings; each value can be overridden via the
# environment variable of the same name.
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "glam")
POSTGRES_USER = os.getenv("POSTGRES_USER", "glam_api")
# NOTE(review): hard-coded fallback password committed in source — prefer
# requiring POSTGRES_PASSWORD to be set in the environment.
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "glam_secret_2025")
# DDL script: creates the institutions table plus its non-spatial indexes.
# Deliberately contains no PostGIS-dependent statements, so it succeeds on a
# vanilla PostgreSQL install (see SPATIAL_INDEX_SQL for the geo index).
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS institutions (
id SERIAL PRIMARY KEY,
-- Core identity
name TEXT NOT NULL,
verified_name TEXT,
name_source TEXT,
-- Location
lat DOUBLE PRECISION,
lon DOUBLE PRECISION,
city TEXT,
province TEXT,
address TEXT,
-- Classification
type CHAR(1),
type_name TEXT,
color VARCHAR(10),
-- External IDs
wikidata_id TEXT,
google_place_id TEXT,
-- GHCID (Global Heritage Custodian ID)
ghcid TEXT,
ghcid_uuid UUID,
ghcid_numeric NUMERIC(20),
-- Metadata
website TEXT,
description TEXT,
phone TEXT,
-- Ratings (from Google Maps)
rating REAL,
total_ratings INTEGER,
business_status TEXT,
-- Founding information
founding_year INTEGER,
founding_decade INTEGER,
-- Wikidata types (as JSON array)
wikidata_types JSONB,
-- Reviews (as JSON array)
reviews JSONB,
-- Full identifiers (as JSON array)
identifiers JSONB,
-- Genealogie werkbalk data (as JSON object)
genealogiewerkbalk JSONB,
-- Street view URL
street_view_url TEXT,
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);
-- Create indexes for common queries
CREATE INDEX IF NOT EXISTS idx_institutions_type ON institutions(type);
CREATE INDEX IF NOT EXISTS idx_institutions_province ON institutions(province);
CREATE INDEX IF NOT EXISTS idx_institutions_city ON institutions(city);
CREATE INDEX IF NOT EXISTS idx_institutions_ghcid ON institutions(ghcid);
CREATE INDEX IF NOT EXISTS idx_institutions_wikidata_id ON institutions(wikidata_id);
CREATE INDEX IF NOT EXISTS idx_institutions_rating ON institutions(rating);
-- Create index for full-text search on name
CREATE INDEX IF NOT EXISTS idx_institutions_name_gin ON institutions USING GIN (to_tsvector('simple', name));
"""
# Separate spatial SQL - only run if PostGIS is available
# Kept separate from CREATE_TABLE_SQL so that table creation succeeds even
# when the PostGIS extension is not installed; this statement must be run
# on its own and its failure treated as non-fatal.
SPATIAL_INDEX_SQL = """
-- Create a spatial index (requires PostGIS)
CREATE INDEX IF NOT EXISTS idx_institutions_location ON institutions USING GIST (
ST_SetSRID(ST_MakePoint(lon, lat), 4326)
) WHERE lat IS NOT NULL AND lon IS NOT NULL;
"""
# Parameterized insert for one institution row. The positional parameters
# $1..$29 must stay in exactly the order produced by extract_values().
INSERT_SQL = """
INSERT INTO institutions (
name, verified_name, name_source,
lat, lon, city, province, address,
type, type_name, color,
wikidata_id, google_place_id,
ghcid, ghcid_uuid, ghcid_numeric,
website, description, phone,
rating, total_ratings, business_status,
founding_year, founding_decade,
wikidata_types, reviews, identifiers, genealogiewerkbalk,
street_view_url
) VALUES (
$1, $2, $3,
$4, $5, $6, $7, $8,
$9, $10, $11,
$12, $13,
$14, $15, $16,
$17, $18, $19,
$20, $21, $22,
$23, $24,
$25, $26, $27, $28,
$29
)
"""
def extract_values(inst: Dict[str, Any]) -> tuple:
    """Flatten one institution record into the 29-tuple expected by INSERT_SQL.

    The nested ``ghcid`` sub-dict is promoted to three columns, with falsy
    values (missing keys, empty strings, 0) normalised to NULL. List/dict
    valued fields are serialised to JSON strings for the JSONB columns.
    """
    ghcid = inst.get("ghcid") or {}
    # Falsy sub-values (e.g. empty strings) become None so they land as NULL.
    ghcid_current = ghcid.get("current") or None
    ghcid_uuid = ghcid.get("uuid") or None
    ghcid_numeric = ghcid.get("numeric") or None
    werkbalk = inst.get("genealogiewerkbalk")
    return (
        inst.get("name"),
        inst.get("verified_name"),
        inst.get("name_source"),
        inst.get("lat"),
        inst.get("lon"),
        inst.get("city"),
        inst.get("province"),
        inst.get("address"),
        inst.get("type"),
        inst.get("type_name"),
        inst.get("color"),
        inst.get("wikidata_id"),
        inst.get("google_place_id"),
        ghcid_current,
        ghcid_uuid,
        ghcid_numeric,
        inst.get("website"),
        inst.get("description"),
        inst.get("phone"),
        inst.get("rating"),
        inst.get("total_ratings"),
        inst.get("business_status"),
        inst.get("founding_year"),
        inst.get("founding_decade"),
        json.dumps(inst.get("wikidata_types", [])),
        json.dumps(inst.get("reviews", [])),
        json.dumps(inst.get("identifiers", [])),
        json.dumps(werkbalk) if werkbalk else None,
        inst.get("street_view_url"),
    )
async def load_data(json_file: str, drop_existing: bool = False):
    """Load NDE institution data from *json_file* into PostgreSQL.

    Creates the ``institutions`` table and its indexes if needed, attempts
    the PostGIS spatial index (skipped gracefully when PostGIS is missing),
    then bulk-inserts the records in batches of 100.

    Args:
        json_file: Path to nde_institutions.json (a JSON array of objects).
        drop_existing: When True, drop and recreate the table first.
    """
    print(f"Connecting to PostgreSQL at {POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}...")
    conn = await asyncpg.connect(
        host=POSTGRES_HOST,
        port=POSTGRES_PORT,
        database=POSTGRES_DB,
        user=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
    )
    try:
        if drop_existing:
            print("Dropping existing institutions table...")
            await conn.execute("DROP TABLE IF EXISTS institutions CASCADE")
        print("Creating institutions table...")
        # asyncpg executes multi-statement scripts in one call when no bind
        # parameters are used, so no manual splitting on ";" is needed.
        await conn.execute(CREATE_TABLE_SQL)
        # BUG FIX: SPATIAL_INDEX_SQL was defined but never executed — the old
        # code only scanned CREATE_TABLE_SQL for ST_* statements, which that
        # script no longer contains. Run it separately, best-effort, since it
        # requires the PostGIS extension.
        try:
            await conn.execute(SPATIAL_INDEX_SQL)
        except Exception as e:
            print(f"  Note: Spatial index skipped (PostGIS not installed): {e}")
        print(f"Loading data from {json_file}...")
        with open(json_file, 'r', encoding='utf-8') as f:
            institutions = json.load(f)
        print(f"Found {len(institutions)} institutions")
        # Insert in batches so progress is visible and per-call work is bounded.
        batch_size = 100
        inserted = 0
        for i in range(0, len(institutions), batch_size):
            batch = institutions[i:i + batch_size]
            values = [extract_values(inst) for inst in batch]
            await conn.executemany(INSERT_SQL, values)
            inserted += len(batch)
            if inserted % 500 == 0 or inserted == len(institutions):
                print(f"  Inserted {inserted}/{len(institutions)} institutions...")
        # Report the final row count as a sanity check.
        count = await conn.fetchval("SELECT COUNT(*) FROM institutions")
        print(f"\nSuccess! Loaded {count} institutions into PostgreSQL.")
        # Show the top-rated rows as a quick visual spot check.
        print("\nSample data:")
        sample = await conn.fetch("""
            SELECT name, city, province, type_name, rating
            FROM institutions
            WHERE rating IS NOT NULL
            ORDER BY rating DESC
            LIMIT 5
        """)
        for row in sample:
            print(f"  - {row['name']} ({row['city']}, {row['province']}) - {row['type_name']}, Rating: {row['rating']}")
    finally:
        await conn.close()
def main():
    """CLI entry point: parse command-line options and run the async loader."""
    parser = argparse.ArgumentParser(description="Load NDE institution data into PostgreSQL")
    parser.add_argument(
        "--json-file",
        default=DEFAULT_JSON_FILE,
        help=f"Path to nde_institutions.json (default: {DEFAULT_JSON_FILE})",
    )
    parser.add_argument(
        "--drop-existing",
        action="store_true",
        help="Drop existing table before loading",
    )
    opts = parser.parse_args()
    asyncio.run(load_data(opts.json_file, opts.drop_existing))


if __name__ == "__main__":
    main()