glam/scripts/setup_ducklake_views.py
2025-12-07 00:26:01 +01:00

336 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Setup DuckLake views for common query patterns.
This script creates SQL views on the DuckLake server to simplify
querying the custodians_raw table with JSON extraction.
Usage:
python scripts/setup_ducklake_views.py [--drop-first] [--url URL]
"""
import argparse
import requests
import json
import sys
# Default base URL of the DuckLake HTTP API. main() overwrites this module
# global with the value of the --url command-line option.
DUCKLAKE_URL = "http://localhost:8765"
# View definitions: maps each view name to the SQL statement that creates it.
# Every statement is a CREATE OR REPLACE VIEW in the `heritage` schema reading
# from heritage.custodians_raw. create_views() runs them in dict order and
# uses the keys to build the DROP / COUNT(*) statements.
VIEWS = {
# Flattened view with key fields extracted from JSON.
# Columns are renamed for convenience; values from wikidata_enrichment_json
# and ghcid_json are extracted only when the JSON column is non-NULL and
# longer than 10 characters, otherwise the extracted column is NULL.
"v_custodians": """
CREATE OR REPLACE VIEW heritage.v_custodians AS
SELECT
ghcid_current as ghcid,
custodian_name as name,
org_name as original_name,
org_type as institution_type,
city,
country,
postal_code,
latitude,
longitude,
google_rating as rating,
google_total_ratings as review_count,
enrichment_status,
wikidata_id,
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_entity_id') ELSE NULL END as wikidata_qid,
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_en') ELSE NULL END as name_en,
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_nl') ELSE NULL END as name_nl,
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_description_nl') ELSE NULL END as description_nl,
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website') ELSE NULL END as website,
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.viaf') ELSE NULL END as viaf_id,
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.gnd') ELSE NULL END as gnd_id,
CASE WHEN ghcid_json IS NOT NULL AND LENGTH(ghcid_json) > 10
THEN json_extract_string(ghcid_json, '$.record_id') ELSE NULL END as record_id,
CASE WHEN ghcid_json IS NOT NULL AND LENGTH(ghcid_json) > 10
THEN json_extract_string(ghcid_json, '$.ghcid_uuid') ELSE NULL END as ghcid_uuid,
processing_timestamp,
file_name
FROM heritage.custodians_raw
""",
# Statistics by country: total rows, rows with a rating, average rating,
# distinct cities, and rows whose enrichment_status is 'success'.
# A NULL or empty country is reported as 'Unknown'.
"v_stats_by_country": """
CREATE OR REPLACE VIEW heritage.v_stats_by_country AS
SELECT
COALESCE(NULLIF(country, ''), 'Unknown') as country,
COUNT(*) as total_institutions,
COUNT(google_rating) as with_ratings,
ROUND(AVG(google_rating), 2) as avg_rating,
COUNT(DISTINCT city) as cities,
SUM(CASE WHEN enrichment_status = 'success' THEN 1 ELSE 0 END) as enriched_count
FROM heritage.custodians_raw
GROUP BY COALESCE(NULLIF(country, ''), 'Unknown')
ORDER BY total_institutions DESC
""",
# Statistics by institution type, with the org_type codes expanded to
# readable labels. Grouping is on the raw org_type value, so NULL and ''
# form separate groups that are both labelled 'Unknown'.
"v_stats_by_type": """
CREATE OR REPLACE VIEW heritage.v_stats_by_type AS
SELECT
CASE org_type
WHEN 'M' THEN 'Museum'
WHEN 'A' THEN 'Archive'
WHEN 'L' THEN 'Library'
WHEN 'G' THEN 'Gallery'
WHEN 'S' THEN 'Society'
WHEN 'I' THEN 'Intangible Heritage'
WHEN 'R' THEN 'Research Center'
WHEN 'O' THEN 'Official Institution'
WHEN 'H' THEN 'Holy Site'
WHEN 'E' THEN 'Educational'
WHEN 'N' THEN 'NGO'
WHEN 'M,N' THEN 'Museum/NGO'
ELSE COALESCE(NULLIF(org_type, ''), 'Unknown')
END as institution_type,
org_type as type_code,
COUNT(*) as count,
ROUND(AVG(google_rating), 2) as avg_rating,
COUNT(google_rating) as with_ratings
FROM heritage.custodians_raw
GROUP BY org_type
ORDER BY count DESC
""",
# Statistics by enrichment status: row count per status and its share of
# all rows as a percentage rounded to one decimal.
"v_stats_by_enrichment": """
CREATE OR REPLACE VIEW heritage.v_stats_by_enrichment AS
SELECT
enrichment_status,
COUNT(*) as count,
ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 1) as percentage
FROM heritage.custodians_raw
GROUP BY enrichment_status
ORDER BY count DESC
""",
# Top rated institutions: only rows that have a google_rating, ordered by
# rating and then by number of ratings (both descending).
"v_top_rated": """
CREATE OR REPLACE VIEW heritage.v_top_rated AS
SELECT
ghcid_current as ghcid,
custodian_name as name,
city,
country,
google_rating as rating,
google_total_ratings as review_count,
CASE
WHEN wikidata_enrichment_json IS NOT NULL
AND wikidata_enrichment_json != ''
AND LENGTH(wikidata_enrichment_json) > 10
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website')
ELSE NULL
END as website
FROM heritage.custodians_raw
WHERE google_rating IS NOT NULL
ORDER BY google_rating DESC, google_total_ratings DESC
""",
# Institutions with Wikidata: rows whose wikidata_enrichment_json is
# non-empty, longer than 10 characters, and yields a wikidata_entity_id.
"v_with_wikidata": """
CREATE OR REPLACE VIEW heritage.v_with_wikidata AS
SELECT
ghcid_current as ghcid,
custodian_name as name,
json_extract_string(wikidata_enrichment_json, '$.wikidata_entity_id') as wikidata_qid,
json_extract_string(wikidata_enrichment_json, '$.wikidata_label_en') as name_en,
json_extract_string(wikidata_enrichment_json, '$.wikidata_description_nl') as description,
json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website') as website,
json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.viaf') as viaf,
json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.gnd') as gnd,
city,
country
FROM heritage.custodians_raw
WHERE wikidata_enrichment_json IS NOT NULL
AND wikidata_enrichment_json != ''
AND LENGTH(wikidata_enrichment_json) > 10
AND json_extract_string(wikidata_enrichment_json, '$.wikidata_entity_id') IS NOT NULL
""",
# Data quality metrics: one (metric, value, details) row per measure.
# Coverage rows carry the matching row count as `value` and its percentage
# of all rows as `details`; the final row reports the average rating.
"v_data_quality": """
CREATE OR REPLACE VIEW heritage.v_data_quality AS
SELECT
'Total Records' as metric,
COUNT(*)::VARCHAR as value,
'' as details
FROM heritage.custodians_raw
UNION ALL
SELECT
'With Coordinates',
COUNT(*)::VARCHAR,
ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%'
FROM heritage.custodians_raw WHERE latitude IS NOT NULL
UNION ALL
SELECT
'With Google Rating',
COUNT(*)::VARCHAR,
ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%'
FROM heritage.custodians_raw WHERE google_rating IS NOT NULL
UNION ALL
SELECT
'With Wikidata',
COUNT(*)::VARCHAR,
ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%'
FROM heritage.custodians_raw
WHERE wikidata_enrichment_json IS NOT NULL
AND wikidata_enrichment_json != ''
AND LENGTH(wikidata_enrichment_json) > 10
UNION ALL
SELECT
'With Website',
COUNT(*)::VARCHAR,
ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%'
FROM heritage.custodians_raw
WHERE wikidata_enrichment_json IS NOT NULL
AND wikidata_enrichment_json != ''
AND LENGTH(wikidata_enrichment_json) > 10
AND json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website') IS NOT NULL
UNION ALL
SELECT
'Fully Enriched',
COUNT(*)::VARCHAR,
ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%'
FROM heritage.custodians_raw WHERE enrichment_status = 'success'
UNION ALL
SELECT
'Average Rating',
ROUND(AVG(google_rating), 2)::VARCHAR,
'Out of 5.0'
FROM heritage.custodians_raw WHERE google_rating IS NOT NULL
""",
# Cities with most institutions, grouped by (city, country).
# The WHERE clause already excludes NULL and empty cities, so the 'Unknown'
# fallback in the SELECT list never takes effect.
"v_top_cities": """
CREATE OR REPLACE VIEW heritage.v_top_cities AS
SELECT
COALESCE(NULLIF(city, ''), 'Unknown') as city,
country,
COUNT(*) as institution_count,
COUNT(google_rating) as with_ratings,
ROUND(AVG(google_rating), 2) as avg_rating
FROM heritage.custodians_raw
WHERE city IS NOT NULL AND city != ''
GROUP BY city, country
ORDER BY institution_count DESC
""",
# Text search helper: `search_text` is the lower-cased, space-separated
# concatenation of custodian name, org name, English and Dutch Wikidata
# labels, and city. This is created as an ordinary view (CREATE OR REPLACE
# VIEW), not as a materialized table.
"v_search_index": """
CREATE OR REPLACE VIEW heritage.v_search_index AS
SELECT
ghcid_current as ghcid,
LOWER(COALESCE(custodian_name, '') || ' ' ||
COALESCE(org_name, '') || ' ' ||
COALESCE(CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_en') ELSE NULL END, '') || ' ' ||
COALESCE(CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_nl') ELSE NULL END, '') || ' ' ||
COALESCE(city, '')) as search_text,
custodian_name as name,
city,
country,
google_rating as rating
FROM heritage.custodians_raw
"""
}
def execute_query(query: str) -> dict:
    """Send one SQL statement to the DuckLake ``/query`` endpoint.

    Args:
        query: SQL text; it is posted as the ``query`` field of a JSON body.

    Returns:
        The response body decoded from JSON. The HTTP status code is not
        inspected here, so callers must check the payload themselves.
    """
    endpoint = f"{DUCKLAKE_URL}/query"
    payload = {"query": query}
    reply = requests.post(endpoint, json=payload, timeout=60.0)
    return reply.json()
def create_views(drop_first: bool = False):
    """Create every view defined in ``VIEWS`` on the DuckLake server.

    The function works in four stages and prints progress as it goes:

    1. Probe ``DUCKLAKE_URL``; exit the process with status 1 if that fails.
    2. For each entry in ``VIEWS``, optionally drop the view, then run its
       ``CREATE OR REPLACE VIEW`` statement.
    3. List the views present in the ``heritage`` schema.
    4. Run ``SELECT COUNT(*)`` against each view as a smoke test.

    Failures in stages 2-4 are reported on stdout and do not abort the run.

    Args:
        drop_first: When true, issue ``DROP VIEW IF EXISTS`` for each view
            before creating it.
    """
    print(f"Connecting to DuckLake at {DUCKLAKE_URL}...")

    # Check connection
    try:
        status = requests.get(f"{DUCKLAKE_URL}/", timeout=5.0).json()
        print(f"Connected: {status.get('status')} (DuckDB {status.get('duckdb_version')})")
    except Exception as e:
        print(f"Error connecting: {e}")
        sys.exit(1)

    # Create each view
    for view_name, view_sql in VIEWS.items():
        print(f"\nCreating view: {view_name}")

        if drop_first:
            drop_sql = f"DROP VIEW IF EXISTS heritage.{view_name}"
            # Dropping is best effort (the create below uses OR REPLACE), but
            # a failure is reported instead of being silently swallowed, and
            # Ctrl-C / SystemExit are no longer caught.
            try:
                drop_result = execute_query(drop_sql)
                if "error" in drop_result or "detail" in drop_result:
                    print(f" WARNING: could not drop view: {drop_result}")
                else:
                    print(" Dropped existing view")
            except Exception as e:
                print(f" WARNING: could not drop view: {e}")

        try:
            result = execute_query(view_sql)
            # The server signals failure inside the JSON payload.
            if "error" in result or "detail" in result:
                print(f" ERROR: {result}")
            else:
                print(" Created successfully")
        except Exception as e:
            print(f" ERROR: {e}")

    # Verify views were created
    print("\n" + "=" * 50)
    print("Verifying views...")
    # Guarded like the other stages so a transport failure here is reported
    # rather than ending the run with a traceback.
    try:
        result = execute_query("""
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = 'heritage' AND table_type = 'VIEW'
            ORDER BY table_name
        """)
        if "rows" in result:
            print(f"Created {len(result['rows'])} views:")
            for row in result["rows"]:
                print(f" - {row[0]}")
        else:
            print(f" ERROR: {result}")
    except Exception as e:
        print(f" ERROR: {e}")

    # Test each view with a sample query
    print("\n" + "=" * 50)
    print("Testing views...")
    for view_name in VIEWS:
        try:
            result = execute_query(f"SELECT COUNT(*) FROM heritage.{view_name}")
            if "rows" in result:
                count = result["rows"][0][0]
                print(f" {view_name}: {count} rows")
            else:
                print(f" {view_name}: ERROR - {result}")
        except Exception as e:
            print(f" {view_name}: ERROR - {e}")
def main():
    """Command-line entry point.

    Parses ``--drop-first`` and ``--url``, stores the chosen URL in the
    module-level ``DUCKLAKE_URL`` so the request helpers use it, and then
    runs ``create_views``.
    """
    global DUCKLAKE_URL

    cli = argparse.ArgumentParser(description="Setup DuckLake views")
    cli.add_argument(
        "--drop-first",
        action="store_true",
        help="Drop existing views before creating",
    )
    cli.add_argument(
        "--url",
        default=DUCKLAKE_URL,
        help="DuckLake API URL",
    )
    options = cli.parse_args()

    # Rebind the module global; the helpers read it at call time.
    DUCKLAKE_URL = options.url

    create_views(drop_first=options.drop_first)
    print("\nDone!")
# Run the setup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()