#!/usr/bin/env python3 """ Setup DuckLake views for common query patterns. This script creates SQL views on the DuckLake server to simplify querying the custodians_raw table with JSON extraction. Usage: python scripts/setup_ducklake_views.py [--drop-first] """ import argparse import requests import json import sys DUCKLAKE_URL = "http://localhost:8765" # View definitions with SQL CREATE VIEW statements VIEWS = { # Flattened view with key fields extracted from JSON "v_custodians": """ CREATE OR REPLACE VIEW heritage.v_custodians AS SELECT ghcid_current as ghcid, custodian_name as name, org_name as original_name, org_type as institution_type, city, country, postal_code, latitude, longitude, google_rating as rating, google_total_ratings as review_count, enrichment_status, wikidata_id, CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10 THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_entity_id') ELSE NULL END as wikidata_qid, CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10 THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_en') ELSE NULL END as name_en, CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10 THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_nl') ELSE NULL END as name_nl, CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10 THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_description_nl') ELSE NULL END as description_nl, CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10 THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website') ELSE NULL END as website, CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10 THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.viaf') ELSE NULL END as viaf_id, CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10 THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.gnd') ELSE NULL END as gnd_id, CASE WHEN ghcid_json IS NOT NULL AND LENGTH(ghcid_json) > 10 THEN json_extract_string(ghcid_json, '$.record_id') ELSE NULL END as record_id, CASE WHEN ghcid_json IS NOT NULL AND LENGTH(ghcid_json) > 10 THEN json_extract_string(ghcid_json, '$.ghcid_uuid') ELSE NULL END as ghcid_uuid, processing_timestamp, file_name FROM heritage.custodians_raw """, # Statistics by country "v_stats_by_country": """ CREATE OR REPLACE VIEW heritage.v_stats_by_country AS SELECT COALESCE(NULLIF(country, ''), 'Unknown') as country, COUNT(*) as total_institutions, COUNT(google_rating) as with_ratings, ROUND(AVG(google_rating), 2) as avg_rating, COUNT(DISTINCT city) as cities, SUM(CASE WHEN enrichment_status = 'success' THEN 1 ELSE 0 END) as enriched_count FROM heritage.custodians_raw GROUP BY COALESCE(NULLIF(country, ''), 'Unknown') ORDER BY total_institutions DESC """, # Statistics by institution type "v_stats_by_type": """ CREATE OR REPLACE VIEW heritage.v_stats_by_type AS SELECT CASE org_type WHEN 'M' THEN 'Museum' WHEN 'A' THEN 'Archive' WHEN 'L' THEN 'Library' WHEN 'G' THEN 'Gallery' WHEN 'S' THEN 'Society' WHEN 'I' THEN 'Intangible Heritage' WHEN 'R' THEN 'Research Center' WHEN 'O' THEN 'Official Institution' WHEN 'H' THEN 'Holy Site' WHEN 'E' THEN 'Educational' WHEN 'N' THEN 'NGO' WHEN 'M,N' THEN 'Museum/NGO' ELSE COALESCE(NULLIF(org_type, ''), 'Unknown') END as institution_type, org_type as type_code, COUNT(*) as count, ROUND(AVG(google_rating), 2) as avg_rating, COUNT(google_rating) as with_ratings FROM heritage.custodians_raw GROUP BY org_type ORDER BY count DESC """, # Statistics by enrichment status "v_stats_by_enrichment": """ CREATE OR REPLACE VIEW heritage.v_stats_by_enrichment AS SELECT enrichment_status, COUNT(*) as count, ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 1) as percentage FROM heritage.custodians_raw GROUP BY enrichment_status ORDER BY count DESC """, # Top rated institutions "v_top_rated": """ CREATE OR REPLACE VIEW heritage.v_top_rated AS SELECT ghcid_current as ghcid, custodian_name as name, city, country, google_rating as rating, google_total_ratings as review_count, CASE WHEN wikidata_enrichment_json IS NOT NULL AND wikidata_enrichment_json != '' AND LENGTH(wikidata_enrichment_json) > 10 THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website') ELSE NULL END as website FROM heritage.custodians_raw WHERE google_rating IS NOT NULL ORDER BY google_rating DESC, google_total_ratings DESC """, # Institutions with Wikidata "v_with_wikidata": """ CREATE OR REPLACE VIEW heritage.v_with_wikidata AS SELECT ghcid_current as ghcid, custodian_name as name, json_extract_string(wikidata_enrichment_json, '$.wikidata_entity_id') as wikidata_qid, json_extract_string(wikidata_enrichment_json, '$.wikidata_label_en') as name_en, json_extract_string(wikidata_enrichment_json, '$.wikidata_description_nl') as description, json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website') as website, json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.viaf') as viaf, json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.gnd') as gnd, city, country FROM heritage.custodians_raw WHERE wikidata_enrichment_json IS NOT NULL AND wikidata_enrichment_json != '' AND LENGTH(wikidata_enrichment_json) > 10 AND json_extract_string(wikidata_enrichment_json, '$.wikidata_entity_id') IS NOT NULL """, # Data quality metrics "v_data_quality": """ CREATE OR REPLACE VIEW heritage.v_data_quality AS SELECT 'Total Records' as metric, COUNT(*)::VARCHAR as value, '' as details FROM heritage.custodians_raw UNION ALL SELECT 'With Coordinates', COUNT(*)::VARCHAR, ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%' FROM heritage.custodians_raw WHERE latitude IS NOT NULL UNION ALL SELECT 'With Google Rating', COUNT(*)::VARCHAR, ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%' FROM heritage.custodians_raw WHERE google_rating IS NOT NULL UNION ALL SELECT 'With Wikidata', COUNT(*)::VARCHAR, ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%' FROM heritage.custodians_raw WHERE wikidata_enrichment_json IS NOT NULL AND wikidata_enrichment_json != '' AND LENGTH(wikidata_enrichment_json) > 10 UNION ALL SELECT 'With Website', COUNT(*)::VARCHAR, ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%' FROM heritage.custodians_raw WHERE wikidata_enrichment_json IS NOT NULL AND wikidata_enrichment_json != '' AND LENGTH(wikidata_enrichment_json) > 10 AND json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website') IS NOT NULL UNION ALL SELECT 'Fully Enriched', COUNT(*)::VARCHAR, ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%' FROM heritage.custodians_raw WHERE enrichment_status = 'success' UNION ALL SELECT 'Average Rating', ROUND(AVG(google_rating), 2)::VARCHAR, 'Out of 5.0' FROM heritage.custodians_raw WHERE google_rating IS NOT NULL """, # Cities with most institutions "v_top_cities": """ CREATE OR REPLACE VIEW heritage.v_top_cities AS SELECT COALESCE(NULLIF(city, ''), 'Unknown') as city, country, COUNT(*) as institution_count, COUNT(google_rating) as with_ratings, ROUND(AVG(google_rating), 2) as avg_rating FROM heritage.custodians_raw WHERE city IS NOT NULL AND city != '' GROUP BY city, country ORDER BY institution_count DESC """, # Full-text search helper (materialized as table for FTS) "v_search_index": """ CREATE OR REPLACE VIEW heritage.v_search_index AS SELECT ghcid_current as ghcid, LOWER(COALESCE(custodian_name, '') || ' ' || COALESCE(org_name, '') || ' ' || COALESCE(CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10 THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_en') ELSE NULL END, '') || ' ' || COALESCE(CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10 THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_nl') ELSE NULL END, '') || ' ' || COALESCE(city, '')) as search_text, custodian_name as name, city, country, google_rating as rating FROM heritage.custodians_raw """ } def execute_query(query: str) -> dict: """Execute a SQL query against DuckLake.""" response = requests.post( f"{DUCKLAKE_URL}/query", json={"query": query}, timeout=60.0 ) return response.json() def create_views(drop_first: bool = False): """Create all views in DuckLake.""" print(f"Connecting to DuckLake at {DUCKLAKE_URL}...") # Check connection try: status = requests.get(f"{DUCKLAKE_URL}/", timeout=5.0).json() print(f"Connected: {status.get('status')} (DuckDB {status.get('duckdb_version')})") except Exception as e: print(f"Error connecting: {e}") sys.exit(1) # Create each view for view_name, view_sql in VIEWS.items(): print(f"\nCreating view: {view_name}") if drop_first: drop_sql = f"DROP VIEW IF EXISTS heritage.{view_name}" try: execute_query(drop_sql) print(f" Dropped existing view") except: pass try: result = execute_query(view_sql) if "error" in result or "detail" in result: print(f" ERROR: {result}") else: print(f" Created successfully") except Exception as e: print(f" ERROR: {e}") # Verify views were created print("\n" + "=" * 50) print("Verifying views...") result = execute_query(""" SELECT table_name FROM information_schema.tables WHERE table_schema = 'heritage' AND table_type = 'VIEW' ORDER BY table_name """) if "rows" in result: print(f"Created {len(result['rows'])} views:") for row in result["rows"]: print(f" - {row[0]}") # Test each view with a sample query print("\n" + "=" * 50) print("Testing views...") for view_name in VIEWS.keys(): try: result = execute_query(f"SELECT COUNT(*) FROM heritage.{view_name}") if "rows" in result: count = result["rows"][0][0] print(f" {view_name}: {count} rows") else: print(f" {view_name}: ERROR - {result}") except Exception as e: print(f" {view_name}: ERROR - {e}") def main(): global DUCKLAKE_URL parser = argparse.ArgumentParser(description="Setup DuckLake views") parser.add_argument("--drop-first", action="store_true", help="Drop existing views before creating") parser.add_argument("--url", default=DUCKLAKE_URL, help="DuckLake API URL") args = parser.parse_args() DUCKLAKE_URL = args.url create_views(drop_first=args.drop_first) print("\nDone!") if __name__ == "__main__": main()