336 lines
13 KiB
Python
336 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Setup DuckLake views for common query patterns.
|
|
|
|
This script creates SQL views on the DuckLake server to simplify
|
|
querying the custodians_raw table with JSON extraction.
|
|
|
|
Usage:
|
|
python scripts/setup_ducklake_views.py [--drop-first]
|
|
"""
|
|
|
|
import argparse
|
|
import requests
|
|
import json
|
|
import sys
|
|
|
|
DUCKLAKE_URL = "http://localhost:8765"
|
|
|
|
# View definitions with SQL CREATE VIEW statements
|
|
VIEWS = {
|
|
# Flattened view with key fields extracted from JSON
|
|
"v_custodians": """
|
|
CREATE OR REPLACE VIEW heritage.v_custodians AS
|
|
SELECT
|
|
ghcid_current as ghcid,
|
|
custodian_name as name,
|
|
org_name as original_name,
|
|
org_type as institution_type,
|
|
city,
|
|
country,
|
|
postal_code,
|
|
latitude,
|
|
longitude,
|
|
google_rating as rating,
|
|
google_total_ratings as review_count,
|
|
enrichment_status,
|
|
wikidata_id,
|
|
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
|
|
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_entity_id') ELSE NULL END as wikidata_qid,
|
|
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
|
|
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_en') ELSE NULL END as name_en,
|
|
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
|
|
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_nl') ELSE NULL END as name_nl,
|
|
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
|
|
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_description_nl') ELSE NULL END as description_nl,
|
|
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
|
|
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website') ELSE NULL END as website,
|
|
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
|
|
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.viaf') ELSE NULL END as viaf_id,
|
|
CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
|
|
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.gnd') ELSE NULL END as gnd_id,
|
|
CASE WHEN ghcid_json IS NOT NULL AND LENGTH(ghcid_json) > 10
|
|
THEN json_extract_string(ghcid_json, '$.record_id') ELSE NULL END as record_id,
|
|
CASE WHEN ghcid_json IS NOT NULL AND LENGTH(ghcid_json) > 10
|
|
THEN json_extract_string(ghcid_json, '$.ghcid_uuid') ELSE NULL END as ghcid_uuid,
|
|
processing_timestamp,
|
|
file_name
|
|
FROM heritage.custodians_raw
|
|
""",
|
|
|
|
# Statistics by country
|
|
"v_stats_by_country": """
|
|
CREATE OR REPLACE VIEW heritage.v_stats_by_country AS
|
|
SELECT
|
|
COALESCE(NULLIF(country, ''), 'Unknown') as country,
|
|
COUNT(*) as total_institutions,
|
|
COUNT(google_rating) as with_ratings,
|
|
ROUND(AVG(google_rating), 2) as avg_rating,
|
|
COUNT(DISTINCT city) as cities,
|
|
SUM(CASE WHEN enrichment_status = 'success' THEN 1 ELSE 0 END) as enriched_count
|
|
FROM heritage.custodians_raw
|
|
GROUP BY COALESCE(NULLIF(country, ''), 'Unknown')
|
|
ORDER BY total_institutions DESC
|
|
""",
|
|
|
|
# Statistics by institution type
|
|
"v_stats_by_type": """
|
|
CREATE OR REPLACE VIEW heritage.v_stats_by_type AS
|
|
SELECT
|
|
CASE org_type
|
|
WHEN 'M' THEN 'Museum'
|
|
WHEN 'A' THEN 'Archive'
|
|
WHEN 'L' THEN 'Library'
|
|
WHEN 'G' THEN 'Gallery'
|
|
WHEN 'S' THEN 'Society'
|
|
WHEN 'I' THEN 'Intangible Heritage'
|
|
WHEN 'R' THEN 'Research Center'
|
|
WHEN 'O' THEN 'Official Institution'
|
|
WHEN 'H' THEN 'Holy Site'
|
|
WHEN 'E' THEN 'Educational'
|
|
WHEN 'N' THEN 'NGO'
|
|
WHEN 'M,N' THEN 'Museum/NGO'
|
|
ELSE COALESCE(NULLIF(org_type, ''), 'Unknown')
|
|
END as institution_type,
|
|
org_type as type_code,
|
|
COUNT(*) as count,
|
|
ROUND(AVG(google_rating), 2) as avg_rating,
|
|
COUNT(google_rating) as with_ratings
|
|
FROM heritage.custodians_raw
|
|
GROUP BY org_type
|
|
ORDER BY count DESC
|
|
""",
|
|
|
|
# Statistics by enrichment status
|
|
"v_stats_by_enrichment": """
|
|
CREATE OR REPLACE VIEW heritage.v_stats_by_enrichment AS
|
|
SELECT
|
|
enrichment_status,
|
|
COUNT(*) as count,
|
|
ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 1) as percentage
|
|
FROM heritage.custodians_raw
|
|
GROUP BY enrichment_status
|
|
ORDER BY count DESC
|
|
""",
|
|
|
|
# Top rated institutions
|
|
"v_top_rated": """
|
|
CREATE OR REPLACE VIEW heritage.v_top_rated AS
|
|
SELECT
|
|
ghcid_current as ghcid,
|
|
custodian_name as name,
|
|
city,
|
|
country,
|
|
google_rating as rating,
|
|
google_total_ratings as review_count,
|
|
CASE
|
|
WHEN wikidata_enrichment_json IS NOT NULL
|
|
AND wikidata_enrichment_json != ''
|
|
AND LENGTH(wikidata_enrichment_json) > 10
|
|
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website')
|
|
ELSE NULL
|
|
END as website
|
|
FROM heritage.custodians_raw
|
|
WHERE google_rating IS NOT NULL
|
|
ORDER BY google_rating DESC, google_total_ratings DESC
|
|
""",
|
|
|
|
# Institutions with Wikidata
|
|
"v_with_wikidata": """
|
|
CREATE OR REPLACE VIEW heritage.v_with_wikidata AS
|
|
SELECT
|
|
ghcid_current as ghcid,
|
|
custodian_name as name,
|
|
json_extract_string(wikidata_enrichment_json, '$.wikidata_entity_id') as wikidata_qid,
|
|
json_extract_string(wikidata_enrichment_json, '$.wikidata_label_en') as name_en,
|
|
json_extract_string(wikidata_enrichment_json, '$.wikidata_description_nl') as description,
|
|
json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website') as website,
|
|
json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.viaf') as viaf,
|
|
json_extract_string(wikidata_enrichment_json, '$.wikidata_identifiers.gnd') as gnd,
|
|
city,
|
|
country
|
|
FROM heritage.custodians_raw
|
|
WHERE wikidata_enrichment_json IS NOT NULL
|
|
AND wikidata_enrichment_json != ''
|
|
AND LENGTH(wikidata_enrichment_json) > 10
|
|
AND json_extract_string(wikidata_enrichment_json, '$.wikidata_entity_id') IS NOT NULL
|
|
""",
|
|
|
|
# Data quality metrics
|
|
"v_data_quality": """
|
|
CREATE OR REPLACE VIEW heritage.v_data_quality AS
|
|
SELECT
|
|
'Total Records' as metric,
|
|
COUNT(*)::VARCHAR as value,
|
|
'' as details
|
|
FROM heritage.custodians_raw
|
|
UNION ALL
|
|
SELECT
|
|
'With Coordinates',
|
|
COUNT(*)::VARCHAR,
|
|
ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%'
|
|
FROM heritage.custodians_raw WHERE latitude IS NOT NULL
|
|
UNION ALL
|
|
SELECT
|
|
'With Google Rating',
|
|
COUNT(*)::VARCHAR,
|
|
ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%'
|
|
FROM heritage.custodians_raw WHERE google_rating IS NOT NULL
|
|
UNION ALL
|
|
SELECT
|
|
'With Wikidata',
|
|
COUNT(*)::VARCHAR,
|
|
ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%'
|
|
FROM heritage.custodians_raw
|
|
WHERE wikidata_enrichment_json IS NOT NULL
|
|
AND wikidata_enrichment_json != ''
|
|
AND LENGTH(wikidata_enrichment_json) > 10
|
|
UNION ALL
|
|
SELECT
|
|
'With Website',
|
|
COUNT(*)::VARCHAR,
|
|
ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%'
|
|
FROM heritage.custodians_raw
|
|
WHERE wikidata_enrichment_json IS NOT NULL
|
|
AND wikidata_enrichment_json != ''
|
|
AND LENGTH(wikidata_enrichment_json) > 10
|
|
AND json_extract_string(wikidata_enrichment_json, '$.wikidata_official_website') IS NOT NULL
|
|
UNION ALL
|
|
SELECT
|
|
'Fully Enriched',
|
|
COUNT(*)::VARCHAR,
|
|
ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM heritage.custodians_raw), 1)::VARCHAR || '%'
|
|
FROM heritage.custodians_raw WHERE enrichment_status = 'success'
|
|
UNION ALL
|
|
SELECT
|
|
'Average Rating',
|
|
ROUND(AVG(google_rating), 2)::VARCHAR,
|
|
'Out of 5.0'
|
|
FROM heritage.custodians_raw WHERE google_rating IS NOT NULL
|
|
""",
|
|
|
|
# Cities with most institutions
|
|
"v_top_cities": """
|
|
CREATE OR REPLACE VIEW heritage.v_top_cities AS
|
|
SELECT
|
|
COALESCE(NULLIF(city, ''), 'Unknown') as city,
|
|
country,
|
|
COUNT(*) as institution_count,
|
|
COUNT(google_rating) as with_ratings,
|
|
ROUND(AVG(google_rating), 2) as avg_rating
|
|
FROM heritage.custodians_raw
|
|
WHERE city IS NOT NULL AND city != ''
|
|
GROUP BY city, country
|
|
ORDER BY institution_count DESC
|
|
""",
|
|
|
|
# Full-text search helper (materialized as table for FTS)
|
|
"v_search_index": """
|
|
CREATE OR REPLACE VIEW heritage.v_search_index AS
|
|
SELECT
|
|
ghcid_current as ghcid,
|
|
LOWER(COALESCE(custodian_name, '') || ' ' ||
|
|
COALESCE(org_name, '') || ' ' ||
|
|
COALESCE(CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
|
|
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_en') ELSE NULL END, '') || ' ' ||
|
|
COALESCE(CASE WHEN wikidata_enrichment_json IS NOT NULL AND LENGTH(wikidata_enrichment_json) > 10
|
|
THEN json_extract_string(wikidata_enrichment_json, '$.wikidata_label_nl') ELSE NULL END, '') || ' ' ||
|
|
COALESCE(city, '')) as search_text,
|
|
custodian_name as name,
|
|
city,
|
|
country,
|
|
google_rating as rating
|
|
FROM heritage.custodians_raw
|
|
"""
|
|
}
|
|
|
|
|
|
def execute_query(query: str) -> dict:
|
|
"""Execute a SQL query against DuckLake."""
|
|
response = requests.post(
|
|
f"{DUCKLAKE_URL}/query",
|
|
json={"query": query},
|
|
timeout=60.0
|
|
)
|
|
return response.json()
|
|
|
|
|
|
def create_views(drop_first: bool = False):
|
|
"""Create all views in DuckLake."""
|
|
print(f"Connecting to DuckLake at {DUCKLAKE_URL}...")
|
|
|
|
# Check connection
|
|
try:
|
|
status = requests.get(f"{DUCKLAKE_URL}/", timeout=5.0).json()
|
|
print(f"Connected: {status.get('status')} (DuckDB {status.get('duckdb_version')})")
|
|
except Exception as e:
|
|
print(f"Error connecting: {e}")
|
|
sys.exit(1)
|
|
|
|
# Create each view
|
|
for view_name, view_sql in VIEWS.items():
|
|
print(f"\nCreating view: {view_name}")
|
|
|
|
if drop_first:
|
|
drop_sql = f"DROP VIEW IF EXISTS heritage.{view_name}"
|
|
try:
|
|
execute_query(drop_sql)
|
|
print(f" Dropped existing view")
|
|
except:
|
|
pass
|
|
|
|
try:
|
|
result = execute_query(view_sql)
|
|
if "error" in result or "detail" in result:
|
|
print(f" ERROR: {result}")
|
|
else:
|
|
print(f" Created successfully")
|
|
except Exception as e:
|
|
print(f" ERROR: {e}")
|
|
|
|
# Verify views were created
|
|
print("\n" + "=" * 50)
|
|
print("Verifying views...")
|
|
result = execute_query("""
|
|
SELECT table_name
|
|
FROM information_schema.tables
|
|
WHERE table_schema = 'heritage' AND table_type = 'VIEW'
|
|
ORDER BY table_name
|
|
""")
|
|
|
|
if "rows" in result:
|
|
print(f"Created {len(result['rows'])} views:")
|
|
for row in result["rows"]:
|
|
print(f" - {row[0]}")
|
|
|
|
# Test each view with a sample query
|
|
print("\n" + "=" * 50)
|
|
print("Testing views...")
|
|
for view_name in VIEWS.keys():
|
|
try:
|
|
result = execute_query(f"SELECT COUNT(*) FROM heritage.{view_name}")
|
|
if "rows" in result:
|
|
count = result["rows"][0][0]
|
|
print(f" {view_name}: {count} rows")
|
|
else:
|
|
print(f" {view_name}: ERROR - {result}")
|
|
except Exception as e:
|
|
print(f" {view_name}: ERROR - {e}")
|
|
|
|
|
|
def main():
|
|
global DUCKLAKE_URL
|
|
|
|
parser = argparse.ArgumentParser(description="Setup DuckLake views")
|
|
parser.add_argument("--drop-first", action="store_true", help="Drop existing views before creating")
|
|
parser.add_argument("--url", default=DUCKLAKE_URL, help="DuckLake API URL")
|
|
args = parser.parse_args()
|
|
|
|
DUCKLAKE_URL = args.url
|
|
|
|
create_views(drop_first=args.drop_first)
|
|
print("\nDone!")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|