/**
 * React hook for generating statistics from DuckLake.
 *
 * Converts raw DuckLake query results into the StatsData format expected by NDEStatsPage.
 */
|
|
|
|
import { useState, useEffect, useCallback } from 'react';
|
|
import { useDuckLake } from './useDuckLake';
|
|
|
|
// ============================================================================
|
|
// Type Definitions (matching NDEStatsPage.tsx)
|
|
// ============================================================================
|
|
|
|
/**
 * One institution type as shown in the "institution types" chart.
 *
 * `percentage` is the share of all institutions, expressed in percent
 * (the hook in this file rounds it to one decimal place).
 */
export interface TypeData {
  /** Single-letter institution type code (e.g. 'M'). */
  code: string;
  /** Human-readable name for the type code. */
  name: string;
  /** Number of institutions of this type. */
  count: number;
  /** Share of the total, in percent. */
  percentage: number;
  /** Chart color (CSS hex string). */
  color: string;
}
|
|
|
|
/** Institution count for a single city (one bar of the "top cities" chart). */
export interface CityData {
  /** City name. */
  city: string;
  /** Number of institutions located in the city. */
  count: number;
  /** Index signature for HorizontalBarChart compatibility. */
  [key: string]: string | number;
}
|
|
|
|
/** Number of Wikidata-linked institutions falling into one coarse type bucket. */
export interface WikidataType {
  /** Bucket label (e.g. 'museum', 'archive', 'other'). */
  type: string;
  /** Number of institutions in the bucket. */
  count: number;
}
|
|
|
|
/** One slice of the enrichment-status chart ('Enriched' vs 'Not Enriched'). */
export interface EnrichmentStatus {
  /** Status label. */
  status: string;
  /** Number of institutions with this status. */
  count: number;
  /** Share of the total, in percent. */
  percentage: number;
  /** Chart color (CSS hex string). */
  color: string;
}
|
|
|
|
/** How many institutions carry a given identifier (e.g. 'Wikidata ID', 'GHCID'). */
export interface IdentifierCoverage {
  /** Identifier label. */
  identifier: string;
  /** Number of institutions that have the identifier. */
  count: number;
  /** Share of the total, in percent. */
  percentage: number;
}
|
|
|
|
/** How many institutions have a given Google Maps feature (e.g. 'Rating', 'Reviews'). */
export interface GoogleMapsCoverage {
  /** Feature label. */
  feature: string;
  /** Number of institutions that have the feature. */
  count: number;
  /** Share of the total, in percent. */
  percentage: number;
}
|
|
|
|
/** Number of institutions enriched from one source (e.g. 'Wikidata', 'Google Maps'). */
export interface EnrichmentSource {
  /** Source label. */
  source: string;
  /** Number of institutions with data from this source. */
  count: number;
  /** Share of the total, in percent. */
  percentage: number;
  /** Chart color (CSS hex string). */
  color: string;
}
|
|
|
|
/**
 * One point of the founding timeline: a decade and its entry count.
 *
 * NOTE(review): the hook in this file currently emits an empty timeline,
 * so the exact encoding of `decade` cannot be confirmed from here.
 */
export interface TimelineData {
  decade: number;
  count: number;
}
|
|
|
|
/** Count of one institution type within a single province. */
export interface ProvinceTypeBreakdown {
  /** Institution type code. */
  code: string;
  /** Human-readable name for the type code. */
  name: string;
  /** Number of institutions of this type in the province. */
  count: number;
  /** Chart color (CSS hex string). */
  color: string;
}
|
|
|
|
/** Institution totals for one province, including a per-type breakdown. */
export interface ProvinceData {
  /** Province name. */
  province: string;
  /** Total number of institutions in the province. */
  count: number;
  /** Chart color (CSS hex string). */
  color: string;
  /** Per-type counts, keyed by institution type code. */
  types: Record<string, ProvinceTypeBreakdown>;
}
|
|
|
|
/** One bucket of the rating histogram. */
export interface RatingHistogramData {
  /** Bucket value (the rating the bucket represents). */
  rating: number;
  /** Number of institutions in the bucket. */
  count: number;
}
|
|
|
|
/** Per-institution-type rating aggregates (one bubble of the bubble chart). */
export interface BubbleChartData {
  /** Institution type code. */
  type: string;
  /** Human-readable name for the type code. */
  name: string;
  /** Average rating across rated institutions of this type. */
  avg_rating: number;
  /** Average number of reviews per institution. */
  avg_reviews: number;
  /** Sum of review counts across the type. */
  total_reviews: number;
  /** Number of rated institutions of this type. */
  count: number;
  /** Chart color (CSS hex string). */
  color: string;
}
|
|
|
|
/**
 * Node of the sunburst hierarchy.
 *
 * Inner nodes carry `children`; the hook in this file gives leaf nodes a
 * `value` and a type `code`.
 */
export interface SunburstNode {
  /** Label of the node. */
  name: string;
  /** Optional chart color (CSS hex string). */
  color?: string;
  /** Optional numeric weight of the node. */
  value?: number;
  /** Optional institution type code. */
  code?: string;
  /** Optional child nodes. */
  children?: SunburstNode[];
}
|
|
|
|
/** A single institution plotted on the rating-vs-reviews scatter chart. */
export interface RatingScatterPoint {
  /** Rating of the institution. */
  rating: number;
  /** Number of reviews. */
  reviews: number;
  /** Institution type code. */
  type: string;
  /** Institution name. */
  name: string;
  /** Province name, when known. */
  province?: string;
  /** City name, when known. */
  city?: string;
  /** GHCID UUID, when available. */
  ghcid_uuid?: string;
  /** Current GHCID string, when available. */
  ghcid_current?: string;
}
|
|
|
|
/**
 * One institution listed in the enrichment-certainty detail tables.
 *
 * The confidence queries in this file populate `name`, `ghcid_uuid`,
 * `ghcid_current`, `type`, `city`, `file` and `confidence`; the remaining
 * optional fields are not filled by any query visible in this file.
 */
export interface CertaintyEntry {
  /** Institution name. */
  name: string;
  /** GHCID UUID. */
  ghcid_uuid: string;
  /** Current GHCID string. */
  ghcid_current: string;
  /** Institution type code. */
  type: string;
  /** City name. */
  city: string;
  /** Source file the record came from. */
  file: string;
  /** Name confidence score. */
  confidence?: number;
  isil_confidence?: number;
  isil_code?: string;
  reason?: string;
  google_maps_name?: string;
}
|
|
|
|
/**
 * Summary row for one certainty category; each optional field is the number
 * of institutions at that certainty level.
 */
export interface CertaintySummaryItem {
  /** Category label (e.g. 'Name Confidence'). */
  category: string;
  high?: number;
  medium?: number;
  low?: number;
  none?: number;
  valid?: number;
  invalid?: number;
}
|
|
|
|
/** Per-category lists of institutions backing the certainty summary. */
export interface CertaintyDetails {
  google_maps_invalid: CertaintyEntry[];
  low_name_confidence: CertaintyEntry[];
  medium_name_confidence: CertaintyEntry[];
  low_na_isil_confidence?: CertaintyEntry[];
  has_kb_isil?: CertaintyEntry[];
}
|
|
|
|
/** Color (CSS hex string) assigned to each certainty level. */
export interface CertaintyColors {
  high: string;
  authoritative?: string;
  valid: string;
  medium: string;
  low: string;
  invalid: string;
  none: string;
}
|
|
|
|
/** Everything the enrichment-certainty view needs: summary rows, detail lists and the color legend. */
export interface EnrichmentCertainty {
  /** One summary row per certainty category. */
  summary: CertaintySummaryItem[];
  /** Institution lists per certainty bucket. */
  details: CertaintyDetails;
  /** Colors per certainty level. */
  colors: CertaintyColors;
}
|
|
|
|
/**
 * Complete statistics payload: headline summary numbers plus the data series
 * for every chart.
 */
export interface StatsData {
  /** ISO-8601 timestamp of when the statistics were produced. */
  generated_at: string;
  /** Total number of entries the statistics cover. */
  total_entries: number;

  /** Headline counters. */
  summary: {
    total_institutions: number;
    enriched?: number;
    not_enriched?: number;
    with_coordinates: number;
    with_wikidata: number;
    with_google_maps?: number;
    google_maps_not_found?: number;
    unique_cities: number;
    unique_provinces?: number;
    institution_types: number;
    with_nan_isil?: number;
    with_museum_register?: number;
    with_ghcid?: number;
    with_web_claims?: number;
    with_social_media?: number;
    with_verified_name?: number;
  };

  /** Data series, one property per chart. */
  charts: {
    institution_types: TypeData[];
    top_cities: CityData[];
    wikidata_types: WikidataType[];
    enrichment_status: EnrichmentStatus[];
    enrichment_sources?: EnrichmentSource[];
    identifier_coverage: IdentifierCoverage[];
    google_maps_coverage?: GoogleMapsCoverage[];
    founding_timeline: TimelineData[];
    provinces?: ProvinceData[];
    rating_histogram?: RatingHistogramData[];
    bubble_chart?: BubbleChartData[];
    sunburst?: SunburstNode;
    rating_scatter?: RatingScatterPoint[];
    enrichment_certainty?: EnrichmentCertainty;
  };
}
|
|
|
|
// ============================================================================
|
|
// Constants
|
|
// ============================================================================
|
|
|
|
/**
 * Lookup from single-letter institution type code to its display name and
 * chart color.
 *
 * NOTE(review): 'X' (Mixed) and 'H' (Holy sites) share the color '#607d8b' —
 * confirm whether that is intentional.
 */
const INSTITUTION_TYPE_MAP: Record<string, { name: string; color: string }> = {
  G: { name: 'Gallery', color: '#00bcd4' },
  L: { name: 'Library', color: '#2ecc71' },
  A: { name: 'Archive', color: '#3498db' },
  M: { name: 'Museum', color: '#e74c3c' },
  O: { name: 'Official', color: '#f39c12' },
  R: { name: 'Research', color: '#1abc9c' },
  C: { name: 'Corporation', color: '#795548' },
  U: { name: 'Unknown', color: '#9e9e9e' },
  B: { name: 'Botanical', color: '#4caf50' },
  E: { name: 'Education', color: '#ff9800' },
  S: { name: 'Society', color: '#9b59b6' },
  F: { name: 'Features', color: '#95a5a6' },
  I: { name: 'Intangible', color: '#673ab7' },
  X: { name: 'Mixed', color: '#607d8b' },
  P: { name: 'Personal', color: '#8bc34a' },
  H: { name: 'Holy sites', color: '#607d8b' },
  D: { name: 'Digital', color: '#34495e' },
  N: { name: 'NGO', color: '#e91e63' },
  T: { name: 'Taste/smell', color: '#ff5722' },
};
|
|
|
|
/**
 * Chart color for each of the twelve Dutch provinces, keyed by province name.
 *
 * Every province has a distinct color so provinces can be told apart in
 * charts. 'Zuid-Holland' previously reused Overijssel's '#bcbd22'; it now
 * uses '#ffbb78'.
 */
const PROVINCE_COLORS: Record<string, string> = {
  'Drenthe': '#1f77b4',
  'Flevoland': '#ff7f0e',
  'Friesland': '#2ca02c',
  'Gelderland': '#d62728',
  'Groningen': '#9467bd',
  'Limburg': '#8c564b',
  'Noord-Brabant': '#e377c2',
  'Noord-Holland': '#7f7f7f',
  'Overijssel': '#bcbd22',
  'Utrecht': '#17becf',
  'Zeeland': '#aec7e8',
  'Zuid-Holland': '#ffbb78',
};
|
|
|
|
// ============================================================================
|
|
// SQL Queries
|
|
// ============================================================================
|
|
|
|
/**
 * SQL fragment that maps the two-letter code at positions 4-5 of `ghcid`
 * to a Dutch province name ('Unknown' for any other code).
 * Shared by the `provinces` and `rating_scatter` queries so both stay in sync.
 */
const PROVINCE_NAME_CASE_SQL = `
    CASE SUBSTRING(ghcid, 4, 2)
      WHEN 'DR' THEN 'Drenthe'
      WHEN 'FL' THEN 'Flevoland'
      WHEN 'FR' THEN 'Friesland'
      WHEN 'GE' THEN 'Gelderland'
      WHEN 'GR' THEN 'Groningen'
      WHEN 'LI' THEN 'Limburg'
      WHEN 'NB' THEN 'Noord-Brabant'
      WHEN 'NH' THEN 'Noord-Holland'
      WHEN 'OV' THEN 'Overijssel'
      WHEN 'UT' THEN 'Utrecht'
      WHEN 'ZE' THEN 'Zeeland'
      WHEN 'ZH' THEN 'Zuid-Holland'
      ELSE 'Unknown'
    END`;

/**
 * All statistics queries, keyed by the chart / section they feed.
 * Every query reads from the `custodians` table; result column aliases are
 * what the hook uses to turn rows into objects.
 */
const SQL_QUERIES = {
  // Summary statistics
  // Note: Uses public.custodians table with correct column names
  summary: `
    SELECT
      COUNT(*) as total_institutions,
      COUNT(CASE WHEN wikidata_id IS NOT NULL OR rating IS NOT NULL OR web_claims IS NOT NULL THEN 1 END) as enriched,
      COUNT(CASE WHEN wikidata_id IS NULL AND rating IS NULL AND web_claims IS NULL THEN 1 END) as not_enriched,
      COUNT(CASE WHEN lat IS NOT NULL AND lon IS NOT NULL THEN 1 END) as with_coordinates,
      COUNT(CASE WHEN wikidata_id IS NOT NULL AND wikidata_id != '' THEN 1 END) as with_wikidata,
      COUNT(CASE WHEN rating IS NOT NULL OR total_ratings IS NOT NULL THEN 1 END) as with_google_maps,
      COUNT(DISTINCT city) as unique_cities,
      COUNT(DISTINCT
        CASE WHEN ghcid IS NOT NULL
        THEN SUBSTRING(ghcid, 4, 2) END
      ) as unique_provinces,
      COUNT(DISTINCT type) as institution_types,
      COUNT(CASE WHEN ghcid IS NOT NULL THEN 1 END) as with_ghcid,
      COUNT(CASE WHEN web_claims IS NOT NULL THEN 1 END) as with_web_claims,
      COUNT(CASE WHEN verified_name IS NOT NULL THEN 1 END) as with_verified_name,
      COUNT(CASE WHEN confidence_score IS NOT NULL AND confidence_score < 0.5 THEN 1 END) as low_name_confidence,
      COUNT(CASE WHEN confidence_score IS NOT NULL AND confidence_score >= 0.5 AND confidence_score < 0.8 THEN 1 END) as medium_name_confidence,
      COUNT(CASE WHEN confidence_score IS NOT NULL AND confidence_score >= 0.8 THEN 1 END) as high_name_confidence
    FROM custodians
  `,

  // Institution types breakdown
  institution_types: `
    SELECT
      type as code,
      COUNT(*) as count
    FROM custodians
    WHERE type IS NOT NULL
    GROUP BY type
    ORDER BY count DESC
  `,

  // Top cities
  top_cities: `
    SELECT
      city,
      COUNT(*) as count
    FROM custodians
    WHERE city IS NOT NULL AND city != ''
    GROUP BY city
    ORDER BY count DESC
    LIMIT 20
  `,

  // Wikidata types (based on institution type)
  // Grouped by position: the alias "type" collides with the table column
  // "type", and grouping by the raw column yields one 'other' row per
  // unmapped type code instead of a single 'other' bucket.
  wikidata_types: `
    SELECT
      CASE
        WHEN type = 'M' THEN 'museum'
        WHEN type = 'A' THEN 'archive'
        WHEN type = 'L' THEN 'library'
        WHEN type = 'S' THEN 'society'
        WHEN type = 'G' THEN 'gallery'
        ELSE 'other'
      END as type,
      COUNT(*) as count
    FROM custodians
    WHERE wikidata_id IS NOT NULL AND wikidata_id != ''
    GROUP BY 1
    ORDER BY count DESC
  `,

  // Enrichment status - based on presence of enrichment data
  // Grouped by position so the computed label is always the grouping key.
  enrichment_status: `
    SELECT
      CASE
        WHEN wikidata_id IS NOT NULL OR rating IS NOT NULL OR web_claims IS NOT NULL
        THEN 'Enriched'
        ELSE 'Not Enriched'
      END as status,
      COUNT(*) as count
    FROM custodians
    GROUP BY 1
    ORDER BY count DESC
  `,

  // Enrichment sources - count records with actual enrichment data
  enrichment_sources: `
    SELECT
      'Google Maps' as source,
      COUNT(CASE WHEN rating IS NOT NULL OR total_ratings IS NOT NULL THEN 1 END) as count
    FROM custodians
    UNION ALL
    SELECT
      'Wikidata' as source,
      COUNT(CASE WHEN wikidata_id IS NOT NULL AND wikidata_id != '' THEN 1 END) as count
    FROM custodians
    UNION ALL
    SELECT
      'Web Claims' as source,
      COUNT(CASE WHEN web_claims IS NOT NULL THEN 1 END) as count
    FROM custodians
    UNION ALL
    SELECT
      'Coordinates' as source,
      COUNT(CASE WHEN lat IS NOT NULL AND lon IS NOT NULL THEN 1 END) as count
    FROM custodians
    UNION ALL
    SELECT
      'Website Archived' as source,
      COUNT(CASE WHEN web_archives IS NOT NULL THEN 1 END) as count
    FROM custodians
  `,

  // Identifier coverage
  identifier_coverage: `
    SELECT
      'Coordinates' as identifier,
      COUNT(CASE WHEN lat IS NOT NULL AND lon IS NOT NULL THEN 1 END) as count
    FROM custodians
    UNION ALL
    SELECT
      'Wikidata ID' as identifier,
      COUNT(CASE WHEN wikidata_id IS NOT NULL AND wikidata_id != '' THEN 1 END) as count
    FROM custodians
    UNION ALL
    SELECT
      'GHCID' as identifier,
      COUNT(CASE WHEN ghcid IS NOT NULL THEN 1 END) as count
    FROM custodians
    UNION ALL
    SELECT
      'Address' as identifier,
      COUNT(CASE WHEN formatted_address IS NOT NULL AND formatted_address != '' THEN 1 END) as count
    FROM custodians
  `,

  // Google Maps coverage (features available)
  google_maps_coverage: `
    SELECT
      'Location' as feature,
      COUNT(CASE WHEN lat IS NOT NULL THEN 1 END) as count
    FROM custodians
    UNION ALL
    SELECT
      'Rating' as feature,
      COUNT(CASE WHEN rating IS NOT NULL THEN 1 END) as count
    FROM custodians
    UNION ALL
    SELECT
      'Reviews' as feature,
      COUNT(CASE WHEN total_ratings IS NOT NULL AND total_ratings > 0 THEN 1 END) as count
    FROM custodians
  `,

  // Provinces with type breakdown
  provinces: `
    SELECT
      ${PROVINCE_NAME_CASE_SQL} as province,
      type as org_type,
      COUNT(*) as count
    FROM custodians
    WHERE ghcid IS NOT NULL
    GROUP BY province, type
    ORDER BY province, count DESC
  `,

  // Rating histogram (ratings rounded to the nearest 0.5)
  rating_histogram: `
    SELECT
      ROUND(rating * 2) / 2 as rating,
      COUNT(*) as count
    FROM custodians
    WHERE rating IS NOT NULL
    GROUP BY ROUND(rating * 2) / 2
    ORDER BY rating
  `,

  // Bubble chart data (type aggregates)
  bubble_chart: `
    SELECT
      type,
      AVG(rating) as avg_rating,
      AVG(total_ratings) as avg_reviews,
      SUM(total_ratings) as total_reviews,
      COUNT(*) as count
    FROM custodians
    WHERE rating IS NOT NULL AND type IS NOT NULL
    GROUP BY type
    ORDER BY count DESC
  `,

  // Rating scatter data (individual points, the 500 most-reviewed)
  rating_scatter: `
    SELECT
      rating,
      total_ratings as reviews,
      type,
      name,
      city,
      ${PROVINCE_NAME_CASE_SQL} as province,
      ghcid_uuid,
      ghcid as ghcid_current
    FROM custodians
    WHERE rating IS NOT NULL AND total_ratings IS NOT NULL
    ORDER BY total_ratings DESC
    LIMIT 500
  `,

  // Certainty data - low confidence names (score below 0.5)
  low_name_confidence: `
    SELECT
      name,
      ghcid_uuid,
      ghcid as ghcid_current,
      type,
      city,
      source_file as file,
      confidence_score as confidence
    FROM custodians
    WHERE confidence_score IS NOT NULL AND confidence_score < 0.5
    ORDER BY confidence_score ASC
    LIMIT 100
  `,

  // Certainty data - medium confidence names (score in [0.5, 0.8))
  medium_name_confidence: `
    SELECT
      name,
      ghcid_uuid,
      ghcid as ghcid_current,
      type,
      city,
      source_file as file,
      confidence_score as confidence
    FROM custodians
    WHERE confidence_score IS NOT NULL
      AND confidence_score >= 0.5
      AND confidence_score < 0.8
    ORDER BY confidence_score ASC
    LIMIT 100
  `,
};
|
|
|
|
// ============================================================================
|
|
// Helper Functions
|
|
// ============================================================================
|
|
|
|
/**
 * Converts a column-list / row-matrix query result into an array of objects.
 *
 * Each row becomes one object whose keys are the column names and whose
 * values are the cells at the matching positions. A row shorter than the
 * column list gets `undefined` for the missing cells.
 *
 * @param columns Column names, in result order.
 * @param rows Result rows; each row is an array of cell values.
 * @returns One object per row, cast to `T` without runtime validation.
 */
function rowsToObjects<T>(columns: string[], rows: unknown[][]): T[] {
  const records: T[] = [];
  for (const cells of rows) {
    const record: Record<string, unknown> = {};
    for (const [position, columnName] of columns.entries()) {
      record[columnName] = cells[position];
    }
    records.push(record as T);
  }
  return records;
}
|
|
|
|
// ============================================================================
|
|
// Main Hook
|
|
// ============================================================================
|
|
|
|
/** Value returned by the `useDuckLakeStats` hook. */
export interface UseDuckLakeStatsReturn {
  /** Latest computed statistics, or `null` if none have been loaded. */
  data: StatsData | null;
  /** Whether statistics are currently being loaded. */
  isLoading: boolean;
  /** Error from the last load attempt, or `null`. */
  error: Error | null;
  /** Re-runs the statistics queries. */
  refresh: () => Promise<void>;
  /** Whether DuckLake is connected. */
  isConnected: boolean;
}
|
|
|
|
/**
 * Share of `count` in `total` as a percentage rounded to one decimal place.
 * Returns 0 when `total` is not positive, so an empty table produces 0
 * rather than NaN or Infinity.
 */
function toPercentage(count: number, total: number): number {
  if (!(total > 0)) {
    return 0;
  }
  return Math.round((count / total) * 1000) / 10;
}

/** Display name and color for an institution type code, with a neutral fallback for unmapped codes. */
function describeInstitutionType(code: string): { name: string; color: string } {
  return INSTITUTION_TYPE_MAP[code] || { name: code, color: '#999' };
}

/**
 * React hook that loads statistics from DuckLake and shapes them into
 * `StatsData`.
 *
 * All queries in `SQL_QUERIES` are executed in parallel once the base
 * `useDuckLake` hook has finished its connection check and reports a
 * connection. While the base hook is still loading nothing is done; when it
 * reports "not connected" an error is set instead.
 *
 * @returns The current statistics, loading / error state, a `refresh`
 *          function to re-run the queries, and the connection flag.
 */
export function useDuckLakeStats(): UseDuckLakeStatsReturn {
  const { executeQuery, status, isLoading: baseLoading } = useDuckLake();
  const [data, setData] = useState<StatsData | null>(null);
  const [isLoading, setIsLoading] = useState(true);
  const [error, setError] = useState<Error | null>(null);

  const refresh = useCallback(async () => {
    // Wait for the base hook to finish checking the connection; the loading
    // state stays true and no error is reported until the outcome is known.
    if (baseLoading) {
      return;
    }

    if (!status.isConnected) {
      // Connection check is complete and failed.
      setError(new Error('DuckLake not connected'));
      setIsLoading(false);
      return;
    }

    setIsLoading(true);
    setError(null);

    try {
      // Execute all queries in parallel
      const [
        summaryResult,
        typesResult,
        citiesResult,
        wikidataResult,
        enrichmentStatusResult,
        enrichmentSourcesResult,
        identifierResult,
        googleMapsResult,
        provincesResult,
        ratingHistResult,
        bubbleResult,
        scatterResult,
        lowConfResult,
        medConfResult,
      ] = await Promise.all([
        executeQuery(SQL_QUERIES.summary),
        executeQuery(SQL_QUERIES.institution_types),
        executeQuery(SQL_QUERIES.top_cities),
        executeQuery(SQL_QUERIES.wikidata_types),
        executeQuery(SQL_QUERIES.enrichment_status),
        executeQuery(SQL_QUERIES.enrichment_sources),
        executeQuery(SQL_QUERIES.identifier_coverage),
        executeQuery(SQL_QUERIES.google_maps_coverage),
        executeQuery(SQL_QUERIES.provinces),
        executeQuery(SQL_QUERIES.rating_histogram),
        executeQuery(SQL_QUERIES.bubble_chart),
        executeQuery(SQL_QUERIES.rating_scatter),
        executeQuery(SQL_QUERIES.low_name_confidence),
        executeQuery(SQL_QUERIES.medium_name_confidence),
      ]);

      // Parse summary by column alias (not by position) so the values stay
      // correct if the SELECT list of the summary query is reordered.
      // A missing row is treated as "all zero".
      const [summary = {}] = rowsToObjects<Record<string, unknown>>(
        summaryResult.columns,
        summaryResult.rows
      );
      const summaryValue = (column: string): number => Number(summary[column]) || 0;
      const totalInstitutions = summaryValue('total_institutions');
      const percentOfTotal = (count: unknown): number =>
        toPercentage(Number(count), totalInstitutions);

      // Parse institution types
      const institution_types: TypeData[] = rowsToObjects<{ code: string; count: number }>(
        typesResult.columns,
        typesResult.rows
      ).map(t => {
        const typeInfo = describeInstitutionType(t.code);
        return {
          code: t.code,
          name: typeInfo.name,
          count: Number(t.count),
          percentage: percentOfTotal(t.count),
          color: typeInfo.color,
        };
      });

      // Parse cities
      const top_cities = rowsToObjects<CityData>(citiesResult.columns, citiesResult.rows)
        .map(c => ({ city: c.city, count: Number(c.count) }));

      // Parse wikidata types
      const wikidata_types = rowsToObjects<WikidataType>(wikidataResult.columns, wikidataResult.rows)
        .map(w => ({ type: w.type, count: Number(w.count) }));

      // Parse enrichment status
      const enrichment_status: EnrichmentStatus[] = rowsToObjects<{ status: string; count: number }>(
        enrichmentStatusResult.columns,
        enrichmentStatusResult.rows
      ).map(e => ({
        status: e.status,
        count: Number(e.count),
        percentage: percentOfTotal(e.count),
        color: e.status === 'Enriched' ? '#2ecc71' : '#e74c3c',
      }));

      // Parse enrichment sources
      const sourceColors: Record<string, string> = {
        'Google Maps': '#e74c3c',
        'Wikidata': '#3498db',
        'Web Claims': '#9b59b6',
        'Coordinates': '#2ecc71',
        'Website Archived': '#f39c12',
      };
      const enrichment_sources: EnrichmentSource[] = rowsToObjects<{ source: string; count: number }>(
        enrichmentSourcesResult.columns,
        enrichmentSourcesResult.rows
      ).map(s => ({
        source: s.source,
        count: Number(s.count),
        percentage: percentOfTotal(s.count),
        color: sourceColors[s.source] || '#999',
      }));

      // Parse identifier coverage
      const identifier_coverage: IdentifierCoverage[] = rowsToObjects<{ identifier: string; count: number }>(
        identifierResult.columns,
        identifierResult.rows
      ).map(i => ({
        identifier: i.identifier,
        count: Number(i.count),
        percentage: percentOfTotal(i.count),
      }));

      // Parse Google Maps coverage
      const google_maps_coverage: GoogleMapsCoverage[] = rowsToObjects<{ feature: string; count: number }>(
        googleMapsResult.columns,
        googleMapsResult.rows
      ).map(g => ({
        feature: g.feature,
        count: Number(g.count),
        percentage: percentOfTotal(g.count),
      }));

      // Parse provinces with type breakdown: rows are (province, type, count)
      // and are folded into one entry per province.
      const provincesRaw = rowsToObjects<{ province: string; org_type: string; count: number }>(
        provincesResult.columns,
        provincesResult.rows
      );
      const provinceMap = new Map<string, ProvinceData>();
      for (const p of provincesRaw) {
        let prov = provinceMap.get(p.province);
        if (!prov) {
          prov = {
            province: p.province,
            count: 0,
            color: PROVINCE_COLORS[p.province] || '#999',
            types: {},
          };
          provinceMap.set(p.province, prov);
        }
        prov.count += Number(p.count);
        const typeInfo = describeInstitutionType(p.org_type);
        prov.types[p.org_type] = {
          code: p.org_type,
          name: typeInfo.name,
          count: Number(p.count),
          color: typeInfo.color,
        };
      }
      // Largest province first.
      const provinces = Array.from(provinceMap.values()).sort((a, b) => b.count - a.count);

      // Parse rating histogram
      const rating_histogram = rowsToObjects<RatingHistogramData>(
        ratingHistResult.columns,
        ratingHistResult.rows
      ).map(r => ({ rating: Number(r.rating), count: Number(r.count) }));

      // Parse bubble chart
      const bubble_chart: BubbleChartData[] = rowsToObjects<{
        type: string;
        avg_rating: number;
        avg_reviews: number;
        total_reviews: number;
        count: number;
      }>(bubbleResult.columns, bubbleResult.rows).map(b => {
        const typeInfo = describeInstitutionType(b.type);
        return {
          type: b.type,
          name: typeInfo.name,
          avg_rating: Math.round(Number(b.avg_rating) * 100) / 100, // two decimals
          avg_reviews: Math.round(Number(b.avg_reviews) * 10) / 10, // one decimal
          total_reviews: Number(b.total_reviews),
          count: Number(b.count),
          color: typeInfo.color,
        };
      });

      // Parse rating scatter; empty optional cells become undefined.
      const rating_scatter = rowsToObjects<RatingScatterPoint>(
        scatterResult.columns,
        scatterResult.rows
      ).map(r => ({
        rating: Number(r.rating),
        reviews: Number(r.reviews),
        type: String(r.type),
        name: String(r.name),
        province: r.province ? String(r.province) : undefined,
        city: r.city ? String(r.city) : undefined,
        ghcid_uuid: r.ghcid_uuid ? String(r.ghcid_uuid) : undefined,
        ghcid_current: r.ghcid_current ? String(r.ghcid_current) : undefined,
      }));

      // Parse certainty data
      const low_name_confidence = rowsToObjects<CertaintyEntry>(
        lowConfResult.columns,
        lowConfResult.rows
      );
      const medium_name_confidence = rowsToObjects<CertaintyEntry>(
        medConfResult.columns,
        medConfResult.rows
      );

      // Build sunburst from provinces and types
      const sunburst: SunburstNode = {
        name: 'Netherlands',
        children: provinces.map(p => ({
          name: p.province,
          color: p.color,
          children: Object.values(p.types).map(t => ({
            name: t.name,
            code: t.code,
            value: t.count,
            color: t.color,
          })),
        })),
      };

      // Build enrichment certainty
      const enrichment_certainty: EnrichmentCertainty = {
        summary: [
          {
            category: 'Name Confidence',
            high: summaryValue('high_name_confidence'),
            medium: summaryValue('medium_name_confidence'),
            low: summaryValue('low_name_confidence'),
          },
        ],
        details: {
          google_maps_invalid: [],
          low_name_confidence,
          medium_name_confidence,
        },
        colors: {
          high: '#2ecc71',
          valid: '#27ae60',
          medium: '#f39c12',
          low: '#e74c3c',
          invalid: '#c0392b',
          none: '#95a5a6',
        },
      };

      // Build final stats data
      const statsData: StatsData = {
        generated_at: new Date().toISOString(),
        total_entries: totalInstitutions,
        summary: {
          total_institutions: totalInstitutions,
          enriched: summaryValue('enriched'),
          not_enriched: summaryValue('not_enriched'),
          with_coordinates: summaryValue('with_coordinates'),
          with_wikidata: summaryValue('with_wikidata'),
          with_google_maps: summaryValue('with_google_maps'),
          unique_cities: summaryValue('unique_cities'),
          unique_provinces: summaryValue('unique_provinces'),
          institution_types: summaryValue('institution_types'),
          with_ghcid: summaryValue('with_ghcid'),
          with_web_claims: summaryValue('with_web_claims'),
          with_verified_name: summaryValue('with_verified_name'),
        },
        charts: {
          institution_types,
          top_cities,
          wikidata_types,
          enrichment_status,
          enrichment_sources,
          identifier_coverage,
          google_maps_coverage,
          founding_timeline: [], // TODO: Add founding_timeline query if data available
          provinces,
          rating_histogram,
          bubble_chart,
          sunburst,
          rating_scatter,
          enrichment_certainty,
        },
      };

      setData(statsData);
    } catch (err) {
      console.error('Failed to load stats from DuckLake:', err);
      setError(err instanceof Error ? err : new Error('Failed to load statistics'));
    } finally {
      setIsLoading(false);
    }
  }, [executeQuery, status.isConnected, baseLoading]);

  // Refresh when connection status changes (and base hook is done loading).
  // `refresh` itself reports the "not connected" error, and handles its own
  // failures, so the returned promise is intentionally not awaited.
  useEffect(() => {
    if (!baseLoading) {
      void refresh();
    }
  }, [status.isConnected, baseLoading, refresh]);

  return {
    data,
    isLoading,
    error,
    refresh,
    isConnected: status.isConnected,
  };
}
|