glam/frontend/src/utils/contextNodeMatching.ts
2025-12-21 00:01:54 +01:00

402 lines
12 KiB
TypeScript

/**
* Context Node Matching Utilities
*
* Provides functions for matching RAG context nodes (from Qdrant, SPARQL, TypeDB)
* against knowledge graph nodes (from Oxigraph).
*
* The core challenge: Context nodes have IDs like "qdrant:123" while graph nodes
* have URIs like "http://example.org/entity/123". Matching is done via:
* - Normalized name matching
* - GHCID matching
* - ISIL code matching
* - Wikidata ID matching
*/
import type { GraphNode, GraphData } from '../components/database/KnowledgeGraphProjector';
/**
* Source database colors for multi-source visualization.
*
* Maps each known context source name to a hex color string. The object is
* declared `as const`, so its keys and values keep their literal types.
*/
export const SOURCE_COLORS = {
qdrant: '#6366f1', // Indigo for vector search
sparql: '#10b981', // Emerald for SPARQL/Oxigraph
typedb: '#f59e0b', // Amber for TypeDB
} as const;
/** Union of the source names that have an entry in SOURCE_COLORS. */
export type SourceType = keyof typeof SOURCE_COLORS;
/**
* Context info returned for matched nodes.
*
* Every field is optional: getContextNodeInfo copies each one from the
* same-named attribute of the matched context node, and that attribute
* may be absent.
*/
export interface ContextNodeInfo {
/** The context node's `source` attribute (presumably a single source database name - confirm against the producer). */
source?: string;
/** The context node's `sources` attribute; when non-empty, mergeContextHighlighting checks it instead of `source` for visibility. */
sources?: string[];
/** The context node's `sourceColor` attribute (presumably a display color for the source - confirm against the producer). */
sourceColor?: string;
/** The context node's `score` attribute (presumably a relevance score - confirm against the producer). */
score?: number;
}
/**
* Options accepted by the context matching helpers
* (buildContextNodeIds and mergeContextHighlighting).
*/
export interface ContextNodeMatchingOptions {
/** Enable debug logging: when truthy, the helpers print a diagnostic sample via console.log */
debug?: boolean;
}
/**
 * Produces the canonical form of a name used when comparing nodes.
 *
 * `null` and `undefined` become the empty string; any other value is coerced
 * with `String()`, lowercased, and stripped of leading/trailing whitespace.
 * Internal whitespace is left untouched.
 *
 * @param name - Raw name value of any type
 * @returns The normalized name, or `''` when no value was supplied
 */
export function normalizeName(name: unknown): string {
  const isMissing = name === undefined || name === null;
  if (isMissing) {
    return '';
  }
  const text = String(name);
  return text.toLowerCase().trim();
}
/**
 * Recognizes heritage custodian URIs and captures their GHCID segment.
 * Matches: https://w3id.org/heritage/ghcid/{GHCID} (http or https, any letter case).
 * The whole string must match; nothing may follow the GHCID.
 */
const GHCID_URI_PATTERN = /^https?:\/\/w3id\.org\/heritage\/ghcid\/([A-Z]{2}-[A-Z0-9-]+)$/i;

/**
 * Pulls the GHCID out of a heritage custodian URI.
 *
 * @param uri - Node identifier to inspect
 * @returns The GHCID in upper case, or `null` when `uri` is not a GHCID URI
 */
export function extractGhcidFromUri(uri: string): string | null {
  const captured = uri.match(GHCID_URI_PATTERN)?.[1];
  if (captured === undefined) {
    return null;
  }
  return captured.toUpperCase();
}
/**
 * Collects every key under which a node can be matched against other nodes.
 *
 * The returned set contains, in insertion order:
 * - the raw node id
 * - `ghcid:{GHCID}` when the id is a heritage custodian URI
 *   (https://w3id.org/heritage/ghcid/{GHCID}, typical for Oxigraph nodes)
 * - `name:{normalized}` for each non-empty value among `attributes.name`,
 *   `attributes.custodian_name` and the node label
 * - `ghcid:{UPPERCASED}` from `attributes.ghcid` (typical for Qdrant nodes)
 * - `isil:{value}` from `attributes.isil_code`
 * - `wikidata:{value}` from `attributes.wikidata_id`
 *
 * @param node - Node whose identifiers should be collected
 * @returns Set of matching keys; duplicates collapse naturally
 */
export function extractMatchingKeys(node: GraphNode): Set<string> {
  const attributes = node.attributes || {};

  // The raw id always participates in matching.
  const result = new Set<string>([node.id]);

  // GHCID embedded in the URI, e.g. https://w3id.org/heritage/ghcid/NL-FR-WOL-M-K
  const uriGhcid = extractGhcidFromUri(node.id);
  if (uriGhcid) {
    result.add(`ghcid:${uriGhcid}`);
  }

  // Every name-like field contributes its own normalized name key.
  for (const rawName of [attributes.name, attributes.custodian_name, node.label]) {
    const normalized = normalizeName(rawName);
    if (normalized) {
      result.add(`name:${normalized}`);
    }
  }

  // Identifier attributes; GHCIDs are compared case-insensitively via upper-casing.
  if (attributes.ghcid) {
    result.add(`ghcid:${String(attributes.ghcid).toUpperCase()}`);
  }
  if (attributes.isil_code) {
    result.add(`isil:${String(attributes.isil_code)}`);
  }
  if (attributes.wikidata_id) {
    result.add(`wikidata:${String(attributes.wikidata_id)}`);
  }

  return result;
}
/**
 * Flattens the matching keys of all context nodes into one lookup set.
 *
 * The result is meant to be queried with keys produced by
 * extractMatchingKeys, to decide whether a graph node corresponds to any
 * context result.
 *
 * @param contextNodes - Array of nodes from RAG context (Qdrant, SPARQL, TypeDB)
 * @param options - Optional configuration; `debug` prints a sample via console.log
 * @returns Set of matching keys (empty when there are no context nodes)
 */
export function buildContextNodeIds(
  contextNodes: GraphNode[],
  options: ContextNodeMatchingOptions = {}
): Set<string> {
  if (!contextNodes || contextNodes.length === 0) {
    return new Set<string>();
  }

  const ids = contextNodes.reduce((collected, contextNode) => {
    for (const key of extractMatchingKeys(contextNode)) {
      collected.add(key);
    }
    return collected;
  }, new Set<string>());

  // The list is known to be non-empty here, so a first element exists to sample.
  if (options.debug) {
    const firstNode = contextNodes[0];
    console.log('[Context Matching] Context nodes sample:', {
      totalContextNodes: contextNodes.length,
      sampleId: firstNode.id,
      sampleLabel: firstNode.label,
      sampleAttrs: firstNode.attributes,
      matchingKeysGenerated: Array.from(ids).slice(0, 10),
    });
  }

  return ids;
}
/**
 * Checks if a graph node matches any context result.
 *
 * The node's candidate keys come from extractMatchingKeys, the same helper
 * that buildContextNodeIds uses for the context side, so both sides always
 * agree on which identifiers count: raw id, GHCID (from the URI or the
 * `ghcid` attribute), every available name (`attributes.name`,
 * `attributes.custodian_name`, label), ISIL code and Wikidata ID.
 *
 * Every name is checked, not only the first non-empty one. A node whose
 * `name` attribute differs from the context entry but whose label matches
 * is therefore still recognized.
 *
 * @param node - A graph node from the full knowledge graph
 * @param contextNodeIds - Set of matching keys from buildContextNodeIds
 * @returns true if the node shares at least one key with the context set
 */
export function isNodeInContext(
  node: GraphNode,
  contextNodeIds: Set<string>
): boolean {
  // Nothing can match an empty context; skip key extraction entirely.
  if (contextNodeIds.size === 0) return false;

  for (const key of extractMatchingKeys(node)) {
    if (contextNodeIds.has(key)) return true;
  }
  return false;
}
/**
 * Finds the matching context node for a graph node and returns its info.
 *
 * Two nodes match when their extractMatchingKeys sets share at least one
 * key. This is the same rule isNodeInContext applies through the key set
 * built by buildContextNodeIds, so a node reported as "in context" always
 * resolves to a context node here. In particular:
 * - GHCIDs are recognized on both sides, whether they come from a
 *   https://w3id.org/heritage/ghcid/{GHCID} URI or a `ghcid` attribute
 * - every name field (`name`, `custodian_name`, label) is considered
 * - ISIL and Wikidata values are compared by their string form
 *
 * When several context nodes match, the first one in `contextNodes` wins.
 *
 * @param node - A graph node from the full knowledge graph
 * @param contextNodes - Array of nodes from RAG context
 * @returns Context info (source, sources, sourceColor, score) or null if no match
 */
export function getContextNodeInfo(
  node: GraphNode,
  contextNodes: GraphNode[]
): ContextNodeInfo | null {
  if (!contextNodes || contextNodes.length === 0) return null;

  // Computed once; each candidate is tested against this set.
  const nodeKeys = extractMatchingKeys(node);

  const contextNode = contextNodes.find(candidate => {
    for (const key of extractMatchingKeys(candidate)) {
      if (nodeKeys.has(key)) return true;
    }
    return false;
  });
  if (!contextNode) return null;

  // Attributes are loosely typed upstream; each field may be absent.
  return {
    source: contextNode.attributes?.source as string | undefined,
    sources: contextNode.attributes?.sources as string[] | undefined,
    sourceColor: contextNode.attributes?.sourceColor as string | undefined,
    score: contextNode.attributes?.score as number | undefined,
  };
}
/**
* Enhanced node with context highlighting attributes.
*
* Extends GraphNode's attributes with the optional flags and source metadata
* that mergeContextHighlighting writes onto each node.
*/
export interface EnhancedGraphNode extends GraphNode {
attributes: GraphNode['attributes'] & {
/** True when the node matched a context result, false when it did not. */
isContextResult?: boolean;
/** Set only on matched nodes: whether one of the node's sources is in the visible set (true when no source info is available). */
isSourceVisible?: boolean;
/** True for non-matching nodes and for matched nodes whose source is not visible. */
dimmed?: boolean;
/** Copied from the matched context node's info; set only on matched nodes. */
source?: string;
/** Copied from the matched context node's info; set only on matched nodes. */
sources?: string[];
/** Copied from the matched context node's info; set only on matched nodes. */
sourceColor?: string;
/** Copied from the matched context node's info; set only on matched nodes. */
score?: number;
};
}
/**
 * Decorates the full graph with context highlighting.
 *
 * Each graph node is copied with extra attributes:
 * - nodes that match no context result get `isContextResult: false` and
 *   `dimmed: true`
 * - nodes that match get `isContextResult: true`, the source metadata of the
 *   matching context node, and `isSourceVisible` / `dimmed` derived from
 *   `visibleSources` (a node without any source info counts as visible)
 *
 * Edges are passed through unchanged. A falsy `graphData` yields an empty graph.
 *
 * @param graphData - Full graph data from Oxigraph
 * @param contextData - Context graph data from RAG results
 * @param visibleSources - Set of source types currently visible (for filtering)
 * @param options - Optional configuration; `debug` prints matching diagnostics
 * @returns Enhanced graph data with highlighting attributes
 */
export function mergeContextHighlighting(
  graphData: GraphData,
  contextData: GraphData,
  visibleSources: Set<string>,
  options: ContextNodeMatchingOptions = {}
): GraphData {
  if (!graphData) {
    return { nodes: [], edges: [] };
  }

  const contextNodes = contextData?.nodes || [];
  const contextNodeIds = buildContextNodeIds(contextNodes, options);
  let matchedCount = 0;

  const enhancedNodes: EnhancedGraphNode[] = graphData.nodes.map(graphNode => {
    // Outside the context: keep the node but mark it as dimmed.
    if (!isNodeInContext(graphNode, contextNodeIds)) {
      return {
        ...graphNode,
        attributes: {
          ...graphNode.attributes,
          isContextResult: false,
          dimmed: true,
        },
      };
    }

    matchedCount += 1;
    const info = getContextNodeInfo(graphNode, contextNodes);
    const sourceList = info?.sources;
    const singleSource = info?.source;

    // A non-empty `sources` list wins over `source`; no source info means visible.
    const isSourceVisible =
      sourceList && sourceList.length > 0
        ? sourceList.some(candidate => visibleSources.has(candidate))
        : singleSource
          ? visibleSources.has(singleSource)
          : true;

    return {
      ...graphNode,
      attributes: {
        ...graphNode.attributes,
        isContextResult: true,
        isSourceVisible,
        dimmed: !isSourceVisible,
        source: info?.source,
        sources: info?.sources,
        sourceColor: info?.sourceColor,
        score: info?.score,
      },
    };
  });

  if (options.debug && graphData.nodes.length > 0) {
    // Shared shape for the two sample nodes in the diagnostic output.
    const summarize = (sample: GraphNode) => ({
      id: sample.id,
      label: sample.label,
      attrs: sample.attributes,
      extractedKeys: Array.from(extractMatchingKeys(sample)),
    });
    const firstContextNode = contextNodes[0];

    console.log('[Context Matching] Graph nodes vs context matching:', {
      totalGraphNodes: graphData.nodes.length,
      contextNodeIdsSize: contextNodeIds.size,
      matchedNodes: matchedCount,
      sampleGraphNode: summarize(graphData.nodes[0]),
      sampleContextNode: firstContextNode ? summarize(firstContextNode) : null,
      contextKeysSample: Array.from(contextNodeIds).slice(0, 15),
    });
  }

  return {
    nodes: enhancedNodes,
    edges: graphData.edges,
  };
}
/**
 * Reports how many graph nodes correspond to a context result.
 * Useful for debugging and UI display.
 *
 * @param graphNodes - Nodes of the full knowledge graph
 * @param contextNodes - Nodes from the RAG context
 * @returns `matched` count, `total` graph node count, and the rounded
 *          `percentage` of matched nodes (0 when there are no graph nodes)
 */
export function countContextMatches(
  graphNodes: GraphNode[],
  contextNodes: GraphNode[]
): { matched: number; total: number; percentage: number } {
  const lookup = buildContextNodeIds(contextNodes);

  const matched = graphNodes.reduce(
    (count, graphNode) => (isNodeInContext(graphNode, lookup) ? count + 1 : count),
    0
  );
  const total = graphNodes.length;

  // Guard against division by zero for an empty graph.
  if (total <= 0) {
    return { matched, total, percentage: 0 };
  }
  return { matched, total, percentage: Math.round((matched / total) * 100) };
}