diff --git a/apps/archief-assistent/src/components/ChatMapPanel.tsx b/apps/archief-assistent/src/components/ChatMapPanel.tsx index 7cd493f665..cb6d88508e 100644 --- a/apps/archief-assistent/src/components/ChatMapPanel.tsx +++ b/apps/archief-assistent/src/components/ChatMapPanel.tsx @@ -12,9 +12,9 @@ * - Automatic bounds fitting */ -import React, { useRef, useEffect, useState, useMemo, useCallback } from 'react' +import React, { useRef, useEffect, useState, useMemo } from 'react' import maplibregl from 'maplibre-gl' -import type { StyleSpecification, MapLayerMouseEvent, GeoJSONSource } from 'maplibre-gl' +import type { StyleSpecification, MapLayerMouseEvent } from 'maplibre-gl' import 'maplibre-gl/dist/maplibre-gl.css' import { Box, @@ -268,7 +268,7 @@ export const ChatMapPanel: React.FC = ({ style: getMapStyle(isDarkMode), center: bounds ? bounds.getCenter().toArray() as [number, number] : [5.2913, 52.1326], zoom: 7, - attributionControl: true, + attributionControl: { compact: true }, }) mapRef.current = map diff --git a/apps/archief-assistent/src/lib/semantic-cache.ts b/apps/archief-assistent/src/lib/semantic-cache.ts index b5b6103b9a..279a7d1eca 100644 --- a/apps/archief-assistent/src/lib/semantic-cache.ts +++ b/apps/archief-assistent/src/lib/semantic-cache.ts @@ -28,6 +28,22 @@ // Types // ============================================================================ +/** + * Institution type codes from GLAMORCUBESFIXPHDNT taxonomy + */ +export type InstitutionTypeCode = 'G' | 'L' | 'A' | 'M' | 'O' | 'R' | 'C' | 'U' | 'B' | 'E' | 'S' | 'F' | 'I' | 'X' | 'P' | 'H' | 'D' | 'N' | 'T'; + +/** + * Entities extracted from a query for structured cache key generation. + * Used to prevent geographic false positives (e.g., "Amsterdam" vs "Noord-Holland"). 
+ */ +export interface ExtractedEntities { + institutionType?: InstitutionTypeCode | null; + location?: string | null; + locationType?: 'city' | 'province' | null; + intent?: 'count' | 'list' | 'info' | null; +} + export interface CachedQuery { id: string; query: string; @@ -40,6 +56,10 @@ export interface CachedQuery { language: 'nl' | 'en'; llmProvider: string; source?: 'local' | 'shared'; + /** Extracted entities for structured cache matching (prevents geographic false positives) */ + entities?: ExtractedEntities; + /** Structured cache key derived from entities (e.g., "count:M:amsterdam") */ + structuredKey?: string; } export interface CachedResponse { @@ -83,7 +103,8 @@ export interface CacheLookupResult { found: boolean; entry?: CachedQuery; similarity: number; - method: 'semantic' | 'fuzzy' | 'exact' | 'none'; + /** 'structured' = entity-aware match (location/type/intent), prevents geographic false positives */ + method: 'semantic' | 'fuzzy' | 'exact' | 'structured' | 'none'; lookupTimeMs: number; tier?: 'local' | 'shared'; } @@ -97,11 +118,14 @@ const DB_VERSION = 1; const STORE_NAME = 'cached_queries'; const DEFAULT_CONFIG: CacheConfig = { - similarityThreshold: 0.85, // Lowered from 0.92 for better cache hit rate + // CRITICAL: Geographic queries like "musea in Amsterdam" vs "musea in Noord-Holland" + // have ~93% embedding similarity. A 0.85 threshold causes false cache hits. + // Must be ≥0.97 to avoid geographic false positives (matching BACKEND_DSPy threshold). 
+ similarityThreshold: 0.97, // Raised from 0.85 to prevent geographic false positives ttlMs: 24 * 60 * 60 * 1000, // 24 hours maxEntries: 200, enableFuzzyMatch: true, - fuzzyThreshold: 0.85, + fuzzyThreshold: 0.90, // Raised from 0.85 for stricter text matching enableSharedCache: true, sharedCacheUrl: '/api/cache', // Qdrant-backed cache on archief.support embedApiUrl: '/api/embed', @@ -194,6 +218,185 @@ function generateCacheId(): string { return `cache_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`; } +// ============================================================================ +// Entity Extraction (Fast, <5ms, no LLM) +// ============================================================================ +// Prevents geographic false positives by extracting structured entities from queries. +// "musea in Amsterdam" and "musea in Noord-Holland" have ~93% embedding similarity +// but completely different answers. Entity extraction ensures they get different cache keys. + +/** Institution type patterns (Dutch + English) */ +const INSTITUTION_PATTERNS: Record = { + G: /\b(galler(y|ies|ij|ijen)|kunstgaller[ij])/i, + L: /\b(librar(y|ies)|bibliothe[ek]en?|bieb)/i, + A: /\b(archie[fv]en?|archives?|archief)/i, + M: /\b(muse(um|a|ums?)|musea)/i, + O: /\b(overheid|government|offici[eë]le?)/i, + R: /\b(onderzoek|research|kenniscentr[ua]m?)/i, + C: /\b(bedrijf|corporat|company)/i, + U: /\b(onbekend|unknown)/i, + B: /\b(botanisch|zoo|dierentuin|aquarium)/i, + E: /\b(universiteit|school|onderwijs|education|university|hogeschool)/i, + S: /\b(vereniging|genootschap|society|stichting)/i, + F: /\b(monument|standbeeld|landmark|feature)/i, + I: /\b(immateri[eë]el|intangible|erfgoed)/i, + X: /\b(gemengd|mixed|combinatie)/i, + P: /\b(priv[eé]|particulier|personal|collection)/i, + H: /\b(kerk|church|moskee|synagoge|temple|holy|religieus)/i, + D: /\b(digitaal|digital|online|platform)/i, + N: /\b(ngo|non-profit|goede doelen)/i, + T: /\b(culinair|parfum|smaak|smell|taste)/i, +}; 
/** Dutch provinces (codes are the subdivision part of ISO 3166-2:NL, for backend compatibility) */
const DUTCH_PROVINCES: Array<{ name: string; variants: string[]; code: string }> = [
  { name: 'noord-holland', variants: ['noord holland', 'nh', 'north holland'], code: 'NH' },
  { name: 'zuid-holland', variants: ['zuid holland', 'zh', 'south holland'], code: 'ZH' },
  { name: 'utrecht', variants: ['ut'], code: 'UT' },
  { name: 'gelderland', variants: ['gld', 'guelders'], code: 'GE' },
  { name: 'noord-brabant', variants: ['noord brabant', 'nb', 'brabant'], code: 'NB' },
  { name: 'limburg', variants: ['lb'], code: 'LI' },
  { name: 'overijssel', variants: ['ov'], code: 'OV' },
  { name: 'friesland', variants: ['fryslân', 'frisia', 'fr'], code: 'FR' },
  { name: 'groningen', variants: ['gr'], code: 'GR' },
  { name: 'drenthe', variants: ['dr'], code: 'DR' },
  { name: 'zeeland', variants: ['zl', 'zealand'], code: 'ZE' },
  { name: 'flevoland', variants: ['fl'], code: 'FL' },
];

/** Major Dutch cities (top 50+ by population and heritage significance) */
const DUTCH_CITIES: string[] = [
  'amsterdam', 'rotterdam', 'den haag', "'s-gravenhage", 'the hague',
  'utrecht', 'eindhoven', 'groningen', 'tilburg', 'almere',
  'breda', 'nijmegen', 'arnhem', 'haarlem', 'enschede',
  'amersfoort', 'apeldoorn', 'zaanstad', 'haarlemmermeer', "'s-hertogenbosch",
  'den bosch', 'hertogenbosch', 'zwolle', 'zoetermeer', 'leiden',
  'maastricht', 'dordrecht', 'ede', 'alphen aan den rijn', 'alkmaar',
  'emmen', 'delft', 'deventer', 'venlo', 'sittard',
  'leeuwarden', 'hilversum', 'heerlen', 'amstelveen', 'oss',
  'schiedam', 'spijkenisse', 'helmond', 'purmerend', 'roosendaal',
  'vlaardingen', 'gouda', 'hoorn', 'middelburg', 'lelystad',
  // Heritage-significant smaller cities
  'naarden', 'muiden', 'enkhuizen', 'edam', 'volendam',
  'zaanse schans', 'kinderdijk', 'giethoorn', 'valkenburg',
];

/**
 * Order in which institution types are probed; the first matching pattern wins,
 * so the most common / most specific types come first (M before U).
 */
const INSTITUTION_TYPE_PRIORITY: InstitutionTypeCode[] = [
  'M', 'A', 'L', 'G', 'E', 'S', 'H', 'B', 'R', 'D', 'F', 'I', 'N', 'C', 'P', 'T', 'O', 'X', 'U',
];

/**
 * Build a matcher that finds `term` only as a whole word or phrase.
 *
 * Plain substring search is unsafe for short terms: "arnhem" contains "nh",
 * "tilburg" contains "lb", "eindhoven" contains "ov" and "nederland" contains
 * "ede". The look-arounds require that the term is not glued to another letter
 * or digit; they are Unicode-aware so names such as "fryslân" work too.
 */
function buildLocationMatcher(term: string): RegExp {
  const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  return new RegExp(`(?<![\\p{L}\\p{N}])${escaped}(?![\\p{L}\\p{N}])`, 'u');
}

/** Province matchers, compiled once at module load (canonical name first, then variants). */
const PROVINCE_MATCHERS: Array<{ code: string; patterns: RegExp[] }> = DUTCH_PROVINCES.map(
  ({ name, variants, code }) => ({
    code,
    patterns: [name, ...variants].map(buildLocationMatcher),
  })
);

/** City matchers, compiled once; `key` is the city name reduced to a-z for use in cache keys. */
const CITY_MATCHERS: Array<{ key: string; pattern: RegExp }> = DUTCH_CITIES.map(city => ({
  key: city.replace(/[^a-z]/g, ''),
  pattern: buildLocationMatcher(city),
}));

/**
 * Extract entities from a query using regex and dictionary matching only
 * (no LLM or network calls), so it is cheap enough to run on every lookup.
 *
 * Detection steps:
 * 1. Institution type: first pattern in INSTITUTION_TYPE_PRIORITY that matches.
 * 2. Province: whole-word match on the province name or one of its variants;
 *    yields the province code (e.g. "NH") with locationType "province".
 * 3. City: only tried when no province matched; yields the city name stripped
 *    to a-z (e.g. "denhaag") with locationType "city". Because provinces are
 *    checked first, names that are both a province and a city ("utrecht",
 *    "groningen") resolve to the province.
 * 4. Intent: "count", "list" or "info", by keyword.
 *
 * Fields for which nothing was detected are left unset.
 *
 * @param query - The user's query text
 * @returns Extracted entities (institution type, location, intent)
 */
export function extractEntitiesFast(query: string): ExtractedEntities {
  const normalized = query.toLowerCase().trim();
  const entities: ExtractedEntities = {};

  // 1. Institution type detection (priority order, first hit wins)
  const matchedType = INSTITUTION_TYPE_PRIORITY.find(code =>
    INSTITUTION_PATTERNS[code].test(normalized)
  );
  if (matchedType) {
    entities.institutionType = matchedType;
  }

  // 2. Province detection (takes precedence over cities)
  const province = PROVINCE_MATCHERS.find(candidate =>
    candidate.patterns.some(pattern => pattern.test(normalized))
  );
  if (province) {
    entities.location = province.code; // province code for backend compatibility
    entities.locationType = 'province';
  } else {
    // 3. City detection (only if no province was found)
    const city = CITY_MATCHERS.find(candidate => candidate.pattern.test(normalized));
    if (city) {
      entities.location = city.key;
      entities.locationType = 'city';
    }
  }

  // 4. Intent detection (count vs list vs info)
  if (/\b(hoeveel|aantal|count|how many|tel|totaal|som)\b/i.test(normalized)) {
    entities.intent = 'count';
  } else if (/\b(welke|lijst|list|toon|show|geef|overzicht|alle)\b/i.test(normalized)) {
    entities.intent = 'list';
  } else if (/\b(wat is|who is|info|informatie|details|over)\b/i.test(normalized)) {
    entities.intent = 'info';
  }

  return entities;
}

/**
 * Generate a structured cache key from extracted entities.
 * This key is used for entity-aware cache matching to prevent geographic false positives.
 *
 * Format: "{intent}:{institutionType}:{location}", lower-cased as a whole.
 * Missing parts fall back to "query", "any" and "nl" respectively.
 * Examples:
 * - "count:m:amsterdam" (how many museums in Amsterdam)
 * - "list:a:nh" (list archives in Noord-Holland)
 * - "query:any:nl" (generic query, no specific entities)
 *
 * @param entities - Entities extracted from the query
 * @returns Structured cache key string
 */
export function generateStructuredCacheKey(entities: ExtractedEntities): string {
  const intent = entities.intent || 'query';
  const institutionType = entities.institutionType || 'any';
  const location = entities.location || 'nl';
  return `${intent}:${institutionType}:${location}`.toLowerCase();
}

/**
 * Check if two entity sets are compatible for cache matching.
 *
 * Location and institution type must be equal in BOTH directions: an unset
 * value only matches another unset value (null, undefined and "" all count as
 * unset). This blocks both kinds of geographic false positive:
 * - "musea in Amsterdam" vs cached "musea in Rotterdam" (different location)
 * - "hoeveel musea" vs cached "hoeveel musea in Amsterdam" (a nationwide
 *   question must not be answered with a city-specific result, and vice versa)
 *
 * Intent and locationType are deliberately not compared; intent is less
 * critical than location/type for false-positive prevention.
 *
 * @param queryEntities - Entities extracted from the incoming query
 * @param cachedEntities - Entities stored with (or re-extracted from) the cached entry
 * @returns true when location and institution type are the same on both sides
 */
export function entitiesMatch(queryEntities: ExtractedEntities, cachedEntities: ExtractedEntities): boolean {
  const sameLocation = (queryEntities.location || null) === (cachedEntities.location || null);
  const sameInstitutionType =
    (queryEntities.institutionType || null) === (cachedEntities.institutionType || null);
  return sameLocation && sameInstitutionType;
}

// ============================================================================
// Embedding Service
// ============================================================================
*/ async lookup( query: string, @@ -288,28 +495,59 @@ export class SemanticCache { const startTime = performance.now(); await this.initialize(); - // TIER 1: Shared Qdrant cache FIRST (fast - server generates embeddings) - // This avoids 500-2000ms client-side embedding generation for cache hits + // STEP 1: Fast entity extraction (<5ms, no LLM) - CRITICAL for preventing false positives + const queryEntities = extractEntitiesFast(query); + const structuredKey = generateStructuredCacheKey(queryEntities); + const hasSpecificEntities = !!(queryEntities.location || queryEntities.institutionType); + + if (hasSpecificEntities) { + console.log( + `[SemanticCache] Entity extraction: ${JSON.stringify(queryEntities)} → key="${structuredKey}"` + ); + } + + // TIER 1: Shared Qdrant cache (server generates embeddings) if (this.config.enableSharedCache) { const sharedResult = await this.lookupShared(query, null); if (sharedResult?.found && sharedResult.entry) { - this.stats.sharedHits++; - this.stats.lookupCount++; - - // Populate local cache for next time - await this.storeLocal(sharedResult.entry); - - console.log( - `[SemanticCache] SHARED HIT: "${query.slice(0, 40)}..." ` + - `similarity=${sharedResult.similarity.toFixed(3)}` - ); - return { ...sharedResult, tier: 'shared', lookupTimeMs: performance.now() - startTime }; + // ENTITY VALIDATION: If query has specific entities, validate against cached entry + if (hasSpecificEntities) { + const cachedEntities = sharedResult.entry.entities || extractEntitiesFast(sharedResult.entry.query); + + if (!entitiesMatch(queryEntities, cachedEntities)) { + console.log( + `[SemanticCache] BLOCKED geographic false positive! ` + + `Query="${query.slice(0, 30)}..." entities=${JSON.stringify(queryEntities)} vs ` + + `Cached="${sharedResult.entry.query.slice(0, 30)}..." 
entities=${JSON.stringify(cachedEntities)}` + ); + // Fall through to local cache or miss - DO NOT return shared hit with mismatched entities + } else { + this.stats.sharedHits++; + this.stats.lookupCount++; + await this.storeLocal(sharedResult.entry); + console.log( + `[SemanticCache] SHARED HIT (entity-validated): "${query.slice(0, 40)}..." ` + + `similarity=${sharedResult.similarity.toFixed(3)}` + ); + return { ...sharedResult, tier: 'shared', method: 'structured', lookupTimeMs: performance.now() - startTime }; + } + } else { + // No specific entities - allow pure semantic match + this.stats.sharedHits++; + this.stats.lookupCount++; + await this.storeLocal(sharedResult.entry); + console.log( + `[SemanticCache] SHARED HIT: "${query.slice(0, 40)}..." ` + + `similarity=${sharedResult.similarity.toFixed(3)}` + ); + return { ...sharedResult, tier: 'shared', lookupTimeMs: performance.now() - startTime }; + } } } - // TIER 2: Local IndexedDB (exact/fuzzy match first, then semantic if embedding provided) - const localResult = await this.lookupLocal(query, embedding || null); + // TIER 2: Local IndexedDB with entity validation + const localResult = await this.lookupLocal(query, embedding || null, queryEntities); if (localResult.found) { this.stats.localHits++; @@ -336,10 +574,13 @@ export class SemanticCache { private async lookupLocal( query: string, - embedding: number[] | null + embedding: number[] | null, + queryEntities?: ExtractedEntities ): Promise { const startTime = performance.now(); const normalized = normalizeQuery(query); + const entities = queryEntities || extractEntitiesFast(query); + const hasSpecificEntities = !!(entities.location || entities.institutionType); // Exact match first const exactMatch = await this.getByNormalizedQuery(normalized); @@ -360,17 +601,26 @@ export class SemanticCache { let bestMatch: CachedQuery | undefined; let bestSimilarity = 0; - let matchMethod: 'semantic' | 'fuzzy' = 'semantic'; + let matchMethod: 'semantic' | 'fuzzy' | 
'structured' = 'semantic'; + + // ENTITY-AWARE matching: Only consider entries with matching entities + // This prevents geographic false positives (Amsterdam vs Noord-Holland) + const entityCompatibleEntries = hasSpecificEntities + ? validEntries.filter(entry => { + const cachedEntities = entry.entities || extractEntitiesFast(entry.query); + return entitiesMatch(entities, cachedEntities); + }) + : validEntries; // If no specific entities in query, consider all entries // Semantic similarity (if embeddings available) if (embedding && embedding.length > 0) { - for (const entry of validEntries) { + for (const entry of entityCompatibleEntries) { if (entry.embedding && entry.embedding.length > 0) { const similarity = cosineSimilarity(embedding, entry.embedding); if (similarity > bestSimilarity && similarity >= this.config.similarityThreshold) { bestSimilarity = similarity; bestMatch = entry; - matchMethod = 'semantic'; + matchMethod = hasSpecificEntities ? 'structured' : 'semantic'; } } } @@ -378,7 +628,7 @@ export class SemanticCache { // Fuzzy text matching fallback if (!bestMatch && this.config.enableFuzzyMatch) { - for (const entry of validEntries) { + for (const entry of entityCompatibleEntries) { const jaccard = jaccardSimilarity(normalized, entry.queryNormalized); const levenshtein = levenshteinSimilarity(normalized, entry.queryNormalized); const combined = (jaccard * 0.6) + (levenshtein * 0.4); @@ -467,7 +717,8 @@ export class SemanticCache { } /** - * Store a query and response in cache + * Store a query and response in cache with extracted entities. + * Entities are extracted at storage time to enable entity-aware cache matching. 
*/ async store( query: string, @@ -476,6 +727,10 @@ export class SemanticCache { llmProvider: string = 'zai', language: 'nl' | 'en' = 'nl' ): Promise { + // Extract entities at storage time for future entity-aware matching + const entities = extractEntitiesFast(query); + const structuredKey = generateStructuredCacheKey(entities); + const entry: CachedQuery = { id: generateCacheId(), query, @@ -488,6 +743,8 @@ export class SemanticCache { language, llmProvider, source: 'local', + entities, // Store extracted entities for entity-aware matching + structuredKey, // Store structured key for debugging/analytics }; // Store locally @@ -500,6 +757,10 @@ export class SemanticCache { }); } + console.log( + `[SemanticCache] Stored with entities: "${query.slice(0, 40)}..." → ${structuredKey}` + ); + return entry.id; } @@ -537,6 +798,9 @@ export class SemanticCache { response: entry.response, language: entry.language, model: entry.llmProvider, + // Include extracted entities for server-side entity-aware matching + entities: entry.entities, + structured_key: entry.structuredKey, }), }); } catch (error) { diff --git a/apps/archief-assistent/src/pages/ChatPage.tsx b/apps/archief-assistent/src/pages/ChatPage.tsx index e7289d9c2f..cdd32d62a7 100644 --- a/apps/archief-assistent/src/pages/ChatPage.tsx +++ b/apps/archief-assistent/src/pages/ChatPage.tsx @@ -215,7 +215,7 @@ interface Message { detectedPlace?: string // NEW: Detected location from query fromCache?: boolean cacheTier?: 'local' | 'shared' - cacheMethod?: 'semantic' | 'fuzzy' | 'exact' + cacheMethod?: 'semantic' | 'fuzzy' | 'exact' | 'structured' // 'structured' = entity-aware match (prevents geographic false positives) cacheSimilarity?: number lookupTimeMs?: number sparqlQuery?: string // SPARQL query used for knowledge graph search