feat(archief-assistent): add entity extraction to semantic cache

Prevent geographic false positives in cache lookups. Queries like
"musea in Amsterdam" vs "musea in Noord-Holland" have ~93%
embedding similarity but completely different answers.

Changes:
- Add ExtractedEntities interface for structured cache keys
- Implement fast entity extraction (<5ms, no LLM) with regex patterns
- Extract institution types (GLAMORCUBESFIXPHDNT), locations, and intent
- Generate structured, lowercased cache keys (e.g., "count:m:amsterdam")
- Raise similarity threshold from 0.85 to 0.97 to match backend DSPy
- Add 'structured' match method to CacheLookupResult

The entity extractor recognizes:
- 19 institution types (Dutch + English patterns)
- 12 Dutch provinces with ISO 3166-2:NL codes
- Major Dutch cities, matched by name and keyed by a normalized (a-z only) name
- Query intents (count, list, info)

This ensures geographic queries get different cache entries even when
embeddings are highly similar.
This commit is contained in:
kempersc 2026-01-10 10:33:21 +01:00
parent 519b0b47a8
commit 7fbff2ff5f
3 changed files with 298 additions and 34 deletions

View file

@ -12,9 +12,9 @@
* - Automatic bounds fitting
*/
import React, { useRef, useEffect, useState, useMemo, useCallback } from 'react'
import React, { useRef, useEffect, useState, useMemo } from 'react'
import maplibregl from 'maplibre-gl'
import type { StyleSpecification, MapLayerMouseEvent, GeoJSONSource } from 'maplibre-gl'
import type { StyleSpecification, MapLayerMouseEvent } from 'maplibre-gl'
import 'maplibre-gl/dist/maplibre-gl.css'
import {
Box,
@ -268,7 +268,7 @@ export const ChatMapPanel: React.FC<ChatMapPanelProps> = ({
style: getMapStyle(isDarkMode),
center: bounds ? bounds.getCenter().toArray() as [number, number] : [5.2913, 52.1326],
zoom: 7,
attributionControl: true,
attributionControl: { compact: true },
})
mapRef.current = map

View file

@ -28,6 +28,22 @@
// Types
// ============================================================================
/**
 * One-letter institution type codes of the GLAMORCUBESFIXPHDNT taxonomy.
 * Read in order, the 19 members spell out the taxonomy's name.
 */
export type InstitutionTypeCode =
  | 'G' | 'L' | 'A' | 'M'
  | 'O' | 'R'
  | 'C' | 'U' | 'B' | 'E' | 'S'
  | 'F' | 'I' | 'X'
  | 'P' | 'H' | 'D' | 'N' | 'T';
/**
 * Entities extracted from a query for structured cache key generation.
 * Used to prevent geographic false positives (e.g., "Amsterdam" vs "Noord-Holland").
 *
 * Every field is optional: a field that is absent (or null) means nothing of
 * that kind was recognized in the query.
 */
export interface ExtractedEntities {
/** Institution type code recognized in the query, if any. */
institutionType?: InstitutionTypeCode | null;
/**
 * Recognized location. For a province this is its two-letter code (e.g. "NH");
 * for a city it is the city name reduced to the letters a-z (e.g. "denhaag").
 */
location?: string | null;
/** Whether `location` refers to a city or a province. */
locationType?: 'city' | 'province' | null;
/** What the user is asking for: a number, a listing, or general information. */
intent?: 'count' | 'list' | 'info' | null;
}
export interface CachedQuery {
id: string;
query: string;
@ -40,6 +56,10 @@ export interface CachedQuery {
language: 'nl' | 'en';
llmProvider: string;
source?: 'local' | 'shared';
/** Extracted entities for structured cache matching (prevents geographic false positives) */
entities?: ExtractedEntities;
/** Structured cache key derived from entities (e.g., "count:M:amsterdam") */
structuredKey?: string;
}
export interface CachedResponse {
@ -83,7 +103,8 @@ export interface CacheLookupResult {
found: boolean;
entry?: CachedQuery;
similarity: number;
method: 'semantic' | 'fuzzy' | 'exact' | 'none';
/** 'structured' = entity-aware match (location/type/intent), prevents geographic false positives */
method: 'semantic' | 'fuzzy' | 'exact' | 'structured' | 'none';
lookupTimeMs: number;
tier?: 'local' | 'shared';
}
@ -97,11 +118,14 @@ const DB_VERSION = 1;
const STORE_NAME = 'cached_queries';
const DEFAULT_CONFIG: CacheConfig = {
similarityThreshold: 0.85, // Lowered from 0.92 for better cache hit rate
// CRITICAL: Geographic queries like "musea in Amsterdam" vs "musea in Noord-Holland"
// have ~93% embedding similarity. A 0.85 threshold causes false cache hits.
// Must be ≥0.97 to avoid geographic false positives (matching BACKEND_DSPy threshold).
similarityThreshold: 0.97, // Raised from 0.85 to prevent geographic false positives
ttlMs: 24 * 60 * 60 * 1000, // 24 hours
maxEntries: 200,
enableFuzzyMatch: true,
fuzzyThreshold: 0.85,
fuzzyThreshold: 0.90, // Raised from 0.85 for stricter text matching
enableSharedCache: true,
sharedCacheUrl: '/api/cache', // Qdrant-backed cache on archief.support
embedApiUrl: '/api/embed',
@ -194,6 +218,185 @@ function generateCacheId(): string {
return `cache_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
}
// ============================================================================
// Entity Extraction (Fast, <5ms, no LLM)
// ============================================================================
// Prevents geographic false positives by extracting structured entities from queries.
// "musea in Amsterdam" and "musea in Noord-Holland" have ~93% embedding similarity
// but completely different answers. Entity extraction ensures they get different cache keys.
/**
 * Institution type patterns (Dutch + English), keyed by institution type code.
 *
 * Each pattern is anchored with a leading word boundary only, so it also
 * matches inflected or compound forms that start with the keyword
 * (e.g. "museums", "archiefdienst"). All patterns are case-insensitive.
 *
 * Notes on the less obvious alternations:
 * - G accepts both the English double-l spelling ("gallery") and the Dutch
 *   single-l spellings ("galerie", "galerij", "kunstgalerij").
 * - L accepts the Dutch singular "bibliotheek" as well as the plural
 *   "bibliotheken".
 */
const INSTITUTION_PATTERNS: Record<InstitutionTypeCode, RegExp> = {
  G: /\b(gall?er(y|ies|ie|ij|ijen)|kunstgall?er[ij])/i,
  L: /\b(librar(y|ies)|bibliothe(ek|ken?)|bieb)/i,
  A: /\b(archie[fv]en?|archives?|archief)/i,
  M: /\b(muse(um|a|ums?)|musea)/i,
  O: /\b(overheid|government|offici[eë]le?)/i,
  R: /\b(onderzoek|research|kenniscentr[ua]m?)/i,
  C: /\b(bedrijf|corporat|company)/i,
  U: /\b(onbekend|unknown)/i,
  B: /\b(botanisch|zoo|dierentuin|aquarium)/i,
  E: /\b(universiteit|school|onderwijs|education|university|hogeschool)/i,
  S: /\b(vereniging|genootschap|society|stichting)/i,
  F: /\b(monument|standbeeld|landmark|feature)/i,
  I: /\b(immateri[eë]el|intangible|erfgoed)/i,
  X: /\b(gemengd|mixed|combinatie)/i,
  P: /\b(priv[eé]|particulier|personal|collection)/i,
  H: /\b(kerk|church|moskee|synagoge|temple|holy|religieus)/i,
  D: /\b(digitaal|digital|online|platform)/i,
  N: /\b(ngo|non-profit|goede doelen)/i,
  T: /\b(culinair|parfum|smaak|smell|taste)/i,
};
/**
 * The twelve Dutch provinces.
 *
 * - `name`:     canonical lowercase, hyphenated spelling
 * - `variants`: alternative spellings, English names and abbreviations
 * - `code`:     subdivision part of the ISO 3166-2:NL code, kept for backend compatibility
 */
const DUTCH_PROVINCES: Array<{ name: string; variants: string[]; code: string }> = [
  {
    name: 'noord-holland',
    variants: ['noord holland', 'nh', 'north holland'],
    code: 'NH',
  },
  {
    name: 'zuid-holland',
    variants: ['zuid holland', 'zh', 'south holland'],
    code: 'ZH',
  },
  {
    name: 'utrecht',
    variants: ['ut'],
    code: 'UT',
  },
  {
    name: 'gelderland',
    variants: ['gld', 'guelders'],
    code: 'GE',
  },
  {
    name: 'noord-brabant',
    variants: ['noord brabant', 'nb', 'brabant'],
    code: 'NB',
  },
  {
    name: 'limburg',
    variants: ['lb'],
    code: 'LI',
  },
  {
    name: 'overijssel',
    variants: ['ov'],
    code: 'OV',
  },
  {
    name: 'friesland',
    variants: ['fryslân', 'frisia', 'fr'],
    code: 'FR',
  },
  {
    name: 'groningen',
    variants: ['gr'],
    code: 'GR',
  },
  {
    name: 'drenthe',
    variants: ['dr'],
    code: 'DR',
  },
  {
    name: 'zeeland',
    variants: ['zl', 'zealand'],
    code: 'ZE',
  },
  {
    name: 'flevoland',
    variants: ['fl'],
    code: 'FL',
  },
];
/**
 * Lowercase names of Dutch cities recognized during entity extraction.
 *
 * Several cities appear under more than one spelling (e.g. "den haag",
 * "'s-gravenhage" and "the hague"). The order of the entries is significant
 * to consumers that stop at the first match, so it must be preserved.
 */
const DUTCH_CITIES: string[] = [
  // Larger cities and municipalities
  'amsterdam', 'rotterdam', 'den haag',
  "'s-gravenhage", 'the hague', 'utrecht',
  'eindhoven', 'groningen', 'tilburg',
  'almere', 'breda', 'nijmegen',
  'arnhem', 'haarlem', 'enschede',
  'amersfoort', 'apeldoorn', 'zaanstad',
  'haarlemmermeer', "'s-hertogenbosch", 'den bosch',
  'hertogenbosch', 'zwolle', 'zoetermeer',
  'leiden', 'maastricht', 'dordrecht',
  'ede', 'alphen aan den rijn', 'alkmaar',
  'emmen', 'delft', 'deventer',
  'venlo', 'sittard', 'leeuwarden',
  'hilversum', 'heerlen', 'amstelveen',
  'oss', 'schiedam', 'spijkenisse',
  'helmond', 'purmerend', 'roosendaal',
  'vlaardingen', 'gouda', 'hoorn',
  'middelburg', 'lelystad',

  // Smaller places that matter for heritage queries
  'naarden', 'muiden', 'enkhuizen',
  'edam', 'volendam', 'zaanse schans',
  'kinderdijk', 'giethoorn', 'valkenburg',
];
/**
 * Report whether `term` occurs in `text` as a word (or the start of a word)
 * rather than as an arbitrary substring.
 *
 * - The character before the match must not be a letter or digit, so "ede" is
 *   not found inside "nederland" and "ov" is not found inside "eindhoven".
 * - Terms of three characters or fewer (province abbreviations, very short
 *   city names) must additionally be followed by a non-letter/non-digit, so
 *   "ov" is not found at the start of "over".
 * - Longer terms may be followed by further letters, which keeps inflected
 *   forms such as "amsterdamse" or "utrechtse" recognizable.
 *
 * Both arguments are expected to be lowercase already.
 */
function matchesLocationTerm(text: string, term: string): boolean {
  if (term.length === 0) {
    return false;
  }

  // A character counts as part of a word when it is a digit or a cased letter
  // (covers accented Latin letters such as "â" without needing Unicode regex flags).
  const isWordChar = (ch: string | undefined): boolean =>
    ch !== undefined &&
    ((ch >= '0' && ch <= '9') || ch.toLowerCase() !== ch.toUpperCase());

  const needsTrailingBoundary = term.length <= 3;

  let searchFrom = 0;
  for (;;) {
    const idx = text.indexOf(term, searchFrom);
    if (idx === -1) {
      return false;
    }
    const before = idx > 0 ? text[idx - 1] : undefined;
    const after = text[idx + term.length]; // undefined at end of text
    const leadingOk = !isWordChar(before);
    const trailingOk = !needsTrailingBoundary || !isWordChar(after);
    if (leadingOk && trailingOk) {
      return true;
    }
    searchFrom = idx + 1;
  }
}

/**
 * Extract entities from a query using fast regex and dictionary matching.
 * No LLM calls and no I/O, so it is cheap enough to run on every lookup for
 * structured cache key generation.
 *
 * Detection rules:
 * - Institution type: the first pattern that matches, tried in a fixed
 *   priority order (common heritage types first, "unknown" last).
 * - Location: provinces are tried first; cities are only considered when no
 *   province was found. A name shared by a province and a city (e.g.
 *   "utrecht") therefore resolves to the province.
 * - Location terms are matched on word boundaries, not as raw substrings, so
 *   short abbreviations such as "ov" or "dr" no longer fire inside unrelated
 *   words ("eindhoven", "dordrecht", "over").
 * - When several city names match, the longest one wins (e.g.
 *   "haarlemmermeer" over "haarlem"); ties keep the list order.
 * - Intent: count, then list, then info; the first matching group wins.
 *
 * @param query - The user's query text
 * @returns Extracted entities (institution type, location, intent); fields
 *          that were not detected are left unset
 */
export function extractEntitiesFast(query: string): ExtractedEntities {
  const normalized = query.toLowerCase().trim();
  const entities: ExtractedEntities = {};

  // 1. Institution type detection (most specific first: M before U)
  const typeOrder: InstitutionTypeCode[] = ['M', 'A', 'L', 'G', 'E', 'S', 'H', 'B', 'R', 'D', 'F', 'I', 'N', 'C', 'P', 'T', 'O', 'X', 'U'];
  for (const typeCode of typeOrder) {
    if (INSTITUTION_PATTERNS[typeCode].test(normalized)) {
      entities.institutionType = typeCode;
      break;
    }
  }

  // 2. Province detection (provinces take precedence over cities)
  for (const province of DUTCH_PROVINCES) {
    const terms = [province.name, ...province.variants];
    if (terms.some(term => matchesLocationTerm(normalized, term))) {
      entities.location = province.code; // Use ISO code for backend compatibility
      entities.locationType = 'province';
      break;
    }
  }

  // 3. City detection (only when no province was recognized)
  if (!entities.location) {
    let bestCity: string | undefined;
    for (const city of DUTCH_CITIES) {
      // Strictly longer only, so equally long candidates keep list order.
      const isBetter = bestCity === undefined || city.length > bestCity.length;
      if (isBetter && matchesLocationTerm(normalized, city)) {
        bestCity = city;
      }
    }
    if (bestCity !== undefined) {
      // Normalize city name for cache key (lowercase, no special chars)
      entities.location = bestCity.replace(/[^a-z]/g, '');
      entities.locationType = 'city';
    }
  }

  // 4. Intent detection (count vs list vs info)
  if (/\b(hoeveel|aantal|count|how many|tel|totaal|som)\b/i.test(normalized)) {
    entities.intent = 'count';
  } else if (/\b(welke|lijst|list|toon|show|geef|overzicht|alle)\b/i.test(normalized)) {
    entities.intent = 'list';
  } else if (/\b(wat is|who is|info|informatie|details|over)\b/i.test(normalized)) {
    entities.intent = 'info';
  }

  return entities;
}
/**
 * Build the structured cache key for a set of extracted entities.
 *
 * Layout: "{intent}:{institutionType}:{location}". Missing (or empty) parts
 * fall back to "query", "any" and "nl" respectively, and the whole key is
 * lowercased, so type and province codes appear in lowercase.
 *
 * Examples:
 * - { intent: 'count', institutionType: 'M', location: 'amsterdam' } gives "count:m:amsterdam"
 * - { intent: 'list', institutionType: 'A', location: 'NH' } gives "list:a:nh"
 * - {} gives "query:any:nl"
 *
 * @param entities - Entities extracted from the query
 * @returns Lowercase structured cache key
 */
export function generateStructuredCacheKey(entities: ExtractedEntities): string {
  const intentPart = entities.intent || 'query';
  const typePart = entities.institutionType || 'any';
  const locationPart = entities.location || 'nl';

  return `${intentPart}:${typePart}:${locationPart}`.toLowerCase();
}
/**
 * Decide whether a cached entry's entities are acceptable for a query.
 *
 * The check is driven by the QUERY: every location or institution type the
 * query specifies must be present on the cached entry with the identical
 * value. Anything the query leaves unspecified is not constrained, so a query
 * without a location is compatible with a cached entry that has one.
 * `intent` and `locationType` are not compared.
 *
 * Example of the geographic false positive this blocks:
 * - Query:  "musea in Amsterdam" gives { institutionType: M, location: amsterdam }
 * - Cached: "musea in Rotterdam" gives { institutionType: M, location: rotterdam }
 * - Result: false (the locations differ), regardless of embedding similarity
 *
 * @param queryEntities - Entities extracted from the incoming query
 * @param cachedEntities - Entities stored with (or re-extracted from) the cached entry
 * @returns true when the cached entry satisfies every constraint of the query
 */
export function entitiesMatch(queryEntities: ExtractedEntities, cachedEntities: ExtractedEntities): boolean {
  const requiredLocation = queryEntities.location;
  const requiredType = queryEntities.institutionType;

  // A required value is only satisfied by the very same value on the cached
  // side; a cached entry lacking the field can never satisfy it.
  const locationSatisfied = !requiredLocation || requiredLocation === cachedEntities.location;
  const typeSatisfied = !requiredType || requiredType === cachedEntities.institutionType;

  // Intent is deliberately left out: it matters less than location/type for
  // preventing false positives.
  return locationSatisfied && typeSatisfied;
}
// ============================================================================
// Embedding Service
// ============================================================================
@ -272,14 +475,18 @@ export class SemanticCache {
// ==========================================================================
/**
* Look up a query in both cache tiers
* Look up a query in both cache tiers with ENTITY-AWARE matching.
*
* OPTIMIZATION: Shared cache is checked FIRST because:
* 1. Server generates embeddings (no client-side embedding delay)
* 2. Qdrant ANN search is fast (~200-300ms total)
* 3. Local cache only uses embedding for semantic match (exact/fuzzy don't need it)
* CRITICAL: Geographic queries like "musea in Amsterdam" vs "musea in Noord-Holland"
* have ~93% embedding similarity but require DIFFERENT answers.
*
* Embedding parameter is now optional - only needed for local semantic matching.
* Flow:
* 1. Extract entities from query (institutionType, location, intent) - <5ms, no LLM
* 2. Check shared cache with entity validation (prevents geographic false positives)
* 3. Check local cache with entity validation
* 4. Embedding similarity is used ONLY if entities match (safe fallback)
*
* Embedding parameter is optional - only needed for local semantic matching.
*/
async lookup(
query: string,
@ -288,28 +495,59 @@ export class SemanticCache {
const startTime = performance.now();
await this.initialize();
// TIER 1: Shared Qdrant cache FIRST (fast - server generates embeddings)
// This avoids 500-2000ms client-side embedding generation for cache hits
// STEP 1: Fast entity extraction (<5ms, no LLM) - CRITICAL for preventing false positives
const queryEntities = extractEntitiesFast(query);
const structuredKey = generateStructuredCacheKey(queryEntities);
const hasSpecificEntities = !!(queryEntities.location || queryEntities.institutionType);
if (hasSpecificEntities) {
console.log(
`[SemanticCache] Entity extraction: ${JSON.stringify(queryEntities)} → key="${structuredKey}"`
);
}
// TIER 1: Shared Qdrant cache (server generates embeddings)
if (this.config.enableSharedCache) {
const sharedResult = await this.lookupShared(query, null);
if (sharedResult?.found && sharedResult.entry) {
this.stats.sharedHits++;
this.stats.lookupCount++;
// Populate local cache for next time
await this.storeLocal(sharedResult.entry);
console.log(
`[SemanticCache] SHARED HIT: "${query.slice(0, 40)}..." ` +
`similarity=${sharedResult.similarity.toFixed(3)}`
);
return { ...sharedResult, tier: 'shared', lookupTimeMs: performance.now() - startTime };
// ENTITY VALIDATION: If query has specific entities, validate against cached entry
if (hasSpecificEntities) {
const cachedEntities = sharedResult.entry.entities || extractEntitiesFast(sharedResult.entry.query);
if (!entitiesMatch(queryEntities, cachedEntities)) {
console.log(
`[SemanticCache] BLOCKED geographic false positive! ` +
`Query="${query.slice(0, 30)}..." entities=${JSON.stringify(queryEntities)} vs ` +
`Cached="${sharedResult.entry.query.slice(0, 30)}..." entities=${JSON.stringify(cachedEntities)}`
);
// Fall through to local cache or miss - DO NOT return shared hit with mismatched entities
} else {
this.stats.sharedHits++;
this.stats.lookupCount++;
await this.storeLocal(sharedResult.entry);
console.log(
`[SemanticCache] SHARED HIT (entity-validated): "${query.slice(0, 40)}..." ` +
`similarity=${sharedResult.similarity.toFixed(3)}`
);
return { ...sharedResult, tier: 'shared', method: 'structured', lookupTimeMs: performance.now() - startTime };
}
} else {
// No specific entities - allow pure semantic match
this.stats.sharedHits++;
this.stats.lookupCount++;
await this.storeLocal(sharedResult.entry);
console.log(
`[SemanticCache] SHARED HIT: "${query.slice(0, 40)}..." ` +
`similarity=${sharedResult.similarity.toFixed(3)}`
);
return { ...sharedResult, tier: 'shared', lookupTimeMs: performance.now() - startTime };
}
}
}
// TIER 2: Local IndexedDB (exact/fuzzy match first, then semantic if embedding provided)
const localResult = await this.lookupLocal(query, embedding || null);
// TIER 2: Local IndexedDB with entity validation
const localResult = await this.lookupLocal(query, embedding || null, queryEntities);
if (localResult.found) {
this.stats.localHits++;
@ -336,10 +574,13 @@ export class SemanticCache {
private async lookupLocal(
query: string,
embedding: number[] | null
embedding: number[] | null,
queryEntities?: ExtractedEntities
): Promise<CacheLookupResult> {
const startTime = performance.now();
const normalized = normalizeQuery(query);
const entities = queryEntities || extractEntitiesFast(query);
const hasSpecificEntities = !!(entities.location || entities.institutionType);
// Exact match first
const exactMatch = await this.getByNormalizedQuery(normalized);
@ -360,17 +601,26 @@ export class SemanticCache {
let bestMatch: CachedQuery | undefined;
let bestSimilarity = 0;
let matchMethod: 'semantic' | 'fuzzy' = 'semantic';
let matchMethod: 'semantic' | 'fuzzy' | 'structured' = 'semantic';
// ENTITY-AWARE matching: Only consider entries with matching entities
// This prevents geographic false positives (Amsterdam vs Noord-Holland)
const entityCompatibleEntries = hasSpecificEntities
? validEntries.filter(entry => {
const cachedEntities = entry.entities || extractEntitiesFast(entry.query);
return entitiesMatch(entities, cachedEntities);
})
: validEntries; // If no specific entities in query, consider all entries
// Semantic similarity (if embeddings available)
if (embedding && embedding.length > 0) {
for (const entry of validEntries) {
for (const entry of entityCompatibleEntries) {
if (entry.embedding && entry.embedding.length > 0) {
const similarity = cosineSimilarity(embedding, entry.embedding);
if (similarity > bestSimilarity && similarity >= this.config.similarityThreshold) {
bestSimilarity = similarity;
bestMatch = entry;
matchMethod = 'semantic';
matchMethod = hasSpecificEntities ? 'structured' : 'semantic';
}
}
}
@ -378,7 +628,7 @@ export class SemanticCache {
// Fuzzy text matching fallback
if (!bestMatch && this.config.enableFuzzyMatch) {
for (const entry of validEntries) {
for (const entry of entityCompatibleEntries) {
const jaccard = jaccardSimilarity(normalized, entry.queryNormalized);
const levenshtein = levenshteinSimilarity(normalized, entry.queryNormalized);
const combined = (jaccard * 0.6) + (levenshtein * 0.4);
@ -467,7 +717,8 @@ export class SemanticCache {
}
/**
* Store a query and response in cache
* Store a query and response in cache with extracted entities.
* Entities are extracted at storage time to enable entity-aware cache matching.
*/
async store(
query: string,
@ -476,6 +727,10 @@ export class SemanticCache {
llmProvider: string = 'zai',
language: 'nl' | 'en' = 'nl'
): Promise<string> {
// Extract entities at storage time for future entity-aware matching
const entities = extractEntitiesFast(query);
const structuredKey = generateStructuredCacheKey(entities);
const entry: CachedQuery = {
id: generateCacheId(),
query,
@ -488,6 +743,8 @@ export class SemanticCache {
language,
llmProvider,
source: 'local',
entities, // Store extracted entities for entity-aware matching
structuredKey, // Store structured key for debugging/analytics
};
// Store locally
@ -500,6 +757,10 @@ export class SemanticCache {
});
}
console.log(
`[SemanticCache] Stored with entities: "${query.slice(0, 40)}..." → ${structuredKey}`
);
return entry.id;
}
@ -537,6 +798,9 @@ export class SemanticCache {
response: entry.response,
language: entry.language,
model: entry.llmProvider,
// Include extracted entities for server-side entity-aware matching
entities: entry.entities,
structured_key: entry.structuredKey,
}),
});
} catch (error) {

View file

@ -215,7 +215,7 @@ interface Message {
detectedPlace?: string // NEW: Detected location from query
fromCache?: boolean
cacheTier?: 'local' | 'shared'
cacheMethod?: 'semantic' | 'fuzzy' | 'exact'
cacheMethod?: 'semantic' | 'fuzzy' | 'exact' | 'structured' // 'structured' = entity-aware match (prevents geographic false positives)
cacheSimilarity?: number
lookupTimeMs?: number
sparqlQuery?: string // SPARQL query used for knowledge graph search