/**
 * Unit tests for semantic-cache.ts entity extraction and matching
 *
 * Tests Rule 46: Ontology-Driven Cache Segmentation
 * - Vocabulary-based entity extraction
 * - Structured cache key generation
 * - Entity matching for cache lookup
 */

import { describe, it, expect, vi, beforeEach } from 'vitest'
import {
  extractEntitiesFast,
  extractEntitiesWithClarity,
  generateStructuredCacheKey,
  entitiesMatch,
  normalizeQuery,
  topologicalDistance,
  combinedSimilarity,
  calculateClarity,
  explainLastDecision,
  type ExtractedEntities,
  type InstitutionTypeCode,
} from '../src/lib/semantic-cache'

// ============================================================================
// Phase 1: Entity Extraction, Cache Keys and Entity Matching (Rule 46)
// ============================================================================

/**
 * extractEntitiesFast: each test feeds a Dutch query string and checks the
 * extracted institution type code, location (+ locationType) and intent.
 */
describe('extractEntitiesFast', () => {
  describe('institution type detection', () => {
    it('should detect museum type from "musea"', () => {
      const entities = extractEntitiesFast('Hoeveel musea zijn er in Amsterdam?')
      expect(entities.institutionType).toBe('M')
    })

    it('should detect museum type from "museum"', () => {
      const entities = extractEntitiesFast('Waar is het museum?')
      expect(entities.institutionType).toBe('M')
    })

    // NOTE(review): the title names the singular "archief" but the query uses
    // the plural "archieven" - confirm whether a singular query was intended.
    it('should detect archive type from "archief"', () => {
      const entities = extractEntitiesFast('Hoeveel archieven zijn er in Utrecht?')
      expect(entities.institutionType).toBe('A')
    })

    it('should detect archive type from "archieven"', () => {
      const entities = extractEntitiesFast('Toon alle archieven')
      expect(entities.institutionType).toBe('A')
    })

    // NOTE(review): the title names the singular "bibliotheek" but the query
    // uses the plural "bibliotheken" - confirm whether a singular query was intended.
    it('should detect library type from "bibliotheek"', () => {
      const entities = extractEntitiesFast('Welke bibliotheken zijn er?')
      expect(entities.institutionType).toBe('L')
    })

    it('should detect library type from "bibliotheken"', () => {
      const entities = extractEntitiesFast('Hoeveel bibliotheken in Groningen?')
      expect(entities.institutionType).toBe('L')
    })

    it('should detect gallery type from "gallerij"', () => {
      const entities = extractEntitiesFast('Kunstgallerij in Amsterdam')
      expect(entities.institutionType).toBe('G')
    })

    it('should detect education type from "universiteit"', () => {
      const entities = extractEntitiesFast('Universiteit Utrecht collecties')
      expect(entities.institutionType).toBe('E')
    })

    it('should detect holy sites from "kerk"', () => {
      const entities = extractEntitiesFast('Welke kerken zijn er in Amsterdam?')
      expect(entities.institutionType).toBe('H')
    })
  })

  describe('location detection - provinces', () => {
    it('should detect Noord-Holland province', () => {
      const entities = extractEntitiesFast('musea in noord-holland')
      expect(entities.location).toBe('NH')
      expect(entities.locationType).toBe('province')
    })

    it('should detect Zuid-Holland province', () => {
      const entities = extractEntitiesFast('archieven in zuid-holland')
      expect(entities.location).toBe('ZH')
      expect(entities.locationType).toBe('province')
    })

    it('should detect Utrecht province', () => {
      const entities = extractEntitiesFast('bibliotheken in utrecht')
      expect(entities.location).toBe('UT')
      expect(entities.locationType).toBe('province')
    })

    it('should detect Gelderland province', () => {
      const entities = extractEntitiesFast('musea gelderland')
      expect(entities.location).toBe('GE')
      expect(entities.locationType).toBe('province')
    })

    it('should detect Limburg province', () => {
      const entities = extractEntitiesFast('archieven limburg')
      expect(entities.location).toBe('LI')
      expect(entities.locationType).toBe('province')
    })
  })

  describe('location detection - cities', () => {
    it('should detect Amsterdam', () => {
      const entities = extractEntitiesFast('musea in amsterdam')
      expect(entities.location).toBe('amsterdam')
      expect(entities.locationType).toBe('city')
    })

    it('should detect Rotterdam', () => {
      const entities = extractEntitiesFast('archieven rotterdam')
      expect(entities.location).toBe('rotterdam')
      expect(entities.locationType).toBe('city')
    })

    it('should detect Den Haag', () => {
      const entities = extractEntitiesFast('bibliotheken den haag')
      expect(entities.location).toBe('denhaag')
      expect(entities.locationType).toBe('city')
    })

    it('should detect Maastricht', () => {
      const entities = extractEntitiesFast('musea maastricht')
      expect(entities.location).toBe('maastricht')
      expect(entities.locationType).toBe('city')
    })
  })

  describe('intent detection', () => {
    it('should detect count intent from "hoeveel"', () => {
      const entities = extractEntitiesFast('Hoeveel musea zijn er?')
      expect(entities.intent).toBe('count')
    })

    it('should detect count intent from "aantal"', () => {
      const entities = extractEntitiesFast('Wat is het aantal archieven?')
      expect(entities.intent).toBe('count')
    })

    it('should detect list intent from "welke"', () => {
      const entities = extractEntitiesFast('Welke bibliotheken zijn er?')
      expect(entities.intent).toBe('list')
    })

    it('should detect list intent from "toon"', () => {
      const entities = extractEntitiesFast('Toon alle musea')
      expect(entities.intent).toBe('list')
    })

    it('should detect info intent from "wat is"', () => {
      const entities = extractEntitiesFast('Wat is een archief?')
      expect(entities.intent).toBe('info')
    })
  })

  describe('combined entity extraction', () => {
    it('should extract type, location, and intent together', () => {
      const entities = extractEntitiesFast('Hoeveel musea zijn er in Amsterdam?')
      expect(entities.institutionType).toBe('M')
      expect(entities.location).toBe('amsterdam')
      expect(entities.locationType).toBe('city')
      expect(entities.intent).toBe('count')
    })

    it('should prefer province over city when province is mentioned', () => {
      const entities = extractEntitiesFast('musea in noord-holland')
      expect(entities.location).toBe('NH')
      expect(entities.locationType).toBe('province')
    })
  })
})

/**
 * generateStructuredCacheKey: the expected keys below are lowercase,
 * colon-separated, with the subtype attached to the type by a dot and the
 * record set type as its own segment before the location.
 */
describe('generateStructuredCacheKey', () => {
  it('should generate key with intent, type, and location', () => {
    const entities: ExtractedEntities = {
      institutionType: 'M',
      location: 'amsterdam',
      intent: 'count',
    }
    expect(generateStructuredCacheKey(entities)).toBe('count:m:amsterdam')
  })

  it('should include subtype when present', () => {
    const entities: ExtractedEntities = {
      institutionType: 'M',
      institutionSubtype: 'ART_MUSEUM',
      location: 'amsterdam',
      intent: 'count',
    }
    expect(generateStructuredCacheKey(entities)).toBe('count:m.art_museum:amsterdam')
  })

  it('should include record set type when present', () => {
    const entities: ExtractedEntities = {
      institutionType: 'A',
      recordSetType: 'CIVIL_REGISTRY',
      location: 'NH',
      intent: 'list',
    }
    expect(generateStructuredCacheKey(entities)).toBe('list:a:civil_registry:nh')
  })

  it('should include both subtype and record set type', () => {
    const entities: ExtractedEntities = {
      institutionType: 'A',
      institutionSubtype: 'MUNICIPAL_ARCHIVE',
      recordSetType: 'CIVIL_REGISTRY',
      location: 'amsterdam',
      intent: 'list',
    }
    expect(generateStructuredCacheKey(entities)).toBe(
      'list:a.municipal_archive:civil_registry:amsterdam',
    )
  })

  it('should use defaults for missing fields', () => {
    const entities: ExtractedEntities = {}
    expect(generateStructuredCacheKey(entities)).toBe('query:any:nl')
  })

  it('should normalize subtype to snake_case', () => {
    const entities: ExtractedEntities = {
      institutionType: 'M',
      institutionSubtype: 'Art Museum',
      location: 'amsterdam',
    }
    expect(generateStructuredCacheKey(entities)).toBe('query:m.art_museum:amsterdam')
  })
})

/**
 * entitiesMatch(query, cached): the cases below pin an asymmetric rule - a
 * field set on the query must be matched by the cached entry, while a field
 * set only on the cached entry does not block a match.
 */
describe('entitiesMatch', () => {
  describe('location matching', () => {
    it('should match when locations are equal', () => {
      const query: ExtractedEntities = { location: 'amsterdam', institutionType: 'M' }
      const cached: ExtractedEntities = { location: 'amsterdam', institutionType: 'M' }
      expect(entitiesMatch(query, cached)).toBe(true)
    })

    it('should NOT match when locations differ', () => {
      const query: ExtractedEntities = { location: 'amsterdam', institutionType: 'M' }
      const cached: ExtractedEntities = { location: 'rotterdam', institutionType: 'M' }
      expect(entitiesMatch(query, cached)).toBe(false)
    })

    it('should NOT match when query has location but cached does not', () => {
      const query: ExtractedEntities = { location: 'amsterdam', institutionType: 'M' }
      const cached: ExtractedEntities = { institutionType: 'M' }
      expect(entitiesMatch(query, cached)).toBe(false)
    })

    it('should match when query has no location but cached does', () => {
      const query: ExtractedEntities = { institutionType: 'M' }
      const cached: ExtractedEntities = { location: 'amsterdam', institutionType: 'M' }
      expect(entitiesMatch(query, cached)).toBe(true)
    })
  })

  describe('institution type matching', () => {
    it('should match when types are equal', () => {
      const query: ExtractedEntities = { institutionType: 'M' }
      const cached: ExtractedEntities = { institutionType: 'M' }
      expect(entitiesMatch(query, cached)).toBe(true)
    })

    it('should NOT match when types differ', () => {
      const query: ExtractedEntities = { institutionType: 'M' }
      const cached: ExtractedEntities = { institutionType: 'A' }
      expect(entitiesMatch(query, cached)).toBe(false)
    })

    it('should NOT match when query has type but cached does not', () => {
      const query: ExtractedEntities = { institutionType: 'M' }
      const cached: ExtractedEntities = {}
      expect(entitiesMatch(query, cached)).toBe(false)
    })
  })

  describe('subtype matching (Rule 46)', () => {
    it('should match when subtypes are equal', () => {
      const query: ExtractedEntities = { institutionType: 'M', institutionSubtype: 'ART_MUSEUM' }
      const cached: ExtractedEntities = { institutionType: 'M', institutionSubtype: 'ART_MUSEUM' }
      expect(entitiesMatch(query, cached)).toBe(true)
    })

    it('should NOT match when subtypes differ', () => {
      const query: ExtractedEntities = { institutionType: 'M', institutionSubtype: 'ART_MUSEUM' }
      const cached: ExtractedEntities = {
        institutionType: 'M',
        institutionSubtype: 'HISTORY_MUSEUM',
      }
      expect(entitiesMatch(query, cached)).toBe(false)
    })

    it('should NOT match when query has subtype but cached does not (prevents kunstmuseum -> generic museum match)', () => {
      const query: ExtractedEntities = {
        institutionType: 'M',
        institutionSubtype: 'ART_MUSEUM',
        location: 'amsterdam',
      }
      const cached: ExtractedEntities = {
        institutionType: 'M',
        location: 'amsterdam',
        // No subtype - generic museum cached response
      }
      expect(entitiesMatch(query, cached)).toBe(false)
    })

    it('should match when query has no subtype but cached does (generic matches specific)', () => {
      const query: ExtractedEntities = {
        institutionType: 'M',
        location: 'amsterdam',
        // No subtype - generic museum query
      }
      const cached: ExtractedEntities = {
        institutionType: 'M',
        institutionSubtype: 'ART_MUSEUM',
        location: 'amsterdam',
      }
      expect(entitiesMatch(query, cached)).toBe(true)
    })
  })

  describe('record set type matching', () => {
    it('should match when record set types are equal', () => {
      const query: ExtractedEntities = { institutionType: 'A', recordSetType: 'CIVIL_REGISTRY' }
      const cached: ExtractedEntities = { institutionType: 'A', recordSetType: 'CIVIL_REGISTRY' }
      expect(entitiesMatch(query, cached)).toBe(true)
    })

    it('should NOT match when record set types differ', () => {
      const query: ExtractedEntities = { institutionType: 'A', recordSetType: 'CIVIL_REGISTRY' }
      const cached: ExtractedEntities = {
        institutionType: 'A',
        recordSetType: 'COUNCIL_GOVERNANCE',
      }
      expect(entitiesMatch(query, cached)).toBe(false)
    })

    it('should NOT match when query has record set type but cached does not', () => {
      const query: ExtractedEntities = {
        institutionType: 'A',
        recordSetType: 'CIVIL_REGISTRY',
        location: 'NH',
      }
      const cached: ExtractedEntities = {
        institutionType: 'A',
        location: 'NH',
      }
      expect(entitiesMatch(query, cached)).toBe(false)
    })
  })

  describe('combined matching - geographic false positives', () => {
    it('should prevent Amsterdam vs Rotterdam false positive', () => {
      const query: ExtractedEntities = {
        institutionType: 'M',
        location: 'amsterdam',
        intent: 'count',
      }
      const cached: ExtractedEntities = {
        institutionType: 'M',
        location: 'rotterdam',
        intent: 'count',
      }
      expect(entitiesMatch(query, cached)).toBe(false)
    })

    it('should prevent city vs province false positive', () => {
      const query: ExtractedEntities = {
        institutionType: 'M',
        location: 'amsterdam',
        locationType: 'city',
      }
      const cached: ExtractedEntities = {
        institutionType: 'M',
        location: 'NH',
        locationType: 'province',
      }
      expect(entitiesMatch(query, cached)).toBe(false)
    })
  })

  describe('combined matching - subtype false positives', () => {
    it('should prevent kunstmuseum vs generic museum false positive', () => {
      const query: ExtractedEntities = {
        institutionType: 'M',
        institutionSubtype: 'ART_MUSEUM',
        location: 'amsterdam',
        intent: 'count',
      }
      const cached: ExtractedEntities = {
        institutionType: 'M',
        location: 'amsterdam',
        intent: 'count',
        // No subtype
      }
      expect(entitiesMatch(query, cached)).toBe(false)
    })

    it('should prevent burgerlijke stand vs generic archive false positive', () => {
      const query: ExtractedEntities = {
        institutionType: 'A',
        recordSetType: 'CIVIL_REGISTRY',
        location: 'amsterdam',
      }
      const cached: ExtractedEntities = {
        institutionType: 'A',
        location: 'amsterdam',
        // No record set type
      }
      expect(entitiesMatch(query, cached)).toBe(false)
    })
  })
})

/**
 * normalizeQuery: expected outputs are lowercase, trimmed, punctuation-free
 * and single-spaced.
 */
describe('normalizeQuery', () => {
  it('should lowercase the query', () => {
    expect(normalizeQuery('Hoeveel MUSEA?')).toBe('hoeveel musea')
  })

  it('should trim whitespace', () => {
    expect(normalizeQuery(' musea ')).toBe('musea')
  })

  it('should replace punctuation with spaces', () => {
    expect(normalizeQuery('musea, archieven, en bibliotheken')).toBe(
      'musea archieven en bibliotheken',
    )
  })

  it('should collapse multiple spaces', () => {
    // The input must actually contain runs of spaces, otherwise this test
    // passes even when no collapsing happens.
    expect(normalizeQuery('musea    in   amsterdam')).toBe('musea in amsterdam')
  })
})

// ============================================================================
// Phase 2: Topological Distance Tests (Rule 46 Evolution)
// ============================================================================

/**
 * topologicalDistance: type codes are written as `BASE` or `BASE.SUBTYPE`.
 * The inline arithmetic comments divide the path length by 4.
 */
describe('topologicalDistance', () => {
  it('should return 0 for identical types', () => {
    expect(topologicalDistance('M', 'M')).toBe(0)
    expect(topologicalDistance('A', 'A')).toBe(0)
    expect(topologicalDistance('M.ART', 'M.ART')).toBe(0)
  })

  it('should return 0.5 for sibling subtypes (same parent)', () => {
    // M.ART and M.HISTORY are both children of M
    const dist = topologicalDistance('M.ART', 'M.HISTORY')
    expect(dist).toBeCloseTo(0.5, 1) // path: ART -> M -> HISTORY = 2 / 4 = 0.5
  })

  it('should return higher distance for different base types', () => {
    // M and A are siblings under root
    const dist = topologicalDistance('M', 'A')
    expect(dist).toBeCloseTo(0.5, 1) // path: M -> * -> A = 2 / 4 = 0.5
  })

  it('should return even higher distance for subtypes of different base types', () => {
    // M.ART and A.MUNICIPAL are in different branches
    const dist = topologicalDistance('M.ART', 'A.MUNICIPAL')
    expect(dist).toBeGreaterThan(0.5)
  })

  it('should handle unknown types gracefully', () => {
    // Unknown types should be treated as direct children of root
    const dist = topologicalDistance('UNKNOWN', 'M')
    expect(dist).toBeGreaterThanOrEqual(0)
    expect(dist).toBeLessThanOrEqual(1)
  })

  it('should be symmetric', () => {
    expect(topologicalDistance('M', 'A')).toBe(topologicalDistance('A', 'M'))
    expect(topologicalDistance('M.ART', 'L')).toBe(topologicalDistance('L', 'M.ART'))
  })
})

/**
 * combinedSimilarity(embeddingSimilarity, typeA, typeB): blends the embedding
 * score with a type-based score; the default weighting exercised here is
 * 0.7 embedding / 0.3 topology.
 */
describe('combinedSimilarity', () => {
  it('should return pure embedding similarity when no types provided', () => {
    const similarity = combinedSimilarity(0.95, undefined, undefined)
    expect(similarity).toBe(0.95)
  })

  it('should weight embedding similarity at 0.7 by default', () => {
    // Same type -> topological distance = 0 -> topo similarity = 1
    const similarity = combinedSimilarity(0.9, 'M', 'M')
    // 0.7 * 0.9 + 0.3 * 1.0 = 0.63 + 0.3 = 0.93
    expect(similarity).toBeCloseTo(0.93, 2)
  })

  it('should penalize different types even with high embedding similarity', () => {
    // Different types -> topological distance > 0 -> lower combined similarity
    const sameType = combinedSimilarity(0.9, 'M', 'M')
    const diffType = combinedSimilarity(0.9, 'M', 'A')
    expect(diffType).toBeLessThan(sameType)
  })

  it('should heavily penalize cross-branch subtype matches', () => {
    // M.ART vs A.MUNICIPAL - very different semantically
    const crossBranch = combinedSimilarity(0.92, 'M.ART', 'A.MUNICIPAL')
    // Even with 0.92 embedding similarity, the topological penalty should be significant
    expect(crossBranch).toBeLessThan(0.85)
  })
})

// ============================================================================
// Phase 5: Clarity Trading Tests (Rule 46 Evolution)
// ============================================================================

/**
 * calculateClarity(query, entities?): returns an object with `clarityScore`
 * and an `ambiguities` list of string tags. These tests treat 0.7 as the
 * pass threshold.
 */
describe('calculateClarity', () => {
  describe('ambiguity detection', () => {
    it('should penalize temporal vagueness without dates', () => {
      const result = calculateClarity('oude archieven in Amsterdam')
      expect(result.clarityScore).toBeLessThan(0.7)
      expect(result.ambiguities).toContain('temporal_vague')
    })

    it('should NOT penalize temporal terms with dates', () => {
      const result = calculateClarity('archieven uit 1950 in Amsterdam')
      expect(result.ambiguities).not.toContain('temporal_vague')
    })

    it('should penalize size vagueness', () => {
      const result = calculateClarity('grote musea')
      expect(result.ambiguities).toContain('size_vague')
    })

    it('should penalize quality vagueness', () => {
      const result = calculateClarity('beste bibliotheken')
      expect(result.ambiguities).toContain('quality_vague')
    })

    it('should penalize pronouns at start', () => {
      const result = calculateClarity('Het is een archief?')
      expect(result.ambiguities).toContain('pronoun_start')
    })

    it('should heavily penalize very short queries', () => {
      const result = calculateClarity('musea?')
      expect(result.clarityScore).toBeLessThan(0.5)
      expect(result.ambiguities).toContain('too_short')
    })
  })

  describe('clarity boosters', () => {
    it('should boost clarity for specific city', () => {
      const withCity = calculateClarity('musea in amsterdam')
      const withoutCity = calculateClarity('musea in nederland')
      expect(withCity.clarityScore).toBeGreaterThan(withoutCity.clarityScore)
    })

    it('should boost clarity for specific type', () => {
      // "museum" matches specific_type pattern (+0.10), "gebouw" does not
      const withType = calculateClarity('Ik zoek een museum om te bezoeken')
      const withoutType = calculateClarity('Ik zoek een gebouw om te bezoeken')
      expect(withType.clarityScore).toBeGreaterThan(withoutType.clarityScore)
    })

    it('should boost clarity for clear intent', () => {
      const withIntent = calculateClarity('hoeveel musea zijn er in amsterdam')
      const withoutIntent = calculateClarity('musea amsterdam')
      expect(withIntent.clarityScore).toBeGreaterThan(withoutIntent.clarityScore)
    })

    it('should boost clarity for identifiers', () => {
      const result = calculateClarity('wat is ISIL code NL-AsdAM')
      expect(result.clarityScore).toBeGreaterThanOrEqual(0.7)
    })

    it('should boost clarity for date ranges', () => {
      const result = calculateClarity('archieven 1800-1900 in amsterdam')
      expect(result.clarityScore).toBeGreaterThanOrEqual(0.7)
    })
  })

  describe('entity-based clarity boost', () => {
    it('should boost clarity when entities are extracted', () => {
      const entities: ExtractedEntities = {
        institutionType: 'M',
        location: 'amsterdam',
        intent: 'count',
      }
      const withEntities = calculateClarity('hoeveel musea amsterdam', entities)
      const withoutEntities = calculateClarity('hoeveel musea amsterdam')
      expect(withEntities.clarityScore).toBeGreaterThanOrEqual(withoutEntities.clarityScore)
    })

    it('should boost clarity for subtype specificity', () => {
      const entities: ExtractedEntities = {
        institutionType: 'M',
        institutionSubtype: 'ART_MUSEUM',
        location: 'amsterdam',
      }
      const result = calculateClarity('kunstmusea amsterdam', entities)
      expect(result.clarityScore).toBeGreaterThanOrEqual(0.7)
    })
  })

  describe('combined clarity scoring', () => {
    it('should pass threshold for clear, specific queries', () => {
      const result = calculateClarity('Hoeveel musea zijn er in Amsterdam?')
      expect(result.clarityScore).toBeGreaterThanOrEqual(0.7)
      expect(result.ambiguities.length).toBe(0)
    })

    it('should fail threshold for vague, short queries', () => {
      const result = calculateClarity('het oude?')
      expect(result.clarityScore).toBeLessThan(0.7)
      expect(result.ambiguities.length).toBeGreaterThan(0)
    })
  })
})

/**
 * extractEntitiesWithClarity: must return the same type / location / intent
 * fields checked for extractEntitiesFast, plus `clarityScore` and an optional
 * `ambiguities` list.
 */
describe('extractEntitiesWithClarity', () => {
  it('should include clarity score in extracted entities', () => {
    const entities = extractEntitiesWithClarity('Hoeveel musea zijn er in Amsterdam?')
    expect(entities.clarityScore).toBeDefined()
    expect(typeof entities.clarityScore).toBe('number')
  })

  it('should include ambiguities when present', () => {
    const entities = extractEntitiesWithClarity('oude grote musea')
    expect(entities.ambiguities).toBeDefined()
    expect(entities.ambiguities?.length).toBeGreaterThan(0)
  })

  it('should not include ambiguities array for clear queries', () => {
    const entities = extractEntitiesWithClarity('Hoeveel archieven zijn er in Rotterdam?')
    // Either no ambiguities or empty array
    expect(entities.ambiguities === undefined || entities.ambiguities.length === 0).toBe(true)
  })

  it('should still extract institution type', () => {
    const entities = extractEntitiesWithClarity('musea in amsterdam')
    expect(entities.institutionType).toBe('M')
  })

  it('should still extract location', () => {
    const entities = extractEntitiesWithClarity('musea in amsterdam')
    expect(entities.location).toBe('amsterdam')
  })

  it('should still extract intent', () => {
    const entities = extractEntitiesWithClarity('hoeveel musea zijn er')
    expect(entities.intent).toBe('count')
  })
})

// ============================================================================
// Phase 4: Message Handler Tests (Smalltalk-Inspired Introspection)
// ============================================================================

describe('explainLastDecision', () => {
  it('should return null or a trace object', () => {
    // Nothing is reset here, so the result depends on what ran earlier in this
    // module: null when no lookup has happened yet, otherwise the trace of a
    // previous lookup.
    const trace = explainLastDecision()
    // typeof null is also 'object'; the explicit null check documents intent.
    expect(trace === null || typeof trace === 'object').toBe(true)
  })
})