/**
 * Unit tests for semantic-cache.ts entity extraction and matching
 *
 * Tests Rule 46: Ontology-Driven Cache Segmentation
 * - Vocabulary-based entity extraction
 * - Structured cache key generation
 * - Entity matching for cache lookup
 *
 * Most suites are table-driven: each row carries the exact test title plus
 * the input and the expected value, and a small loop registers one `it` per
 * row. Titles are kept verbatim so test filters and reports stay stable.
 */

import { describe, it, expect, vi, beforeEach } from 'vitest'
import {
  extractEntitiesFast,
  generateStructuredCacheKey,
  entitiesMatch,
  normalizeQuery,
  type ExtractedEntities,
  type InstitutionTypeCode,
} from '../src/lib/semantic-cache'

describe('extractEntitiesFast', () => {
  describe('institution type detection', () => {
    // NOTE(review): the rows titled "archief" and "bibliotheek" feed the
    // plural forms ("archieven", "bibliotheken"), so the singular forms are
    // not exercised here. Inputs are left untouched because the vocabulary
    // lives in semantic-cache.ts; confirm and switch to singular queries if
    // that was the intent.
    const typeCases: ReadonlyArray<{
      title: string
      query: string
      expected: ExtractedEntities['institutionType']
    }> = [
      {
        title: 'should detect museum type from "musea"',
        query: 'Hoeveel musea zijn er in Amsterdam?',
        expected: 'M',
      },
      {
        title: 'should detect museum type from "museum"',
        query: 'Waar is het museum?',
        expected: 'M',
      },
      {
        title: 'should detect archive type from "archief"',
        query: 'Hoeveel archieven zijn er in Utrecht?',
        expected: 'A',
      },
      {
        title: 'should detect archive type from "archieven"',
        query: 'Toon alle archieven',
        expected: 'A',
      },
      {
        title: 'should detect library type from "bibliotheek"',
        query: 'Welke bibliotheken zijn er?',
        expected: 'L',
      },
      {
        title: 'should detect library type from "bibliotheken"',
        query: 'Hoeveel bibliotheken in Groningen?',
        expected: 'L',
      },
      {
        title: 'should detect gallery type from "gallerij"',
        query: 'Kunstgallerij in Amsterdam',
        expected: 'G',
      },
      {
        title: 'should detect education type from "universiteit"',
        query: 'Universiteit Utrecht collecties',
        expected: 'E',
      },
      {
        title: 'should detect holy sites from "kerk"',
        query: 'Welke kerken zijn er in Amsterdam?',
        expected: 'H',
      },
    ]

    for (const { title, query, expected } of typeCases) {
      it(title, () => {
        const entities = extractEntitiesFast(query)
        expect(entities.institutionType).toBe(expected)
      })
    }
  })

  describe('location detection - provinces', () => {
    // Every row must yield the given province code with locationType 'province'.
    const provinceCases: ReadonlyArray<{ title: string; query: string; code: string }> = [
      { title: 'should detect Noord-Holland province', query: 'musea in noord-holland', code: 'NH' },
      { title: 'should detect Zuid-Holland province', query: 'archieven in zuid-holland', code: 'ZH' },
      { title: 'should detect Utrecht province', query: 'bibliotheken in utrecht', code: 'UT' },
      { title: 'should detect Gelderland province', query: 'musea gelderland', code: 'GE' },
      { title: 'should detect Limburg province', query: 'archieven limburg', code: 'LI' },
    ]

    for (const { title, query, code } of provinceCases) {
      it(title, () => {
        const entities = extractEntitiesFast(query)
        expect(entities.location).toBe(code)
        expect(entities.locationType).toBe('province')
      })
    }
  })

  describe('location detection - cities', () => {
    // Every row must yield the given city key with locationType 'city'.
    const cityCases: ReadonlyArray<{ title: string; query: string; city: string }> = [
      { title: 'should detect Amsterdam', query: 'musea in amsterdam', city: 'amsterdam' },
      { title: 'should detect Rotterdam', query: 'archieven rotterdam', city: 'rotterdam' },
      { title: 'should detect Den Haag', query: 'bibliotheken den haag', city: 'denhaag' },
      { title: 'should detect Maastricht', query: 'musea maastricht', city: 'maastricht' },
    ]

    for (const { title, query, city } of cityCases) {
      it(title, () => {
        const entities = extractEntitiesFast(query)
        expect(entities.location).toBe(city)
        expect(entities.locationType).toBe('city')
      })
    }
  })

  describe('intent detection', () => {
    const intentCases: ReadonlyArray<{
      title: string
      query: string
      expected: ExtractedEntities['intent']
    }> = [
      {
        title: 'should detect count intent from "hoeveel"',
        query: 'Hoeveel musea zijn er?',
        expected: 'count',
      },
      {
        title: 'should detect count intent from "aantal"',
        query: 'Wat is het aantal archieven?',
        expected: 'count',
      },
      {
        title: 'should detect list intent from "welke"',
        query: 'Welke bibliotheken zijn er?',
        expected: 'list',
      },
      {
        title: 'should detect list intent from "toon"',
        query: 'Toon alle musea',
        expected: 'list',
      },
      {
        title: 'should detect info intent from "wat is"',
        query: 'Wat is een archief?',
        expected: 'info',
      },
    ]

    for (const { title, query, expected } of intentCases) {
      it(title, () => {
        const entities = extractEntitiesFast(query)
        expect(entities.intent).toBe(expected)
      })
    }
  })

  describe('combined entity extraction', () => {
    it('should extract type, location, and intent together', () => {
      const entities = extractEntitiesFast('Hoeveel musea zijn er in Amsterdam?')

      expect(entities.institutionType).toBe('M')
      expect(entities.location).toBe('amsterdam')
      expect(entities.locationType).toBe('city')
      expect(entities.intent).toBe('count')
    })

    // NOTE(review): this query names only a province, so it does not by
    // itself exercise a province-vs-city conflict; consider a query that
    // mentions both once the intended precedence is confirmed.
    it('should prefer province over city when province is mentioned', () => {
      const entities = extractEntitiesFast('musea in noord-holland')

      expect(entities.location).toBe('NH')
      expect(entities.locationType).toBe('province')
    })
  })
})

describe('generateStructuredCacheKey', () => {
  // Each row: the entities passed in and the exact key string expected back.
  const keyCases: ReadonlyArray<{ title: string; entities: ExtractedEntities; key: string }> = [
    {
      title: 'should generate key with intent, type, and location',
      entities: {
        institutionType: 'M',
        location: 'amsterdam',
        intent: 'count',
      },
      key: 'count:m:amsterdam',
    },
    {
      title: 'should include subtype when present',
      entities: {
        institutionType: 'M',
        institutionSubtype: 'ART_MUSEUM',
        location: 'amsterdam',
        intent: 'count',
      },
      key: 'count:m.art_museum:amsterdam',
    },
    {
      title: 'should include record set type when present',
      entities: {
        institutionType: 'A',
        recordSetType: 'CIVIL_REGISTRY',
        location: 'NH',
        intent: 'list',
      },
      key: 'list:a:civil_registry:nh',
    },
    {
      title: 'should include both subtype and record set type',
      entities: {
        institutionType: 'A',
        institutionSubtype: 'MUNICIPAL_ARCHIVE',
        recordSetType: 'CIVIL_REGISTRY',
        location: 'amsterdam',
        intent: 'list',
      },
      key: 'list:a.municipal_archive:civil_registry:amsterdam',
    },
    {
      title: 'should use defaults for missing fields',
      entities: {},
      key: 'query:any:nl',
    },
    {
      title: 'should normalize subtype to snake_case',
      entities: {
        institutionType: 'M',
        institutionSubtype: 'Art Museum',
        location: 'amsterdam',
      },
      key: 'query:m.art_museum:amsterdam',
    },
  ]

  for (const { title, entities, key } of keyCases) {
    it(title, () => {
      expect(generateStructuredCacheKey(entities)).toBe(key)
    })
  }
})

describe('entitiesMatch', () => {
  /** One scenario: entities of the incoming query, entities stored with the cached entry, and the expected verdict. */
  type MatchCase = {
    title: string
    query: ExtractedEntities
    cached: ExtractedEntities
    expected: boolean
  }

  /** Registers one `it` per scenario, asserting `entitiesMatch(query, cached)` against `expected`. */
  const registerMatchCases = (cases: ReadonlyArray<MatchCase>): void => {
    for (const { title, query, cached, expected } of cases) {
      it(title, () => {
        expect(entitiesMatch(query, cached)).toBe(expected)
      })
    }
  }

  describe('location matching', () => {
    registerMatchCases([
      {
        title: 'should match when locations are equal',
        query: { location: 'amsterdam', institutionType: 'M' },
        cached: { location: 'amsterdam', institutionType: 'M' },
        expected: true,
      },
      {
        title: 'should NOT match when locations differ',
        query: { location: 'amsterdam', institutionType: 'M' },
        cached: { location: 'rotterdam', institutionType: 'M' },
        expected: false,
      },
      {
        title: 'should NOT match when query has location but cached does not',
        query: { location: 'amsterdam', institutionType: 'M' },
        cached: { institutionType: 'M' },
        expected: false,
      },
      {
        title: 'should match when query has no location but cached does',
        query: { institutionType: 'M' },
        cached: { location: 'amsterdam', institutionType: 'M' },
        expected: true,
      },
    ])
  })

  describe('institution type matching', () => {
    registerMatchCases([
      {
        title: 'should match when types are equal',
        query: { institutionType: 'M' },
        cached: { institutionType: 'M' },
        expected: true,
      },
      {
        title: 'should NOT match when types differ',
        query: { institutionType: 'M' },
        cached: { institutionType: 'A' },
        expected: false,
      },
      {
        title: 'should NOT match when query has type but cached does not',
        query: { institutionType: 'M' },
        cached: {},
        expected: false,
      },
    ])
  })

  describe('subtype matching (Rule 46)', () => {
    registerMatchCases([
      {
        title: 'should match when subtypes are equal',
        query: { institutionType: 'M', institutionSubtype: 'ART_MUSEUM' },
        cached: { institutionType: 'M', institutionSubtype: 'ART_MUSEUM' },
        expected: true,
      },
      {
        title: 'should NOT match when subtypes differ',
        query: { institutionType: 'M', institutionSubtype: 'ART_MUSEUM' },
        cached: { institutionType: 'M', institutionSubtype: 'HISTORY_MUSEUM' },
        expected: false,
      },
      {
        title:
          'should NOT match when query has subtype but cached does not (prevents kunstmuseum -> generic museum match)',
        query: {
          institutionType: 'M',
          institutionSubtype: 'ART_MUSEUM',
          location: 'amsterdam',
        },
        cached: {
          institutionType: 'M',
          location: 'amsterdam',
          // No subtype - generic museum cached response
        },
        expected: false,
      },
      {
        title: 'should match when query has no subtype but cached does (generic matches specific)',
        query: {
          institutionType: 'M',
          location: 'amsterdam',
          // No subtype - generic museum query
        },
        cached: {
          institutionType: 'M',
          institutionSubtype: 'ART_MUSEUM',
          location: 'amsterdam',
        },
        expected: true,
      },
    ])
  })

  describe('record set type matching', () => {
    registerMatchCases([
      {
        title: 'should match when record set types are equal',
        query: { institutionType: 'A', recordSetType: 'CIVIL_REGISTRY' },
        cached: { institutionType: 'A', recordSetType: 'CIVIL_REGISTRY' },
        expected: true,
      },
      {
        title: 'should NOT match when record set types differ',
        query: { institutionType: 'A', recordSetType: 'CIVIL_REGISTRY' },
        cached: { institutionType: 'A', recordSetType: 'COUNCIL_GOVERNANCE' },
        expected: false,
      },
      {
        title: 'should NOT match when query has record set type but cached does not',
        query: {
          institutionType: 'A',
          recordSetType: 'CIVIL_REGISTRY',
          location: 'NH',
        },
        cached: {
          institutionType: 'A',
          location: 'NH',
        },
        expected: false,
      },
    ])
  })

  describe('combined matching - geographic false positives', () => {
    registerMatchCases([
      {
        title: 'should prevent Amsterdam vs Rotterdam false positive',
        query: {
          institutionType: 'M',
          location: 'amsterdam',
          intent: 'count',
        },
        cached: {
          institutionType: 'M',
          location: 'rotterdam',
          intent: 'count',
        },
        expected: false,
      },
      {
        title: 'should prevent city vs province false positive',
        query: {
          institutionType: 'M',
          location: 'amsterdam',
          locationType: 'city',
        },
        cached: {
          institutionType: 'M',
          location: 'NH',
          locationType: 'province',
        },
        expected: false,
      },
    ])
  })

  describe('combined matching - subtype false positives', () => {
    registerMatchCases([
      {
        title: 'should prevent kunstmuseum vs generic museum false positive',
        query: {
          institutionType: 'M',
          institutionSubtype: 'ART_MUSEUM',
          location: 'amsterdam',
          intent: 'count',
        },
        cached: {
          institutionType: 'M',
          location: 'amsterdam',
          intent: 'count',
          // No subtype
        },
        expected: false,
      },
      {
        title: 'should prevent burgerlijke stand vs generic archive false positive',
        query: {
          institutionType: 'A',
          recordSetType: 'CIVIL_REGISTRY',
          location: 'amsterdam',
        },
        cached: {
          institutionType: 'A',
          location: 'amsterdam',
          // No record set type
        },
        expected: false,
      },
    ])
  })
})

describe('normalizeQuery', () => {
  // Each row: raw input and the exact normalized string expected back.
  const normalizeCases: ReadonlyArray<{ title: string; raw: string; normalized: string }> = [
    { title: 'should lowercase the query', raw: 'Hoeveel MUSEA?', normalized: 'hoeveel musea' },
    { title: 'should trim whitespace', raw: ' musea ', normalized: 'musea' },
    {
      title: 'should replace punctuation with spaces',
      raw: 'musea, archieven, en bibliotheken',
      normalized: 'musea archieven en bibliotheken',
    },
    {
      title: 'should collapse multiple spaces',
      // The input must actually contain runs of spaces; with single spaces
      // the input already equals the expected output and the test proves nothing.
      raw: 'musea    in   amsterdam',
      normalized: 'musea in amsterdam',
    },
  ]

  for (const { title, raw, normalized } of normalizeCases) {
    it(title, () => {
      expect(normalizeQuery(raw)).toBe(normalized)
    })
  }
})