445 lines
15 KiB
TypeScript
445 lines
15 KiB
TypeScript
/**
 * Unit tests for semantic-cache.ts entity extraction and matching
 *
 * Tests Rule 46: Ontology-Driven Cache Segmentation
 * - Vocabulary-based entity extraction
 * - Structured cache key generation
 * - Entity matching for cache lookup
 */
|
|
|
|
import { describe, it, expect, vi, beforeEach } from 'vitest'
|
|
import {
|
|
extractEntitiesFast,
|
|
generateStructuredCacheKey,
|
|
entitiesMatch,
|
|
normalizeQuery,
|
|
type ExtractedEntities,
|
|
type InstitutionTypeCode,
|
|
} from '../src/lib/semantic-cache'
|
|
|
|
/**
 * Tests for `extractEntitiesFast`.
 *
 * Each test feeds a short Dutch query and checks one or more fields of the
 * returned entities: `institutionType` (single-letter code), `location`
 * together with `locationType`, and `intent`.
 */
describe('extractEntitiesFast', () => {
  describe('institution type detection', () => {
    it('should detect museum type from "musea"', () => {
      const entities = extractEntitiesFast('Hoeveel musea zijn er in Amsterdam?')
      expect(entities.institutionType).toBe('M')
    })

    it('should detect museum type from "museum"', () => {
      const entities = extractEntitiesFast('Waar is het museum?')
      expect(entities.institutionType).toBe('M')
    })

    it('should detect archive type from "archief"', () => {
      // Use the singular form: "archieven" does not contain "archief", so the
      // plural query would only repeat the next test.
      const entities = extractEntitiesFast('Waar is het archief?')
      expect(entities.institutionType).toBe('A')
    })

    it('should detect archive type from "archieven"', () => {
      const entities = extractEntitiesFast('Toon alle archieven')
      expect(entities.institutionType).toBe('A')
    })

    it('should detect library type from "bibliotheek"', () => {
      // Use the singular form: "bibliotheken" does not contain "bibliotheek",
      // so the plural query would only repeat the next test.
      const entities = extractEntitiesFast('Waar is de bibliotheek?')
      expect(entities.institutionType).toBe('L')
    })

    it('should detect library type from "bibliotheken"', () => {
      const entities = extractEntitiesFast('Hoeveel bibliotheken in Groningen?')
      expect(entities.institutionType).toBe('L')
    })

    it('should detect gallery type from "gallerij"', () => {
      // The keyword appears inside a compound word here.
      const entities = extractEntitiesFast('Kunstgallerij in Amsterdam')
      expect(entities.institutionType).toBe('G')
    })

    it('should detect education type from "universiteit"', () => {
      const entities = extractEntitiesFast('Universiteit Utrecht collecties')
      expect(entities.institutionType).toBe('E')
    })

    it('should detect holy sites from "kerk"', () => {
      const entities = extractEntitiesFast('Welke kerken zijn er in Amsterdam?')
      expect(entities.institutionType).toBe('H')
    })
  })

  describe('location detection - provinces', () => {
    it('should detect Noord-Holland province', () => {
      const entities = extractEntitiesFast('musea in noord-holland')
      expect(entities.location).toBe('NH')
      expect(entities.locationType).toBe('province')
    })

    it('should detect Zuid-Holland province', () => {
      const entities = extractEntitiesFast('archieven in zuid-holland')
      expect(entities.location).toBe('ZH')
      expect(entities.locationType).toBe('province')
    })

    it('should detect Utrecht province', () => {
      // "utrecht" is expected to resolve to the province code, not a city.
      const entities = extractEntitiesFast('bibliotheken in utrecht')
      expect(entities.location).toBe('UT')
      expect(entities.locationType).toBe('province')
    })

    it('should detect Gelderland province', () => {
      const entities = extractEntitiesFast('musea gelderland')
      expect(entities.location).toBe('GE')
      expect(entities.locationType).toBe('province')
    })

    it('should detect Limburg province', () => {
      const entities = extractEntitiesFast('archieven limburg')
      expect(entities.location).toBe('LI')
      expect(entities.locationType).toBe('province')
    })
  })

  describe('location detection - cities', () => {
    it('should detect Amsterdam', () => {
      const entities = extractEntitiesFast('musea in amsterdam')
      expect(entities.location).toBe('amsterdam')
      expect(entities.locationType).toBe('city')
    })

    it('should detect Rotterdam', () => {
      const entities = extractEntitiesFast('archieven rotterdam')
      expect(entities.location).toBe('rotterdam')
      expect(entities.locationType).toBe('city')
    })

    it('should detect Den Haag', () => {
      // The two-word city name is expected as a single lowercase key.
      const entities = extractEntitiesFast('bibliotheken den haag')
      expect(entities.location).toBe('denhaag')
      expect(entities.locationType).toBe('city')
    })

    it('should detect Maastricht', () => {
      const entities = extractEntitiesFast('musea maastricht')
      expect(entities.location).toBe('maastricht')
      expect(entities.locationType).toBe('city')
    })
  })

  describe('intent detection', () => {
    it('should detect count intent from "hoeveel"', () => {
      const entities = extractEntitiesFast('Hoeveel musea zijn er?')
      expect(entities.intent).toBe('count')
    })

    it('should detect count intent from "aantal"', () => {
      const entities = extractEntitiesFast('Wat is het aantal archieven?')
      expect(entities.intent).toBe('count')
    })

    it('should detect list intent from "welke"', () => {
      const entities = extractEntitiesFast('Welke bibliotheken zijn er?')
      expect(entities.intent).toBe('list')
    })

    it('should detect list intent from "toon"', () => {
      const entities = extractEntitiesFast('Toon alle musea')
      expect(entities.intent).toBe('list')
    })

    it('should detect info intent from "wat is"', () => {
      const entities = extractEntitiesFast('Wat is een archief?')
      expect(entities.intent).toBe('info')
    })
  })

  describe('combined entity extraction', () => {
    it('should extract type, location, and intent together', () => {
      const entities = extractEntitiesFast('Hoeveel musea zijn er in Amsterdam?')
      expect(entities.institutionType).toBe('M')
      expect(entities.location).toBe('amsterdam')
      expect(entities.locationType).toBe('city')
      expect(entities.intent).toBe('count')
    })

    it('should prefer province over city when province is mentioned', () => {
      // The query names both a city and a province so the preference is
      // actually exercised; a province-only query cannot show it.
      const entities = extractEntitiesFast('musea in amsterdam, noord-holland')
      expect(entities.location).toBe('NH')
      expect(entities.locationType).toBe('province')
    })
  })
})
|
|
|
|
/**
 * Table-driven tests for `generateStructuredCacheKey`.
 *
 * Every row pairs a set of extracted entities with the exact cache key string
 * the function is expected to return for them.
 */
describe('generateStructuredCacheKey', () => {
  // Row layout: [test title, input entities, expected cache key]
  const keyCases: Array<[string, ExtractedEntities, string]> = [
    [
      'should generate key with intent, type, and location',
      { institutionType: 'M', location: 'amsterdam', intent: 'count' },
      'count:m:amsterdam',
    ],
    [
      'should include subtype when present',
      {
        institutionType: 'M',
        institutionSubtype: 'ART_MUSEUM',
        location: 'amsterdam',
        intent: 'count',
      },
      'count:m.art_museum:amsterdam',
    ],
    [
      'should include record set type when present',
      {
        institutionType: 'A',
        recordSetType: 'CIVIL_REGISTRY',
        location: 'NH',
        intent: 'list',
      },
      'list:a:civil_registry:nh',
    ],
    [
      'should include both subtype and record set type',
      {
        institutionType: 'A',
        institutionSubtype: 'MUNICIPAL_ARCHIVE',
        recordSetType: 'CIVIL_REGISTRY',
        location: 'amsterdam',
        intent: 'list',
      },
      'list:a.municipal_archive:civil_registry:amsterdam',
    ],
    [
      // An empty entity set is expected to yield the default key.
      'should use defaults for missing fields',
      {},
      'query:any:nl',
    ],
    [
      // A free-form subtype with a space is expected in snake_case in the key.
      'should normalize subtype to snake_case',
      { institutionType: 'M', institutionSubtype: 'Art Museum', location: 'amsterdam' },
      'query:m.art_museum:amsterdam',
    ],
  ]

  it.each(keyCases)('%s', (_title, entities, expectedKey) => {
    expect(generateStructuredCacheKey(entities)).toBe(expectedKey)
  })
})
|
|
|
|
/**
 * Table-driven tests for `entitiesMatch(query, cached)`.
 *
 * Each row gives the entities extracted from an incoming query, the entities
 * stored with a cached entry, and whether the two are expected to match.
 */
describe('entitiesMatch', () => {
  // Row layout: [test title, query entities, cached entities, expected result]
  type MatchCase = [string, ExtractedEntities, ExtractedEntities, boolean]

  // Registers one test per row, using the row's title as the test name.
  const runMatchCases = (rows: MatchCase[]): void => {
    it.each(rows)('%s', (_title, query, cached, shouldMatch) => {
      expect(entitiesMatch(query, cached)).toBe(shouldMatch)
    })
  }

  describe('location matching', () => {
    runMatchCases([
      [
        'should match when locations are equal',
        { location: 'amsterdam', institutionType: 'M' },
        { location: 'amsterdam', institutionType: 'M' },
        true,
      ],
      [
        'should NOT match when locations differ',
        { location: 'amsterdam', institutionType: 'M' },
        { location: 'rotterdam', institutionType: 'M' },
        false,
      ],
      [
        'should NOT match when query has location but cached does not',
        { location: 'amsterdam', institutionType: 'M' },
        { institutionType: 'M' },
        false,
      ],
      [
        'should match when query has no location but cached does',
        { institutionType: 'M' },
        { location: 'amsterdam', institutionType: 'M' },
        true,
      ],
    ])
  })

  describe('institution type matching', () => {
    runMatchCases([
      [
        'should match when types are equal',
        { institutionType: 'M' },
        { institutionType: 'M' },
        true,
      ],
      [
        'should NOT match when types differ',
        { institutionType: 'M' },
        { institutionType: 'A' },
        false,
      ],
      [
        'should NOT match when query has type but cached does not',
        { institutionType: 'M' },
        {},
        false,
      ],
    ])
  })

  describe('subtype matching (Rule 46)', () => {
    runMatchCases([
      [
        'should match when subtypes are equal',
        { institutionType: 'M', institutionSubtype: 'ART_MUSEUM' },
        { institutionType: 'M', institutionSubtype: 'ART_MUSEUM' },
        true,
      ],
      [
        'should NOT match when subtypes differ',
        { institutionType: 'M', institutionSubtype: 'ART_MUSEUM' },
        { institutionType: 'M', institutionSubtype: 'HISTORY_MUSEUM' },
        false,
      ],
      [
        'should NOT match when query has subtype but cached does not (prevents kunstmuseum -> generic museum match)',
        { institutionType: 'M', institutionSubtype: 'ART_MUSEUM', location: 'amsterdam' },
        // No subtype - generic museum cached response
        { institutionType: 'M', location: 'amsterdam' },
        false,
      ],
      [
        'should match when query has no subtype but cached does (generic matches specific)',
        // No subtype - generic museum query
        { institutionType: 'M', location: 'amsterdam' },
        { institutionType: 'M', institutionSubtype: 'ART_MUSEUM', location: 'amsterdam' },
        true,
      ],
    ])
  })

  describe('record set type matching', () => {
    runMatchCases([
      [
        'should match when record set types are equal',
        { institutionType: 'A', recordSetType: 'CIVIL_REGISTRY' },
        { institutionType: 'A', recordSetType: 'CIVIL_REGISTRY' },
        true,
      ],
      [
        'should NOT match when record set types differ',
        { institutionType: 'A', recordSetType: 'CIVIL_REGISTRY' },
        { institutionType: 'A', recordSetType: 'COUNCIL_GOVERNANCE' },
        false,
      ],
      [
        'should NOT match when query has record set type but cached does not',
        { institutionType: 'A', recordSetType: 'CIVIL_REGISTRY', location: 'NH' },
        { institutionType: 'A', location: 'NH' },
        false,
      ],
    ])
  })

  describe('combined matching - geographic false positives', () => {
    runMatchCases([
      [
        'should prevent Amsterdam vs Rotterdam false positive',
        { institutionType: 'M', location: 'amsterdam', intent: 'count' },
        { institutionType: 'M', location: 'rotterdam', intent: 'count' },
        false,
      ],
      [
        'should prevent city vs province false positive',
        { institutionType: 'M', location: 'amsterdam', locationType: 'city' },
        { institutionType: 'M', location: 'NH', locationType: 'province' },
        false,
      ],
    ])
  })

  describe('combined matching - subtype false positives', () => {
    runMatchCases([
      [
        'should prevent kunstmuseum vs generic museum false positive',
        {
          institutionType: 'M',
          institutionSubtype: 'ART_MUSEUM',
          location: 'amsterdam',
          intent: 'count',
        },
        // No subtype
        { institutionType: 'M', location: 'amsterdam', intent: 'count' },
        false,
      ],
      [
        'should prevent burgerlijke stand vs generic archive false positive',
        { institutionType: 'A', recordSetType: 'CIVIL_REGISTRY', location: 'amsterdam' },
        // No record set type
        { institutionType: 'A', location: 'amsterdam' },
        false,
      ],
    ])
  })
})
|
|
|
|
/**
 * Tests for `normalizeQuery`.
 *
 * The expectations below cover lowercasing, trimming of surrounding
 * whitespace, replacing punctuation with spaces, and collapsing runs of
 * spaces into one.
 */
describe('normalizeQuery', () => {
  it('should lowercase the query', () => {
    // The trailing "?" is also expected to disappear from the result.
    expect(normalizeQuery('Hoeveel MUSEA?')).toBe('hoeveel musea')
  })

  it('should trim whitespace', () => {
    expect(normalizeQuery(' musea ')).toBe('musea')
  })

  it('should replace punctuation with spaces', () => {
    expect(normalizeQuery('musea, archieven, en bibliotheken')).toBe('musea archieven en bibliotheken')
  })

  it('should collapse multiple spaces', () => {
    // The input must contain runs of several spaces; with single spaces the
    // assertion would pass without any collapsing taking place.
    expect(normalizeQuery('musea    in   amsterdam')).toBe('musea in amsterdam')
  })
})
|