diff --git a/.opencode/rules/ontology-driven-cache-segmentation.md b/.opencode/rules/ontology-driven-cache-segmentation.md index 7a49f677d9..e230c8bd2e 100644 --- a/.opencode/rules/ontology-driven-cache-segmentation.md +++ b/.opencode/rules/ontology-driven-cache-segmentation.md @@ -2,6 +2,78 @@ 🚨 **CRITICAL**: The semantic cache MUST use vocabulary derived from LinkML `*Type.yaml` and `*Types.yaml` schema files to extract entities for cache key generation. Hardcoded regex patterns are deprecated. +**Status**: Implemented (Evolved v2.0) +**Version**: 2.0 (Epistemological Evolution) +**Updated**: 2026-01-10 + +## Evolution Overview + +Rule 46 v2.0 incorporates insights from Volodymyr Pavlyshyn's work on agentic memory systems: + +1. **Epistemic Provenance** (Phase 1) - Track WHERE, WHEN, HOW data originated +2. **Topological Distance** (Phase 2) - Use ontology structure, not just embeddings +3. **Holarchic Cache** (Phase 3) - Entries as holons with up/down links +4. **Message Passing** (Phase 4, planned) - Smalltalk-style introspectable cache +5. **Clarity Trading** (Phase 5, planned) - Block ambiguous queries from cache + +## Epistemic Provenance + +Every cached response carries epistemological metadata: + +```typescript +interface EpistemicProvenance { + dataSource: 'ISIL_REGISTRY' | 'WIKIDATA' | 'CUSTODIAN_YAML' | 'LLM_INFERENCE' | ...; + dataTier: 1 | 2 | 3 | 4; // TIER_1_AUTHORITATIVE → TIER_4_INFERRED + sourceTimestamp: string; + derivationChain: string[]; // ["SPARQL:Qdrant", "RAG:retrieve", "LLM:generate"] + revalidationPolicy: 'static' | 'daily' | 'weekly' | 'on_access'; +} +``` + +**Benefit**: Users see "This answer is from TIER_1 ISIL registry data, captured 2025-01-08". 
+ +## Topological Distance + +Beyond embedding similarity, cache matching considers **structural distance** in the type hierarchy: + +``` + HeritageCustodian (*) + │ + ┌──────────────────┼──────────────────┐ + ▼ ▼ ▼ + MuseumType (M) ArchiveType (A) LibraryType (L) + │ │ │ + ┌────┴────┐ ┌────┴────┐ ┌────┴────┐ + ▼ ▼ ▼ ▼ ▼ ▼ +ArtMuseum History Municipal State Public Academic +``` + +**Combined Similarity Formula**: +```typescript +finalScore = 0.7 * embeddingSimilarity + 0.3 * (1 - topologicalDistance) +``` + +**Benefit**: "Art museum" won't match "natural history museum" even with 95% embedding similarity. + +## Holarchic Cache Structure + +Cache entries are **holons** - simultaneously complete AND parts of aggregates: + +| Level | Example | Aggregates | +|-------|---------|------------| +| Micro | "Rijksmuseum details" | None | +| Meso | "Museums in Amsterdam" | List of micro holons | +| Macro | "Heritage in Noord-Holland" | All meso holons in region | + +```typescript +interface CachedQuery { + // ... existing fields ... + holonLevel?: 'micro' | 'meso' | 'macro'; + participatesIn?: string[]; // Higher-level cache keys + aggregates?: string[]; // Lower-level entries +} +``` + ## Problem Statement The ArchiefAssistent semantic cache prevents geographic false positives using entity extraction: @@ -498,6 +570,14 @@ Key test cases: --- -**Created**: 2026-01-10 -**Author**: OpenCode Agent -**Status**: Implementing +**Created**: 2026-01-10 +**Author**: OpenCode Agent +**Status**: Implemented (v2.0) + +## References + +- Pavlyshyn, V. "Context Graphs and Data Traces: Building Epistemology Layers for Agentic Memory" +- Pavlyshyn, V. "The Shape of Knowledge: Topology Theory for Knowledge Graphs" +- Pavlyshyn, V. "Beyond Hierarchy: Why Agentic AI Systems Need Holarchies" +- Pavlyshyn, V. "Smalltalk: The Language That Changed Everything" +- Pavlyshyn, V. 
"Clarity Traders: Beyond Vibe Coding" diff --git a/apps/archief-assistent/src/lib/semantic-cache.ts b/apps/archief-assistent/src/lib/semantic-cache.ts index 33610df12d..a23360a06f 100644 --- a/apps/archief-assistent/src/lib/semantic-cache.ts +++ b/apps/archief-assistent/src/lib/semantic-cache.ts @@ -33,6 +33,85 @@ */ export type InstitutionTypeCode = 'G' | 'L' | 'A' | 'M' | 'O' | 'R' | 'C' | 'U' | 'B' | 'E' | 'S' | 'F' | 'I' | 'X' | 'P' | 'H' | 'D' | 'N' | 'T'; +// ============================================================================ +// Epistemic Provenance (Phase 1 - Rule 46 Evolution) +// ============================================================================ +// Based on Pavlyshyn's "Context Graphs and Data Traces": Every cached response +// should carry epistemological metadata about WHERE, WHEN, HOW, and WHAT KIND +// of epistemic status the data holds. This enables revalidation and trust. + +/** + * Data source types aligned with project's data tier system. + * See: AGENTS.md Rule 22 (Custodian YAML Files Are the Single Source of Truth) + */ +export type EpistemicDataSource = + | 'ISIL_REGISTRY' // Dutch ISIL codes CSV (TIER_1) + | 'WIKIDATA' // Wikidata SPARQL endpoint (TIER_3) + | 'CUSTODIAN_YAML' // data/custodian/*.yaml files (TIER_1) + | 'GOOGLE_MAPS' // Google Places API (TIER_3) + | 'WEB_SCRAPE' // Website content with XPath provenance (TIER_2) + | 'LLM_INFERENCE' // Generated by LLM without verifiable source (TIER_4) + | 'SPARQL_QUERY' // Direct SPARQL query result (TIER_1-3 depending on source) + | 'RAG_PIPELINE' // Full RAG retrieval + generation (mixed tiers) + | 'USER_PROVIDED' // User-submitted data (TIER_4 until verified) + | 'CACHE_AGGREGATION'; // Computed from other cached entries + +/** + * Data quality tiers aligned with AGENTS.md provenance model. + * Higher tier = higher authority = more trustworthy. 
+ */ +export type DataTier = 1 | 2 | 3 | 4; + +/** + * Epistemic provenance tracks the justification chain for cached knowledge. + * + * This transforms the cache from a collection of answers into a *justified* + * knowledge base where each response carries information about: + * - WHERE the information originated + * - WHEN it was captured + * - HOW it was derived + * - WHAT KIND of epistemic status it holds + * + * @see https://volodymyrpavlyshyn.medium.com/context-graphs-and-data-traces-building-epistemology-layers-for-agentic-memory-64ee876c846f + */ +export interface EpistemicProvenance { + /** Primary source of the cached data */ + dataSource: EpistemicDataSource; + + /** Data quality tier (1=authoritative, 4=inferred) */ + dataTier: DataTier; + + /** ISO 8601 timestamp when the source data was captured/queried */ + sourceTimestamp: string; + + /** + * Derivation chain showing how the answer was produced. + * Example: ["SPARQL:Qdrant", "RAG:retrieve", "LLM:generate"] + */ + derivationChain: string[]; + + /** + * When this cache entry should be revalidated. + * - 'static': Never revalidate (e.g., historical facts) + * - 'daily': Revalidate after 24 hours + * - 'weekly': Revalidate after 7 days + * - 'on_access': Revalidate every time (expensive, for volatile data) + */ + revalidationPolicy: 'static' | 'daily' | 'weekly' | 'on_access'; + + /** ISO 8601 timestamp of last revalidation (if applicable) */ + lastRevalidated?: string; + + /** Confidence score (0.0 - 1.0) based on source quality and derivation */ + confidenceScore?: number; + + /** Source URLs or identifiers for traceability */ + sourceReferences?: string[]; + + /** Notes about any caveats or limitations */ + epistemicNotes?: string; +} + /** * Entities extracted from a query for structured cache key generation. * Used to prevent geographic false positives (e.g., "Amsterdam" vs "Noord-Holland"). 
@@ -52,6 +131,72 @@ export interface ExtractedEntities { intent?: 'count' | 'list' | 'info' | null; /** Method used for entity extraction */ extractionMethod?: 'vocabulary' | 'regex' | 'embedding'; + + // ============================================================================ + // Phase 5: Clarity Trading (Rule 46 Evolution) + // ============================================================================ + /** + * Clarity score (0.0 - 1.0) indicating how unambiguous the query is. + * Queries with clarityScore < 0.7 should bypass cache and go to RAG. + */ + clarityScore?: number; + /** Identified ambiguities that reduce clarity */ + ambiguities?: string[]; +} + +// ============================================================================ +// Phase 4: Message-Passing Protocol (Smalltalk-Inspired) +// ============================================================================ +// Based on Pavlyshyn's "Smalltalk: The Language That Changed Everything": +// Queries should be MESSAGES to holons, not function calls. 
+ +export type CacheMessageType = 'LOOKUP' | 'STORE' | 'INVALIDATE' | 'EXPLAIN'; + +export interface CacheMessage { + type: CacheMessageType; + /** Smalltalk-style message selector (e.g., "count:archives:municipal:GE") */ + selector: string; + arguments: { + query?: string; + embedding?: number[] | null; + response?: CachedResponse; + entities?: ExtractedEntities; + }; + /** Timestamp when message was created */ + timestamp: number; +} + +export interface CacheDecisionTrace { + /** The original query */ + query: string; + /** Extracted entities */ + entities: ExtractedEntities; + /** Structured cache key used */ + structuredKey: string; + /** Whether cache hit occurred */ + hit: boolean; + /** Tier where hit occurred (if any) */ + tier?: 'local' | 'shared'; + /** Match method used */ + method: 'semantic' | 'fuzzy' | 'exact' | 'structured' | 'none'; + /** Similarity score */ + similarity: number; + /** Topological distance (if applicable) */ + topologicalDistance?: number; + /** Why this decision was made */ + reasoning: string; + /** Epistemic provenance of the cached entry (if hit) */ + provenance?: EpistemicProvenance; + /** Time taken for lookup */ + lookupTimeMs: number; +} + +export interface CacheMessageResponse { + success: boolean; + result?: CacheLookupResult; + /** Decision trace for explainability */ + trace?: CacheDecisionTrace; + error?: string; } export interface CachedQuery { @@ -70,6 +215,18 @@ export interface CachedQuery { entities?: ExtractedEntities; /** Structured cache key derived from entities (e.g., "count:M:amsterdam") */ structuredKey?: string; + /** Epistemic provenance for trust and revalidation (Phase 1 - Rule 46 Evolution) */ + epistemicProvenance?: EpistemicProvenance; + + // ============================================================================ + // Holonic Cache Properties (Phase 3 - Rule 46 Evolution) + // ============================================================================ + /** Holon level: micro (single entity), 
meso (type+location), macro (aggregate) */
+  holonLevel?: 'micro' | 'meso' | 'macro';
+  /** Cache keys this entry participates in (upward links in holarchy) */
+  participatesIn?: string[];
+  /** Cache keys this entry aggregates (downward links in holarchy) */
+  aggregates?: string[];
 }
 
 export interface CachedResponse {
@@ -228,6 +385,366 @@ function generateCacheId(): string {
   return `cache_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
 }
 
+// ============================================================================
+// Type Hierarchy DAG (Phase 2 - Topological Distance)
+// ============================================================================
+// Based on Pavlyshyn's "The Shape of Knowledge": Embeddings lose structural
+// information. The SHAPE of the query's relationship to the ontology matters.
+
+/**
+ * GLAMORCUBESFIXPHDNT type hierarchy as a DAG.
+ * Each type maps to its parent types (toward HeritageCustodian root).
+ */
+const TYPE_HIERARCHY: Record<string, string[]> = {
+  // Root
+  '*': [], // HeritageCustodian (universal root)
+
+  // Tier 1: Base types (direct children of root)
+  'G': ['*'], 'L': ['*'], 'A': ['*'], 'M': ['*'], 'O': ['*'],
+  'R': ['*'], 'C': ['*'], 'U': ['*'], 'B': ['*'], 'E': ['*'],
+  'S': ['*'], 'F': ['*'], 'I': ['*'], 'X': ['*'], 'P': ['*'],
+  'H': ['*'], 'D': ['*'], 'N': ['*'], 'T': ['*'],
+
+  // Tier 2: Subtypes (examples - extend as needed from schema)
+  'M.ART': ['M'], 'M.HISTORY': ['M'], 'M.SCIENCE': ['M'], 'M.NATURAL': ['M'],
+  'A.MUNICIPAL': ['A'], 'A.STATE': ['A'], 'A.CORPORATE': ['A'], 'A.REGIONAL': ['A'],
+  'L.PUBLIC': ['L'], 'L.ACADEMIC': ['L'], 'L.SPECIAL': ['L'],
+  'E.UNIVERSITY': ['E'], 'E.SCHOOL': ['E'],
+  'H.CHURCH': ['H'], 'H.MOSQUE': ['H'], 'H.SYNAGOGUE': ['H'],
+};
+
+/** Depth cache for topological distance calculation */
+const depthCache: Map<string, number> = new Map();
+
+/**
+ * Get the depth of a type in the hierarchy (memoized).
+ */ +function getTypeDepth(typeCode: string): number { + if (depthCache.has(typeCode)) return depthCache.get(typeCode)!; + + if (typeCode === '*') { + depthCache.set('*', 0); + return 0; + } + + const parents = TYPE_HIERARCHY[typeCode]; + if (!parents || parents.length === 0) { + // Unknown type, treat as direct child of root + depthCache.set(typeCode, 1); + return 1; + } + + const depth = 1 + Math.min(...parents.map(p => getTypeDepth(p))); + depthCache.set(typeCode, depth); + return depth; +} + +/** + * Find the lowest common ancestor of two types in the hierarchy. + */ +function findLCA(type1: string, type2: string): string { + if (type1 === type2) return type1; + if (type1 === '*' || type2 === '*') return '*'; + + // Get ancestors of type1 + const ancestors1 = new Set(); + let current = type1; + while (current && current !== '*') { + ancestors1.add(current); + const parents = TYPE_HIERARCHY[current]; + current = parents?.[0] || '*'; + } + ancestors1.add('*'); + + // Find first ancestor of type2 that's in ancestors1 + current = type2; + while (current) { + if (ancestors1.has(current)) return current; + const parents = TYPE_HIERARCHY[current]; + current = parents?.[0] || '*'; + if (current === '*' && ancestors1.has('*')) return '*'; + } + + return '*'; // Root is always common ancestor +} + +/** + * Compute topological distance between two institution types. + * + * Based on path length through the type hierarchy DAG. + * Returns 0.0 for identical types, 1.0 for maximally distant types. 
+ *
+ * @example
+ * topologicalDistance('M.ART', 'M.HISTORY') // 0.5 (siblings under M: 2 / 4)
+ * topologicalDistance('M.ART', 'A.MUNICIPAL') // 1.0 (different base types: 4 / 4)
+ * topologicalDistance('M', 'M') // 0.0 (identical)
+ */
+export function topologicalDistance(type1: string, type2: string): number {
+  if (type1 === type2) return 0;
+
+  // Normalize to uppercase
+  const t1 = type1.toUpperCase();
+  const t2 = type2.toUpperCase();
+
+  const lca = findLCA(t1, t2);
+  const depth1 = getTypeDepth(t1);
+  const depth2 = getTypeDepth(t2);
+  const depthLCA = getTypeDepth(lca);
+
+  // Path length = distance from t1 to LCA + distance from t2 to LCA
+  const pathLength = (depth1 - depthLCA) + (depth2 - depthLCA);
+
+  // Max possible depth in hierarchy (currently 2: root -> base -> subtype)
+  const maxDepth = 2;
+
+  // Normalize to 0-1 range
+  return Math.min(pathLength / (2 * maxDepth), 1.0);
+}
+
+/**
+ * Combined similarity score using both embedding similarity and topological distance.
+ *
+ * @param embeddingSimilarity - Cosine similarity of embeddings (0-1)
+ * @param queryType - Institution type from query
+ * @param cachedType - Institution type from cached entry
+ * @param embeddingWeight - Weight for embedding similarity (default 0.7)
+ * @returns Combined similarity score (0-1)
+ */
+export function combinedSimilarity(
+  embeddingSimilarity: number,
+  queryType: string | undefined,
+  cachedType: string | undefined,
+  embeddingWeight: number = 0.7
+): number {
+  // If no types available, use pure embedding similarity
+  if (!queryType || !cachedType) return embeddingSimilarity;
+
+  const topoDist = topologicalDistance(queryType, cachedType);
+  const topoSimilarity = 1 - topoDist;
+
+  // Weighted combination
+  return embeddingWeight * embeddingSimilarity + (1 - embeddingWeight) * topoSimilarity;
+}
+
+// ============================================================================
+// Phase 5: Clarity Trading (Rule 46 Evolution)
+// 
============================================================================ +// Based on Pavlyshyn's "Clarity Traders: Beyond Vibe Coding": +// The real work is bringing clarity to ambiguity, not generating code/cache keys. + +/** Patterns that indicate ambiguity in queries */ +const AMBIGUITY_PATTERNS: Array<{ pattern: RegExp; type: string; penalty: number }> = [ + // Temporal ambiguity: "old", "recent", "historical" without dates + { pattern: /\b(oude?|old|recent|historical|historisch)\b(?!.*\d{4})/i, type: 'temporal_vague', penalty: 0.15 }, + // Size ambiguity: "large", "small", "big" without metrics + { pattern: /\b(grote?|small|large|big|klein)\b/i, type: 'size_vague', penalty: 0.10 }, + // Quality ambiguity: "best", "good", "important" + { pattern: /\b(beste?|good|best|important|belangrijk)\b/i, type: 'quality_vague', penalty: 0.10 }, + // Vague quantifiers: "some", "many", "few" + { pattern: /\b(sommige|some|many|veel|few|weinig)\b/i, type: 'quantity_vague', penalty: 0.05 }, + // Pronouns without antecedents: "it", "they", "this" + { pattern: /^(het|it|they|this|dat|die)\b/i, type: 'pronoun_start', penalty: 0.20 }, + // Very short queries (likely incomplete) + { pattern: /^.{1,10}$/i, type: 'too_short', penalty: 0.25 }, +]; + +/** Patterns that indicate high clarity */ +const CLARITY_PATTERNS: Array<{ pattern: RegExp; type: string; bonus: number }> = [ + // Specific location mentioned + { pattern: /\b(amsterdam|rotterdam|utrecht|den haag|groningen)\b/i, type: 'specific_city', bonus: 0.10 }, + // Specific type mentioned + { pattern: /\b(museum|archief|bibliotheek|archive|library)\b/i, type: 'specific_type', bonus: 0.10 }, + // Specific intent + { pattern: /\b(hoeveel|welke|waar|count|list|how many)\b/i, type: 'clear_intent', bonus: 0.10 }, + // ISO codes or identifiers + { pattern: /\b(ISIL|Q\d+|NL-[A-Z]{2,})\b/i, type: 'identifier', bonus: 0.15 }, + // Date ranges + { pattern: /\b\d{4}\s*[-–]\s*\d{4}\b/i, type: 'date_range', bonus: 0.10 }, +]; + +/** + * 
Calculate clarity score for a query (Phase 5 - Clarity Trading). + * + * High clarity (≥0.7): Query is unambiguous, safe to use cached response. + * Low clarity (<0.7): Query is ambiguous, should bypass cache and go to RAG. + * + * @param query - The user's query text + * @param entities - Already-extracted entities (to avoid re-extraction) + * @returns Clarity score (0.0 - 1.0) and list of identified ambiguities + */ +export function calculateClarity( + query: string, + entities?: ExtractedEntities +): { clarityScore: number; ambiguities: string[] } { + let score = 0.7; // Start at threshold + const ambiguities: string[] = []; + + // Check for ambiguity patterns (reduce score) + for (const { pattern, type, penalty } of AMBIGUITY_PATTERNS) { + if (pattern.test(query)) { + score -= penalty; + ambiguities.push(type); + } + } + + // Check for clarity patterns (increase score) + for (const { pattern, bonus } of CLARITY_PATTERNS) { + if (pattern.test(query)) { + score += bonus; + } + } + + // Bonus for having extracted entities + if (entities) { + if (entities.institutionType) score += 0.05; + if (entities.location) score += 0.05; + if (entities.intent) score += 0.05; + if (entities.institutionSubtype) score += 0.05; // Very specific + } + + // Clamp to 0-1 range + const clarityScore = Math.max(0, Math.min(1, score)); + + return { clarityScore, ambiguities }; +} + +/** + * Extract entities with clarity scoring (Phase 5 - Clarity Trading). + * + * Enhanced version of extractEntitiesFast that includes clarity assessment. + * Queries with clarityScore < 0.7 should bypass cache. + * + * @param query - The user's query text + * @returns Extracted entities with clarity score and ambiguities + */ +export function extractEntitiesWithClarity(query: string): ExtractedEntities { + const entities = extractEntitiesFast(query); + const { clarityScore, ambiguities } = calculateClarity(query, entities); + + return { + ...entities, + clarityScore, + ambiguities: ambiguities.length > 0 ? 
ambiguities : undefined, + }; +} + +// ============================================================================ +// Phase 4: Message Handler (Smalltalk-Inspired Introspection) +// ============================================================================ + +/** Last decision trace for explainability */ +let lastDecisionTrace: CacheDecisionTrace | null = null; + +/** + * Handle a cache message with full introspection capability. + * + * Based on Smalltalk's message-passing paradigm where every object can: + * - Receive and handle messages + * - Explain its last decision + * - Introspect its own state + * + * @param message - The cache message to handle + * @param cache - The SemanticCache instance + * @returns Response with optional decision trace + */ +export async function handleCacheMessage( + message: CacheMessage, + cache: SemanticCache +): Promise { + const startTime = performance.now(); + + try { + switch (message.type) { + case 'LOOKUP': { + if (!message.arguments.query) { + return { success: false, error: 'LOOKUP requires query argument' }; + } + + const query = message.arguments.query; + const embedding = message.arguments.embedding; + const entities = extractEntitiesWithClarity(query); + const structuredKey = generateStructuredCacheKey(entities); + + // Phase 5: Block low-clarity queries + if (entities.clarityScore !== undefined && entities.clarityScore < 0.7) { + lastDecisionTrace = { + query, + entities, + structuredKey, + hit: false, + method: 'none', + similarity: 0, + reasoning: `Query clarity too low (${entities.clarityScore.toFixed(2)}). Ambiguities: ${entities.ambiguities?.join(', ')}. 
Bypassing cache.`, + lookupTimeMs: performance.now() - startTime, + }; + return { + success: true, + result: { found: false, similarity: 0, method: 'none', lookupTimeMs: lastDecisionTrace.lookupTimeMs }, + trace: lastDecisionTrace, + }; + } + + const result = await cache.lookup(query, embedding); + + lastDecisionTrace = { + query, + entities, + structuredKey, + hit: result.found, + tier: result.tier, + method: result.method, + similarity: result.similarity, + reasoning: result.found + ? `Cache hit via ${result.method} matching (similarity: ${result.similarity.toFixed(3)}) from ${result.tier} tier.` + : `Cache miss. No entry matched with sufficient similarity.`, + provenance: result.entry?.epistemicProvenance, + lookupTimeMs: result.lookupTimeMs, + }; + + return { success: true, result, trace: lastDecisionTrace }; + } + + case 'EXPLAIN': { + if (!lastDecisionTrace) { + return { success: false, error: 'No decision to explain. Perform a LOOKUP first.' }; + } + return { success: true, trace: lastDecisionTrace }; + } + + case 'STORE': { + if (!message.arguments.query || !message.arguments.response) { + return { success: false, error: 'STORE requires query and response arguments' }; + } + const id = await cache.store( + message.arguments.query, + message.arguments.embedding || null, + message.arguments.response + ); + return { success: true, result: { found: true, similarity: 1, method: 'exact', lookupTimeMs: 0 } }; + } + + case 'INVALIDATE': { + await cache.clear(); + lastDecisionTrace = null; + return { success: true }; + } + + default: + return { success: false, error: `Unknown message type: ${message.type}` }; + } + } catch (error) { + return { success: false, error: String(error) }; + } +} + +/** + * Get the last decision trace for explainability. + * Implements Smalltalk-style introspection. 
+ */ +export function explainLastDecision(): CacheDecisionTrace | null { + return lastDecisionTrace; +} + // ============================================================================ // Entity Extraction (Ontology-Driven per Rule 46) // ============================================================================ @@ -783,7 +1300,20 @@ export class SemanticCache { if (embedding && embedding.length > 0) { for (const entry of entityCompatibleEntries) { if (entry.embedding && entry.embedding.length > 0) { - const similarity = cosineSimilarity(embedding, entry.embedding); + const rawSimilarity = cosineSimilarity(embedding, entry.embedding); + + // Apply topological distance penalty (Phase 2 - Rule 46 Evolution) + // This prevents "art museum" from matching "natural history museum" + const queryTypeKey = entities.institutionSubtype + ? `${entities.institutionType}.${entities.institutionSubtype.toUpperCase().replace(/[^A-Z]/g, '')}` + : (entities.institutionType || undefined); + const cachedEntities = entry.entities || extractEntitiesFast(entry.query); + const cachedTypeKey = cachedEntities.institutionSubtype + ? `${cachedEntities.institutionType}.${cachedEntities.institutionSubtype.toUpperCase().replace(/[^A-Z]/g, '')}` + : (cachedEntities.institutionType || undefined); + + const similarity = combinedSimilarity(rawSimilarity, queryTypeKey, cachedTypeKey); + if (similarity > bestSimilarity && similarity >= this.config.similarityThreshold) { bestSimilarity = similarity; bestMatch = entry; @@ -884,7 +1414,7 @@ export class SemanticCache { } /** - * Store a query and response in cache with extracted entities. + * Store a query and response in cache with extracted entities and epistemic provenance. * Entities are extracted at storage time to enable entity-aware cache matching. 
*/ async store( @@ -892,12 +1422,33 @@ export class SemanticCache { embedding: number[] | null, response: CachedResponse, llmProvider: string = 'zai', - language: 'nl' | 'en' = 'nl' + language: 'nl' | 'en' = 'nl', + provenance?: Partial ): Promise { // Extract entities at storage time for future entity-aware matching const entities = extractEntitiesFast(query); const structuredKey = generateStructuredCacheKey(entities); + // Build epistemic provenance with defaults + const epistemicProvenance: EpistemicProvenance = { + dataSource: provenance?.dataSource || 'RAG_PIPELINE', + dataTier: provenance?.dataTier || 4, + sourceTimestamp: provenance?.sourceTimestamp || new Date().toISOString(), + derivationChain: provenance?.derivationChain || [`LLM:${llmProvider}`], + revalidationPolicy: provenance?.revalidationPolicy || 'weekly', + confidenceScore: provenance?.confidenceScore, + sourceReferences: provenance?.sourceReferences, + epistemicNotes: provenance?.epistemicNotes, + }; + + // Determine holon level based on entities + let holonLevel: 'micro' | 'meso' | 'macro' = 'meso'; + if (entities.intent === 'info' && entities.location) { + holonLevel = 'micro'; // Specific entity query + } else if (!entities.institutionType && !entities.location) { + holonLevel = 'macro'; // Broad aggregate query + } + const entry: CachedQuery = { id: generateCacheId(), query, @@ -910,8 +1461,10 @@ export class SemanticCache { language, llmProvider, source: 'local', - entities, // Store extracted entities for entity-aware matching - structuredKey, // Store structured key for debugging/analytics + entities, + structuredKey, + epistemicProvenance, + holonLevel, }; // Store locally diff --git a/apps/archief-assistent/tests/semantic-cache.test.ts b/apps/archief-assistent/tests/semantic-cache.test.ts index d2a747b966..f23d9a4e3a 100644 --- a/apps/archief-assistent/tests/semantic-cache.test.ts +++ b/apps/archief-assistent/tests/semantic-cache.test.ts @@ -13,6 +13,8 @@ import { 
   generateStructuredCacheKey,
   entitiesMatch,
   normalizeQuery,
+  topologicalDistance,
+  combinedSimilarity,
   type ExtractedEntities,
   type InstitutionTypeCode,
 } from '../src/lib/semantic-cache'
@@ -443,3 +445,73 @@ describe('normalizeQuery', () => {
     expect(normalizeQuery('musea in amsterdam')).toBe('musea in amsterdam')
   })
 })
+
+// ============================================================================
+// Phase 2: Topological Distance Tests (Rule 46 Evolution)
+// ============================================================================
+
+describe('topologicalDistance', () => {
+  it('should return 0 for identical types', () => {
+    expect(topologicalDistance('M', 'M')).toBe(0)
+    expect(topologicalDistance('A', 'A')).toBe(0)
+    expect(topologicalDistance('M.ART', 'M.ART')).toBe(0)
+  })
+
+  it('should return 0.5 for sibling subtypes (same parent)', () => {
+    // M.ART and M.HISTORY are both children of M
+    const dist = topologicalDistance('M.ART', 'M.HISTORY')
+    expect(dist).toBeCloseTo(0.5, 1) // path: ART -> M -> HISTORY = 2 / 4 = 0.5
+  })
+
+  it('should return higher distance for different base types', () => {
+    // M and A are siblings under root
+    const dist = topologicalDistance('M', 'A')
+    expect(dist).toBeCloseTo(0.5, 1) // path: M -> * -> A = 2 / 4 = 0.5
+  })
+
+  it('should return even higher distance for subtypes of different base types', () => {
+    // M.ART and A.MUNICIPAL are in different branches
+    const dist = topologicalDistance('M.ART', 'A.MUNICIPAL')
+    expect(dist).toBeGreaterThan(0.5)
+  })
+
+  it('should handle unknown types gracefully', () => {
+    // Unknown types should be treated as direct children of root
+    const dist = topologicalDistance('UNKNOWN', 'M')
+    expect(dist).toBeGreaterThanOrEqual(0)
+    expect(dist).toBeLessThanOrEqual(1)
+  })
+
+  it('should be symmetric', () => {
+    expect(topologicalDistance('M', 'A')).toBe(topologicalDistance('A', 'M'))
+    expect(topologicalDistance('M.ART', 'L')).toBe(topologicalDistance('L', 'M.ART'))
+  })
+})
+
+describe('combinedSimilarity', () => { + it('should return pure embedding similarity when no types provided', () => { + const similarity = combinedSimilarity(0.95, undefined, undefined) + expect(similarity).toBe(0.95) + }) + + it('should weight embedding similarity at 0.7 by default', () => { + // Same type -> topological distance = 0 -> topo similarity = 1 + const similarity = combinedSimilarity(0.9, 'M', 'M') + // 0.7 * 0.9 + 0.3 * 1.0 = 0.63 + 0.3 = 0.93 + expect(similarity).toBeCloseTo(0.93, 2) + }) + + it('should penalize different types even with high embedding similarity', () => { + // Different types -> topological distance > 0 -> lower combined similarity + const sameType = combinedSimilarity(0.9, 'M', 'M') + const diffType = combinedSimilarity(0.9, 'M', 'A') + expect(diffType).toBeLessThan(sameType) + }) + + it('should heavily penalize cross-branch subtype matches', () => { + // M.ART vs A.MUNICIPAL - very different semantically + const crossBranch = combinedSimilarity(0.92, 'M.ART', 'A.MUNICIPAL') + // Even with 0.92 embedding similarity, the topological penalty should be significant + expect(crossBranch).toBeLessThan(0.85) + }) +})