glam/scripts/extract-types-vocab.ts
kempersc f2bc2d54cb feat(archief-assistent): integrate ontology-driven vocabulary into semantic cache
Implements Rule 46: Ontology-Driven Cache Segmentation

Semantic Cache Enhancements:
- Add institutionSubtype, recordSetType, wikidataEntity to ExtractedEntities
- Add extractionMethod field to track vocabulary vs regex extraction
- Implement async extractEntitiesWithVocabulary() using term log
- Maintain sync regex fallback for cache key generation (<5ms)

Build Pipeline:
- Add prebuild hook to regenerate types-vocab.json from LinkML schemas
- Extract vocabulary from *Type.yaml and *Types.yaml schema files
- Generate GLAMORCUBESFIXPHDNT code mappings automatically

New Script:
- scripts/extract-types-vocab.ts - Extracts vocabulary from LinkML schemas
- Supports --skip-embeddings flag for faster builds
- Outputs to apps/archief-assistent/public/types-vocab.json

This enables richer cache segmentation using ontology-derived subtypes
(e.g., 'MUNICIPAL_ARCHIVE', 'ART_MUSEUM') instead of just top-level
GLAMORCUBESFIXPHDNT codes.
2026-01-10 13:30:30 +01:00

494 lines
18 KiB
TypeScript

#!/usr/bin/env node
/**
* extract-types-vocab.ts
*
* Extracts vocabulary from LinkML *Type.yaml and *Types.yaml schema files
* and generates embeddings for two-tier semantic routing.
*
* Output: apps/archief-assistent/public/types-vocab.json
*
* Usage:
* npx tsx scripts/extract-types-vocab.ts
* npx tsx scripts/extract-types-vocab.ts --skip-embeddings # Skip embedding generation
*
* See: .opencode/rules/ontology-driven-cache-segmentation.md
*/
import { readFileSync, writeFileSync, readdirSync, existsSync, mkdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { parse as parseYaml } from 'yaml';
// ESM compatibility for __dirname (not available in ES modules)
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// ============================================================================
// Configuration
// ============================================================================
// Directory holding the versioned LinkML class-module schemas to scan.
const SCHEMA_DIR = join(__dirname, '../schemas/20251121/linkml/modules/classes');
// Generated artifact consumed by the archief-assistent app at runtime.
const OUTPUT_FILE = join(__dirname, '../apps/archief-assistent/public/types-vocab.json');
// Embedding service endpoint; overridable via env for CI / non-local runs.
const EMBEDDING_API_URL = process.env.EMBEDDING_API_URL || 'http://localhost:8000/api/embed';
// GLAMORCUBESFIXPHDNT code mapping
// Maps a base *Type schema file name (minus .yaml) to its single-letter
// institution code. Type files not listed here are skipped during extraction.
const TYPE_FILE_TO_CODE: Record<string, string> = {
  'ArchiveOrganizationType': 'A',
  'BioCustodianType': 'B',
  'CommercialOrganizationType': 'C',
  'DigitalPlatformType': 'D',
  'EducationProviderType': 'E',
  'FeatureCustodianType': 'F',
  'GalleryType': 'G',
  'HolySacredSiteType': 'H',
  'IntangibleHeritageGroupType': 'I',
  'LibraryType': 'L',
  'MuseumType': 'M',
  'NonProfitType': 'N',
  'OfficialInstitutionType': 'O',
  'PersonalCollectionType': 'P',
  'ResearchOrganizationType': 'R',
  'HeritageSocietyType': 'S',
  'TasteScentHeritageType': 'T',
  'UnspecifiedType': 'U',
  'MixedCustodianType': 'X',
};
// ============================================================================
// Types
// ============================================================================
/**
 * One entry in the flat term -> classification lookup table used by the
 * semantic cache for fast, sync vocabulary matching.
 */
interface TermLogEntry {
  typeCode: string;        // GLAMORCUBESFIXPHDNT single-letter code
  typeName: string;        // base type class name, e.g. 'ArchiveOrganizationType'
  subtypeName?: string;    // UPPER_SNAKE_CASE subtype, when the term came from a subtype
  recordSetType?: string;  // set instead of subtypeName for record-set-type terms
  wikidata?: string;       // Wikidata QID, when the schema declared one
  lang: string;            // language bucket the term was filed under ('nl', 'en', …)
}
/** Per-subtype vocabulary: keywords by language plus a flattened term string. */
interface SubtypeInfo {
  className: string;
  wikidata?: string;
  accumulatedTerms: string;             // space-joined, deduplicated terms (embedding input)
  keywords: Record<string, string[]>;   // language code -> lowercase keywords
}
/** Per-institution-type vocabulary, including all of its subtypes. */
interface TypeInfo {
  code: string;                         // GLAMORCUBESFIXPHDNT letter
  className: string;
  baseWikidata?: string;
  accumulatedTerms: string;             // base + subtype terms, deduplicated
  keywords: Record<string, string[]>;
  subtypes: Record<string, SubtypeInfo>; // keyed by UPPER_SNAKE_CASE subtype name
}
/** Vocabulary for a record set type (fonds/series/collection-level). */
interface RecordSetTypeInfo {
  className: string;
  accumulatedTerms: string;
  keywords: Record<string, string[]>;
}
/** Top-level shape of the generated types-vocab.json artifact. */
interface TypesVocabulary {
  version: string;           // build timestamp (ISO 8601)
  schemaVersion: string;     // schema snapshot date the vocab was derived from
  embeddingModel: string;
  embeddingDimensions: number;
  tier1Embeddings: Record<string, number[]>;                  // keyed by class name
  tier2Embeddings: Record<string, Record<string, number[]>>;  // code -> subtype -> vector
  termLog: Record<string, TermLogEntry>;                      // keyed by raw term
  institutionTypes: Record<string, TypeInfo>;                 // keyed by code
  recordSetTypes: Record<string, RecordSetTypeInfo>;          // keyed by UPPER_SNAKE_CASE name
}
/** Normalized view of a single LinkML class definition pulled from YAML. */
interface ParsedClass {
  className: string;
  description?: string;
  keywords?: string[];
  structuredAliases?: Array<{ literal_form: string; in_language?: string }>;
  wikidataEntity?: string;   // QID without the 'wd:'/'wikidata:' prefix
  isSubtypeOf?: string;      // LinkML is_a parent, if any
}
// ============================================================================
// YAML Parsing
// ============================================================================
/**
 * Reads and parses a single YAML file.
 *
 * Returns null (after logging a warning) on any read or parse failure, so
 * one malformed schema cannot abort the whole extraction run.
 */
function parseYamlFile(filePath: string): Record<string, unknown> | null {
  try {
    return parseYaml(readFileSync(filePath, 'utf-8'));
  } catch (error) {
    console.warn(`Warning: Could not parse ${filePath}: ${error}`);
    return null;
  }
}
/**
 * Pulls class definitions out of a parsed LinkML YAML document.
 *
 * Abstract classes are skipped unless their name ends with 'Type' (the main
 * base class of each module is abstract but must still be extracted).
 *
 * Wikidata entity resolution precedence:
 *   1. slot_usage.wikidata_entity.equals_string
 *   2. first exact_mappings entry with a 'wd:'/'wikidata:' prefix (overrides 1)
 *   3. first broad_mappings entry with a 'wd:'/'wikidata:' prefix (gap-fill only)
 *
 * Fix: broad_mappings previously only recognized the 'wd:' prefix, silently
 * dropping 'wikidata:'-prefixed CURIEs that exact_mappings already accepted.
 *
 * @param yamlData Parsed YAML document (output of parseYamlFile).
 * @returns One ParsedClass per concrete (or base-Type) class; empty if the
 *          document has no `classes` section.
 */
function extractClassesFromYaml(yamlData: Record<string, unknown>): ParsedClass[] {
  const classes: ParsedClass[] = [];
  const classesSection = yamlData.classes as Record<string, unknown> | undefined;
  if (!classesSection) return classes;
  // Strips a recognized Wikidata CURIE prefix; undefined for other mappings.
  const toWikidataId = (mapping: string): string | undefined =>
    mapping.startsWith('wd:') || mapping.startsWith('wikidata:')
      ? mapping.replace(/^(wd:|wikidata:)/, '')
      : undefined;
  // First matching CURIE in a mapping list, or undefined.
  const firstWikidataId = (mappings: string[] | undefined): string | undefined => {
    for (const m of mappings ?? []) {
      const id = toWikidataId(m);
      if (id !== undefined) return id;
    }
    return undefined;
  };
  for (const [className, classDef] of Object.entries(classesSection)) {
    if (typeof classDef !== 'object' || classDef === null) continue;
    const classData = classDef as Record<string, unknown>;
    // Skip abstract base classes (except the main Type class)
    if (classData.abstract === true && !className.endsWith('Type')) continue;
    const parsed: ParsedClass = {
      className,
      description: classData.description as string | undefined,
      keywords: classData.keywords as string[] | undefined,
      structuredAliases: classData.structured_aliases as Array<{ literal_form: string; in_language?: string }> | undefined,
      isSubtypeOf: classData.is_a as string | undefined,
    };
    // 1. Extract wikidata entity from slot_usage
    const slotUsage = classData.slot_usage as Record<string, unknown> | undefined;
    if (slotUsage?.wikidata_entity) {
      const wdSlot = slotUsage.wikidata_entity as Record<string, unknown>;
      parsed.wikidataEntity = wdSlot.equals_string as string | undefined;
    }
    // 2. exact_mappings override slot_usage when present
    const exactId = firstWikidataId(classData.exact_mappings as string[] | undefined);
    if (exactId !== undefined) {
      parsed.wikidataEntity = exactId;
    }
    // 3. broad_mappings only fill a remaining gap
    if (!parsed.wikidataEntity) {
      const broadId = firstWikidataId(classData.broad_mappings as string[] | undefined);
      if (broadId !== undefined) {
        parsed.wikidataEntity = broadId;
      }
    }
    classes.push(parsed);
  }
  return classes;
}
/**
 * Derives language-bucketed lowercase keywords for a parsed class from three
 * sources: the `keywords` list, language-tagged `structured_aliases`, and the
 * class name itself.
 *
 * @returns Map of language code ('nl', 'en', …) to keyword list. Lists may
 *          contain duplicates; callers dedupe via accumulateTerms().
 */
function extractKeywordsFromClass(parsedClass: ParsedClass): Record<string, string[]> {
  const keywords: Record<string, string[]> = {};
  // Heuristic for untagged keywords: the 'ij' digraph and the 'sch'/'cht'
  // clusters / doubled vowels are characteristically Dutch.
  // Fix: this previously used the character class [ij], which matches ANY
  // word containing 'i' or 'j' and misfiled most English terms as Dutch;
  // the literal digraph 'ij' is what was intended.
  const dutchPattern = /ij|sch|cht|aa|ee|oo|uu/i;
  // 1. Extract from keywords array (no language tags in the schema, so guess)
  if (parsedClass.keywords) {
    keywords['nl'] = keywords['nl'] || [];
    keywords['en'] = keywords['en'] || [];
    for (const kw of parsedClass.keywords) {
      if (dutchPattern.test(kw)) {
        keywords['nl'].push(kw.toLowerCase());
      } else {
        keywords['en'].push(kw.toLowerCase());
      }
    }
  }
  // 2. Extract from structured_aliases (language-tagged; defaults to English)
  if (parsedClass.structuredAliases) {
    for (const alias of parsedClass.structuredAliases) {
      const lang = alias.in_language || 'en';
      keywords[lang] = keywords[lang] || [];
      keywords[lang].push(alias.literal_form.toLowerCase());
    }
  }
  // 3. Convert the CamelCase class name into one spaced phrase:
  //    MunicipalArchive -> "municipal archive"
  const classNameWords = parsedClass.className
    .replace(/([A-Z])/g, ' $1')
    .trim()
    .toLowerCase()
    .split(/\s+/);
  keywords['en'] = keywords['en'] || [];
  keywords['en'].push(classNameWords.join(' '));
  return keywords;
}
/**
 * Flattens all per-language keyword lists into a single deduplicated,
 * space-joined string (the input text for embedding generation).
 */
function accumulateTerms(keywords: Record<string, string[]>): string {
  const unique = new Set<string>(Object.values(keywords).flat());
  return [...unique].join(' ');
}
// ============================================================================
// Embedding Generation
// ============================================================================
/**
 * Requests an embedding vector for `text` from the embedding API.
 *
 * Returns an empty array in skip mode and on any API/network failure (after
 * logging a warning), so the extraction pipeline never aborts on embedding
 * problems — missing vectors simply stay empty in the output artifact.
 */
async function generateEmbedding(text: string, skipEmbeddings: boolean): Promise<number[]> {
  // Skip mode: emit empty placeholder vectors without touching the network.
  if (skipEmbeddings) return [];
  try {
    const response = await fetch(EMBEDDING_API_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text }),
    });
    if (!response.ok) {
      console.warn(`Embedding API error: ${response.status}`);
      return [];
    }
    const payload = await response.json();
    return payload.embedding || [];
  } catch (error) {
    console.warn(`Embedding generation failed: ${error}`);
    return [];
  }
}
// ============================================================================
// Main Processing
// ============================================================================
/**
 * Core extraction pipeline.
 *
 * Scans SCHEMA_DIR for base *Type.yaml files (mapped to GLAMORCUBESFIXPHDNT
 * codes), their sibling *Types.yaml subtype files, and *RecordSetTypes.yaml
 * files; builds the vocabulary (keywords, term log, accumulated term strings)
 * and then generates tier-1 (per type) and tier-2 (per subtype) embeddings.
 *
 * Honors the --skip-embeddings CLI flag (all vectors become empty arrays).
 *
 * @returns The fully populated TypesVocabulary ready to serialize.
 */
async function processTypeFiles(): Promise<TypesVocabulary> {
  const skipEmbeddings = process.argv.includes('--skip-embeddings');
  console.log('🔍 Scanning schema directory:', SCHEMA_DIR);
  console.log(`📊 Embedding generation: ${skipEmbeddings ? 'SKIPPED' : 'ENABLED'}`);
  // `version` doubles as a build timestamp; `schemaVersion` pins the snapshot
  // directory the vocabulary was derived from (must match SCHEMA_DIR).
  const vocabulary: TypesVocabulary = {
    version: new Date().toISOString(),
    schemaVersion: '20251121',
    embeddingModel: 'paraphrase-multilingual-MiniLM-L12-v2',
    embeddingDimensions: 384,
    tier1Embeddings: {},
    tier2Embeddings: {},
    termLog: {},
    institutionTypes: {},
    recordSetTypes: {},
  };
  // Find all *Type.yaml files (base types). Note *Types.yaml also ends with
  // 'Type.yaml' as a string suffix, hence the explicit exclusion.
  const files = readdirSync(SCHEMA_DIR);
  const typeFiles = files.filter(f => f.endsWith('Type.yaml') && !f.endsWith('Types.yaml'));
  const typesFiles = files.filter(f => f.endsWith('Types.yaml'));
  console.log(`\n📁 Found ${typeFiles.length} Type files and ${typesFiles.length} Types files`);
  // Process base Type files
  for (const file of typeFiles) {
    const typeName = file.replace('.yaml', '');
    const code = TYPE_FILE_TO_CODE[typeName];
    // Only files mapped to a GLAMORCUBESFIXPHDNT letter participate.
    if (!code) {
      console.log(` ⏭️ Skipping ${typeName} (not in GLAMORCUBESFIXPHDNT)`);
      continue;
    }
    console.log(`\n📄 Processing ${typeName} (${code})`);
    const filePath = join(SCHEMA_DIR, file);
    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;
    const classes = extractClassesFromYaml(yamlData);
    // The base class is expected to share its name with the file.
    const baseClass = classes.find(c => c.className === typeName);
    if (!baseClass) {
      console.log(` ⚠️ No base class found in ${file}`);
      continue;
    }
    // Initialize type info (accumulatedTerms filled in after subtypes below)
    const typeInfo: TypeInfo = {
      code,
      className: typeName,
      baseWikidata: baseClass.wikidataEntity,
      accumulatedTerms: '',
      keywords: extractKeywordsFromClass(baseClass),
      subtypes: {},
    };
    // Look for corresponding Types file (subtypes), e.g. MuseumType.yaml ->
    // MuseumTypes.yaml. Absence is fine: the type simply has no subtypes.
    const subtypesFilePath = join(SCHEMA_DIR, file.replace('Type.yaml', 'Types.yaml'));
    if (existsSync(subtypesFilePath)) {
      // NOTE(review): split('/') assumes POSIX separators — path.basename()
      // would be Windows-safe; affects log output only.
      console.log(` 📂 Processing subtypes from ${subtypesFilePath.split('/').pop()}`);
      const subtypesYaml = parseYamlFile(subtypesFilePath);
      if (subtypesYaml) {
        const subtypeClasses = extractClassesFromYaml(subtypesYaml);
        for (const subclass of subtypeClasses) {
          // Convert CamelCase to UPPER_SNAKE_CASE
          // (MunicipalArchive -> MUNICIPAL_ARCHIVE)
          const subtypeName = subclass.className
            .replace(/([a-z])([A-Z])/g, '$1_$2')
            .toUpperCase();
          const subtypeKeywords = extractKeywordsFromClass(subclass);
          const subtypeInfo: SubtypeInfo = {
            className: subclass.className,
            wikidata: subclass.wikidataEntity,
            accumulatedTerms: accumulateTerms(subtypeKeywords),
            keywords: subtypeKeywords,
          };
          typeInfo.subtypes[subtypeName] = subtypeInfo;
          // Add to term log.
          // NOTE(review): termLog is keyed by the raw term, so a term shared
          // by several subtypes/types keeps only the last writer.
          for (const [lang, terms] of Object.entries(subtypeKeywords)) {
            for (const term of terms) {
              vocabulary.termLog[term] = {
                typeCode: code,
                typeName,
                subtypeName,
                wikidata: subclass.wikidataEntity,
                lang,
              };
            }
          }
          console.log(`${subclass.className}: ${Object.values(subtypeKeywords).flat().length} terms`);
        }
      }
    }
    // Accumulate all terms for this type (base + all subtypes), deduplicated
    // at word level via the split/Set/join round-trip.
    const allTypeTerms: string[] = [];
    allTypeTerms.push(accumulateTerms(typeInfo.keywords));
    for (const subtype of Object.values(typeInfo.subtypes)) {
      allTypeTerms.push(subtype.accumulatedTerms);
    }
    typeInfo.accumulatedTerms = [...new Set(allTypeTerms.join(' ').split(' '))].join(' ');
    // Add base type keywords to term log. Written after the subtype entries,
    // so on a collision the base-type (no subtypeName) entry wins.
    for (const [lang, terms] of Object.entries(typeInfo.keywords)) {
      for (const term of terms) {
        vocabulary.termLog[term] = {
          typeCode: code,
          typeName,
          lang,
        };
      }
    }
    vocabulary.institutionTypes[code] = typeInfo;
    console.log(`${typeName}: ${Object.keys(typeInfo.subtypes).length} subtypes, ${typeInfo.accumulatedTerms.split(' ').length} total terms`);
  }
  // Process RecordSetTypes files (record-set-level vocabulary, e.g. fonds,
  // series, collection). These also matched `typesFiles` above but are only
  // consumed here.
  console.log('\n📁 Processing RecordSetTypes files...');
  const recordSetTypesFiles = files.filter(f => f.endsWith('RecordSetTypes.yaml'));
  for (const file of recordSetTypesFiles) {
    const filePath = join(SCHEMA_DIR, file);
    const yamlData = parseYamlFile(filePath);
    if (!yamlData) continue;
    const classes = extractClassesFromYaml(yamlData);
    for (const cls of classes) {
      // Skip abstract base classes: names ending in 'RecordSetType' are base
      // classes unless they are one of the concrete Fonds/Series/Collection
      // variants.
      if (cls.className.endsWith('RecordSetType') && !cls.className.includes('Fonds') &&
          !cls.className.includes('Series') && !cls.className.includes('Collection')) {
        continue;
      }
      // Convert CamelCase to UPPER_SNAKE_CASE
      const rstName = cls.className
        .replace(/([a-z])([A-Z])/g, '$1_$2')
        .toUpperCase();
      const keywords = extractKeywordsFromClass(cls);
      const rstInfo: RecordSetTypeInfo = {
        className: cls.className,
        accumulatedTerms: accumulateTerms(keywords),
        keywords,
      };
      vocabulary.recordSetTypes[rstName] = rstInfo;
      // Add to term log (same last-writer-wins caveat as above)
      for (const [lang, terms] of Object.entries(keywords)) {
        for (const term of terms) {
          vocabulary.termLog[term] = {
            typeCode: 'A', // Most record set types are archive-related
            typeName: 'ArchiveOrganizationType',
            recordSetType: rstName,
            lang,
          };
        }
      }
    }
  }
  console.log(` ✅ Extracted ${Object.keys(vocabulary.recordSetTypes).length} record set types`);
  // Generate Tier 1 embeddings (Types file level).
  // NOTE(review): tier1Embeddings is keyed by className while tier2Embeddings
  // is keyed by code — consumers must handle this asymmetry.
  console.log('\n🧮 Generating Tier 1 embeddings (Types files)...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    const embedding = await generateEmbedding(typeInfo.accumulatedTerms, skipEmbeddings);
    vocabulary.tier1Embeddings[typeInfo.className] = embedding;
    console.log(`${typeInfo.className}: ${embedding.length} dimensions`);
  }
  // Generate Tier 2 embeddings (individual subtypes), sequentially so the
  // local embedding service is not flooded.
  console.log('\n🧮 Generating Tier 2 embeddings (subtypes)...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    vocabulary.tier2Embeddings[code] = {};
    for (const [subtypeName, subtypeInfo] of Object.entries(typeInfo.subtypes)) {
      const embedding = await generateEmbedding(subtypeInfo.accumulatedTerms, skipEmbeddings);
      vocabulary.tier2Embeddings[code][subtypeName] = embedding;
    }
    console.log(`${typeInfo.className}: ${Object.keys(typeInfo.subtypes).length} subtype embeddings`);
  }
  return vocabulary;
}
// ============================================================================
// Main Entry Point
// ============================================================================
/**
 * Script entry point: runs the extraction pipeline, guarantees the output
 * directory exists, writes the vocabulary JSON, and prints a summary report.
 */
async function main() {
  const rule = '═══════════════════════════════════════════════════════════════';
  console.log(rule);
  console.log(' TypesVocabulary Extraction Script');
  console.log(' Ontology-Driven Cache Segmentation (Rule 46)');
  console.log(rule + '\n');
  const vocab = await processTypeFiles();
  // Create the target directory on first run
  const outDir = dirname(OUTPUT_FILE);
  if (!existsSync(outDir)) {
    mkdirSync(outDir, { recursive: true });
  }
  // Pretty-printed JSON so the artifact stays diffable in review
  writeFileSync(OUTPUT_FILE, JSON.stringify(vocab, null, 2));
  // Precompute nested counts for the summary
  const subtypeTotal = Object.values(vocab.institutionTypes)
    .reduce((sum, t) => sum + Object.keys(t.subtypes).length, 0);
  const tier2Total = Object.values(vocab.tier2Embeddings)
    .reduce((sum, t) => sum + Object.keys(t).length, 0);
  console.log('\n' + rule);
  console.log(' Summary');
  console.log(rule);
  console.log(` 📊 Institution Types: ${Object.keys(vocab.institutionTypes).length}`);
  console.log(` 📊 Total Subtypes: ${subtypeTotal}`);
  console.log(` 📊 Record Set Types: ${Object.keys(vocab.recordSetTypes).length}`);
  console.log(` 📊 Term Log Entries: ${Object.keys(vocab.termLog).length}`);
  console.log(` 📊 Tier 1 Embeddings: ${Object.keys(vocab.tier1Embeddings).length}`);
  console.log(` 📊 Tier 2 Embeddings: ${tier2Total}`);
  console.log(`\n ✅ Output written to: ${OUTPUT_FILE}`);
  console.log(rule + '\n');
}
// Kick off the script; log any unhandled failure.
// NOTE(review): the process still exits 0 on error — consider setting
// process.exitCode = 1 in the catch if CI/prebuild should fail on errors.
main().catch(console.error);