#!/usr/bin/env node
/**
 * extract-types-vocab.ts
 *
 * Extracts vocabulary from LinkML *Type.yaml and *Types.yaml schema files
 * and generates embeddings for two-tier semantic routing.
 *
 * Output: apps/archief-assistent/public/types-vocab.json
 *
 * Usage:
 *   npx tsx scripts/extract-types-vocab.ts
 *   npx tsx scripts/extract-types-vocab.ts --skip-embeddings  # Skip embedding generation
 *
 * See: .opencode/rules/ontology-driven-cache-segmentation.md
 */

import { readFileSync, writeFileSync, readdirSync, existsSync, mkdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { parse as parseYaml } from 'yaml';

// ESM compatibility for __dirname
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ============================================================================
// Configuration
// ============================================================================

/** Directory that is scanned for *Type.yaml / *Types.yaml class modules. */
const SCHEMA_DIR = join(__dirname, '../schemas/20251121/linkml/modules/classes');

/** Destination of the generated vocabulary JSON. */
const OUTPUT_FILE = join(__dirname, '../apps/archief-assistent/public/types-vocab.json');

/** Embedding endpoint; overridable through the EMBEDDING_API_URL environment variable. */
const EMBEDDING_API_URL = process.env.EMBEDDING_API_URL || 'http://localhost:8000/api/embed';

/** GLAMORCUBESFIXPHDNT code mapping: base Type class name -> single-letter code. */
const TYPE_FILE_TO_CODE: Record<string, string> = {
  'ArchiveOrganizationType': 'A',
  'BioCustodianType': 'B',
  'CommercialOrganizationType': 'C',
  'DigitalPlatformType': 'D',
  'EducationProviderType': 'E',
  'FeatureCustodianType': 'F',
  'GalleryType': 'G',
  'HolySacredSiteType': 'H',
  'IntangibleHeritageGroupType': 'I',
  'LibraryType': 'L',
  'MuseumType': 'M',
  'NonProfitType': 'N',
  'OfficialInstitutionType': 'O',
  'PersonalCollectionType': 'P',
  'ResearchOrganizationType': 'R',
  'HeritageSocietyType': 'S',
  'TasteScentHeritageType': 'T',
  'UnspecifiedType': 'U',
  'MixedCustodianType': 'X',
};

/**
 * Substrings used to guess that an untagged keyword is Dutch.
 * Uses the "ij" digraph; a character class `[ij]` would match nearly every word.
 */
const DUTCH_PATTERN = /ij|sch|cht|aa|ee|oo|uu/i;

/** CURIE prefixes that identify a Wikidata mapping. */
const WIKIDATA_PREFIX = /^(wd:|wikidata:)/;

// ============================================================================
// Types
// ============================================================================

/** Language code -> list of lower-cased terms. */
type KeywordsByLang = Record<string, string[]>;

/** One `structured_aliases` entry as read from the schema YAML. */
type StructuredAlias = { literal_form: string; in_language?: string };

/** Where a single term came from; stored in `TypesVocabulary.termLog`. */
interface TermLogEntry {
  typeCode: string;
  typeName: string;
  subtypeName?: string;
  recordSetType?: string;
  wikidata?: string;
  lang: string;
}

/** Vocabulary collected for one subtype class of an institution type. */
interface SubtypeInfo {
  className: string;
  wikidata?: string;
  accumulatedTerms: string;
  keywords: KeywordsByLang;
}

/** Vocabulary collected for one institution type and its subtypes. */
interface TypeInfo {
  code: string;
  className: string;
  baseWikidata?: string;
  accumulatedTerms: string;
  keywords: KeywordsByLang;
  subtypes: Record<string, SubtypeInfo>;
}

/** Vocabulary collected for one record set type class. */
interface RecordSetTypeInfo {
  className: string;
  accumulatedTerms: string;
  keywords: KeywordsByLang;
}

/** Shape of the JSON document written to OUTPUT_FILE. */
interface TypesVocabulary {
  version: string;
  schemaVersion: string;
  embeddingModel: string;
  embeddingDimensions: number;
  /** Keyed by base Type class name. */
  tier1Embeddings: Record<string, number[]>;
  /** Keyed by type code, then by UPPER_SNAKE_CASE subtype name. */
  tier2Embeddings: Record<string, Record<string, number[]>>;
  /** Keyed by term; a later write for the same term replaces the earlier one. */
  termLog: Record<string, TermLogEntry>;
  /** Keyed by type code. */
  institutionTypes: Record<string, TypeInfo>;
  /** Keyed by UPPER_SNAKE_CASE record set type name. */
  recordSetTypes: Record<string, RecordSetTypeInfo>;
}

/** The subset of a LinkML class definition this script reads. */
interface ParsedClass {
  className: string;
  description?: string;
  keywords?: string[];
  structuredAliases?: StructuredAlias[];
  wikidataEntity?: string;
  isSubtypeOf?: string;
}

// ============================================================================
// YAML Parsing
// ============================================================================

/** True for non-null, non-array objects (i.e. YAML mappings). */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Returns `value` when it is a string, otherwise `undefined`. */
function asOptionalString(value: unknown): string | undefined {
  return typeof value === 'string' ? value : undefined;
}

/**
 * Normalises a YAML value into a string list.
 * A lone string becomes a one-element list; non-string array items are dropped;
 * anything else yields `undefined`.
 */
function toStringArray(value: unknown): string[] | undefined {
  if (typeof value === 'string') return [value];
  if (!Array.isArray(value)) return undefined;
  return value.filter((item): item is string => typeof item === 'string');
}

/**
 * Normalises a `structured_aliases` value, keeping only list entries that
 * carry a string `literal_form`. Non-list values yield `undefined`.
 */
function toStructuredAliases(value: unknown): StructuredAlias[] | undefined {
  if (!Array.isArray(value)) return undefined;

  const aliases: StructuredAlias[] = [];
  for (const item of value) {
    if (!isPlainObject(item) || typeof item.literal_form !== 'string') continue;
    const alias: StructuredAlias = { literal_form: item.literal_form };
    if (typeof item.in_language === 'string') {
      alias.in_language = item.in_language;
    }
    aliases.push(alias);
  }
  return aliases;
}

/** Returns the first `wd:` / `wikidata:` mapping with its prefix removed, if any. */
function findWikidataId(mappings: string[] | undefined): string | undefined {
  const match = mappings?.find(mapping => WIKIDATA_PREFIX.test(mapping));
  return match?.replace(WIKIDATA_PREFIX, '');
}

/**
 * Reads and parses one YAML file.
 *
 * @param filePath - Path of the YAML file.
 * @returns The parsed top-level mapping, or `null` (after logging a warning)
 *          when the file cannot be read or parsed, or is not a mapping.
 */
function parseYamlFile(filePath: string): Record<string, unknown> | null {
  try {
    const parsed: unknown = parseYaml(readFileSync(filePath, 'utf-8'));
    if (!isPlainObject(parsed)) {
      console.warn(`Warning: Could not parse ${filePath}: top-level YAML value is not a mapping`);
      return null;
    }
    return parsed;
  } catch (error) {
    console.warn(`Warning: Could not parse ${filePath}: ${error}`);
    return null;
  }
}

/**
 * Extracts the classes declared under the `classes` key of a parsed schema.
 *
 * The Wikidata id is resolved in this order: `slot_usage.wikidata_entity.equals_string`,
 * then overridden by a Wikidata entry in `exact_mappings`, and only when still
 * unset taken from `broad_mappings`.
 *
 * @param yamlData - Parsed top-level mapping of a schema file.
 * @returns One entry per class definition; empty when there is no `classes` mapping.
 */
function extractClassesFromYaml(yamlData: Record<string, unknown>): ParsedClass[] {
  const classes: ParsedClass[] = [];
  const classesSection = yamlData.classes;
  if (!isPlainObject(classesSection)) return classes;

  for (const [className, classDef] of Object.entries(classesSection)) {
    if (!isPlainObject(classDef)) continue;

    // Skip abstract base classes (except the main Type class)
    if (classDef.abstract === true && !className.endsWith('Type')) continue;

    const parsed: ParsedClass = {
      className,
      description: asOptionalString(classDef.description),
      keywords: toStringArray(classDef.keywords),
      structuredAliases: toStructuredAliases(classDef.structured_aliases),
      isSubtypeOf: asOptionalString(classDef.is_a),
    };

    // Extract wikidata entity from slot_usage
    const slotUsage = classDef.slot_usage;
    if (isPlainObject(slotUsage) && isPlainObject(slotUsage.wikidata_entity)) {
      parsed.wikidataEntity = asOptionalString(slotUsage.wikidata_entity.equals_string);
    }

    // exact_mappings take precedence over slot_usage
    const exactId = findWikidataId(toStringArray(classDef.exact_mappings));
    if (exactId !== undefined) {
      parsed.wikidataEntity = exactId;
    }

    // broad_mappings are only a fallback
    if (!parsed.wikidataEntity) {
      const broadId = findWikidataId(toStringArray(classDef.broad_mappings));
      if (broadId !== undefined) {
        parsed.wikidataEntity = broadId;
      }
    }

    classes.push(parsed);
  }

  return classes;
}

/** Appends `term` to the list for `lang`, creating the list on first use. */
function addKeyword(keywords: KeywordsByLang, lang: string, term: string): void {
  const bucket = keywords[lang] ?? [];
  keywords[lang] = bucket;
  bucket.push(term);
}

/**
 * Collects lower-cased search terms for a class, grouped by language.
 *
 * Sources, in order:
 *  1. `keywords` (untagged) - sorted into `nl` / `en` by DUTCH_PATTERN.
 *  2. `structuredAliases` - filed under their `in_language` (default `en`).
 *  3. The class name itself, split on capitals, filed under `en`.
 *
 * @param parsedClass - Class as returned by extractClassesFromYaml.
 * @returns Language code -> terms. `en` is always present.
 */
function extractKeywordsFromClass(parsedClass: ParsedClass): KeywordsByLang {
  const keywords: KeywordsByLang = {};

  // 1. Extract from keywords array (usually language-agnostic, assume Dutch/English)
  if (parsedClass.keywords) {
    // Both buckets are created even when they end up empty.
    keywords.nl = [];
    keywords.en = [];
    for (const kw of parsedClass.keywords) {
      // Simple heuristic: Dutch words often have Dutch-specific patterns
      addKeyword(keywords, DUTCH_PATTERN.test(kw) ? 'nl' : 'en', kw.toLowerCase());
    }
  }

  // 2. Extract from structured_aliases (language-tagged)
  for (const alias of parsedClass.structuredAliases ?? []) {
    addKeyword(keywords, alias.in_language || 'en', alias.literal_form.toLowerCase());
  }

  // 3. Convert class name to a keyword phrase
  // MunicipalArchive -> "municipal archive"
  const classNamePhrase = parsedClass.className
    .replace(/([A-Z])/g, ' $1')
    .trim()
    .toLowerCase()
    .split(/\s+/)
    .join(' ');
  addKeyword(keywords, 'en', classNamePhrase);

  return keywords;
}

/**
 * Flattens all languages into one space-separated string of unique terms,
 * in first-seen order.
 */
function accumulateTerms(keywords: KeywordsByLang): string {
  return [...new Set(Object.values(keywords).flat())].join(' ');
}

// ============================================================================
// Embedding Generation
// ============================================================================

/**
 * Requests an embedding for `text` from EMBEDDING_API_URL.
 *
 * Sends `{ text }` as JSON via POST and reads the `embedding` field of the
 * JSON response. Never throws: failures are logged and yield `[]`.
 *
 * @param text - Text to embed.
 * @param skipEmbeddings - When true, no request is made and `[]` is returned.
 * @returns The embedding vector, or `[]` when skipped or on any failure.
 */
async function generateEmbedding(text: string, skipEmbeddings: boolean): Promise<number[]> {
  if (skipEmbeddings) {
    // Return empty placeholder
    return [];
  }

  try {
    const response = await fetch(EMBEDDING_API_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text }),
    });

    if (!response.ok) {
      console.warn(`Embedding API error: ${response.status}`);
      return [];
    }

    const data = (await response.json()) as { embedding?: unknown } | null;
    // Only accept an actual array; any other payload is treated as "no embedding".
    return Array.isArray(data?.embedding) ? (data.embedding as number[]) : [];
  } catch (error) {
    console.warn(`Embedding generation failed: ${error}`);
    return [];
  }
}

// ============================================================================
// Main Processing
// ============================================================================

/** Converts CamelCase to UPPER_SNAKE_CASE (MunicipalArchive -> MUNICIPAL_ARCHIVE). */
function toUpperSnakeCase(name: string): string {
  return name.replace(/([a-z])([A-Z])/g, '$1_$2').toUpperCase();
}

/**
 * Records every term of `keywords` in `termLog` with the given origin.
 * An existing entry for the same term is replaced.
 */
function addTermsToLog(
  termLog: Record<string, TermLogEntry>,
  keywords: KeywordsByLang,
  origin: Omit<TermLogEntry, 'lang'>,
): void {
  for (const [lang, terms] of Object.entries(keywords)) {
    for (const term of terms) {
      termLog[term] = { ...origin, lang };
    }
  }
}

/**
 * Builds the complete vocabulary from the schema directory.
 *
 * Steps: base *Type.yaml files (restricted to TYPE_FILE_TO_CODE) with their
 * sibling *Types.yaml subtypes, then *RecordSetTypes.yaml files, then Tier 1
 * (per type) and Tier 2 (per subtype) embeddings. Embedding requests are
 * awaited one at a time. Passing `--skip-embeddings` on the command line
 * leaves all embeddings empty.
 *
 * @returns The populated vocabulary.
 * @throws If SCHEMA_DIR cannot be listed.
 */
async function processTypeFiles(): Promise<TypesVocabulary> {
  const skipEmbeddings = process.argv.includes('--skip-embeddings');

  console.log('🔍 Scanning schema directory:', SCHEMA_DIR);
  console.log(`📊 Embedding generation: ${skipEmbeddings ? 'SKIPPED' : 'ENABLED'}`);

  const vocabulary: TypesVocabulary = {
    version: new Date().toISOString(),
    schemaVersion: '20251121',
    embeddingModel: 'paraphrase-multilingual-MiniLM-L12-v2',
    embeddingDimensions: 384,
    tier1Embeddings: {},
    tier2Embeddings: {},
    termLog: {},
    institutionTypes: {},
    recordSetTypes: {},
  };

  // Find all *Type.yaml files (base types) and *Types.yaml files (subtypes)
  const files = readdirSync(SCHEMA_DIR);
  const typeFiles = files.filter(f => f.endsWith('Type.yaml'));
  const typesFiles = files.filter(f => f.endsWith('Types.yaml'));

  console.log(`\n📁 Found ${typeFiles.length} Type files and ${typesFiles.length} Types files`);

  // Process base Type files
  for (const file of typeFiles) {
    const typeName = file.replace('.yaml', '');
    const code = TYPE_FILE_TO_CODE[typeName];

    if (!code) {
      console.log(` ⏭️ Skipping ${typeName} (not in GLAMORCUBESFIXPHDNT)`);
      continue;
    }

    console.log(`\n📄 Processing ${typeName} (${code})`);

    const yamlData = parseYamlFile(join(SCHEMA_DIR, file));
    if (!yamlData) continue;

    const baseClass = extractClassesFromYaml(yamlData).find(c => c.className === typeName);
    if (!baseClass) {
      console.log(` ⚠️ No base class found in ${file}`);
      continue;
    }

    // Initialize type info
    const typeInfo: TypeInfo = {
      code,
      className: typeName,
      baseWikidata: baseClass.wikidataEntity,
      accumulatedTerms: '',
      keywords: extractKeywordsFromClass(baseClass),
      subtypes: {},
    };

    // Look for corresponding Types file (subtypes)
    const subtypesFile = file.replace('Type.yaml', 'Types.yaml');
    const subtypesFilePath = join(SCHEMA_DIR, subtypesFile);
    if (existsSync(subtypesFilePath)) {
      console.log(` 📂 Processing subtypes from ${subtypesFile}`);

      const subtypesYaml = parseYamlFile(subtypesFilePath);
      if (subtypesYaml) {
        for (const subclass of extractClassesFromYaml(subtypesYaml)) {
          const subtypeName = toUpperSnakeCase(subclass.className);
          const subtypeKeywords = extractKeywordsFromClass(subclass);

          typeInfo.subtypes[subtypeName] = {
            className: subclass.className,
            wikidata: subclass.wikidataEntity,
            accumulatedTerms: accumulateTerms(subtypeKeywords),
            keywords: subtypeKeywords,
          };

          // Add to term log
          addTermsToLog(vocabulary.termLog, subtypeKeywords, {
            typeCode: code,
            typeName,
            subtypeName,
            wikidata: subclass.wikidataEntity,
          });

          console.log(` ✓ ${subclass.className}: ${Object.values(subtypeKeywords).flat().length} terms`);
        }
      }
    }

    // Accumulate all terms for this type (base + all subtypes).
    // Terms are split on spaces here, so multi-word terms contribute their words.
    const allTypeTerms: string[] = [
      accumulateTerms(typeInfo.keywords),
      ...Object.values(typeInfo.subtypes).map(subtype => subtype.accumulatedTerms),
    ];
    typeInfo.accumulatedTerms = [...new Set(allTypeTerms.join(' ').split(' '))].join(' ');

    // Add base type keywords to term log (after the subtypes, so on a shared
    // term the base-type entry is the one that remains)
    addTermsToLog(vocabulary.termLog, typeInfo.keywords, { typeCode: code, typeName });

    vocabulary.institutionTypes[code] = typeInfo;
    console.log(` ✅ ${typeName}: ${Object.keys(typeInfo.subtypes).length} subtypes, ${typeInfo.accumulatedTerms.split(' ').length} total terms`);
  }

  // Process RecordSetTypes files
  console.log('\n📁 Processing RecordSetTypes files...');
  const recordSetTypesFiles = files.filter(f => f.endsWith('RecordSetTypes.yaml'));

  for (const file of recordSetTypesFiles) {
    const yamlData = parseYamlFile(join(SCHEMA_DIR, file));
    if (!yamlData) continue;

    for (const cls of extractClassesFromYaml(yamlData)) {
      // Skip abstract base classes: names ending in RecordSetType are kept
      // only when they mention Fonds, Series or Collection.
      const isConcreteKind = ['Fonds', 'Series', 'Collection'].some(kind => cls.className.includes(kind));
      if (cls.className.endsWith('RecordSetType') && !isConcreteKind) {
        continue;
      }

      const rstName = toUpperSnakeCase(cls.className);
      const keywords = extractKeywordsFromClass(cls);

      vocabulary.recordSetTypes[rstName] = {
        className: cls.className,
        accumulatedTerms: accumulateTerms(keywords),
        keywords,
      };

      // Add to term log
      addTermsToLog(vocabulary.termLog, keywords, {
        typeCode: 'A', // Most record set types are archive-related
        typeName: 'ArchiveOrganizationType',
        recordSetType: rstName,
      });
    }
  }

  console.log(` ✅ Extracted ${Object.keys(vocabulary.recordSetTypes).length} record set types`);

  // Generate Tier 1 embeddings (Types file level)
  console.log('\n🧮 Generating Tier 1 embeddings (Types files)...');
  for (const typeInfo of Object.values(vocabulary.institutionTypes)) {
    const embedding = await generateEmbedding(typeInfo.accumulatedTerms, skipEmbeddings);
    vocabulary.tier1Embeddings[typeInfo.className] = embedding;
    console.log(` ✓ ${typeInfo.className}: ${embedding.length} dimensions`);
  }

  // Generate Tier 2 embeddings (individual subtypes)
  console.log('\n🧮 Generating Tier 2 embeddings (subtypes)...');
  for (const [code, typeInfo] of Object.entries(vocabulary.institutionTypes)) {
    const subtypeEmbeddings: Record<string, number[]> = {};
    vocabulary.tier2Embeddings[code] = subtypeEmbeddings;

    for (const [subtypeName, subtypeInfo] of Object.entries(typeInfo.subtypes)) {
      subtypeEmbeddings[subtypeName] = await generateEmbedding(subtypeInfo.accumulatedTerms, skipEmbeddings);
    }
    console.log(` ✓ ${typeInfo.className}: ${Object.keys(typeInfo.subtypes).length} subtype embeddings`);
  }

  return vocabulary;
}

// ============================================================================
// Main Entry Point
// ============================================================================

/**
 * Runs the extraction, writes the vocabulary to OUTPUT_FILE as pretty-printed
 * JSON (creating the output directory when needed) and prints a summary.
 */
async function main(): Promise<void> {
  console.log('═══════════════════════════════════════════════════════════════');
  console.log(' TypesVocabulary Extraction Script');
  console.log(' Ontology-Driven Cache Segmentation (Rule 46)');
  console.log('═══════════════════════════════════════════════════════════════\n');

  const vocabulary = await processTypeFiles();

  // Ensure output directory exists (a no-op when it already does)
  mkdirSync(dirname(OUTPUT_FILE), { recursive: true });

  // Write output
  writeFileSync(OUTPUT_FILE, JSON.stringify(vocabulary, null, 2));

  const totalSubtypes = Object.values(vocabulary.institutionTypes)
    .reduce((sum, t) => sum + Object.keys(t.subtypes).length, 0);
  const totalTier2 = Object.values(vocabulary.tier2Embeddings)
    .reduce((sum, t) => sum + Object.keys(t).length, 0);

  console.log('\n═══════════════════════════════════════════════════════════════');
  console.log(' Summary');
  console.log('═══════════════════════════════════════════════════════════════');
  console.log(` 📊 Institution Types: ${Object.keys(vocabulary.institutionTypes).length}`);
  console.log(` 📊 Total Subtypes: ${totalSubtypes}`);
  console.log(` 📊 Record Set Types: ${Object.keys(vocabulary.recordSetTypes).length}`);
  console.log(` 📊 Term Log Entries: ${Object.keys(vocabulary.termLog).length}`);
  console.log(` 📊 Tier 1 Embeddings: ${Object.keys(vocabulary.tier1Embeddings).length}`);
  console.log(` 📊 Tier 2 Embeddings: ${totalTier2}`);
  console.log(`\n ✅ Output written to: ${OUTPUT_FILE}`);
  console.log('═══════════════════════════════════════════════════════════════\n');
}

main().catch((error: unknown) => {
  console.error(error);
  // Signal failure to callers (CI, npm scripts) instead of exiting with 0.
  process.exitCode = 1;
});